From 446d8077dad5ae0faad7b9cff9c61529e34d21bb Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Tue, 25 Nov 2025 19:27:51 +0000 Subject: [PATCH 1/4] gutf8: Add tests and clarify documentation for g_unichar_to_utf8() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While using `g_unichar_to_utf8()`, I noticed the return type is `int`, which suggests that it could return negative values. Wanting to make sure I properly handled this in my code, I checked the implementation of `g_unichar_to_utf8()` and found that it can actually only return [1, 6]. It looks like the `int` return type is a historical C-ism, and it should really return an unsigned integer type. Improve the docs and add some unit tests while I’m there. Signed-off-by: Philip Withnall --- glib/gutf8.c | 8 ++++---- glib/tests/unicode.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/glib/gutf8.c b/glib/gutf8.c index 06cb9e779..5deee968e 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -454,16 +454,16 @@ g_utf8_strncpy (gchar *dest, * * Converts a single character to UTF-8. * - * Returns: number of bytes written + * Returns: number of bytes written, guaranteed to be in the range [1, 6] */ int g_unichar_to_utf8 (gunichar c, gchar *outbuf) { /* If this gets modified, also update the copy in g_string_insert_unichar() */ - guint len = 0; - int first; - int i; + size_t len = 0; + char first; + size_t i; if (c < 0x80) { diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c index c81c9b8e8..8aacb8167 100644 --- a/glib/tests/unicode.c +++ b/glib/tests/unicode.c @@ -525,6 +525,41 @@ test_wide (void) } }; +static void +test_unichar_to_utf8 (void) +{ + const struct + { + gunichar unichar; + int expected_length; + const char *expected_output; /* must be of length `expected_length` */ + } + vectors[] = + { + { 0x21, 1, "!" }, + { 0x00A1, 2, "\xc2\xa1" }, + { 0x0800, 3, "\xe0\xa0\x80" }, + { 0x10000, 4, "\xf0\x90\x80\x80" }, + { 0x200000, 5, "\xf8\x88\x80\x80\x80" }, + { 0x4000000, 6, "\xfc\x84\x80\x80\x80\x80" }, + }; + + for (size_t i = 0; i < G_N_ELEMENTS (vectors); i++) + { + int length; + char output[6]; + + length = g_unichar_to_utf8 (vectors[i].unichar, NULL); + g_assert_cmpint (length, ==, vectors[i].expected_length); + + length = g_unichar_to_utf8 (vectors[i].unichar, output); + g_assert_cmpint (length, ==, vectors[i].expected_length); + g_assert_cmpmem (output, length, vectors[i].expected_output, vectors[i].expected_length); + } +} + +/* Test that g_unichar_compose() returns the correct value for various + * ASCII and Unicode alphabetic, numeric, and other, codepoints. */ static void test_compose (void) { @@ -946,6 +981,7 @@ main (int argc, g_test_add_func ("/unicode/fully-decompose-len", test_fully_decompose_len); g_test_add_func ("/unicode/iso15924", test_iso15924); g_test_add_func ("/unicode/cases", test_cases); + g_test_add_func ("/unicode/unichar-to-utf8", test_unichar_to_utf8); return g_test_run(); } -- 2.53.0 From 207d406271e3b5f89337ebdcefc0362c5b724d12 Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Tue, 25 Nov 2025 19:02:56 +0000 Subject: [PATCH 2/4] gvariant-parser: Fix potential integer overflow parsing (byte)strings The termination condition for parsing string and bytestring literals in GVariant text format input was subject to an integer overflow for input string (or bytestring) literals longer than `INT_MAX`. Fix that by counting as a `size_t` rather than as an `int`. The counter can never correctly be negative. Spotted by treeplus. Thanks to the Sovereign Tech Resilience programme from the Sovereign Tech Agency. ID: #YWH-PGM9867-145 Signed-off-by: Philip Withnall Fixes: #3834 --- glib/gvariant-parser.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/glib/gvariant-parser.c b/glib/gvariant-parser.c index 3261bc1af..4c1d2626b 100644 --- a/glib/gvariant-parser.c +++ b/glib/gvariant-parser.c @@ -585,7 +585,7 @@ ast_resolve (AST *ast, { GVariant *value; gchar *pattern; - gint i, j = 0; + size_t i, j = 0; pattern = ast_get_pattern (ast, error); @@ -1525,10 +1525,10 @@ string_free (AST *ast) static gboolean unicode_unescape (const gchar *src, - gint *src_ofs, + size_t *src_ofs, gchar *dest, - gint *dest_ofs, - gint length, + size_t *dest_ofs, + gsize length, SourceRef *ref, GError **error) { @@ -1548,7 +1548,7 @@ unicode_unescape (const gchar *src, { parser_set_error (error, ref, NULL, G_VARIANT_PARSE_ERROR_INVALID_CHARACTER, - "invalid %d-character unicode escape", length); + "invalid %zu-character unicode escape", length); return FALSE; } @@ -1576,7 +1576,7 @@ string_parse (TokenStream *stream, gsize length; gchar quote; gchar *str; - gint i, j; + size_t i, j; token_stream_start_ref (stream, &ref); token = token_stream_get (stream); @@ -1704,7 +1704,7 @@ bytestring_parse (TokenStream *stream, gsize length; gchar quote; gchar *str; - gint i, j; + size_t i, j; token_stream_start_ref (stream, &ref); token = token_stream_get (stream); -- 2.53.0 From abadf4e327d2936012089a7368b7e0ee7dcfabd3 Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Tue, 25 Nov 2025 19:19:16 +0000 Subject: [PATCH 3/4] gvariant-parser: Use size_t to count numbers of child elements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than using `gint`, which could overflow for arrays (or dicts, or tuples) longer than `INT_MAX`. There may be other limits which prevent parsed containers becoming that long, but we might as well make the type system reflect the programmer’s intention as best it can anyway. For arrays and tuples this is straightforward. For dictionaries, it’s slightly complicated by the fact that the code used `dict->n_children == -1` to indicate that the `Dictionary` struct in question actually represented a single freestanding dict entry. In GVariant text format, that would be `{1, "one"}`. The implementation previously didn’t define the semantics of `dict->n_children < -1`. Now, instead, change `Dictionary.n_children` to `size_t`, and define a magic value `DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY` to indicate that the `Dictionary` represents a single freestanding dict entry. This magic value is `SIZE_MAX`, and given that a dictionary entry takes more than one byte to represent in GVariant text format, that means it’s not possible to have that many entries in a parsed dictionary, so this magic value won’t be hit by a normal dictionary. An assertion checks this anyway. Spotted while working on #3834. Signed-off-by: Philip Withnall --- glib/gvariant-parser.c | 58 ++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/glib/gvariant-parser.c b/glib/gvariant-parser.c index 4c1d2626b..adc559408 100644 --- a/glib/gvariant-parser.c +++ b/glib/gvariant-parser.c @@ -637,9 +637,9 @@ static AST *parse (TokenStream *stream, GError **error); static void -ast_array_append (AST ***array, - gint *n_items, - AST *ast) +ast_array_append (AST ***array, + size_t *n_items, + AST *ast) { if ((*n_items & (*n_items - 1)) == 0) *array = g_renew (AST *, *array, *n_items ? 2 ** n_items : 1); @@ -648,10 +648,10 @@ ast_array_append (AST ***array, } static void -ast_array_free (AST **array, - gint n_items) +ast_array_free (AST **array, + size_t n_items) { - gint i; + size_t i; for (i = 0; i < n_items; i++) ast_free (array[i]); @@ -660,11 +660,11 @@ ast_array_free (AST **array, static gchar * ast_array_get_pattern (AST **array, - gint n_items, + size_t n_items, GError **error) { gchar *pattern; - gint i; + size_t i; pattern = ast_get_pattern (array[0], error); @@ -693,7 +693,7 @@ ast_array_get_pattern (AST **array, * pair of values. */ { - int j = 0; + size_t j = 0; while (TRUE) { @@ -867,7 +867,7 @@ typedef struct AST ast; AST **children; - gint n_children; + size_t n_children; } Array; static gchar * @@ -900,7 +900,7 @@ array_get_value (AST *ast, Array *array = (Array *) ast; const GVariantType *childtype; GVariantBuilder builder; - gint i; + size_t i; if (!g_variant_type_is_array (type)) return ast_type_error (ast, type, error); @@ -985,7 +985,7 @@ typedef struct AST ast; AST **children; - gint n_children; + size_t n_children; } Tuple; static gchar * @@ -995,7 +995,7 @@ tuple_get_pattern (AST *ast, Tuple *tuple = (Tuple *) ast; gchar *result = NULL; gchar **parts; - gint i; + size_t i; parts = g_new (gchar *, tuple->n_children + 4); parts[tuple->n_children + 1] = (gchar *) ")"; @@ -1025,7 +1025,7 @@ tuple_get_value (AST *ast, Tuple *tuple = (Tuple *) ast; const GVariantType *childtype; GVariantBuilder builder; - gint i; + size_t i; if (!g_variant_type_is_tuple (type)) return ast_type_error (ast, type, error); @@ -1215,9 +1215,16 @@ typedef struct AST **keys; AST **values; - gint n_children; + + /* Iff this is DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY then this struct + * represents a single freestanding dict entry (`{1, "one"}`) rather than a + * full dict. In the freestanding case, @keys and @values have exactly one + * member each. */ + size_t n_children; } Dictionary; +#define DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY ((size_t) -1) + static gchar * dictionary_get_pattern (AST *ast, GError **error) @@ -1232,7 +1239,7 @@ dictionary_get_pattern (AST *ast, return g_strdup ("Ma{**}"); key_pattern = ast_array_get_pattern (dict->keys, - abs (dict->n_children), + (dict->n_children == DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY) ? 1 : dict->n_children, error); if (key_pattern == NULL) @@ -1263,7 +1270,7 @@ dictionary_get_pattern (AST *ast, return NULL; result = g_strdup_printf ("M%s{%c%s}", - dict->n_children > 0 ? "a" : "", + (dict->n_children > 0 && dict->n_children != DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY) ? "a" : "", key_char, value_pattern); g_free (value_pattern); @@ -1277,7 +1284,7 @@ dictionary_get_value (AST *ast, { Dictionary *dict = (Dictionary *) ast; - if (dict->n_children == -1) + if (dict->n_children == DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY) { const GVariantType *subtype; GVariantBuilder builder; @@ -1310,7 +1317,7 @@ dictionary_get_value (AST *ast, { const GVariantType *entry, *key, *val; GVariantBuilder builder; - gint i; + size_t i; if (!g_variant_type_is_subtype_of (type, G_VARIANT_TYPE_DICTIONARY)) return ast_type_error (ast, type, error); @@ -1351,12 +1358,12 @@ static void dictionary_free (AST *ast) { Dictionary *dict = (Dictionary *) ast; - gint n_children; + size_t n_children; - if (dict->n_children > -1) - n_children = dict->n_children; - else + if (dict->n_children == DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY) n_children = 1; + else + n_children = dict->n_children; ast_array_free (dict->keys, n_children); ast_array_free (dict->values, n_children); @@ -1373,7 +1380,7 @@ dictionary_parse (TokenStream *stream, maybe_wrapper, dictionary_get_value, dictionary_free }; - gint n_keys, n_values; + size_t n_keys, n_values; gboolean only_one; Dictionary *dict; AST *first; @@ -1416,7 +1423,7 @@ dictionary_parse (TokenStream *stream, goto error; g_assert (n_keys == 1 && n_values == 1); - dict->n_children = -1; + dict->n_children = DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY; return (AST *) dict; } @@ -1449,6 +1456,7 @@ dictionary_parse (TokenStream *stream, } g_assert (n_keys == n_values); + g_assert (n_keys != DICTIONARY_N_CHILDREN_FREESTANDING_ENTRY); dict->n_children = n_keys; return (AST *) dict; -- 2.53.0 From 4de574f856d5ed40fc6981f313a35454d7678109 Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Tue, 25 Nov 2025 19:25:58 +0000 Subject: [PATCH 4/4] gvariant-parser: Convert error handling code to use size_t The error handling code allows for printing out the range of input bytes related to a parsing error. This was previously done using `gint`, but the input could be longer than `INT_MAX`, so it should really be done using `size_t`. Spotted while working on #3834. Signed-off-by: Philip Withnall --- glib/gvariant-parser.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/glib/gvariant-parser.c b/glib/gvariant-parser.c index adc559408..199bd1f2c 100644 --- a/glib/gvariant-parser.c +++ b/glib/gvariant-parser.c @@ -86,7 +86,9 @@ g_variant_parser_get_error_quark (void) typedef struct { - gint start, end; + /* Offsets from the start of the input, in bytes. Can be equal when referring + * to a point rather than a range. The invariant `end >= start` always holds. */ + size_t start, end; } SourceRef; G_GNUC_PRINTF(5, 0) @@ -101,14 +103,16 @@ parser_set_error_va (GError **error, GString *msg = g_string_new (NULL); if (location->start == location->end) - g_string_append_printf (msg, "%d", location->start); + g_string_append_printf (msg, "%" G_GSIZE_FORMAT, location->start); else - g_string_append_printf (msg, "%d-%d", location->start, location->end); + g_string_append_printf (msg, "%" G_GSIZE_FORMAT "-%" G_GSIZE_FORMAT, + location->start, location->end); if (other != NULL) { g_assert (other->start != other->end); - g_string_append_printf (msg, ",%d-%d", other->start, other->end); + g_string_append_printf (msg, ",%" G_GSIZE_FORMAT "-%" G_GSIZE_FORMAT, + other->start, other->end); } g_string_append_c (msg, ':'); @@ -135,11 +139,15 @@ parser_set_error (GError **error, typedef struct { + /* We should always have the following ordering constraint: + * start <= this <= stream <= end + * Additionally, unless in an error or EOF state, `this < stream`. + */ const gchar *start; const gchar *stream; const gchar *end; - const gchar *this; + const gchar *this; /* (nullable) */ } TokenStream; @@ -170,7 +178,7 @@ token_stream_set_error (TokenStream *stream, static gboolean token_stream_prepare (TokenStream *stream) { - gint brackets = 0; + gssize brackets = 0; const gchar *end; if (stream->this != NULL) @@ -395,7 +403,7 @@ static void pattern_copy (gchar **out, const gchar **in) { - gint brackets = 0; + gssize brackets = 0; while (**in == 'a' || **in == 'm' || **in == 'M') *(*out)++ = *(*in)++; @@ -2592,7 +2600,7 @@ g_variant_builder_add_parsed (GVariantBuilder *builder, static gboolean parse_num (const gchar *num, const gchar *limit, - gint *result) + size_t *result) { gchar *endptr; gint64 bignum; @@ -2602,10 +2610,12 @@ parse_num (const gchar *num, if (endptr != limit) return FALSE; + /* The upper bound here is more restrictive than it technically needs to be, + * but should be enough for any practical situation: */ if (bignum < 0 || bignum > G_MAXINT) return FALSE; - *result = bignum; + *result = (size_t) bignum; return TRUE; } @@ -2616,7 +2626,7 @@ add_last_line (GString *err, { const gchar *last_nl; gchar *chomped; - gint i; + size_t i; /* This is an error at the end of input. If we have a file * with newlines, that's probably the empty string after the @@ -2761,7 +2771,7 @@ g_variant_parse_error_print_context (GError *error, if (dash == NULL || colon < dash) { - gint point; + size_t point; /* we have a single point */ if (!parse_num (error->message, colon, &point)) @@ -2779,7 +2789,7 @@ g_variant_parse_error_print_context (GError *error, /* We have one or two ranges... */ if (comma && comma < colon) { - gint start1, end1, start2, end2; + size_t start1, end1, start2, end2; const gchar *dash2; /* Two ranges */ @@ -2795,7 +2805,7 @@ g_variant_parse_error_print_context (GError *error, } else { - gint start, end; + size_t start, end; /* One range */ if (!parse_num (error->message, dash, &start) || !parse_num (dash + 1, colon, &end)) -- 2.53.0