diff --git a/.gitignore b/.gitignore index df84f0e..6e4b5f5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ expat-2.0.1.tar.gz /expat-2.2.8.tar.gz /expat-2.2.10.tar.gz /expat-2.4.7.tar.gz +/expat-2.4.9.tar.gz diff --git a/expat-2.2.10-Add-missing-validation-of-encoding.patch b/expat-2.2.10-Add-missing-validation-of-encoding.patch deleted file mode 100644 index 2d526c2..0000000 --- a/expat-2.2.10-Add-missing-validation-of-encoding.patch +++ /dev/null @@ -1,281 +0,0 @@ -From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001 -From: Sebastian Pipping -Date: Tue, 8 Feb 2022 17:37:14 +0100 -Subject: [PATCH 1/5] lib: Drop unused macro UTF8_GET_NAMING - ---- - expat/lib/xmltok.c | 5 ----- - 1 file changed, 5 deletions(-) - -diff --git a/lib/xmltok.c b/lib/xmltok.c -index a72200e8..3bddf125 100644 ---- a/lib/xmltok.c -+++ b/lib/xmltok.c -@@ -98,11 +98,6 @@ - + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \ - & (1u << (((byte)[2]) & 0x1F))) - --#define UTF8_GET_NAMING(pages, p, n) \ -- ((n) == 2 \ -- ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ -- : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0)) -- - /* Detection of invalid UTF-8 sequences is based on Table 3.1B - of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ - with the additional restriction of not allowing the Unicode - -From 3f0a0cb644438d4d8e3294cd0b1245d0edb0c6c6 Mon Sep 17 00:00:00 2001 -From: Sebastian Pipping -Date: Tue, 8 Feb 2022 04:32:20 +0100 -Subject: [PATCH 2/5] lib: Add missing validation of encoding (CVE-2022-25235) - ---- - expat/lib/xmltok_impl.c | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c -index 0430591b..64a3b2c1 100644 ---- a/lib/xmltok_impl.c -+++ b/lib/xmltok_impl.c -@@ -69,7 +69,7 @@ - case BT_LEAD##n: \ - if (end - ptr < n) \ - return XML_TOK_PARTIAL_CHAR; \ -- if (! IS_NAME_CHAR(enc, ptr, n)) { \ -+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \ - *nextTokPtr = ptr; \ - return XML_TOK_INVALID; \ - } \ -@@ -98,7 +98,7 @@ - case BT_LEAD##n: \ - if (end - ptr < n) \ - return XML_TOK_PARTIAL_CHAR; \ -- if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \ -+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \ - *nextTokPtr = ptr; \ - return XML_TOK_INVALID; \ - } \ -@@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, - case BT_LEAD##n: \ - if (end - ptr < n) \ - return XML_TOK_PARTIAL_CHAR; \ -+ if (IS_INVALID_CHAR(enc, ptr, n)) { \ -+ *nextTokPtr = ptr; \ -+ return XML_TOK_INVALID; \ -+ } \ - if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ - ptr += n; \ - tok = XML_TOK_NAME; \ - -From c85a3025e7a1be086dc34e7559fbc543914d047f Mon Sep 17 00:00:00 2001 -From: Sebastian Pipping -Date: Wed, 9 Feb 2022 01:00:38 +0100 -Subject: [PATCH 3/5] lib: Add comments to BT_LEAD* cases where encoding has - already been validated - ---- - expat/lib/xmltok_impl.c | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c -index 64a3b2c1..84ff35f9 100644 ---- a/lib/xmltok_impl.c -+++ b/lib/xmltok_impl.c -@@ -1274,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, - switch (BYTE_TYPE(enc, ptr)) { - # define LEAD_CASE(n) \ - case BT_LEAD##n: \ -- ptr += n; \ -+ ptr += n; /* NOTE: The encoding has already been validated. */ \ - break; - LEAD_CASE(2) - LEAD_CASE(3) -@@ -1343,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, - switch (BYTE_TYPE(enc, ptr)) { - # define LEAD_CASE(n) \ - case BT_LEAD##n: \ -- ptr += n; \ -+ ptr += n; /* NOTE: The encoding has already been validated. */ \ - break; - LEAD_CASE(2) - LEAD_CASE(3) -@@ -1522,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax, - state = inName; \ - } - # define LEAD_CASE(n) \ -- case BT_LEAD##n: \ -+ case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \ - START_NAME ptr += (n - MINBPC(enc)); \ - break; - LEAD_CASE(2) -@@ -1734,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) { - switch (BYTE_TYPE(enc, ptr)) { - # define LEAD_CASE(n) \ - case BT_LEAD##n: \ -- ptr += n; \ -+ ptr += n; /* NOTE: The encoding has already been validated. */ \ - break; - LEAD_CASE(2) - LEAD_CASE(3) -@@ -1779,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end, - switch (BYTE_TYPE(enc, ptr)) { - # define LEAD_CASE(n) \ - case BT_LEAD##n: \ -- ptr += n; \ -+ ptr += n; /* NOTE: The encoding has already been validated. */ \ - pos->columnNumber++; \ - break; - LEAD_CASE(2) - -From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001 -From: Sebastian Pipping -Date: Tue, 8 Feb 2022 04:06:21 +0100 -Subject: [PATCH 4/5] tests: Cover missing validation of encoding - (CVE-2022-25235) - ---- - expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 109 insertions(+) - -diff --git a/tests/runtests.c b/tests/runtests.c -index bc5344b1..9b155b82 100644 ---- a/tests/runtests.c -+++ b/tests/runtests.c -@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) { - } - END_TEST - -+START_TEST(test_utf8_in_start_tags) { -+ struct test_case { -+ bool goodName; -+ bool goodNameStart; -+ const char *tagName; -+ }; -+ -+ // The idea with the tests below is this: -+ // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences -+ // go to isNever and are hence not a concern. -+ // -+ // We start with a character that is a valid name character -+ // (or even name-start character, see XML 1.0r4 spec) and then we flip -+ // single bits at places where (1) the result leaves the UTF-8 encoding space -+ // and (2) we stay in the same n-byte sequence family. -+ // -+ // The flipped bits are highlighted in angle brackets in comments, -+ // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped -+ // the most significant bit to 1 to leave UTF-8 encoding space. -+ struct test_case cases[] = { -+ // 1-byte UTF-8: [0xxx xxxx] -+ {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':' -+ {false, false, "\xBA"}, // [<1>011 1010] -+ {true, false, "\x39"}, // [0011 1001] = ASCII nine '9' -+ {false, false, "\xB9"}, // [<1>011 1001] -+ -+ // 2-byte UTF-8: [110x xxxx] [10xx xxxx] -+ {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] = -+ // Arabic small waw U+06E5 -+ {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101] -+ {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101] -+ {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101] -+ {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] = -+ // combining char U+0301 -+ {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001] -+ {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001] -+ {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001] -+ -+ // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx] -+ {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] = -+ // Devanagari Letter A U+0905 -+ {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101] -+ {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101] -+ {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101] -+ {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101] -+ {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101] -+ {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] = -+ // combining char U+0901 -+ {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001] -+ {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001] -+ {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001] -+ {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001] -+ {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001] -+ }; -+ const bool atNameStart[] = {true, false}; -+ -+ size_t i = 0; -+ char doc[1024]; -+ size_t failCount = 0; -+ -+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) { -+ size_t j = 0; -+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) { -+ const bool expectedSuccess -+ = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName; -+ sprintf(doc, "<%s%s>