expat/expat-2.2.10-Add-missing-validation-of-encoding.patch

From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 17:37:14 +0100
Subject: [PATCH 1/5] lib: Drop unused macro UTF8_GET_NAMING

---
 expat/lib/xmltok.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lib/xmltok.c b/lib/xmltok.c
index a72200e8..3bddf125 100644
--- a/lib/xmltok.c
+++ b/lib/xmltok.c
@@ -98,11 +98,6 @@
         + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
    & (1u << (((byte)[2]) & 0x1F)))
 
-#define UTF8_GET_NAMING(pages, p, n)                                           \
-  ((n) == 2                                                                    \
-       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
-       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
-
 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
    with the additional restriction of not allowing the Unicode

From 3f0a0cb644438d4d8e3294cd0b1245d0edb0c6c6 Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 04:32:20 +0100
Subject: [PATCH 2/5] lib: Add missing validation of encoding (CVE-2022-25235)

---
 expat/lib/xmltok_impl.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
index 0430591b..64a3b2c1 100644
--- a/lib/xmltok_impl.c
+++ b/lib/xmltok_impl.c
@@ -69,7 +69,7 @@
   case BT_LEAD##n:                                                             \
     if (end - ptr < n)                                                         \
       return XML_TOK_PARTIAL_CHAR;                                             \
-    if (! IS_NAME_CHAR(enc, ptr, n)) {                                         \
+    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
       *nextTokPtr = ptr;                                                       \
       return XML_TOK_INVALID;                                                  \
     }                                                                          \
@@ -98,7 +98,7 @@
   case BT_LEAD##n:                                                             \
     if (end - ptr < n)                                                         \
       return XML_TOK_PARTIAL_CHAR;                                             \
-    if (! IS_NMSTRT_CHAR(enc, ptr, n)) {                                       \
+    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
       *nextTokPtr = ptr;                                                       \
       return XML_TOK_INVALID;                                                  \
     }                                                                          \
@@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
   case BT_LEAD##n:                                                             \
     if (end - ptr < n)                                                         \
       return XML_TOK_PARTIAL_CHAR;                                             \
+    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
+      *nextTokPtr = ptr;                                                       \
+      return XML_TOK_INVALID;                                                  \
+    }                                                                          \
     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
       ptr += n;                                                                \
       tok = XML_TOK_NAME;                                                      \

From c85a3025e7a1be086dc34e7559fbc543914d047f Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Wed, 9 Feb 2022 01:00:38 +0100
Subject: [PATCH 3/5] lib: Add comments to BT_LEAD* cases where encoding has
 already been validated

---
 expat/lib/xmltok_impl.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
index 64a3b2c1..84ff35f9 100644
--- a/lib/xmltok_impl.c
+++ b/lib/xmltok_impl.c
@@ -1274,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
     switch (BYTE_TYPE(enc, ptr)) {
 #  define LEAD_CASE(n)                                                         \
   case BT_LEAD##n:                                                             \
-    ptr += n;                                                                  \
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
     break;
       LEAD_CASE(2)
       LEAD_CASE(3)
@@ -1343,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
     switch (BYTE_TYPE(enc, ptr)) {
 #  define LEAD_CASE(n)                                                         \
   case BT_LEAD##n:                                                             \
-    ptr += n;                                                                  \
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
     break;
       LEAD_CASE(2)
       LEAD_CASE(3)
@@ -1522,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
       state = inName;                                                          \
     }
 #  define LEAD_CASE(n)                                                         \
-  case BT_LEAD##n:                                                             \
+  case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
     START_NAME ptr += (n - MINBPC(enc));                                       \
     break;
       LEAD_CASE(2)
@@ -1734,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
     switch (BYTE_TYPE(enc, ptr)) {
 #  define LEAD_CASE(n)                                                         \
   case BT_LEAD##n:                                                             \
-    ptr += n;                                                                  \
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
     break;
       LEAD_CASE(2)
       LEAD_CASE(3)
@@ -1779,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
     switch (BYTE_TYPE(enc, ptr)) {
 #  define LEAD_CASE(n)                                                         \
   case BT_LEAD##n:                                                             \
-    ptr += n;                                                                  \
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
     pos->columnNumber++;                                                       \
     break;
       LEAD_CASE(2)

From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 04:06:21 +0100
Subject: [PATCH 4/5] tests: Cover missing validation of encoding
 (CVE-2022-25235)

---
 expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/tests/runtests.c b/tests/runtests.c
index bc5344b1..9b155b82 100644
--- a/tests/runtests.c
+++ b/tests/runtests.c
@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
 }
 END_TEST
 
+START_TEST(test_utf8_in_start_tags) {
+  struct test_case {
+    bool goodName;
+    bool goodNameStart;
+    const char *tagName;
+  };
+
+  // The idea with the tests below is this:
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+  // go to isNever and are hence not a concern.
+  //
+  // We start with a character that is a valid name character
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
+  // and (2) we stay in the same n-byte sequence family.
+  //
+  // The flipped bits are highlighted in angle brackets in comments,
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+  // the most significant bit to 1 to leave UTF-8 encoding space.
+  struct test_case cases[] = {
+      // 1-byte UTF-8: [0xxx xxxx]
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
+      {false, false, "\xBA"}, // [<1>011 1010]
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
+      {false, false, "\xB9"}, // [<1>011 1001]
+
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
+                                  // Arabic small waw U+06E5
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
+                                  // combining char U+0301
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
+                                      // Devanagari Letter A U+0905
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
+                                      // combining char U+0901
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+  };
+  const bool atNameStart[] = {true, false};
+
+  size_t i = 0;
+  char doc[1024];
+  size_t failCount = 0;
+
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    size_t j = 0;
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+      const bool expectedSuccess
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+      sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
+      XML_Parser parser = XML_ParserCreate(NULL);
+
+      const enum XML_Status status
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+      bool success = true;
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
+        success = false;
+      }
+      if ((status == XML_STATUS_ERROR)
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+        success = false;
+      }
+
+      if (! success) {
+        fprintf(
+            stderr,
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+        failCount++;
+      }
+
+      XML_ParserFree(parser);
+    }
+  }
+
+  if (failCount > 0) {
+    fail("UTF-8 regression detected");
+  }
+}
+END_TEST
+
 /* Test trailing spaces in elements are accepted */
 static void XMLCALL
 record_element_end_handler(void *userData, const XML_Char *name) {
@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
 }
 END_TEST
 
+START_TEST(test_bad_doctype_utf8) {
+  const char *text = "<!DOCTYPE \xDB\x25"
+                     "doc><doc/>"; // [1101 1011] [<0>010 0101]
+  expect_failure(text, XML_ERROR_INVALID_TOKEN,
+                 "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
+
 START_TEST(test_bad_doctype_utf16) {
   const char text[] =
       /* <!DOCTYPE doc [ \x06f2 ]><doc/>
@@ -11870,6 +11977,7 @@ make_suite(void) {
   tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
   tcase_add_test(tc_basic, test_utf8_in_cdata_section);
   tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
+  tcase_add_test(tc_basic, test_utf8_in_start_tags);
   tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
   tcase_add_test(tc_basic, test_utf16_attribute);
   tcase_add_test(tc_basic, test_utf16_second_attr);
@@ -11878,6 +11986,7 @@ make_suite(void) {
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
   tcase_add_test(tc_basic, test_bad_doctype);
+  tcase_add_test(tc_basic, test_bad_doctype_utf8);
   tcase_add_test(tc_basic, test_bad_doctype_utf16);
   tcase_add_test(tc_basic, test_bad_doctype_plus);
   tcase_add_test(tc_basic, test_bad_doctype_star);
Fix multiple CVEs

CVE-2022-25236 expat: namespace-separator characters in "xmlns[:prefix]" attribute values can lead to arbitrary code execution
CVE-2022-25235 expat: malformed 2- and 3-byte UTF-8 sequences can lead to arbitrary code execution
CVE-2022-25315 expat: integer overflow in storeRawNames()

Resolves: CVE-2022-25236
Resolves: CVE-2022-25235
Resolves: CVE-2022-25315

2022-03-02 08:55:10 +00:00
			`From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001`
			`From: Sebastian Pipping <sebastian@pipping.org>`
			`Date: Tue, 8 Feb 2022 17:37:14 +0100`
			`Subject: [PATCH 1/5] lib: Drop unused macro UTF8_GET_NAMING`

			`---`
			`expat/lib/xmltok.c | 5 -----`
			`1 file changed, 5 deletions(-)`

			`diff --git a/lib/xmltok.c b/lib/xmltok.c`
			`index a72200e8..3bddf125 100644`
			`--- a/lib/xmltok.c`
			`+++ b/lib/xmltok.c`
			`@@ -98,11 +98,6 @@`
			`+ ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \`
			`& (1u << (((byte)[2]) & 0x1F)))`

			`-#define UTF8_GET_NAMING(pages, p, n) \`
			`- ((n) == 2 \`
			`- ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \`
			`- : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))`
			`-`
			`/* Detection of invalid UTF-8 sequences is based on Table 3.1B`
			`of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/`
			`with the additional restriction of not allowing the Unicode`

			`From 3f0a0cb644438d4d8e3294cd0b1245d0edb0c6c6 Mon Sep 17 00:00:00 2001`
			`From: Sebastian Pipping <sebastian@pipping.org>`
			`Date: Tue, 8 Feb 2022 04:32:20 +0100`
			`Subject: [PATCH 2/5] lib: Add missing validation of encoding (CVE-2022-25235)`

			`---`
			`expat/lib/xmltok_impl.c | 8 ++++++--`
			`1 file changed, 6 insertions(+), 2 deletions(-)`

			`diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c`
			`index 0430591b..64a3b2c1 100644`
			`--- a/lib/xmltok_impl.c`
			`+++ b/lib/xmltok_impl.c`
			`@@ -69,7 +69,7 @@`
			`case BT_LEAD##n: \`
			`if (end - ptr < n) \`
			`return XML_TOK_PARTIAL_CHAR; \`
			`- if (! IS_NAME_CHAR(enc, ptr, n)) { \`
			`+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \`
			`*nextTokPtr = ptr; \`
			`return XML_TOK_INVALID; \`
			`} \`
			`@@ -98,7 +98,7 @@`
			`case BT_LEAD##n: \`
			`if (end - ptr < n) \`
			`return XML_TOK_PARTIAL_CHAR; \`
			`- if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \`
			`+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \`
			`*nextTokPtr = ptr; \`
			`return XML_TOK_INVALID; \`
			`} \`
			`@@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,`
			`case BT_LEAD##n: \`
			`if (end - ptr < n) \`
			`return XML_TOK_PARTIAL_CHAR; \`
			`+ if (IS_INVALID_CHAR(enc, ptr, n)) { \`
			`+ *nextTokPtr = ptr; \`
			`+ return XML_TOK_INVALID; \`
			`+ } \`
			`if (IS_NMSTRT_CHAR(enc, ptr, n)) { \`
			`ptr += n; \`
			`tok = XML_TOK_NAME; \`

			`From c85a3025e7a1be086dc34e7559fbc543914d047f Mon Sep 17 00:00:00 2001`
			`From: Sebastian Pipping <sebastian@pipping.org>`
			`Date: Wed, 9 Feb 2022 01:00:38 +0100`
			`Subject: [PATCH 3/5] lib: Add comments to BT_LEAD* cases where encoding has`
			`already been validated`

			`---`
			`expat/lib/xmltok_impl.c | 10 +++++-----`
			`1 file changed, 5 insertions(+), 5 deletions(-)`

			`diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c`
			`index 64a3b2c1..84ff35f9 100644`
			`--- a/lib/xmltok_impl.c`
			`+++ b/lib/xmltok_impl.c`
			`@@ -1274,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,`
			`switch (BYTE_TYPE(enc, ptr)) {`
			`# define LEAD_CASE(n) \`
			`case BT_LEAD##n: \`
			`- ptr += n; \`
			`+ ptr += n; /* NOTE: The encoding has already been validated. */ \`
			`break;`
			`LEAD_CASE(2)`
			`LEAD_CASE(3)`
			`@@ -1343,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,`
			`switch (BYTE_TYPE(enc, ptr)) {`
			`# define LEAD_CASE(n) \`
			`case BT_LEAD##n: \`
			`- ptr += n; \`
			`+ ptr += n; /* NOTE: The encoding has already been validated. */ \`
			`break;`
			`LEAD_CASE(2)`
			`LEAD_CASE(3)`
			`@@ -1522,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,`
			`state = inName; \`
			`}`
			`# define LEAD_CASE(n) \`
			`- case BT_LEAD##n: \`
			`+ case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \`
			`START_NAME ptr += (n - MINBPC(enc)); \`
			`break;`
			`LEAD_CASE(2)`
			`@@ -1734,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {`
			`switch (BYTE_TYPE(enc, ptr)) {`
			`# define LEAD_CASE(n) \`
			`case BT_LEAD##n: \`
			`- ptr += n; \`
			`+ ptr += n; /* NOTE: The encoding has already been validated. */ \`
			`break;`
			`LEAD_CASE(2)`
			`LEAD_CASE(3)`
			`@@ -1779,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,`
			`switch (BYTE_TYPE(enc, ptr)) {`
			`# define LEAD_CASE(n) \`
			`case BT_LEAD##n: \`
			`- ptr += n; \`
			`+ ptr += n; /* NOTE: The encoding has already been validated. */ \`
			`pos->columnNumber++; \`
			`break;`
			`LEAD_CASE(2)`

			`From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001`
			`From: Sebastian Pipping <sebastian@pipping.org>`
			`Date: Tue, 8 Feb 2022 04:06:21 +0100`
			`Subject: [PATCH 4/5] tests: Cover missing validation of encoding`
			`(CVE-2022-25235)`

			`---`
			`expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++`
			`1 file changed, 109 insertions(+)`

			`diff --git a/tests/runtests.c b/tests/runtests.c`
			`index bc5344b1..9b155b82 100644`
			`--- a/tests/runtests.c`
			`+++ b/tests/runtests.c`
			`@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {`
			`}`
			`END_TEST`

			`+START_TEST(test_utf8_in_start_tags) {`
			`+ struct test_case {`
			`+ bool goodName;`
			`+ bool goodNameStart;`
			`+ const char *tagName;`
			`+ };`
			`+`
			`+ // The idea with the tests below is this:`
			`+ // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences`
			`+ // go to isNever and are hence not a concern.`
			`+ //`
			`+ // We start with a character that is a valid name character`
			`+ // (or even name-start character, see XML 1.0r4 spec) and then we flip`
			`+ // single bits at places where (1) the result leaves the UTF-8 encoding space`
			`+ // and (2) we stay in the same n-byte sequence family.`
			`+ //`
			`+ // The flipped bits are highlighted in angle brackets in comments,`
			`+ // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped`
			`+ // the most significant bit to 1 to leave UTF-8 encoding space.`
			`+ struct test_case cases[] = {`
			`+ // 1-byte UTF-8: [0xxx xxxx]`
			`+ {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'`
			`+ {false, false, "\xBA"}, // [<1>011 1010]`
			`+ {true, false, "\x39"}, // [0011 1001] = ASCII nine '9'`
			`+ {false, false, "\xB9"}, // [<1>011 1001]`
			`+`
			`+ // 2-byte UTF-8: [110x xxxx] [10xx xxxx]`
			`+ {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =`
			`+ // Arabic small waw U+06E5`
			`+ {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]`
			`+ {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]`
			`+ {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]`
			`+ {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =`
			`+ // combining char U+0301`
			`+ {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]`
			`+ {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]`
			`+ {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]`
			`+`
			`+ // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]`
			`+ {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =`
			`+ // Devanagari Letter A U+0905`
			`+ {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]`
			`+ {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]`
			`+ {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]`
			`+ {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]`
			`+ {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]`
			`+ {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =`
			`+ // combining char U+0901`
			`+ {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]`
			`+ {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]`
			`+ {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]`
			`+ {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]`
			`+ {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]`
			`+ };`
			`+ const bool atNameStart[] = {true, false};`
			`+`
			`+ size_t i = 0;`
			`+ char doc[1024];`
			`+ size_t failCount = 0;`
			`+`
			`+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) {`
			`+ size_t j = 0;`
			`+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {`
			`+ const bool expectedSuccess`
			`+ = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;`
			`+ sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);`
			`+ XML_Parser parser = XML_ParserCreate(NULL);`
			`+`
			`+ const enum XML_Status status`
			`+ = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);`
			`+`
			`+ bool success = true;`
			`+ if ((status == XML_STATUS_OK) != expectedSuccess) {`
			`+ success = false;`
			`+ }`
			`+ if ((status == XML_STATUS_ERROR)`
			`+ && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {`
			`+ success = false;`
			`+ }`
			`+`
			`+ if (! success) {`
			`+ fprintf(`
			`+ stderr,`
			`+ "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",`
			`+ (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",`
			`+ (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));`
			`+ failCount++;`
			`+ }`
			`+`
			`+ XML_ParserFree(parser);`
			`+ }`
			`+ }`
			`+`
			`+ if (failCount > 0) {`
			`+ fail("UTF-8 regression detected");`
			`+ }`
			`+}`
			`+END_TEST`
			`+`
			`/* Test trailing spaces in elements are accepted */`
			`static void XMLCALL`
			`record_element_end_handler(void *userData, const XML_Char *name) {`
			`@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {`
			`}`
			`END_TEST`

			`+START_TEST(test_bad_doctype_utf8) {`
			`+ const char *text = "<!DOCTYPE \xDB\x25"`
			`+ "doc><doc/>"; // [1101 1011] [<0>010 0101]`
			`+ expect_failure(text, XML_ERROR_INVALID_TOKEN,`
			`+ "Invalid UTF-8 in DOCTYPE not faulted");`
			`+}`
			`+END_TEST`
			`+`
			`START_TEST(test_bad_doctype_utf16) {`
			`const char text[] =`
			`/* <!DOCTYPE doc [ \x06f2 ]><doc/>`
			`@@ -11870,6 +11977,7 @@ make_suite(void) {`
			`tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);`
			`tcase_add_test(tc_basic, test_utf8_in_cdata_section);`
			`tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);`
			`+ tcase_add_test(tc_basic, test_utf8_in_start_tags);`
			`tcase_add_test(tc_basic, test_trailing_spaces_in_elements);`
			`tcase_add_test(tc_basic, test_utf16_attribute);`
			`tcase_add_test(tc_basic, test_utf16_second_attr);`
			`@@ -11878,6 +11986,7 @@ make_suite(void) {`
			`tcase_add_test(tc_basic, test_bad_attr_desc_keyword);`
			`tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);`
			`tcase_add_test(tc_basic, test_bad_doctype);`
			`+ tcase_add_test(tc_basic, test_bad_doctype_utf8);`
			`tcase_add_test(tc_basic, test_bad_doctype_utf16);`
			`tcase_add_test(tc_basic, test_bad_doctype_plus);`
			`tcase_add_test(tc_basic, test_bad_doctype_star);`