From 583bb044473317f3ee9e36e0f773851d0852eb18 Mon Sep 17 00:00:00 2001
From: Tomas Korbar
Date: Mon, 25 Mar 2024 14:53:28 +0100
Subject: [PATCH] CVE-2023-52425 expat: parsing large tokens can trigger a
denial of service
Resolves: RHEL-29321
---
expat-2.2.5-CVE-2023-52425.patch | 1393 ++++++++++++++++++++++++++++++
expat.spec | 19 +-
2 files changed, 1411 insertions(+), 1 deletion(-)
create mode 100644 expat-2.2.5-CVE-2023-52425.patch
diff --git a/expat-2.2.5-CVE-2023-52425.patch b/expat-2.2.5-CVE-2023-52425.patch
new file mode 100644
index 0000000..564b01f
--- /dev/null
+++ b/expat-2.2.5-CVE-2023-52425.patch
@@ -0,0 +1,1393 @@
+commit d9904191c90476ed039ce9d33aee9ef56c807f8e
+Author: Tomas Korbar
+Date: Mon Mar 25 14:25:24 2024 +0100
+
+ Fix CVE-2023-52425
+
+ upstream PR #789
+
+diff --git a/expat/Makefile.am b/expat/Makefile.am
+index 5ed9ac4..0e3181d 100644
+--- a/expat/Makefile.am
++++ b/expat/Makefile.am
+@@ -120,6 +120,11 @@ buildlib:
+ run-benchmark:
+ $(MAKE) -C tests/benchmark
+ ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/recset.xml 65535 3
++ ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_attr.xml 4096 3
++ ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_cdata.xml 4096 3
++ ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_comment.xml 4096 3
++ ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_tag.xml 4096 3
++ ./run.sh tests/benchmark/benchmark@EXEEXT@ -n $(top_srcdir)/../testdata/largefiles/aaaaaa_text.xml 4096 3
+
+ tests/xmlts.zip:
+ if test "$(XMLTS_ZIP)" = ""; then \
+diff --git a/expat/doc/reference.html b/expat/doc/reference.html
+index efc19f4..95c33c7 100644
+--- a/expat/doc/reference.html
++++ b/expat/doc/reference.html
+@@ -1996,6 +1996,27 @@ parse position may be before the beginning of the buffer.
+ return NULL.
+
+
++<h4 id="XML_SetReparseDeferralEnabled">XML_SetReparseDeferralEnabled</h4>
++<pre class="fcndec">
++/* Added in Expat 2.6.0. */
++XML_Bool XMLCALL
++XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
++</pre>
++<div class="fcndef">
++ <p>
++ Large tokens may require many parse calls before enough data is available for Expat to parse it in full.
++ If Expat retried parsing the token on every parse call, parsing could take quadratic time.
++ To avoid this, Expat only retries once a significant amount of new data is available.
++ This function allows disabling this behavior.
++ </p>
++ <p>
++ The <code>enabled</code> argument should be <code>XML_TRUE</code> or <code>XML_FALSE</code>.
++ </p>
++ <p>
++ Returns <code>XML_TRUE</code> on success, and <code>XML_FALSE</code> on error.
++ </p>
++</div>
++
+
+
+ The functions in this section either obtain state information from
+diff --git a/expat/doc/xmlwf.xml b/expat/doc/xmlwf.xml
+index 5e2a4ae..6b719eb 100644
+--- a/expat/doc/xmlwf.xml
++++ b/expat/doc/xmlwf.xml
+@@ -246,6 +246,16 @@ supports both.
+
+
+
++
++ -q
++
++
++ Disable reparse deferral, and allow quadratic parse runtime
++ on large tokens (default: reparse deferral enabled).
++
++
++
++
+
+ -r
+
+diff --git a/expat/lib/.gitignore b/expat/lib/.gitignore
+index 9c9cf88..cd5b24f 100644
+--- a/expat/lib/.gitignore
++++ b/expat/lib/.gitignore
+@@ -1,7 +1,6 @@
+ Makefile
+ .libs
+ *.lo
+-expat.h
+ Debug
+ Debug-w
+ Release
+diff --git a/expat/lib/expat.h b/expat/lib/expat.h
+index 1f608c0..afe12c5 100644
+--- a/expat/lib/expat.h
++++ b/expat/lib/expat.h
+@@ -1071,6 +1071,10 @@ XMLPARSEAPI(const XML_Feature *)
+ XML_GetFeatureList(void);
+
+
++/* Added in Expat 2.6.0. */
++XMLPARSEAPI(XML_Bool)
++XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled);
++
+ /* Expat follows the semantic versioning convention.
+ See http://semver.org.
+ */
+diff --git a/expat/lib/internal.h b/expat/lib/internal.h
+index e33fdcb..78b5bc1 100644
+--- a/expat/lib/internal.h
++++ b/expat/lib/internal.h
+@@ -109,6 +109,7 @@
+ # endif
+ #endif
+
++#include "expat.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+@@ -119,6 +120,9 @@ void
+ _INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef);
+
+
++extern XML_Bool g_reparseDeferralEnabledDefault; // written ONLY in runtests.c
++extern unsigned int g_parseAttempts; // used for testing only
++
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/expat/lib/libexpat.def b/expat/lib/libexpat.def
+index d08f5b7..163870b 100644
+--- a/expat/lib/libexpat.def
++++ b/expat/lib/libexpat.def
+@@ -75,4 +75,5 @@ EXPORTS
+ ; XML_GetAttributeInfo @66
+ XML_SetHashSalt @67@
+ ; added with version 2.2.5
+- _INTERNAL_trim_to_complete_utf8_characters @68@
+\ No newline at end of file
++ _INTERNAL_trim_to_complete_utf8_characters @68@
++ XML_SetReparseDeferralEnabled @69
+diff --git a/expat/lib/xmlparse.c b/expat/lib/xmlparse.c
+index 3f765f7..488f63f 100644
+--- a/expat/lib/xmlparse.c
++++ b/expat/lib/xmlparse.c
+@@ -34,6 +34,7 @@
+ # define _GNU_SOURCE 1 /* syscall prototype */
+ #endif
+
++#include <stdbool.h>
+ #include <stddef.h>
+ #include <string.h> /* memset(), memcpy() */
+ #include <assert.h>
+@@ -173,6 +174,8 @@ typedef char ICHAR;
+ #endif /* HAVE_BCOPY */
+ #endif /* HAVE_MEMMOVE */
+
++#define EXPAT_MIN(a, b) (((a) < (b)) ? (a) : (b))
++
+ #include "internal.h"
+ #include "xmltok.h"
+ #include "xmlrole.h"
+@@ -544,6 +547,9 @@ parserInit(XML_Parser parser, const XML_Char *encodingName);
+ ? 0 \
+ : ((*((pool)->ptr)++ = c), 1))
+
++XML_Bool g_reparseDeferralEnabledDefault = XML_TRUE; // write ONLY in runtests.c
++unsigned int g_parseAttempts = 0; // used for testing only
++
+ struct XML_ParserStruct {
+ /* The first member must be m_userData so that the XML_GetUserData
+ macro works. */
+@@ -559,6 +565,9 @@ struct XML_ParserStruct {
+ const char *m_bufferLim;
+ XML_Index m_parseEndByteIndex;
+ const char *m_parseEndPtr;
++ size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
++ XML_Bool m_reparseDeferralEnabled;
++ int m_lastBufferRequestSize;
+ XML_Char *m_dataBuf;
+ XML_Char *m_dataBufEnd;
+ XML_StartElementHandler m_startElementHandler;
+@@ -892,6 +901,48 @@ get_hash_secret_salt(XML_Parser parser) {
+ return parser->m_hash_secret_salt;
+ }
+
++static enum XML_Error
++callProcessor(XML_Parser parser, const char *start, const char *end,
++ const char **endPtr) {
++ const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start);
++
++ if (parser->m_reparseDeferralEnabled
++ && ! parser->m_parsingStatus.finalBuffer) {
++ // Heuristic: don't try to parse a partial token again until the amount of
++ // available data has increased significantly.
++ const size_t had_before = parser->m_partialTokenBytesBefore;
++ // ...but *do* try anyway if we're close to causing a reallocation.
++ size_t available_buffer
++ = EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer);
++#ifdef XML_CONTEXT_BYTES
++ available_buffer -= EXPAT_MIN(available_buffer, XML_CONTEXT_BYTES);
++#endif
++ available_buffer
++ += EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd);
++ // m_lastBufferRequestSize is never assigned a value < 0, so the cast is ok
++ const bool enough
++ = (have_now >= 2 * had_before)
++ || ((size_t)parser->m_lastBufferRequestSize > available_buffer);
++
++ if (! enough) {
++ *endPtr = start; // callers may expect this to be set
++ return XML_ERROR_NONE;
++ }
++ }
++ g_parseAttempts += 1;
++ const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr);
++ if (ret == XML_ERROR_NONE) {
++ // if we consumed nothing, remember what we had on this parse attempt.
++ if (*endPtr == start) {
++ parser->m_partialTokenBytesBefore = have_now;
++ } else {
++ parser->m_partialTokenBytesBefore = 0;
++ }
++ }
++ return ret;
++}
++
++
+ static XML_Bool /* only valid for root parser */
+ startParsing(XML_Parser parser)
+ {
+@@ -1078,6 +1129,9 @@ parserInit(XML_Parser parser, const XML_Char *encodingName)
+ parser->m_bufferEnd = parser->m_buffer;
+ parser->m_parseEndByteIndex = 0;
+ parser->m_parseEndPtr = NULL;
++ parser->m_partialTokenBytesBefore = 0;
++ parser->m_reparseDeferralEnabled = g_reparseDeferralEnabledDefault;
++ parser->m_lastBufferRequestSize = 0;
+ parser->m_declElementType = NULL;
+ parser->m_declAttributeId = NULL;
+ parser->m_declEntity = NULL;
+@@ -1239,6 +1293,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser,
+ to worry which hash secrets each table has.
+ */
+ unsigned long oldhash_secret_salt;
++ XML_Bool oldReparseDeferralEnabled;
+
+ /* Validate the oldParser parameter before we pull everything out of it */
+ if (oldParser == NULL)
+@@ -1283,6 +1338,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser,
+ to worry which hash secrets each table has.
+ */
+ oldhash_secret_salt = parser->m_hash_secret_salt;
++ oldReparseDeferralEnabled = parser->m_reparseDeferralEnabled;
+
+ #ifdef XML_DTD
+ if (!context)
+@@ -1336,6 +1392,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser,
+ parser->m_defaultExpandInternalEntities = oldDefaultExpandInternalEntities;
+ parser->m_ns_triplets = oldns_triplets;
+ parser->m_hash_secret_salt = oldhash_secret_salt;
++ parser->m_reparseDeferralEnabled = oldReparseDeferralEnabled;
+ parser->m_parentParser = oldParser;
+ #ifdef XML_DTD
+ parser->m_paramEntityParsing = oldParamEntityParsing;
+@@ -1833,52 +1890,8 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal)
+ parser->m_parsingStatus.parsing = XML_PARSING;
+ }
+
+- if (len == 0) {
+- parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
+- if (!isFinal)
+- return XML_STATUS_OK;
+- parser->m_positionPtr = parser->m_bufferPtr;
+- parser->m_parseEndPtr = parser->m_bufferEnd;
+-
+- /* If data are left over from last buffer, and we now know that these
+- data are the final chunk of input, then we have to check them again
+- to detect errors based on that fact.
+- */
+- parser->m_errorCode = parser->m_processor(parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);
+-
+- if (parser->m_errorCode == XML_ERROR_NONE) {
+- switch (parser->m_parsingStatus.parsing) {
+- case XML_SUSPENDED:
+- /* It is hard to be certain, but it seems that this case
+- * cannot occur. This code is cleaning up a previous parse
+- * with no new data (since len == 0). Changing the parsing
+- * state requires getting to execute a handler function, and
+- * there doesn't seem to be an opportunity for that while in
+- * this circumstance.
+- *
+- * Given the uncertainty, we retain the code but exclude it
+- * from coverage tests.
+- *
+- * LCOV_EXCL_START
+- */
+- XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr, parser->m_bufferPtr, &parser->m_position);
+- parser->m_positionPtr = parser->m_bufferPtr;
+- return XML_STATUS_SUSPENDED;
+- /* LCOV_EXCL_STOP */
+- case XML_INITIALIZED:
+- case XML_PARSING:
+- parser->m_parsingStatus.parsing = XML_FINISHED;
+- /* fall through */
+- default:
+- return XML_STATUS_OK;
+- }
+- }
+- parser->m_eventEndPtr = parser->m_eventPtr;
+- parser->m_processor = errorProcessor;
+- return XML_STATUS_ERROR;
+- }
+ #ifndef XML_CONTEXT_BYTES
+- else if (parser->m_bufferPtr == parser->m_bufferEnd) {
++ if (parser->m_bufferPtr == parser->m_bufferEnd) {
+ const char *end;
+ int nLeftOver;
+ enum XML_Status result;
+@@ -1893,7 +1906,7 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal)
+ parser->m_positionPtr = s;
+ parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
+
+- parser->m_errorCode = parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end);
++ parser->m_errorCode = callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end);
+
+ if (parser->m_errorCode != XML_ERROR_NONE) {
+ parser->m_eventEndPtr = parser->m_eventPtr;
+@@ -1920,22 +1933,25 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal)
+ XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr, end, &parser->m_position);
+ nLeftOver = s + len - end;
+ if (nLeftOver) {
+- if (parser->m_buffer == NULL || nLeftOver > parser->m_bufferLim - parser->m_buffer) {
+- /* avoid _signed_ integer overflow */
+- char *temp = NULL;
+- const int bytesToAllocate = (int)((unsigned)len * 2U);
+- if (bytesToAllocate > 0) {
+- temp = (char *)REALLOC(parser, parser->m_buffer, bytesToAllocate);
+- }
+- if (temp == NULL) {
+- parser->m_errorCode = XML_ERROR_NO_MEMORY;
+- parser->m_eventPtr = parser->m_eventEndPtr = NULL;
+- parser->m_processor = errorProcessor;
+- return XML_STATUS_ERROR;
+- }
+- parser->m_buffer = temp;
+- parser->m_bufferLim = parser->m_buffer + bytesToAllocate;
++ // Back up and restore the parsing status to avoid XML_ERROR_SUSPENDED
++ // (and XML_ERROR_FINISHED) from XML_GetBuffer.
++ const enum XML_Parsing originalStatus = parser->m_parsingStatus.parsing;
++ parser->m_parsingStatus.parsing = XML_PARSING;
++ void *const temp = XML_GetBuffer(parser, nLeftOver);
++ parser->m_parsingStatus.parsing = originalStatus;
++ // GetBuffer may have overwritten this, but we want to remember what the
++ // app requested, not how many bytes were left over after parsing.
++ parser->m_lastBufferRequestSize = len;
++ if (temp == NULL) {
++ // NOTE: parser->m_errorCode has already been set by XML_GetBuffer().
++ parser->m_eventPtr = parser->m_eventEndPtr = NULL;
++ parser->m_processor = errorProcessor;
++ return XML_STATUS_ERROR;
+ }
++ // Since we know that the buffer was empty and XML_CONTEXT_BYTES is 0, we
++ // don't have any data to preserve, and can copy straight into the start
++ // of the buffer rather than the GetBuffer return pointer (which may be
++ // pointing further into the allocated buffer).
+ memcpy(parser->m_buffer, end, nLeftOver);
+ }
+ parser->m_bufferPtr = parser->m_buffer;
+@@ -1947,15 +1963,14 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal)
+ return result;
+ }
+ #endif /* not defined XML_CONTEXT_BYTES */
+- else {
+- void *buff = XML_GetBuffer(parser, len);
+- if (buff == NULL)
+- return XML_STATUS_ERROR;
+- else {
+- memcpy(buff, s, len);
+- return XML_ParseBuffer(parser, len, isFinal);
+- }
++ void *buff = XML_GetBuffer(parser, len);
++ if (buff == NULL)
++ return XML_STATUS_ERROR;
++ if (len > 0) {
++ assert(s != NULL); // make sure s==NULL && len!=0 was rejected above
++ memcpy(buff, s, len);
+ }
++ return XML_ParseBuffer(parser, len, isFinal);
+ }
+
+ enum XML_Status XMLCALL
+@@ -1989,7 +2004,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal)
+ parser->m_parseEndByteIndex += len;
+ parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
+
+- parser->m_errorCode = parser->m_processor(parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);
++ parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr,
++ &parser->m_bufferPtr);
+
+ if (parser->m_errorCode != XML_ERROR_NONE) {
+ parser->m_eventEndPtr = parser->m_eventPtr;
+@@ -2035,8 +2051,14 @@ XML_GetBuffer(XML_Parser parser, int len)
+ default: ;
+ }
+
+- if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)) {
++ // whether or not the request succeeds, `len` seems to be the app's preferred
++ // buffer fill size; remember it.
++ parser->m_lastBufferRequestSize = len;
++ if (len > EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferEnd)
++ || parser->m_buffer == NULL) {
++#ifdef XML_CONTEXT_BYTES
+ int keep;
++#endif /* XML_CONTEXT_BYTES > 0 */
+ /* Do not invoke signed arithmetic overflow: */
+ int neededSize = (int)((unsigned)len
+ + (unsigned)EXPAT_SAFE_PTR_DIFF(
+@@ -2055,8 +2077,9 @@ XML_GetBuffer(XML_Parser parser, int len)
+ return NULL;
+ }
+ neededSize += keep;
+- if (neededSize
+- <= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) {
++ if (parser->m_buffer && parser->m_bufferPtr
++ && neededSize
++ <= EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer)) {
+ #ifdef XML_CONTEXT_BYTES
+ if (keep < EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer)) {
+ int offset
+@@ -2070,19 +2093,17 @@ XML_GetBuffer(XML_Parser parser, int len)
+ parser->m_bufferPtr -= offset;
+ }
+ #else
+- if (parser->m_buffer && parser->m_bufferPtr) {
+- memmove(parser->m_buffer, parser->m_bufferPtr,
+- EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr));
+- parser->m_bufferEnd
+- = parser->m_buffer
+- + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr);
+- parser->m_bufferPtr = parser->m_buffer;
+- }
+-#endif /* not defined XML_CONTEXT_BYTES */
++ memmove(parser->m_buffer, parser->m_bufferPtr,
++ EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr));
++ parser->m_bufferEnd
++ = parser->m_buffer
++ + EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr);
++ parser->m_bufferPtr = parser->m_buffer;
++#endif /* XML_CONTEXT_BYTES > 0 */
+ } else {
+ char *newBuf;
+ int bufferSize
+- = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_bufferPtr);
++ = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferLim, parser->m_buffer);
+ if (bufferSize == 0)
+ bufferSize = INIT_BUFFER_SIZE;
+ do {
+@@ -2099,7 +2120,7 @@ XML_GetBuffer(XML_Parser parser, int len)
+ return NULL;
+ }
+ parser->m_bufferLim = newBuf + bufferSize;
+-#ifdef XML_CONTEXT_BYTES
++#if XML_CONTEXT_BYTES > 0
+ if (parser->m_bufferPtr) {
+ memcpy(newBuf, &parser->m_bufferPtr[-keep],
+ EXPAT_SAFE_PTR_DIFF(parser->m_bufferEnd, parser->m_bufferPtr)
+@@ -2182,7 +2203,7 @@ XML_ResumeParser(XML_Parser parser)
+ }
+ parser->m_parsingStatus.parsing = XML_PARSING;
+
+- parser->m_errorCode = parser->m_processor(parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);
++ parser->m_errorCode = callProcessor(parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);
+
+ if (parser->m_errorCode != XML_ERROR_NONE) {
+ parser->m_eventEndPtr = parser->m_eventPtr;
+@@ -2504,6 +2525,15 @@ XML_GetFeatureList(void)
+ return features;
+ }
+
++XML_Bool XMLCALL
++XML_SetReparseDeferralEnabled(XML_Parser parser, XML_Bool enabled) {
++ if (parser != NULL && (enabled == XML_TRUE || enabled == XML_FALSE)) {
++ parser->m_reparseDeferralEnabled = enabled;
++ return XML_TRUE;
++ }
++ return XML_FALSE;
++}
++
+ /* Initially tag->rawName always points into the parse buffer;
+ for those TAG instances opened while the current parse buffer was
+ processed, and not yet closed, we need to store tag->rawName in a more
+@@ -4440,16 +4470,17 @@ entityValueInitProcessor(XML_Parser parser,
+ parser->m_processor = entityValueProcessor;
+ return entityValueProcessor(parser, next, end, nextPtr);
+ }
+- /* If we are at the end of the buffer, this would cause XmlPrologTok to
+- return XML_TOK_NONE on the next call, which would then cause the
+- function to exit with *nextPtr set to s - that is what we want for other
+- tokens, but not for the BOM - we would rather like to skip it;
+- then, when this routine is entered the next time, XmlPrologTok will
+- return XML_TOK_INVALID, since the BOM is still in the buffer
+- */
+- else if (tok == XML_TOK_BOM && next == end && !parser->m_parsingStatus.finalBuffer) {
++ /* XmlPrologTok has now set the encoding based on the BOM it found, and we
++ must move s and nextPtr forward to consume the BOM.
++
++ If we didn't, and got XML_TOK_NONE from the next XmlPrologTok call, we
++ would leave the BOM in the buffer and return. On the next call to this
++ function, our XmlPrologTok call would return XML_TOK_INVALID, since it
++ is not valid to have multiple BOMs.
++ */
++ else if (tok == XML_TOK_BOM) {
+ *nextPtr = next;
+- return XML_ERROR_NONE;
++ s = next;
+ }
+ /* If we get this token, we have the start of what might be a
+ normal tag, but not a declaration (i.e. it doesn't begin with
+diff --git a/expat/tests/minicheck.c b/expat/tests/minicheck.c
+index be1e37e..6c694a0 100644
+--- a/expat/tests/minicheck.c
++++ b/expat/tests/minicheck.c
+@@ -209,6 +209,21 @@ srunner_run_all(SRunner *runner, int verbosity)
+ }
+ }
+
++void
++_fail(const char *file, int line, const char *msg) {
++ /* Record the failure location, then always print the error message so it
++ isn't lost: we have a failure, so there's no reason to be quiet about it.
++ A NULL or empty msg is tolerated (no read before the start of the string).
++ */
++ _check_current_filename = file;
++ _check_current_lineno = line;
++ if (msg != NULL) {
++ const int has_newline = (msg[0] != '\0' && msg[strlen(msg) - 1] == '\n');
++ fprintf(stderr, "ERROR: %s%s", msg, has_newline ? "" : "\n");
++ }
++ longjmp(env, 1);
++}
++
+ void
+ _fail_unless(int UNUSED_P(condition), const char *UNUSED_P(file), int UNUSED_P(line), const char *msg)
+ {
+diff --git a/expat/tests/minicheck.h b/expat/tests/minicheck.h
+index a2f57dd..894895a 100644
+--- a/expat/tests/minicheck.h
++++ b/expat/tests/minicheck.h
+@@ -60,7 +60,13 @@ extern "C" {
+ {
+ #define END_TEST } }
+
+-#define fail(msg) _fail_unless(0, __FILE__, __LINE__, msg)
++# define fail(msg) _fail(__FILE__, __LINE__, msg)
++# define assert_true(cond) \
++ do { \
++ if (! (cond)) { \
++ _fail(__FILE__, __LINE__, "check failed: " #cond); \
++ } \
++ } while (0)
+
+ typedef void (*tcase_setup_function)(void);
+ typedef void (*tcase_teardown_function)(void);
+@@ -101,6 +107,11 @@ void _check_set_test_info(char const *function,
+ * Prototypes for the actual implementation.
+ */
+
++# if defined(__GNUC__)
++__attribute__((noreturn))
++# endif
++void
++_fail(const char *file, int line, const char *msg);
+ void _fail_unless(int condition, const char *file, int line, const char *msg);
+ Suite *suite_create(const char *name);
+ TCase *tcase_create(const char *name);
+diff --git a/expat/tests/runtests.c b/expat/tests/runtests.c
+index f58f794..486073f 100644
+--- a/expat/tests/runtests.c
++++ b/expat/tests/runtests.c
+@@ -46,6 +46,7 @@
+ #include <stddef.h> /* ptrdiff_t */
+ #include <ctype.h>
+ #include <limits.h>
++#include <time.h>
+
+ #if ! defined(__cplusplus)
+ # if defined(_MSC_VER) && (_MSC_VER <= 1700)
+@@ -1112,7 +1113,7 @@ START_TEST(test_column_number_after_parse)
+ const char *text = "<tag></tag>";
+ XML_Size colno;
+
+- if (_XML_Parse_SINGLE_BYTES(parser, text, strlen(text), XML_FALSE) == XML_STATUS_ERROR)
++ if (_XML_Parse_SINGLE_BYTES(parser, text, strlen(text), XML_TRUE) == XML_STATUS_ERROR)
+ xml_failure(parser);
+ colno = XML_GetCurrentColumnNumber(parser);
+ if (colno != 11) {
+@@ -2769,7 +2770,7 @@ START_TEST(test_default_current)
+ if (_XML_Parse_SINGLE_BYTES(parser, text, strlen(text),
+ XML_TRUE) == XML_STATUS_ERROR)
+ xml_failure(parser);
+- CharData_CheckXMLChars(&storage, XCS("DCDCDCDCDCDD"));
++ CharData_CheckXMLChars(&storage, XCS("DCDCDCDD"));
+
+ /* Again, without the defaulting */
+ XML_ParserReset(parser, NULL);
+@@ -2780,7 +2781,7 @@ START_TEST(test_default_current)
+ if (_XML_Parse_SINGLE_BYTES(parser, text, strlen(text),
+ XML_TRUE) == XML_STATUS_ERROR)
+ xml_failure(parser);
+- CharData_CheckXMLChars(&storage, XCS("DcccccD"));
++ CharData_CheckXMLChars(&storage, XCS("DcccD"));
+
+ /* Now with an internal entity to complicate matters */
+ XML_ParserReset(parser, NULL);
+@@ -4216,6 +4217,19 @@ END_TEST
+ #endif // defined(XML_CONTEXT_BYTES)
+
+
++START_TEST(test_getbuffer_allocates_on_zero_len) {
++ for (int first_len = 1; first_len >= 0; first_len--) {
++ XML_Parser parser = XML_ParserCreate(NULL);
++ assert_true(parser != NULL);
++ assert_true(XML_GetBuffer(parser, first_len) != NULL);
++ assert_true(XML_GetBuffer(parser, 0) != NULL);
++ if (XML_ParseBuffer(parser, 0, XML_FALSE) != XML_STATUS_OK)
++ xml_failure(parser);
++ XML_ParserFree(parser);
++ }
++}
++END_TEST
++
+ /* Test position information macros */
+ START_TEST(test_byte_info_at_end)
+ {
+@@ -6747,6 +6761,12 @@ START_TEST(test_utf8_in_start_tags) {
+ char doc[1024];
+ size_t failCount = 0;
+
++ // we need all the bytes to be parsed, but we don't want the errors that can
++ // trigger on isFinal=XML_TRUE, so we skip the test if the heuristic is on.
++ if (g_reparseDeferralEnabledDefault) {
++ return;
++ }
++
+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ size_t j = 0;
+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+@@ -7352,6 +7372,609 @@ START_TEST(test_empty_element_abort)
+ }
+ END_TEST
+
++/* Regression test for quadratic parsing on large tokens */
++START_TEST(test_big_tokens_take_linear_time) {
++ const char *const too_slow_failure_message
++ = "Compared to the baseline runtime of the first test, this test has a "
++ "slowdown of more than <max_slowdown>. "
++ "Please keep increasing the value by 1 until it reliably passes the "
++ "test on your hardware and open a bug sharing that number with us. "
++ "Thanks in advance!";
++ const struct {
++ const char *pre;
++ const char *post;
++ } text[] = {
++ {"<a>", "</a>"}, // assumed good, used as baseline
++ {"<b><![CDATA[ value: ", " ]]></b>"}, // CDATA, performed OK before patch
++ {"<c attr='", "'></c>"}, // big attribute, used to be O(N²)
++ {"<d><!-- ", " --></d>"}, // long comment, used to be O(N²)
++ {"<e><", "/></e>"}, // big elem name, used to be O(N²)
++ };
++ const int num_cases = sizeof(text) / sizeof(text[0]);
++ // For the test we need a value that is:
++ // (1) big enough that the test passes reliably (avoiding flaky tests), and
++ // (2) small enough that the test actually catches regressions.
++ const int max_slowdown = 15;
++ char aaaaaa[4096];
++ const int fillsize = (int)sizeof(aaaaaa);
++ const int fillcount = 100;
++
++ memset(aaaaaa, 'a', fillsize);
++
++ if (! g_reparseDeferralEnabledDefault) {
++ return; // heuristic is disabled; we would get O(n^2) and fail.
++ }
++#if defined(_WIN32)
++ if (CLOCKS_PER_SEC < 100000) {
++ // Skip this test if clock() doesn't have reasonably good resolution.
++ // This workaround is only applied to Windows targets, since XSI requires
++ // the value to be 1 000 000 (10x the condition here), and we want to be
++ // very sure that at least one platform in CI can catch regressions.
++ return;
++ }
++#endif
++
++ clock_t baseline = 0;
++ for (int i = 0; i < num_cases; ++i) {
++ XML_Parser parser = XML_ParserCreate(NULL);
++ assert_true(parser != NULL);
++ enum XML_Status status;
++ const clock_t start = clock();
++
++ // parse the start text
++ status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre,
++ (int)strlen(text[i].pre), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ // parse lots of 'a', failing the test early if it takes too long
++ for (int f = 0; f < fillcount; ++f) {
++ status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ // i == 0 means we're still calculating the baseline value
++ if (i > 0) {
++ const clock_t now = clock();
++ const clock_t clocks_so_far = now - start;
++ const int slowdown = clocks_so_far / baseline;
++ if (slowdown >= max_slowdown) {
++ fprintf(
++ stderr,
++ "fill#%d: clocks_so_far=%d baseline=%d slowdown=%d max_slowdown=%d\n",
++ f, (int)clocks_so_far, (int)baseline, slowdown, max_slowdown);
++ fail(too_slow_failure_message);
++ }
++ }
++ }
++ // parse the end text
++ status = _XML_Parse_SINGLE_BYTES(parser, text[i].post,
++ (int)strlen(text[i].post), XML_TRUE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++
++ // how long did it take in total?
++ const clock_t end = clock();
++ const clock_t taken = end - start;
++ if (i == 0) {
++ assert_true(taken > 0); // just to make sure we don't div-by-0 later
++ baseline = taken;
++ }
++ const int slowdown = taken / baseline;
++ if (slowdown >= max_slowdown) {
++ fprintf(stderr, "taken=%d baseline=%d slowdown=%d max_slowdown=%d\n",
++ (int)taken, (int)baseline, slowdown, max_slowdown);
++ fail(too_slow_failure_message);
++ }
++
++ XML_ParserFree(parser);
++ }
++}
++END_TEST
++
++START_TEST(test_set_reparse_deferral) {
++ const char *const pre = "<d>";
++ const char *const start = "<x attr='";
++ const char *const end = "'></x>";
++ char eeeeee[100];
++ const int fillsize = (int)sizeof(eeeeee);
++ memset(eeeeee, 'e', fillsize);
++
++ for (int enabled = 0; enabled <= 1; enabled += 1) {
++
++ XML_Parser parser = XML_ParserCreate(NULL);
++ assert_true(parser != NULL);
++ assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
++ // pre-grow the buffer to avoid reparsing due to almost-fullness
++ assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
++
++ CharData storage;
++ CharData_Init(&storage);
++ XML_SetUserData(parser, &storage);
++ XML_SetStartElementHandler(parser, start_element_event_handler);
++
++ enum XML_Status status;
++ // parse the start text
++ status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done
++
++ // ..and the start of the token
++ status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ CharData_CheckXMLChars(&storage, XCS("d")); // still just the first one
++
++ // try to parse lots of 'e', but the token isn't finished
++ for (int c = 0; c < 100; ++c) {
++ status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ }
++ CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one
++
++ // end the token.
++ status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++
++ if (enabled) {
++ // In general, we may need to push more data to trigger a reparse attempt,
++ // but in this test, the data is constructed to always require it.
++ CharData_CheckXMLChars(&storage, XCS("d")); // or the test is incorrect
++ // 2x the token length should suffice; the +1 covers the start and end.
++ for (int c = 0; c < 101; ++c) {
++ status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ }
++ }
++ CharData_CheckXMLChars(&storage, XCS("dx")); // the <x> should be done
++
++ XML_ParserFree(parser);
++ }
++}
++END_TEST
++
++struct element_decl_data {
++ XML_Parser parser;
++ int count;
++};
++
++static void
++element_decl_counter(void *userData, const XML_Char *UNUSED_P(name), XML_Content *model) {
++ struct element_decl_data *testdata = (struct element_decl_data *)userData;
++ testdata->count += 1;
++ XML_FreeContentModel(testdata->parser, model);
++}
++
++static int
++external_inherited_parser(XML_Parser p, const XML_Char *context,
++ const XML_Char *UNUSED_P(base), const XML_Char *UNUSED_P(systemId),
++ const XML_Char *UNUSED_P(publicId)) {
++ const char *const pre = "<!ELEMENT document ANY>\n";
++ const char *const start = "<!ELEMENT ";
++ const char *const end = " ANY>\n";
++ const char *const post = "<!ELEMENT p ANY>\n";
++ const int enabled = *(int *)XML_GetUserData(p);
++ char eeeeee[100];
++ char spaces[100];
++ const int fillsize = (int)sizeof(eeeeee);
++ assert_true(fillsize == (int)sizeof(spaces));
++ memset(eeeeee, 'e', fillsize);
++ memset(spaces, ' ', fillsize);
++
++ XML_Parser parser = XML_ExternalEntityParserCreate(p, context, NULL);
++ assert_true(parser != NULL);
++ // pre-grow the buffer to avoid reparsing due to almost-fullness
++ assert_true(XML_GetBuffer(parser, fillsize * 10103) != NULL);
++
++ struct element_decl_data testdata;
++ testdata.parser = parser;
++ testdata.count = 0;
++ XML_SetUserData(parser, &testdata);
++ XML_SetElementDeclHandler(parser, element_decl_counter);
++
++ enum XML_Status status;
++ // parse the initial text
++ status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ assert_true(testdata.count == 1); // first element should be done
++
++ // ..and the start of the big token
++ status = XML_Parse(parser, start, (int)strlen(start), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ assert_true(testdata.count == 1); // still just the first one
++
++ // try to parse lots of 'e', but the token isn't finished
++ for (int c = 0; c < 100; ++c) {
++ status = XML_Parse(parser, eeeeee, fillsize, XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ }
++ assert_true(testdata.count == 1); // *still* just the first one
++
++ // end the big token.
++ status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++
++ if (enabled) {
++ // In general, we may need to push more data to trigger a reparse attempt,
++ // but in this test, the data is constructed to always require it.
++ assert_true(testdata.count == 1); // or the test is incorrect
++ // 2x the token length should suffice; the +1 covers the start and end.
++ for (int c = 0; c < 101; ++c) {
++ status = XML_Parse(parser, spaces, fillsize, XML_FALSE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ }
++ }
++ assert_true(testdata.count == 2); // the big token should be done
++
++ // parse the final text
++ status = XML_Parse(parser, post, (int)strlen(post), XML_TRUE);
++ if (status != XML_STATUS_OK) {
++ xml_failure(parser);
++ }
++ assert_true(testdata.count == 3); // after isFinal=XML_TRUE, all must be done
++
++ XML_ParserFree(parser);
++ return XML_STATUS_OK;
++}
++
++START_TEST(test_reparse_deferral_is_inherited) {
++  // DOCTYPE with a SYSTEM id, so that the external entity handler gets called
++  const char *const text
++      = "<!DOCTYPE document SYSTEM 'something.ext'><document/>";
++  for (int enabled = 0; enabled <= 1; ++enabled) {
++    XML_Parser parser = XML_ParserCreate(NULL);
++    assert_true(parser != NULL);
++    XML_SetUserData(parser, (void *)&enabled);
++    XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS);
++    // this handler creates a sub-parser and checks that its deferral behavior
++    // is what we expected, based on the value of `enabled` (in userdata).
++    XML_SetExternalEntityRefHandler(parser, external_inherited_parser);
++    assert_true(XML_SetReparseDeferralEnabled(parser, enabled));
++    if (XML_Parse(parser, text, (int)strlen(text), XML_TRUE) != XML_STATUS_OK)
++      xml_failure(parser);
++
++    XML_ParserFree(parser);
++  }
++}
++END_TEST
++
++START_TEST(test_set_reparse_deferral_on_null_parser) {
++  // A NULL parser must be refused regardless of the value requested for the
++  // flag, so the same check is run over a table of sample values.
++  const XML_Bool requested[]
++      = {0, 1, 10, 100, (XML_Bool)INT_MIN, (XML_Bool)INT_MAX};
++  const size_t total = sizeof(requested) / sizeof(requested[0]);
++  for (size_t i = 0; i < total; ++i) {
++    assert_true(XML_SetReparseDeferralEnabled(NULL, requested[i]) == XML_FALSE);
++  }
++}
++END_TEST
++
++START_TEST(test_set_reparse_deferral_on_the_fly) {
++  // Disabling deferral mid-parse must let an already-buffered token through.
++  const char *const pre = "<d><x attr='";
++  const char *const end = "'></x>";
++  char iiiiii[100];
++  const int fillsize = (int)sizeof(iiiiii);
++  memset(iiiiii, 'i', fillsize);
++  XML_Parser parser = XML_ParserCreate(NULL);
++  assert_true(parser != NULL);
++  assert_true(XML_SetReparseDeferralEnabled(parser, XML_TRUE));
++
++  CharData storage;
++  CharData_Init(&storage);
++  XML_SetUserData(parser, &storage);
++  XML_SetStartElementHandler(parser, start_element_event_handler);
++
++  // parse the start text
++  enum XML_Status status = XML_Parse(parser, pre, (int)strlen(pre), XML_FALSE);
++  if (status != XML_STATUS_OK) {
++    xml_failure(parser);
++  }
++  CharData_CheckXMLChars(&storage, XCS("d")); // first element should be done
++
++  // try to parse some 'i', but the token isn't finished
++  status = XML_Parse(parser, iiiiii, fillsize, XML_FALSE);
++  if (status != XML_STATUS_OK) {
++    xml_failure(parser);
++  }
++  CharData_CheckXMLChars(&storage, XCS("d")); // *still* just the first one
++
++  // end the token.
++  status = XML_Parse(parser, end, (int)strlen(end), XML_FALSE);
++  if (status != XML_STATUS_OK) {
++    xml_failure(parser);
++  }
++  CharData_CheckXMLChars(&storage, XCS("d")); // not yet.
++
++  // now change the heuristic setting and add *no* data
++  assert_true(XML_SetReparseDeferralEnabled(parser, XML_FALSE));
++  // we avoid isFinal=XML_TRUE, because that would force-bypass the heuristic.
++  status = XML_Parse(parser, "", 0, XML_FALSE);
++  if (status != XML_STATUS_OK) {
++    xml_failure(parser);
++  }
++  CharData_CheckXMLChars(&storage, XCS("dx"));
++
++  XML_ParserFree(parser);
++}
++END_TEST
++
++START_TEST(test_set_bad_reparse_option) {
++  // the sampled values above 1 must be refused; 0 and 1 must be accepted
++  const XML_Bool rejected[] = {2, 3, 99, 127, 128, 129, 255};
++  const size_t total = sizeof(rejected) / sizeof(rejected[0]);
++  XML_Parser parser = XML_ParserCreate(NULL);
++  assert_true(parser != NULL); // a NULL parser would fail the XML_TRUE checks
++  for (size_t i = 0; i < total; ++i) {
++    assert_true(XML_FALSE == XML_SetReparseDeferralEnabled(parser, rejected[i]));
++  }
++  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 0));
++  assert_true(XML_TRUE == XML_SetReparseDeferralEnabled(parser, 1));
++  XML_ParserFree(parser);
++}
++END_TEST
++
++static size_t g_totalAlloc = 0;   // running sum of sizes passed to counting_realloc
++static size_t g_biggestAlloc = 0; // largest single size passed to counting_realloc
++
++// realloc() stand-in: records the requested size in g_totalAlloc and
++// g_biggestAlloc, then forwards the request unchanged to realloc().
++static void *
++counting_realloc(void *ptr, size_t size) {
++  g_biggestAlloc = (size > g_biggestAlloc) ? size : g_biggestAlloc;
++  g_totalAlloc += size;
++  return realloc(ptr, size);
++}
++
++static void *
++counting_malloc(size_t size) {
++  return counting_realloc(NULL, size); // realloc(NULL, n) behaves like malloc(n)
++}
++
++START_TEST(test_bypass_heuristic_when_close_to_bufsize) {
++  // Feeds a big token in fixed-size fills and verifies that, once all of its
++  // bytes are buffered, it gets parsed without any further large allocation.
++  if (! g_reparseDeferralEnabledDefault) {
++    return; // this test is irrelevant when the deferral heuristic is disabled.
++  }
++
++  const int document_length = 65536;
++  char *const document = (char *)malloc(document_length);
++  assert_true(document != NULL); // the memset()/memcpy() below need a buffer
++  const XML_Memory_Handling_Suite memfuncs = {
++      counting_malloc,
++      counting_realloc,
++      free,
++  };
++
++  const int leading_list[] = {0, 3, 61, 96, 400, 401, 4000, 4010, 4099, -1};
++  const int bigtoken_list[] = {3000, 4000, 4001, 4096, 4099, 5000, 20000, -1};
++  const int fillsize_list[] = {131, 256, 399, 400, 401, 1025, 4099, 4321, -1};
++
++  for (const int *leading = leading_list; *leading >= 0; leading++) {
++    for (const int *bigtoken = bigtoken_list; *bigtoken >= 0; bigtoken++) {
++      for (const int *fillsize = fillsize_list; *fillsize >= 0; fillsize++) {
++        // start by checking that the test looks reasonably valid
++        assert_true(*leading + *bigtoken <= document_length);
++        // put 'x' everywhere; some will be overwritten by elements.
++        memset(document, 'x', document_length);
++        // maybe add an initial tag
++        if (*leading) {
++          assert_true(*leading >= 3); // or the test case is invalid
++          memcpy(document, "<a>", 3); // exactly 3 bytes: no NUL is copied
++        }
++        // add the large token
++        document[*leading + 0] = '<';
++        document[*leading + 1] = 'b';
++        memset(&document[*leading + 2], ' ', *bigtoken - 2); // a spacy token
++        document[*leading + *bigtoken - 1] = '>';
++
++        // 1 for 'b', plus 1 or 0 depending on the presence of 'a'
++        const int expected_elem_total = 1 + (*leading ? 1 : 0);
++
++        XML_Parser parser = XML_ParserCreate_MM(NULL, &memfuncs, NULL);
++        assert_true(parser != NULL);
++
++        CharData storage;
++        CharData_Init(&storage);
++        XML_SetUserData(parser, &storage);
++        XML_SetStartElementHandler(parser, start_element_event_handler);
++
++        g_biggestAlloc = 0;
++        g_totalAlloc = 0;
++        int offset = 0;
++        // fill data until the big token is covered (but not necessarily parsed)
++        while (offset < *leading + *bigtoken) {
++          assert_true(offset + *fillsize <= document_length);
++          const enum XML_Status status
++              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
++          if (status != XML_STATUS_OK) {
++            xml_failure(parser);
++          }
++          offset += *fillsize;
++        }
++        // Now, check that we've had a buffer allocation that could fit the
++        // context bytes and our big token. In order to detect a special case,
++        // we need to know how many bytes of our big token were included in the
++        // first push that contained _any_ bytes of the big token:
++        const int bigtok_first_chunk_bytes = *fillsize - (*leading % *fillsize);
++        if (bigtok_first_chunk_bytes >= *bigtoken && XML_CONTEXT_BYTES == 0) {
++          // Special case: we aren't saving any context, and the whole big token
++          // was covered by a single fill, so Expat may have parsed directly
++          // from our input pointer, without allocating an internal buffer.
++        } else if (*leading < XML_CONTEXT_BYTES) {
++          assert_true(g_biggestAlloc >= *leading + (size_t)*bigtoken);
++        } else {
++          assert_true(g_biggestAlloc >= XML_CONTEXT_BYTES + (size_t)*bigtoken);
++        }
++        // fill data until the big token is actually parsed
++        while (storage.count < expected_elem_total) {
++          const size_t alloc_before = g_totalAlloc;
++          assert_true(offset + *fillsize <= document_length);
++          const enum XML_Status status
++              = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
++          if (status != XML_STATUS_OK) {
++            xml_failure(parser);
++          }
++          offset += *fillsize;
++          // since all the bytes of the big token are already in the buffer,
++          // the bufsize ceiling should make us finish its parsing without any
++          // further buffer allocations. We assume that there will be no other
++          // large allocations in this test.
++          assert_true(g_totalAlloc - alloc_before < 4096);
++        }
++        // test-the-test: was our alloc even called?
++        assert_true(g_totalAlloc > 0);
++        // test-the-test: there shouldn't be any extra start elements
++        assert_true(storage.count == expected_elem_total);
++        XML_ParserFree(parser);
++      }
++    }
++  }
++  free(document);
++}
++END_TEST
++
++START_TEST(test_varying_buffer_fills) { // caps bytes scanned per fill pattern
++  const int KiB = 1024;
++  const int MiB = 1024 * KiB;
++  const int document_length = 16 * MiB; // each fill pattern must fit in here
++  const int big = 7654321; // arbitrarily chosen between 4 and 8 MiB
++
++  char *const document = (char *)malloc(document_length);
++  assert_true(document != NULL);
++  memset(document, 'x', document_length);
++  document[0] = '<';
++  document[1] = 't';
++  memset(&document[2], ' ', big - 2); // a very spacy token
++  document[big - 1] = '>';
++
++  // Each testcase is a list of buffer fill sizes, terminated by a value < 0.
++  // When reparse deferral is enabled, the final (negated) value is the expected
++  // maximum number of bytes scanned in parse attempts.
++  const int testcases[][30] = {
++      {8 * MiB, -8 * MiB},
++      {4 * MiB, 4 * MiB, -12 * MiB}, // try at 4MB, then 8MB = 12 MB total
++      // zero-size fills shouldn't trigger the bypass
++      {4 * MiB, 0, 4 * MiB, -12 * MiB},
++      {4 * MiB, 0, 0, 4 * MiB, -12 * MiB},
++      {4 * MiB, 0, 1 * MiB, 0, 3 * MiB, -12 * MiB},
++      // try to hit the buffer ceiling only once (at the end)
++      {4 * MiB, 2 * MiB, 1 * MiB, 512 * KiB, 256 * KiB, 256 * KiB, -12 * MiB},
++      // try to hit the same buffer ceiling multiple times
++      {4 * MiB + 1, 2 * MiB, 1 * MiB, 512 * KiB, -25 * MiB},
++
++      // try to hit every ceiling, by always landing 1K shy of the buffer size
++      {1 * KiB, 2 * KiB, 4 * KiB, 8 * KiB, 16 * KiB, 32 * KiB, 64 * KiB,
++       128 * KiB, 256 * KiB, 512 * KiB, 1 * MiB, 2 * MiB, 4 * MiB, -16 * MiB},
++
++      // try to avoid every ceiling, by always landing 1B past the buffer size
++      // the normal 2x heuristic threshold still forces parse attempts.
++      {2 * KiB + 1,          // will attempt 2KiB + 1 ==> total 2KiB + 1
++       2 * KiB, 4 * KiB,     // will attempt 8KiB + 1 ==> total 10KiB + 2
++       8 * KiB, 16 * KiB,    // will attempt 32KiB + 1 ==> total 42KiB + 3
++       32 * KiB, 64 * KiB,   // will attempt 128KiB + 1 ==> total 170KiB + 4
++       128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5
++       512 * KiB, 1 * MiB,   // will attempt 2MiB + 1 ==> total 2M + 682K + 6
++       2 * MiB, 4 * MiB,     // will attempt 8MiB + 1 ==> total 10M + 682K + 7
++       -(10 * MiB + 682 * KiB + 7)},
++      // try to avoid every ceiling again, except on our last fill.
++      {2 * KiB + 1,          // will attempt 2KiB + 1 ==> total 2KiB + 1
++       2 * KiB, 4 * KiB,     // will attempt 8KiB + 1 ==> total 10KiB + 2
++       8 * KiB, 16 * KiB,    // will attempt 32KiB + 1 ==> total 42KiB + 3
++       32 * KiB, 64 * KiB,   // will attempt 128KiB + 1 ==> total 170KiB + 4
++       128 * KiB, 256 * KiB, // will attempt 512KiB + 1 ==> total 682KiB + 5
++       512 * KiB, 1 * MiB,   // will attempt 2MiB + 1 ==> total 2M + 682K + 6
++       2 * MiB, 4 * MiB - 1, // will attempt 8MiB ==> total 10M + 682K + 6
++       -(10 * MiB + 682 * KiB + 6)},
++
++      // try to hit ceilings on the way multiple times
++      {512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 1 MiB buffer
++       512 * KiB + 1, 256 * KiB, 128 * KiB, 128 * KiB - 1, // 2 MiB buffer
++       1 * MiB + 1, 512 * KiB, 256 * KiB, 256 * KiB - 1,   // 4 MiB buffer
++       2 * MiB + 1, 1 * MiB, 512 * KiB,                    // 8 MiB buffer
++       // we'll make a parse attempt at every parse call
++       -(45 * MiB + 12)},
++  };
++  const int testcount = sizeof(testcases) / sizeof(testcases[0]);
++  for (int test_i = 0; test_i < testcount; test_i++) {
++    const int *fillsize = testcases[test_i]; // walks the row up to its terminator
++    XML_Parser parser = XML_ParserCreate(NULL);
++    assert_true(parser != NULL);
++    g_parseAttempts = 0; // compared before/after every XML_Parse call below
++
++    CharData storage;
++    CharData_Init(&storage);
++    XML_SetUserData(parser, &storage);
++    XML_SetStartElementHandler(parser, start_element_event_handler);
++
++    int worstcase_bytes = 0; // sum of (buffered bytes at each XML_Parse call)
++    int scanned_bytes = 0;   // sum of (buffered bytes at each actual parse)
++    int offset = 0;
++    while (*fillsize >= 0) {
++      assert_true(offset + *fillsize <= document_length); // or test is invalid
++      const unsigned attempts_before = g_parseAttempts;
++      const enum XML_Status status
++          = XML_Parse(parser, &document[offset], *fillsize, XML_FALSE);
++      if (status != XML_STATUS_OK) {
++        xml_failure(parser);
++      }
++      offset += *fillsize;
++      fillsize++;
++      assert_true(offset <= INT_MAX - worstcase_bytes); // avoid overflow
++      worstcase_bytes += offset; // we might've tried to parse all pending bytes
++      if (g_parseAttempts != attempts_before) {
++        assert_true(g_parseAttempts == attempts_before + 1); // max 1/XML_Parse
++        assert_true(offset <= INT_MAX - scanned_bytes);      // avoid overflow
++        scanned_bytes += offset; // we *did* try to parse all pending bytes
++      }
++    }
++    assert_true(storage.count == 1); // the big token should've been parsed
++    assert_true(scanned_bytes > 0);  // test-the-test: does our counter work?
++    if (g_reparseDeferralEnabledDefault) {
++      // heuristic is enabled; some XML_Parse calls may have deferred reparsing
++      const int max_bytes_scanned = -*fillsize; // the row's negated terminator
++      if (scanned_bytes > max_bytes_scanned) {
++        fprintf(stderr,
++                "bytes scanned in parse attempts: actual=%d limit=%d \n",
++                scanned_bytes, max_bytes_scanned);
++        fail("too many bytes scanned in parse attempts");
++      }
++      assert_true(scanned_bytes <= worstcase_bytes);
++    } else {
++      // heuristic is disabled; every XML_Parse() will have reparsed
++      assert_true(scanned_bytes == worstcase_bytes);
++    }
++
++    XML_ParserFree(parser);
++  }
++  free(document);
++}
++END_TEST
++
++
+ /*
+ * Namespaces tests.
+ */
+@@ -7435,13 +8058,13 @@ START_TEST(test_return_ns_triplet)
+ if (_XML_Parse_SINGLE_BYTES(parser, text, strlen(text),
+ XML_FALSE) == XML_STATUS_ERROR)
+ xml_failure(parser);
+- if (!triplet_start_flag)
+- fail("triplet_start_checker not invoked");
+ /* Check that unsetting "return triplets" fails while still parsing */
+ XML_SetReturnNSTriplet(parser, XML_FALSE);
+ if (_XML_Parse_SINGLE_BYTES(parser, epilog, strlen(epilog),
+ XML_TRUE) == XML_STATUS_ERROR)
+ xml_failure(parser);
++ if (!triplet_start_flag)
++ fail("triplet_start_checker not invoked");
+ if (!triplet_end_flag)
+ fail("triplet_end_checker not invoked");
+ if (dummy_handler_flags != (DUMMY_START_NS_DECL_HANDLER_FLAG |
+@@ -12476,6 +13099,7 @@ make_suite(void)
+ #if defined(XML_CONTEXT_BYTES)
+ tcase_add_test(tc_basic, test_get_buffer_3_overflow);
+ #endif
++ tcase_add_test(tc_basic, test_getbuffer_allocates_on_zero_len);
+ tcase_add_test(tc_basic, test_byte_info_at_end);
+ tcase_add_test(tc_basic, test_byte_info_at_error);
+ tcase_add_test(tc_basic, test_byte_info_at_cdata);
+@@ -12588,6 +13212,14 @@ make_suite(void)
+ tcase_add_test(tc_basic, test_bad_notation);
+ tcase_add_test(tc_basic, test_default_doctype_handler);
+ tcase_add_test(tc_basic, test_empty_element_abort);
++ tcase_add_test(tc_basic, test_big_tokens_take_linear_time);
++ tcase_add_test(tc_basic, test_set_reparse_deferral);
++ tcase_add_test(tc_basic, test_reparse_deferral_is_inherited);
++ tcase_add_test(tc_basic, test_set_reparse_deferral_on_null_parser);
++ tcase_add_test(tc_basic, test_set_reparse_deferral_on_the_fly);
++ tcase_add_test(tc_basic, test_set_bad_reparse_option);
++ tcase_add_test(tc_basic, test_bypass_heuristic_when_close_to_bufsize);
++ tcase_add_test(tc_basic, test_varying_buffer_fills);
+
+ suite_add_tcase(s, tc_namespace);
+ tcase_add_checked_fixture(tc_namespace,
+diff --git a/expat/xmlwf/xmlwf.c b/expat/xmlwf/xmlwf.c
+index 82d028e..cd26919 100644
+--- a/expat/xmlwf/xmlwf.c
++++ b/expat/xmlwf/xmlwf.c
+@@ -35,6 +35,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #include "expat.h"
+ #include "codepage.h"
+@@ -892,7 +893,7 @@ static void
+ usage(const XML_Char *prog, int rc)
+ {
+ ftprintf(stderr,
+- T("usage: %s [-s] [-n] [-p] [-x] [-e encoding] [-w] [-d output-dir] [-c] [-m] [-r] [-t] [-N] [file ...]\n"), prog);
++ T("usage: %s [-s] [-n] [-p] [-x] [-e encoding] [-w] [-d output-dir] [-c] [-m] [-r] [-t] [-N] [-q] [file ...]\n"), prog);
+ exit(rc);
+ }
+
+@@ -917,6 +918,8 @@ tmain(int argc, XML_Char **argv)
+ XML_PARAM_ENTITY_PARSING_NEVER;
+ int useStdin = 0;
+ XmlwfUserData userData = { NULL, NULL, NULL };
++ XML_Bool disableDeferral = XML_FALSE;
++
+
+ #ifdef _MSC_VER
+ _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF|_CRTDBG_LEAK_CHECK_DF);
+@@ -1003,6 +1006,11 @@ tmain(int argc, XML_Char **argv)
+ case T('v'):
+ showVersion(argv[0]);
+ return 0;
++ case T('q'): {
++ disableDeferral = XML_TRUE;
++ j++;
++ break;
++ }
+ case T('\0'):
+ if (j > 1) {
+ i++;
+@@ -1033,6 +1041,16 @@ tmain(int argc, XML_Char **argv)
+ exit(1);
+ }
+
++ if (disableDeferral) {
++ const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE);
++ if (! success) {
++ // This prevents tperror(..) from reporting misleading "[..]: Success"
++ errno = EINVAL;
++ tperror(T("Failed to disable reparse deferral"));
++ exit(1);
++ }
++ }
++
+ if (requireStandalone)
+ XML_SetNotStandaloneHandler(parser, notStandalone);
+ XML_SetParamEntityParsing(parser, paramEntityParsing);
+diff --git a/testdata/largefiles/aaaaaa_attr.xml b/testdata/largefiles/aaaaaa_attr.xml
+new file mode 100644
+index 0000000..66e3d25
+--- /dev/null
++++ b/testdata/largefiles/aaaaaa_attr.xml
+@@ -0,0 +1 @@
++
+\ No newline at end of file
+diff --git a/testdata/largefiles/aaaaaa_cdata.xml b/testdata/largefiles/aaaaaa_cdata.xml
+new file mode 100644
+index 0000000..66f64bd
+--- /dev/null
++++ b/testdata/largefiles/aaaaaa_cdata.xml
+@@ -0,0 +1 @@
++
+\ No newline at end of file
+diff --git a/testdata/largefiles/aaaaaa_comment.xml b/testdata/largefiles/aaaaaa_comment.xml
+new file mode 100644
+index 0000000..bb9af13
+--- /dev/null
++++ b/testdata/largefiles/aaaaaa_comment.xml
+@@ -0,0 +1 @@
++
+\ No newline at end of file
+diff --git a/testdata/largefiles/aaaaaa_tag.xml b/testdata/largefiles/aaaaaa_tag.xml
+new file mode 100644
+index 0000000..946f701
+--- /dev/null
++++ b/testdata/largefiles/aaaaaa_tag.xml
+@@ -0,0 +1 @@
++
+\ No newline at end of file
+diff --git a/testdata/largefiles/aaaaaa_text.xml b/testdata/largefiles/aaaaaa_text.xml
+new file mode 100644
+index 0000000..e266acb
+--- /dev/null
++++ b/testdata/largefiles/aaaaaa_text.xml
+@@ -0,0 +1 @@
++ACHARS
+\ No newline at end of file
diff --git a/expat.spec b/expat.spec
index 5ceff69..293d4a2 100644
--- a/expat.spec
+++ b/expat.spec
@@ -3,7 +3,7 @@
Summary: An XML parser library
Name: expat
Version: %(echo %{unversion} | sed 's/_/./g')
-Release: 11%{?dist}
+Release: 12%{?dist}
Source: https://github.com/libexpat/libexpat/archive/R_%{unversion}.tar.gz#/expat-%{version}.tar.gz
URL: https://libexpat.github.io/
License: MIT
@@ -22,6 +22,7 @@ Patch10: expat-2.2.5-Prevent-integer-overflow-in-copyString.patch
Patch11: expat-2.2.5-Prevent-stack-exhaustion-in-build_model.patch
Patch12: expat-2.2.5-Ensure-raw-tagnames-are-safe-exiting-internalEntityParser.patch
Patch13: expat-2.2.5-CVE-2022-43680.patch
+Patch14: expat-2.2.5-CVE-2023-52425.patch
%description
This is expat, the C library for parsing XML, written by James Clark. Expat
@@ -63,6 +64,9 @@ Install it if you need to link statically with expat.
%patch11 -p1 -b .CVE-2022-25313
%patch12 -p1 -b .CVE-2022-40674
%patch13 -p1 -b .CVE-2022-43680
+pushd ..
+%patch14 -p1 -b .CVE-2023-52425
+popd
sed -i 's/install-data-hook/do-nothing-please/' lib/Makefile.am
./buildconf.sh
@@ -79,6 +83,15 @@ make install DESTDIR=$RPM_BUILD_ROOT
rm -f $RPM_BUILD_ROOT%{_libdir}/*.la
%check
+bash -c "for i in {1..500000}; do printf AAAAAAAAAAAAAAAAAAAA >> achars.txt; done"
+for testfile in ../testdata/largefiles/aaaaaa_*; do # swap ACHARS for the A-run built above
+  first_part="$(sed 's/\(.*\)ACHARS.*/\1/g' "$testfile")"
+  second_part="$(sed 's/.*ACHARS\(.*\)/\1/g' "$testfile")"
+  printf '%%s' "$first_part" > "$testfile" # doubled percent: rpm turns it into a single one
+  cat achars.txt >> "$testfile"
+  printf '%%s' "$second_part" >> "$testfile" # file text is data, never the printf format
+done
+
make check
%ldconfig_scriptlets
@@ -101,6 +114,10 @@ make check
%{_libdir}/lib*.a
%changelog
+* Thu Mar 21 2024 Tomas Korbar - 2.2.5-12
+- CVE-2023-52425 expat: parsing large tokens can trigger a denial of service
+- Resolves: RHEL-29321
+
* Mon Nov 14 2022 Tomas Korbar - 2.2.5-11
- CVE-2022-43680 expat: use-after free caused by overeager destruction of a shared DTD in XML_ExternalEntityParserCreate
- Resolves: CVE-2022-43680