squid/SOURCES/0001-Fix-incremental-parsin...

From 96d95b036e28c863c810b334f17d0ec619bf421c Mon Sep 17 00:00:00 2001
From: Eduard Bagdasaryan <eduard.bagdasaryan@measurement-factory.com>
Date: Sun, 5 Nov 2023 11:20:35 +0000
Subject: [PATCH 1/2] Fix incremental parsing of chunked quoted extensions
 (#310)

Before this change, incremental parsing of quoted chunked extensions
was broken for two reasons:

* Http::One::Parser::skipLineTerminator() unexpectedly threw after
  a partially received quoted chunk extension value.

* When Http::One::Tokenizer was unable to parse a quoted extension,
  it incorrectly restored the input buffer to the beginning of the
  extension value (instead of the extension itself), thus making
  further incremental parsing iterations impossible.

IMO, the reason for this problem was that Http::One::Tokenizer::qdText()
could not distinguish two cases (returning false in both):

* the end of the quoted string not yet reached

* an input error, e.g., wrong/unexpected character

A possible approach could be to improve Http::One::Tokenizer, making it
aware of the "needs more data" state. However, to be acceptable,
these improvements should be done in the base Parser::Tokenizer
class instead. These changes seem to be non-trivial and could be
done separately and later.

Another approach, used here, is to simplify the complex and error-prone
chunked extensions parsing algorithm, fixing incremental parsing bugs
while still parsing incrementally in almost all cases. A performance
regression is expected only in relatively rare cases of partially
received or malformed extensions.

Also:
* fixed parsing of partial use-original-body extension values
* do not treat an invalid use-original-body as an unknown extension
* optimization: parse use-original-body extension only in ICAP context
  (i.e., where it is expected)
* improvement: added a new API to TeChunkedParser to specify known
  chunked extensions list

Modified-by: Alex Burmashev <alexander.burmashev@oracle.com>
Signed-off-by: Alex Burmashev <alexander.burmashev@oracle.com>
---
 src/adaptation/icap/ModXact.cc  |  22 ++++-
 src/adaptation/icap/ModXact.h   |  20 +++++
 src/http/one/Parser.cc          |  35 ++++----
 src/http/one/Parser.h           |  10 ++-
 src/http/one/RequestParser.cc   |  16 ++--
 src/http/one/RequestParser.h    |   8 +-
 src/http/one/ResponseParser.cc  |  17 ++--
 src/http/one/ResponseParser.h   |   2 +-
 src/http/one/TeChunkedParser.cc | 139 ++++++++++++++++++--------------
 src/http/one/TeChunkedParser.h  |  41 ++++++++--
 src/http/one/Tokenizer.cc       | 104 ++++++++++++------------
 src/http/one/Tokenizer.h        |  89 ++++++++------------
 src/http/one/forward.h          |   3 +
 src/parser/BinaryTokenizer.h    |   3 +-
 src/parser/Makefile.am          |   1 +
 src/parser/Tokenizer.cc         |  40 +++++++++
 src/parser/Tokenizer.h          |  13 +++
 src/parser/forward.h            |  22 +++++
 18 files changed, 364 insertions(+), 221 deletions(-)
 create mode 100644 src/parser/forward.h

diff --git a/src/adaptation/icap/ModXact.cc b/src/adaptation/icap/ModXact.cc
index 2db0a68..22a87f5 100644
--- a/src/adaptation/icap/ModXact.cc
+++ b/src/adaptation/icap/ModXact.cc
@@ -25,12 +25,13 @@
 #include "comm.h"
 #include "comm/Connection.h"
 #include "err_detail_type.h"
-#include "http/one/TeChunkedParser.h"
 #include "HttpHeaderTools.h"
 #include "HttpMsg.h"
 #include "HttpReply.h"
 #include "HttpRequest.h"
 #include "MasterXaction.h"
+#include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"
 #include "SquidTime.h"

 // flow and terminology:
@@ -44,6 +45,8 @@ CBDATA_NAMESPACED_CLASS_INIT(Adaptation::Icap, ModXactLauncher);

 static const size_t TheBackupLimit = BodyPipe::MaxCapacity;

+const SBuf Adaptation::Icap::ChunkExtensionValueParser::UseOriginalBodyName("use-original-body");
+
 Adaptation::Icap::ModXact::State::State()
 {
     memset(this, 0, sizeof(*this));
@@ -1108,6 +1111,7 @@ void Adaptation::Icap::ModXact::decideOnParsingBody()
         state.parsing = State::psBody;
         replyHttpBodySize = 0;
         bodyParser = new Http1::TeChunkedParser;
+        bodyParser->parseExtensionValuesWith(&extensionParser);
         makeAdaptedBodyPipe("adapted response from the ICAP server");
         Must(state.sending == State::sendingAdapted);
     } else {
@@ -1142,9 +1146,8 @@ void Adaptation::Icap::ModXact::parseBody()
     }

     if (parsed) {
-        if (state.readyForUob && bodyParser->useOriginBody >= 0) {
-            prepPartialBodyEchoing(
-                static_cast<uint64_t>(bodyParser->useOriginBody));
+        if (state.readyForUob && extensionParser.sawUseOriginalBody()) {
+            prepPartialBodyEchoing(extensionParser.useOriginalBody());
             stopParsing();
             return;
         }
@@ -2014,3 +2017,14 @@ void Adaptation::Icap::ModXactLauncher::updateHistory(bool doStart)
     }
 }

+void
+Adaptation::Icap::ChunkExtensionValueParser::parse(Tokenizer &tok, const SBuf &extName)
+{
+    if (extName == UseOriginalBodyName) {
+        useOriginalBody_ = tok.udec64("use-original-body");
+        assert(useOriginalBody_ >= 0);
+    } else {
+        Ignore(tok, extName);
+    }
+}
+
diff --git a/src/adaptation/icap/ModXact.h b/src/adaptation/icap/ModXact.h
index f7afa69..fb4dec0 100644
--- a/src/adaptation/icap/ModXact.h
+++ b/src/adaptation/icap/ModXact.h
@@ -15,6 +15,7 @@
 #include "adaptation/icap/Xaction.h"
 #include "BodyPipe.h"
 #include "http/one/forward.h"
+#include "http/one/TeChunkedParser.h"

 /*
  * ICAPModXact implements ICAP REQMOD and RESPMOD transaction using
@@ -105,6 +106,23 @@ private:
     enum State { stDisabled, stWriting, stIeof, stDone } theState;
 };

+/// handles ICAP-specific chunk extensions supported by Squid
+class ChunkExtensionValueParser: public Http1::ChunkExtensionValueParser
+{
+public:
+    /* Http1::ChunkExtensionValueParser API */
+    virtual void parse(Tokenizer &tok, const SBuf &extName) override;
+
+    bool sawUseOriginalBody() const { return useOriginalBody_ >= 0; }
+    uint64_t useOriginalBody() const { assert(sawUseOriginalBody()); return static_cast<uint64_t>(useOriginalBody_); }
+
+private:
+    static const SBuf UseOriginalBodyName;
+
+    /// the value of the parsed use-original-body chunk extension (or -1)
+    int64_t useOriginalBody_ = -1;
+};
+
 class ModXact: public Xaction, public BodyProducer, public BodyConsumer
 {
     CBDATA_CLASS(ModXact);
@@ -270,6 +288,8 @@ private:

     int adaptHistoryId; ///< adaptation history slot reservation

+    ChunkExtensionValueParser extensionParser;
+
     class State
     {

diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc
index 0c86733..affe0b1 100644
--- a/src/http/one/Parser.cc
+++ b/src/http/one/Parser.cc
@@ -7,10 +7,11 @@
  */

 #include "squid.h"
+#include "base/CharacterSet.h"
 #include "Debug.h"
 #include "http/one/Parser.h"
-#include "http/one/Tokenizer.h"
 #include "mime_header.h"
+#include "parser/Tokenizer.h"
 #include "SquidConfig.h"

 /// RFC 7230 section 2.6 - 7 magic octets
@@ -61,20 +62,19 @@ Http::One::Parser::DelimiterCharacters()
            RelaxedDelimiterCharacters() : CharacterSet::SP;
 }

-bool
-Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
+void
+Http::One::Parser::skipLineTerminator(Tokenizer &tok) const
 {
     if (tok.skip(Http1::CrLf()))
-        return true;
+        return;

     if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
-        return true;
+        return;

     if (tok.atEnd() || (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r'))
-        return false; // need more data
+        throw InsufficientInput();

     throw TexcHere("garbage instead of CRLF line terminator");
-    return false; // unreachable, but make naive compilers happy
 }

 /// all characters except the LF line terminator
@@ -102,7 +102,7 @@ LineCharacters()
 void
 Http::One::Parser::cleanMimePrefix()
 {
-    Http1::Tokenizer tok(mimeHeaderBlock_);
+    Tokenizer tok(mimeHeaderBlock_);
     while (tok.skipOne(RelaxedDelimiterCharacters())) {
         (void)tok.skipAll(LineCharacters()); // optional line content
         // LF terminator is required.
@@ -137,7 +137,7 @@ Http::One::Parser::cleanMimePrefix()
 void
 Http::One::Parser::unfoldMime()
 {
-    Http1::Tokenizer tok(mimeHeaderBlock_);
+    Tokenizer tok(mimeHeaderBlock_);
     const auto szLimit = mimeHeaderBlock_.length();
     mimeHeaderBlock_.clear();
     // prevent the mime sender being able to make append() realloc/grow multiple times.
@@ -228,7 +228,7 @@ Http::One::Parser::getHostHeaderField()
     debugs(25, 5, "looking for " << name);

     // while we can find more LF in the SBuf
-    Http1::Tokenizer tok(mimeHeaderBlock_);
+    Tokenizer tok(mimeHeaderBlock_);
     SBuf p;

     while (tok.prefix(p, LineCharacters())) {
@@ -250,7 +250,7 @@ Http::One::Parser::getHostHeaderField()
         p.consume(namelen + 1);

         // TODO: optimize SBuf::trim to take CharacterSet directly
-        Http1::Tokenizer t(p);
+        Tokenizer t(p);
         t.skipAll(CharacterSet::WSP);
         p = t.remaining();

@@ -278,10 +278,15 @@ Http::One::ErrorLevel()
 }

 // BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
-bool
-Http::One::ParseBws(Tokenizer &tok)
+void
+Http::One::ParseBws(Parser::Tokenizer &tok)
 {
-    if (const auto count = tok.skipAll(Parser::WhitespaceCharacters())) {
+    const auto count = tok.skipAll(Parser::WhitespaceCharacters());
+
+    if (tok.atEnd())
+        throw InsufficientInput(); // even if count is positive
+
+    if (count) {
         // Generating BWS is a MUST-level violation so warn about it as needed.
         debugs(33, ErrorLevel(), "found " << count << " BWS octets");
         // RFC 7230 says we MUST parse BWS, so we fall through even if
@@ -289,6 +294,6 @@ Http::One::ParseBws(Tokenizer &tok)
     }
     // else we successfully "parsed" an empty BWS sequence

-    return true;
+    // success: no more BWS characters expected
 }

diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h
index 58a5cae..40e281b 100644
--- a/src/http/one/Parser.h
+++ b/src/http/one/Parser.h
@@ -12,6 +12,7 @@
 #include "anyp/ProtocolVersion.h"
 #include "http/one/forward.h"
 #include "http/StatusCode.h"
+#include "parser/forward.h"
 #include "sbuf/SBuf.h"

 namespace Http {
@@ -40,6 +41,7 @@ class Parser : public RefCountable
 {
 public:
     typedef SBuf::size_type size_type;
+    typedef ::Parser::Tokenizer Tokenizer;

     Parser() : parseStatusCode(Http::scNone), parsingStage_(HTTP_PARSE_NONE), hackExpectsMime_(false) {}
     virtual ~Parser() {}
@@ -118,11 +120,11 @@ protected:
      * detect and skip the CRLF or (if tolerant) LF line terminator
      * consume from the tokenizer.
      *
-     * throws if non-terminator is detected.
+     * \throws exception on bad or InsufficientInput.
      * \retval true only if line terminator found.
      * \retval false incomplete or missing line terminator, need more data.
      */
-    bool skipLineTerminator(Http1::Tokenizer &tok) const;
+    void skipLineTerminator(Tokenizer &) const;

     /**
      * Scan to find the mime headers block for current message.
@@ -159,8 +161,8 @@ private:
 };

 /// skips and, if needed, warns about RFC 7230 BWS ("bad" whitespace)
-/// \returns true (always; unlike all the skip*() functions)
-bool ParseBws(Tokenizer &tok);
+/// \throws InsufficientInput when the end of BWS cannot be confirmed
+void ParseBws(Parser::Tokenizer &);

 /// the right debugs() level for logging HTTP violation messages
 int ErrorLevel();
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc
index a325f7d..0f13c92 100644
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -9,8 +9,8 @@
 #include "squid.h"
 #include "Debug.h"
 #include "http/one/RequestParser.h"
-#include "http/one/Tokenizer.h"
 #include "http/ProtocolVersion.h"
+#include "parser/Tokenizer.h"
 #include "profiler/Profiler.h"
 #include "SquidConfig.h"

@@ -64,7 +64,7 @@ Http::One::RequestParser::skipGarbageLines()
  *  RFC 7230 section 2.6, 3.1 and 3.5
  */
 bool
-Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok)
+Http::One::RequestParser::parseMethodField(Tokenizer &tok)
 {
     // method field is a sequence of TCHAR.
     // Limit to 32 characters to prevent overly long sequences of non-HTTP
@@ -145,7 +145,7 @@ Http::One::RequestParser::RequestTargetCharacters()
 }

 bool
-Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
+Http::One::RequestParser::parseUriField(Tokenizer &tok)
 {
     /* Arbitrary 64KB URI upper length limit.
      *
@@ -178,7 +178,7 @@ Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
 }

 bool
-Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok)
+Http::One::RequestParser::parseHttpVersionField(Tokenizer &tok)
 {
     static const SBuf http1p0("HTTP/1.0");
     static const SBuf http1p1("HTTP/1.1");
@@ -253,7 +253,7 @@ Http::One::RequestParser::skipDelimiter(const size_t count, const char *where)

 /// Parse CRs at the end of request-line, just before the terminating LF.
 bool
-Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer &tok)
+Http::One::RequestParser::skipTrailingCrs(Tokenizer &tok)
 {
     if (Config.onoff.relaxed_header_parser) {
         (void)tok.skipAllTrailing(CharacterSet::CR); // optional; multiple OK
@@ -289,12 +289,12 @@ Http::One::RequestParser::parseRequestFirstLine()
     // Earlier, skipGarbageLines() took care of any leading LFs (if allowed).
     // Now, the request line has to end at the first LF.
     static const CharacterSet lineChars = CharacterSet::LF.complement("notLF");
-    ::Parser::Tokenizer lineTok(buf_);
+    Tokenizer lineTok(buf_);
     if (!lineTok.prefix(line, lineChars) || !lineTok.skip('\n')) {
         if (buf_.length() >= Config.maxRequestHeaderSize) {
             /* who should we blame for our failure to parse this line? */

-            Http1::Tokenizer methodTok(buf_);
+            Tokenizer methodTok(buf_);
             if (!parseMethodField(methodTok))
                 return -1; // blame a bad method (or its delimiter)

@@ -308,7 +308,7 @@ Http::One::RequestParser::parseRequestFirstLine()
         return 0;
     }

-    Http1::Tokenizer tok(line);
+    Tokenizer tok(line);

     if (!parseMethodField(tok))
         return -1;
diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h
index 7086548..26697cd 100644
--- a/src/http/one/RequestParser.h
+++ b/src/http/one/RequestParser.h
@@ -54,11 +54,11 @@ private:
     bool doParse(const SBuf &aBuf);

     /* all these return false and set parseStatusCode on parsing failures */
-    bool parseMethodField(Http1::Tokenizer &);
-    bool parseUriField(Http1::Tokenizer &);
-    bool parseHttpVersionField(Http1::Tokenizer &);
+    bool parseMethodField(Tokenizer &);
+    bool parseUriField(Tokenizer &);
+    bool parseHttpVersionField(Tokenizer &);
     bool skipDelimiter(const size_t count, const char *where);
-    bool skipTrailingCrs(Http1::Tokenizer &tok);
+    bool skipTrailingCrs(Tokenizer &tok);

     bool http0() const {return !msgProtocol_.major;}
     static const CharacterSet &RequestTargetCharacters();
diff --git a/src/http/one/ResponseParser.cc b/src/http/one/ResponseParser.cc
index 24af849..65baf09 100644
--- a/src/http/one/ResponseParser.cc
+++ b/src/http/one/ResponseParser.cc
@@ -9,8 +9,8 @@
 #include "squid.h"
 #include "Debug.h"
 #include "http/one/ResponseParser.h"
-#include "http/one/Tokenizer.h"
 #include "http/ProtocolVersion.h"
+#include "parser/Tokenizer.h"
 #include "profiler/Profiler.h"
 #include "SquidConfig.h"

@@ -47,7 +47,7 @@ Http::One::ResponseParser::firstLineSize() const
 // NP: we found the protocol version and consumed it already.
 // just need the status code and reason phrase
 int
-Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim)
+Http::One::ResponseParser::parseResponseStatusAndReason(Tokenizer &tok, const CharacterSet &WspDelim)
 {
     if (!completedStatus_) {
         debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "...");
@@ -87,14 +87,13 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c
     static const CharacterSet phraseChars = CharacterSet::WSP + CharacterSet::VCHAR + CharacterSet::OBSTEXT;
     (void)tok.prefix(reasonPhrase_, phraseChars); // optional, no error if missing
     try {
-        if (skipLineTerminator(tok)) {
-            debugs(74, DBG_DATA, "parse remaining buf={length=" << tok.remaining().length() << ", data='" << tok.remaining() << "'}");
-            buf_ = tok.remaining(); // resume checkpoint
-            return 1;
-        }
+        skipLineTerminator(tok);
+        buf_ = tok.remaining(); // resume checkpoint
+        debugs(74, DBG_DATA, Raw("leftovers", buf_.rawContent(), buf_.length()));
+        return 1;
+    } catch (const InsufficientInput &) {
         reasonPhrase_.clear();
         return 0; // need more to be sure we have it all
-
     } catch (const std::exception &ex) {
         debugs(74, 6, "invalid status-line: " << ex.what());
     }
@@ -119,7 +118,7 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c
 int
 Http::One::ResponseParser::parseResponseFirstLine()
 {
-    Http1::Tokenizer tok(buf_);
+    Tokenizer tok(buf_);

     const CharacterSet &WspDelim = DelimiterCharacters();

diff --git a/src/http/one/ResponseParser.h b/src/http/one/ResponseParser.h
index 15db4a0..cf13b4d 100644
--- a/src/http/one/ResponseParser.h
+++ b/src/http/one/ResponseParser.h
@@ -43,7 +43,7 @@ public:

 private:
     int parseResponseFirstLine();
-    int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &);
+    int parseResponseStatusAndReason(Tokenizer&, const CharacterSet &);

     /// magic prefix for identifying ICY response messages
     static const SBuf IcyMagic;
diff --git a/src/http/one/TeChunkedParser.cc b/src/http/one/TeChunkedParser.cc
index 754086e..6d2f8ea 100644
--- a/src/http/one/TeChunkedParser.cc
+++ b/src/http/one/TeChunkedParser.cc
@@ -13,10 +13,13 @@
 #include "http/one/Tokenizer.h"
 #include "http/ProtocolVersion.h"
 #include "MemBuf.h"
+#include "parser/Tokenizer.h"
 #include "Parsing.h"
+#include "sbuf/Stream.h"
 #include "SquidConfig.h"

-Http::One::TeChunkedParser::TeChunkedParser()
+Http::One::TeChunkedParser::TeChunkedParser():
+    customExtensionValueParser(nullptr)
 {
     // chunked encoding only exists in HTTP/1.1
     Http1::Parser::msgProtocol_ = Http::ProtocolVersion(1,1);
@@ -31,7 +34,11 @@ Http::One::TeChunkedParser::clear()
     buf_.clear();
     theChunkSize = theLeftBodySize = 0;
     theOut = NULL;
-    useOriginBody = -1;
+    // XXX: We do not reset customExtensionValueParser here. Based on the
+    // clear() API description, we must, but it makes little sense and could
+    // break method callers if they appear because some of them may forget to
+    // reset customExtensionValueParser. TODO: Remove Http1::Parser as our
+    // parent class and this unnecessary method with it.
 }

 bool
@@ -49,14 +56,14 @@ Http::One::TeChunkedParser::parse(const SBuf &aBuf)
     if (parsingStage_ == Http1::HTTP_PARSE_NONE)
         parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;

-    Http1::Tokenizer tok(buf_);
+    Tokenizer tok(buf_);

     // loop for as many chunks as we can
     // use do-while instead of while so that we can incrementally
     // restart in the middle of a chunk/frame
     do {

-        if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize))
+        if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkMetadataSuffix(tok))
             return false;

         if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok))
@@ -80,7 +87,7 @@ Http::One::TeChunkedParser::needsMoreSpace() const

 /// RFC 7230 section 4.1 chunk-size
 bool
-Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkSize(Tokenizer &tok)
 {
     Must(theChunkSize <= 0); // Should(), really

@@ -104,66 +111,75 @@ Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
     return false; // should not be reachable
 }

-/**
- * Parses chunk metadata suffix, looking for interesting extensions and/or
- * getting to the line terminator. RFC 7230 section 4.1.1 and its Errata #4667:
- *
- *   chunk-ext = *( BWS  ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
- *   chunk-ext-name = token
- *   chunk-ext-val  = token / quoted-string
- *
- * ICAP 'use-original-body=N' extension is supported.
- */
+/// Parses "[chunk-ext] CRLF" from RFC 7230 section 4.1.1:
+///   chunk = chunk-size [ chunk-ext ] CRLF chunk-data CRLF
+///   last-chunk = 1*"0" [ chunk-ext ] CRLF
 bool
-Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown)
+Http::One::TeChunkedParser::parseChunkMetadataSuffix(Tokenizer &tok)
 {
-    SBuf ext;
-    SBuf value;
-    while (
-        ParseBws(tok) && // Bug 4492: IBM_HTTP_Server sends SP after chunk-size
-        tok.skip(';') &&
-        ParseBws(tok) && // Bug 4492: ICAP servers send SP before chunk-ext-name
-        tok.prefix(ext, CharacterSet::TCHAR)) { // chunk-ext-name
-
-        // whole value part is optional. if no '=' expect next chunk-ext
-        if (ParseBws(tok) && tok.skip('=') && ParseBws(tok)) {
-
-            if (!skipKnown) {
-                if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) {
-                    debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody);
-                    buf_ = tok.remaining(); // parse checkpoint
-                    continue;
-                }
-            }
-
-            debugs(94, 5, "skipping unknown chunk extension " << ext);
-
-            // unknown might have a value token or quoted-string
-            if (tok.quotedStringOrToken(value) && !tok.atEnd()) {
-                buf_ = tok.remaining(); // parse checkpoint
-                continue;
-            }
-
-            // otherwise need more data OR corrupt syntax
-            break;
-        }
-
-        if (!tok.atEnd())
-            buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name)
-    }
-
-    if (skipLineTerminator(tok)) {
-        buf_ = tok.remaining(); // checkpoint
-        // non-0 chunk means data, 0-size means optional Trailer follows
+    // Code becomes much simpler when incremental parsing functions throw on
+    // bad or insufficient input, like in the code below. TODO: Expand up.
+    try {
+        parseChunkExtensions(tok); // a possibly empty chunk-ext list
+        skipLineTerminator(tok);
+        buf_ = tok.remaining();
         parsingStage_ = theChunkSize ? Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME;
         return true;
+    } catch (const InsufficientInput &) {
+        tok.reset(buf_); // backtrack to the last commit point
+        return false;
     }
+    // other exceptions bubble up to kill message parsing
+}

-    return false;
+/// Parses the chunk-ext list (RFC 7230 section 4.1.1 and its Errata #4667):
+/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
+void
+Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &tok)
+{
+    do {
+        ParseBws(tok); // Bug 4492: IBM_HTTP_Server sends SP after chunk-size
+
+        if (!tok.skip(';'))
+            return; // reached the end of extensions (if any)
+
+        parseOneChunkExtension(tok);
+        buf_ = tok.remaining(); // got one extension
+    } while (true);
+}
+
+void
+Http::One::ChunkExtensionValueParser::Ignore(Tokenizer &tok, const SBuf &extName)
+{
+    const auto ignoredValue = tokenOrQuotedString(tok);
+    debugs(94, 5, extName << " with value " << ignoredValue);
+}
+
+/// Parses a single chunk-ext list element:
+/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
+void
+Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &tok)
+{
+    ParseBws(tok); // Bug 4492: ICAP servers send SP before chunk-ext-name
+
+    const auto extName = tok.prefix("chunk-ext-name", CharacterSet::TCHAR);
+
+    ParseBws(tok);
+
+    if (!tok.skip('='))
+        return; // parsed a valueless chunk-ext
+
+    ParseBws(tok);
+
+    // optimization: the only currently supported extension needs last-chunk
+    if (!theChunkSize && customExtensionValueParser)
+        customExtensionValueParser->parse(tok, extName);
+    else
+        ChunkExtensionValueParser::Ignore(tok, extName);
 }

 bool
-Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkBody(Tokenizer &tok)
 {
     if (theLeftBodySize > 0) {
         buf_ = tok.remaining(); // sync buffers before buf_ use
@@ -188,17 +204,20 @@ Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok)
 }

 bool
-Http::One::TeChunkedParser::parseChunkEnd(Http1::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkEnd(Tokenizer &tok)
 {
     Must(theLeftBodySize == 0); // Should(), really

-    if (skipLineTerminator(tok)) {
+    try {
+        skipLineTerminator(tok);
         buf_ = tok.remaining(); // parse checkpoint
         theChunkSize = 0; // done with the current chunk
         parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
         return true;
     }
-
-    return false;
+    catch (const InsufficientInput &) {
+        return false;
+    }
+    // other exceptions bubble up to kill message parsing
 }

diff --git a/src/http/one/TeChunkedParser.h b/src/http/one/TeChunkedParser.h
index 1b0319e..2ca8988 100644
--- a/src/http/one/TeChunkedParser.h
+++ b/src/http/one/TeChunkedParser.h
@@ -18,6 +18,26 @@ namespace Http
 namespace One
 {

+using ::Parser::InsufficientInput;
+
+// TODO: Move this class into http/one/ChunkExtensionValueParser.*
+/// A customizable parser of a single chunk extension value (chunk-ext-val).
+/// From RFC 7230 section 4.1.1 and its Errata #4667:
+/// chunk-ext = *( BWS  ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
+/// chunk-ext-name = token
+/// chunk-ext-val  = token / quoted-string
+class ChunkExtensionValueParser
+{
+public:
+    typedef ::Parser::Tokenizer Tokenizer;
+
+    /// extracts and ignores the value of a named extension
+    static void Ignore(Tokenizer &tok, const SBuf &extName);
+
+    /// extracts and then interprets (or ignores) the extension value
+    virtual void parse(Tokenizer &tok, const SBuf &extName) = 0;
+};
+
 /**
  * An incremental parser for chunked transfer coding
  * defined in RFC 7230 section 4.1.
@@ -25,7 +45,7 @@ namespace One
  *
  * The parser shovels content bytes from the raw
  * input buffer into the content output buffer, both caller-supplied.
- * Ignores chunk extensions except for ICAP's ieof.
+ * Chunk extensions like use-original-body are handled via parseExtensionValuesWith().
  * Trailers are available via mimeHeader() if wanted.
  */
 class TeChunkedParser : public Http1::Parser
@@ -37,6 +57,10 @@ public:
     /// set the buffer to be used to store decoded chunk data
     void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;}

+    /// Instead of ignoring all chunk extension values, give the supplied
+    /// parser a chance to handle them. Only applied to last-chunk (for now).
+    void parseExtensionValuesWith(ChunkExtensionValueParser *parser) { customExtensionValueParser = parser; }
+
     bool needsMoreSpace() const;

     /* Http1::Parser API */
@@ -45,17 +69,20 @@ public:
     virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks

 private:
-    bool parseChunkSize(Http1::Tokenizer &tok);
-    bool parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown);
-    bool parseChunkBody(Http1::Tokenizer &tok);
-    bool parseChunkEnd(Http1::Tokenizer &tok);
+    bool parseChunkSize(Tokenizer &tok);
+    bool parseChunkMetadataSuffix(Tokenizer &);
+    void parseChunkExtensions(Tokenizer &);
+    void parseOneChunkExtension(Tokenizer &);
+    bool parseChunkBody(Tokenizer &tok);
+    bool parseChunkEnd(Tokenizer &tok);

     MemBuf *theOut;
     uint64_t theChunkSize;
     uint64_t theLeftBodySize;

-public:
-    int64_t useOriginBody;
+    /// An optional plugin for parsing and interpreting custom chunk-ext-val.
+    /// This "visitor" object is owned by our creator.
+    ChunkExtensionValueParser *customExtensionValueParser;
 };

 } // namespace One
diff --git a/src/http/one/Tokenizer.cc b/src/http/one/Tokenizer.cc
index 804b8e1..3a6bef3 100644
--- a/src/http/one/Tokenizer.cc
+++ b/src/http/one/Tokenizer.cc
@@ -8,35 +8,18 @@

 #include "squid.h"
 #include "Debug.h"
+#include "http/one/Parser.h"
 #include "http/one/Tokenizer.h"
-
-bool
-Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0)
-{
-    checkpoint();
-
-    if (!skip('"'))
-        return false;
-
-    return qdText(returnedToken, http1p0);
-}
-
-bool
-Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0)
+#include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"
+
+/// Extracts quoted-string after the caller removes the initial '"'.
+/// \param http1p0 whether to prohibit \-escaped characters in quoted strings
+/// \throws InsufficientInput when input can be a token _prefix_
+/// \returns extracted quoted string (without quotes and with chars unescaped)
+static SBuf
+parseQuotedStringSuffix(Parser::Tokenizer &tok, const bool http1p0)
 {
-    checkpoint();
-
-    if (!skip('"'))
-        return prefix(returnedToken, CharacterSet::TCHAR);
-
-    return qdText(returnedToken, http1p0);
-}
-
-bool
-Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
-{
-    // the initial DQUOTE has been skipped by the caller
-
     /*
      * RFC 1945 - defines qdtext:
      *   inclusive of LWS (which includes CR and LF)
@@ -61,12 +44,17 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
     // best we can do is a conditional reference since http1p0 value may change per-client
     const CharacterSet &tokenChars = (http1p0 ? qdtext1p0 : qdtext1p1);

-    for (;;) {
-        SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars);
-        returnedToken.append(consume(prefixLen));
+    SBuf parsedToken;
+
+    while (!tok.atEnd()) {
+        SBuf qdText;
+        if (tok.prefix(qdText, tokenChars))
+            parsedToken.append(qdText);
+
+        if (!http1p0 && tok.skip('\\')) { // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not
+            if (tok.atEnd())
+                break;

-        // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not
-        if (!http1p0 && skip('\\')) {
             /* RFC 7230 section 3.2.6
              *
              * The backslash octet ("\") can be used as a single-octet quoting
@@ -78,32 +66,42 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
              */
             static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT;
             SBuf escaped;
-            if (!prefix(escaped, qPairChars, 1)) {
-                returnedToken.clear();
-                restoreLastCheckpoint();
-                return false;
-            }
-            returnedToken.append(escaped);
+            if (!tok.prefix(escaped, qPairChars, 1))
+                throw TexcHere("invalid escaped character in quoted-pair");
+
+            parsedToken.append(escaped);
             continue;
+        }

-        } else if (skip('"')) {
-            break; // done
+        if (tok.skip('"'))
+            return parsedToken; // may be empty

-        } else if (atEnd()) {
-            // need more data
-            returnedToken.clear();
-            restoreLastCheckpoint();
-            return false;
-        }
+        if (tok.atEnd())
+            break;

-        // else, we have an error
-        debugs(24, 8, "invalid bytes for set " << tokenChars.name);
-        returnedToken.clear();
-        restoreLastCheckpoint();
-        return false;
+        throw TexcHere(ToSBuf("invalid bytes for set ", tokenChars.name));
     }

-    // found the whole string
-    return true;
+    throw Http::One::InsufficientInput();
+}
+
+SBuf
+Http::One::tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0)
+{
+    if (tok.skip('"'))
+        return parseQuotedStringSuffix(tok, http1p0); // the opening DQUOTE was consumed by the skip above
+
+    if (tok.atEnd())
+        throw InsufficientInput(); // no bytes left to examine
+
+    SBuf parsedToken;
+    if (!tok.prefix(parsedToken, CharacterSet::TCHAR))
+        throw TexcHere("invalid input while expecting an HTTP token");
+
+    if (tok.atEnd())
+        throw InsufficientInput(); // token ran to the end of input, so it may be incomplete
+
+    // got the complete token
+    return parsedToken;
 }

diff --git a/src/http/one/Tokenizer.h b/src/http/one/Tokenizer.h
index 658875f..2d40574 100644
--- a/src/http/one/Tokenizer.h
+++ b/src/http/one/Tokenizer.h
@@ -9,68 +9,47 @@
 #ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H
 #define SQUID_SRC_HTTP_ONE_TOKENIZER_H

-#include "parser/Tokenizer.h"
+#include "parser/forward.h"
+#include "sbuf/forward.h"

 namespace Http {
 namespace One {

 /**
- * Lexical processor extended to tokenize HTTP/1.x syntax.
+ * Extracts either an HTTP/1 token or quoted-string while dealing with
+ * possibly incomplete input typical for incremental text parsers.
+ * Unescapes escaped characters in HTTP/1.1 quoted strings.
  *
- * \see ::Parser::Tokenizer for more detail
+ * \param http1p0 whether to prohibit \-escaped characters in quoted strings
+ * \throws InsufficientInput when more input is needed, including on unterminated tokens
+ * \returns extracted token or quoted string (without quotes)
+ *
+ * Governed by:
+ *  - RFC 1945 section 2.1
+ *  "
+ *    A string of text is parsed as a single word if it is quoted using
+ *    double-quote marks.
+ *
+ *        quoted-string  = ( <"> *(qdtext) <"> )
+ *
+ *        qdtext         = <any CHAR except <"> and CTLs,
+ *                         but including LWS>
+ *
+ *    Single-character quoting using the backslash ("\") character is not
+ *    permitted in HTTP/1.0.
+ *  "
+ *
+ *  - RFC 7230 section 3.2.6
+ *  "
+ *    A string of text is parsed as a single value if it is quoted using
+ *    double-quote marks.
+ *
+ *    quoted-string  = DQUOTE *( qdtext / quoted-pair ) DQUOTE
+ *    qdtext         = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text
+ *    obs-text       = %x80-FF
+ *  "
  */
-class Tokenizer : public ::Parser::Tokenizer
-{
-public:
-    Tokenizer(SBuf &s) : ::Parser::Tokenizer(s), savedStats_(0) {}
-
-    /**
-     * Attempt to parse a quoted-string lexical construct.
-     *
-     * Governed by:
-     *  - RFC 1945 section 2.1
-     *  "
-     *    A string of text is parsed as a single word if it is quoted using
-     *    double-quote marks.
-     *
-     *        quoted-string  = ( <"> *(qdtext) <"> )
-     *
-     *        qdtext         = <any CHAR except <"> and CTLs,
-     *                         but including LWS>
-     *
-     *    Single-character quoting using the backslash ("\") character is not
-     *    permitted in HTTP/1.0.
-     *  "
-     *
-     *  - RFC 7230 section 3.2.6
-     *  "
-     *    A string of text is parsed as a single value if it is quoted using
-     *    double-quote marks.
-     *
-     *    quoted-string  = DQUOTE *( qdtext / quoted-pair ) DQUOTE
-     *    qdtext         = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text
-     *    obs-text       = %x80-FF
-     *  "
-     *
-     * \param escaped HTTP/1.0 does not permit \-escaped characters
-     */
-    bool quotedString(SBuf &value, const bool http1p0 = false);
-
-    /**
-     * Attempt to parse a (token / quoted-string ) lexical construct.
-     */
-    bool quotedStringOrToken(SBuf &value, const bool http1p0 = false);
-
-private:
-    /// parse the internal component of a quote-string, and terminal DQUOTE
-    bool qdText(SBuf &value, const bool http1p0);
-
-    void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); }
-    void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); }
-
-    SBuf savedCheckpoint_;
-    SBuf::size_type savedStats_;
-};
+SBuf tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0 = false);

 } // namespace One
 } // namespace Http
diff --git a/src/http/one/forward.h b/src/http/one/forward.h
index c90dc34..2b4ad28 100644
--- a/src/http/one/forward.h
+++ b/src/http/one/forward.h
@@ -10,6 +10,7 @@
 #define SQUID_SRC_HTTP_ONE_FORWARD_H

 #include "base/RefCount.h"
+#include "parser/forward.h"
 #include "sbuf/forward.h"

 namespace Http {
@@ -31,6 +32,8 @@ typedef RefCount<Http::One::ResponseParser> ResponseParserPointer;
 /// CRLF textual representation
 const SBuf &CrLf();

+using ::Parser::InsufficientInput;
+
 } // namespace One
 } // namespace Http

diff --git a/src/parser/BinaryTokenizer.h b/src/parser/BinaryTokenizer.h
index acebd4d..24042d4 100644
--- a/src/parser/BinaryTokenizer.h
+++ b/src/parser/BinaryTokenizer.h
@@ -9,6 +9,7 @@
 #ifndef SQUID_SRC_PARSER_BINARYTOKENIZER_H
 #define SQUID_SRC_PARSER_BINARYTOKENIZER_H

+#include "parser/forward.h"
 #include "sbuf/SBuf.h"

 namespace Parser
@@ -44,7 +45,7 @@ public:
 class BinaryTokenizer
 {
 public:
-    class InsufficientInput {}; // thrown when a method runs out of data
+    typedef ::Parser::InsufficientInput InsufficientInput;
     typedef uint64_t size_type; // enough for the largest supported offset

     BinaryTokenizer();
diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am
index af2b759..0daa5a8 100644
--- a/src/parser/Makefile.am
+++ b/src/parser/Makefile.am
@@ -13,6 +13,7 @@ noinst_LTLIBRARIES = libparser.la
 libparser_la_SOURCES = \
 	BinaryTokenizer.h \
 	BinaryTokenizer.cc \
+	forward.h \
 	Tokenizer.h \
 	Tokenizer.cc

diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc
index 7e73e04..68f4aec 100644
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -10,7 +10,9 @@

 #include "squid.h"
 #include "Debug.h"
+#include "parser/forward.h"
 #include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"

 #include <cerrno>
 #if HAVE_CTYPE_H
@@ -96,6 +98,23 @@ Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, c
     return true;
 }

+SBuf
+Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
+{
+    if (atEnd())
+        throw InsufficientInput(); // nothing to parse yet
+
+    SBuf result;
+
+    if (!prefix(result, tokenChars, limit)) // the bool-returning overload does the actual parsing
+        throw TexcHere(ToSBuf("cannot parse ", description));
+
+    if (atEnd())
+        throw InsufficientInput(); // the prefix ran to the end of input, so it may be incomplete
+
+    return result;
+}
+
 bool
 Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
 {
@@ -283,3 +302,24 @@ Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf:
     return success(s - range.rawContent());
 }

+int64_t
+Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
+{
+    if (atEnd())
+        throw InsufficientInput(); // nothing to parse yet
+
+    int64_t result = 0;
+
+    // Since we only support unsigned decimals, a parsing failure with a
+    // non-empty input always implies invalid/malformed input (or a buggy
+    // limit=0 caller). TODO: Support signed and non-decimal integers by
+    // refactoring int64() to detect insufficient input.
+    if (!int64(result, 10, false, limit)) // base 10, sign not allowed
+        throw TexcHere(ToSBuf("cannot parse ", description));
+
+    if (atEnd())
+        throw InsufficientInput(); // more digits may be coming
+
+    return result;
+}
+
diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h
index 54414be..03a8388 100644
--- a/src/parser/Tokenizer.h
+++ b/src/parser/Tokenizer.h
@@ -143,6 +143,19 @@ public:
      */
     bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);

+    /*
+     * The methods below mimic their counterparts documented above, but they
+     * throw on errors, including InsufficientInput. The field description
+     * parameter is used for error reporting and debugging.
+     */
+
+    /// prefix() wrapper that returns the parsed prefix but throws InsufficientInput if input is empty or contains
+    /// nothing but the prefix (i.e. if the prefix is not "terminated"); other parsing failures throw an error naming description
+    SBuf prefix(const char *description, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
+
+    /// int64() wrapper but limited to unsigned decimal integers (for now); throws InsufficientInput if input is empty or ends with the parsed digits
+    int64_t udec64(const char *description, SBuf::size_type limit = SBuf::npos);
+
 protected:
     SBuf consume(const SBuf::size_type n);
     SBuf::size_type success(const SBuf::size_type n);
diff --git a/src/parser/forward.h b/src/parser/forward.h
new file mode 100644
index 0000000..5a95b7a
--- /dev/null
+++ b/src/parser/forward.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_PARSER_FORWARD_H
+#define SQUID_PARSER_FORWARD_H
+
+namespace Parser {
+class Tokenizer; // defined in parser/Tokenizer.h
+class BinaryTokenizer; // defined in parser/BinaryTokenizer.h
+
+// TODO: Move this declaration (to parser/Elements.h) if we need more like it.
+/// thrown by modern "incremental" parsers when they need more data
+class InsufficientInput {};
+} // namespace Parser
+
+#endif /* SQUID_PARSER_FORWARD_H */
+
--
2.39.3