diff --git a/src/adaptation/icap/ModXact.cc b/src/adaptation/icap/ModXact.cc index 2db0a68..370f077 100644 --- a/src/adaptation/icap/ModXact.cc +++ b/src/adaptation/icap/ModXact.cc @@ -25,12 +25,13 @@ #include "comm.h" #include "comm/Connection.h" #include "err_detail_type.h" -#include "http/one/TeChunkedParser.h" #include "HttpHeaderTools.h" #include "HttpMsg.h" #include "HttpReply.h" #include "HttpRequest.h" #include "MasterXaction.h" +#include "parser/Tokenizer.h" +#include "sbuf/Stream.h" #include "SquidTime.h" // flow and terminology: @@ -44,6 +45,8 @@ CBDATA_NAMESPACED_CLASS_INIT(Adaptation::Icap, ModXactLauncher); static const size_t TheBackupLimit = BodyPipe::MaxCapacity; +const SBuf Adaptation::Icap::ChunkExtensionValueParser::UseOriginalBodyName("use-original-body"); + Adaptation::Icap::ModXact::State::State() { memset(this, 0, sizeof(*this)); } @@ -1108,6 +1111,7 @@ void Adaptation::Icap::ModXact::decideOnParsingBody() state.parsing = State::psBody; replyHttpBodySize = 0; bodyParser = new Http1::TeChunkedParser; + bodyParser->parseExtensionValuesWith(&extensionParser); makeAdaptedBodyPipe("adapted response from the ICAP server"); Must(state.sending == State::sendingAdapted); } else { @@ -1142,9 +1146,8 @@ void Adaptation::Icap::ModXact::parseBody() } if (parsed) { - if (state.readyForUob && bodyParser->useOriginBody >= 0) { - prepPartialBodyEchoing( - static_cast<uint64_t>(bodyParser->useOriginBody)); + if (state.readyForUob && extensionParser.sawUseOriginalBody()) { + prepPartialBodyEchoing(extensionParser.useOriginalBody()); stopParsing(); return; } @@ -2014,3 +2017,14 @@ void Adaptation::Icap::ModXactLauncher::updateHistory(bool doStart) } } +void +Adaptation::Icap::ChunkExtensionValueParser::parse(Tokenizer &tok, const SBuf &extName) +{ + if (extName == UseOriginalBodyName) { + useOriginalBody_ = tok.udec64("use-original-body"); + assert(useOriginalBody_ >= 0); + } else { + Ignore(tok, extName); + } +} + diff --git a/src/adaptation/icap/ModXact.h 
b/src/adaptation/icap/ModXact.h index f7afa69..fb4dec0 100644 --- a/src/adaptation/icap/ModXact.h +++ b/src/adaptation/icap/ModXact.h @@ -15,6 +15,7 @@ #include "adaptation/icap/Xaction.h" #include "BodyPipe.h" #include "http/one/forward.h" +#include "http/one/TeChunkedParser.h" /* * ICAPModXact implements ICAP REQMOD and RESPMOD transaction using @@ -105,6 +106,23 @@ private: enum State { stDisabled, stWriting, stIeof, stDone } theState; }; +/// handles ICAP-specific chunk extensions supported by Squid +class ChunkExtensionValueParser: public Http1::ChunkExtensionValueParser +{ +public: + /* Http1::ChunkExtensionValueParser API */ + virtual void parse(Tokenizer &tok, const SBuf &extName) override; + + bool sawUseOriginalBody() const { return useOriginalBody_ >= 0; } + uint64_t useOriginalBody() const { assert(sawUseOriginalBody()); return static_cast<uint64_t>(useOriginalBody_); } + +private: + static const SBuf UseOriginalBodyName; + + /// the value of the parsed use-original-body chunk extension (or -1) + int64_t useOriginalBody_ = -1; +}; + class ModXact: public Xaction, public BodyProducer, public BodyConsumer { CBDATA_CLASS(ModXact); @@ -270,6 +288,8 @@ private: int adaptHistoryId; ///< adaptation history slot reservation + ChunkExtensionValueParser extensionParser; + class State { diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc index 0c86733..affe0b1 100644 --- a/src/http/one/Parser.cc +++ b/src/http/one/Parser.cc @@ -7,10 +7,11 @@ */ #include "squid.h" +#include "base/CharacterSet.h" #include "Debug.h" #include "http/one/Parser.h" -#include "http/one/Tokenizer.h" #include "mime_header.h" +#include "parser/Tokenizer.h" #include "SquidConfig.h" /// RFC 7230 section 2.6 - 7 magic octets @@ -61,20 +62,19 @@ Http::One::Parser::DelimiterCharacters() RelaxedDelimiterCharacters() : CharacterSet::SP; } -bool -Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const +void +Http::One::Parser::skipLineTerminator(Tokenizer &tok) const { if 
(tok.skip(Http1::CrLf())) - return true; + return; if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF)) - return true; + return; if (tok.atEnd() || (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r')) - return false; // need more data + throw InsufficientInput(); throw TexcHere("garbage instead of CRLF line terminator"); - return false; // unreachable, but make naive compilers happy } /// all characters except the LF line terminator @@ -102,7 +102,7 @@ LineCharacters() void Http::One::Parser::cleanMimePrefix() { - Http1::Tokenizer tok(mimeHeaderBlock_); + Tokenizer tok(mimeHeaderBlock_); while (tok.skipOne(RelaxedDelimiterCharacters())) { (void)tok.skipAll(LineCharacters()); // optional line content // LF terminator is required. @@ -137,7 +137,7 @@ Http::One::Parser::cleanMimePrefix() void Http::One::Parser::unfoldMime() { - Http1::Tokenizer tok(mimeHeaderBlock_); + Tokenizer tok(mimeHeaderBlock_); const auto szLimit = mimeHeaderBlock_.length(); mimeHeaderBlock_.clear(); // prevent the mime sender being able to make append() realloc/grow multiple times. 
@@ -228,7 +228,7 @@ Http::One::Parser::getHostHeaderField() debugs(25, 5, "looking for " << name); // while we can find more LF in the SBuf - Http1::Tokenizer tok(mimeHeaderBlock_); + Tokenizer tok(mimeHeaderBlock_); SBuf p; while (tok.prefix(p, LineCharacters())) { @@ -250,7 +250,7 @@ Http::One::Parser::getHostHeaderField() p.consume(namelen + 1); // TODO: optimize SBuf::trim to take CharacterSet directly - Http1::Tokenizer t(p); + Tokenizer t(p); t.skipAll(CharacterSet::WSP); p = t.remaining(); @@ -278,10 +278,15 @@ Http::One::ErrorLevel() } // BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule -bool -Http::One::ParseBws(Tokenizer &tok) +void +Http::One::ParseBws(Parser::Tokenizer &tok) { - if (const auto count = tok.skipAll(Parser::WhitespaceCharacters())) { + const auto count = tok.skipAll(Parser::WhitespaceCharacters()); + + if (tok.atEnd()) + throw InsufficientInput(); // even if count is positive + + if (count) { // Generating BWS is a MUST-level violation so warn about it as needed. 
debugs(33, ErrorLevel(), "found " << count << " BWS octets"); // RFC 7230 says we MUST parse BWS, so we fall through even if @@ -289,6 +294,6 @@ Http::One::ParseBws(Tokenizer &tok) } // else we successfully "parsed" an empty BWS sequence - return true; + // success: no more BWS characters expected } diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h index 58a5cae..40e281b 100644 --- a/src/http/one/Parser.h +++ b/src/http/one/Parser.h @@ -12,6 +12,7 @@ #include "anyp/ProtocolVersion.h" #include "http/one/forward.h" #include "http/StatusCode.h" +#include "parser/forward.h" #include "sbuf/SBuf.h" namespace Http { @@ -40,6 +41,7 @@ class Parser : public RefCountable { public: typedef SBuf::size_type size_type; + typedef ::Parser::Tokenizer Tokenizer; Parser() : parseStatusCode(Http::scNone), parsingStage_(HTTP_PARSE_NONE), hackExpectsMime_(false) {} virtual ~Parser() {} @@ -118,11 +120,11 @@ protected: * detect and skip the CRLF or (if tolerant) LF line terminator * consume from the tokenizer. * - * throws if non-terminator is detected. + * \throws exception on bad or InsuffientInput. * \retval true only if line terminator found. * \retval false incomplete or missing line terminator, need more data. */ - bool skipLineTerminator(Http1::Tokenizer &tok) const; + void skipLineTerminator(Tokenizer &) const; /** * Scan to find the mime headers block for current message. 
@@ -159,8 +161,8 @@ private: }; /// skips and, if needed, warns about RFC 7230 BWS ("bad" whitespace) -/// \returns true (always; unlike all the skip*() functions) -bool ParseBws(Tokenizer &tok); +/// \throws InsufficientInput when the end of BWS cannot be confirmed +void ParseBws(Parser::Tokenizer &); /// the right debugs() level for logging HTTP violation messages int ErrorLevel(); diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc index a325f7d..0f13c92 100644 --- a/src/http/one/RequestParser.cc +++ b/src/http/one/RequestParser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/RequestParser.h" -#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" +#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -64,7 +64,7 @@ Http::One::RequestParser::skipGarbageLines() * RFC 7230 section 2.6, 3.1 and 3.5 */ bool -Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok) +Http::One::RequestParser::parseMethodField(Tokenizer &tok) { // method field is a sequence of TCHAR. // Limit to 32 characters to prevent overly long sequences of non-HTTP @@ -145,7 +145,7 @@ Http::One::RequestParser::RequestTargetCharacters() } bool -Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok) +Http::One::RequestParser::parseUriField(Tokenizer &tok) { /* Arbitrary 64KB URI upper length limit. * @@ -178,7 +178,7 @@ Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok) } bool -Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok) +Http::One::RequestParser::parseHttpVersionField(Tokenizer &tok) { static const SBuf http1p0("HTTP/1.0"); static const SBuf http1p1("HTTP/1.1"); @@ -253,7 +253,7 @@ Http::One::RequestParser::skipDelimiter(const size_t count, const char *where) /// Parse CRs at the end of request-line, just before the terminating LF. 
bool -Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer &tok) +Http::One::RequestParser::skipTrailingCrs(Tokenizer &tok) { if (Config.onoff.relaxed_header_parser) { (void)tok.skipAllTrailing(CharacterSet::CR); // optional; multiple OK @@ -289,12 +289,12 @@ Http::One::RequestParser::parseRequestFirstLine() // Earlier, skipGarbageLines() took care of any leading LFs (if allowed). // Now, the request line has to end at the first LF. static const CharacterSet lineChars = CharacterSet::LF.complement("notLF"); - ::Parser::Tokenizer lineTok(buf_); + Tokenizer lineTok(buf_); if (!lineTok.prefix(line, lineChars) || !lineTok.skip('\n')) { if (buf_.length() >= Config.maxRequestHeaderSize) { /* who should we blame for our failure to parse this line? */ - Http1::Tokenizer methodTok(buf_); + Tokenizer methodTok(buf_); if (!parseMethodField(methodTok)) return -1; // blame a bad method (or its delimiter) @@ -308,7 +308,7 @@ Http::One::RequestParser::parseRequestFirstLine() return 0; } - Http1::Tokenizer tok(line); + Tokenizer tok(line); if (!parseMethodField(tok)) return -1; diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h index 7086548..26697cd 100644 --- a/src/http/one/RequestParser.h +++ b/src/http/one/RequestParser.h @@ -54,11 +54,11 @@ private: bool doParse(const SBuf &aBuf); /* all these return false and set parseStatusCode on parsing failures */ - bool parseMethodField(Http1::Tokenizer &); - bool parseUriField(Http1::Tokenizer &); - bool parseHttpVersionField(Http1::Tokenizer &); + bool parseMethodField(Tokenizer &); + bool parseUriField(Tokenizer &); + bool parseHttpVersionField(Tokenizer &); bool skipDelimiter(const size_t count, const char *where); - bool skipTrailingCrs(Http1::Tokenizer &tok); + bool skipTrailingCrs(Tokenizer &tok); bool http0() const {return !msgProtocol_.major;} static const CharacterSet &RequestTargetCharacters(); diff --git a/src/http/one/ResponseParser.cc b/src/http/one/ResponseParser.cc index 24af849..65baf09 
100644 --- a/src/http/one/ResponseParser.cc +++ b/src/http/one/ResponseParser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/ResponseParser.h" -#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" +#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -47,7 +47,7 @@ Http::One::ResponseParser::firstLineSize() const // NP: we found the protocol version and consumed it already. // just need the status code and reason phrase int -Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim) +Http::One::ResponseParser::parseResponseStatusAndReason(Tokenizer &tok, const CharacterSet &WspDelim) { if (!completedStatus_) { debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "..."); @@ -87,14 +87,13 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c static const CharacterSet phraseChars = CharacterSet::WSP + CharacterSet::VCHAR + CharacterSet::OBSTEXT; (void)tok.prefix(reasonPhrase_, phraseChars); // optional, no error if missing try { - if (skipLineTerminator(tok)) { - debugs(74, DBG_DATA, "parse remaining buf={length=" << tok.remaining().length() << ", data='" << tok.remaining() << "'}"); - buf_ = tok.remaining(); // resume checkpoint - return 1; - } + skipLineTerminator(tok); + buf_ = tok.remaining(); // resume checkpoint + debugs(74, DBG_DATA, Raw("leftovers", buf_.rawContent(), buf_.length())); + return 1; + } catch (const InsufficientInput &) { reasonPhrase_.clear(); return 0; // need more to be sure we have it all - } catch (const std::exception &ex) { debugs(74, 6, "invalid status-line: " << ex.what()); } @@ -119,7 +118,7 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c int Http::One::ResponseParser::parseResponseFirstLine() { - Http1::Tokenizer tok(buf_); + Tokenizer tok(buf_); const CharacterSet &WspDelim = DelimiterCharacters(); diff --git 
a/src/http/one/ResponseParser.h b/src/http/one/ResponseParser.h index 15db4a0..cf13b4d 100644 --- a/src/http/one/ResponseParser.h +++ b/src/http/one/ResponseParser.h @@ -43,7 +43,7 @@ public: private: int parseResponseFirstLine(); - int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &); + int parseResponseStatusAndReason(Tokenizer&, const CharacterSet &); /// magic prefix for identifying ICY response messages static const SBuf IcyMagic; diff --git a/src/http/one/TeChunkedParser.cc b/src/http/one/TeChunkedParser.cc index 754086e..6d2f8ea 100644 --- a/src/http/one/TeChunkedParser.cc +++ b/src/http/one/TeChunkedParser.cc @@ -13,10 +13,13 @@ #include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" #include "MemBuf.h" +#include "parser/Tokenizer.h" #include "Parsing.h" +#include "sbuf/Stream.h" #include "SquidConfig.h" -Http::One::TeChunkedParser::TeChunkedParser() +Http::One::TeChunkedParser::TeChunkedParser(): + customExtensionValueParser(nullptr) { // chunked encoding only exists in HTTP/1.1 Http1::Parser::msgProtocol_ = Http::ProtocolVersion(1,1); @@ -31,7 +34,11 @@ Http::One::TeChunkedParser::clear() buf_.clear(); theChunkSize = theLeftBodySize = 0; theOut = NULL; - useOriginBody = -1; + // XXX: We do not reset customExtensionValueParser here. Based on the + // clear() API description, we must, but it makes little sense and could + // break method callers if they appear because some of them may forget to + // reset customExtensionValueParser. TODO: Remove Http1::Parser as our + // parent class and this unnecessary method with it. 
} bool @@ -49,14 +56,14 @@ Http::One::TeChunkedParser::parse(const SBuf &aBuf) if (parsingStage_ == Http1::HTTP_PARSE_NONE) parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; - Http1::Tokenizer tok(buf_); + Tokenizer tok(buf_); // loop for as many chunks as we can // use do-while instead of while so that we can incrementally // restart in the middle of a chunk/frame do { - if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize)) + if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkMetadataSuffix(tok)) return false; if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok)) @@ -80,7 +87,7 @@ Http::One::TeChunkedParser::needsMoreSpace() const /// RFC 7230 section 4.1 chunk-size bool -Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkSize(Tokenizer &tok) { Must(theChunkSize <= 0); // Should(), really @@ -104,66 +111,75 @@ Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok) return false; // should not be reachable } -/** - * Parses chunk metadata suffix, looking for interesting extensions and/or - * getting to the line terminator. RFC 7230 section 4.1.1 and its Errata #4667: - * - * chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) - * chunk-ext-name = token - * chunk-ext-val = token / quoted-string - * - * ICAP 'use-original-body=N' extension is supported. 
- */ +/// Parses "[chunk-ext] CRLF" from RFC 7230 section 4.1.1: +/// chunk = chunk-size [ chunk-ext ] CRLF chunk-data CRLF +/// last-chunk = 1*"0" [ chunk-ext ] CRLF bool -Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown) +Http::One::TeChunkedParser::parseChunkMetadataSuffix(Tokenizer &tok) { - SBuf ext; - SBuf value; - while ( - ParseBws(tok) && // Bug 4492: IBM_HTTP_Server sends SP after chunk-size - tok.skip(';') && - ParseBws(tok) && // Bug 4492: ICAP servers send SP before chunk-ext-name - tok.prefix(ext, CharacterSet::TCHAR)) { // chunk-ext-name - - // whole value part is optional. if no '=' expect next chunk-ext - if (ParseBws(tok) && tok.skip('=') && ParseBws(tok)) { - - if (!skipKnown) { - if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) { - debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody); - buf_ = tok.remaining(); // parse checkpoint - continue; - } - } - - debugs(94, 5, "skipping unknown chunk extension " << ext); - - // unknown might have a value token or quoted-string - if (tok.quotedStringOrToken(value) && !tok.atEnd()) { - buf_ = tok.remaining(); // parse checkpoint - continue; - } - - // otherwise need more data OR corrupt syntax - break; - } - - if (!tok.atEnd()) - buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name) - } - - if (skipLineTerminator(tok)) { - buf_ = tok.remaining(); // checkpoint - // non-0 chunk means data, 0-size means optional Trailer follows + // Code becomes much simpler when incremental parsing functions throw on + // bad or insufficient input, like in the code below. TODO: Expand up. + try { + parseChunkExtensions(tok); // a possibly empty chunk-ext list + skipLineTerminator(tok); + buf_ = tok.remaining(); parsingStage_ = theChunkSize ? 
Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME; return true; + } catch (const InsufficientInput &) { + tok.reset(buf_); // backtrack to the last commit point + return false; } + // other exceptions bubble up to kill message parsing +} - return false; +/// Parses the chunk-ext list (RFC 7230 section 4.1.1 and its Errata #4667): +/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) +void +Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &tok) +{ + do { + ParseBws(tok); // Bug 4492: IBM_HTTP_Server sends SP after chunk-size + + if (!tok.skip(';')) + return; // reached the end of extensions (if any) + + parseOneChunkExtension(tok); + buf_ = tok.remaining(); // got one extension + } while (true); +} + +void +Http::One::ChunkExtensionValueParser::Ignore(Tokenizer &tok, const SBuf &extName) +{ + const auto ignoredValue = tokenOrQuotedString(tok); + debugs(94, 5, extName << " with value " << ignoredValue); +} + +/// Parses a single chunk-ext list element: +/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) +void +Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &tok) +{ + ParseBws(tok); // Bug 4492: ICAP servers send SP before chunk-ext-name + + const auto extName = tok.prefix("chunk-ext-name", CharacterSet::TCHAR); + + ParseBws(tok); + + if (!tok.skip('=')) + return; // parsed a valueless chunk-ext + + ParseBws(tok); + + // optimization: the only currently supported extension needs last-chunk + if (!theChunkSize && customExtensionValueParser) + customExtensionValueParser->parse(tok, extName); + else + ChunkExtensionValueParser::Ignore(tok, extName); } bool -Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkBody(Tokenizer &tok) { if (theLeftBodySize > 0) { buf_ = tok.remaining(); // sync buffers before buf_ use @@ -188,17 +204,20 @@ Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok) } bool 
-Http::One::TeChunkedParser::parseChunkEnd(Http1::Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkEnd(Tokenizer &tok) { Must(theLeftBodySize == 0); // Should(), really - if (skipLineTerminator(tok)) { + try { + skipLineTerminator(tok); buf_ = tok.remaining(); // parse checkpoint theChunkSize = 0; // done with the current chunk parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; return true; } - - return false; + catch (const InsufficientInput &) { + return false; + } + // other exceptions bubble up to kill message parsing } diff --git a/src/http/one/TeChunkedParser.h b/src/http/one/TeChunkedParser.h index 1b0319e..2ca8988 100644 --- a/src/http/one/TeChunkedParser.h +++ b/src/http/one/TeChunkedParser.h @@ -18,6 +18,26 @@ namespace Http namespace One { +using ::Parser::InsufficientInput; + +// TODO: Move this class into http/one/ChunkExtensionValueParser.* +/// A customizable parser of a single chunk extension value (chunk-ext-val). +/// From RFC 7230 section 4.1.1 and its Errata #4667: +/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) +/// chunk-ext-name = token +/// chunk-ext-val = token / quoted-string +class ChunkExtensionValueParser +{ +public: + typedef ::Parser::Tokenizer Tokenizer; + + /// extracts and ignores the value of a named extension + static void Ignore(Tokenizer &tok, const SBuf &extName); + + /// extracts and then interprets (or ignores) the extension value + virtual void parse(Tokenizer &tok, const SBuf &extName) = 0; +}; + /** * An incremental parser for chunked transfer coding * defined in RFC 7230 section 4.1. @@ -25,7 +45,7 @@ namespace One * * The parser shovels content bytes from the raw * input buffer into the content output buffer, both caller-supplied. - * Ignores chunk extensions except for ICAP's ieof. + * Chunk extensions like use-original-body are handled via parseExtensionValuesWith(). * Trailers are available via mimeHeader() if wanted. 
*/ class TeChunkedParser : public Http1::Parser @@ -37,6 +57,10 @@ public: /// set the buffer to be used to store decoded chunk data void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;} + /// Instead of ignoring all chunk extension values, give the supplied + /// parser a chance to handle them. Only applied to last-chunk (for now). + void parseExtensionValuesWith(ChunkExtensionValueParser *parser) { customExtensionValueParser = parser; } + bool needsMoreSpace() const; /* Http1::Parser API */ @@ -45,17 +69,20 @@ public: virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks private: - bool parseChunkSize(Http1::Tokenizer &tok); - bool parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown); - bool parseChunkBody(Http1::Tokenizer &tok); - bool parseChunkEnd(Http1::Tokenizer &tok); + bool parseChunkSize(Tokenizer &tok); + bool parseChunkMetadataSuffix(Tokenizer &); + void parseChunkExtensions(Tokenizer &); + void parseOneChunkExtension(Tokenizer &); + bool parseChunkBody(Tokenizer &tok); + bool parseChunkEnd(Tokenizer &tok); MemBuf *theOut; uint64_t theChunkSize; uint64_t theLeftBodySize; -public: - int64_t useOriginBody; + /// An optional plugin for parsing and interpreting custom chunk-ext-val. + /// This "visitor" object is owned by our creator. 
+ ChunkExtensionValueParser *customExtensionValueParser; }; } // namespace One diff --git a/src/http/one/Tokenizer.cc b/src/http/one/Tokenizer.cc index 804b8e1..3a6bef3 100644 --- a/src/http/one/Tokenizer.cc +++ b/src/http/one/Tokenizer.cc @@ -8,35 +8,18 @@ #include "squid.h" #include "Debug.h" +#include "http/one/Parser.h" #include "http/one/Tokenizer.h" - -bool -Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0) -{ - checkpoint(); - - if (!skip('"')) - return false; - - return qdText(returnedToken, http1p0); -} - -bool -Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0) +#include "parser/Tokenizer.h" +#include "sbuf/Stream.h" + +/// Extracts quoted-string after the caller removes the initial '"'. +/// \param http1p0 whether to prohibit \-escaped characters in quoted strings +/// \throws InsufficientInput when input can be a token _prefix_ +/// \returns extracted quoted string (without quotes and with chars unescaped) +static SBuf +parseQuotedStringSuffix(Parser::Tokenizer &tok, const bool http1p0) { - checkpoint(); - - if (!skip('"')) - return prefix(returnedToken, CharacterSet::TCHAR); - - return qdText(returnedToken, http1p0); -} - -bool -Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) -{ - // the initial DQUOTE has been skipped by the caller - /* * RFC 1945 - defines qdtext: * inclusive of LWS (which includes CR and LF) @@ -61,12 +44,17 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) // best we can do is a conditional reference since http1p0 value may change per-client const CharacterSet &tokenChars = (http1p0 ? 
qdtext1p0 : qdtext1p1); - for (;;) { - SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars); - returnedToken.append(consume(prefixLen)); + SBuf parsedToken; + + while (!tok.atEnd()) { + SBuf qdText; + if (tok.prefix(qdText, tokenChars)) + parsedToken.append(qdText); + + if (!http1p0 && tok.skip('\\')) { // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not + if (tok.atEnd()) + break; - // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not - if (!http1p0 && skip('\\')) { /* RFC 7230 section 3.2.6 * * The backslash octet ("\") can be used as a single-octet quoting @@ -78,32 +66,42 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) */ static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT; SBuf escaped; - if (!prefix(escaped, qPairChars, 1)) { - returnedToken.clear(); - restoreLastCheckpoint(); - return false; - } - returnedToken.append(escaped); + if (!tok.prefix(escaped, qPairChars, 1)) + throw TexcHere("invalid escaped character in quoted-pair"); + + parsedToken.append(escaped); continue; + } - } else if (skip('"')) { - break; // done + if (tok.skip('"')) + return parsedToken; // may be empty - } else if (atEnd()) { - // need more data - returnedToken.clear(); - restoreLastCheckpoint(); - return false; - } + if (tok.atEnd()) + break; - // else, we have an error - debugs(24, 8, "invalid bytes for set " << tokenChars.name); - returnedToken.clear(); - restoreLastCheckpoint(); - return false; + throw TexcHere(ToSBuf("invalid bytes for set ", tokenChars.name)); } - // found the whole string - return true; + throw Http::One::InsufficientInput(); +} + +SBuf +Http::One::tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0) +{ + if (tok.skip('"')) + return parseQuotedStringSuffix(tok, http1p0); + + if (tok.atEnd()) + throw InsufficientInput(); + + SBuf parsedToken; + if (!tok.prefix(parsedToken, CharacterSet::TCHAR)) + throw TexcHere("invalid input while expecting an HTTP 
token"); + + if (tok.atEnd()) + throw InsufficientInput(); + + // got the complete token + return parsedToken; } diff --git a/src/http/one/Tokenizer.h b/src/http/one/Tokenizer.h index 658875f..2d40574 100644 --- a/src/http/one/Tokenizer.h +++ b/src/http/one/Tokenizer.h @@ -9,68 +9,47 @@ #ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H #define SQUID_SRC_HTTP_ONE_TOKENIZER_H -#include "parser/Tokenizer.h" +#include "parser/forward.h" +#include "sbuf/forward.h" namespace Http { namespace One { /** - * Lexical processor extended to tokenize HTTP/1.x syntax. + * Extracts either an HTTP/1 token or quoted-string while dealing with + * possibly incomplete input typical for incremental text parsers. + * Unescapes escaped characters in HTTP/1.1 quoted strings. * - * \see ::Parser::Tokenizer for more detail + * \param http1p0 whether to prohibit \-escaped characters in quoted strings + * \throws InsufficientInput as appropriate, including on unterminated tokens + * \returns extracted token or quoted string (without quotes) + * + * Governed by: + * - RFC 1945 section 2.1 + * " + * A string of text is parsed as a single word if it is quoted using + * double-quote marks. + * + * quoted-string = ( <"> *(qdtext) <"> ) + * + * qdtext = <any CHAR except <"> and CTLs, + * but including LWS> + * + * Single-character quoting using the backslash ("\") character is not + * permitted in HTTP/1.0. + * " + * + * - RFC 7230 section 3.2.6 + * " + * A string of text is parsed as a single value if it is quoted using + * double-quote marks. + * + * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE + * qdtext = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text + * obs-text = %x80-FF + * " */ -class Tokenizer : public ::Parser::Tokenizer -{ -public: - Tokenizer(SBuf &s) : ::Parser::Tokenizer(s), savedStats_(0) {} - - /** - * Attempt to parse a quoted-string lexical construct. - * - * Governed by: - * - RFC 1945 section 2.1 - * " - * A string of text is parsed as a single word if it is quoted using - * double-quote marks. 
- * - * quoted-string = ( <"> *(qdtext) <"> ) - * - * qdtext = <any CHAR except <"> and CTLs, - * but including LWS> - * - * Single-character quoting using the backslash ("\") character is not - * permitted in HTTP/1.0. - * " - * - * - RFC 7230 section 3.2.6 - * " - * A string of text is parsed as a single value if it is quoted using - * double-quote marks. - * - * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE - * qdtext = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text - * obs-text = %x80-FF - * " - * - * \param escaped HTTP/1.0 does not permit \-escaped characters - */ - bool quotedString(SBuf &value, const bool http1p0 = false); - - /** - * Attempt to parse a (token / quoted-string ) lexical construct. - */ - bool quotedStringOrToken(SBuf &value, const bool http1p0 = false); - -private: - /// parse the internal component of a quote-string, and terminal DQUOTE - bool qdText(SBuf &value, const bool http1p0); - - void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); } - void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); } - - SBuf savedCheckpoint_; - SBuf::size_type savedStats_; -}; +SBuf tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0 = false); } // namespace One } // namespace Http diff --git a/src/http/one/forward.h b/src/http/one/forward.h index c90dc34..2b4ad28 100644 --- a/src/http/one/forward.h +++ b/src/http/one/forward.h @@ -10,6 +10,7 @@ #define SQUID_SRC_HTTP_ONE_FORWARD_H #include "base/RefCount.h" +#include "parser/forward.h" #include "sbuf/forward.h" namespace Http { @@ -31,6 +32,8 @@ typedef RefCount<Http::One::ResponseParser> ResponseParserPointer; /// CRLF textual representation const SBuf &CrLf(); +using ::Parser::InsufficientInput; + } // namespace One } // namespace Http diff --git a/src/parser/BinaryTokenizer.h b/src/parser/BinaryTokenizer.h index acebd4d..24042d4 100644 --- a/src/parser/BinaryTokenizer.h +++ b/src/parser/BinaryTokenizer.h @@ -9,6 +9,7 @@ #ifndef 
SQUID_SRC_PARSER_BINARYTOKENIZER_H +#include "parser/forward.h" #include "sbuf/SBuf.h" namespace Parser @@ -44,7 +45,7 @@ public: class BinaryTokenizer { public: - class InsufficientInput {}; // thrown when a method runs out of data + typedef ::Parser::InsufficientInput InsufficientInput; typedef uint64_t size_type; // enough for the largest supported offset BinaryTokenizer(); diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am index af2b759..0daa5a8 100644 --- a/src/parser/Makefile.am +++ b/src/parser/Makefile.am @@ -13,6 +13,7 @@ noinst_LTLIBRARIES = libparser.la libparser_la_SOURCES = \ BinaryTokenizer.h \ BinaryTokenizer.cc \ + forward.h \ Tokenizer.h \ Tokenizer.cc diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc index 7e73e04..68f4aec 100644 --- a/src/parser/Tokenizer.cc +++ b/src/parser/Tokenizer.cc @@ -10,7 +10,9 @@ #include "squid.h" #include "Debug.h" +#include "parser/forward.h" #include "parser/Tokenizer.h" +#include "sbuf/Stream.h" #include #if HAVE_CTYPE_H @@ -96,6 +98,23 @@ Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, c return true; } +SBuf +Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit) +{ + if (atEnd()) + throw InsufficientInput(); + + SBuf result; + + if (!prefix(result, tokenChars, limit)) + throw TexcHere(ToSBuf("cannot parse ", description)); + + if (atEnd()) + throw InsufficientInput(); + + return result; +} + bool Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit) { @@ -283,3 +302,24 @@ Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf: return success(s - range.rawContent()); } +int64_t +Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit) +{ + if (atEnd()) + throw InsufficientInput(); + + int64_t result = 0; + + // Since we only support unsigned decimals, a parsing failure with a + // non-empty input always 
implies invalid/malformed input (or a buggy + // limit=0 caller). TODO: Support signed and non-decimal integers by + // refactoring int64() to detect insufficient input. + if (!int64(result, 10, false, limit)) + throw TexcHere(ToSBuf("cannot parse ", description)); + + if (atEnd()) + throw InsufficientInput(); // more digits may be coming + + return result; +} + diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h index 54414be..03a8388 100644 --- a/src/parser/Tokenizer.h +++ b/src/parser/Tokenizer.h @@ -143,6 +143,19 @@ public: */ bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos); + /* + * The methods below mimic their counterparts documented above, but they + * throw on errors, including InsufficientInput. The field description + * parameter is used for error reporting and debugging. + */ + + /// prefix() wrapper but throws InsufficientInput if input contains + /// nothing but the prefix (i.e. if the prefix is not "terminated") + SBuf prefix(const char *description, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos); + + /// int64() wrapper but limited to unsigned decimal integers (for now) + int64_t udec64(const char *description, SBuf::size_type limit = SBuf::npos); + protected: SBuf consume(const SBuf::size_type n); SBuf::size_type success(const SBuf::size_type n); diff --git a/src/parser/forward.h b/src/parser/forward.h new file mode 100644 index 0000000..5a95b7a --- /dev/null +++ b/src/parser/forward.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 1996-2019 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. 
+ */ + +#ifndef SQUID_PARSER_FORWARD_H +#define SQUID_PARSER_FORWARD_H + +namespace Parser { +class Tokenizer; +class BinaryTokenizer; + +// TODO: Move this declaration (to parser/Elements.h) if we need more like it. +/// thrown by modern "incremental" parsers when they need more data +class InsufficientInput {}; +} // namespace Parser + +#endif /* SQUID_PARSER_FORWARD_H */ + diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc index affe0b1..05591fe 100644 --- a/src/http/one/Parser.cc +++ b/src/http/one/Parser.cc @@ -65,16 +65,10 @@ Http::One::Parser::DelimiterCharacters() void Http::One::Parser::skipLineTerminator(Tokenizer &tok) const { - if (tok.skip(Http1::CrLf())) - return; - if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF)) return; - if (tok.atEnd() || (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r')) - throw InsufficientInput(); - - throw TexcHere("garbage instead of CRLF line terminator"); + tok.skipRequired("line-terminating CRLF", Http1::CrLf()); } /// all characters except the LF line terminator diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h index 40e281b..9a2a4ad 100644 --- a/src/http/one/Parser.h +++ b/src/http/one/Parser.h @@ -120,9 +120,7 @@ protected: * detect and skip the CRLF or (if tolerant) LF line terminator * consume from the tokenizer. * - * \throws exception on bad or InsuffientInput. - * \retval true only if line terminator found. - * \retval false incomplete or missing line terminator, need more data. 
+ * \throws exception on bad or InsufficientInput */ void skipLineTerminator(Tokenizer &) const; diff --git a/src/http/one/TeChunkedParser.cc b/src/http/one/TeChunkedParser.cc index 6d2f8ea..3bff6c7 100644 --- a/src/http/one/TeChunkedParser.cc +++ b/src/http/one/TeChunkedParser.cc @@ -91,6 +91,11 @@ Http::One::TeChunkedParser::parseChunkSize(Tokenizer &tok) { Must(theChunkSize <= 0); // Should(), really + static const SBuf bannedHexPrefixLower("0x"); + static const SBuf bannedHexPrefixUpper("0X"); + if (tok.skip(bannedHexPrefixLower) || tok.skip(bannedHexPrefixUpper)) + throw TextException("chunk starts with 0x", Here()); + int64_t size = -1; if (tok.int64(size, 16, false) && !tok.atEnd()) { if (size < 0) @@ -121,7 +126,7 @@ Http::One::TeChunkedParser::parseChunkMetadataSuffix(Tokenizer &tok) // bad or insufficient input, like in the code below. TODO: Expand up. try { parseChunkExtensions(tok); // a possibly empty chunk-ext list - skipLineTerminator(tok); + tok.skipRequired("CRLF after [chunk-ext]", Http1::CrLf()); buf_ = tok.remaining(); parsingStage_ = theChunkSize ? 
Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME; return true; @@ -132,12 +137,14 @@ Http::One::TeChunkedParser::parseChunkMetadataSuffix(Tokenizer &tok) // other exceptions bubble up to kill message parsing } -/// Parses the chunk-ext list (RFC 7230 section 4.1.1 and its Errata #4667): +/// Parses the chunk-ext list (RFC 9112 section 7.1.1: /// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) void -Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &callerTok) { do { + auto tok = callerTok; + ParseBws(tok); // Bug 4492: IBM_HTTP_Server sends SP after chunk-size if (!tok.skip(';')) @@ -145,6 +152,7 @@ Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &tok) parseOneChunkExtension(tok); buf_ = tok.remaining(); // got one extension + callerTok = tok; } while (true); } @@ -158,11 +166,14 @@ Http::One::ChunkExtensionValueParser::Ignore(Tokenizer &tok, const SBuf &extName /// Parses a single chunk-ext list element: /// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) void -Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &tok) +Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &callerTok) { + auto tok = callerTok; + ParseBws(tok); // Bug 4492: ICAP servers send SP before chunk-ext-name const auto extName = tok.prefix("chunk-ext-name", CharacterSet::TCHAR); + callerTok = tok; // in case we determine that this is a valueless chunk-ext ParseBws(tok); @@ -176,6 +187,8 @@ Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &tok) customExtensionValueParser->parse(tok, extName); else ChunkExtensionValueParser::Ignore(tok, extName); + + callerTok = tok; } bool @@ -209,7 +222,7 @@ Http::One::TeChunkedParser::parseChunkEnd(Tokenizer &tok) Must(theLeftBodySize == 0); // Should(), really try { - skipLineTerminator(tok); + tok.skipRequired("chunk CRLF", Http1::CrLf()); buf_ = tok.remaining(); // parse checkpoint 
theChunkSize = 0; // done with the current chunk parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc index 68f4aec..8516869 100644 --- a/src/parser/Tokenizer.cc +++ b/src/parser/Tokenizer.cc @@ -147,6 +147,18 @@ Parser::Tokenizer::skipAll(const CharacterSet &tokenChars) return success(prefixLen); } +void +Parser::Tokenizer::skipRequired(const char *description, const SBuf &tokenToSkip) +{ + if (skip(tokenToSkip) || tokenToSkip.isEmpty()) + return; + + if (tokenToSkip.startsWith(buf_)) + throw InsufficientInput(); + + throw TextException(ToSBuf("cannot skip ", description), Here()); +} + bool Parser::Tokenizer::skipOne(const CharacterSet &chars) { diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h index 03a8388..78ab9e7 100644 --- a/src/parser/Tokenizer.h +++ b/src/parser/Tokenizer.h @@ -115,6 +115,13 @@ public: */ SBuf::size_type skipAll(const CharacterSet &discardables); + /** skips a given character sequence (string); + * does nothing if the sequence is empty + * + * \throws exception on mismatching prefix or InsufficientInput + */ + void skipRequired(const char *description, const SBuf &tokenToSkip); + /** Removes a single trailing character from the set. * * \return whether a character was removed