From 9c69597d5add6fe64652f0c3c4de6b4878b5a939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miro=20Hron=C4=8Dok?= Date: Tue, 16 Dec 2025 12:54:12 +0100 Subject: [PATCH 1/5] Add Python 3.12 to nox config --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 32366a57..3503de98 100644 --- a/noxfile.py +++ b/noxfile.py @@ -45,7 +45,7 @@ def tests_impl(session, extras="socks,secure,brotli"): ) -@nox.session(python=["2.7", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy"]) +@nox.session(python=["2.7", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy"]) def test(session): tests_impl(session) -- 2.52.0 From 033eb797df2bc488148c8d267d2c9fab1d802487 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Mon, 14 Nov 2022 16:54:47 +0400 Subject: [PATCH 2/5] Standardize HTTPResponse.read(X) behavior regardless of compression Co-authored-by: Franek Magiera (cherry picked from commit c35033f6cc54106ca66ef2d48a9e3564d4fb0e07) --- changelog/2128.removal.rst | 7 + src/urllib3/response.py | 167 +++++++++++++++++----- test/test_response.py | 162 ++++++++++++++++++--- test/with_dummyserver/test_socketlevel.py | 9 +- 4 files changed, 278 insertions(+), 67 deletions(-) create mode 100644 changelog/2128.removal.rst diff --git a/changelog/2128.removal.rst b/changelog/2128.removal.rst new file mode 100644 index 00000000..abf412f9 --- /dev/null +++ b/changelog/2128.removal.rst @@ -0,0 +1,7 @@ +Standardized :meth:`~urllib3.response.HTTPResponse.read` to respect the semantics of BufferedIOBase regardless of compression. Specifically, this method: + +* only returns an empty bytes object to indicate EOF (that is, the response has been fully consumed), +* never returns more bytes than requested, +* can issue any number of system calls: zero, one or multiple. + +If you want each :meth:`~urllib3.response.HTTPResponse.read` call to issue a single system call, you need to disable decompression by setting ``decode_content=False``. diff --git a/src/urllib3/response.py b/src/urllib3/response.py index 0bd13d40..9793de48 100644 --- a/src/urllib3/response.py +++ b/src/urllib3/response.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +import collections import io import logging import sys @@ -160,6 +161,63 @@ def _get_decoder(mode): return DeflateDecoder() +class BytesQueueBuffer: + """Memory-efficient bytes buffer + + To return decoded data in read() and still follow the BufferedIOBase API, we need a + buffer to always return the correct amount of bytes. + + This buffer should be filled using calls to put() + + Our maximum memory usage is determined by the sum of the size of: + + * self.buffer, which contains the full data + * the largest chunk that we will copy in get() + + The worst case scenario is a single chunk, in which case we'll make a full copy of + the data inside get(). 
+ """ + + def __init__(self): + self.buffer = collections.deque() + self._size = 0 + + def __len__(self): + return self._size + + def put(self, data): + self.buffer.append(data) + self._size += len(data) + + def get(self, n): + if not self.buffer: + raise RuntimeError("buffer is empty") + elif n < 0: + raise ValueError("n should be > 0") + + fetched = 0 + ret = io.BytesIO() + while fetched < n: + remaining = n - fetched + chunk = self.buffer.popleft() + chunk_length = len(chunk) + if remaining < chunk_length: + left_chunk, right_chunk = chunk[:remaining], chunk[remaining:] + ret.write(left_chunk) + self.buffer.appendleft(right_chunk) + self._size -= remaining + break + else: + ret.write(chunk) + self._size -= chunk_length + fetched += chunk_length + + if not self.buffer: + break + + return ret.getvalue() + + class HTTPResponse(io.IOBase): """ HTTP Response container. @@ -261,6 +319,9 @@ class HTTPResponse(io.IOBase): # Determine length of response self.length_remaining = self._init_length(request_method) + # Used to return the correct amount of bytes for partial read()s + self._decoded_buffer = BytesQueueBuffer() + # If requested, preload the body. if preload_content and not self._body: self._body = self.read(decode_content=decode_content) @@ -532,6 +593,45 @@ class HTTPResponse(io.IOBase): # StringIO doesn't like amt=None return self._fp.read(amt) if amt is not None else self._fp.read() + def _raw_read(self, amt=None): + """ + Reads `amt` of bytes from the socket. + """ + if self._fp is None: + return None # type: ignore[return-value] + + fp_closed = getattr(self._fp, "closed", False) + + with self._error_catcher(): + data = self._fp_read(amt) if not fp_closed else b"" + if amt is not None and amt != 0 and not data: + # Platform-specific: Buggy versions of Python. + # Close the connection when no data is returned + # + # This is redundant to what httplib/http.client _should_ + # already do. However, versions of python released before + # December 15, 2012 (http://bugs.python.org/issue16298) do + # not properly close the connection in all cases. There is + # no harm in redundantly calling close. + self._fp.close() + if ( + self.enforce_content_length + and self.length_remaining is not None + and self.length_remaining != 0 + ): + # This is an edge case that httplib failed to cover due + # to concerns of backward compatibility. We're + # addressing it here to make sure IncompleteRead is + # raised during streaming, so all calls with incorrect + # Content-Length are caught. 
+ raise IncompleteRead(self._fp_bytes_read, self.length_remaining) + + if data: + self._fp_bytes_read += len(data) + if self.length_remaining is not None: + self.length_remaining -= len(data) + return data + def read(self, amt=None, decode_content=None, cache_content=False): """ Similar to :meth:`http.client.HTTPResponse.read`, but with two additional @@ -557,50 +657,43 @@ class HTTPResponse(io.IOBase): if decode_content is None: decode_content = self.decode_content - if self._fp is None: - return + if amt is not None: + cache_content = False + + if len(self._decoded_buffer) >= amt: + return self._decoded_buffer.get(amt) + + data = self._raw_read(amt) flush_decoder = False - fp_closed = getattr(self._fp, "closed", False) + if amt is None: + flush_decoder = True + elif amt != 0 and not data: + flush_decoder = True - with self._error_catcher(): - data = self._fp_read(amt) if not fp_closed else b"" - if amt is None: - flush_decoder = True - else: - cache_content = False - if ( - amt != 0 and not data - ): # Platform-specific: Buggy versions of Python. - # Close the connection when no data is returned - # - # This is redundant to what httplib/http.client _should_ - # already do. However, versions of python released before - # December 15, 2012 (http://bugs.python.org/issue16298) do - # not properly close the connection in all cases. There is - # no harm in redundantly calling close. - self._fp.close() - flush_decoder = True - if self.enforce_content_length and self.length_remaining not in ( - 0, - None, - ): - # This is an edge case that httplib failed to cover due - # to concerns of backward compatibility. We're - # addressing it here to make sure IncompleteRead is - # raised during streaming, so all calls with incorrect - # Content-Length are caught. 
- raise IncompleteRead(self._fp_bytes_read, self.length_remaining) - - if data: - self._fp_bytes_read += len(data) - if self.length_remaining is not None: - self.length_remaining -= len(data) + if not data and len(self._decoded_buffer) == 0: + return data + if amt is None: data = self._decode(data, decode_content, flush_decoder) - if cache_content: self._body = data + else: + # do not waste memory on buffer when not decoding + if not decode_content: + return data + + decoded_data = self._decode(data, decode_content, flush_decoder) + self._decoded_buffer.put(decoded_data) + + while len(self._decoded_buffer) < amt and data: + # TODO make sure to initially read enough data to get past the headers + # For example, the GZ file header takes 10 bytes, we don't want to read + # it one byte at a time + data = self._raw_read(amt) + decoded_data = self._decode(data, decode_content, flush_decoder) + self._decoded_buffer.put(decoded_data) + data = self._decoded_buffer.get(amt) return data diff --git a/test/test_response.py b/test/test_response.py index e09e3858..44a630af 100644 --- a/test/test_response.py +++ b/test/test_response.py @@ -4,6 +4,7 @@ import contextlib import re import socket import ssl +import sys import zlib from base64 import b64decode from io import BufferedReader, BytesIO, TextIOWrapper @@ -25,10 +26,60 @@ from urllib3.exceptions import ( httplib_IncompleteRead, ) from urllib3.packages.six.moves import http_client as httplib -from urllib3.response import HTTPResponse, brotli +from urllib3.response import HTTPResponse, BytesQueueBuffer, brotli from urllib3.util.response import is_fp_closed from urllib3.util.retry import RequestHistory, Retry + +class TestBytesQueueBuffer: + def test_single_chunk(self): + buffer = BytesQueueBuffer() + assert len(buffer) == 0 + with pytest.raises(RuntimeError, match="buffer is empty"): + assert buffer.get(10) + + buffer.put(b"foo") + with pytest.raises(ValueError, match="n should be > 0"): + buffer.get(-1) + + assert buffer.get(1) == b"f" + assert buffer.get(2) == b"oo" + with pytest.raises(RuntimeError, match="buffer is empty"): + assert buffer.get(10) + + def test_read_too_much(self): + buffer = BytesQueueBuffer() + buffer.put(b"foo") + assert buffer.get(100) == b"foo" + + def test_multiple_chunks(self): + buffer = BytesQueueBuffer() + buffer.put(b"foo") + buffer.put(b"bar") + buffer.put(b"baz") + assert len(buffer) == 9 + + assert buffer.get(1) == b"f" + assert len(buffer) == 8 + assert buffer.get(4) == b"ooba" + assert len(buffer) == 4 + assert buffer.get(4) == b"rbaz" + assert len(buffer) == 0 + + @pytest.mark.skipif( + sys.version_info < (3, 8), reason="pytest-memray requires Python 3.8+" + ) + @pytest.mark.limit_memory("12.5 MB") # assert that we're not doubling memory usage + def test_memory_usage(self): + # Allocate 10 1MiB chunks + buffer = BytesQueueBuffer() + for i in range(10): + # This allocates 2MiB, putting the max at around 12MiB. Not sure why. 
+ buffer.put(bytes(2**20)) + + assert len(buffer.get(10 * 2**20)) == 10 * 2**20 + + # A known random (i.e, not-too-compressible) payload generated with: # "".join(random.choice(string.printable) for i in xrange(512)) # .encode("zlib").encode("base64") @@ -76,9 +127,19 @@ class TestLegacyResponse(object): class TestResponse(object): def test_cache_content(self): - r = HTTPResponse("foo") - assert r.data == "foo" - assert r._body == "foo" + r = HTTPResponse(b"foo") + assert r._body == b"foo" + assert r.data == b"foo" + assert r._body == b"foo" + + def test_cache_content_preload_false(self): + fp = BytesIO(b"foo") + r = HTTPResponse(fp, preload_content=False) + + assert not r._body + assert r.data == b"foo" + assert r._body == b"foo" + assert r.data == b"foo" def test_default(self): r = HTTPResponse() @@ -143,12 +204,7 @@ class TestResponse(object): fp, headers={"content-encoding": "deflate"}, preload_content=False ) - assert r.read(3) == b"" - # Buffer in case we need to switch to the raw stream - assert r._decoder._data is not None assert r.read(1) == b"f" - # Now that we've decoded data, we just stream through the decoder - assert r._decoder._data is None assert r.read(2) == b"oo" assert r.read() == b"" assert r.read() == b"" @@ -163,10 +219,7 @@ class TestResponse(object): fp, headers={"content-encoding": "deflate"}, preload_content=False ) - assert r.read(1) == b"" assert r.read(1) == b"f" - # Once we've decoded data, we just stream to the decoder; no buffering - assert r._decoder._data is None assert r.read(2) == b"oo" assert r.read() == b"" assert r.read() == b"" @@ -181,7 +234,6 @@ class TestResponse(object): fp, headers={"content-encoding": "gzip"}, preload_content=False ) - assert r.read(11) == b"" assert r.read(1) == b"f" assert r.read(2) == b"oo" assert r.read() == b"" @@ -295,6 +347,23 @@ class TestResponse(object): assert r.data == b"foo" + def test_read_multi_decoding_deflate_deflate(self): + msg = b"foobarbaz" * 42 + data = zlib.compress(zlib.compress(msg)) + + fp = BytesIO(data) + r = HTTPResponse( + fp, headers={"content-encoding": "deflate, deflate"}, preload_content=False + ) + + assert r.read(3) == b"foo" + assert r.read(3) == b"bar" + assert r.read(3) == b"baz" + assert r.read(9) == b"foobarbaz" + assert r.read(9 * 3) == b"foobarbaz" * 3 + assert r.read(9 * 37) == b"foobarbaz" * 37 + assert r.read() == b"" + def test_body_blob(self): resp = HTTPResponse(b"foo") assert resp.data == b"foo" @@ -491,8 +560,8 @@ class TestResponse(object): ) stream = resp.stream(2) - assert next(stream) == b"f" - assert next(stream) == b"oo" + assert next(stream) == b"fo" + assert next(stream) == b"o" with pytest.raises(StopIteration): next(stream) @@ -521,6 +590,7 @@ class TestResponse(object): # Ensure that ``tell()`` returns the correct number of bytes when # part-way through streaming compressed content. 
NUMBER_OF_READS = 10 + PART_SIZE = 64 class MockCompressedDataReading(BytesIO): """ @@ -549,7 +619,7 @@ class TestResponse(object): resp = HTTPResponse( fp, headers={"content-encoding": "deflate"}, preload_content=False ) - stream = resp.stream() + stream = resp.stream(PART_SIZE) parts_positions = [(part, resp.tell()) for part in stream] end_of_stream = resp.tell() @@ -564,12 +634,28 @@ class TestResponse(object): assert uncompressed_data == payload # Check that the positions in the stream are correct - expected = [(i + 1) * payload_part_size for i in range(NUMBER_OF_READS)] - assert expected == list(positions) + # It is difficult to determine programatically what the positions + # returned by `tell` will be because the `HTTPResponse.read` method may + # call socket `read` a couple of times if it doesn't have enough data + # in the buffer or not call socket `read` at all if it has enough. All + # this depends on the message, how it was compressed, what is + # `PART_SIZE` and `payload_part_size`. + # So for simplicity the expected values are hardcoded. + expected = (92, 184, 230, 276, 322, 368, 414, 460) + assert expected == positions # Check that the end of the stream is in the correct place assert len(ZLIB_PAYLOAD) == end_of_stream + # Check that all parts have expected length + expected_last_part_size = len(uncompressed_data) % PART_SIZE + whole_parts = len(uncompressed_data) // PART_SIZE + if expected_last_part_size == 0: + expected_lengths = [PART_SIZE] * whole_parts + else: + expected_lengths = [PART_SIZE] * whole_parts + [expected_last_part_size] + assert expected_lengths == [len(part) for part in parts] + def test_deflate_streaming(self): data = zlib.compress(b"foo") @@ -579,8 +665,8 @@ class TestResponse(object): ) stream = resp.stream(2) - assert next(stream) == b"f" - assert next(stream) == b"oo" + assert next(stream) == b"fo" + assert next(stream) == b"o" with pytest.raises(StopIteration): next(stream) @@ -595,8 +681,8 @@ class TestResponse(object): ) stream = resp.stream(2) - assert next(stream) == b"f" - assert next(stream) == b"oo" + assert next(stream) == b"fo" + assert next(stream) == b"o" with pytest.raises(StopIteration): next(stream) @@ -608,6 +694,38 @@ class TestResponse(object): with pytest.raises(StopIteration): next(stream) + @pytest.mark.parametrize( + "preload_content, amt", + [(True, None), (False, None), (False, 10 * 2**20)], + ) + @pytest.mark.limit_memory("25 MB") + def test_buffer_memory_usage_decode_one_chunk( + self, preload_content, amt + ): + content_length = 10 * 2**20 # 10 MiB + fp = BytesIO(zlib.compress(bytes(content_length))) + resp = HTTPResponse( + fp, + preload_content=preload_content, + headers={"content-encoding": "deflate"}, + ) + data = resp.data if preload_content else resp.read(amt) + assert len(data) == content_length + + @pytest.mark.parametrize( + "preload_content, amt", + [(True, None), (False, None), (False, 10 * 2**20)], + ) + @pytest.mark.limit_memory("10.5 MB") + def test_buffer_memory_usage_no_decoding( + self, preload_content, amt + ): + content_length = 10 * 2**20 # 10 MiB + fp = BytesIO(bytes(content_length)) + resp = HTTPResponse(fp, preload_content=preload_content, decode_content=False) + data = resp.data if preload_content else resp.read(amt) + assert len(data) == content_length + def test_length_no_header(self): fp = BytesIO(b"12345") resp = HTTPResponse(fp, preload_content=False) diff --git a/test/with_dummyserver/test_socketlevel.py b/test/with_dummyserver/test_socketlevel.py index 254899ef..3309b8aa 100644 --- 
a/test/with_dummyserver/test_socketlevel.py +++ b/test/with_dummyserver/test_socketlevel.py @@ -1866,15 +1866,8 @@ class TestBadContentLength(SocketDummyServerTestCase): "GET", url="/", preload_content=False, enforce_content_length=True ) data = get_response.stream(100) - # Read "good" data before we try to read again. - # This won't trigger till generator is exhausted. - next(data) - try: + with pytest.raises(ProtocolError, match="12 bytes read, 10 more expected"): next(data) - assert False - except ProtocolError as e: - assert "12 bytes read, 10 more expected" in str(e) - done_event.set() def test_enforce_content_length_no_body(self): -- 2.52.0 From 10fb2b10818ee0caf0ee6ac7c02268ae69337a36 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Wed, 3 May 2023 15:46:21 -0500 Subject: [PATCH 3/5] Continue reading the response stream if there is buffered decompressed data (cherry picked from commit 4714836a667eb4837d005eb89d34fae60b9dc6cc) --- changelog/3009.bugfix | 3 ++ src/urllib3/response.py | 2 +- test/with_dummyserver/test_socketlevel.py | 49 +++++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 changelog/3009.bugfix diff --git a/changelog/3009.bugfix b/changelog/3009.bugfix new file mode 100644 index 00000000..61f54a49 --- /dev/null +++ b/changelog/3009.bugfix @@ -0,0 +1,3 @@ +Fixed ``HTTPResponse.stream()`` to continue yielding bytes if buffered decompressed data +was still available to be read even if the underlying socket is closed. This prevents +a compressed response from being truncated. diff --git a/src/urllib3/response.py b/src/urllib3/response.py index 9793de48..4b763cf8 100644 --- a/src/urllib3/response.py +++ b/src/urllib3/response.py @@ -717,7 +717,7 @@ class HTTPResponse(io.IOBase): for line in self.read_chunked(amt, decode_content=decode_content): yield line else: - while not is_fp_closed(self._fp): + while not is_fp_closed(self._fp) or len(self._decoded_buffer) > 0: data = self.read(amt=amt, decode_content=decode_content) if data: diff --git a/test/with_dummyserver/test_socketlevel.py b/test/with_dummyserver/test_socketlevel.py index 3309b8aa..ff4bc9b9 100644 --- a/test/with_dummyserver/test_socketlevel.py +++ b/test/with_dummyserver/test_socketlevel.py @@ -42,6 +42,7 @@ import socket import ssl import sys import tempfile +import zlib from collections import OrderedDict from test import ( LONG_TIMEOUT, @@ -1837,6 +1838,54 @@ class TestStream(SocketDummyServerTestCase): done_event.set() + def test_large_compressed_stream(self): + done_event = Event() + expected_total_length = 296085 + + def socket_handler(listener: socket.socket): + compress = zlib.compressobj(6, zlib.DEFLATED, 16 + zlib.MAX_WBITS) + data = compress.compress(b"x" * expected_total_length) + data += compress.flush() + + sock = listener.accept()[0] + + buf = b"" + while not buf.endswith(b"\r\n\r\n"): + buf += sock.recv(65536) + + sock.sendall( + b"HTTP/1.1 200 OK\r\n" + b"Content-Length: %d\r\n" + b"Content-Encoding: gzip\r\n" + b"\r\n" % (len(data),) + data + ) + + done_event.wait(5) + sock.close() + + self._start_server(socket_handler) + + with HTTPConnectionPool(self.host, self.port, retries=False) as pool: + r = pool.request("GET", "/", timeout=LONG_TIMEOUT, preload_content=False) + + # Chunks must all be equal or less than 10240 + # and only the last chunk is allowed to be smaller + # than 10240. 
+ total_length = 0 + chunks_smaller_than_10240 = 0 + for chunk in r.stream(10240, decode_content=True): + assert 0 < len(chunk) <= 10240 + if len(chunk) < 10240: + chunks_smaller_than_10240 += 1 + else: + assert chunks_smaller_than_10240 == 0 + total_length += len(chunk) + + assert chunks_smaller_than_10240 == 1 + assert expected_total_length == total_length + + done_event.set() + class TestBadContentLength(SocketDummyServerTestCase): def test_enforce_content_length_get(self): -- 2.52.0 From e82a62787e04da58d08254a54a4da8b65d92e8b8 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Sun, 30 Apr 2023 00:29:17 +0400 Subject: [PATCH 4/5] Fix HTTPResponse.read(0) when underlying buffer is empty (#2998) (cherry picked from commit 02ae65a45654bb3ced12b9ad22278c11e214aaf8) --- changelog/2998.bugfix.rst | 1 + src/urllib3/response.py | 2 ++ test/test_response.py | 3 +++ 3 files changed, 6 insertions(+) create mode 100644 changelog/2998.bugfix.rst diff --git a/changelog/2998.bugfix.rst b/changelog/2998.bugfix.rst new file mode 100644 index 00000000..584f309a --- /dev/null +++ b/changelog/2998.bugfix.rst @@ -0,0 +1 @@ +Fixed ``HTTPResponse.read(0)`` call when underlying buffer is empty. diff --git a/src/urllib3/response.py b/src/urllib3/response.py index 4b763cf8..f6cf33a8 100644 --- a/src/urllib3/response.py +++ b/src/urllib3/response.py @@ -190,6 +190,8 @@ class BytesQueueBuffer: self._size += len(data) def get(self, n): + if n == 0: + return b"" if not self.buffer: raise RuntimeError("buffer is empty") elif n < 0: diff --git a/test/test_response.py b/test/test_response.py index 44a630af..e85984f1 100644 --- a/test/test_response.py +++ b/test/test_response.py @@ -38,6 +38,8 @@ class TestBytesQueueBuffer: with pytest.raises(RuntimeError, match="buffer is empty"): assert buffer.get(10) + assert buffer.get(0) == b"" + buffer.put(b"foo") with pytest.raises(ValueError, match="n should be > 0"): buffer.get(-1) @@ -175,6 +177,7 @@ class TestResponse(object): fp = BytesIO(b"foo") r = HTTPResponse(fp, preload_content=False) + assert r.read(0) == b"" assert r.read(1) == b"f" assert r.read(2) == b"oo" assert r.read() == b"" -- 2.52.0 From c59306e9d91508332ecb90b6e9485ee100bcb81a Mon Sep 17 00:00:00 2001 From: Illia Volochii Date: Fri, 5 Dec 2025 16:40:41 +0200 Subject: [PATCH 5/5] Security fix for CVE-2025-66471 (cherry picked from commit c19571de34c47de3a766541b041637ba5f716ed7) --- CHANGES.rst | 8 ++ docs/advanced-usage.rst | 3 +- docs/user-guide.rst | 4 +- src/urllib3/response.py | 208 ++++++++++++++++++++++++++++++++++------ test/test_response.py | 176 ++++++++++++++++++++++++++++++++++ 5 files changed, 365 insertions(+), 34 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c45b3d16..943c7679 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +Backports +--------- + +- Fixed a security issue where streaming API could improperly handle highly + compressed HTTP content ("decompression bombs") leading to excessive resource + consumption even when a small amount of data was requested. Reading small + chunks of compressed data is safer and much more efficient now. + 1.26.19 (2024-06-17) -------------------- diff --git a/docs/advanced-usage.rst b/docs/advanced-usage.rst index d38e7190..d2b4c400 100644 --- a/docs/advanced-usage.rst +++ b/docs/advanced-usage.rst @@ -57,7 +57,8 @@ When using ``preload_content=True`` (the default setting) the response body will be read immediately into memory and the HTTP connection will be released back into the pool without manual intervention. 
-However, when dealing with large responses it's often better to stream the response +However, when dealing with responses of large or unknown length, +it's often better to stream the response content using ``preload_content=False``. Setting ``preload_content`` to ``False`` means that urllib3 will only read from the socket when data is requested. diff --git a/docs/user-guide.rst b/docs/user-guide.rst index abd53233..bb16b9a8 100644 --- a/docs/user-guide.rst +++ b/docs/user-guide.rst @@ -99,8 +99,8 @@ to a byte string representing the response content: >>> r.data b'\xaa\xa5H?\x95\xe9\x9b\x11' -.. note:: For larger responses, it's sometimes better to :ref:`stream ` - the response. +.. note:: For responses of large or unknown length, it's sometimes better to + :ref:`stream ` the response. Using io Wrappers with Response Content ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/urllib3/response.py b/src/urllib3/response.py index f6cf33a8..d0665533 100644 --- a/src/urllib3/response.py +++ b/src/urllib3/response.py @@ -24,6 +24,7 @@ from .connection import BaseSSLError, HTTPException from .exceptions import ( BodyNotHttplibCompatible, DecodeError, + DependencyWarning, HTTPError, IncompleteRead, InvalidChunkLength, @@ -42,33 +43,60 @@ log = logging.getLogger(__name__) class DeflateDecoder(object): def __init__(self): self._first_try = True - self._data = b"" + self._first_try_data = b"" + self._unfed_data = b"" self._obj = zlib.decompressobj() def __getattr__(self, name): return getattr(self._obj, name) - def decompress(self, data): - if not data: + def decompress(self, data, max_length=-1): + data = self._unfed_data + data + self._unfed_data = b"" + if not data and not self._obj.unconsumed_tail: return data + original_max_length = max_length + if original_max_length < 0: + max_length = 0 + elif original_max_length == 0: + # We should not pass 0 to the zlib decompressor because 0 is + # the default value that will make zlib decompress without a + # length limit. + # Data should be stored for subsequent calls. + self._unfed_data = data + return b"" + # Subsequent calls always reuse `self._obj`. zlib requires + # passing the unconsumed tail if decompression is to continue. if not self._first_try: - return self._obj.decompress(data) + return self._obj.decompress( + self._obj.unconsumed_tail + data, max_length=max_length + ) - self._data += data + # First call tries with RFC 1950 ZLIB format. + self._first_try_data += data try: - decompressed = self._obj.decompress(data) + decompressed = self._obj.decompress(data, max_length=max_length) if decompressed: self._first_try = False - self._data = None + self._first_try_data = b"" return decompressed + # On failure, it falls back to RFC 1951 DEFLATE format. 
except zlib.error: self._first_try = False self._obj = zlib.decompressobj(-zlib.MAX_WBITS) try: - return self.decompress(self._data) + return self.decompress( + self._first_try_data, max_length=original_max_length + ) finally: - self._data = None + self._first_try_data = b"" + + @property + def has_unconsumed_tail(self): + return bool(self._unfed_data) or ( + bool(self._obj.unconsumed_tail) and not self._first_try + ) class GzipDecoderState(object): @@ -82,30 +110,64 @@ class GzipDecoder(object): def __init__(self): self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS) self._state = GzipDecoderState.FIRST_MEMBER + self._unconsumed_tail = b"" def __getattr__(self, name): return getattr(self._obj, name) - def decompress(self, data): + def decompress(self, data, max_length=-1): ret = bytearray() - if self._state == GzipDecoderState.SWALLOW_DATA or not data: + if self._state == GzipDecoderState.SWALLOW_DATA: return bytes(ret) + + if max_length == 0: + # We should not pass 0 to the zlib decompressor because 0 is + # the default value that will make zlib decompress without a + # length limit. + # Data should be stored for subsequent calls. + self._unconsumed_tail += data + return b"" + + # zlib requires passing the unconsumed tail to the subsequent + # call if decompression is to continue. + data = self._unconsumed_tail + data + if not data and self._obj.eof: + return bytes(ret) + while True: try: - ret += self._obj.decompress(data) + ret += self._obj.decompress( + data, max_length=max(max_length - len(ret), 0) + ) except zlib.error: previous_state = self._state # Ignore data after the first error self._state = GzipDecoderState.SWALLOW_DATA + self._unconsumed_tail = b"" if previous_state == GzipDecoderState.OTHER_MEMBERS: # Allow trailing garbage acceptable in other gzip clients return bytes(ret) raise - data = self._obj.unused_data + + self._unconsumed_tail = data = ( + self._obj.unconsumed_tail or self._obj.unused_data + ) + if max_length > 0 and len(ret) >= max_length: + break + if not data: return bytes(ret) - self._state = GzipDecoderState.OTHER_MEMBERS - self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS) + # When the end of a gzip member is reached, a new decompressor + # must be created for unused (possibly future) data. + if self._obj.eof: + self._state = GzipDecoderState.OTHER_MEMBERS + self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS) + + return bytes(ret) + + @property + def has_unconsumed_tail(self): + return bool(self._unconsumed_tail) if brotli is not None: @@ -117,9 +179,35 @@ if brotli is not None: def __init__(self): self._obj = brotli.Decompressor() if hasattr(self._obj, "decompress"): - self.decompress = self._obj.decompress + self._decompress = self._obj.decompress else: - self.decompress = self._obj.process + self._decompress = self._obj.process + + # Requires Brotli >= 1.2.0 for `output_buffer_limit`. + def _decompress(self, data, output_buffer_limit=-1): + raise NotImplementedError() + + def decompress(self, data, max_length=-1): + try: + if max_length > 0: + return self._decompress(data, output_buffer_limit=max_length) + else: + return self._decompress(data) + except TypeError: + # Fallback for Brotli/brotlicffi/brotlipy versions without + # the `output_buffer_limit` parameter. 
+ warnings.warn( + "Brotli >= 1.2.0 is required to prevent decompression bombs.", + DependencyWarning, + ) + return self._decompress(data) + + @property + def has_unconsumed_tail(self): + try: + return not self._obj.can_accept_more_data() + except AttributeError: + return False def flush(self): if hasattr(self._obj, "flush"): @@ -142,10 +230,35 @@ class MultiDecoder(object): def flush(self): return self._decoders[0].flush() - def decompress(self, data): - for d in reversed(self._decoders): - data = d.decompress(data) - return data + def decompress(self, data, max_length=-1): + if max_length <= 0: + for d in reversed(self._decoders): + data = d.decompress(data) + return data + + ret = bytearray() + # Every while loop iteration goes through all decoders once. + # It exits when enough data is read or no more data can be read. + # It is possible that the while loop iteration does not produce + # any data because we retrieve up to `max_length` from every + # decoder, and the amount of bytes may be insufficient for the + # next decoder to produce enough/any output. + while True: + any_data = False + for d in reversed(self._decoders): + data = d.decompress(data, max_length=max_length - len(ret)) + if data: + any_data = True + # We should not break when no data is returned because + # next decoders may produce data even with empty input. + ret += data + if not any_data or len(ret) >= max_length: + return bytes(ret) + data = b"" + + @property + def has_unconsumed_tail(self): + return any(d.has_unconsumed_tail for d in self._decoders) def _get_decoder(mode): @@ -173,9 +286,6 @@ class BytesQueueBuffer: * self.buffer, which contains the full data * the largest chunk that we will copy in get() - - The worst case scenario is a single chunk, in which case we'll make a full copy of - the data inside get(). """ def __init__(self): @@ -197,6 +307,10 @@ class BytesQueueBuffer: elif n < 0: raise ValueError("n should be > 0") + if len(self.buffer[0]) == n and isinstance(self.buffer[0], bytes): + self._size -= n + return self.buffer.popleft() + fetched = 0 ret = io.BytesIO() while fetched < n: @@ -458,16 +572,19 @@ class HTTPResponse(io.IOBase): if brotli is not None: DECODER_ERROR_CLASSES += (brotli.error,) - def _decode(self, data, decode_content, flush_decoder): + def _decode(self, data, decode_content, flush_decoder, max_length=None): """ Decode the data passed in and potentially flush the decoder. 
""" if not decode_content: return data + if max_length is None or flush_decoder: + max_length = -1 + try: if self._decoder: - data = self._decoder.decompress(data) + data = self._decoder.decompress(data, max_length=max_length) except self.DECODER_ERROR_CLASSES as e: content_encoding = self.headers.get("content-encoding", "").lower() raise DecodeError( @@ -662,6 +779,14 @@ class HTTPResponse(io.IOBase): if amt is not None: cache_content = False + if self._decoder and self._decoder.has_unconsumed_tail: + decoded_data = self._decode( + b"", + decode_content, + flush_decoder=False, + max_length=amt - len(self._decoded_buffer), + ) + self._decoded_buffer.put(decoded_data) if len(self._decoded_buffer) >= amt: return self._decoded_buffer.get(amt) @@ -673,7 +798,11 @@ class HTTPResponse(io.IOBase): elif amt != 0 and not data: flush_decoder = True - if not data and len(self._decoded_buffer) == 0: + if ( + not data + and len(self._decoded_buffer) == 0 + and not (self._decoder and self._decoder.has_unconsumed_tail) + ): return data if amt is None: @@ -685,7 +814,12 @@ class HTTPResponse(io.IOBase): if not decode_content: return data - decoded_data = self._decode(data, decode_content, flush_decoder) + decoded_data = self._decode( + data, + decode_content, + flush_decoder, + max_length=amt - len(self._decoded_buffer), + ) self._decoded_buffer.put(decoded_data) while len(self._decoded_buffer) < amt and data: @@ -693,7 +827,12 @@ class HTTPResponse(io.IOBase): # For example, the GZ file header takes 10 bytes, we don't want to read # it one byte at a time data = self._raw_read(amt) - decoded_data = self._decode(data, decode_content, flush_decoder) + decoded_data = self._decode( + data, + decode_content, + flush_decoder, + max_length=amt - len(self._decoded_buffer), + ) self._decoded_buffer.put(decoded_data) data = self._decoded_buffer.get(amt) @@ -719,7 +858,11 @@ class HTTPResponse(io.IOBase): for line in self.read_chunked(amt, decode_content=decode_content): yield line else: - while not is_fp_closed(self._fp) or len(self._decoded_buffer) > 0: + while ( + not is_fp_closed(self._fp) + or len(self._decoded_buffer) > 0 + or (self._decoder and self._decoder.has_unconsumed_tail) + ): data = self.read(amt=amt, decode_content=decode_content) if data: @@ -925,7 +1068,10 @@ class HTTPResponse(io.IOBase): break chunk = self._handle_chunk(amt) decoded = self._decode( - chunk, decode_content=decode_content, flush_decoder=False + chunk, + decode_content=decode_content, + flush_decoder=False, + max_length=amt, ) if decoded: yield decoded diff --git a/test/test_response.py b/test/test_response.py index e85984f1..f949b2b1 100644 --- a/test/test_response.py +++ b/test/test_response.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import contextlib +import gzip import re import socket import ssl @@ -31,6 +32,22 @@ from urllib3.util.response import is_fp_closed from urllib3.util.retry import RequestHistory, Retry +def deflate2_compress(data): + compressor = zlib.compressobj(6, zlib.DEFLATED, -zlib.MAX_WBITS) + return compressor.compress(data) + compressor.flush() + + +if brotli: + try: + brotli.Decompressor().process(b"", output_buffer_limit=1024) + _brotli_gte_1_2_0_available = True + except (AttributeError, TypeError): + _brotli_gte_1_2_0_available = False +else: + _brotli_gte_1_2_0_available = False +_zstd_available = False + + class TestBytesQueueBuffer: def test_single_chunk(self): buffer = BytesQueueBuffer() @@ -318,6 +335,165 @@ class TestResponse(object): with pytest.raises(DecodeError): HTTPResponse(fp, 
headers={"content-encoding": "br"}) + _test_compressor_params = [ + ("deflate1", ("deflate", zlib.compress)), + ("deflate2", ("deflate", deflate2_compress)), + ("gzip", ("gzip", gzip.compress)), + ] + if _brotli_gte_1_2_0_available: + _test_compressor_params.append(("brotli", ("br", brotli.compress))) + else: + _test_compressor_params.append(("brotli", None)) + if _zstd_available: + _test_compressor_params.append(("zstd", ("zstd", zstd_compress))) + else: + _test_compressor_params.append(("zstd", None)) + + @pytest.mark.parametrize("read_method", ("read",)) + @pytest.mark.parametrize( + "data", + [d[1] for d in _test_compressor_params], + ids=[d[0] for d in _test_compressor_params], + ) + def test_read_with_all_data_already_in_decompressor( + self, + request, + read_method, + data, + ): + if data is None: + pytest.skip(f"Proper {request.node.callspec.id} decoder is not available") + original_data = b"bar" * 1000 + name, compress_func = data + compressed_data = compress_func(original_data) + fp = mock.Mock(read=mock.Mock(return_value=b"")) + r = HTTPResponse(fp, headers={"content-encoding": name}, preload_content=False) + # Put all data in the decompressor's buffer. + r._init_decoder() + assert r._decoder is not None # for mypy + decoded = r._decoder.decompress(compressed_data, max_length=0) + if name == "br": + # It's known that some Brotli libraries do not respect + # `max_length`. + r._decoded_buffer.put(decoded) + else: + assert decoded == b"" + # Read the data via `HTTPResponse`. + read = getattr(r, read_method) + assert read(0) == b"" + assert read(2500) == original_data[:2500] + assert read(500) == original_data[2500:] + assert read(0) == b"" + assert read() == b"" + + @pytest.mark.parametrize( + "delta", + ( + 0, # First read from socket returns all compressed data. + -1, # First read from socket returns all but one byte of compressed data. + ), + ) + @pytest.mark.parametrize("read_method", ("read",)) + @pytest.mark.parametrize( + "data", + [d[1] for d in _test_compressor_params], + ids=[d[0] for d in _test_compressor_params], + ) + def test_decode_with_max_length_close_to_compressed_data_size( + self, + request, + delta, + read_method, + data, + ): + """ + Test decoding when the first read from the socket returns all or + almost all the compressed data, but then it has to be + decompressed in a couple of read calls. + """ + if data is None: + pytest.skip(f"Proper {request.node.callspec.id} decoder is not available") + + original_data = b"foo" * 1000 + name, compress_func = data + compressed_data = compress_func(original_data) + fp = BytesIO(compressed_data) + r = HTTPResponse(fp, headers={"content-encoding": name}, preload_content=False) + initial_limit = len(compressed_data) + delta + read = getattr(r, read_method) + initial_chunk = read(amt=initial_limit, decode_content=True) + assert len(initial_chunk) == initial_limit + assert ( + len(read(amt=len(original_data), decode_content=True)) + == len(original_data) - initial_limit + ) + + # Prepare 50 MB of compressed data outside of the test measuring + # memory usage. 
+ _test_memory_usage_decode_with_max_length_params = [ + ( + params[0], + (params[1][0], params[1][1](b"A" * (50 * 2**20))) if params[1] else None, + ) + for params in _test_compressor_params + ] + + @pytest.mark.parametrize( + "data", + [d[1] for d in _test_memory_usage_decode_with_max_length_params], + ids=[d[0] for d in _test_memory_usage_decode_with_max_length_params], + ) + @pytest.mark.parametrize("read_method", ("read", "read_chunked", "stream")) + # Decoders consume different amounts of memory during decompression. + # We set the 10 MB limit to ensure that the whole decompressed data + # is not stored unnecessarily. + # + # FYI, the following consumption was observed for the test with + # `read` on CPython 3.14.0: + # - deflate: 2.3 MiB + # - deflate2: 2.1 MiB + # - gzip: 2.1 MiB + # - brotli: + # - brotli v1.2.0: 9 MiB + # - brotlicffi v1.2.0.0: 6 MiB + # - brotlipy v0.7.0: 105.8 MiB + # - zstd: 4.5 MiB + @pytest.mark.limit_memory("10 MB", current_thread_only=True) + def test_memory_usage_decode_with_max_length( + self, + request, + read_method, + data, + ): + if data is None: + pytest.skip(f"Proper {request.node.callspec.id} decoder is not available") + + name, compressed_data = data + limit = 1024 * 1024 # 1 MiB + if read_method in ("read_chunked", "stream"): + httplib_r = httplib.HTTPResponse(MockSock) # type: ignore[arg-type] + httplib_r.fp = MockChunkedEncodingResponse([compressed_data]) # type: ignore[assignment] + r = HTTPResponse( + httplib_r, + preload_content=False, + headers={"transfer-encoding": "chunked", "content-encoding": name}, + ) + next(getattr(r, read_method)(amt=limit, decode_content=True)) + else: + fp = BytesIO(compressed_data) + r = HTTPResponse( + fp, headers={"content-encoding": name}, preload_content=False + ) + getattr(r, read_method)(amt=limit, decode_content=True) + + # Check that the internal decoded buffer is empty unless brotli + # is used. + # Google's brotli library does not fully respect the output + # buffer limit: https://github.com/google/brotli/issues/1396 + # And unmaintained brotlipy cannot limit the output buffer size. + if name != "br" or brotli.__name__ == "brotlicffi": + assert len(r._decoded_buffer) == 0 + def test_multi_decoding_deflate_deflate(self): data = zlib.compress(zlib.compress(b"foo")) -- 2.52.0
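
For readers applying this series, a minimal usage sketch of the behavior the backports standardize (this is illustrative only; it is not part of the patches above, and the URL and the 64 KiB chunk size are assumptions): with the series applied, ``HTTPResponse.read(amt)`` on a compressed body returns at most ``amt`` decoded bytes, returns an empty bytes object only at EOF, and caps how much output the decompressor may produce per call.

    import urllib3

    http = urllib3.PoolManager()
    resp = http.request(
        "GET",
        "https://example.com/large-resource",  # hypothetical endpoint
        preload_content=False,                 # read from the socket on demand
    )

    total_decoded = 0
    try:
        while True:
            # Each call returns at most 64 KiB of *decoded* data, even if the
            # body is gzip/deflate/brotli compressed; b"" reliably means EOF.
            chunk = resp.read(64 * 1024, decode_content=True)
            if not chunk:
                break
            total_decoded += len(chunk)
    finally:
        resp.release_conn()

    print("decoded bytes:", total_decoded)

``resp.stream(64 * 1024, decode_content=True)`` provides the same bounded-memory guarantee as an iterator, as exercised by ``test_large_compressed_stream`` in patch 3.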