Skip to content

Commit 85c5898

Browse files
giannitedesco, tomchristie, and cdeler
authored
Change LineDecoder to match stdlib splitlines, resulting in significant speed up (#2423)
* Replace quadratic algo in LineDecoder, leading to enormous speedups when doing things such as Response(...).iter_lines(), as described in issue #2422
* Update httpx/_decoders.py
* Update _decoders.py
  Handle text ending in `\r` more gracefully. Return as much content as possible.
* Update test_decoders.py
* Update _decoders.py
* Update _decoders.py
* Update _decoders.py
* Update httpx/_decoders.py
  Co-authored-by: cdeler <[email protected]>
* Update _decoders.py
---------
Co-authored-by: Tom Christie <[email protected]>
Co-authored-by: cdeler <[email protected]>
1 parent e486fbc commit 85c5898

File tree

3 files changed

+57
-67
lines changed

3 files changed

+57
-67
lines changed

httpx/_decoders.py

+41-51
Original file line numberDiff line numberDiff line change
@@ -259,66 +259,56 @@ class LineDecoder:
259259
"""
260260
Handles incrementally reading lines from text.
261261
262-
Uses universal line decoding, supporting any of `\n`, `\r`, or `\r\n`
263-
as line endings, normalizing to `\n`.
262+
Has the same behaviour as the stdlib splitlines, but handles the input iteratively.
264263
"""
265264

266265
def __init__(self) -> None:
267-
self.buffer = ""
266+
self.buffer: typing.List[str] = []
267+
self.trailing_cr: bool = False
268268

269269
def decode(self, text: str) -> typing.List[str]:
270-
lines = []
271-
272-
if text and self.buffer and self.buffer[-1] == "\r":
273-
if text.startswith("\n"):
274-
# Handle the case where we have an "\r\n" split across
275-
# our previous input, and our new chunk.
276-
lines.append(self.buffer[:-1] + "\n")
277-
self.buffer = ""
278-
text = text[1:]
279-
else:
280-
# Handle the case where we have "\r" at the end of our
281-
# previous input.
282-
lines.append(self.buffer[:-1] + "\n")
283-
self.buffer = ""
284-
285-
while text:
286-
num_chars = len(text)
287-
for idx in range(num_chars):
288-
char = text[idx]
289-
next_char = None if idx + 1 == num_chars else text[idx + 1]
290-
if char == "\n":
291-
lines.append(self.buffer + text[: idx + 1])
292-
self.buffer = ""
293-
text = text[idx + 1 :]
294-
break
295-
elif char == "\r" and next_char == "\n":
296-
lines.append(self.buffer + text[:idx] + "\n")
297-
self.buffer = ""
298-
text = text[idx + 2 :]
299-
break
300-
elif char == "\r" and next_char is not None:
301-
lines.append(self.buffer + text[:idx] + "\n")
302-
self.buffer = ""
303-
text = text[idx + 1 :]
304-
break
305-
elif next_char is None:
306-
self.buffer += text
307-
text = ""
308-
break
270+
# See https://docs.python.org/3/library/stdtypes.html#str.splitlines
271+
NEWLINE_CHARS = "\n\r\x0b\x0c\x1c\x1d\x1e\x85\u2028\u2029"
272+
273+
# We always push a trailing `\r` into the next decode iteration.
274+
if self.trailing_cr:
275+
text = "\r" + text
276+
self.trailing_cr = False
277+
if text.endswith("\r"):
278+
self.trailing_cr = True
279+
text = text[:-1]
280+
281+
if not text:
282+
return []
283+
284+
trailing_newline = text[-1] in NEWLINE_CHARS
285+
lines = text.splitlines()
286+
287+
if len(lines) == 1 and not trailing_newline:
288+
# No new lines, buffer the input and continue.
289+
self.buffer.append(lines[0])
290+
return []
291+
292+
if self.buffer:
293+
# Include any existing buffer in the first portion of the
294+
# splitlines result.
295+
lines = ["".join(self.buffer) + lines[0]] + lines[1:]
296+
self.buffer = []
297+
298+
if not trailing_newline:
299+
# If the last segment of splitlines is not newline terminated,
300+
# then drop it from our output and start a new buffer.
301+
self.buffer = [lines.pop()]
309302

310303
return lines
311304

312305
def flush(self) -> typing.List[str]:
313-
if self.buffer.endswith("\r"):
314-
# Handle the case where we had a trailing '\r', which could have
315-
# been a '\r\n' pair.
316-
lines = [self.buffer[:-1] + "\n"]
317-
elif self.buffer:
318-
lines = [self.buffer]
319-
else:
320-
lines = []
321-
self.buffer = ""
306+
if not self.buffer and not self.trailing_cr:
307+
return []
308+
309+
lines = ["".join(self.buffer)]
310+
self.buffer = []
311+
self.trailing_cr = False
322312
return lines
323313

324314

tests/models/test_responses.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,7 @@ def test_iter_lines():
639639
content=b"Hello,\nworld!",
640640
)
641641
content = [line for line in response.iter_lines()]
642-
assert content == ["Hello,\n", "world!"]
642+
assert content == ["Hello,", "world!"]
643643

644644

645645
@pytest.mark.anyio
@@ -652,7 +652,7 @@ async def test_aiter_lines():
652652
content = []
653653
async for line in response.aiter_lines():
654654
content.append(line)
655-
assert content == ["Hello,\n", "world!"]
655+
assert content == ["Hello,", "world!"]
656656

657657

658658
def test_sync_streaming_response():

tests/test_decoders.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -225,69 +225,69 @@ def test_text_decoder_empty_cases():
225225
def test_line_decoder_nl():
226226
decoder = LineDecoder()
227227
assert decoder.decode("") == []
228-
assert decoder.decode("a\n\nb\nc") == ["a\n", "\n", "b\n"]
228+
assert decoder.decode("a\n\nb\nc") == ["a", "", "b"]
229229
assert decoder.flush() == ["c"]
230230

231231
decoder = LineDecoder()
232232
assert decoder.decode("") == []
233-
assert decoder.decode("a\n\nb\nc\n") == ["a\n", "\n", "b\n", "c\n"]
233+
assert decoder.decode("a\n\nb\nc\n") == ["a", "", "b", "c"]
234234
assert decoder.flush() == []
235235

236236
# Issue #1033
237237
decoder = LineDecoder()
238238
assert decoder.decode("") == []
239-
assert decoder.decode("12345\n") == ["12345\n"]
239+
assert decoder.decode("12345\n") == ["12345"]
240240
assert decoder.decode("foo ") == []
241241
assert decoder.decode("bar ") == []
242-
assert decoder.decode("baz\n") == ["foo bar baz\n"]
242+
assert decoder.decode("baz\n") == ["foo bar baz"]
243243
assert decoder.flush() == []
244244

245245

246246
def test_line_decoder_cr():
247247
decoder = LineDecoder()
248248
assert decoder.decode("") == []
249-
assert decoder.decode("a\r\rb\rc") == ["a\n", "\n", "b\n"]
249+
assert decoder.decode("a\r\rb\rc") == ["a", "", "b"]
250250
assert decoder.flush() == ["c"]
251251

252252
decoder = LineDecoder()
253253
assert decoder.decode("") == []
254-
assert decoder.decode("a\r\rb\rc\r") == ["a\n", "\n", "b\n"]
255-
assert decoder.flush() == ["c\n"]
254+
assert decoder.decode("a\r\rb\rc\r") == ["a", "", "b"]
255+
assert decoder.flush() == ["c"]
256256

257257
# Issue #1033
258258
decoder = LineDecoder()
259259
assert decoder.decode("") == []
260260
assert decoder.decode("12345\r") == []
261-
assert decoder.decode("foo ") == ["12345\n"]
261+
assert decoder.decode("foo ") == ["12345"]
262262
assert decoder.decode("bar ") == []
263263
assert decoder.decode("baz\r") == []
264-
assert decoder.flush() == ["foo bar baz\n"]
264+
assert decoder.flush() == ["foo bar baz"]
265265

266266

267267
def test_line_decoder_crnl():
268268
decoder = LineDecoder()
269269
assert decoder.decode("") == []
270-
assert decoder.decode("a\r\n\r\nb\r\nc") == ["a\n", "\n", "b\n"]
270+
assert decoder.decode("a\r\n\r\nb\r\nc") == ["a", "", "b"]
271271
assert decoder.flush() == ["c"]
272272

273273
decoder = LineDecoder()
274274
assert decoder.decode("") == []
275-
assert decoder.decode("a\r\n\r\nb\r\nc\r\n") == ["a\n", "\n", "b\n", "c\n"]
275+
assert decoder.decode("a\r\n\r\nb\r\nc\r\n") == ["a", "", "b", "c"]
276276
assert decoder.flush() == []
277277

278278
decoder = LineDecoder()
279279
assert decoder.decode("") == []
280280
assert decoder.decode("a\r") == []
281-
assert decoder.decode("\n\r\nb\r\nc") == ["a\n", "\n", "b\n"]
281+
assert decoder.decode("\n\r\nb\r\nc") == ["a", "", "b"]
282282
assert decoder.flush() == ["c"]
283283

284284
# Issue #1033
285285
decoder = LineDecoder()
286286
assert decoder.decode("") == []
287-
assert decoder.decode("12345\r\n") == ["12345\n"]
287+
assert decoder.decode("12345\r\n") == ["12345"]
288288
assert decoder.decode("foo ") == []
289289
assert decoder.decode("bar ") == []
290-
assert decoder.decode("baz\r\n") == ["foo bar baz\n"]
290+
assert decoder.decode("baz\r\n") == ["foo bar baz"]
291291
assert decoder.flush() == []
292292

293293

0 commit comments

Comments
 (0)