Skip to content

Commit acb5e6a

Browse files
authored
Add charset_normalizer detection. (#1791)
* Add charset_normalizer detection * Tweak JSON tests for slightly different charset decoding behaviour * Add charset-normalizer to docs
1 parent 7724661 commit acb5e6a

File tree

7 files changed

+82
-91
lines changed

7 files changed

+82
-91
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ The HTTPX project relies on these excellent libraries:
119119
* `h11` - HTTP/1.1 support.
120120
* `h2` - HTTP/2 support. *(Optional)*
121121
* `certifi` - SSL certificates.
122+
* `charset_normalizer` - Charset auto-detection.
122123
* `rfc3986` - URL parsing & normalization.
123124
* `idna` - Internationalized domain name support.
124125
* `sniffio` - Async library autodetection.

docs/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ The HTTPX project relies on these excellent libraries:
111111
* `h11` - HTTP/1.1 support.
112112
* `h2` - HTTP/2 support. *(Optional)*
113113
* `certifi` - SSL certificates.
114+
* `charset_normalizer` - Charset auto-detection.
114115
* `rfc3986` - URL parsing & normalization.
115116
* `idna` - Internationalized domain name support.
116117
* `sniffio` - Async library autodetection.

httpx/_decoders.py

Lines changed: 2 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -241,52 +241,13 @@ class TextDecoder:
241241
Handles incrementally decoding bytes into text
242242
"""
243243

244-
def __init__(self, encoding: typing.Optional[str] = None):
245-
self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
246-
if encoding is not None:
247-
self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
244+
def __init__(self, encoding: str = "utf-8"):
245+
self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
248246

249247
def decode(self, data: bytes) -> str:
250-
"""
251-
If an encoding is explicitly specified, then we use that.
252-
Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.
253-
254-
Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
255-
superset of the non-control characters in iso-8859-1, so we essentially
256-
end up supporting any of ascii, utf-8, iso-8859-1, cp1252.
257-
258-
Given that UTF-8 is now by *far* the most widely used encoding, this
259-
should be a pretty robust strategy for cases where a charset has
260-
not been explicitly included.
261-
262-
Useful stats on the prevalence of different charsets in the wild...
263-
264-
* https://w3techs.com/technologies/overview/character_encoding
265-
* https://w3techs.com/technologies/history_overview/character_encoding
266-
267-
The HTML5 spec also has some useful guidelines, suggesting defaults of
268-
either UTF-8 or Windows 1252 in most cases...
269-
270-
* https://dev.w3.org/html5/spec-LC/Overview.html
271-
"""
272-
if self.decoder is None:
273-
# If this is the first decode pass then we need to determine which
274-
# encoding to use by attempting UTF-8 and raising any decode errors.
275-
attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
276-
try:
277-
attempt_utf_8.decode(data)
278-
except UnicodeDecodeError:
279-
# Could not decode as UTF-8. Use Windows 1252.
280-
self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
281-
else:
282-
# Can decode as UTF-8. Use UTF-8 with lenient error settings.
283-
self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
284-
285248
return self.decoder.decode(data)
286249

287250
def flush(self) -> str:
288-
if self.decoder is None:
289-
return ""
290251
return self.decoder.decode(b"", True)
291252

292253

httpx/_models.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from http.cookiejar import Cookie, CookieJar
99
from urllib.parse import parse_qs, quote, unquote, urlencode
1010

11+
import charset_normalizer
1112
import idna
1213
import rfc3986
1314
import rfc3986.exceptions
@@ -1314,22 +1315,26 @@ def text(self) -> str:
13141315
if not content:
13151316
self._text = ""
13161317
else:
1317-
decoder = TextDecoder(encoding=self.encoding)
1318+
decoder = TextDecoder(encoding=self.encoding or "utf-8")
13181319
self._text = "".join([decoder.decode(self.content), decoder.flush()])
13191320
return self._text
13201321

13211322
@property
13221323
def encoding(self) -> typing.Optional[str]:
13231324
"""
1324-
Return the encoding, which may have been set explicitly, or may have
1325-
been specified by the Content-Type header.
1325+
Return an encoding to use for decoding the byte content into text.
1326+
The priority for determining this is given by...
1327+
1328+
* `.encoding = <>` has been set explicitly.
1329+
* The encoding as specified by the charset parameter in the Content-Type header.
1330+
* The encoding as determined by `charset_normalizer`.
1331+
* UTF-8.
13261332
"""
13271333
if not hasattr(self, "_encoding"):
13281334
encoding = self.charset_encoding
13291335
if encoding is None or not is_known_encoding(encoding):
1330-
self._encoding = None
1331-
else:
1332-
self._encoding = encoding
1336+
encoding = self.apparent_encoding
1337+
self._encoding = encoding
13331338
return self._encoding
13341339

13351340
@encoding.setter
@@ -1351,6 +1356,19 @@ def charset_encoding(self) -> typing.Optional[str]:
13511356

13521357
return params["charset"].strip("'\"")
13531358

1359+
@property
1360+
def apparent_encoding(self) -> typing.Optional[str]:
1361+
"""
1362+
Return the encoding, as determined by `charset_normalizer`.
1363+
"""
1364+
content = getattr(self, "_content", b"")
1365+
if len(content) < 32:
1366+
# charset_normalizer will issue warnings if we run it with
1367+
# fewer bytes than this cutoff.
1368+
return None
1369+
match = charset_normalizer.from_bytes(self.content).best()
1370+
return None if match is None else match.encoding
1371+
13541372
def _get_content_decoder(self) -> ContentDecoder:
13551373
"""
13561374
Returns a decoder instance which can be used to decode the raw byte
@@ -1411,10 +1429,7 @@ def json(self, **kwargs: typing.Any) -> typing.Any:
14111429
if self.charset_encoding is None and self.content and len(self.content) > 3:
14121430
encoding = guess_json_utf(self.content)
14131431
if encoding is not None:
1414-
try:
1415-
return jsonlib.loads(self.content.decode(encoding), **kwargs)
1416-
except UnicodeDecodeError:
1417-
pass
1432+
return jsonlib.loads(self.content.decode(encoding), **kwargs)
14181433
return jsonlib.loads(self.text, **kwargs)
14191434

14201435
@property
@@ -1495,7 +1510,7 @@ def iter_text(self, chunk_size: int = None) -> typing.Iterator[str]:
14951510
that handles both gzip, deflate, etc but also detects the content's
14961511
string encoding.
14971512
"""
1498-
decoder = TextDecoder(encoding=self.encoding)
1513+
decoder = TextDecoder(encoding=self.encoding or "utf-8")
14991514
chunker = TextChunker(chunk_size=chunk_size)
15001515
with request_context(request=self._request):
15011516
for byte_content in self.iter_bytes():
@@ -1593,7 +1608,7 @@ async def aiter_text(self, chunk_size: int = None) -> typing.AsyncIterator[str]:
15931608
that handles both gzip, deflate, etc but also detects the content's
15941609
string encoding.
15951610
"""
1596-
decoder = TextDecoder(encoding=self.encoding)
1611+
decoder = TextDecoder(encoding=self.encoding or "utf-8")
15971612
chunker = TextChunker(chunk_size=chunk_size)
15981613
with request_context(request=self._request):
15991614
async for byte_content in self.aiter_bytes():

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def get_packages(package):
5757
zip_safe=False,
5858
install_requires=[
5959
"certifi",
60+
"charset_normalizer",
6061
"sniffio",
6162
"rfc3986[idna2008]>=1.3,<2",
6263
"httpcore>=0.13.3,<0.14.0",

tests/models/test_responses.py

Lines changed: 45 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import json
22
import pickle
3-
from unittest import mock
43

54
import brotlicffi
65
import pytest
@@ -197,31 +196,33 @@ def test_response_no_charset_with_iso_8859_1_content():
197196
A response with ISO 8859-1 encoded content should decode correctly,
198197
even with no charset specified.
199198
"""
200-
content = "Accented: Österreich".encode("iso-8859-1")
199+
content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1")
201200
headers = {"Content-Type": "text/plain"}
202201
response = httpx.Response(
203202
200,
204203
content=content,
205204
headers=headers,
206205
)
207-
assert response.text == "Accented: Österreich"
208-
assert response.encoding is None
206+
assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz"
207+
assert response.charset_encoding is None
208+
assert response.apparent_encoding is not None
209209

210210

211211
def test_response_no_charset_with_cp_1252_content():
212212
"""
213213
A response with Windows 1252 encoded content should decode correctly,
214214
even with no charset specified.
215215
"""
216-
content = "Euro Currency: €".encode("cp1252")
216+
content = "Euro Currency: € abcdefghijklmnopqrstuzwxyz".encode("cp1252")
217217
headers = {"Content-Type": "text/plain"}
218218
response = httpx.Response(
219219
200,
220220
content=content,
221221
headers=headers,
222222
)
223-
assert response.text == "Euro Currency: €"
224-
assert response.encoding is None
223+
assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz"
224+
assert response.charset_encoding is None
225+
assert response.apparent_encoding is not None
225226

226227

227228
def test_response_non_text_encoding():
@@ -718,9 +719,22 @@ def test_json_with_options():
718719
assert response.json(parse_int=str)["amount"] == "1"
719720

720721

721-
def test_json_without_specified_encoding():
722+
@pytest.mark.parametrize(
723+
"encoding",
724+
[
725+
"utf-8",
726+
"utf-8-sig",
727+
"utf-16",
728+
"utf-16-be",
729+
"utf-16-le",
730+
"utf-32",
731+
"utf-32-be",
732+
"utf-32-le",
733+
],
734+
)
735+
def test_json_without_specified_charset(encoding):
722736
data = {"greeting": "hello", "recipient": "world"}
723-
content = json.dumps(data).encode("utf-32-be")
737+
content = json.dumps(data).encode(encoding)
724738
headers = {"Content-Type": "application/json"}
725739
response = httpx.Response(
726740
200,
@@ -730,30 +744,29 @@ def test_json_without_specified_encoding():
730744
assert response.json() == data
731745

732746

733-
def test_json_without_specified_encoding_decode_error():
734-
data = {"greeting": "hello", "recipient": "world"}
735-
content = json.dumps(data).encode("utf-32-be")
736-
headers = {"Content-Type": "application/json"}
737-
# force incorrect guess from `guess_json_utf` to trigger error
738-
with mock.patch("httpx._models.guess_json_utf", return_value="utf-32-le"):
739-
response = httpx.Response(
740-
200,
741-
content=content,
742-
headers=headers,
743-
)
744-
with pytest.raises(json.decoder.JSONDecodeError):
745-
response.json()
746-
747-
748-
def test_json_without_specified_encoding_value_error():
747+
@pytest.mark.parametrize(
748+
"encoding",
749+
[
750+
"utf-8",
751+
"utf-8-sig",
752+
"utf-16",
753+
"utf-16-be",
754+
"utf-16-le",
755+
"utf-32",
756+
"utf-32-be",
757+
"utf-32-le",
758+
],
759+
)
760+
def test_json_with_specified_charset(encoding):
749761
data = {"greeting": "hello", "recipient": "world"}
750-
content = json.dumps(data).encode("utf-32-be")
751-
headers = {"Content-Type": "application/json"}
752-
# force incorrect guess from `guess_json_utf` to trigger error
753-
with mock.patch("httpx._models.guess_json_utf", return_value="utf-32-le"):
754-
response = httpx.Response(200, content=content, headers=headers)
755-
with pytest.raises(json.decoder.JSONDecodeError):
756-
response.json()
762+
content = json.dumps(data).encode(encoding)
763+
headers = {"Content-Type": f"application/json; charset={encoding}"}
764+
response = httpx.Response(
765+
200,
766+
content=content,
767+
headers=headers,
768+
)
769+
assert response.json() == data
757770

758771

759772
@pytest.mark.parametrize(

tests/test_decoders.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,8 @@ def test_decoding_errors(header_value):
179179
[
180180
((b"Hello,", b" world!"), "ascii"),
181181
((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
182-
((b"Euro character: \x88!", b""), "cp1252"),
183-
((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
182+
((b"Euro character: \x88! abcdefghijklmnopqrstuvwxyz", b""), "cp1252"),
183+
((b"Accented: \xd6sterreich abcdefghijklmnopqrstuvwxyz", b""), "iso-8859-1"),
184184
],
185185
)
186186
@pytest.mark.asyncio
@@ -199,10 +199,9 @@ async def iterator():
199199
assert response.text == (b"".join(data)).decode(encoding)
200200

201201
# Streaming `.aiter_text` iteratively.
202-
response = httpx.Response(
203-
200,
204-
content=iterator(),
205-
)
202+
# Note that if we streamed the text *without* having read it first, then
203+
# we won't get a `charset_normalizer` guess, and will instead always rely
204+
# on utf-8 if no charset is specified.
206205
text = "".join([part async for part in response.aiter_text()])
207206
assert text == (b"".join(data)).decode(encoding)
208207

0 commit comments

Comments
 (0)