Skip to content

Commit b30c0cd

Browse files
Remove chardet/charset-normalizer. (#7589)
Add fallback_charset_resolver ClientSession parameter. (#7561) Co-authored-by: Sam Bull <[email protected]> (cherry picked from commit 6755796) --------- Co-authored-by: Sam Bull <[email protected]>
1 parent 5946c74 commit b30c0cd

9 files changed

+121
-98
lines changed

CHANGES/7561.feature

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Replace automatic character set detection with a `fallback_charset_resolver` parameter
2+
in `ClientSession` to allow user-supplied character set detection functions.

CONTRIBUTORS.txt

+1
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ Jesus Cea
163163
Jian Zeng
164164
Jinkyu Yi
165165
Joel Watts
166+
John Parton
166167
Jon Nabozny
167168
Jonas Krüger Svensson
168169
Jonas Obrist

aiohttp/client.py

+26
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@
8888
from .tracing import Trace, TraceConfig
8989
from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL
9090

91+
try:
92+
import cchardet as chardet
93+
except ImportError: # pragma: no cover
94+
import charset_normalizer as chardet # type: ignore[no-redef]
95+
9196
__all__ = (
9297
# client_exceptions
9398
"ClientConnectionError",
@@ -159,6 +164,22 @@ class ClientTimeout:
159164
DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)
160165

161166
_RetType = TypeVar("_RetType")
167+
_CharsetResolver = Callable[[ClientResponse, bytes], str]
168+
169+
170+
def _default_fallback_charset_resolver(response: ClientResponse, body: bytes) -> str:
171+
172+
ret: str = chardet.detect(body)["encoding"] or "utf-8"
173+
174+
if ret != "utf-8":
175+
warnings.warn(
176+
"Automatic charset detection will be removed in 3.9, see: "
177+
"https://docs.aiohttp.org/en/stable/client_advanced.html#character-set-detection", # noqa: E501
178+
DeprecationWarning,
179+
stacklevel=3,
180+
)
181+
182+
return ret
162183

163184

164185
class ClientSession:
@@ -220,6 +241,9 @@ def __init__(
220241
requote_redirect_url: bool = True,
221242
trace_configs: Optional[List[TraceConfig]] = None,
222243
read_bufsize: int = 2**16,
244+
fallback_charset_resolver: _CharsetResolver = (
245+
_default_fallback_charset_resolver
246+
),
223247
) -> None:
224248
if loop is None:
225249
if connector is not None:
@@ -313,6 +337,8 @@ def __init__(
313337
for trace_config in self._trace_configs:
314338
trace_config.freeze()
315339

340+
self._resolve_charset = fallback_charset_resolver
341+
316342
def __init_subclass__(cls: Type["ClientSession"]) -> None:
317343
warnings.warn(
318344
"Inheritance class {} from ClientSession "

aiohttp/client_reqrep.py

+28-27
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
import codecs
3+
import contextlib
34
import functools
45
import io
56
import re
@@ -12,6 +13,7 @@
1213
from typing import (
1314
TYPE_CHECKING,
1415
Any,
16+
Callable,
1517
Dict,
1618
Iterable,
1719
List,
@@ -66,11 +68,6 @@
6668
ssl = None # type: ignore[assignment]
6769
SSLContext = object # type: ignore[misc,assignment]
6870

69-
try:
70-
import cchardet as chardet
71-
except ImportError: # pragma: no cover
72-
import charset_normalizer as chardet # type: ignore[no-redef]
73-
7471

7572
__all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")
7673

@@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin):
722719
_raw_headers: RawHeaders = None # type: ignore[assignment] # Response raw headers
723720

724721
_connection = None # current connection
725-
_source_traceback = None
726-
# setted up by ClientRequest after ClientResponse object creation
722+
_source_traceback: Optional[traceback.StackSummary] = None
723+
# set up by ClientRequest after ClientResponse object creation
727724
# post-init stage allows to not change ctor signature
728725
_closed = True # to allow __del__ for non-initialized properly response
729726
_released = False
@@ -760,6 +757,15 @@ def __init__(
760757
self._loop = loop
761758
# store a reference to session #1985
762759
self._session: Optional[ClientSession] = session
760+
# Save reference to _resolve_charset, so that get_encoding() will still
761+
# work after the response has finished reading the body.
762+
if session is None:
763+
# TODO: Fix session=None in tests (see ClientRequest.__init__).
764+
self._resolve_charset: Callable[
765+
["ClientResponse", bytes], str
766+
] = lambda *_: "utf-8"
767+
else:
768+
self._resolve_charset = session._resolve_charset
763769
if loop.get_debug():
764770
self._source_traceback = traceback.extract_stack(sys._getframe(1))
765771

@@ -1053,27 +1059,22 @@ def get_encoding(self) -> str:
10531059

10541060
encoding = mimetype.parameters.get("charset")
10551061
if encoding:
1056-
try:
1057-
codecs.lookup(encoding)
1058-
except LookupError:
1059-
encoding = None
1060-
if not encoding:
1061-
if mimetype.type == "application" and (
1062-
mimetype.subtype == "json" or mimetype.subtype == "rdap"
1063-
):
1064-
# RFC 7159 states that the default encoding is UTF-8.
1065-
# RFC 7483 defines application/rdap+json
1066-
encoding = "utf-8"
1067-
elif self._body is None:
1068-
raise RuntimeError(
1069-
"Cannot guess the encoding of " "a not yet read body"
1070-
)
1071-
else:
1072-
encoding = chardet.detect(self._body)["encoding"]
1073-
if not encoding:
1074-
encoding = "utf-8"
1062+
with contextlib.suppress(LookupError):
1063+
return codecs.lookup(encoding).name
1064+
1065+
if mimetype.type == "application" and (
1066+
mimetype.subtype == "json" or mimetype.subtype == "rdap"
1067+
):
1068+
# RFC 7159 states that the default encoding is UTF-8.
1069+
# RFC 7483 defines application/rdap+json
1070+
return "utf-8"
1071+
1072+
if self._body is None:
1073+
raise RuntimeError(
1074+
"Cannot compute fallback encoding of a not yet read body"
1075+
)
10751076

1076-
return encoding
1077+
return self._resolve_charset(self, self._body)
10771078

10781079
async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
10791080
"""Read response payload and decode."""

docs/client_advanced.rst

+30
Original file line numberDiff line numberDiff line change
@@ -640,3 +640,33 @@ are changed so that aiohttp itself can wait on the underlying
640640
connection to close. Please follow issue `#1925
641641
<https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
642642
on this.
643+
644+
645+
Character Set Detection
646+
-----------------------
647+
648+
If you encounter an 'Automatic charset detection will be removed' warning
649+
when using :meth:`ClientResponse.text()` this may be because the response
650+
does not include the charset needed to decode the body.
651+
652+
If you know the correct encoding for a request, you can simply specify
653+
the encoding as a parameter (e.g. ``resp.text("windows-1252")``).
654+
655+
Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
656+
can be used to reintroduce charset guessing functionality. When a charset is not found
657+
in the Content-Type header, this function will be called to get the charset encoding. For
658+
example, this can be used with the ``chardetng_py`` library::
659+
660+
from chardetng_py import detect
661+
662+
def charset_resolver(resp: ClientResponse, body: bytes) -> str:
663+
tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
664+
return detect(body, allow_utf8=True, tld=tld)
665+
666+
ClientSession(fallback_charset_resolver=charset_resolver)
667+
668+
Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::
669+
670+
from charset_normalizer import detect
671+
672+
ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8")

docs/client_reference.rst

+23-28
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
5151
read_bufsize=2**16, \
5252
requote_redirect_url=False, \
5353
trust_env=False, \
54-
trace_configs=None)
54+
trace_configs=None, \
55+
fallback_charset_resolver=_chardet_resolver)
5556

5657
The class for creating client sessions and making requests.
5758

@@ -200,6 +201,18 @@ The client session supports the context manager protocol for self closing.
200201
disabling. See :ref:`aiohttp-client-tracing-reference` for
201202
more information.
202203

204+
:param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
205+
A :term:`callable` that accepts a :class:`ClientResponse` and the
206+
:class:`bytes` contents, and returns a :class:`str` which will be used as
207+
the encoding parameter to :meth:`bytes.decode()`.
208+
209+
This function will be called when the charset is not known (e.g. not specified in the
210+
Content-Type header). The default function in 3.8.6 calls ``cchardet``
211+
or ``charset-normalizer``. In 3.9+ this will be replaced with a function that
212+
simply defaults to ``utf-8``.
213+
214+
.. versionadded:: 3.8.6
215+
203216
.. attribute:: closed
204217

205218
``True`` if the session has been closed, ``False`` otherwise.
@@ -1400,12 +1413,8 @@ Response object
14001413
Read response's body and return decoded :class:`str` using
14011414
specified *encoding* parameter.
14021415

1403-
If *encoding* is ``None`` content encoding is autocalculated
1404-
using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
1405-
header is not provided by server.
1406-
1407-
:term:`cchardet` is used with fallback to :term:`charset-normalizer` if
1408-
*cchardet* is not available.
1416+
If *encoding* is ``None`` content encoding is determined from the
1417+
Content-Type header, or using the ``fallback_charset_resolver`` function.
14091418

14101419
Close underlying connection if data reading gets an error,
14111420
release connection otherwise.
@@ -1414,10 +1423,7 @@ Response object
14141423
``None`` for encoding autodetection
14151424
(default).
14161425

1417-
:return str: decoded *BODY*
14181426

1419-
:raise LookupError: if the encoding detected by cchardet is
1420-
unknown by Python (e.g. VISCII).
14211427

14221428
.. note::
14231429

@@ -1430,18 +1436,15 @@ Response object
14301436

14311437
await resp.text('ISO-8859-1')
14321438

1433-
.. comethod:: json(*, encoding=None, loads=json.loads, \
1439+
.. method:: json(*, encoding=None, loads=json.loads, \
14341440
content_type='application/json')
1441+
:async:
14351442

14361443
Read response's body as *JSON*, return :class:`dict` using
14371444
specified *encoding* and *loader*. If data is not still available
1438-
a ``read`` call will be done,
1445+
a ``read`` call will be done.
14391446

1440-
If *encoding* is ``None`` content encoding is autocalculated
1441-
using :term:`cchardet` or :term:`charset-normalizer` as fallback if
1442-
*cchardet* is not available.
1443-
1444-
if response's `content-type` does not match `content_type` parameter
1447+
If response's `content-type` does not match `content_type` parameter
14451448
:exc:`aiohttp.ContentTypeError` get raised.
14461449
To disable content type check pass ``None`` value.
14471450

@@ -1473,17 +1476,9 @@ Response object
14731476

14741477
.. method:: get_encoding()
14751478

1476-
Automatically detect content encoding using ``charset`` info in
1477-
``Content-Type`` HTTP header. If this info is not exists or there
1478-
are no appropriate codecs for encoding then :term:`cchardet` /
1479-
:term:`charset-normalizer` is used.
1480-
1481-
Beware that it is not always safe to use the result of this function to
1482-
decode a response. Some encodings detected by cchardet are not known by
1483-
Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.
1484-
1485-
:raise RuntimeError: if called before the body has been read,
1486-
for :term:`cchardet` usage
1479+
Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
1480+
If no charset is present or the charset is not understood by Python, the
1481+
``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.
14871482

14881483
.. versionadded:: 3.0
14891484

docs/index.rst

-8
Original file line numberDiff line numberDiff line change
@@ -162,14 +162,6 @@ Dependencies
162162
- *charset-normalizer*
163163
- *multidict*
164164
- *yarl*
165-
- *Optional* :term:`cchardet` as faster replacement for
166-
:term:`charset-normalizer`.
167-
168-
Install it explicitly via:
169-
170-
.. code-block:: bash
171-
172-
$ pip install cchardet
173165

174166
- *Optional* :term:`aiodns` for fast DNS resolving. The
175167
library is highly recommended.

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ filterwarnings =
150150
# can be dropped with the next release of `certify`, specifically
151151
# `certify > 2022.06.15`.
152152
ignore:path is deprecated. Use files.. instead. Refer to https.//importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice.:DeprecationWarning:certifi.core
153+
ignore:Automatic charset detection will be removed in 3.9:DeprecationWarning
153154
junit_suite_name = aiohttp_test_suite
154155
norecursedirs = dist docs build .tox .eggs
155156
minversion = 3.8.2

0 commit comments

Comments
 (0)