Skip to content

Commit 15cc6e2

Browse files
Winand and jreback
authored and committed
PERF: Improved performance for .str.encode/decode
closes #13008
1 parent 65ed3af commit 15cc6e2

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

doc/source/whatsnew/v0.18.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,8 @@ Performance Improvements
408408

409409
- Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
410410
- Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`).
411+
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`)
412+
411413

412414

413415

pandas/core/strings.py

+20 -2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@
1313
import pandas.lib as lib
1414
import warnings
1515
import textwrap
16+
import codecs
17+
18+
_cpython_optimized_encoders = (
19+
"utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
20+
)
21+
_cpython_optimized_decoders = _cpython_optimized_encoders + (
22+
"utf-16", "utf-32"
23+
)
1624

1725
_shared_docs = dict()
1826

@@ -1182,7 +1190,12 @@ def str_decode(arr, encoding, errors="strict"):
11821190
-------
11831191
decoded : Series/Index of objects
11841192
"""
1185-
f = lambda x: x.decode(encoding, errors)
1193+
if encoding in _cpython_optimized_decoders:
1194+
# CPython optimized implementation
1195+
f = lambda x: x.decode(encoding, errors)
1196+
else:
1197+
decoder = codecs.getdecoder(encoding)
1198+
f = lambda x: decoder(x, errors)[0]
11861199
return _na_map(f, arr)
11871200

11881201

@@ -1200,7 +1213,12 @@ def str_encode(arr, encoding, errors="strict"):
12001213
-------
12011214
encoded : Series/Index of objects
12021215
"""
1203-
f = lambda x: x.encode(encoding, errors)
1216+
if encoding in _cpython_optimized_encoders:
1217+
# CPython optimized implementation
1218+
f = lambda x: x.encode(encoding, errors)
1219+
else:
1220+
encoder = codecs.getencoder(encoding)
1221+
f = lambda x: encoder(x, errors)[0]
12041222
return _na_map(f, arr)
12051223

12061224

0 commit comments

Comments
 (0)