From 4842d2699c35421e14329da68b1b51e02f9f160a Mon Sep 17 00:00:00 2001 From: Winand Date: Wed, 27 Apr 2016 14:02:51 +0300 Subject: [PATCH] Improved performance for .str.encode/decode Use default implementation for optimized encodings, see https://docs.python.org/3.4/library/codecs.html#standard-encodings --- doc/source/whatsnew/v0.18.1.txt | 2 ++ pandas/core/strings.py | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 4ab82ed3fcdd5..d8f817a0bea68 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -412,6 +412,8 @@ Performance Improvements - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) - Improved performance of ``Period`` construction and plotting of ``Period``s. (:issue:`12903`, :issue:`11831`) +- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods + diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 66e4638a9e4b4..549b43be9abe5 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -13,6 +13,14 @@ import pandas.lib as lib import warnings import textwrap +import codecs + +_cpython_optimized_encoders = ( + "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ( + "utf-16", "utf-32" +) _shared_docs = dict() @@ -1182,7 +1190,12 @@ def str_decode(arr, encoding, errors="strict"): ------- decoded : Series/Index of objects """ - f = lambda x: x.decode(encoding, errors) + if encoding in _cpython_optimized_decoders: + #CPython optimized implementation + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] return _na_map(f, arr) @@ -1200,7 +1213,12 @@ def str_encode(arr, encoding, errors="strict"): ------- encoded : Series/Index of objects """ - f = lambda x: x.encode(encoding, errors) + if encoding in _cpython_optimized_encoders: + #CPython optimized implementation + f = lambda x: x.encode(encoding, errors) + else: + encoder = codecs.getencoder(encoding) + f = lambda x: encoder(x, errors)[0] return _na_map(f, arr)