Commit d96ccd2

Merge pull request #10024 from cpcloud/fixlen-string-faster

Speed up max_len_string_array

2 parents: 28b1488 + ee2626e

5 files changed: +49 -20 lines

doc/source/whatsnew/v0.16.1.txt (+1 -1)

@@ -171,7 +171,7 @@ Performance Improvements
 - Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`)
 - Improved csv write performance generally by 2x (:issue:`9940`)
-
+- Improved the performance of ``pd.lib.max_len_string_array`` by 5-7x (:issue:`10024`)
pandas/io/stata.py (+2 -2)

@@ -1626,7 +1626,7 @@ def _dtype_to_stata_type(dtype, column):
     elif dtype.type == np.object_:  # try to coerce it to the biggest string
                                     # not memory efficient, what else could we
                                     # do?
-        itemsize = max_len_string_array(column.values)
+        itemsize = max_len_string_array(com._ensure_object(column.values))
         return chr(max(itemsize, 1))
     elif dtype == np.float64:
         return chr(255)
@@ -1664,7 +1664,7 @@ def _dtype_to_default_stata_fmt(dtype, column):
     if not (inferred_dtype in ('string', 'unicode')
             or len(column) == 0):
         raise ValueError('Writing general object arrays is not supported')
-    itemsize = max_len_string_array(column.values)
+    itemsize = max_len_string_array(com._ensure_object(column.values))
     if itemsize > 244:
         raise ValueError(excessive_string_length_error % column.name)
     return "%" + str(max(itemsize, 1)) + "s"

pandas/io/tests/test_data.py (+1)

@@ -105,6 +105,7 @@ def test_get_multi_all_invalid(self):
         sl = ['INVALID', 'INVALID2', 'INVALID3']
         self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012')

+    @network
     def test_get_multi2(self):
         with warnings.catch_warnings(record=True) as w:
             for locale in self.locales:
pandas/lib.pyx (+30 -13)

@@ -1,6 +1,7 @@
 cimport numpy as np
 cimport cython
 import numpy as np
+import sys

 from numpy cimport *

@@ -10,6 +11,7 @@ cdef extern from "numpy/arrayobject.h":
     cdef enum NPY_TYPES:
         NPY_intp "NPY_INTP"

+
 from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyDict_Contains, PyDict_Keys,
                       Py_INCREF, PyTuple_SET_ITEM,
@@ -18,7 +20,14 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyBytes_Check,
                       PyTuple_SetItem,
                       PyTuple_New,
-                      PyObject_SetAttrString)
+                      PyObject_SetAttrString,
+                      PyBytes_GET_SIZE,
+                      PyUnicode_GET_SIZE)
+
+try:
+    from cpython cimport PyString_GET_SIZE
+except ImportError:
+    from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE

 cdef extern from "Python.h":
     Py_ssize_t PY_SSIZE_T_MAX
@@ -32,7 +41,6 @@ cdef extern from "Python.h":
                              Py_ssize_t *slicelength) except -1


-
 cimport cpython

 isnan = np.isnan
@@ -896,23 +904,32 @@ def clean_index_list(list obj):

     return maybe_convert_objects(converted), 0

+
+ctypedef fused pandas_string:
+    str
+    unicode
+    bytes
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def max_len_string_array(ndarray arr):
+cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
     """ return the maximum size of elements in a 1-dim string array """
     cdef:
-        int i, m, l
-        int length = arr.shape[0]
-        object v
+        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
+        pandas_string v

-    m = 0
-    for i from 0 <= i < length:
+    for i in range(length):
         v = arr[i]
-        if PyString_Check(v) or PyBytes_Check(v) or PyUnicode_Check(v):
-            l = len(v)
-
-            if l > m:
-                m = l
+        if PyString_Check(v):
+            l = PyString_GET_SIZE(v)
+        elif PyBytes_Check(v):
+            l = PyBytes_GET_SIZE(v)
+        elif PyUnicode_Check(v):
+            l = PyUnicode_GET_SIZE(v)
+
+        if l > m:
+            m = l

     return m
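The speedup comes from two things: the pandas_string fused type lets Cython generate a specialized loop per element type (str, unicode, bytes), and the PyString/PyBytes/PyUnicode_GET_SIZE macros read the length directly from the object rather than going through a Python-level len() call. For clarity, a pure-Python reference sketch of what the routine computes (the function name here is illustrative, not part of pandas):

import numpy as np

def max_len_string_array_ref(arr):
    """Reference behaviour: length of the longest string-like element in a
    1-d array; non-string entries such as np.nan are skipped."""
    m = 0
    for v in arr:
        if isinstance(v, (str, bytes)):
            l = len(v)
            if l > m:
                m = l
    return m

arr = np.array(['foo', 'b', np.nan], dtype=object)
assert max_len_string_array_ref(arr) == 3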

pandas/tests/test_lib.py (+15 -4)

@@ -8,18 +8,29 @@
 import pandas.util.testing as tm
 from pandas.compat import u

+
 class TestMisc(tm.TestCase):

     def test_max_len_string_array(self):

-        arr = np.array(['foo','b',np.nan],dtype='object')
-        self.assertTrue(max_len_string_array(arr),3)
+        arr = a = np.array(['foo', 'b', np.nan], dtype='object')
+        self.assertTrue(max_len_string_array(arr), 3)

         # unicode
-        arr = arr.astype('U')
-        self.assertTrue(max_len_string_array(arr),3)
+        arr = a.astype('U').astype(object)
+        self.assertTrue(max_len_string_array(arr), 3)
+
+        # bytes for python3
+        arr = a.astype('S').astype(object)
+        self.assertTrue(max_len_string_array(arr), 3)
+
+        # raises
+        tm.assertRaises(TypeError,
+                        lambda: max_len_string_array(arr.astype('U')))
+

 class TestIsscalar(tm.TestCase):
+
     def test_isscalar_builtin_scalars(self):
         self.assertTrue(isscalar(None))
         self.assertTrue(isscalar(True))