Commit d96ccd2

Merge pull request #10024 from cpcloud/fixlen-string-faster

Speed up max_len_string_array

2 parents: 28b1488 + ee2626e

5 files changed: +49 -20 lines

doc/source/whatsnew/v0.16.1.txt (+1 -1)

@@ -171,7 +171,7 @@ Performance Improvements
 - Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`)
 - Improved csv write performance generally by 2x (:issue:`9940`)
-
+- Improved the performance of ``pd.lib.max_len_string_array`` by 5-7x (:issue:`10024`)
pandas/io/stata.py (+2 -2)

@@ -1626,7 +1626,7 @@ def _dtype_to_stata_type(dtype, column):
     elif dtype.type == np.object_:  # try to coerce it to the biggest string
                                     # not memory efficient, what else could we
                                     # do?
-        itemsize = max_len_string_array(column.values)
+        itemsize = max_len_string_array(com._ensure_object(column.values))
         return chr(max(itemsize, 1))
     elif dtype == np.float64:
         return chr(255)
@@ -1664,7 +1664,7 @@ def _dtype_to_default_stata_fmt(dtype, column):
     if not (inferred_dtype in ('string', 'unicode')
             or len(column) == 0):
         raise ValueError('Writing general object arrays is not supported')
-    itemsize = max_len_string_array(column.values)
+    itemsize = max_len_string_array(com._ensure_object(column.values))
     if itemsize > 244:
         raise ValueError(excessive_string_length_error % column.name)
     return "%" + str(max(itemsize, 1)) + "s"

pandas/io/tests/test_data.py (+1)

@@ -105,6 +105,7 @@ def test_get_multi_all_invalid(self):
         sl = ['INVALID', 'INVALID2', 'INVALID3']
         self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012')

+    @network
     def test_get_multi2(self):
         with warnings.catch_warnings(record=True) as w:
             for locale in self.locales:
pandas/lib.pyx (+30 -13)

@@ -1,6 +1,7 @@
 cimport numpy as np
 cimport cython
 import numpy as np
+import sys

 from numpy cimport *

@@ -10,6 +11,7 @@ cdef extern from "numpy/arrayobject.h":
     cdef enum NPY_TYPES:
         NPY_intp "NPY_INTP"

+
 from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyDict_Contains, PyDict_Keys,
                       Py_INCREF, PyTuple_SET_ITEM,
@@ -18,7 +20,14 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyBytes_Check,
                       PyTuple_SetItem,
                       PyTuple_New,
-                      PyObject_SetAttrString)
+                      PyObject_SetAttrString,
+                      PyBytes_GET_SIZE,
+                      PyUnicode_GET_SIZE)
+
+try:
+    from cpython cimport PyString_GET_SIZE
+except ImportError:
+    from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE

 cdef extern from "Python.h":
     Py_ssize_t PY_SSIZE_T_MAX
@@ -32,7 +41,6 @@ cdef extern from "Python.h":
                              Py_ssize_t *slicelength) except -1


-
 cimport cpython

 isnan = np.isnan
@@ -896,23 +904,32 @@ def clean_index_list(list obj):

     return maybe_convert_objects(converted), 0

+
+ctypedef fused pandas_string:
+    str
+    unicode
+    bytes
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def max_len_string_array(ndarray arr):
+cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
     """ return the maximum size of elements in a 1-dim string array """
     cdef:
-        int i, m, l
-        int length = arr.shape[0]
-        object v
+        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
+        pandas_string v

-    m = 0
-    for i from 0 <= i < length:
+    for i in range(length):
         v = arr[i]
-        if PyString_Check(v) or PyBytes_Check(v) or PyUnicode_Check(v):
-            l = len(v)
-
-            if l > m:
-                m = l
+        if PyString_Check(v):
+            l = PyString_GET_SIZE(v)
+        elif PyBytes_Check(v):
+            l = PyBytes_GET_SIZE(v)
+        elif PyUnicode_Check(v):
+            l = PyUnicode_GET_SIZE(v)
+
+        if l > m:
+            m = l

     return m
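The speedup comes from two things: the pandas_string fused type lets Cython generate a specialized loop per element type (str, unicode, bytes), and the PyString/PyBytes/PyUnicode_GET_SIZE macros read the length directly from the object rather than going through a Python-level len() call. For clarity, a pure-Python reference sketch of what the routine computes (the function name here is illustrative, not part of pandas):

import numpy as np

def max_len_string_array_ref(arr):
    """Reference behaviour: length of the longest string-like element in a
    1-d array; non-string entries such as np.nan are skipped."""
    m = 0
    for v in arr:
        if isinstance(v, (str, bytes)):
            l = len(v)
            if l > m:
                m = l
    return m

arr = np.array(['foo', 'b', np.nan], dtype=object)
assert max_len_string_array_ref(arr) == 3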

pandas/tests/test_lib.py (+15 -4)

@@ -8,18 +8,29 @@
 import pandas.util.testing as tm
 from pandas.compat import u

+
 class TestMisc(tm.TestCase):

     def test_max_len_string_array(self):

-        arr = np.array(['foo','b',np.nan],dtype='object')
-        self.assertTrue(max_len_string_array(arr),3)
+        arr = a = np.array(['foo', 'b', np.nan], dtype='object')
+        self.assertTrue(max_len_string_array(arr), 3)

         # unicode
-        arr = arr.astype('U')
-        self.assertTrue(max_len_string_array(arr),3)
+        arr = a.astype('U').astype(object)
+        self.assertTrue(max_len_string_array(arr), 3)
+
+        # bytes for python3
+        arr = a.astype('S').astype(object)
+        self.assertTrue(max_len_string_array(arr), 3)
+
+        # raises
+        tm.assertRaises(TypeError,
+                        lambda: max_len_string_array(arr.astype('U')))
+

 class TestIsscalar(tm.TestCase):
+
     def test_isscalar_builtin_scalars(self):
         self.assertTrue(isscalar(None))
         self.assertTrue(isscalar(True))