PERF: use StringHashTable for strings

jreback · ischurov · commit 5d5a3c228ffb · 2016-12-19T04:05:00.000+03:00
xref pandas-dev#13745. Provides a modest speedup for all string hashing. The key thing is that it releases the GIL on more operations where this is possible (mainly factorize). This can easily be extended to value_counts() and .duplicated() (for strings). Author: Jeff Reback <jeff@reback.net>. Closes pandas-dev#14859 from jreback/string and squashes the following commits: 98f46c2 [Jeff Reback] PERF: use StringHashTable for strings in factorizing
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -8,6 +8,7 @@ class Algorithms(object):
 
     def setup(self):
         N = 100000
+        np.random.seed(1234)
 
         self.int_unique = pd.Int64Index(np.arange(N * 5))
         # cache is_unique
@@ -23,11 +24,15 @@ def setup(self):
         self.arrpos = np.arange(1000000)
         self.arrneg = np.arange(-1000000, 0)
         self.arrmixed = np.array([1, -1]).repeat(500000)
+        self.strings = tm.makeStringIndex(100000)
 
         # match
         self.uniques = tm.makeStringIndex(1000).values
         self.all = self.uniques.repeat(10)
 
+    def time_factorize_string(self):
+        self.strings.factorize()
+
     def time_factorize_int(self):
         self.int.factorize()
 
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
@@ -379,3 +379,38 @@ def pg_read_csv_datetime(self):
 
     def time_read_csv_datetime(self):
         self.pg_read_csv_datetime()
+
+
+class nogil_factorize(object):
+    number = 1
+    repeat = 5
+
+    def setup(self):
+        if (not have_real_test_parallel):
+            raise NotImplementedError
+
+        np.random.seed(1234)
+        self.strings = tm.makeStringIndex(100000)
+
+    def factorize_strings(self):
+        pd.factorize(self.strings)
+
+    @test_parallel(num_threads=4)
+    def _pg_factorize_strings_4(self):
+        self.factorize_strings()
+
+    def time_factorize_strings_4(self):
+        for i in range(2):
+            self._pg_factorize_strings_4()
+
+    @test_parallel(num_threads=2)
+    def _pg_factorize_strings_2(self):
+        self.factorize_strings()
+
+    def time_factorize_strings_2(self):
+        for i in range(4):
+            self._pg_factorize_strings_2()
+
+    def time_factorize_strings(self):
+        for i in range(8):
+            self.factorize_strings()
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -135,7 +135,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
-
+- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
 
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
         values = np.array(values, dtype='O')
 
     f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
-    result = _hashtable_algo(f, values.dtype, np.int64)
+    result = _hashtable_algo(f, values, np.int64)
 
     if na_sentinel != -1:
 
@@ -102,7 +102,7 @@ def unique(values):
     values = com._asarray_tuplesafe(values)
 
     f = lambda htype, caster: _unique_generic(values, htype, caster)
-    return _hashtable_algo(f, values.dtype)
+    return _hashtable_algo(f, values)
 
 
 def _unique_generic(values, table_type, type_caster):
@@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
 # helpers #
 # ------- #
 
-def _hashtable_algo(f, dtype, return_dtype=None):
+def _hashtable_algo(f, values, return_dtype=None):
     """
     f(HashTable, type_caster) -> result
     """
+
+    dtype = values.dtype
     if is_float_dtype(dtype):
         return f(htable.Float64HashTable, _ensure_float64)
     elif is_integer_dtype(dtype):
@@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None):
     elif is_timedelta64_dtype(dtype):
         return_dtype = return_dtype or 'm8[ns]'
         return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
-    else:
-        return f(htable.PyObjectHashTable, _ensure_object)
+
+    # it's cheaper to use a StringHashTable than a PyObjectHashTable
+    if lib.infer_dtype(values) in ['string']:
+        return f(htable.StringHashTable, _ensure_object)
+
+    # use Object
+    return f(htable.PyObjectHashTable, _ensure_object)
 
 _hashtables = {
     'float64': (htable.Float64HashTable, htable.Float64Vector),
     'int64': (htable.Int64HashTable, htable.Int64Vector),
+    'string': (htable.StringHashTable, htable.ObjectVector),
     'generic': (htable.PyObjectHashTable, htable.ObjectVector)
 }
 
 
 def _get_data_algo(values, func_map):
+
+    f = None
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
         f = func_map['int64']
         values = _ensure_int64(values)
     else:
-        f = func_map['generic']
+
         values = _ensure_object(values)
+
+        # it's cheaper to use a StringHashTable than a PyObjectHashTable
+        if lib.infer_dtype(values) in ['string']:
+            try:
+                f = func_map['string']
+            except KeyError:
+                pass
+
+    if f is None:
+        f = func_map['generic']
+
     return f, values
 
 
diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd
@@ -1,4 +1,4 @@
-from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
+from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
 
 # prototypes for sharing
 
@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
 
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
+
+cdef class StringHashTable(HashTable):
+    cdef kh_str_t *table
+
+    cpdef get_item(self, object val)
+    cpdef set_item(self, object key, Py_ssize_t val)
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
 
 from khash cimport *
 from numpy cimport *
-from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
+
+from libc.stdlib cimport malloc, free
+from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
+                      PyString_Check, PyBytes_Check,
+                      PyUnicode_Check)
 
 from util cimport _checknan
 cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
 cdef extern from "Python.h":
     int PySlice_Check(object)
 
-cdef size_t _INIT_VEC_CAP = 32
+cdef size_t _INIT_VEC_CAP = 128
 
 
 include "hashtable_class_helper.pxi"
diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in