Skip to content

Commit 5d5a3c2

Browse files
jreback authored and ischurov committed
PERF: use StringHashTable for strings
xref pandas-dev#13745. Provides a modest speedup for all string hashing. The key thing is that it releases the GIL on more operations where this is possible (mainly factorize). It can easily be extended to value_counts() and .duplicated() (for strings).

Author: Jeff Reback <[email protected]>

Closes pandas-dev#14859 from jreback/string and squashes the following commits:

98f46c2 [Jeff Reback] PERF: use StringHashTable for strings in factorizing
1 parent 50ac7ed commit 5d5a3c2

File tree

7 files changed

+330
-66
lines changed

7 files changed

+330
-66
lines changed

asv_bench/benchmarks/algorithms.py

+5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ class Algorithms(object):
88

99
def setup(self):
1010
N = 100000
11+
np.random.seed(1234)
1112

1213
self.int_unique = pd.Int64Index(np.arange(N * 5))
1314
# cache is_unique
@@ -23,11 +24,15 @@ def setup(self):
2324
self.arrpos = np.arange(1000000)
2425
self.arrneg = np.arange(-1000000, 0)
2526
self.arrmixed = np.array([1, -1]).repeat(500000)
27+
self.strings = tm.makeStringIndex(100000)
2628

2729
# match
2830
self.uniques = tm.makeStringIndex(1000).values
2931
self.all = self.uniques.repeat(10)
3032

33+
def time_factorize_string(self):
34+
self.strings.factorize()
35+
3136
def time_factorize_int(self):
3237
self.int.factorize()
3338

asv_bench/benchmarks/gil.py

+35
Original file line numberDiff line numberDiff line change
@@ -379,3 +379,38 @@ def pg_read_csv_datetime(self):
379379

380380
def time_read_csv_datetime(self):
381381
self.pg_read_csv_datetime()
382+
383+
384+
class nogil_factorize(object):
385+
number = 1
386+
repeat = 5
387+
388+
def setup(self):
389+
if (not have_real_test_parallel):
390+
raise NotImplementedError
391+
392+
np.random.seed(1234)
393+
self.strings = tm.makeStringIndex(100000)
394+
395+
def factorize_strings(self):
396+
pd.factorize(self.strings)
397+
398+
@test_parallel(num_threads=4)
399+
def _pg_factorize_strings_4(self):
400+
self.factorize_strings()
401+
402+
def time_factorize_strings_4(self):
403+
for i in range(2):
404+
self._pg_factorize_strings_4()
405+
406+
@test_parallel(num_threads=2)
407+
def _pg_factorize_strings_2(self):
408+
self.factorize_strings()
409+
410+
def time_factorize_strings_2(self):
411+
for i in range(4):
412+
self._pg_factorize_strings_2()
413+
414+
def time_factorize_strings(self):
415+
for i in range(8):
416+
self.factorize_strings()

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ Performance Improvements
135135
~~~~~~~~~~~~~~~~~~~~~~~~
136136

137137
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
138-
138+
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
139139

140140

141141

pandas/core/algorithms.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
6565
values = np.array(values, dtype='O')
6666

6767
f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
68-
result = _hashtable_algo(f, values.dtype, np.int64)
68+
result = _hashtable_algo(f, values, np.int64)
6969

7070
if na_sentinel != -1:
7171

@@ -102,7 +102,7 @@ def unique(values):
102102
values = com._asarray_tuplesafe(values)
103103

104104
f = lambda htype, caster: _unique_generic(values, htype, caster)
105-
return _hashtable_algo(f, values.dtype)
105+
return _hashtable_algo(f, values)
106106

107107

108108
def _unique_generic(values, table_type, type_caster):
@@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
759759
# helpers #
760760
# ------- #
761761

762-
def _hashtable_algo(f, dtype, return_dtype=None):
762+
def _hashtable_algo(f, values, return_dtype=None):
763763
"""
764764
f(HashTable, type_caster) -> result
765765
"""
766+
767+
dtype = values.dtype
766768
if is_float_dtype(dtype):
767769
return f(htable.Float64HashTable, _ensure_float64)
768770
elif is_integer_dtype(dtype):
@@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None):
773775
elif is_timedelta64_dtype(dtype):
774776
return_dtype = return_dtype or 'm8[ns]'
775777
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
776-
else:
777-
return f(htable.PyObjectHashTable, _ensure_object)
778+
779+
# its cheaper to use a String Hash Table than Object
780+
if lib.infer_dtype(values) in ['string']:
781+
return f(htable.StringHashTable, _ensure_object)
782+
783+
# use Object
784+
return f(htable.PyObjectHashTable, _ensure_object)
778785

779786
_hashtables = {
780787
'float64': (htable.Float64HashTable, htable.Float64Vector),
781788
'int64': (htable.Int64HashTable, htable.Int64Vector),
789+
'string': (htable.StringHashTable, htable.ObjectVector),
782790
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
783791
}
784792

785793

786794
def _get_data_algo(values, func_map):
795+
796+
f = None
787797
if is_float_dtype(values):
788798
f = func_map['float64']
789799
values = _ensure_float64(values)
@@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
796806
f = func_map['int64']
797807
values = _ensure_int64(values)
798808
else:
799-
f = func_map['generic']
809+
800810
values = _ensure_object(values)
811+
812+
# its cheaper to use a String Hash Table than Object
813+
if lib.infer_dtype(values) in ['string']:
814+
try:
815+
f = func_map['string']
816+
except KeyError:
817+
pass
818+
819+
if f is None:
820+
f = func_map['generic']
821+
801822
return f, values
802823

803824

pandas/hashtable.pxd

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
1+
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
22

33
# prototypes for sharing
44

@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
2222

2323
cpdef get_item(self, object val)
2424
cpdef set_item(self, object key, Py_ssize_t val)
25+
26+
cdef class StringHashTable(HashTable):
27+
cdef kh_str_t *table
28+
29+
cpdef get_item(self, object val)
30+
cpdef set_item(self, object key, Py_ssize_t val)

pandas/hashtable.pyx

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7-
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
7+
8+
from libc.stdlib cimport malloc, free
9+
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
10+
PyString_Check, PyBytes_Check,
11+
PyUnicode_Check)
812

913
from util cimport _checknan
1014
cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
3337
cdef extern from "Python.h":
3438
int PySlice_Check(object)
3539

36-
cdef size_t _INIT_VEC_CAP = 32
40+
cdef size_t _INIT_VEC_CAP = 128
3741

3842

3943
include "hashtable_class_helper.pxi"

0 commit comments

Comments
 (0)