Skip to content

Commit a68c402

Browse files
committed
PERF: use StringHashTable for strings in factorizing
allows releasing the GIL on these dtypes xref pandas-dev#13745
1 parent abdfa3e commit a68c402

File tree

7 files changed

+328
-66
lines changed

7 files changed

+328
-66
lines changed

asv_bench/benchmarks/algorithms.py

+4
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,15 @@ def setup(self):
2323
self.arrpos = np.arange(1000000)
2424
self.arrneg = np.arange(-1000000, 0)
2525
self.arrmixed = np.array([1, -1]).repeat(500000)
26+
self.strings = tm.makeStringIndex(100000)
2627

2728
# match
2829
self.uniques = tm.makeStringIndex(1000).values
2930
self.all = self.uniques.repeat(10)
3031

32+
def time_factorize_string(self):
33+
self.strings.factorize()
34+
3135
def time_factorize_int(self):
3236
self.int.factorize()
3337

asv_bench/benchmarks/gil.py

+34
Original file line numberDiff line numberDiff line change
@@ -379,3 +379,37 @@ def pg_read_csv_datetime(self):
379379

380380
def time_read_csv_datetime(self):
381381
self.pg_read_csv_datetime()
382+
383+
384+
class nogil_factorize(object):
385+
number = 1
386+
repeat = 5
387+
388+
def setup(self):
389+
if (not have_real_test_parallel):
390+
raise NotImplementedError
391+
392+
self.strings = tm.makeStringIndex(100000)
393+
394+
def factorize_strings(self):
395+
pd.factorize(self.strings)
396+
397+
@test_parallel(num_threads=4)
398+
def _pg_factorize_strings_4(self):
399+
self.factorize_strings()
400+
401+
def time_factorize_strings_4(self):
402+
for i in range(2):
403+
self._pg_factorize_strings_4()
404+
405+
@test_parallel(num_threads=2)
406+
def _pg_factorize_strings_2(self):
407+
self.factorize_strings()
408+
409+
def time_factorize_strings_2(self):
410+
for i in range(4):
411+
self._pg_factorize_strings_2()
412+
413+
def time_factorize_strings(self):
414+
for i in range(8):
415+
self.factorize_strings()

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ Performance Improvements
117117
~~~~~~~~~~~~~~~~~~~~~~~~
118118

119119
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
120-
120+
- Improved performance of ``pd.factorize()`` by releasing the GIL for ``object`` dtype data that is inferred as strings (:issue:`14859`)
121121

122122

123123

pandas/core/algorithms.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
6565
values = np.array(values, dtype='O')
6666

6767
f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
68-
result = _hashtable_algo(f, values.dtype, np.int64)
68+
result = _hashtable_algo(f, values, np.int64)
6969

7070
if na_sentinel != -1:
7171

@@ -102,7 +102,7 @@ def unique(values):
102102
values = com._asarray_tuplesafe(values)
103103

104104
f = lambda htype, caster: _unique_generic(values, htype, caster)
105-
return _hashtable_algo(f, values.dtype)
105+
return _hashtable_algo(f, values)
106106

107107

108108
def _unique_generic(values, table_type, type_caster):
@@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
759759
# helpers #
760760
# ------- #
761761

762-
def _hashtable_algo(f, dtype, return_dtype=None):
762+
def _hashtable_algo(f, values, return_dtype=None):
763763
"""
764764
f(HashTable, type_caster) -> result
765765
"""
766+
767+
dtype = values.dtype
766768
if is_float_dtype(dtype):
767769
return f(htable.Float64HashTable, _ensure_float64)
768770
elif is_integer_dtype(dtype):
@@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None):
773775
elif is_timedelta64_dtype(dtype):
774776
return_dtype = return_dtype or 'm8[ns]'
775777
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
776-
else:
777-
return f(htable.PyObjectHashTable, _ensure_object)
778+
779+
# it's cheaper to use a String Hash Table than Object
780+
if lib.infer_dtype(values) in ['string']:
781+
return f(htable.StringHashTable, _ensure_object)
782+
783+
# use Object
784+
return f(htable.PyObjectHashTable, _ensure_object)
778785

779786
_hashtables = {
780787
'float64': (htable.Float64HashTable, htable.Float64Vector),
781788
'int64': (htable.Int64HashTable, htable.Int64Vector),
789+
'string': (htable.StringHashTable, htable.ObjectVector),
782790
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
783791
}
784792

785793

786794
def _get_data_algo(values, func_map):
795+
796+
f = None
787797
if is_float_dtype(values):
788798
f = func_map['float64']
789799
values = _ensure_float64(values)
@@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
796806
f = func_map['int64']
797807
values = _ensure_int64(values)
798808
else:
799-
f = func_map['generic']
809+
800810
values = _ensure_object(values)
811+
812+
# it's cheaper to use a String Hash Table than Object
813+
if lib.infer_dtype(values) in ['string']:
814+
try:
815+
f = func_map['string']
816+
except KeyError:
817+
pass
818+
819+
if f is None:
820+
f = func_map['generic']
821+
801822
return f, values
802823

803824

pandas/hashtable.pxd

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
1+
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
22

33
# prototypes for sharing
44

@@ -22,3 +22,9 @@ cdef class PyObjectHashTable(HashTable):
2222

2323
cpdef get_item(self, object val)
2424
cpdef set_item(self, object key, Py_ssize_t val)
25+
26+
cdef class StringHashTable(HashTable):
27+
cdef kh_str_t *table
28+
29+
cpdef get_item(self, object val)
30+
cpdef set_item(self, object key, Py_ssize_t val)

pandas/hashtable.pyx

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7-
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
7+
8+
from libc.stdlib cimport malloc, free
9+
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
10+
PyString_Check, PyBytes_Check,
11+
PyUnicode_Check)
812

913
from util cimport _checknan
1014
cimport util
@@ -33,7 +37,7 @@ PyDateTime_IMPORT
3337
cdef extern from "Python.h":
3438
int PySlice_Check(object)
3539

36-
cdef size_t _INIT_VEC_CAP = 32
40+
cdef size_t _INIT_VEC_CAP = 128
3741

3842

3943
include "hashtable_class_helper.pxi"

0 commit comments

Comments
 (0)