@@ -65,7 +65,7 @@ def match(to_match, values, na_sentinel=-1):
65
65
values = np .array (values , dtype = 'O' )
66
66
67
67
f = lambda htype , caster : _match_generic (to_match , values , htype , caster )
68
- result = _hashtable_algo (f , values . dtype , np .int64 )
68
+ result = _hashtable_algo (f , values , np .int64 )
69
69
70
70
if na_sentinel != - 1 :
71
71
@@ -102,7 +102,7 @@ def unique(values):
102
102
values = com ._asarray_tuplesafe (values )
103
103
104
104
f = lambda htype , caster : _unique_generic (values , htype , caster )
105
- return _hashtable_algo (f , values . dtype )
105
+ return _hashtable_algo (f , values )
106
106
107
107
108
108
def _unique_generic (values , table_type , type_caster ):
@@ -759,10 +759,12 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
759
759
# helpers #
760
760
# ------- #
761
761
762
- def _hashtable_algo (f , dtype , return_dtype = None ):
762
+ def _hashtable_algo (f , values , return_dtype = None ):
763
763
"""
764
764
f(HashTable, type_caster) -> result
765
765
"""
766
+
767
+ dtype = values .dtype
766
768
if is_float_dtype (dtype ):
767
769
return f (htable .Float64HashTable , _ensure_float64 )
768
770
elif is_integer_dtype (dtype ):
@@ -773,17 +775,25 @@ def _hashtable_algo(f, dtype, return_dtype=None):
773
775
elif is_timedelta64_dtype (dtype ):
774
776
return_dtype = return_dtype or 'm8[ns]'
775
777
return f (htable .Int64HashTable , _ensure_int64 ).view (return_dtype )
776
- else :
777
- return f (htable .PyObjectHashTable , _ensure_object )
778
+
779
+ # it's cheaper to use a String Hash Table than Object
780
+ if lib .infer_dtype (values ) in ['string' ]:
781
+ return f (htable .StringHashTable , _ensure_object )
782
+
783
+ # otherwise fall back to the generic PyObject hash table
784
+ return f (htable .PyObjectHashTable , _ensure_object )
778
785
779
786
_hashtables = {
780
787
'float64' : (htable .Float64HashTable , htable .Float64Vector ),
781
788
'int64' : (htable .Int64HashTable , htable .Int64Vector ),
789
+ 'string' : (htable .StringHashTable , htable .ObjectVector ),
782
790
'generic' : (htable .PyObjectHashTable , htable .ObjectVector )
783
791
}
784
792
785
793
786
794
def _get_data_algo (values , func_map ):
795
+
796
+ f = None
787
797
if is_float_dtype (values ):
788
798
f = func_map ['float64' ]
789
799
values = _ensure_float64 (values )
@@ -796,8 +806,19 @@ def _get_data_algo(values, func_map):
796
806
f = func_map ['int64' ]
797
807
values = _ensure_int64 (values )
798
808
else :
799
- f = func_map [ 'generic' ]
809
+
800
810
values = _ensure_object (values )
811
+
812
+ # it's cheaper to use a String Hash Table than Object
813
+ if lib .infer_dtype (values ) in ['string' ]:
814
+ try :
815
+ f = func_map ['string' ]
816
+ except KeyError :
817
+ pass
818
+
819
+ if f is None :
820
+ f = func_map ['generic' ]
821
+
801
822
return f , values
802
823
803
824
0 commit comments