import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
+import pandas.core.algorithms as algos
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
@@ -58,15 +59,16 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
        hash_key = _default_hash_key

    if isinstance(obj, MultiIndex):
-        return _hash_tuples(obj, encoding, hash_key)
+        return Series(hash_tuples(obj, encoding, hash_key),
+                      dtype='uint64', copy=False)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
-        h = Series(h, index=obj, dtype='uint64')
+                       categorize).astype('uint64', copy=False)
+        h = Series(h, index=obj, dtype='uint64', copy=False)
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
+                       categorize).astype('uint64', copy=False)
        if index:
            h = _combine_hash_arrays(iter([
                h,
@@ -76,7 +78,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                                   hash_key=hash_key,
                                   categorize=categorize).values]),
                2)
-            h = Series(h, index=obj.index, dtype='uint64')
+            h = Series(h, index=obj.index, dtype='uint64', copy=False)
    elif isinstance(obj, ABCDataFrame):
        hashes = (hash_array(series.values) for _, series in obj.iteritems())
        num_items = len(obj.columns)
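Taken together, these hunks make hash_pandas_object return a uint64 Series for every supported input, and route a MultiIndex through the new hash_tuples helper instead of converting it to a DataFrame first. A minimal usage sketch follows; the import path pandas.tools.hashing is an assumption based on where this module lived at the time of this change.

import pandas as pd
from pandas.tools.hashing import hash_pandas_object  # assumed module location

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
row_hashes = hash_pandas_object(df)               # index participates in each row hash
data_only = hash_pandas_object(df, index=False)   # hash the column data only
print(row_hashes.dtype, data_only.dtype)          # uint64 uint64

mi = pd.MultiIndex.from_tuples([(1, 'a'), (2, 'b')])
print(hash_pandas_object(mi))                     # Series of one uint64 hash per tuple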
@@ -91,34 +93,81 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
            hashes = itertools.chain(hashes, index_hash_generator)
        h = _combine_hash_arrays(hashes, num_items)

-        h = Series(h, index=obj.index, dtype='uint64')
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h


-def _hash_tuples(vals, encoding, hash_key):
+def _hash_lists(vals, encoding='utf8', hash_key=None):
+    """
+
+    Parameters
+    ----------
+    vals : list of ndarrays
+    encoding : string, default 'utf8'
+        encoding for data & key when strings
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    1d uint64 numpy array of hash values, same length as the vals[0]
+    """
+
+    if not isinstance(vals, list):
+        raise TypeError("only can accept lists")
+
+    if not len(vals):
+        raise ValueError("must pass a non-zero length vals")
+
+    if not isinstance(vals[0], np.ndarray):
+        raise ValueError("must pass a ndarray")
+
+    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
+              for l in vals)
+    h = _combine_hash_arrays(hashes, len(vals))
+    return h
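The new _hash_lists helper hashes each array on its own and then folds the per-array hashes positionally via _combine_hash_arrays, so row i of the result depends on element i of every array in the list. A small sketch of what it computes, assuming _hash_lists is imported from this (private) module:

import numpy as np

cols = [np.array([1, 2, 3], dtype='int64'),
        np.array(['a', 'b', 'c'], dtype=object)]
combined = _hash_lists(cols)      # 1d uint64 array of length 3
assert combined.dtype == np.uint64 and len(combined) == 3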
+
+
+def hash_tuples(vals, encoding='utf8', hash_key=None):
    """
    Hash an MultiIndex / array_of_tuples efficiently

    Parameters
    ----------
-    vals : MultiIndex or ndarray of tuples
+    vals : MultiIndex, ndarray of tuples, or single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
-    ndarray of hashed values array, same size as len(c)
+    ndarray of hashed values array
    """

+    is_tuple = False
+    if isinstance(vals, tuple):
+        vals = [vals]
+        is_tuple = True
+
    if not isinstance(vals, MultiIndex):
        vals = MultiIndex.from_tuples(vals)

-    # efficiently turn us into a DataFrame and hash
-    return hash_pandas_object(vals.to_frame(index=False),
-                              index=False, encoding=encoding,
-                              hash_key=hash_key, categorize=False)
+    # create a list-of-ndarrays & hash
+    def get_level_values(num):
+        unique = vals.levels[num]  # .values
+        labels = vals.labels[num]
+        filled = algos.take_1d(unique.values, labels,
+                               fill_value=unique._na_value)
+        return filled
+
+    vals = [get_level_values(level)
+            for level in range(vals.nlevels)]
+
+    result = _hash_lists(vals, encoding=encoding, hash_key=hash_key)
+    if is_tuple:
+        result = result[0]
+
+    return result
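With this rewrite, hash_tuples reconstructs each MultiIndex level as a full-length ndarray (take_1d of the level values by the labels) and hashes the levels as a list, rather than materializing a DataFrame. A sketch of the resulting behaviour, assuming hash_tuples is imported from this module:

import numpy as np
import pandas as pd

pairs = [(1, 'a'), (2, 'b'), (2, 'b')]
h = hash_tuples(pairs)                         # one uint64 per tuple
assert h.dtype == np.uint64 and h[1] == h[2]   # equal tuples hash equally

# the new single-tuple path returns a scalar rather than a length-1 array
assert hash_tuples((2, 'b')) == h[1]

# an equivalent call on a MultiIndex built from the same tuples
mi = pd.MultiIndex.from_tuples(pairs)
assert (hash_tuples(mi) == h).all()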


def _hash_categorical(c, encoding, hash_key):
@@ -138,7 +187,7 @@ def _hash_categorical(c, encoding, hash_key):
    """
    cat_hashed = hash_array(c.categories.values, encoding, hash_key,
                            categorize=False).astype(np.uint64, copy=False)
-    return c.rename_categories(cat_hashed).astype(np.uint64)
+    return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)


def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -168,10 +217,6 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    if hash_key is None:
        hash_key = _default_hash_key

-    if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple):
-        # we hash an list of tuples similar to a MultiIndex
-        return _hash_tuples(vals, encoding, hash_key).values
-
    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
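The removed block means hash_array now only accepts array-likes; lists of tuples go through hash_tuples instead. The surrounding comment also describes the categorize fast path: categories are hashed once and the codes are remapped, which is intended to give the same values as hashing the array directly. A sketch of that intent for a non-null object array, assuming hash_array is imported from this module:

import numpy as np

values = np.array(['spam', 'eggs', 'spam'], dtype=object)
fast = hash_array(values, categorize=True)    # factorize, hash categories, remap codes
slow = hash_array(values, categorize=False)   # hash every element directly
assert (fast == slow).all()                   # same hashes, categorize only affects speed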
@@ -187,9 +232,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
-    elif ((is_datetime64_dtype(vals) or
-           is_timedelta64_dtype(vals) or
-           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
+    elif (is_datetime64_dtype(vals) or
+          is_timedelta64_dtype(vals)):
+        vals = vals.view('i8').astype('u8', copy=False)
+    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
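The reshuffled branch above separates datetime64/timedelta64 handling from the generic numeric case: those dtypes are reinterpreted as their int64 representation and then cast to uint64 without an extra copy before being fed to the 64-bit hash. A standalone numpy illustration of that reinterpretation (not part of the patch):

import numpy as np

stamps = np.array(['2017-01-01', '2017-01-02'], dtype='datetime64[ns]')
as_u8 = stamps.view('i8').astype('u8', copy=False)   # nanoseconds since the epoch, as uint64
print(as_u8.dtype, as_u8)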