"""
data hash pandas / numpy objects
"""
+import itertools

import numpy as np
-from pandas import _hash, Series, factorize, Categorical, Index
+from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
+import pandas.core.algorithms as algos
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype,
+                                 is_list_like)

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


+def _combine_hash_arrays(arrays, num_items):
+    """
+    Parameters
+    ----------
+    arrays : generator
+    num_items : int
+
+    Should be the same as CPython's tupleobject.c
+    """
+    try:
+        first = next(arrays)
+    except StopIteration:
+        return np.array([], dtype=np.uint64)
+
+    arrays = itertools.chain([first], arrays)
+
+    mult = np.uint64(1000003)
+    out = np.zeros_like(first) + np.uint64(0x345678)
+    for i, a in enumerate(arrays):
+        inverse_i = num_items - i
+        out ^= a
+        out *= mult
+        mult += np.uint64(82520 + inverse_i + inverse_i)
+    assert i + 1 == num_items, 'Fed in wrong num_items'
+    out += np.uint64(97531)
+    return out
+
+
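The combiner is a vectorised version of CPython's tuple hash: each incoming array is XOR'ed into a running uint64 array, which is then scaled by a mutating multiplier, so the result depends on the values and on their position in the stream. A minimal sketch of how it is fed, with illustrative values, assuming it runs in this module's namespace:

import numpy as np

# Two per-item hash arrays, e.g. what hash_array would return for two columns.
col_a = np.array([1, 2, 3], dtype='uint64')
col_b = np.array([7, 8, 9], dtype='uint64')

# Assumes _combine_hash_arrays from this module is in scope. It consumes an
# iterator, so the caller must also pass the item count; the trailing assert
# catches a miscounted num_items.
combined = _combine_hash_arrays(iter([col_a, col_b]), num_items=2)
assert combined.dtype == np.uint64 and combined.shape == (3,)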
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                       categorize=True):
    """
@@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
    if hash_key is None:
        hash_key = _default_hash_key

-    def adder(h, hashed_to_add):
-        h = np.multiply(h, np.uint(3), h)
-        return np.add(h, hashed_to_add, h)
+    if isinstance(obj, MultiIndex):
+        return Series(hash_tuples(obj, encoding, hash_key),
+                      dtype='uint64', copy=False)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
-        h = Series(h, index=obj, dtype='uint64')
+                       categorize).astype('uint64', copy=False)
+        h = Series(h, index=obj, dtype='uint64', copy=False)
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
+                       categorize).astype('uint64', copy=False)
        if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
-        h = Series(h, index=obj.index, dtype='uint64')
+            index_iter = (hash_pandas_object(obj.index,
+                                             index=False,
+                                             encoding=encoding,
+                                             hash_key=hash_key,
+                                             categorize=categorize).values
+                          for _ in [None])
+            arrays = itertools.chain([h], index_iter)
+            h = _combine_hash_arrays(arrays, 2)
+
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
+
    elif isinstance(obj, ABCDataFrame):
-        cols = obj.iteritems()
-        first_series = next(cols)[1]
-        h = hash_array(first_series.values, encoding,
-                       hash_key, categorize).astype('uint64')
-        for _, col in cols:
-            h = adder(h, hash_array(col.values, encoding, hash_key,
-                                    categorize))
+        hashes = (hash_array(series.values) for _, series in obj.iteritems())
+        num_items = len(obj.columns)
        if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
+            index_hash_generator = (hash_pandas_object(obj.index,
+                                                       index=False,
+                                                       encoding=encoding,
+                                                       hash_key=hash_key,
+                                                       categorize=categorize).values  # noqa
+                                    for _ in [None])
+            num_items += 1
+            hashes = itertools.chain(hashes, index_hash_generator)
+        h = _combine_hash_arrays(hashes, num_items)

-        h = Series(h, index=obj.index, dtype='uint64')
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h


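With the in-place adder gone, a Series or DataFrame is now hashed by streaming its value/column hash arrays (plus the index hash when index=True) into _combine_hash_arrays, and the result is still a uint64 Series aligned with the original index. A hedged usage sketch; the pandas.tools.hashing import path is my assumption about where this module lives and is not stated in the diff:

import pandas as pd
from pandas.tools.hashing import hash_pandas_object  # assumed module path

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# One uint64 hash per row, covering the index by default.
h = hash_pandas_object(df)
assert h.dtype == 'uint64' and (h.index == df.index).all()

# With index=False, identical rows hash identically regardless of their labels.
h_values_only = hash_pandas_object(df, index=False)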
+def hash_tuples(vals, encoding='utf8', hash_key=None):
+    """
+    Hash a MultiIndex / list-of-tuples efficiently
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    vals : MultiIndex, list-of-tuples, or single tuple
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    ndarray of hashed values array
+    """
+
+    is_tuple = False
+    if isinstance(vals, tuple):
+        vals = [vals]
+        is_tuple = True
+    elif not is_list_like(vals):
+        raise TypeError("must be convertible to a list-of-tuples")
+
+    if not isinstance(vals, MultiIndex):
+        vals = MultiIndex.from_tuples(vals)
+
+    # create a list-of-ndarrays
+    def get_level_values(num):
+        unique = vals.levels[num]  # .values
+        labels = vals.labels[num]
+        filled = algos.take_1d(unique._values, labels,
+                               fill_value=unique._na_value)
+        return filled
+
+    vals = [get_level_values(level)
+            for level in range(vals.nlevels)]
+
+    # hash the list-of-ndarrays
+    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
+              for l in vals)
+    h = _combine_hash_arrays(hashes, len(vals))
+    if is_tuple:
+        h = h[0]
+
+    return h
+
+
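hash_tuples flattens each MultiIndex level into an ndarray, hashes the levels independently, and combines them, so a MultiIndex, an equivalent list of tuples, and a single tuple all share one code path. A small sketch under the same assumed import path:

import pandas as pd
from pandas.tools.hashing import hash_tuples  # assumed module path

mi = pd.MultiIndex.from_tuples([(1, 'a'), (2, 'b')])

# A MultiIndex and the equivalent list-of-tuples produce the same hashes.
assert (hash_tuples(mi) == hash_tuples([(1, 'a'), (2, 'b')])).all()

# A single tuple comes back as a scalar uint64 rather than a length-1 array.
assert hash_tuples((1, 'a')) == hash_tuples(mi)[0]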
def _hash_categorical(c, encoding, hash_key):
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
@@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key):
    """
    cat_hashed = hash_array(c.categories.values, encoding, hash_key,
                            categorize=False).astype(np.uint64, copy=False)
-    return c.rename_categories(cat_hashed).astype(np.uint64)
+    return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)


def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -108,7 +191,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):

    Parameters
    ----------
-    vals : ndarray
+    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
@@ -124,6 +207,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):

    """

+    if not hasattr(vals, 'dtype'):
+        raise TypeError("must pass a ndarray-like")
+
    if hash_key is None:
        hash_key = _default_hash_key

@@ -142,9 +228,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
-    elif ((is_datetime64_dtype(vals) or
-           is_timedelta64_dtype(vals) or
-           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
+    elif (is_datetime64_dtype(vals) or
+          is_timedelta64_dtype(vals)):
+        vals = vals.view('i8').astype('u8', copy=False)
+    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
@@ -156,7 +243,12 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

-        vals = _hash.hash_object_array(vals, hash_key, encoding)
+        try:
+            vals = _hash.hash_object_array(vals, hash_key, encoding)
+        except TypeError:
+            # we have mixed types
+            vals = _hash.hash_object_array(vals.astype(str).astype(object),
+                                           hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
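Two behavioural consequences of the hash_array changes above: datetime64/timedelta64 data is now hashed through its i8 view rather than the generic numeric reinterpretation, and object arrays with mixed types fall back to hashing their string representation instead of raising from hash_object_array. A sketch under the same assumed import path:

import numpy as np
from pandas.tools.hashing import hash_array  # assumed module path

# datetime64 values are viewed as 64-bit ints before hashing.
dates = np.array(['2016-01-01', '2016-01-02'], dtype='datetime64[ns]')
assert hash_array(dates).dtype == np.uint64

# Mixed-type object arrays previously raised TypeError deep inside
# hash_object_array; the new fallback stringifies them first.
mixed = np.array(['foo', 1, 3.5], dtype=object)
assert hash_array(mixed).dtype == np.uint64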