1
1
"""
2
2
Hash pandas / numpy objects.
3
3
"""
4
+ import itertools
4
5
5
6
import numpy as np
6
7
from pandas import _hash , Series , factorize , Categorical , Index , MultiIndex
7
8
from pandas .lib import is_bool_array
8
9
from pandas .types .generic import ABCIndexClass , ABCSeries , ABCDataFrame
9
10
from pandas .types .common import (is_categorical_dtype , is_numeric_dtype ,
10
- is_datetime64_dtype , is_timedelta64_dtype ,
11
- is_object_dtype )
11
+ is_datetime64_dtype , is_timedelta64_dtype )
12
12
13
13
# 16 byte long hashing key
14
14
_default_hash_key = '0123456789123456'
15
15
16
16
17
def _combine_hash_arrays(arrays, num_items):
    """
    Fold several equally-shaped hash arrays into one array of hashes.

    The arrays are mixed element-wise, in order, with an xor-then-multiply
    step whose multiplier grows after every array, so the result depends
    on the order of the inputs as well as on their values.

    Parameters
    ----------
    arrays : iterable of ndarray
        The hash arrays to combine; consumed exactly once.  Any iterable
        is accepted (a generator, an iterator or a plain list).
        NOTE(review): callers in this module pass uint64 arrays; other
        dtypes are not handled specially here -- confirm before relying
        on them.
    num_items : int
        Number of arrays that ``arrays`` will yield.  It has to be given
        up front because it feeds the multiplier update for every array,
        including the first one.

    Returns
    -------
    ndarray
        The combined hashes, shaped like the first input array.  An empty
        uint64 array is returned when ``arrays`` yields nothing and
        ``num_items`` is 0.

    Raises
    ------
    AssertionError
        If the number of arrays actually yielded differs from
        ``num_items``.

    Notes
    -----
    The constants (1000003, 0x345678, 82520, 97531) look like the ones
    used by the tuple hash in older CPython ``tupleobject.c``; the scheme
    appears to be modeled on it -- TODO confirm exact equivalence.
    """
    # Accept plain iterables too; iter() on an iterator is a no-op.
    arrays = iter(arrays)

    # A sentinel default avoids leaking StopIteration to the caller when
    # there is nothing to combine (e.g. a frame without columns).
    first = next(arrays, None)
    if first is None:
        if num_items != 0:
            raise AssertionError('Fed in wrong num_items')
        return np.array([], dtype=np.uint64)

    # The multiplier is identical for every element, so a scalar is enough;
    # only the accumulator has to be a full array.
    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)

    seen = 0
    for a in itertools.chain([first], arrays):
        inverse_i = num_items - seen
        # Builds a new array each round, so the inputs are never mutated.
        out = (out ^ a) * mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
        seen += 1

    # Explicit raise (not ``assert``) so the check survives ``python -O``;
    # the exception type and message stay as before.
    if seen != num_items:
        raise AssertionError('Fed in wrong num_items')

    out += np.uint64(97531)
    return out
30
+
31
+
17
32
def hash_pandas_object (obj , index = True , encoding = 'utf8' , hash_key = None ,
18
33
categorize = True ):
19
34
"""
@@ -42,10 +57,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
42
57
if hash_key is None :
43
58
hash_key = _default_hash_key
44
59
45
- def adder (h , hashed_to_add ):
46
- h = np .multiply (h , np .uint (3 ), h )
47
- return np .add (h , hashed_to_add , h )
48
-
49
60
if isinstance (obj , MultiIndex ):
50
61
return _hash_tuples (obj , encoding , hash_key )
51
62
@@ -57,26 +68,28 @@ def adder(h, hashed_to_add):
57
68
h = hash_array (obj .values , encoding , hash_key ,
58
69
categorize ).astype ('uint64' )
59
70
if index :
60
- h = adder (h , hash_pandas_object (obj .index ,
61
- index = False ,
62
- encoding = encoding ,
63
- hash_key = hash_key ,
64
- categorize = categorize ).values )
71
+ h = _combine_hash_arrays (iter ([
72
+ h ,
73
+ hash_pandas_object (obj .index ,
74
+ index = False ,
75
+ encoding = encoding ,
76
+ hash_key = hash_key ,
77
+ categorize = categorize ).values ]),
78
+ 2 )
65
79
h = Series (h , index = obj .index , dtype = 'uint64' )
66
80
elif isinstance (obj , ABCDataFrame ):
67
- cols = obj .iteritems ()
68
- first_series = next (cols )[1 ]
69
- h = hash_array (first_series .values , encoding ,
70
- hash_key , categorize ).astype ('uint64' )
71
- for _ , col in cols :
72
- h = adder (h , hash_array (col .values , encoding , hash_key ,
73
- categorize ))
81
+ hashes = (hash_array (series .values ) for _ , series in obj .iteritems ())
82
+ num_items = len (obj .columns )
74
83
if index :
75
- h = adder (h , hash_pandas_object (obj .index ,
76
- index = False ,
77
- encoding = encoding ,
78
- hash_key = hash_key ,
79
- categorize = categorize ).values )
84
+ index_hash_generator = (hash_pandas_object (obj .index ,
85
+ index = False ,
86
+ encoding = encoding ,
87
+ hash_key = hash_key ,
88
+ categorize = categorize ).values # noqa
89
+ for _ in [None ])
90
+ num_items += 1
91
+ hashes = itertools .chain (hashes , index_hash_generator )
92
+ h = _combine_hash_arrays (hashes , num_items )
80
93
81
94
h = Series (h , index = obj .index , dtype = 'uint64' )
82
95
else :
@@ -103,7 +116,7 @@ def _hash_tuples(vals, encoding, hash_key):
103
116
vals = MultiIndex .from_tuples (vals )
104
117
105
118
# efficiently turn us into a DataFrame and hash
106
- return hash_pandas_object (vals .to_dataframe (index = False ),
119
+ return hash_pandas_object (vals .to_frame (index = False ),
107
120
index = False , encoding = encoding ,
108
121
hash_key = hash_key , categorize = False )
109
122
0 commit comments