1
1
"""
2
2
data hash pandas / numpy objects
3
3
"""
4
+ import itertools
4
5
5
6
import numpy as np
6
7
from pandas import _hash , Series , factorize , Categorical , Index
13
14
_default_hash_key = '0123456789123456'
14
15
15
16
17
def _combine_hash_arrays(arrays, num_items):
    """
    Fold a sequence of equally-shaped hash arrays into one hash array.

    Each incoming array is mixed element-wise into a running accumulator
    with an XOR followed by a multiplication; the multiplier is bumped
    after every array by an amount that depends on how many arrays are
    still to come, so the result is sensitive to the order of the arrays.

    NOTE(review): the constants (1000003, 0x345678, 82520, 97531) look
    like the ones used by CPython's historical tuple hash -- confirm
    against tupleobject.c before changing any of them.

    Parameters
    ----------
    arrays : iterator of ndarray
        Must be an iterator (it is advanced with ``next``), not merely an
        iterable. Callers in this module pass uint64 arrays.
    num_items : int
        Number of arrays that ``arrays`` will yield. It feeds the
        multiplier schedule, so it must be known before iteration starts.

    Returns
    -------
    ndarray
        The combined hashes. An empty uint64 array is returned when
        ``arrays`` yields nothing.

    Raises
    ------
    AssertionError
        If ``arrays`` yields at least one array but the number yielded
        differs from ``num_items``.
    """
    try:
        first = next(arrays)
    except StopIteration:
        # Nothing to combine (e.g. a frame without columns hashed without
        # its index): return an empty result instead of leaking
        # StopIteration to the caller.
        return np.array([], dtype=np.uint64)

    # Put the peeked element back so the loop below sees every array.
    arrays = itertools.chain([first], arrays)

    # The multiplier is the same for every element, so a scalar is enough;
    # broadcasting applies it to the whole accumulator.
    mult = np.uint64(1000003)
    # ``first`` only supplies the shape/dtype of the accumulator.
    out = np.zeros_like(first) + np.uint64(0x345678)
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out = (out ^ a) * mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
    assert i + 1 == num_items, 'Fed in wrong num_items'
    out += np.uint64(97531)
    return out
30
+
16
31
def hash_pandas_object (obj , index = True , encoding = 'utf8' , hash_key = None ,
17
32
categorize = True ):
18
33
"""
@@ -41,10 +56,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
41
56
if hash_key is None :
42
57
hash_key = _default_hash_key
43
58
44
- def adder (h , hashed_to_add ):
45
- h = np .multiply (h , np .uint (3 ), h )
46
- return np .add (h , hashed_to_add , h )
47
-
48
59
if isinstance (obj , ABCIndexClass ):
49
60
h = hash_array (obj .values , encoding , hash_key ,
50
61
categorize ).astype ('uint64' )
@@ -53,26 +64,28 @@ def adder(h, hashed_to_add):
53
64
h = hash_array (obj .values , encoding , hash_key ,
54
65
categorize ).astype ('uint64' )
55
66
if index :
56
- h = adder (h , hash_pandas_object (obj .index ,
57
- index = False ,
58
- encoding = encoding ,
59
- hash_key = hash_key ,
60
- categorize = categorize ).values )
67
+ h = _combine_hash_arrays (iter ([
68
+ h ,
69
+ hash_pandas_object (obj .index ,
70
+ index = False ,
71
+ encoding = encoding ,
72
+ hash_key = hash_key ,
73
+ categorize = categorize ).values ]),
74
+ 2 )
61
75
h = Series (h , index = obj .index , dtype = 'uint64' )
62
76
elif isinstance (obj , ABCDataFrame ):
63
- cols = obj .iteritems ()
64
- first_series = next (cols )[1 ]
65
- h = hash_array (first_series .values , encoding ,
66
- hash_key , categorize ).astype ('uint64' )
67
- for _ , col in cols :
68
- h = adder (h , hash_array (col .values , encoding , hash_key ,
69
- categorize ))
77
+ hashes = (hash_array (series .values ) for _ , series in obj .iteritems ())
78
+ num_items = len (obj .columns )
70
79
if index :
71
- h = adder (h , hash_pandas_object (obj .index ,
72
- index = False ,
73
- encoding = encoding ,
74
- hash_key = hash_key ,
75
- categorize = categorize ).values )
80
+ index_hash_generator = (hash_pandas_object (obj .index ,
81
+ index = False ,
82
+ encoding = encoding ,
83
+ hash_key = hash_key ,
84
+ categorize = categorize ).values
85
+ for _ in [None ])
86
+ num_items += 1
87
+ hashes = itertools .chain (hashes , index_hash_generator )
88
+ h = _combine_hash_arrays (hashes , num_items )
76
89
77
90
h = Series (h , index = obj .index , dtype = 'uint64' )
78
91
else :
0 commit comments