1
1
"""
2
2
data hashing for pandas / numpy objects
3
3
"""
4
+ import itertools
4
5
5
6
import numpy as np
6
7
from pandas import _hash , Series , factorize , Categorical , Index
13
14
_default_hash_key = '0123456789123456'
14
15
15
16
17
def _combine_hash_arrays(arrays, num_items):
    """
    Combine several hash arrays element-wise into a single hash array.

    The mixing scheme (seed 0x345678, multiplier 1000003 that grows by
    ``82520 + 2 * remaining`` each step, final offset 97531) is intended
    to be the same as the tuple hash in CPython's tupleobject.c, applied
    position-by-position across the input arrays.

    Parameters
    ----------
    arrays : iterable of np.ndarray
        Hash arrays to combine, consumed lazily and in order.  They are
        combined with in-place uint64 arithmetic, so they are expected to
        be uint64 arrays of a common shape.  The inputs are not modified.
    num_items : int
        Number of arrays that ``arrays`` will yield.  It must be known up
        front because the multiplier update depends on how many items
        remain.

    Returns
    -------
    np.ndarray
        The combined hashes, shaped like the first input array.  An empty
        uint64 array is returned when ``arrays`` yields nothing.

    Raises
    ------
    AssertionError
        If the number of arrays actually consumed differs from
        ``num_items``.
    """
    # Accept any iterable (list, tuple, generator), not only iterators.
    arrays = iter(arrays)
    try:
        first = next(arrays)
    except StopIteration:
        # Nothing to combine (e.g. no columns and no index requested).
        return np.array([], dtype=np.uint64)

    # Put the peeked element back so the loop below sees every array.
    arrays = itertools.chain([first], arrays)

    # The multiplier is the same for every element, so a scalar suffices.
    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out ^= a
        out *= mult  # wraps modulo 2**64, as the C implementation does
        mult += np.uint64(82520 + inverse_i + inverse_i)

    # Raised explicitly so the check is not stripped under ``python -O``.
    if i + 1 != num_items:
        raise AssertionError('Fed in wrong num_items')

    out += np.uint64(97531)
    return out
32
+
16
33
def hash_pandas_object (obj , index = True , encoding = 'utf8' , hash_key = None ,
17
34
categorize = True ):
18
35
"""
@@ -41,10 +58,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
41
58
if hash_key is None :
42
59
hash_key = _default_hash_key
43
60
44
- def adder (h , hashed_to_add ):
45
- h = np .multiply (h , np .uint (3 ), h )
46
- return np .add (h , hashed_to_add , h )
47
-
48
61
if isinstance (obj , ABCIndexClass ):
49
62
h = hash_array (obj .values , encoding , hash_key ,
50
63
categorize ).astype ('uint64' )
@@ -53,26 +66,29 @@ def adder(h, hashed_to_add):
53
66
h = hash_array (obj .values , encoding , hash_key ,
54
67
categorize ).astype ('uint64' )
55
68
if index :
56
- h = adder (h , hash_pandas_object (obj .index ,
57
- index = False ,
58
- encoding = encoding ,
59
- hash_key = hash_key ,
60
- categorize = categorize ).values )
69
+ index_iter = (hash_pandas_object (obj .index ,
70
+ index = False ,
71
+ encoding = encoding ,
72
+ hash_key = hash_key ,
73
+ categorize = categorize ).values
74
+ for _ in [None ])
75
+ arrays = itertools .chain ([h ], index_iter )
76
+ h = _combine_hash_arrays (arrays , 2 )
77
+
61
78
h = Series (h , index = obj .index , dtype = 'uint64' )
62
79
elif isinstance (obj , ABCDataFrame ):
63
- cols = obj .iteritems ()
64
- first_series = next (cols )[1 ]
65
- h = hash_array (first_series .values , encoding ,
66
- hash_key , categorize ).astype ('uint64' )
67
- for _ , col in cols :
68
- h = adder (h , hash_array (col .values , encoding , hash_key ,
69
- categorize ))
80
+ hashes = (hash_array (series .values ) for _ , series in obj .iteritems ())
81
+ num_items = len (obj .columns )
70
82
if index :
71
- h = adder (h , hash_pandas_object (obj .index ,
72
- index = False ,
73
- encoding = encoding ,
74
- hash_key = hash_key ,
75
- categorize = categorize ).values )
83
+ index_hash_generator = (hash_pandas_object (obj .index ,
84
+ index = False ,
85
+ encoding = encoding ,
86
+ hash_key = hash_key ,
87
+ categorize = categorize ).values
88
+ for _ in [None ])
89
+ num_items += 1
90
+ hashes = itertools .chain (hashes , index_hash_generator )
91
+ h = _combine_hash_arrays (hashes , num_items )
76
92
77
93
h = Series (h , index = obj .index , dtype = 'uint64' )
78
94
else :
0 commit comments