Skip to content

Commit 47cdca1

Browse files
feedback
1 parent e88b658 commit 47cdca1

File tree

4 files changed

+97
-10
lines changed

4 files changed

+97
-10
lines changed

pandas/_libs/hashtable.pxd

+2
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable):
3838

3939
cpdef get_item(self, object val)
4040
cpdef set_item(self, object key, Py_ssize_t val)
41+
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
42+
4143

4244
cdef class StringHashTable(HashTable):
4345
cdef kh_str_t *table

pandas/_libs/hashtable_class_helper.pxi.in

+7-3
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@ Template for each `dtype` helper function for hashtable
44
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
55
"""
66

7-
from pandas.core.dtypes.missing import array_equivalent
7+
from lib cimport is_null_datetimelike
8+
89

910
#----------------------------------------------------------------------
1011
# VectorData
@@ -923,12 +924,15 @@ cdef class MultiIndexHashTable(HashTable):
923924
"hash collision\nlocs:\n{}\n"
924925
"result:\n{}\nmi:\n{}".format(alocs, result, mi))
925926

926-
def _check_for_collision(self, Py_ssize_t loc, object label):
927+
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
927928
# validate that the loc maps to the actual value
928929
# version of _check_for_collisions above for single label (tuple)
929930

930931
result = self.mi[loc]
931-
if not array_equivalent(result, label):
932+
933+
if not all(l == r or (is_null_datetimelike(l)
934+
and is_null_datetimelike(r))
935+
for l, r in zip(result, label)):
932936
raise AssertionError(
933937
"hash collision\nloc:\n{}\n"
934938
"result:\n{}\nmi:\n{}".format(loc, result, label))

pandas/core/util/hashing.py

+73-2
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,16 @@
55

66
import numpy as np
77
from pandas._libs import hashing
8+
from pandas.compat import string_and_binary_types, text_type
89
from pandas.core.dtypes.generic import (
910
ABCMultiIndex,
1011
ABCIndexClass,
1112
ABCSeries,
1213
ABCDataFrame)
1314
from pandas.core.dtypes.common import (
1415
is_categorical_dtype, is_list_like)
16+
from pandas.core.dtypes.missing import isnull
17+
1518

1619
# 16 byte long hashing key
1720
_default_hash_key = '0123456789123456'
@@ -179,9 +182,17 @@ def hash_tuple(val, encoding='utf8', hash_key=None):
179182
hash
180183
181184
"""
182-
hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key,
183-
categorize=False)
185+
#def to_array(v):
186+
# dtype, arr = infer_dtype_from_array([v])
187+
# return np.asarray(arr, dtype=dtype)
188+
189+
#hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key,
190+
# categorize=False)
191+
# for v in val)
192+
193+
hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
184194
for v in val)
195+
185196
h = _combine_hash_arrays(hashes, len(val))[0]
186197

187198
return h
@@ -299,3 +310,63 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
299310
vals *= np.uint64(0x94d049bb133111eb)
300311
vals ^= vals >> 31
301312
return vals
313+
314+
315+
def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash a single scalar value.

    Parameters
    ----------
    val : scalar
        Value to hash.
    encoding : string, default 'utf8'
        Encoding forwarded to the object hasher for string-like and
        object data.
    hash_key : string, optional
        Key forwarded to the object hasher; ``_default_hash_key`` is used
        when None.

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """

    if hash_key is None:
        hash_key = _default_hash_key

    if isnull(val):
        # this is to be consistent with the _hash_categorical implementation
        return np.array([np.iinfo(np.uint64).max], dtype='u8')

    string_like = isinstance(val, string_and_binary_types + (text_type,))
    if string_like:
        # force object dtype so numpy does not build a fixed-width string
        # array out of the value
        vals = np.array([val], dtype=object)
    else:
        vals = np.array([val])

    dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    #
    # NOTE: a former ``isinstance(dtype, np.bool)`` branch was dropped here.
    # A dtype object is never an instance of bool, so the branch could not
    # be taken, and the ``np.bool`` alias it referenced is missing from
    # NumPy 1.24+ (AttributeError on attribute lookup).  Booleans keep
    # going through the generic fallback below, so hash values are unchanged.
    if issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view('i8').astype('u8', copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        if not string_like:
            # let Index coerce the value into its array representation and
            # hand the result to hash_array, which returns the final hash
            from pandas import Index
            vals = Index(vals).values
            return hash_array(vals, hash_key=hash_key, encoding=encoding,
                              categorize=False)
        vals = hashing.hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals

pandas/tests/util/test_hashing.py

+15-5
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
from pandas import DataFrame, Series, Index, MultiIndex
88
from pandas.util import hash_array, hash_pandas_object
9-
from pandas.core.util.hashing import hash_tuples, hash_tuple
9+
from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
1010
import pandas.util.testing as tm
1111

1212

@@ -81,10 +81,20 @@ def test_hash_tuples(self):
8181

8282
def test_hash_tuple(self):
8383
# test equivalence between hash_tuples and hash_tuple
84-
tup = (1, 'one')
85-
result = hash_tuple(tup)
86-
expected = hash_tuples([tup])[0]
87-
assert result == expected
84+
for tup in [(1, 'one'), (1, np.nan)]:
85+
result = hash_tuple(tup)
86+
expected = hash_tuples([tup])[0]
87+
assert result == expected
88+
89+
def test_hash_scalar(self):
    # hashing a scalar must give the same value as hashing a length-1
    # object array that holds that scalar
    scalars = [
        1,
        1.4,
        'A',
        b'A',
        u'A',
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        pd.Period('2012-01-01', freq='D'),
        pd.Timedelta('1 days'),
        pd.Interval(0, 1),
        np.nan,
        pd.NaT,
        None,
    ]
    for scalar in scalars:
        from_scalar = _hash_scalar(scalar)
        boxed = np.array([scalar], dtype=object)
        from_array = hash_array(boxed, categorize=True)
        assert from_scalar[0] == from_array[0]
8898

8999
def test_hash_tuples_err(self):
90100

0 commit comments

Comments
 (0)