|
5 | 5 |
|
6 | 6 | import numpy as np
|
7 | 7 | from pandas._libs import hashing
|
| 8 | +from pandas.compat import string_and_binary_types, text_type |
8 | 9 | from pandas.core.dtypes.generic import (
|
9 | 10 | ABCMultiIndex,
|
10 | 11 | ABCIndexClass,
|
11 | 12 | ABCSeries,
|
12 | 13 | ABCDataFrame)
|
13 | 14 | from pandas.core.dtypes.common import (
|
14 | 15 | is_categorical_dtype, is_list_like)
|
| 16 | +from pandas.core.dtypes.missing import isnull |
| 17 | + |
15 | 18 |
|
16 | 19 | # 16 byte long hashing key
|
17 | 20 | _default_hash_key = '0123456789123456'
|
@@ -179,9 +182,17 @@ def hash_tuple(val, encoding='utf8', hash_key=None):
|
179 | 182 | hash
|
180 | 183 |
|
181 | 184 | """
|
182 |
| - hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key, |
183 |
| - categorize=False) |
| 185 | + #def to_array(v): |
| 186 | + # dtype, arr = infer_dtype_from_array([v]) |
| 187 | + # return np.asarray(arr, dtype=dtype) |
| 188 | + |
| 189 | + #hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key, |
| 190 | + # categorize=False) |
| 191 | + # for v in val) |
| 192 | + |
| 193 | + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) |
184 | 194 | for v in val)
|
| 195 | + |
185 | 196 | h = _combine_hash_arrays(hashes, len(val))[0]
|
186 | 197 |
|
187 | 198 | return h
|
@@ -299,3 +310,63 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
|
299 | 310 | vals *= np.uint64(0x94d049bb133111eb)
|
300 | 311 | vals ^= vals >> 31
|
301 | 312 | return vals
|
| 313 | + |
| 314 | + |
def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash a single scalar value.

    Parameters
    ----------
    val : scalar
        Value to hash (string/bytes, numeric, bool, datetime-like, or any
        object that ``hashing.hash_object_array`` can handle).
    encoding : string, default 'utf8'
        Encoding used when hashing string values.
    hash_key : string, optional
        Key to encode with; defaults to ``_default_hash_key``.

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """
    if hash_key is None:
        hash_key = _default_hash_key

    if isnull(val):
        # this is to be consistent with the _hash_categorical implementation,
        # which maps missing values to the maximum uint64
        return np.array([np.iinfo(np.uint64).max], dtype='u8')

    if isinstance(val, string_and_binary_types + (text_type,)):
        # force object dtype so numpy does not build a fixed-width string
        # array; the object-hashing path below expects object strings
        vals = np.array([val], dtype=object)
        string_like = True
    else:
        vals = np.array([val])
        string_like = False

    dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        # hash real and imaginary parts independently and combine them;
        # forward encoding/hash_key/categorize for consistency with the
        # other branches (the numeric path of hash_array ignores them)
        return (hash_array(vals.real, encoding=encoding, hash_key=hash_key,
                           categorize=False) +
                23 * hash_array(vals.imag, encoding=encoding,
                                hash_key=hash_key, categorize=False))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == np.bool_:
        # BUG FIX: the previous ``isinstance(dtype, np.bool)`` check was
        # always False (``dtype`` is a np.dtype instance, not a bool), so
        # booleans fell through to the slow object path; compare dtypes.
        vals = vals.astype('u8')
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # reinterpret the datetime/timedelta payload as its i8 epoch value
        vals = vals.view('i8').astype('u8', copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        # bit-cast numerics of <= 64 bits to an unsigned view, then widen
        vals = vals.view('u{}'.format(dtype.itemsize)).astype('u8')
    else:
        if not string_like:
            # box through an Index so pandas normalizes the scalar (e.g.
            # Timestamp/other extension scalars) into an array that
            # hash_array understands
            from pandas import Index
            vals = Index(vals).values
            return hash_array(vals, hash_key=hash_key, encoding=encoding,
                              categorize=False)
        vals = hashing.hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
0 commit comments