|
4 | 4 | import itertools
|
5 | 5 |
|
6 | 6 | import numpy as np
|
7 |
| -from pandas._libs import hashing |
| 7 | +from pandas._libs import hashing, tslib |
8 | 8 | from pandas.core.dtypes.generic import (
|
9 | 9 | ABCMultiIndex,
|
10 | 10 | ABCIndexClass,
|
11 | 11 | ABCSeries,
|
12 | 12 | ABCDataFrame)
|
13 | 13 | from pandas.core.dtypes.common import (
|
14 | 14 | is_categorical_dtype, is_list_like)
|
| 15 | +from pandas.core.dtypes.missing import isnull |
| 16 | +from pandas.core.dtypes.cast import infer_dtype_from_scalar |
| 17 | + |
15 | 18 |
|
16 | 19 | # 16 byte long hashing key
|
17 | 20 | _default_hash_key = '0123456789123456'
|
@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
|
164 | 167 | return h
|
165 | 168 |
|
166 | 169 |
|
def hash_tuple(val, encoding='utf8', hash_key=None):
    """
    Compute the hash of one tuple.

    Each element is hashed on its own via ``_hash_scalar`` and the
    per-element hashes are then merged by ``_combine_hash_arrays``.

    Parameters
    ----------
    val : single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    hash

    """
    # Keep this lazy: the element hashes are handed over as a generator,
    # each item being a length-1 uint64 array.
    per_item = (_hash_scalar(item, encoding=encoding, hash_key=hash_key)
                for item in val)

    # Merge the element hashes, then unwrap the single combined value.
    combined = _combine_hash_arrays(per_item, len(val))
    return combined[0]
| 191 | + |
| 192 | + |
167 | 193 | def _hash_categorical(c, encoding, hash_key):
|
168 | 194 | """
|
169 | 195 | Hash a Categorical by hashing its categories, and then mapping the codes
|
@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
|
276 | 302 | vals *= np.uint64(0x94d049bb133111eb)
|
277 | 303 | vals ^= vals >> 31
|
278 | 304 | return vals
|
| 305 | + |
| 306 | + |
def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash a single scalar value.

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """
    if isnull(val):
        # Nulls map to the largest uint64, which keeps this consistent
        # with the _hash_categorical implementation.
        return np.array([np.iinfo(np.uint64).max], dtype='u8')

    tz = getattr(val, 'tzinfo', None)
    if tz is not None:
        # tz-aware datetimes are hashed through their underlying naive UTC
        # value, not the tz-aware object or pd extension type that
        # infer_dtype_from_scalar would otherwise produce.
        if isinstance(val, tslib.Timestamp):
            stamp = val
        else:
            stamp = tslib.Timestamp(val)
        val = stamp.tz_convert(None)

    dtype, val = infer_dtype_from_scalar(val)

    # Wrap the scalar in a length-1 array so hash_array can process it.
    boxed = np.array([val], dtype=dtype)
    return hash_array(boxed, hash_key=hash_key, encoding=encoding,
                      categorize=False)
0 commit comments