|
9 | 9 | Hashable,
|
10 | 10 | Iterable,
|
11 | 11 | Iterator,
|
12 |
| - cast, |
13 | 12 | )
|
14 | 13 |
|
15 | 14 | import numpy as np
|
16 | 15 |
|
17 |
| -from pandas._libs import lib |
18 | 16 | from pandas._libs.hashing import hash_object_array
|
19 | 17 | from pandas._typing import (
|
20 | 18 | ArrayLike,
|
21 | 19 | npt,
|
22 | 20 | )
|
23 | 21 |
|
24 |
| -from pandas.core.dtypes.common import ( |
25 |
| - is_categorical_dtype, |
26 |
| - is_list_like, |
27 |
| -) |
| 22 | +from pandas.core.dtypes.common import is_list_like |
28 | 23 | from pandas.core.dtypes.generic import (
|
29 | 24 | ABCDataFrame,
|
30 | 25 | ABCExtensionArray,
|
|
35 | 30 |
|
36 | 31 | if TYPE_CHECKING:
|
37 | 32 | from pandas import (
|
38 |
| - Categorical, |
39 | 33 | DataFrame,
|
40 | 34 | Index,
|
41 | 35 | MultiIndex,
|
@@ -214,53 +208,14 @@ def hash_tuples(
|
214 | 208 |
|
215 | 209 | # hash the list-of-ndarrays
|
216 | 210 | hashes = (
|
217 |
| - _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals |
| 211 | + cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False) |
| 212 | + for cat in cat_vals |
218 | 213 | )
|
219 | 214 | h = combine_hash_arrays(hashes, len(cat_vals))
|
220 | 215 |
|
221 | 216 | return h
|
222 | 217 |
|
223 | 218 |
|
224 |
| -def _hash_categorical( |
225 |
| - cat: Categorical, encoding: str, hash_key: str |
226 |
| -) -> npt.NDArray[np.uint64]: |
227 |
| - """ |
228 |
| - Hash a Categorical by hashing its categories, and then mapping the codes |
229 |
| - to the hashes |
230 |
| -
|
231 |
| - Parameters |
232 |
| - ---------- |
233 |
| - cat : Categorical |
234 |
| - encoding : str |
235 |
| - hash_key : str |
236 |
| -
|
237 |
| - Returns |
238 |
| - ------- |
239 |
| - ndarray[np.uint64] of hashed values, same size as len(c) |
240 |
| - """ |
241 |
| - # Convert ExtensionArrays to ndarrays |
242 |
| - values = np.asarray(cat.categories._values) |
243 |
| - hashed = hash_array(values, encoding, hash_key, categorize=False) |
244 |
| - |
245 |
| - # we have uint64, as we don't directly support missing values |
246 |
| - # we don't want to use take_nd which will coerce to float |
247 |
| - # instead, directly construct the result with a |
248 |
| - # max(np.uint64) as the missing value indicator |
249 |
| - # |
250 |
| - # TODO: GH 15362 |
251 |
| - |
252 |
| - mask = cat.isna() |
253 |
| - if len(hashed): |
254 |
| - result = hashed.take(cat.codes) |
255 |
| - else: |
256 |
| - result = np.zeros(len(mask), dtype="uint64") |
257 |
| - |
258 |
| - if mask.any(): |
259 |
| - result[mask] = lib.u8max |
260 |
| - |
261 |
| - return result |
262 |
| - |
263 |
| - |
264 | 219 | def hash_array(
|
265 | 220 | vals: ArrayLike,
|
266 | 221 | encoding: str = "utf8",
|
@@ -288,17 +243,11 @@ def hash_array(
|
288 | 243 | """
|
289 | 244 | if not hasattr(vals, "dtype"):
|
290 | 245 | raise TypeError("must pass a ndarray-like")
|
291 |
| - dtype = vals.dtype |
292 |
| - |
293 |
| - # For categoricals, we hash the categories, then remap the codes to the |
294 |
| - # hash values. (This check is above the complex check so that we don't ask |
295 |
| - # numpy if categorical is a subdtype of complex, as it will choke). |
296 |
| - if is_categorical_dtype(dtype): |
297 |
| - vals = cast("Categorical", vals) |
298 |
| - return _hash_categorical(vals, encoding, hash_key) |
299 | 246 |
|
300 |
| - elif isinstance(vals, ABCExtensionArray): |
301 |
| - vals, _ = vals._values_for_factorize() |
| 247 | + if isinstance(vals, ABCExtensionArray): |
| 248 | + return vals._hash_pandas_object( |
| 249 | + encoding=encoding, hash_key=hash_key, categorize=categorize |
| 250 | + ) |
302 | 251 |
|
303 | 252 | elif not isinstance(vals, np.ndarray):
|
304 | 253 | # GH#42003
|
@@ -347,7 +296,9 @@ def _hash_ndarray(
|
347 | 296 |
|
348 | 297 | codes, categories = factorize(vals, sort=False)
|
349 | 298 | cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
|
350 |
| - return _hash_categorical(cat, encoding, hash_key) |
| 299 | + return cat._hash_pandas_object( |
| 300 | + encoding=encoding, hash_key=hash_key, categorize=False |
| 301 | + ) |
351 | 302 |
|
352 | 303 | try:
|
353 | 304 | vals = hash_object_array(vals, hash_key, encoding)
|
|
0 commit comments