-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
TYP: hashing #39949
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TYP: hashing #39949
Changes from 5 commits
cd2ca27
ae9cc28
60e2c4f
c3115e9
89becd9
a813783
9ab4be5
c423757
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,26 @@ | ||
""" | ||
data hash pandas / numpy objects | ||
""" | ||
from __future__ import annotations | ||
|
||
import itertools | ||
from typing import Optional | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Iterator, | ||
List, | ||
Optional, | ||
Tuple, | ||
Union, | ||
cast, | ||
) | ||
|
||
import numpy as np | ||
|
||
import pandas._libs.hashing as hashing | ||
from pandas._libs.hashing import hash_object_array | ||
from pandas._typing import ( | ||
ArrayLike, | ||
FrameOrSeriesUnion, | ||
) | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_categorical_dtype, | ||
|
@@ -20,17 +34,30 @@ | |
ABCSeries, | ||
) | ||
|
||
if TYPE_CHECKING: | ||
from pandas import ( | ||
Categorical, | ||
Index, | ||
MultiIndex, | ||
Series, | ||
) | ||
|
||
|
||
# 16 byte long hashing key | ||
_default_hash_key = "0123456789123456" | ||
|
||
|
||
def combine_hash_arrays(arrays, num_items: int): | ||
def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray: | ||
""" | ||
Parameters | ||
---------- | ||
arrays : generator | ||
arrays : Iterator[np.ndarray] | ||
num_items : int | ||
|
||
Returns | ||
------- | ||
np.ndarray[int64] | ||
|
||
Should be the same as CPython's tupleobject.c | ||
""" | ||
try: | ||
|
@@ -53,17 +80,18 @@ def combine_hash_arrays(arrays, num_items: int): | |
|
||
|
||
def hash_pandas_object( | ||
obj, | ||
obj: Union[Index, FrameOrSeriesUnion], | ||
index: bool = True, | ||
encoding: str = "utf8", | ||
hash_key: Optional[str] = _default_hash_key, | ||
categorize: bool = True, | ||
): | ||
) -> Series: | ||
""" | ||
Return a data hash of the Index/Series/DataFrame. | ||
|
||
Parameters | ||
---------- | ||
obj : Index, Series, or DataFrame | ||
index : bool, default True | ||
Include the index in the hash (if Series/DataFrame). | ||
encoding : str, default 'utf8' | ||
|
@@ -139,25 +167,25 @@ def hash_pandas_object( | |
return h | ||
|
||
|
||
def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): | ||
def hash_tuples( | ||
vals: Union[MultiIndex, List[Tuple]], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the rule for Sequence vs Iterable? In this case, we only get here with MultiIndex and List[Tuple]. I get the wider-is-better for public APIs, but for purely internal code strictness seems beneficial. is that distinction not relevant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAIK mypy works by checking each function in isolation. i.e. it checks that all the code within the function is compatible with the function annotation. i.e. checks the types in function calls within the body with the type parameters of the called functions. so doing the reverse of this, i.e. annotating with the actual types passed seems like extra work imo. (and could be simply achieved using MonkeyType) Iterable because Using List is bad practice. function code could modify the contents of the List. If the parameter is typed as an immutable container, mypy won't allow it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks, updated There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. type parameters for |
||
encoding: str = "utf8", | ||
hash_key: str = _default_hash_key, | ||
) -> np.ndarray: | ||
""" | ||
Hash an MultiIndex / list-of-tuples efficiently | ||
Hash an MultiIndex / list-of-tuples efficiently. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. list-like-of-tuples |
||
|
||
Parameters | ||
---------- | ||
vals : MultiIndex, list-of-tuples, or single tuple | ||
vals : MultiIndex or list-of-tuples | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. list-like of tuples |
||
encoding : str, default 'utf8' | ||
hash_key : str, default _default_hash_key | ||
|
||
Returns | ||
------- | ||
ndarray of hashed values array | ||
""" | ||
is_tuple = False | ||
if isinstance(vals, tuple): | ||
vals = [vals] | ||
is_tuple = True | ||
elif not is_list_like(vals): | ||
if not is_list_like(vals): | ||
raise TypeError("must be convertible to a list-of-tuples") | ||
|
||
from pandas import ( | ||
|
@@ -166,33 +194,33 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): | |
) | ||
|
||
if not isinstance(vals, ABCMultiIndex): | ||
vals = MultiIndex.from_tuples(vals) | ||
mi = MultiIndex.from_tuples(vals) | ||
simonjayhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
mi = vals | ||
|
||
# create a list-of-Categoricals | ||
vals = [ | ||
Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) | ||
for level in range(vals.nlevels) | ||
cat_vals = [ | ||
Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True) | ||
for level in range(mi.nlevels) | ||
] | ||
|
||
# hash the list-of-ndarrays | ||
hashes = ( | ||
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals | ||
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals | ||
) | ||
h = combine_hash_arrays(hashes, len(vals)) | ||
if is_tuple: | ||
h = h[0] | ||
h = combine_hash_arrays(hashes, len(cat_vals)) | ||
|
||
return h | ||
|
||
|
||
def _hash_categorical(c, encoding: str, hash_key: str): | ||
def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray: | ||
""" | ||
Hash a Categorical by hashing its categories, and then mapping the codes | ||
to the hashes | ||
|
||
Parameters | ||
---------- | ||
c : Categorical | ||
cat : Categorical | ||
encoding : str | ||
hash_key : str | ||
|
||
|
@@ -201,7 +229,7 @@ def _hash_categorical(c, encoding: str, hash_key: str): | |
ndarray of hashed values array, same size as len(c) | ||
""" | ||
# Convert ExtensionArrays to ndarrays | ||
values = np.asarray(c.categories._values) | ||
values = np.asarray(cat.categories._values) | ||
hashed = hash_array(values, encoding, hash_key, categorize=False) | ||
|
||
# we have uint64, as we don't directly support missing values | ||
|
@@ -211,9 +239,9 @@ def _hash_categorical(c, encoding: str, hash_key: str): | |
# | ||
# TODO: GH 15362 | ||
|
||
mask = c.isna() | ||
mask = cat.isna() | ||
if len(hashed): | ||
result = hashed.take(c.codes) | ||
result = hashed.take(cat.codes) | ||
else: | ||
result = np.zeros(len(mask), dtype="uint64") | ||
|
||
|
@@ -224,11 +252,11 @@ def _hash_categorical(c, encoding: str, hash_key: str): | |
|
||
|
||
def hash_array( | ||
vals, | ||
vals: ArrayLike, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
encoding: str = "utf8", | ||
hash_key: str = _default_hash_key, | ||
categorize: bool = True, | ||
): | ||
) -> np.ndarray: | ||
""" | ||
Given a 1d array, return an array of deterministic integers. | ||
|
||
|
@@ -255,11 +283,26 @@ def hash_array( | |
# hash values. (This check is above the complex check so that we don't ask | ||
# numpy if categorical is a subdtype of complex, as it will choke). | ||
if is_categorical_dtype(dtype): | ||
vals = cast("Categorical", vals) | ||
return _hash_categorical(vals, encoding, hash_key) | ||
elif is_extension_array_dtype(dtype): | ||
vals, _ = vals._values_for_factorize() | ||
dtype = vals.dtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch, will update |
||
|
||
return _hash_ndarray(vals, encoding, hash_key, categorize) | ||
|
||
|
||
def _hash_ndarray( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. had to separate this out, as i couldn't convince mypy to recognize |
||
vals: np.ndarray, | ||
encoding: str = "utf8", | ||
hash_key: str = _default_hash_key, | ||
categorize: bool = True, | ||
) -> np.ndarray: | ||
""" | ||
See hash_array.__doc__. | ||
""" | ||
dtype = vals.dtype | ||
|
||
# we'll be working with everything as 64-bit values, so handle this | ||
# 128-bit value early | ||
if np.issubdtype(dtype, np.complex128): | ||
|
@@ -289,10 +332,10 @@ def hash_array( | |
return _hash_categorical(cat, encoding, hash_key) | ||
|
||
try: | ||
vals = hashing.hash_object_array(vals, hash_key, encoding) | ||
vals = hash_object_array(vals, hash_key, encoding) | ||
except TypeError: | ||
# we have mixed types | ||
vals = hashing.hash_object_array( | ||
vals = hash_object_array( | ||
vals.astype(str).astype(object), hash_key, encoding | ||
) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why change
axis
?DataFrame._get_axis
acceptsAxis
.from https://github.com/microsoft/pyright/blob/master/docs/typed-libraries.md#wide-vs-narrow-types
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
its only called from count, and count calls _get_axis_number just before doing so