Skip to content

TYPES: add types in core.util.hashing #28916

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 29 additions & 16 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import numpy as np

from pandas._libs import Timestamp
import pandas._libs.hashing as hashing
import pandas._libs.tslibs as tslibs

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
Expand All @@ -26,13 +26,19 @@
_default_hash_key = "0123456789123456"


def _combine_hash_arrays(arrays, num_items: int):
# Note: The return type is technically a np.uint64, see GH#28916 for
# annotation discussion.
def _combine_hash_arrays(arrays, num_items: int) -> int:
"""
Parameters
----------
arrays : generator
num_items : int

Returns
-------
np.uint64

Should be the same as CPython's tupleobject.c
"""
try:
Expand All @@ -58,7 +64,7 @@ def hash_pandas_object(
obj,
index: bool = True,
encoding: str = "utf8",
hash_key=None,
hash_key: str = _default_hash_key,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you remove L91 and L92?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, thanks

categorize: bool = True,
):
"""
Expand All @@ -84,9 +90,6 @@ def hash_pandas_object(
"""
from pandas import Series

if hash_key is None:
hash_key = _default_hash_key

if isinstance(obj, ABCMultiIndex):
return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

Expand Down Expand Up @@ -142,7 +145,7 @@ def hash_pandas_object(
return h


def hash_tuples(vals, encoding="utf8", hash_key=None):
def hash_tuples(vals, encoding: str = "utf8", hash_key: str = _default_hash_key):
"""
Hash a MultiIndex / list-of-tuples efficiently

Expand Down Expand Up @@ -187,7 +190,7 @@ def hash_tuples(vals, encoding="utf8", hash_key=None):
return h


def hash_tuple(val, encoding: str = "utf8", hash_key=None):
def hash_tuple(val, encoding: str = "utf8", hash_key: str = _default_hash_key):
"""
Hash a single tuple efficiently

Expand Down Expand Up @@ -247,7 +250,12 @@ def _hash_categorical(c, encoding: str, hash_key: str):
return result


def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = True):
def hash_array(
vals,
encoding: str = "utf8",
hash_key: str = _default_hash_key,
categorize: bool = True,
):
"""
Given a 1d array, return an array of deterministic integers.

Expand All @@ -273,9 +281,6 @@ def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = T
raise TypeError("must pass a ndarray-like")
dtype = vals.dtype

if hash_key is None:
hash_key = _default_hash_key

# For categoricals, we hash the categories, then remap the codes to the
# hash values. (This check is above the complex check so that we don't ask
# numpy if categorical is a subdtype of complex, as it will choke).
Expand Down Expand Up @@ -326,9 +331,17 @@ def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = T
return vals


def _hash_scalar(val, encoding: str = "utf8", hash_key=None):
def _hash_scalar(
val, encoding: str = "utf8", hash_key: str = _default_hash_key
) -> np.ndarray:
"""
Hash scalar value
Hash scalar value.

Parameters
----------
val : scalar
encoding : str, default "utf8"
hash_key : str, default _default_hash_key

Returns
-------
Expand All @@ -343,8 +356,8 @@ def _hash_scalar(val, encoding: str = "utf8", hash_key=None):
# for tz-aware datetimes, we need the underlying naive UTC value and
# not the tz aware object or pd extension type (as
# infer_dtype_from_scalar would do)
if not isinstance(val, tslibs.Timestamp):
val = tslibs.Timestamp(val)
if not isinstance(val, Timestamp):
val = Timestamp(val)
val = val.tz_convert(None)

dtype, val = infer_dtype_from_scalar(val)
Expand Down