Skip to content

TYP: hashing #39949

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 8, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pandas/_libs/hashing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ DEF dROUNDS = 4


@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, str key, str encoding="utf8"):
def hash_object_array(
ndarray[object] arr, str key, str encoding="utf8"
) -> np.ndarray[np.uint64]:
"""
Parameters
----------
Expand Down
37 changes: 22 additions & 15 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def item_from_zerodim(val: object) -> object:

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple(list arrays, sort: bool = True):
def fast_unique_multiple(list arrays, sort: bool = True) -> list:
"""
Generate a list of unique values from a list of arrays.

Expand Down Expand Up @@ -345,7 +345,7 @@ def fast_unique_multiple_list(lists: list, sort: bool = True) -> list:

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list_gen(object gen, bint sort=True):
def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
"""
Generate a list of unique values from a generator of lists.

Expand Down Expand Up @@ -409,7 +409,7 @@ def dicts_to_array(dicts: list, columns: list):
return result


def fast_zip(list ndarrays):
def fast_zip(list ndarrays) -> ndarray[object]:
"""
For zipping multiple ndarrays into an ndarray of tuples.
"""
Expand Down Expand Up @@ -621,7 +621,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool:

@cython.wraparound(False)
@cython.boundscheck(False)
def astype_intsafe(ndarray[object] arr, new_dtype):
def astype_intsafe(ndarray[object] arr, new_dtype) -> ndarray:
cdef:
Py_ssize_t i, n = len(arr)
object val
Expand Down Expand Up @@ -891,7 +891,7 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups):


def indices_fast(ndarray index, const int64_t[:] labels, list keys,
list sorted_labels):
list sorted_labels) -> dict:
"""
Parameters
----------
Expand Down Expand Up @@ -1979,8 +1979,12 @@ cpdef bint is_interval_array(ndarray values):

@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_numeric(ndarray[object] values, set na_values,
bint convert_empty=True, bint coerce_numeric=False):
def maybe_convert_numeric(
ndarray[object] values,
set na_values,
bint convert_empty=True,
bint coerce_numeric=False,
) -> ndarray:
"""
Convert object array to a numeric array if possible.

Expand Down Expand Up @@ -2154,7 +2158,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
bint safe=False, bint convert_datetime=False,
bint convert_timedelta=False,
bint convert_to_nullable_integer=False):
bint convert_to_nullable_integer=False) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype

Expand All @@ -2181,6 +2185,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
Returns
-------
np.ndarray or ExtensionArray
Array of converted object values to more specific dtypes if applicable.
"""
cdef:
Py_ssize_t i, n
Expand Down Expand Up @@ -2408,13 +2413,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,


# Note: no_default is exported to the public API in pandas.api.extensions
no_default = object() #: Sentinel indicating the default value.
no_default = object() # Sentinel indicating the default value.


@cython.boundscheck(False)
@cython.wraparound(False)
def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
object na_value=no_default, object dtype=object):
object na_value=no_default, object dtype=object) -> "ArrayLike":
"""
Substitute for np.vectorize with pandas-friendly dtype inference.

Expand Down Expand Up @@ -2469,7 +2474,9 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr

@cython.boundscheck(False)
@cython.wraparound(False)
def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
def map_infer(
ndarray arr, object f, bint convert=True, bint ignore_na=False
) -> "ArrayLike":
"""
Substitute for np.vectorize with pandas-friendly dtype inference.

Expand All @@ -2483,7 +2490,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):

Returns
-------
ndarray
np.ndarray or ExtensionArray
"""
cdef:
Py_ssize_t i, n
Expand Down Expand Up @@ -2513,7 +2520,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
return result


def to_object_array(rows: object, int min_width=0):
def to_object_array(rows: object, min_width: int = 0) -> ndarray:
"""
Convert a list of lists into an object array.

Expand All @@ -2529,7 +2536,7 @@ def to_object_array(rows: object, int min_width=0):

Returns
-------
numpy array of the object dtype.
np.ndarray[object, ndim=2]
"""
cdef:
Py_ssize_t i, j, n, k, tmp
Expand Down Expand Up @@ -2621,7 +2628,7 @@ def to_object_array_tuples(rows: object):

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_multiget(dict mapping, ndarray keys, default=np.nan):
def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> "ArrayLike":
cdef:
Py_ssize_t i, n = len(keys)
object val
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9150,7 +9150,7 @@ def count(

return result.astype("int64")

def _count_level(self, level: Level, axis: Axis = 0, numeric_only=False):
def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why change axis? DataFrame._get_axis accepts Axis.

from https://github.com/microsoft/pyright/blob/master/docs/typed-libraries.md#wide-vs-narrow-types

In general, a function input parameter should be annotated with the widest possible type supported by the implementation.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its only called from count, and count calls _get_axis_number just before doing so

if numeric_only:
frame = self._get_numeric_data()
else:
Expand Down
105 changes: 74 additions & 31 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
"""
data hash pandas / numpy objects
"""
from __future__ import annotations

import itertools
from typing import Optional
from typing import (
TYPE_CHECKING,
Iterator,
List,
Optional,
Tuple,
Union,
cast,
)

import numpy as np

import pandas._libs.hashing as hashing
from pandas._libs.hashing import hash_object_array
from pandas._typing import (
ArrayLike,
FrameOrSeriesUnion,
)

from pandas.core.dtypes.common import (
is_categorical_dtype,
Expand All @@ -20,17 +34,30 @@
ABCSeries,
)

if TYPE_CHECKING:
from pandas import (
Categorical,
Index,
MultiIndex,
Series,
)


# 16 byte long hashing key
_default_hash_key = "0123456789123456"


def combine_hash_arrays(arrays, num_items: int):
def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray:
"""
Parameters
----------
arrays : generator
arrays : Iterator[np.ndarray]
num_items : int

Returns
-------
np.ndarray[int64]

Should be the same as CPython's tupleobject.c
"""
try:
Expand All @@ -53,17 +80,18 @@ def combine_hash_arrays(arrays, num_items: int):


def hash_pandas_object(
obj,
obj: Union[Index, FrameOrSeriesUnion],
index: bool = True,
encoding: str = "utf8",
hash_key: Optional[str] = _default_hash_key,
categorize: bool = True,
):
) -> Series:
"""
Return a data hash of the Index/Series/DataFrame.

Parameters
----------
obj : Index, Series, or DataFrame
index : bool, default True
Include the index in the hash (if Series/DataFrame).
encoding : str, default 'utf8'
Expand Down Expand Up @@ -139,25 +167,25 @@ def hash_pandas_object(
return h


def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
def hash_tuples(
vals: Union[MultiIndex, List[Tuple]],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

List -> Iterable

again from https://github.com/microsoft/pyright/blob/master/docs/typed-libraries.md#wide-vs-narrow-types

As a specific application of the “use the widest type possible” rule, libraries should generally use immutable forms of container types instead of mutable forms (unless the function needs to modify the container). Use Sequence rather than List, Mapping rather than Dict, etc.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whats the rule for Sequence vs Iterable?

In this case, we only get here with MultiIndex and List[Tuple]. I get the wider-is-better for public APIs, but for purely internal code strictness seems beneficial. is that distinction not relevant?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK mypy works by checking each function in isolation. i.e. it checks that all the code within the function is compatible with the function annotation. i.e. checks the types in function calls within the body against the type parameters of the called functions.

so doing the reverse of this, i.e. annotating with the actual types passed seems like extra work imo. (and could be simply achieved using MonkeyType)

Iterable because c_is_list_like accepts an iterable as list-like

Using List is bad practice. function code could modify the contents of the List. If the parameter is typed as an immutable container, mypy won't allow it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, updated

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type parameters for Tuple.

encoding: str = "utf8",
hash_key: str = _default_hash_key,
) -> np.ndarray:
"""
Hash a MultiIndex / list-of-tuples efficiently
Hash a MultiIndex / list-of-tuples efficiently.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

list-like-of-tuples


Parameters
----------
vals : MultiIndex, list-of-tuples, or single tuple
vals : MultiIndex or list-of-tuples
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

list-like of tuples

encoding : str, default 'utf8'
hash_key : str, default _default_hash_key

Returns
-------
ndarray of hashed values
"""
is_tuple = False
if isinstance(vals, tuple):
vals = [vals]
is_tuple = True
elif not is_list_like(vals):
if not is_list_like(vals):
raise TypeError("must be convertible to a list-of-tuples")

from pandas import (
Expand All @@ -166,33 +194,33 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
)

if not isinstance(vals, ABCMultiIndex):
vals = MultiIndex.from_tuples(vals)
mi = MultiIndex.from_tuples(vals)
else:
mi = vals

# create a list-of-Categoricals
vals = [
Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
for level in range(vals.nlevels)
cat_vals = [
Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True)
for level in range(mi.nlevels)
]

# hash the list-of-ndarrays
hashes = (
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
)
h = combine_hash_arrays(hashes, len(vals))
if is_tuple:
h = h[0]
h = combine_hash_arrays(hashes, len(cat_vals))

return h


def _hash_categorical(c, encoding: str, hash_key: str):
def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray:
"""
Hash a Categorical by hashing its categories, and then mapping the codes
to the hashes

Parameters
----------
c : Categorical
cat : Categorical
encoding : str
hash_key : str

Expand All @@ -201,7 +229,7 @@ def _hash_categorical(c, encoding: str, hash_key: str):
ndarray of hashed values, same size as len(cat)
"""
# Convert ExtensionArrays to ndarrays
values = np.asarray(c.categories._values)
values = np.asarray(cat.categories._values)
hashed = hash_array(values, encoding, hash_key, categorize=False)

# we have uint64, as we don't directly support missing values
Expand All @@ -211,9 +239,9 @@ def _hash_categorical(c, encoding: str, hash_key: str):
#
# TODO: GH 15362

mask = c.isna()
mask = cat.isna()
if len(hashed):
result = hashed.take(c.codes)
result = hashed.take(cat.codes)
else:
result = np.zeros(len(mask), dtype="uint64")

Expand All @@ -224,11 +252,11 @@ def _hash_categorical(c, encoding: str, hash_key: str):


def hash_array(
vals,
vals: ArrayLike,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vals in docstring also needs updating.

encoding: str = "utf8",
hash_key: str = _default_hash_key,
categorize: bool = True,
):
) -> np.ndarray:
"""
Given a 1d array, return an array of deterministic integers.

Expand All @@ -255,11 +283,26 @@ def hash_array(
# hash values. (This check is above the complex check so that we don't ask
# numpy if categorical is a subdtype of complex, as it will choke).
if is_categorical_dtype(dtype):
vals = cast("Categorical", vals)
return _hash_categorical(vals, encoding, hash_key)
elif is_extension_array_dtype(dtype):
vals, _ = vals._values_for_factorize()
dtype = vals.dtype
Copy link
Member

@simonjayhawkins simonjayhawkins Mar 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dtype = vals.dtype not now needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, will update


return _hash_ndarray(vals, encoding, hash_key, categorize)


def _hash_ndarray(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

had to separate this out, as i couldn't convince mypy to recognize vals = cast(np.ndarray, vals) or assert isinstance(vals, np.ndarray)

vals: np.ndarray,
encoding: str = "utf8",
hash_key: str = _default_hash_key,
categorize: bool = True,
) -> np.ndarray:
"""
See hash_array.__doc__.
"""
dtype = vals.dtype

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(dtype, np.complex128):
Expand Down Expand Up @@ -289,10 +332,10 @@ def hash_array(
return _hash_categorical(cat, encoding, hash_key)

try:
vals = hashing.hash_object_array(vals, hash_key, encoding)
vals = hash_object_array(vals, hash_key, encoding)
except TypeError:
# we have mixed types
vals = hashing.hash_object_array(
vals = hash_object_array(
vals.astype(str).astype(object), hash_key, encoding
)

Expand Down
Loading