diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx
index ead967386ed1d..2dd2f1feadd70 100644
--- a/pandas/_libs/hashing.pyx
+++ b/pandas/_libs/hashing.pyx
@@ -27,7 +27,9 @@ DEF dROUNDS = 4
 
 
 @cython.boundscheck(False)
-def hash_object_array(ndarray[object] arr, str key, str encoding="utf8"):
+def hash_object_array(
+    ndarray[object] arr, str key, str encoding="utf8"
+) -> np.ndarray[np.uint64]:
     """
     Parameters
     ----------
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 4e04425436af4..1ff481553e413 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -269,7 +269,7 @@ def item_from_zerodim(val: object) -> object:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple(list arrays, sort: bool = True):
+def fast_unique_multiple(list arrays, sort: bool = True) -> list:
     """
     Generate a list of unique values from a list of arrays.
 
@@ -345,7 +345,7 @@ def fast_unique_multiple_list(lists: list, sort: bool = True) -> list:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple_list_gen(object gen, bint sort=True):
+def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
     """
     Generate a list of unique values from a generator of lists.
 
@@ -409,7 +409,7 @@ def dicts_to_array(dicts: list, columns: list):
     return result
 
 
-def fast_zip(list ndarrays):
+def fast_zip(list ndarrays) -> ndarray[object]:
     """
     For zipping multiple ndarrays into an ndarray of tuples.
     """
@@ -621,7 +621,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def astype_intsafe(ndarray[object] arr, new_dtype):
+def astype_intsafe(ndarray[object] arr, new_dtype) -> ndarray:
     cdef:
         Py_ssize_t i, n = len(arr)
         object val
@@ -891,7 +891,7 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups):
 
 
 def indices_fast(ndarray index, const int64_t[:] labels, list keys,
-                 list sorted_labels):
+                 list sorted_labels) -> dict:
     """
     Parameters
     ----------
@@ -1979,8 +1979,12 @@ cpdef bint is_interval_array(ndarray values):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def maybe_convert_numeric(ndarray[object] values, set na_values,
-                          bint convert_empty=True, bint coerce_numeric=False):
+def maybe_convert_numeric(
+    ndarray[object] values,
+    set na_values,
+    bint convert_empty=True,
+    bint coerce_numeric=False,
+) -> ndarray:
     """
     Convert object array to a numeric array if possible.
 
@@ -2154,7 +2158,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
 def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
                           bint safe=False, bint convert_datetime=False,
                           bint convert_timedelta=False,
-                          bint convert_to_nullable_integer=False):
+                          bint convert_to_nullable_integer=False) -> "ArrayLike":
     """
     Type inference function-- convert object array to proper dtype
 
@@ -2181,6 +2185,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
     Returns
     -------
     np.ndarray or ExtensionArray
+        Array of object values converted to more specific dtypes, if applicable.
     """
     cdef:
         Py_ssize_t i, n
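The new `-> "ArrayLike"` return annotation on `maybe_convert_objects` documents that inference can hand back either a plain ndarray or an ExtensionArray (e.g. via the nullable-integer path). A minimal sketch of the common case, calling the internal `pandas._libs.lib` module directly (not public API, shown here only for illustration):

```python
import numpy as np
from pandas._libs import lib

# An object array holding only ints is inferred down to a plain int64
# ndarray; with convert_to_nullable_integer=True and missing values
# present, an ExtensionArray can come back instead, hence "ArrayLike".
objs = np.array([1, 2, 3], dtype=object)
out = lib.maybe_convert_objects(objs)
print(type(out), out.dtype)  # <class 'numpy.ndarray'> int64
```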
@@ -2408,13 +2413,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
 
 
 # Note: no_default is exported to the public API in pandas.api.extensions
-no_default = object()  #: Sentinel indicating the default value.
+no_default = object()  # Sentinel indicating the default value.
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
-                   object na_value=no_default, object dtype=object):
+                   object na_value=no_default, object dtype=object) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
 
@@ -2469,7 +2474,9 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
+def map_infer(
+    ndarray arr, object f, bint convert=True, bint ignore_na=False
+) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
 
@@ -2483,7 +2490,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
 
     Returns
     -------
-    ndarray
+    np.ndarray or ExtensionArray
     """
     cdef:
         Py_ssize_t i, n
@@ -2513,7 +2520,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
     return result
 
 
-def to_object_array(rows: object, int min_width=0):
+def to_object_array(rows: object, min_width: int = 0) -> ndarray:
     """
     Convert a list of lists into an object array.
 
@@ -2529,7 +2536,7 @@ def to_object_array(rows: object, min_width: int = 0):
 
     Returns
     -------
-    numpy array of the object dtype.
+    np.ndarray[object, ndim=2]
     """
     cdef:
         Py_ssize_t i, j, n, k, tmp
@@ -2621,7 +2628,7 @@ def to_object_array_tuples(rows: object):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_multiget(dict mapping, ndarray keys, default=np.nan):
+def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> "ArrayLike":
     cdef:
         Py_ssize_t i, n = len(keys)
         object val
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 62549079309f6..dbf2446f43af3 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -9151,7 +9151,7 @@ def count(
 
         return result.astype("int64")
 
-    def _count_level(self, level: Level, axis: Axis = 0, numeric_only=False):
+    def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False):
         if numeric_only:
             frame = self._get_numeric_data()
         else:
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 6d375a92ea50a..9d488bb13b0f1 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -1,12 +1,27 @@
 """
 data hash pandas / numpy objects
 """
+from __future__ import annotations
+
 import itertools
-from typing import Optional
+from typing import (
+    TYPE_CHECKING,
+    Hashable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
 
 import numpy as np
 
-import pandas._libs.hashing as hashing
+from pandas._libs.hashing import hash_object_array
+from pandas._typing import (
+    ArrayLike,
+    FrameOrSeriesUnion,
+)
 
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
@@ -20,17 +35,30 @@
     ABCSeries,
 )
 
+if TYPE_CHECKING:
+    from pandas import (
+        Categorical,
+        Index,
+        MultiIndex,
+        Series,
+    )
+
+
 # 16 byte long hashing key
 _default_hash_key = "0123456789123456"
 
 
-def combine_hash_arrays(arrays, num_items: int):
+def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray:
     """
     Parameters
     ----------
-    arrays : generator
+    arrays : Iterator[np.ndarray]
     num_items : int
 
+    Returns
+    -------
+    np.ndarray[int64]
+
     Should be the same as CPython's tupleobject.c
     """
     try:
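`combine_hash_arrays` folds an iterator of per-column uint64 hash arrays into a single array, mirroring the way CPython's tupleobject.c combines tuple element hashes. A hedged sketch against the internal `pandas.core.util.hashing` module (these import paths are implementation details and may change between versions):

```python
import numpy as np
from pandas.core.util.hashing import combine_hash_arrays, hash_array

# One uint64 hash array per "column"; combine_hash_arrays consumes an
# iterator, so wrap the list in iter() and pass the item count.
a = hash_array(np.array([1, 2, 3]))
b = hash_array(np.array(["x", "y", "z"], dtype=object))
combined = combine_hash_arrays(iter([a, b]), 2)
print(combined.dtype, combined.shape)  # uint64 (3,)
```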
@@ -53,17 +81,18 @@
 
 
 def hash_pandas_object(
-    obj,
+    obj: Union[Index, FrameOrSeriesUnion],
     index: bool = True,
     encoding: str = "utf8",
     hash_key: Optional[str] = _default_hash_key,
     categorize: bool = True,
-):
+) -> Series:
     """
     Return a data hash of the Index/Series/DataFrame.
 
     Parameters
     ----------
+    obj : Index, Series, or DataFrame
     index : bool, default True
         Include the index in the hash (if Series/DataFrame).
     encoding : str, default 'utf8'
@@ -139,13 +168,17 @@
     return h
 
 
-def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
+def hash_tuples(
+    vals: Union[MultiIndex, Iterable[Tuple[Hashable, ...]]],
+    encoding: str = "utf8",
+    hash_key: str = _default_hash_key,
+) -> np.ndarray:
     """
-    Hash an MultiIndex / list-of-tuples efficiently
+    Hash a MultiIndex / listlike-of-tuples efficiently.
 
     Parameters
     ----------
-    vals : MultiIndex, list-of-tuples, or single tuple
+    vals : MultiIndex or listlike-of-tuples
     encoding : str, default 'utf8'
     hash_key : str, default _default_hash_key
@@ -153,11 +186,7 @@
     Returns
     -------
     ndarray of hashed values array
     """
-    is_tuple = False
-    if isinstance(vals, tuple):
-        vals = [vals]
-        is_tuple = True
-    elif not is_list_like(vals):
+    if not is_list_like(vals):
         raise TypeError("must be convertible to a list-of-tuples")
 
     from pandas import (
         Categorical,
         MultiIndex,
     )
@@ -166,33 +195,33 @@
     if not isinstance(vals, ABCMultiIndex):
-        vals = MultiIndex.from_tuples(vals)
+        mi = MultiIndex.from_tuples(vals)
+    else:
+        mi = vals
 
     # create a list-of-Categoricals
-    vals = [
-        Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
-        for level in range(vals.nlevels)
+    cat_vals = [
+        Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True)
+        for level in range(mi.nlevels)
     ]
 
     # hash the list-of-ndarrays
     hashes = (
-        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
+        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
     )
-    h = combine_hash_arrays(hashes, len(vals))
-    if is_tuple:
-        h = h[0]
+    h = combine_hash_arrays(hashes, len(cat_vals))
     return h
 
 
-def _hash_categorical(c, encoding: str, hash_key: str):
+def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray:
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
     to the hashes
 
     Parameters
     ----------
-    c : Categorical
+    cat : Categorical
     encoding : str
     hash_key : str
@@ -201,7 +230,7 @@
     ndarray of hashed values array, same size as len(c)
     """
     # Convert ExtensionArrays to ndarrays
-    values = np.asarray(c.categories._values)
+    values = np.asarray(cat.categories._values)
     hashed = hash_array(values, encoding, hash_key, categorize=False)
 
     # we have uint64, as we don't directly support missing values
@@ -211,9 +240,9 @@
     #
     # TODO: GH 15362
-    mask = c.isna()
+    mask = cat.isna()
     if len(hashed):
-        result = hashed.take(c.codes)
+        result = hashed.take(cat.codes)
     else:
         result = np.zeros(len(mask), dtype="uint64")
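For reference, the public entry point annotated above: `hash_pandas_object` returns a uint64 Series aligned with the input's index, and the result is deterministic for a fixed `hash_key`. For example:

```python
import pandas as pd
from pandas.util import hash_pandas_object

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
h = hash_pandas_object(df)  # Series[uint64], one hash per row
print(h.dtype)              # uint64
assert h.equals(hash_pandas_object(df))  # stable for a fixed hash_key
```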
@@ -224,17 +253,17 @@
 
 
 def hash_array(
-    vals,
+    vals: ArrayLike,
     encoding: str = "utf8",
     hash_key: str = _default_hash_key,
     categorize: bool = True,
-):
+) -> np.ndarray:
     """
     Given a 1d array, return an array of deterministic integers.
 
     Parameters
     ----------
-    vals : ndarray, Categorical
+    vals : ndarray or ExtensionArray
     encoding : str, default 'utf8'
         Encoding for data & key when strings.
     hash_key : str, default _default_hash_key
@@ -255,10 +284,24 @@
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke).
     if is_categorical_dtype(dtype):
+        vals = cast("Categorical", vals)
         return _hash_categorical(vals, encoding, hash_key)
     elif is_extension_array_dtype(dtype):
         vals, _ = vals._values_for_factorize()
-        dtype = vals.dtype
+
+    return _hash_ndarray(vals, encoding, hash_key, categorize)
+
+
+def _hash_ndarray(
+    vals: np.ndarray,
+    encoding: str = "utf8",
+    hash_key: str = _default_hash_key,
+    categorize: bool = True,
+) -> np.ndarray:
+    """
+    See hash_array.__doc__.
+    """
+    dtype = vals.dtype
 
     # we'll be working with everything as 64-bit values, so handle this
     # 128-bit value early
@@ -289,10 +332,10 @@
         return _hash_categorical(cat, encoding, hash_key)
 
     try:
-        vals = hashing.hash_object_array(vals, hash_key, encoding)
+        vals = hash_object_array(vals, hash_key, encoding)
     except TypeError:
         # we have mixed types
-        vals = hashing.hash_object_array(
+        vals = hash_object_array(
             vals.astype(str).astype(object), hash_key, encoding
         )
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index 94786292adb51..e373323dfb6e1 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -113,8 +113,10 @@ def test_hash_tuples():
     expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
     tm.assert_numpy_array_equal(result, expected)
 
-    result = hash_tuples(tuples[0])
-    assert result == expected[0]
+    # We only need to support MultiIndex and list-of-tuples
+    msg = "|".join(["object is not iterable", "zip argument #1 must support iteration"])
+    with pytest.raises(TypeError, match=msg):
+        hash_tuples(tuples[0])
 
 
 @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
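The test change above pins down the new contract: `hash_tuples` accepts a MultiIndex or a listlike of tuples, and a bare tuple now raises TypeError instead of returning a scalar. A sketch of the behavior (importing from the internal `pandas.core.util.hashing` module, as the test suite does):

```python
import pytest
from pandas.core.util.hashing import hash_tuples

tuples = [(1, "one"), (1, "two"), (2, "one")]
hashes = hash_tuples(tuples)  # np.ndarray[uint64], one hash per tuple

# A single tuple is no longer special-cased; wrap it in a list instead.
with pytest.raises(TypeError):
    hash_tuples(tuples[0])
assert hash_tuples([tuples[0]])[0] == hashes[0]
```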