From cd2ca27b265d1645a7ceac2150230c22bc5adc93 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Feb 2021 19:20:22 -0800 Subject: [PATCH 1/6] TYP: hashing --- pandas/_libs/hashing.pyx | 4 +- pandas/core/util/hashing.py | 67 +++++++++++++++++++++---------- pandas/tests/util/test_hashing.py | 6 ++- 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index ead967386ed1d..2dd2f1feadd70 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -27,7 +27,9 @@ DEF dROUNDS = 4 @cython.boundscheck(False) -def hash_object_array(ndarray[object] arr, str key, str encoding="utf8"): +def hash_object_array( + ndarray[object] arr, str key, str encoding="utf8" +) -> np.ndarray[np.uint64]: """ Parameters ---------- diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 6d375a92ea50a..ec29d9ec9510a 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -1,12 +1,25 @@ """ data hash pandas / numpy objects """ +from __future__ import annotations + import itertools -from typing import Optional +from typing import ( + TYPE_CHECKING, + Iterator, + List, + Optional, + Tuple, + Union, +) import numpy as np -import pandas._libs.hashing as hashing +from pandas._libs.hashing import hash_object_array +from pandas._typing import ( + ArrayLike, + FrameOrSeriesUnion, +) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -20,17 +33,30 @@ ABCSeries, ) +if TYPE_CHECKING: + from pandas import ( + Categorical, + Index, + MultiIndex, + Series, + ) + + # 16 byte long hashing key _default_hash_key = "0123456789123456" -def combine_hash_arrays(arrays, num_items: int): +def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray: """ Parameters ---------- - arrays : generator + arrays : Iterator[np.ndarray] num_items : int + Returns + ------- + np.ndarray[int64] + Should be the same as CPython's tupleobject.c """ try: @@ -53,17 +79,18 @@ def 
combine_hash_arrays(arrays, num_items: int): def hash_pandas_object( - obj, + obj: Union[Index, FrameOrSeriesUnion], index: bool = True, encoding: str = "utf8", hash_key: Optional[str] = _default_hash_key, categorize: bool = True, -): +) -> Series: """ Return a data hash of the Index/Series/DataFrame. Parameters ---------- + obj : Index, Series, or DataFrame index : bool, default True Include the index in the hash (if Series/DataFrame). encoding : str, default 'utf8' @@ -139,13 +166,17 @@ def hash_pandas_object( return h -def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): +def hash_tuples( + vals: Union[MultiIndex, List[Tuple]], + encoding: str = "utf8", + hash_key: str = _default_hash_key, +) -> np.ndarray: """ - Hash an MultiIndex / list-of-tuples efficiently + Hash an MultiIndex / list-of-tuples efficiently. Parameters ---------- - vals : MultiIndex, list-of-tuples, or single tuple + vals : MultiIndex or list-of-tuples encoding : str, default 'utf8' hash_key : str, default _default_hash_key @@ -153,11 +184,7 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): ------- ndarray of hashed values array """ - is_tuple = False - if isinstance(vals, tuple): - vals = [vals] - is_tuple = True - elif not is_list_like(vals): + if not is_list_like(vals): raise TypeError("must be convertible to a list-of-tuples") from pandas import ( @@ -179,13 +206,11 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals ) h = combine_hash_arrays(hashes, len(vals)) - if is_tuple: - h = h[0] return h -def _hash_categorical(c, encoding: str, hash_key: str): +def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarray: """ Hash a Categorical by hashing its categories, and then mapping the codes to the hashes @@ -224,11 +249,11 @@ def _hash_categorical(c, encoding: str, hash_key: str): def hash_array( - vals, + vals: 
ArrayLike, encoding: str = "utf8", hash_key: str = _default_hash_key, categorize: bool = True, -): +) -> np.ndarray: """ Given a 1d array, return an array of deterministic integers. @@ -289,10 +314,10 @@ def hash_array( return _hash_categorical(cat, encoding, hash_key) try: - vals = hashing.hash_object_array(vals, hash_key, encoding) + vals = hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types - vals = hashing.hash_object_array( + vals = hash_object_array( vals.astype(str).astype(object), hash_key, encoding ) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 94786292adb51..c0c5edda6bec2 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -113,8 +113,10 @@ def test_hash_tuples(): expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values tm.assert_numpy_array_equal(result, expected) - result = hash_tuples(tuples[0]) - assert result == expected[0] + # We only need to support MultiIndex and list-of-tuples + msg = "object is not iterable" + with pytest.raises(TypeError, match=msg): + hash_tuples(tuples[0]) @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) From ae9cc2800a8c44f52aefa41c721f3259726900e8 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Feb 2021 09:11:02 -0800 Subject: [PATCH 2/6] platform compat, mypy fixup --- pandas/core/util/hashing.py | 38 +++++++++++++++++++++++-------- pandas/tests/util/test_hashing.py | 2 +- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index ec29d9ec9510a..7a9a7d16877cf 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -11,6 +11,7 @@ Optional, Tuple, Union, + cast, ) import numpy as np @@ -193,31 +194,33 @@ def hash_tuples( ) if not isinstance(vals, ABCMultiIndex): - vals = MultiIndex.from_tuples(vals) + mi = MultiIndex.from_tuples(vals) + else: + mi = vals # create a list-of-Categoricals - 
vals = [ - Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) - for level in range(vals.nlevels) + cat_vals = [ + Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True) + for level in range(mi.nlevels) ] # hash the list-of-ndarrays hashes = ( - _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals + _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals ) h = combine_hash_arrays(hashes, len(vals)) return h -def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarray: +def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray: """ Hash a Categorical by hashing its categories, and then mapping the codes to the hashes Parameters ---------- - c : Categorical + cat : Categorical encoding : str hash_key : str @@ -226,7 +229,7 @@ def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarra ndarray of hashed values array, same size as len(c) """ # Convert ExtensionArrays to ndarrays - values = np.asarray(c.categories._values) + values = np.asarray(cat.categories._values) hashed = hash_array(values, encoding, hash_key, categorize=False) # we have uint64, as we don't directly support missing values @@ -236,9 +239,9 @@ def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarra # # TODO: GH 15362 - mask = c.isna() + mask = cat.isna() if len(hashed): - result = hashed.take(c.codes) + result = hashed.take(cat.codes) else: result = np.zeros(len(mask), dtype="uint64") @@ -280,11 +283,26 @@ def hash_array( # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). 
if is_categorical_dtype(dtype): + vals = cast("Categorical", vals) return _hash_categorical(vals, encoding, hash_key) elif is_extension_array_dtype(dtype): vals, _ = vals._values_for_factorize() dtype = vals.dtype + return _hash_ndarray(vals, encoding, hash_key, categorize) + + +def _hash_ndarray( + vals: np.ndarray, + encoding: str = "utf8", + hash_key: str = _default_hash_key, + categorize: bool = True, +) -> np.ndarray: + """ + See hash_array.__doc__. + """ + dtype = vals.dtype + # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(dtype, np.complex128): diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index c0c5edda6bec2..e373323dfb6e1 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -114,7 +114,7 @@ def test_hash_tuples(): tm.assert_numpy_array_equal(result, expected) # We only need to support MultiIndex and list-of-tuples - msg = "object is not iterable" + msg = "|".join(["object is not iterable", "zip argument #1 must support iteration"]) with pytest.raises(TypeError, match=msg): hash_tuples(tuples[0]) From 60e2c4f4694efe6cdd5b97531874f84f4aa28f13 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Feb 2021 11:28:05 -0800 Subject: [PATCH 3/6] setops fix --- pandas/core/util/hashing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 7a9a7d16877cf..9bb360597cdf3 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -208,7 +208,7 @@ def hash_tuples( hashes = ( _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals ) - h = combine_hash_arrays(hashes, len(vals)) + h = combine_hash_arrays(hashes, len(cat_vals)) return h From 89becd926189d8d3b45f8c5e3d51d5236788ec3f Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Mar 2021 14:19:28 -0800 Subject: [PATCH 4/6] port more annotations --- pandas/_libs/lib.pyx | 37 
++++++++++++++++++++++--------------- pandas/core/frame.py | 2 +- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4e04425436af4..1ff481553e413 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -269,7 +269,7 @@ def item_from_zerodim(val: object) -> object: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple(list arrays, sort: bool = True): +def fast_unique_multiple(list arrays, sort: bool = True) -> list: """ Generate a list of unique values from a list of arrays. @@ -345,7 +345,7 @@ def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list_gen(object gen, bint sort=True): +def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: """ Generate a list of unique values from a generator of lists. @@ -409,7 +409,7 @@ def dicts_to_array(dicts: list, columns: list): return result -def fast_zip(list ndarrays): +def fast_zip(list ndarrays) -> ndarray[object]: """ For zipping multiple ndarrays into an ndarray of tuples. 
""" @@ -621,7 +621,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def astype_intsafe(ndarray[object] arr, new_dtype): +def astype_intsafe(ndarray[object] arr, new_dtype) -> ndarray: cdef: Py_ssize_t i, n = len(arr) object val @@ -891,7 +891,7 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups): def indices_fast(ndarray index, const int64_t[:] labels, list keys, - list sorted_labels): + list sorted_labels) -> dict: """ Parameters ---------- @@ -1979,8 +1979,12 @@ cpdef bint is_interval_array(ndarray values): @cython.boundscheck(False) @cython.wraparound(False) -def maybe_convert_numeric(ndarray[object] values, set na_values, - bint convert_empty=True, bint coerce_numeric=False): +def maybe_convert_numeric( + ndarray[object] values, + set na_values, + bint convert_empty=True, + bint coerce_numeric=False, +) -> ndarray: """ Convert object array to a numeric array if possible. @@ -2154,7 +2158,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, def maybe_convert_objects(ndarray[object] objects, bint try_float=False, bint safe=False, bint convert_datetime=False, bint convert_timedelta=False, - bint convert_to_nullable_integer=False): + bint convert_to_nullable_integer=False) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2181,6 +2185,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, Returns ------- np.ndarray or ExtensionArray + Array of converted object values to more specific dtypes if applicable. """ cdef: Py_ssize_t i, n @@ -2408,13 +2413,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, # Note: no_default is exported to the public API in pandas.api.extensions -no_default = object() #: Sentinel indicating the default value. +no_default = object() # Sentinel indicating the default value. 
@cython.boundscheck(False) @cython.wraparound(False) def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, - object na_value=no_default, object dtype=object): + object na_value=no_default, object dtype=object) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2469,7 +2474,9 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr @cython.boundscheck(False) @cython.wraparound(False) -def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False): +def map_infer( + ndarray arr, object f, bint convert=True, bint ignore_na=False +) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2483,7 +2490,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False): Returns ------- - ndarray + np.ndarray or ExtensionArray """ cdef: Py_ssize_t i, n @@ -2513,7 +2520,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False): return result -def to_object_array(rows: object, int min_width=0): +def to_object_array(rows: object, min_width: int = 0) -> ndarray: """ Convert a list of lists into an object array. @@ -2529,7 +2536,7 @@ def to_object_array(rows: object, int min_width=0): Returns ------- - numpy array of the object dtype. 
+ np.ndarray[object, ndim=2] """ cdef: Py_ssize_t i, j, n, k, tmp @@ -2621,7 +2628,7 @@ def to_object_array_tuples(rows: object): @cython.wraparound(False) @cython.boundscheck(False) -def fast_multiget(dict mapping, ndarray keys, default=np.nan): +def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> "ArrayLike": cdef: Py_ssize_t i, n = len(keys) object val diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 38cd730efabd1..170abaecc313c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9150,7 +9150,7 @@ def count( return result.astype("int64") - def _count_level(self, level: Level, axis: Axis = 0, numeric_only=False): + def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): if numeric_only: frame = self._get_numeric_data() else: From 9ab4be5d24e5dfcdc7a7017ae8a3a57bc0033de2 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 08:16:57 -0800 Subject: [PATCH 5/6] update annotations, docstring --- pandas/core/util/hashing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 9bb360597cdf3..1885438345f9b 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -6,6 +6,7 @@ import itertools from typing import ( TYPE_CHECKING, + Hashable, Iterator, List, Optional, @@ -168,7 +169,7 @@ def hash_pandas_object( def hash_tuples( - vals: Union[MultiIndex, List[Tuple]], + vals: Union[MultiIndex, List[Tuple[Hashable, ...]]], encoding: str = "utf8", hash_key: str = _default_hash_key, ) -> np.ndarray: @@ -262,7 +263,7 @@ def hash_array( Parameters ---------- - vals : ndarray, Categorical + vals : ndarray or ExtensionArray encoding : str, default 'utf8' Encoding for data & key when strings. 
hash_key : str, default _default_hash_key @@ -287,7 +288,6 @@ def hash_array( return _hash_categorical(vals, encoding, hash_key) elif is_extension_array_dtype(dtype): vals, _ = vals._values_for_factorize() - dtype = vals.dtype return _hash_ndarray(vals, encoding, hash_key, categorize) From c42375767c5f475e80baf4e5ff44957767c3b479 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 09:02:41 -0800 Subject: [PATCH 6/6] List-> Iterable --- pandas/core/util/hashing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1885438345f9b..9d488bb13b0f1 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -7,8 +7,8 @@ from typing import ( TYPE_CHECKING, Hashable, + Iterable, Iterator, - List, Optional, Tuple, Union, @@ -169,16 +169,16 @@ def hash_pandas_object( def hash_tuples( - vals: Union[MultiIndex, List[Tuple[Hashable, ...]]], + vals: Union[MultiIndex, Iterable[Tuple[Hashable, ...]]], encoding: str = "utf8", hash_key: str = _default_hash_key, ) -> np.ndarray: """ - Hash an MultiIndex / list-of-tuples efficiently. + Hash a MultiIndex / listlike-of-tuples efficiently. Parameters ---------- - vals : MultiIndex or list-of-tuples + vals : MultiIndex or listlike-of-tuples encoding : str, default 'utf8' hash_key : str, default _default_hash_key