From cd2ca27b265d1645a7ceac2150230c22bc5adc93 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 20 Feb 2021 19:20:22 -0800
Subject: [PATCH 1/6] TYP: hashing

---
 pandas/_libs/hashing.pyx          |  4 +-
 pandas/core/util/hashing.py       | 67 +++++++++++++++++++++----------
 pandas/tests/util/test_hashing.py |  6 ++-
 3 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx
index ead967386ed1d..2dd2f1feadd70 100644
--- a/pandas/_libs/hashing.pyx
+++ b/pandas/_libs/hashing.pyx
@@ -27,7 +27,9 @@ DEF dROUNDS = 4
 
 
 @cython.boundscheck(False)
-def hash_object_array(ndarray[object] arr, str key, str encoding="utf8"):
+def hash_object_array(
+    ndarray[object] arr, str key, str encoding="utf8"
+) -> np.ndarray[np.uint64]:
     """
     Parameters
     ----------
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 6d375a92ea50a..ec29d9ec9510a 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -1,12 +1,25 @@
 """
 data hash pandas / numpy objects
 """
+from __future__ import annotations
+
 import itertools
-from typing import Optional
+from typing import (
+    TYPE_CHECKING,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import numpy as np
 
-import pandas._libs.hashing as hashing
+from pandas._libs.hashing import hash_object_array
+from pandas._typing import (
+    ArrayLike,
+    FrameOrSeriesUnion,
+)
 
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
@@ -20,17 +33,30 @@
     ABCSeries,
 )
 
+if TYPE_CHECKING:
+    from pandas import (
+        Categorical,
+        Index,
+        MultiIndex,
+        Series,
+    )
+
+
 # 16 byte long hashing key
 _default_hash_key = "0123456789123456"
 
 
-def combine_hash_arrays(arrays, num_items: int):
+def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray:
     """
     Parameters
     ----------
-    arrays : generator
+    arrays : Iterator[np.ndarray]
     num_items : int
 
+    Returns
+    -------
+    np.ndarray[uint64]
+
     Should be the same as CPython's tupleobject.c
     """
     try:
@@ -53,17 +79,18 @@ def combine_hash_arrays(arrays, num_items: int):
 
 
 def hash_pandas_object(
-    obj,
+    obj: Union[Index, FrameOrSeriesUnion],
     index: bool = True,
     encoding: str = "utf8",
     hash_key: Optional[str] = _default_hash_key,
     categorize: bool = True,
-):
+) -> Series:
     """
     Return a data hash of the Index/Series/DataFrame.
 
     Parameters
     ----------
+    obj : Index, Series, or DataFrame
     index : bool, default True
         Include the index in the hash (if Series/DataFrame).
     encoding : str, default 'utf8'
@@ -139,13 +166,17 @@ def hash_pandas_object(
     return h
 
 
-def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
+def hash_tuples(
+    vals: Union[MultiIndex, List[Tuple]],
+    encoding: str = "utf8",
+    hash_key: str = _default_hash_key,
+) -> np.ndarray:
     """
-    Hash an MultiIndex / list-of-tuples efficiently
+    Hash an MultiIndex / list-of-tuples efficiently.
 
     Parameters
     ----------
-    vals : MultiIndex, list-of-tuples, or single tuple
+    vals : MultiIndex or list-of-tuples
     encoding : str, default 'utf8'
     hash_key : str, default _default_hash_key
 
@@ -153,11 +184,7 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
     -------
     ndarray of hashed values array
     """
-    is_tuple = False
-    if isinstance(vals, tuple):
-        vals = [vals]
-        is_tuple = True
-    elif not is_list_like(vals):
+    if not is_list_like(vals):
         raise TypeError("must be convertible to a list-of-tuples")
 
     from pandas import (
@@ -179,13 +206,11 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
         _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
     )
     h = combine_hash_arrays(hashes, len(vals))
-    if is_tuple:
-        h = h[0]
 
     return h
 
 
-def _hash_categorical(c, encoding: str, hash_key: str):
+def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarray:
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
     to the hashes
@@ -224,11 +249,11 @@ def _hash_categorical(c, encoding: str, hash_key: str):
 
 
 def hash_array(
-    vals,
+    vals: ArrayLike,
     encoding: str = "utf8",
     hash_key: str = _default_hash_key,
     categorize: bool = True,
-):
+) -> np.ndarray:
     """
     Given a 1d array, return an array of deterministic integers.
 
@@ -289,10 +314,10 @@ def hash_array(
             return _hash_categorical(cat, encoding, hash_key)
 
         try:
-            vals = hashing.hash_object_array(vals, hash_key, encoding)
+            vals = hash_object_array(vals, hash_key, encoding)
         except TypeError:
             # we have mixed types
-            vals = hashing.hash_object_array(
+            vals = hash_object_array(
                 vals.astype(str).astype(object), hash_key, encoding
             )
 
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index 94786292adb51..c0c5edda6bec2 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -113,8 +113,10 @@ def test_hash_tuples():
     expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
     tm.assert_numpy_array_equal(result, expected)
 
-    result = hash_tuples(tuples[0])
-    assert result == expected[0]
+    # We only need to support MultiIndex and list-of-tuples
+    msg = "object is not iterable"
+    with pytest.raises(TypeError, match=msg):
+        hash_tuples(tuples[0])
 
 
 @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])

From ae9cc2800a8c44f52aefa41c721f3259726900e8 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 21 Feb 2021 09:11:02 -0800
Subject: [PATCH 2/6] platform compat, mypy fixup

---
 pandas/core/util/hashing.py       | 38 +++++++++++++++++++++++--------
 pandas/tests/util/test_hashing.py |  2 +-
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index ec29d9ec9510a..7a9a7d16877cf 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -11,6 +11,7 @@
     Optional,
     Tuple,
     Union,
+    cast,
 )
 
 import numpy as np
@@ -193,31 +194,33 @@ def hash_tuples(
     )
 
     if not isinstance(vals, ABCMultiIndex):
-        vals = MultiIndex.from_tuples(vals)
+        mi = MultiIndex.from_tuples(vals)
+    else:
+        mi = vals
 
     # create a list-of-Categoricals
-    vals = [
-        Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
-        for level in range(vals.nlevels)
+    cat_vals = [
+        Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True)
+        for level in range(mi.nlevels)
     ]
 
     # hash the list-of-ndarrays
     hashes = (
-        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
+        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
     )
     h = combine_hash_arrays(hashes, len(vals))
 
     return h
 
 
-def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarray:
+def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray:
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
     to the hashes
 
     Parameters
     ----------
-    c : Categorical
+    cat : Categorical
     encoding : str
     hash_key : str
 
@@ -226,7 +229,7 @@ def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarra
     ndarray of hashed values array, same size as len(c)
     """
     # Convert ExtensionArrays to ndarrays
-    values = np.asarray(c.categories._values)
+    values = np.asarray(cat.categories._values)
     hashed = hash_array(values, encoding, hash_key, categorize=False)
 
     # we have uint64, as we don't directly support missing values
@@ -236,9 +239,9 @@ def _hash_categorical(c: Categorical, encoding: str, hash_key: str) -> np.ndarra
     #
     # TODO: GH 15362
 
-    mask = c.isna()
+    mask = cat.isna()
     if len(hashed):
-        result = hashed.take(c.codes)
+        result = hashed.take(cat.codes)
     else:
         result = np.zeros(len(mask), dtype="uint64")
 
@@ -280,11 +283,26 @@ def hash_array(
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke).
     if is_categorical_dtype(dtype):
+        vals = cast("Categorical", vals)
         return _hash_categorical(vals, encoding, hash_key)
     elif is_extension_array_dtype(dtype):
         vals, _ = vals._values_for_factorize()
         dtype = vals.dtype
 
+    return _hash_ndarray(vals, encoding, hash_key, categorize)
+
+
+def _hash_ndarray(
+    vals: np.ndarray,
+    encoding: str = "utf8",
+    hash_key: str = _default_hash_key,
+    categorize: bool = True,
+) -> np.ndarray:
+    """
+    See hash_array.__doc__.
+    """
+    dtype = vals.dtype
+
     # we'll be working with everything as 64-bit values, so handle this
     # 128-bit value early
     if np.issubdtype(dtype, np.complex128):
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index c0c5edda6bec2..e373323dfb6e1 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -114,7 +114,7 @@ def test_hash_tuples():
     tm.assert_numpy_array_equal(result, expected)
 
     # We only need to support MultiIndex and list-of-tuples
-    msg = "object is not iterable"
+    msg = "|".join(["object is not iterable", "zip argument #1 must support iteration"])
     with pytest.raises(TypeError, match=msg):
         hash_tuples(tuples[0])
 

From 60e2c4f4694efe6cdd5b97531874f84f4aa28f13 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 21 Feb 2021 11:28:05 -0800
Subject: [PATCH 3/6] setops fix

---
 pandas/core/util/hashing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 7a9a7d16877cf..9bb360597cdf3 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -208,7 +208,7 @@ def hash_tuples(
     hashes = (
         _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
     )
-    h = combine_hash_arrays(hashes, len(vals))
+    h = combine_hash_arrays(hashes, len(cat_vals))
 
     return h
 

From 89becd926189d8d3b45f8c5e3d51d5236788ec3f Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 7 Mar 2021 14:19:28 -0800
Subject: [PATCH 4/6] port more annotations

---
 pandas/_libs/lib.pyx | 37 ++++++++++++++++++++++---------------
 pandas/core/frame.py |  2 +-
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 4e04425436af4..1ff481553e413 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -269,7 +269,7 @@ def item_from_zerodim(val: object) -> object:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple(list arrays, sort: bool = True):
+def fast_unique_multiple(list arrays, sort: bool = True) -> list:
     """
     Generate a list of unique values from a list of arrays.
 
@@ -345,7 +345,7 @@ def fast_unique_multiple_list(lists: list, sort: bool = True) -> list:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple_list_gen(object gen, bint sort=True):
+def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
     """
     Generate a list of unique values from a generator of lists.
 
@@ -409,7 +409,7 @@ def dicts_to_array(dicts: list, columns: list):
     return result
 
 
-def fast_zip(list ndarrays):
+def fast_zip(list ndarrays) -> ndarray[object]:
     """
     For zipping multiple ndarrays into an ndarray of tuples.
     """
@@ -621,7 +621,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def astype_intsafe(ndarray[object] arr, new_dtype):
+def astype_intsafe(ndarray[object] arr, new_dtype) -> ndarray:
     cdef:
         Py_ssize_t i, n = len(arr)
         object val
@@ -891,7 +891,7 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups):
 
 
 def indices_fast(ndarray index, const int64_t[:] labels, list keys,
-                 list sorted_labels):
+                 list sorted_labels) -> dict:
     """
     Parameters
     ----------
@@ -1979,8 +1979,12 @@ cpdef bint is_interval_array(ndarray values):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def maybe_convert_numeric(ndarray[object] values, set na_values,
-                          bint convert_empty=True, bint coerce_numeric=False):
+def maybe_convert_numeric(
+    ndarray[object] values,
+    set na_values,
+    bint convert_empty=True,
+    bint coerce_numeric=False,
+) -> ndarray:
     """
     Convert object array to a numeric array if possible.
 
@@ -2154,7 +2158,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
 def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
                           bint safe=False, bint convert_datetime=False,
                           bint convert_timedelta=False,
-                          bint convert_to_nullable_integer=False):
+                          bint convert_to_nullable_integer=False) -> "ArrayLike":
     """
     Type inference function-- convert object array to proper dtype
 
@@ -2181,6 +2185,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
     Returns
     -------
     np.ndarray or ExtensionArray
+        Array of object values converted to more specific dtypes, if applicable.
     """
     cdef:
         Py_ssize_t i, n
@@ -2408,13 +2413,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
 
 
 # Note: no_default is exported to the public API in pandas.api.extensions
-no_default = object()  #: Sentinel indicating the default value.
+no_default = object()  # Sentinel indicating the default value.
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
-                   object na_value=no_default, object dtype=object):
+                   object na_value=no_default, object dtype=object) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
 
@@ -2469,7 +2474,9 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
+def map_infer(
+    ndarray arr, object f, bint convert=True, bint ignore_na=False
+) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
 
@@ -2483,7 +2490,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
 
     Returns
     -------
-    ndarray
+    np.ndarray or ExtensionArray
     """
     cdef:
         Py_ssize_t i, n
@@ -2513,7 +2520,7 @@ def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
     return result
 
 
-def to_object_array(rows: object, int min_width=0):
+def to_object_array(rows: object, min_width: int = 0) -> ndarray:
     """
     Convert a list of lists into an object array.
 
@@ -2529,7 +2536,7 @@ def to_object_array(rows: object, int min_width=0):
 
     Returns
     -------
-    numpy array of the object dtype.
+    np.ndarray[object, ndim=2]
     """
     cdef:
         Py_ssize_t i, j, n, k, tmp
@@ -2621,7 +2628,7 @@ def to_object_array_tuples(rows: object):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_multiget(dict mapping, ndarray keys, default=np.nan):
+def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> "ArrayLike":
     cdef:
         Py_ssize_t i, n = len(keys)
         object val
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 38cd730efabd1..170abaecc313c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -9150,7 +9150,7 @@ def count(
 
         return result.astype("int64")
 
-    def _count_level(self, level: Level, axis: Axis = 0, numeric_only=False):
+    def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False):
         if numeric_only:
             frame = self._get_numeric_data()
         else:

From 9ab4be5d24e5dfcdc7a7017ae8a3a57bc0033de2 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 8 Mar 2021 08:16:57 -0800
Subject: [PATCH 5/6] update annotations, docstring

---
 pandas/core/util/hashing.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 9bb360597cdf3..1885438345f9b 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -6,6 +6,7 @@
 import itertools
 from typing import (
     TYPE_CHECKING,
+    Hashable,
     Iterator,
     List,
     Optional,
@@ -168,7 +169,7 @@ def hash_pandas_object(
 
 
 def hash_tuples(
-    vals: Union[MultiIndex, List[Tuple]],
+    vals: Union[MultiIndex, List[Tuple[Hashable, ...]]],
     encoding: str = "utf8",
     hash_key: str = _default_hash_key,
 ) -> np.ndarray:
@@ -262,7 +263,7 @@ def hash_array(
 
     Parameters
     ----------
-    vals : ndarray, Categorical
+    vals : ndarray or ExtensionArray
     encoding : str, default 'utf8'
         Encoding for data & key when strings.
     hash_key : str, default _default_hash_key
@@ -287,7 +288,6 @@ def hash_array(
         return _hash_categorical(vals, encoding, hash_key)
     elif is_extension_array_dtype(dtype):
         vals, _ = vals._values_for_factorize()
-        dtype = vals.dtype
 
     return _hash_ndarray(vals, encoding, hash_key, categorize)
 

From c42375767c5f475e80baf4e5ff44957767c3b479 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 8 Mar 2021 09:02:41 -0800
Subject: [PATCH 6/6] List-> Iterable

---
 pandas/core/util/hashing.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 1885438345f9b..9d488bb13b0f1 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -7,8 +7,8 @@
 from typing import (
     TYPE_CHECKING,
     Hashable,
+    Iterable,
     Iterator,
-    List,
     Optional,
     Tuple,
     Union,
@@ -169,16 +169,16 @@ def hash_pandas_object(
 
 
 def hash_tuples(
-    vals: Union[MultiIndex, List[Tuple[Hashable, ...]]],
+    vals: Union[MultiIndex, Iterable[Tuple[Hashable, ...]]],
     encoding: str = "utf8",
     hash_key: str = _default_hash_key,
 ) -> np.ndarray:
     """
-    Hash an MultiIndex / list-of-tuples efficiently.
+    Hash a MultiIndex / listlike-of-tuples efficiently.
 
     Parameters
     ----------
-    vals : MultiIndex or list-of-tuples
+    vals : MultiIndex or listlike-of-tuples
     encoding : str, default 'utf8'
     hash_key : str, default _default_hash_key