-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
TYP: hashing #39949
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TYP: hashing #39949
Changes from 5 commits
cd2ca27
ae9cc28
60e2c4f
c3115e9
89becd9
a813783
9ab4be5
c423757
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,26 @@ | ||
""" | ||
data hash pandas / numpy objects | ||
""" | ||
from __future__ import annotations | ||
|
||
import itertools | ||
from typing import Optional | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Iterator, | ||
List, | ||
Optional, | ||
Tuple, | ||
Union, | ||
cast, | ||
) | ||
|
||
import numpy as np | ||
|
||
import pandas._libs.hashing as hashing | ||
from pandas._libs.hashing import hash_object_array | ||
from pandas._typing import ( | ||
ArrayLike, | ||
FrameOrSeriesUnion, | ||
) | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_categorical_dtype, | ||
|
@@ -20,17 +34,30 @@ | |
ABCSeries, | ||
) | ||
|
||
if TYPE_CHECKING: | ||
from pandas import ( | ||
Categorical, | ||
Index, | ||
MultiIndex, | ||
Series, | ||
) | ||
|
||
|
||
# 16 byte long hashing key | ||
_default_hash_key = "0123456789123456" | ||
|
||
|
||
def combine_hash_arrays(arrays, num_items: int): | ||
def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndarray: | ||
""" | ||
Parameters | ||
---------- | ||
arrays : generator | ||
arrays : Iterator[np.ndarray] | ||
num_items : int | ||
|
||
Returns | ||
------- | ||
np.ndarray[int64] | ||
|
||
Should be the same as CPython's tupleobject.c | ||
""" | ||
try: | ||
|
@@ -53,17 +80,18 @@ def combine_hash_arrays(arrays, num_items: int): | |
|
||
|
||
def hash_pandas_object( | ||
obj, | ||
obj: Union[Index, FrameOrSeriesUnion], | ||
index: bool = True, | ||
encoding: str = "utf8", | ||
hash_key: Optional[str] = _default_hash_key, | ||
categorize: bool = True, | ||
): | ||
) -> Series: | ||
""" | ||
Return a data hash of the Index/Series/DataFrame. | ||
|
||
Parameters | ||
---------- | ||
obj : Index, Series, or DataFrame | ||
index : bool, default True | ||
Include the index in the hash (if Series/DataFrame). | ||
encoding : str, default 'utf8' | ||
|
@@ -139,25 +167,25 @@ def hash_pandas_object( | |
return h | ||
|
||
|
||
def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): | ||
def hash_tuples( | ||
vals: Union[MultiIndex, List[Tuple]], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the rule for Sequence vs Iterable? In this case, we only get here with MultiIndex and List[Tuple]. I get the wider-is-better for public APIs, but for purely internal code strictness seems beneficial. is that distinction not relevant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAIK mypy works by checking each function in isolation. i.e. it checks that all the code within the function is compatible with the function annotation. i.e. checks the types in function calls within the body with the type parameters of the called functions. so doing the reverse of this, i.e. annotating with the actual types passed seems like extra work imo. (and could be simply achieved using MonkeyType) Iterable because Using List is bad practice. function code could modify the contents of the List. If the parameter is typed as an immutable container, mypy won't allow it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks, updated There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. type parameters for |
||
encoding: str = "utf8", | ||
hash_key: str = _default_hash_key, | ||
) -> np.ndarray: | ||
""" | ||
Hash an MultiIndex / list-of-tuples efficiently | ||
Hash an MultiIndex / list-of-tuples efficiently. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. list-like-of-tuples |
||
|
||
Parameters | ||
---------- | ||
vals : MultiIndex, list-of-tuples, or single tuple | ||
vals : MultiIndex or list-of-tuples | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. list-like of tuples |
||
encoding : str, default 'utf8' | ||
hash_key : str, default _default_hash_key | ||
|
||
Returns | ||
------- | ||
ndarray of hashed values array | ||
""" | ||
is_tuple = False | ||
if isinstance(vals, tuple): | ||
vals = [vals] | ||
is_tuple = True | ||
elif not is_list_like(vals): | ||
if not is_list_like(vals): | ||
raise TypeError("must be convertible to a list-of-tuples") | ||
|
||
from pandas import ( | ||
|
@@ -166,33 +194,33 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): | |
) | ||
|
||
if not isinstance(vals, ABCMultiIndex): | ||
vals = MultiIndex.from_tuples(vals) | ||
mi = MultiIndex.from_tuples(vals) | ||
simonjayhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
mi = vals | ||
|
||
# create a list-of-Categoricals | ||
vals = [ | ||
Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) | ||
for level in range(vals.nlevels) | ||
cat_vals = [ | ||
Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True) | ||
for level in range(mi.nlevels) | ||
] | ||
|
||
# hash the list-of-ndarrays | ||
hashes = ( | ||
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals | ||
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals | ||
) | ||
h = combine_hash_arrays(hashes, len(vals)) | ||
if is_tuple: | ||
h = h[0] | ||
h = combine_hash_arrays(hashes, len(cat_vals)) | ||
|
||
return h | ||
|
||
|
||
def _hash_categorical(c, encoding: str, hash_key: str): | ||
def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndarray: | ||
""" | ||
Hash a Categorical by hashing its categories, and then mapping the codes | ||
to the hashes | ||
|
||
Parameters | ||
---------- | ||
c : Categorical | ||
cat : Categorical | ||
encoding : str | ||
hash_key : str | ||
|
||
|
@@ -201,7 +229,7 @@ def _hash_categorical(c, encoding: str, hash_key: str): | |
ndarray of hashed values array, same size as len(c) | ||
""" | ||
# Convert ExtensionArrays to ndarrays | ||
values = np.asarray(c.categories._values) | ||
values = np.asarray(cat.categories._values) | ||
hashed = hash_array(values, encoding, hash_key, categorize=False) | ||
|
||
# we have uint64, as we don't directly support missing values | ||
|
@@ -211,9 +239,9 @@ def _hash_categorical(c, encoding: str, hash_key: str): | |
# | ||
# TODO: GH 15362 | ||
|
||
mask = c.isna() | ||
mask = cat.isna() | ||
if len(hashed): | ||
result = hashed.take(c.codes) | ||
result = hashed.take(cat.codes) | ||
else: | ||
result = np.zeros(len(mask), dtype="uint64") | ||
|
||
|
@@ -224,11 +252,11 @@ def _hash_categorical(c, encoding: str, hash_key: str): | |
|
||
|
||
def hash_array( | ||
vals, | ||
vals: ArrayLike, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
encoding: str = "utf8", | ||
hash_key: str = _default_hash_key, | ||
categorize: bool = True, | ||
): | ||
) -> np.ndarray: | ||
""" | ||
Given a 1d array, return an array of deterministic integers. | ||
|
||
|
@@ -255,11 +283,26 @@ def hash_array( | |
# hash values. (This check is above the complex check so that we don't ask | ||
# numpy if categorical is a subdtype of complex, as it will choke). | ||
if is_categorical_dtype(dtype): | ||
vals = cast("Categorical", vals) | ||
return _hash_categorical(vals, encoding, hash_key) | ||
elif is_extension_array_dtype(dtype): | ||
vals, _ = vals._values_for_factorize() | ||
dtype = vals.dtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch, will update |
||
|
||
return _hash_ndarray(vals, encoding, hash_key, categorize) | ||
|
||
|
||
def _hash_ndarray( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. had to separate this out, as i couldn't convince mypy to recognize |
||
vals: np.ndarray, | ||
encoding: str = "utf8", | ||
hash_key: str = _default_hash_key, | ||
categorize: bool = True, | ||
) -> np.ndarray: | ||
""" | ||
See hash_array.__doc__. | ||
""" | ||
dtype = vals.dtype | ||
|
||
# we'll be working with everything as 64-bit values, so handle this | ||
# 128-bit value early | ||
if np.issubdtype(dtype, np.complex128): | ||
|
@@ -289,10 +332,10 @@ def hash_array( | |
return _hash_categorical(cat, encoding, hash_key) | ||
|
||
try: | ||
vals = hashing.hash_object_array(vals, hash_key, encoding) | ||
vals = hash_object_array(vals, hash_key, encoding) | ||
except TypeError: | ||
# we have mixed types | ||
vals = hashing.hash_object_array( | ||
vals = hash_object_array( | ||
vals.astype(str).astype(object), hash_key, encoding | ||
) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why change
axis
?DataFrame._get_axis
acceptsAxis
.from https://github.com/microsoft/pyright/blob/master/docs/typed-libraries.md#wide-vs-narrow-types
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
its only called from count, and count calls _get_axis_number just before doing so