From b0e057adeb1ca83ae89b7ec7e48dd50b523eaac8 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Fri, 25 Oct 2019 14:44:50 -0700
Subject: [PATCH 1/3] CLN: further algorithms cleanup

---
 pandas/core/algorithms.py | 76 +++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 8f72245b1f4eb..37fa5ef7c3486 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -77,27 +77,28 @@ def _ensure_data(values, dtype=None):
 
     Returns
     -------
-    (ndarray, pandas_dtype, algo dtype as a string)
+    values : ndarray
+    pandas_dtype : str or dtype
 
     """
 
     # we check some simple dtypes first
     if is_object_dtype(dtype):
-        return ensure_object(np.asarray(values)), "object", "object"
+        return ensure_object(np.asarray(values)), "object"
     elif is_object_dtype(values) and dtype is None:
-        return ensure_object(np.asarray(values)), "object", "object"
+        return ensure_object(np.asarray(values)), "object"
 
     try:
         if is_bool_dtype(values) or is_bool_dtype(dtype):
             # we are actually coercing to uint64
             # until our algos support uint8 directly (see TODO)
-            return np.asarray(values).astype("uint64"), "bool", "uint64"
+            return np.asarray(values).astype("uint64"), "bool"
         elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
-            return ensure_int64(values), "int64", "int64"
+            return ensure_int64(values), "int64"
         elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype):
-            return ensure_uint64(values), "uint64", "uint64"
+            return ensure_uint64(values), "uint64"
         elif is_float_dtype(values) or is_float_dtype(dtype):
-            return ensure_float64(values), "float64", "float64"
+            return ensure_float64(values), "float64"
         elif is_complex_dtype(values) or is_complex_dtype(dtype):
 
             # ignore the fact that we are casting to float
@@ -105,12 +106,12 @@ def _ensure_data(values, dtype=None):
             with catch_warnings():
                 simplefilter("ignore", np.ComplexWarning)
                 values = ensure_float64(values)
-            return values, "float64", "float64"
+            return values, "float64"
 
     except (TypeError, ValueError, OverflowError):
         # if we are trying to coerce to a dtype
         # and it is incompat this will fall thru to here
-        return ensure_object(values), "object", "object"
+        return ensure_object(values), "object"
 
     # datetimelike
     if (
@@ -136,14 +137,14 @@ def _ensure_data(values, dtype=None):
                 # Note: this is reached by DataFrame.rank calls GH#27027
                 asi8 = values.view("i8")
                 dtype = values.dtype
-                return asi8, dtype, "int64"
+                return asi8, dtype
 
             from pandas import DatetimeIndex
 
             values = DatetimeIndex(values)
             dtype = values.dtype
 
-        return values.asi8, dtype, "int64"
+        return values.asi8, dtype
 
     elif is_categorical_dtype(values) and (
         is_categorical_dtype(dtype) or dtype is None
@@ -156,11 +157,11 @@ def _ensure_data(values, dtype=None):
         # until our algos support int* directly (not all do)
         values = ensure_int64(values)
 
-        return values, dtype, "int64"
+        return values, dtype
 
     # we have failed, return object
     values = np.asarray(values, dtype=np.object)
-    return ensure_object(values), "object", "object"
+    return ensure_object(values), "object"
 
 
 def _reconstruct_data(values, dtype, original):
@@ -228,7 +229,8 @@ def _get_hashtable_algo(values):
     values : ndarray
     dtype : str or dtype
     """
-    values, dtype, ndtype = _ensure_data(values)
+    values, dtype = _ensure_data(values)
+    ndtype = values.dtype.name
 
     if ndtype == "object":
 
@@ -246,12 +248,13 @@ def _get_values_for_rank(values):
     if is_categorical_dtype(values):
         values = values._values_for_rank()
 
-    values, _, ndtype = _ensure_data(values)
-    return values, ndtype
+    values, _ = _ensure_data(values)
+    return values
 
 
 def _get_data_algo(values):
-    values, ndtype = _get_values_for_rank(values)
+    values = _get_values_for_rank(values)
+    ndtype = values.dtype.name
 
     if ndtype == "object":
 
@@ -284,16 +287,13 @@ def match(to_match, values, na_sentinel=-1):
     na_sentinel : int, default -1
         Value to mark "not found"
 
-    Examples
-    --------
-
     Returns
     -------
     match : ndarray of integers
     """
     values = com.asarray_tuplesafe(values)
     htable, values, dtype = _get_hashtable_algo(values)
-    to_match, _, _ = _ensure_data(to_match, dtype)
+    to_match, _ = _ensure_data(to_match, dtype)
     table = htable(min(len(to_match), 1000000))
     table.map_locations(values)
     result = table.lookup(to_match)
@@ -445,11 +445,11 @@ def isin(comps, values):
 
     comps = com.values_from_object(comps)
 
-    comps, dtype, _ = _ensure_data(comps)
-    values, _, _ = _ensure_data(values, dtype=dtype)
+    comps, dtype = _ensure_data(comps)
+    values, _ = _ensure_data(values, dtype=dtype)
 
     # faster for larger cases to use np.in1d
-    f = lambda x, y: htable.ismember_object(x, values)
+    f = lambda x, y: htable.ismember_object(x, y)
 
     # GH16012
     # Ensure np.in1d doesn't get object types or it *may* throw an exception
@@ -655,7 +655,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         labels, uniques = values.factorize(na_sentinel=na_sentinel)
         dtype = original.dtype
     else:
-        values, dtype, _ = _ensure_data(values)
+        values, dtype = _ensure_data(values)
 
         if original.dtype.kind in ["m", "M"]:
             na_value = na_value_for_dtype(original.dtype)
@@ -716,15 +716,14 @@ def value_counts(
     -------
     Series
     """
-    from pandas.core.series import Series, Index
+    from pandas.core.series import Series
 
     name = getattr(values, "name", None)
 
     if bins is not None:
+        from pandas.core.reshape.tile import cut
+        values = Series(values)
         try:
-            from pandas.core.reshape.tile import cut
-
-            values = Series(values)
             ii = cut(values, bins, include_lowest=True)
         except TypeError:
             raise TypeError("bins argument only works with numeric data.")
@@ -754,8 +753,6 @@ def value_counts(
         else:
             keys, counts = _value_counts_arraylike(values, dropna)
 
-            if not isinstance(keys, Index):
-                keys = Index(keys)
             result = Series(counts, index=keys, name=name)
 
     if sort:
@@ -781,9 +778,10 @@ def _value_counts_arraylike(values, dropna):
     """
     values = _ensure_arraylike(values)
     original = values
-    values, dtype, ndtype = _ensure_data(values)
+    values, dtype = _ensure_data(values)
+    ndtype = values.dtype.name
 
-    if needs_i8_conversion(dtype):
+    if needs_i8_conversion(original.dtype):
         # i8
 
         keys, counts = htable.value_count_int64(values, dropna)
@@ -830,7 +828,8 @@ def duplicated(values, keep="first"):
     duplicated : ndarray
     """
 
-    values, _, ndtype = _ensure_data(values)
+    values, _ = _ensure_data(values)
+    ndtype = values.dtype.name
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
     return f(values, keep=keep)
 
@@ -867,7 +866,8 @@ def mode(values, dropna: bool = True):
         mask = values.isnull()
         values = values[~mask]
 
-    values, _, ndtype = _ensure_data(values)
+    values, _ = _ensure_data(values)
+    ndtype = values.dtype.name
 
     f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
     result = f(values, dropna=dropna)
@@ -905,7 +905,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
         (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
     """
     if values.ndim == 1:
-        values, _ = _get_values_for_rank(values)
+        values = _get_values_for_rank(values)
         ranks = algos.rank_1d(
             values,
             ties_method=method,
@@ -914,7 +914,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
             pct=pct,
         )
     elif values.ndim == 2:
-        values, _ = _get_values_for_rank(values)
+        values = _get_values_for_rank(values)
         ranks = algos.rank_2d(
             values,
             axis=axis,
@@ -1157,7 +1157,7 @@ def compute(self, method):
             return dropped[slc].sort_values(ascending=ascending).head(n)
 
         # fast method
-        arr, pandas_dtype, _ = _ensure_data(dropped.values)
+        arr, pandas_dtype = _ensure_data(dropped.values)
         if method == "nlargest":
             arr = -arr
             if is_integer_dtype(pandas_dtype):

From 91c42785aafa3c75bc9445274dec51d39ab3c407 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Tue, 29 Oct 2019 20:00:25 -0700
Subject: [PATCH 2/3] CLN: algos simplification, de-duplication, typing

---
 pandas/core/algorithms.py | 102 +++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 40 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 10eb44c2d8b47..adc1f734dbeb6 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -79,7 +79,6 @@ def _ensure_data(values, dtype=None):
     -------
     values : ndarray
     pandas_dtype : str or dtype
-
     """
 
     # we check some simple dtypes first
@@ -229,16 +228,8 @@ def _get_hashtable_algo(values):
     values : ndarray
     """
     values, _ = _ensure_data(values)
-    ndtype = values.dtype.name
-
-    if ndtype == "object":
-
-        # it's cheaper to use a String Hash Table than Object; we infer
-        # including nulls because that is the only difference between
-        # StringHashTable and ObjectHashtable
-        if lib.infer_dtype(values, skipna=False) in ["string"]:
-            ndtype = "string"
 
+    ndtype = _check_object_for_strings(values)
     htable = _hashtables[ndtype]
     return htable, values
 
@@ -253,8 +244,27 @@ def _get_values_for_rank(values):
 
 def _get_data_algo(values):
     values = _get_values_for_rank(values)
-    ndtype = values.dtype.name
 
+    ndtype = _check_object_for_strings(values)
+    htable = _hashtables.get(ndtype, _hashtables["object"])
+
+    return htable, values
+
+
+def _check_object_for_strings(values) -> str:
+    """
+    Check if we can use string hashtable instead of object hashtable.
+
+    Parameters
+    ----------
+    values : ndarray
+    ndtype : str
+
+    Returns
+    -------
+    str
+    """
+    ndtype = values.dtype.name
     if ndtype == "object":
 
         # it's cheaper to use a String Hash Table than Object; we infer
@@ -262,10 +272,7 @@ def _get_data_algo(values):
         # StringHashTable and ObjectHashtable
         if lib.infer_dtype(values, skipna=False) in ["string"]:
             ndtype = "string"
-
-    htable = _hashtables.get(ndtype, _hashtables["object"])
-
-    return htable, values
+    return ndtype
 
 
 # --------------- #
@@ -370,9 +377,9 @@ def unique(values):
 unique1d = unique
 
 
-def isin(comps, values):
+def isin(comps, values) -> np.ndarray:
     """
-    Compute the isin boolean array
+    Compute the isin boolean array.
 
     Parameters
     ----------
@@ -381,7 +388,8 @@ def isin(comps, values):
 
     Returns
     -------
-    boolean array same length as comps
+    ndarray[bool]
+        Same length as `comps`.
     """
 
     if not is_list_like(comps):
@@ -413,17 +421,17 @@ def isin(comps, values):
     values, _ = _ensure_data(values, dtype=dtype)
 
     # faster for larger cases to use np.in1d
-    f = lambda x, y: htable.ismember_object(x, y)
+    f = htable.ismember_object
 
     # GH16012
     # Ensure np.in1d doesn't get object types or it *may* throw an exception
     if len(comps) > 1000000 and not is_object_dtype(comps):
-        f = lambda x, y: np.in1d(x, y)
+        f = np.in1d
     elif is_integer_dtype(comps):
         try:
             values = values.astype("int64", copy=False)
             comps = comps.astype("int64", copy=False)
-            f = lambda x, y: htable.ismember_int64(x, y)
+            f = htable.ismember_int64
         except (TypeError, ValueError, OverflowError):
             values = values.astype(object)
             comps = comps.astype(object)
@@ -432,7 +440,7 @@ def isin(comps, values):
         try:
             values = values.astype("float64", copy=False)
             comps = comps.astype("float64", copy=False)
-            f = lambda x, y: htable.ismember_float64(x, y)
+            f = htable.ismember_float64
         except (TypeError, ValueError):
             values = values.astype(object)
             comps = comps.astype(object)
@@ -440,7 +448,7 @@ def isin(comps, values):
     return f(comps, values)
 
 
-def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):
+def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None):
     """
     Factorize an array-like to labels and uniques.
 
@@ -601,7 +609,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):
 )
 @Appender(_shared_docs["factorize"])
 @deprecate_kwarg(old_arg_name="order", new_arg_name=None)
-def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
+def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None):
     # Implementation notes: This method is responsible for 3 things
     # 1.) coercing data to array-like (ndarray, Index, extension array)
     # 2.) factorizing labels and uniques
@@ -657,7 +665,7 @@ def value_counts(
     normalize: bool = False,
     bins=None,
     dropna: bool = True,
-):
+) -> ABCSeries:
     """
     Compute a histogram of the counts of non-null values.
 
@@ -686,6 +694,7 @@ def value_counts(
 
     if bins is not None:
         from pandas.core.reshape.tile import cut
+
         values = Series(values)
         try:
             ii = cut(values, bins, include_lowest=True)
@@ -728,25 +737,25 @@ def value_counts(
     return result
 
 
-def _value_counts_arraylike(values, dropna):
+def _value_counts_arraylike(values, dropna: bool):
     """
     Parameters
     ----------
     values : arraylike
-    dropna : boolean
+    dropna : bool
 
     Returns
     -------
-    (uniques, counts)
-
+    uniques : np.ndarray or ExtensionArray
+    counts : np.ndarray
     """
     values = _ensure_arraylike(values)
     original = values
-    values, dtype = _ensure_data(values)
+    values, _ = _ensure_data(values)
     ndtype = values.dtype.name
 
     if needs_i8_conversion(original.dtype):
-        # i8
+        # datetime, timedelta, or period
 
         keys, counts = htable.value_count_int64(values, dropna)
 
@@ -772,7 +781,7 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts
 
 
-def duplicated(values, keep="first"):
+def duplicated(values, keep="first") -> np.ndarray:
     """
     Return boolean ndarray denoting duplicate values.
 
@@ -798,7 +807,7 @@ def duplicated(values, keep="first"):
     return f(values, keep=keep)
 
 
-def mode(values, dropna: bool = True):
+def mode(values, dropna: bool = True) -> ABCSeries:
     """
     Returns the mode(s) of an array.
 
@@ -844,7 +853,14 @@ def mode(values, dropna: bool = True):
     return Series(result)
 
 
-def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct=False):
+def rank(
+    values,
+    axis: int = 0,
+    method: str = "average",
+    na_option: str = "keep",
+    ascending: bool = True,
+    pct: bool = False,
+):
     """
     Rank the values along a given axis.
 
@@ -1058,7 +1074,7 @@ def _get_score(at):
 
 
 class SelectN:
-    def __init__(self, obj, n, keep):
+    def __init__(self, obj, n: int, keep: str):
         self.obj = obj
         self.n = n
         self.keep = keep
@@ -1168,7 +1184,7 @@ class SelectNFrame(SelectN):
     nordered : DataFrame
     """
 
-    def __init__(self, obj, n, keep, columns):
+    def __init__(self, obj, n: int, keep: str, columns):
         super().__init__(obj, n, keep)
         if not is_list_like(columns) or isinstance(columns, tuple):
             columns = [columns]
@@ -1307,7 +1323,7 @@ def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
             out[i, j] = arr[u_, v]
 
 
-def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info):
+def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
     if mask_info is not None:
         mask, needs_masking = mask_info
     else:
@@ -1424,7 +1440,7 @@ def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info):
 }
 
 
-def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
+def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis: int = 0, mask_info=None):
     if ndim <= 2:
         tup = (arr_dtype.name, out_dtype.name)
         if ndim == 1:
@@ -1458,7 +1474,7 @@ def func(arr, indexer, out, fill_value=np.nan):
     return func
 
 
-def take(arr, indices, axis=0, allow_fill=False, fill_value=None):
+def take(arr, indices, axis=0, allow_fill: bool = False, fill_value=None):
     """
     Take elements from an array.
 
@@ -1552,7 +1568,13 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None):
 
 
 def take_nd(
-    arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
+    arr,
+    indexer,
+    axis=0,
+    out=None,
+    fill_value=np.nan,
+    mask_info=None,
+    allow_fill: bool = True,
 ):
     """
     Specialized Cython take which sets NaN values in one pass

From e7f6c6f7de122de189bdb0ef64278393a19ae67c Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Tue, 29 Oct 2019 22:34:03 -0700
Subject: [PATCH 3/3] STY: edits to make mypy happy

---
 pandas/core/algorithms.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index adc1f734dbeb6..98a090ef26f2a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1465,13 +1465,13 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis: int = 0, mask_info=N
             func = _convert_wrapper(func, out_dtype)
             return func
 
-    def func(arr, indexer, out, fill_value=np.nan):
+    def func2(arr, indexer, out, fill_value=np.nan):
         indexer = ensure_int64(indexer)
         _take_nd_object(
             arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
         )
 
-    return func
+    return func2
 
 
 def take(arr, indices, axis=0, allow_fill: bool = False, fill_value=None):
@@ -1661,9 +1661,9 @@ def take_nd(
     # at this point, it's guaranteed that dtype can hold both the arr values
     # and the fill_value
     if out is None:
-        out_shape = list(arr.shape)
-        out_shape[axis] = len(indexer)
-        out_shape = tuple(out_shape)
+        out_shape_ = list(arr.shape)
+        out_shape_[axis] = len(indexer)
+        out_shape = tuple(out_shape_)
         if arr.flags.f_contiguous and axis == arr.ndim - 1:
             # minor tweak that can make an order-of-magnitude difference
             # for dataframes initialized directly from 2-d ndarrays