CLN: simplify core.algorithms (pandas-dev#29199)

jbrockmendel · proost · commit 4084eb3d6b5e · 2019-12-20T01:10:56.000+09:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -47,7 +47,7 @@
 from pandas.core.dtypes.missing import isna, na_value_for_dtype
 
 from pandas.core import common as com
-from pandas.core.construction import array
+from pandas.core.construction import array, extract_array
 from pandas.core.indexers import validate_indices
 
 _shared_docs = {}  # type: Dict[str, str]
@@ -82,9 +82,12 @@ def _ensure_data(values, dtype=None):
     """
 
     # we check some simple dtypes first
+    if is_object_dtype(dtype):
+        return ensure_object(np.asarray(values)), "object", "object"
+    elif is_object_dtype(values) and dtype is None:
+        return ensure_object(np.asarray(values)), "object", "object"
+
     try:
-        if is_object_dtype(dtype):
-            return ensure_object(np.asarray(values)), "object", "object"
         if is_bool_dtype(values) or is_bool_dtype(dtype):
             # we are actually coercing to uint64
             # until our algos support uint8 directly (see TODO)
@@ -95,8 +98,6 @@ def _ensure_data(values, dtype=None):
             return ensure_uint64(values), "uint64", "uint64"
         elif is_float_dtype(values) or is_float_dtype(dtype):
             return ensure_float64(values), "float64", "float64"
-        elif is_object_dtype(values) and dtype is None:
-            return ensure_object(np.asarray(values)), "object", "object"
         elif is_complex_dtype(values) or is_complex_dtype(dtype):
 
             # ignore the fact that we are casting to float
@@ -207,11 +208,11 @@ def _ensure_arraylike(values):
 
 
 _hashtables = {
-    "float64": (htable.Float64HashTable, htable.Float64Vector),
-    "uint64": (htable.UInt64HashTable, htable.UInt64Vector),
-    "int64": (htable.Int64HashTable, htable.Int64Vector),
-    "string": (htable.StringHashTable, htable.ObjectVector),
-    "object": (htable.PyObjectHashTable, htable.ObjectVector),
+    "float64": htable.Float64HashTable,
+    "uint64": htable.UInt64HashTable,
+    "int64": htable.Int64HashTable,
+    "string": htable.StringHashTable,
+    "object": htable.PyObjectHashTable,
 }
 
 
@@ -223,11 +224,9 @@ def _get_hashtable_algo(values):
 
     Returns
     -------
-    tuples(hashtable class,
-           vector class,
-           values,
-           dtype,
-           ndtype)
+    htable : HashTable subclass
+    values : ndarray
+    dtype : str or dtype
     """
     values, dtype, ndtype = _ensure_data(values)
 
@@ -238,23 +237,21 @@ def _get_hashtable_algo(values):
         # StringHashTable and ObjectHashtable
         if lib.infer_dtype(values, skipna=False) in ["string"]:
             ndtype = "string"
-        else:
-            ndtype = "object"
 
-    htable, table = _hashtables[ndtype]
-    return (htable, table, values, dtype, ndtype)
+    htable = _hashtables[ndtype]
+    return htable, values, dtype
 
 
 def _get_values_for_rank(values):
     if is_categorical_dtype(values):
         values = values._values_for_rank()
 
-    values, dtype, ndtype = _ensure_data(values)
-    return values, dtype, ndtype
+    values, _, ndtype = _ensure_data(values)
+    return values, ndtype
 
 
-def _get_data_algo(values, func_map):
-    values, dtype, ndtype = _get_values_for_rank(values)
+def _get_data_algo(values):
+    values, ndtype = _get_values_for_rank(values)
 
     if ndtype == "object":
 
@@ -264,7 +261,7 @@ def _get_data_algo(values, func_map):
         if lib.infer_dtype(values, skipna=False) in ["string"]:
             ndtype = "string"
 
-    f = func_map.get(ndtype, func_map["object"])
+    f = _hashtables.get(ndtype, _hashtables["object"])
 
     return f, values
 
@@ -295,7 +292,7 @@ def match(to_match, values, na_sentinel=-1):
     match : ndarray of integers
     """
     values = com.asarray_tuplesafe(values)
-    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
+    htable, values, dtype = _get_hashtable_algo(values)
     to_match, _, _ = _ensure_data(to_match, dtype)
     table = htable(min(len(to_match), 1000000))
     table.map_locations(values)
@@ -398,7 +395,7 @@ def unique(values):
         return values.unique()
 
     original = values
-    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
+    htable, values, _ = _get_hashtable_algo(values)
 
     table = htable(len(values))
     uniques = table.unique(values)
@@ -480,7 +477,8 @@ def isin(comps, values):
 
 
 def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):
-    """Factorize an array-like to labels and uniques.
+    """
+    Factorize an array-like to labels and uniques.
 
     This doesn't do any coercion of types or unboxing before factorization.
 
@@ -498,9 +496,10 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):
 
     Returns
     -------
-    labels, uniques : ndarray
+    labels : ndarray
+    uniques : ndarray
     """
-    (hash_klass, _), values = _get_data_algo(values, _hashtables)
+    hash_klass, values = _get_data_algo(values)
 
     table = hash_klass(size_hint or len(values))
     uniques, labels = table.factorize(
@@ -652,17 +651,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     original = values
 
     if is_extension_array_dtype(values):
-        values = getattr(values, "_values", values)
+        values = extract_array(values)
         labels, uniques = values.factorize(na_sentinel=na_sentinel)
         dtype = original.dtype
     else:
         values, dtype, _ = _ensure_data(values)
 
-        if (
-            is_datetime64_any_dtype(original)
-            or is_timedelta64_dtype(original)
-            or is_period_dtype(original)
-        ):
+        if original.dtype.kind in ["m", "M"]:
             na_value = na_value_for_dtype(original.dtype)
         else:
             na_value = None
@@ -835,7 +830,7 @@ def duplicated(values, keep="first"):
     duplicated : ndarray
     """
 
-    values, dtype, ndtype = _ensure_data(values)
+    values, _, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
     return f(values, keep=keep)
 
@@ -872,7 +867,7 @@ def mode(values, dropna: bool = True):
         mask = values.isnull()
         values = values[~mask]
 
-    values, dtype, ndtype = _ensure_data(values)
+    values, _, ndtype = _ensure_data(values)
 
     f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
     result = f(values, dropna=dropna)
@@ -910,7 +905,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
         (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
     """
     if values.ndim == 1:
-        values, _, _ = _get_values_for_rank(values)
+        values, _ = _get_values_for_rank(values)
         ranks = algos.rank_1d(
             values,
             ties_method=method,
@@ -919,7 +914,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
             pct=pct,
         )
     elif values.ndim == 2:
-        values, _, _ = _get_values_for_rank(values)
+        values, _ = _get_values_for_rank(values)
         ranks = algos.rank_2d(
             values,
             axis=axis,
@@ -1634,9 +1629,7 @@ def take_nd(
     if is_extension_array_dtype(arr):
         return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
 
-    if isinstance(arr, (ABCIndexClass, ABCSeries)):
-        arr = arr._values
-
+    arr = extract_array(arr)
     arr = np.asarray(arr)
 
     if indexer is None:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -47,14 +47,7 @@
 from pandas.core import ops
 from pandas.core.accessor import PandasDelegate, delegate_names
 import pandas.core.algorithms as algorithms
-from pandas.core.algorithms import (
-    _get_data_algo,
-    _hashtables,
-    factorize,
-    take,
-    take_1d,
-    unique1d,
-)
+from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
 from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
@@ -2097,7 +2090,6 @@ def __setitem__(self, key, value):
         """
         Item assignment.
 
-
         Raises
         ------
         ValueError
@@ -2631,8 +2623,8 @@ def _get_codes_for_values(values, categories):
         values = ensure_object(values)
         categories = ensure_object(categories)
 
-    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
-    (_, _), cats = _get_data_algo(categories, _hashtables)
+    hash_klass, vals = _get_data_algo(values)
+    _, cats = _get_data_algo(categories)
     t = hash_klass(len(cats))
     t.map_locations(cats)
     return coerce_indexer_dtype(t.lookup(vals), cats)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -21,7 +21,6 @@
     ensure_str,
     is_bool,
     is_bool_dtype,
-    is_categorical_dtype,
     is_complex,
     is_complex_dtype,
     is_datetime64_dtype,
@@ -1325,14 +1324,10 @@ def construct_1d_arraylike_from_scalar(value, length, dtype):
     np.ndarray / pandas type of length, filled with value
 
     """
-    if is_datetime64tz_dtype(dtype):
-        from pandas import DatetimeIndex
-
-        subarr = DatetimeIndex([value] * length, dtype=dtype)
-    elif is_categorical_dtype(dtype):
-        from pandas import Categorical
+    if is_extension_array_dtype(dtype):
+        cls = dtype.construct_array_type()
+        subarr = cls._from_sequence([value] * length, dtype=dtype)
 
-        subarr = Categorical([value] * length, dtype=dtype)
     else:
         if not isinstance(dtype, (np.dtype, type(np.dtype))):
             dtype = dtype.dtype
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -484,9 +484,7 @@ def sort_mixed(values):
 
     if sorter is None:
         # mixed types
-        (hash_klass, _), values = algorithms._get_data_algo(
-            values, algorithms._hashtables
-        )
+        hash_klass, values = algorithms._get_data_algo(values)
         t = hash_klass(len(values))
         t.map_locations(values)
         sorter = ensure_platform_int(t.lookup(ordered))