REF: avoid unnecessary casting in algorithms (#41256)

jbrockmendel · web-flow · commit b4375a40dee6 · 2021-05-02T19:16:56.000-04:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -37,31 +37,26 @@
 from pandas.core.dtypes.cast import (
     construct_1d_object_array_from_listlike,
     infer_dtype_from_array,
+    sanitize_to_nanoseconds,
 )
 from pandas.core.dtypes.common import (
     ensure_float64,
-    ensure_int64,
     ensure_object,
     ensure_platform_int,
-    ensure_uint64,
     is_array_like,
     is_bool_dtype,
     is_categorical_dtype,
     is_complex_dtype,
     is_datetime64_dtype,
-    is_datetime64_ns_dtype,
     is_extension_array_dtype,
     is_float_dtype,
     is_integer,
     is_integer_dtype,
     is_list_like,
     is_numeric_dtype,
     is_object_dtype,
-    is_period_dtype,
     is_scalar,
-    is_signed_integer_dtype,
     is_timedelta64_dtype,
-    is_unsigned_integer_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )
@@ -134,71 +129,49 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
         values = extract_array(values, extract_numpy=True)
 
     # we check some simple dtypes first
-    if is_object_dtype(values):
+    if is_object_dtype(values.dtype):
         return ensure_object(np.asarray(values)), np.dtype("object")
 
-    try:
-        if is_bool_dtype(values):
-            # we are actually coercing to uint64
-            # until our algos support uint8 directly (see TODO)
-            return np.asarray(values).astype("uint64"), np.dtype("bool")
-        elif is_signed_integer_dtype(values):
-            return ensure_int64(values), values.dtype
-        elif is_unsigned_integer_dtype(values):
-            return ensure_uint64(values), values.dtype
-        elif is_float_dtype(values):
+    elif is_bool_dtype(values.dtype):
+        if isinstance(values, np.ndarray):
+            # i.e. actually dtype == np.dtype("bool")
+            return np.asarray(values).view("uint8"), values.dtype
+        else:
+            # i.e. all-bool Categorical, BooleanArray
+            return np.asarray(values).astype("uint8", copy=False), values.dtype
+
+    elif is_integer_dtype(values.dtype):
+        return np.asarray(values), values.dtype
+
+    elif is_float_dtype(values.dtype):
+        # Note: checking `values.dtype == "float128"` raises on Windows and 32-bit
+        # builds, so compare itemsize instead.  mypy error: Item "ExtensionDtype"
+        # of "Union[Any, ExtensionDtype, dtype[Any]]" has no attribute "itemsize"
+        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
+            # we don't (yet) have float128 hashtable support
             return ensure_float64(values), values.dtype
-        elif is_complex_dtype(values):
-
-            # ignore the fact that we are casting to float
-            # which discards complex parts
-            with catch_warnings():
-                simplefilter("ignore", np.ComplexWarning)
-                values = ensure_float64(values)
-            return values, np.dtype("float64")
+        return np.asarray(values), values.dtype
 
-    except (TypeError, ValueError, OverflowError):
-        # if we are trying to coerce to a dtype
-        # and it is incompatible this will fall through to here
-        return ensure_object(values), np.dtype("object")
+    elif is_complex_dtype(values.dtype):
+        # ignore the fact that we are casting to float
+        # which discards complex parts
+        with catch_warnings():
+            simplefilter("ignore", np.ComplexWarning)
+            values = ensure_float64(values)
+        return values, np.dtype("float64")
 
     # datetimelike
-    if needs_i8_conversion(values.dtype):
-        if is_period_dtype(values.dtype):
-            from pandas import PeriodIndex
-
-            values = PeriodIndex(values)._data
-        elif is_timedelta64_dtype(values.dtype):
-            from pandas import TimedeltaIndex
-
-            values = TimedeltaIndex(values)._data
-        else:
-            # Datetime
-            if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype):
-                # Avoid calling the DatetimeIndex constructor as it is 1D only
-                # Note: this is reached by DataFrame.rank calls GH#27027
-                # TODO(EA2D): special case not needed with 2D EAs
-                asi8 = values.view("i8")
-                dtype = values.dtype
-                # error: Incompatible return value type (got "Tuple[Any,
-                # Union[dtype, ExtensionDtype, None]]", expected
-                # "Tuple[ndarray, Union[dtype, ExtensionDtype]]")
-                return asi8, dtype  # type: ignore[return-value]
-
-            from pandas import DatetimeIndex
-
-            values = DatetimeIndex(values)._data
-        dtype = values.dtype
-        return values.asi8, dtype
+    elif needs_i8_conversion(values.dtype):
+        if isinstance(values, np.ndarray):
+            values = sanitize_to_nanoseconds(values)
+        npvalues = values.view("i8")
+        npvalues = cast(np.ndarray, npvalues)
+        return npvalues, values.dtype
 
     elif is_categorical_dtype(values.dtype):
         values = cast("Categorical", values)
         values = values.codes
         dtype = pandas_dtype("category")
-
-        # we are actually coercing to int64
-        # until our algos support int* directly (not all do)
-        values = ensure_int64(values)
         return values, dtype
 
     # we have failed, return object
@@ -268,8 +241,15 @@ def _ensure_arraylike(values) -> ArrayLike:
 
 _hashtables = {
     "float64": htable.Float64HashTable,
+    "float32": htable.Float32HashTable,
     "uint64": htable.UInt64HashTable,
+    "uint32": htable.UInt32HashTable,
+    "uint16": htable.UInt16HashTable,
+    "uint8": htable.UInt8HashTable,
     "int64": htable.Int64HashTable,
+    "int32": htable.Int32HashTable,
+    "int16": htable.Int16HashTable,
+    "int8": htable.Int8HashTable,
     "string": htable.StringHashTable,
     "object": htable.PyObjectHashTable,
 }
@@ -298,6 +278,10 @@ def _get_values_for_rank(values: ArrayLike) -> np.ndarray:
         values = cast("Categorical", values)._values_for_rank()
 
     values, _ = _ensure_data(values)
+    if values.dtype.kind in ["i", "u", "f"]:
+        # rank_t includes only object, int64, uint64, float64
+        dtype = values.dtype.kind + "8"
+        values = values.astype(dtype, copy=False)
     return values
 
 
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -550,7 +550,7 @@ def _from_factorized(cls, values, original):
     # Data
     # ------------------------------------------------------------------------
     @property
-    def sp_index(self):
+    def sp_index(self) -> SparseIndex:
         """
         The SparseIndex containing the location of non- ``fill_value`` points.
         """
@@ -570,7 +570,7 @@ def sp_values(self) -> np.ndarray:
         return self._sparse_values
 
     @property
-    def dtype(self):
+    def dtype(self) -> SparseDtype:
         return self._dtype
 
     @property
@@ -597,7 +597,7 @@ def kind(self) -> str:
             return "block"
 
     @property
-    def _valid_sp_values(self):
+    def _valid_sp_values(self) -> np.ndarray:
         sp_vals = self.sp_values
         mask = notna(sp_vals)
         return sp_vals[mask]
@@ -620,7 +620,7 @@ def nbytes(self) -> int:
         return self.sp_values.nbytes + self.sp_index.nbytes
 
     @property
-    def density(self):
+    def density(self) -> float:
         """
         The percent of non- ``fill_value`` points, as decimal.
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -1756,14 +1756,15 @@ def _check(arr):
         _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan]))
         _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan]))
 
-    def test_basic(self, writable):
+    @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
+    def test_basic(self, writable, dtype):
         exp = np.array([1, 2], dtype=np.float64)
 
-        for dtype in np.typecodes["AllInteger"]:
-            data = np.array([1, 100], dtype=dtype)
-            data.setflags(write=writable)
-            s = Series(data)
-            tm.assert_numpy_array_equal(algos.rank(s), exp)
+        data = np.array([1, 100], dtype=dtype)
+        data.setflags(write=writable)
+        ser = Series(data)
+        result = algos.rank(ser)
+        tm.assert_numpy_array_equal(result, exp)
 
     def test_uint64_overflow(self):
         exp = np.array([1, 2], dtype=np.float64)