diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 16ec2bb5f253c..2c4477056a112 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -37,19 +37,17 @@ from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, infer_dtype_from_array, + sanitize_to_nanoseconds, ) from pandas.core.dtypes.common import ( ensure_float64, - ensure_int64, ensure_object, ensure_platform_int, - ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_dtype, - is_datetime64_ns_dtype, is_extension_array_dtype, is_float_dtype, is_integer, @@ -57,11 +55,8 @@ is_list_like, is_numeric_dtype, is_object_dtype, - is_period_dtype, is_scalar, - is_signed_integer_dtype, is_timedelta64_dtype, - is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) @@ -134,71 +129,49 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: values = extract_array(values, extract_numpy=True) # we check some simple dtypes first - if is_object_dtype(values): + if is_object_dtype(values.dtype): return ensure_object(np.asarray(values)), np.dtype("object") - try: - if is_bool_dtype(values): - # we are actually coercing to uint64 - # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype("uint64"), np.dtype("bool") - elif is_signed_integer_dtype(values): - return ensure_int64(values), values.dtype - elif is_unsigned_integer_dtype(values): - return ensure_uint64(values), values.dtype - elif is_float_dtype(values): + elif is_bool_dtype(values.dtype): + if isinstance(values, np.ndarray): + # i.e. actually dtype == np.dtype("bool") + return np.asarray(values).view("uint8"), values.dtype + else: + # i.e. all-bool Categorical, BooleanArray + return np.asarray(values).astype("uint8", copy=False), values.dtype + + elif is_integer_dtype(values.dtype): + return np.asarray(values), values.dtype + + elif is_float_dtype(values.dtype): + # Note: checking `values.dtype == "float128"` raises on Windows and 32bit + # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]" + # has no attribute "itemsize" + if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr] + # we dont (yet) have float128 hashtable support return ensure_float64(values), values.dtype - elif is_complex_dtype(values): - - # ignore the fact that we are casting to float - # which discards complex parts - with catch_warnings(): - simplefilter("ignore", np.ComplexWarning) - values = ensure_float64(values) - return values, np.dtype("float64") + return np.asarray(values), values.dtype - except (TypeError, ValueError, OverflowError): - # if we are trying to coerce to a dtype - # and it is incompatible this will fall through to here - return ensure_object(values), np.dtype("object") + elif is_complex_dtype(values.dtype): + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(): + simplefilter("ignore", np.ComplexWarning) + values = ensure_float64(values) + return values, np.dtype("float64") # datetimelike - if needs_i8_conversion(values.dtype): - if is_period_dtype(values.dtype): - from pandas import PeriodIndex - - values = PeriodIndex(values)._data - elif is_timedelta64_dtype(values.dtype): - from pandas import TimedeltaIndex - - values = TimedeltaIndex(values)._data - else: - # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): - # Avoid calling the DatetimeIndex constructor as it is 1D only - # Note: this is reached by DataFrame.rank calls GH#27027 - # TODO(EA2D): special case not needed with 2D EAs - asi8 = values.view("i8") - dtype = values.dtype - # error: Incompatible return value type (got "Tuple[Any, - # Union[dtype, ExtensionDtype, None]]", expected - # "Tuple[ndarray, Union[dtype, ExtensionDtype]]") - return asi8, dtype # type: ignore[return-value] - - from pandas import DatetimeIndex - - values = DatetimeIndex(values)._data - dtype = values.dtype - return values.asi8, dtype + elif needs_i8_conversion(values.dtype): + if isinstance(values, np.ndarray): + values = sanitize_to_nanoseconds(values) + npvalues = values.view("i8") + npvalues = cast(np.ndarray, npvalues) + return npvalues, values.dtype elif is_categorical_dtype(values.dtype): values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") - - # we are actually coercing to int64 - # until our algos support int* directly (not all do) - values = ensure_int64(values) return values, dtype # we have failed, return object @@ -268,8 +241,15 @@ def _ensure_arraylike(values) -> ArrayLike: _hashtables = { "float64": htable.Float64HashTable, + "float32": htable.Float32HashTable, "uint64": htable.UInt64HashTable, + "uint32": htable.UInt32HashTable, + "uint16": htable.UInt16HashTable, + "uint8": htable.UInt8HashTable, "int64": htable.Int64HashTable, + "int32": htable.Int32HashTable, + "int16": htable.Int16HashTable, + "int8": htable.Int8HashTable, "string": htable.StringHashTable, "object": htable.PyObjectHashTable, } @@ -298,6 +278,10 @@ def _get_values_for_rank(values: ArrayLike) -> np.ndarray: values = cast("Categorical", values)._values_for_rank() values, _ = _ensure_data(values) + if values.dtype.kind in ["i", "u", "f"]: + # rank_t includes only object, int64, uint64, float64 + dtype = values.dtype.kind + "8" + values = values.astype(dtype, copy=False) return values diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 091efa68c67da..4847372f18239 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -550,7 +550,7 @@ def _from_factorized(cls, values, original): # Data # ------------------------------------------------------------------------ @property - def sp_index(self): + def sp_index(self) -> SparseIndex: """ The SparseIndex containing the location of non- ``fill_value`` points. """ @@ -570,7 +570,7 @@ def sp_values(self) -> np.ndarray: return self._sparse_values @property - def dtype(self): + def dtype(self) -> SparseDtype: return self._dtype @property @@ -597,7 +597,7 @@ def kind(self) -> str: return "block" @property - def _valid_sp_values(self): + def _valid_sp_values(self) -> np.ndarray: sp_vals = self.sp_values mask = notna(sp_vals) return sp_vals[mask] @@ -620,7 +620,7 @@ def nbytes(self) -> int: return self.sp_values.nbytes + self.sp_index.nbytes @property - def density(self): + def density(self) -> float: """ The percent of non- ``fill_value`` points, as decimal. diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c7af104f62770..964dd9bdd0e0a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1756,14 +1756,15 @@ def _check(arr): _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan])) _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan])) - def test_basic(self, writable): + @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) + def test_basic(self, writable, dtype): exp = np.array([1, 2], dtype=np.float64) - for dtype in np.typecodes["AllInteger"]: - data = np.array([1, 100], dtype=dtype) - data.setflags(write=writable) - s = Series(data) - tm.assert_numpy_array_equal(algos.rank(s), exp) + data = np.array([1, 100], dtype=dtype) + data.setflags(write=writable) + ser = Series(data) + result = algos.rank(ser) + tm.assert_numpy_array_equal(result, exp) def test_uint64_overflow(self): exp = np.array([1, 2], dtype=np.float64)