diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 12c6691fe6c63..78e12fb3995fd 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -74,6 +74,14 @@ def quantile_with_mask( Quantile is computed along axis=1. """ + assert values.shape == mask.shape + if values.ndim == 1: + # unsqueeze, operate, re-squeeze + values = np.atleast_2d(values) + mask = np.atleast_2d(mask) + res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation) + return res_values[0] + assert values.ndim == 2 is_empty = values.shape[1] == 0 @@ -189,7 +197,18 @@ def _nanpercentile( _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation) for (val, m) in zip(list(values), list(mask)) ] - result = np.array(result, dtype=values.dtype, copy=False).T + if values.dtype.kind == "f": + # preserve itemsize + result = np.array(result, dtype=values.dtype, copy=False).T + else: + result = np.array(result, copy=False).T + if ( + result.dtype != values.dtype + and (result == result.astype(values.dtype, copy=False)).all() + ): + # e.g. values is integer dtype and result is floating dtype, + # only cast back to integer dtype if result values are all-integer. + result = result.astype(values.dtype, copy=False) return result else: return np.percentile( diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 4563759c63a36..b15e0624963ea 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -473,21 +473,14 @@ def _quantile( ) -> NDArrayBackedExtensionArrayT: # TODO: disable for Categorical if not ordered? 
- # asarray needed for Sparse, see GH#24600 mask = np.asarray(self.isna()) - mask = np.atleast_2d(mask) - - arr = np.atleast_2d(self._ndarray) + arr = self._ndarray fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - res_values = self._cast_quantile_result(res_values) - result = self._from_backing_data(res_values) - if self.ndim == 1: - assert result.shape == (1, len(qs)), result.shape - result = result[0] - return result + res_values = self._cast_quantile_result(res_values) + return self._from_backing_data(res_values) # TODO: see if we can share this with other dispatch-wrapping methods def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a188692a2d8f7..eb3c6d6d26101 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1586,25 +1586,12 @@ def _quantile( ------- same type as self """ - # asarray needed for Sparse, see GH#24600 mask = np.asarray(self.isna()) - mask = np.atleast_2d(mask) - - arr = np.atleast_2d(np.asarray(self)) + arr = np.asarray(self) fill_value = np.nan res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - if self.ndim == 2: - # i.e. 
DatetimeArray - result = type(self)._from_sequence(res_values) - - else: - # shape[0] should be 1 as long as EAs are 1D - assert res_values.shape == (1, len(qs)), res_values.shape - result = type(self)._from_sequence(res_values[0]) - - return result + return type(self)._from_sequence(res_values) def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 01a04b7aa63d9..c1f5fadd1de3c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -351,7 +351,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 - _internal_fill_value = -1 # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" @@ -476,6 +475,13 @@ def dtype(self) -> CategoricalDtype: """ return self._dtype + @property + def _internal_fill_value(self) -> int: + # using the specific numpy integer instead of python int to get + # the correct dtype back from _quantile in the all-NA case + dtype = self._ndarray.dtype + return dtype.type(-1) + @property def _constructor(self) -> type[Categorical]: return Categorical @@ -2300,7 +2306,7 @@ def unique(self): def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: # make sure we have correct itemsize for resulting codes - res_values = coerce_indexer_dtype(res_values, self.dtype.categories) + assert res_values.dtype == self._ndarray.dtype return res_values def equals(self, other: object) -> bool: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 90f56a3eea0fb..a7bb9520841b6 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -958,7 +958,7 @@ def equals(self, other) -> bool: def _quantile( self: BaseMaskedArrayT, 
qs: npt.NDArray[np.float64], interpolation: str - ) -> BaseMaskedArrayT: + ) -> BaseMaskedArray: """ Dispatch to quantile_with_mask, needed because we do not have _from_factorized. @@ -967,29 +967,30 @@ def _quantile( ----- We assume that all impacted cases are 1D-only. """ - mask = np.atleast_2d(np.asarray(self.isna())) - npvalues: np.ndarray = np.atleast_2d(np.asarray(self)) - res = quantile_with_mask( - npvalues, - mask=mask, - fill_value=self.dtype.na_value, + self._data, + mask=self._mask, + # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype) + # instead of np.nan + fill_value=np.nan, qs=qs, interpolation=interpolation, ) - assert res.ndim == 2 - assert res.shape[0] == 1 - res = res[0] - try: - out = type(self)._from_sequence(res, dtype=self.dtype) - except TypeError: - # GH#42626: not able to safely cast Int64 - # for floating point output - # error: Incompatible types in assignment (expression has type - # "ndarray[Any, dtype[floating[_64Bit]]]", variable has type - # "BaseMaskedArrayT") - out = np.asarray(res, dtype=np.float64) # type: ignore[assignment] - return out + + if self._hasna: + # Our result mask is all-False unless we are all-NA, in which + # case it is all-True. 
+ if self.ndim == 2: + # I think this should be out_mask=self.isna().all(axis=1) + # but am holding off until we have tests + raise NotImplementedError + elif self.isna().all(): + out_mask = np.ones(res.shape, dtype=bool) + else: + out_mask = np.zeros(res.shape, dtype=bool) + else: + out_mask = np.zeros(res.shape, dtype=bool) + return self._maybe_mask_result(res, mask=out_mask) # ------------------------------------------------------------------ # Reductions diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 065e597537be9..f313d0d67c344 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -698,11 +698,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray: return result.view(self.dtype) # type: ignore[return-value] return super().fillna(value=value, method=method, limit=limit) - # TODO: alternately could override _quantile like searchsorted - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # quantile_with_mask may return float64 instead of int64, in which - # case we need to cast back - return res_values.astype(np.int64, copy=False) + def _quantile( + self: PeriodArray, + qs: npt.NDArray[np.float64], + interpolation: str, + ) -> PeriodArray: + # dispatch to DatetimeArray implementation + dtres = self.view("M8[ns]")._quantile(qs, interpolation) + # error: Incompatible return value type (got "Union[ExtensionArray, + # ndarray[Any, Any]]", expected "PeriodArray") + return dtres.view(self.dtype) # type: ignore[return-value] # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c2a13961687fd..0e340bd7a7e97 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -75,6 +75,7 @@ from pandas.core import arraylike import pandas.core.algorithms as algos +from pandas.core.array_algos.quantile import quantile_with_mask 
from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse.dtype import SparseDtype @@ -890,10 +891,27 @@ def value_counts(self, dropna: bool = True) -> Series: return Series(counts, index=keys) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str): + + if self._null_fill_value or self.sp_index.ngaps == 0: + # We can avoid densifying + npvalues = self.sp_values + mask = np.zeros(npvalues.shape, dtype=bool) + else: + npvalues = self.to_numpy() + mask = self.isna() + + fill_value = na_value_for_dtype(npvalues.dtype, compat=False) + res_values = quantile_with_mask( + npvalues, + mask, + fill_value, + qs, + interpolation, + ) + # Special case: the returned array isn't _really_ sparse, so we don't # wrap it in a SparseArray - result = super()._quantile(qs, interpolation) - return np.asarray(result) + return res_values # -------- # Indexing diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 20f190fcdfd4d..655624a4b59ff 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -665,9 +665,14 @@ def test_quantile_ea(self, obj, index): qs = [0.5, 0, 1] result = self.compute_quantile(obj, qs) + exp_dtype = index.dtype + if index.dtype == "Int64": + # match non-nullable casting behavior + exp_dtype = "Float64" + # expected here assumes len(index) == 9 expected = Series( - [index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A" + [index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A" ) expected = type(obj)(expected) @@ -712,6 +717,8 @@ def test_quantile_ea_all_na(self, obj, index): expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value) expected = Series(expected, index=qs, name="A") + if expected.dtype == "Int64": + expected = expected.astype("Float64") expected = type(obj)(expected) tm.assert_equal(result, expected) @@ -726,7 +733,11 @@ 
def test_quantile_ea_scalar(self, obj, index): qs = 0.5 result = self.compute_quantile(obj, qs) - expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5) + exp_dtype = index.dtype + if index.dtype == "Int64": + exp_dtype = "Float64" + + expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5) if isinstance(obj, Series): expected = expected["A"] assert result == expected diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 84bfe8524634b..aeff5b3adfe56 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -222,4 +222,6 @@ def test_quantile_empty(self): def test_quantile_dtypes(self, dtype): result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25)) expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25)) + if dtype == "Int64": + expected = expected.astype("Float64") tm.assert_series_equal(result, expected)