From de59af46524238f2ea7c97450fa03a7ed4a1f8fc Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 8 Mar 2022 14:50:01 -0800 Subject: [PATCH 1/2] BUG: MaskedArray._quantile match non-nullable behavior --- pandas/core/array_algos/quantile.py | 8 ++++ pandas/core/arrays/_mixins.py | 13 ++---- pandas/core/arrays/base.py | 17 +------- pandas/core/arrays/categorical.py | 10 ++++- pandas/core/arrays/masked.py | 41 ++++++++++--------- pandas/core/arrays/period.py | 15 ++++--- pandas/core/arrays/sparse/array.py | 22 +++++++++- pandas/tests/frame/methods/test_quantile.py | 13 +++++- .../tests/io/formats/style/test_highlight.py | 11 +++++ pandas/tests/series/methods/test_quantile.py | 2 + 10 files changed, 96 insertions(+), 56 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 12c6691fe6c63..11dc9bf89ab3b 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -74,6 +74,14 @@ def quantile_with_mask( Quantile is computed along axis=1. """ + assert values.shape == mask.shape + if values.ndim == 1: + # unsqueeze, operate, re-squeeze + values = np.atleast_2d(values) + mask = np.atleast_2d(mask) + res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation) + return res_values[0] + assert values.ndim == 2 is_empty = values.shape[1] == 0 diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index d86c60d78195b..c884a32ad4cec 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -476,21 +476,14 @@ def _quantile( ) -> NDArrayBackedExtensionArrayT: # TODO: disable for Categorical if not ordered? 
- # asarray needed for Sparse, see GH#24600 mask = np.asarray(self.isna()) - mask = np.atleast_2d(mask) - - arr = np.atleast_2d(self._ndarray) + arr = self._ndarray fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - res_values = self._cast_quantile_result(res_values) - result = self._from_backing_data(res_values) - if self.ndim == 1: - assert result.shape == (1, len(qs)), result.shape - result = result[0] - return result + res_values = self._cast_quantile_result(res_values) + return self._from_backing_data(res_values) # TODO: see if we can share this with other dispatch-wrapping methods def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6392f819e3acf..ffc06afd6041c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1586,25 +1586,12 @@ def _quantile( ------- same type as self """ - # asarray needed for Sparse, see GH#24600 mask = np.asarray(self.isna()) - mask = np.atleast_2d(mask) - - arr = np.atleast_2d(np.asarray(self)) + arr = np.asarray(self) fill_value = np.nan res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - if self.ndim == 2: - # i.e. 
DatetimeArray - result = type(self)._from_sequence(res_values) - - else: - # shape[0] should be 1 as long as EAs are 1D - assert res_values.shape == (1, len(qs)), res_values.shape - result = type(self)._from_sequence(res_values[0]) - - return result + return type(self)._from_sequence(res_values) def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9d649c533619e..b1c93aeaf8781 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -352,7 +352,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 - _internal_fill_value = -1 # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" @@ -477,6 +476,13 @@ def dtype(self) -> CategoricalDtype: """ return self._dtype + @property + def _internal_fill_value(self) -> int: + # using the specific numpy integer instead of python int to get + # the correct dtype back from _quantile in the all-NA case + dtype = self._ndarray.dtype + return dtype.type(-1) + @property def _constructor(self) -> type[Categorical]: return Categorical @@ -2303,7 +2309,7 @@ def _values_for_factorize(self): def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: # make sure we have correct itemsize for resulting codes - res_values = coerce_indexer_dtype(res_values, self.dtype.categories) + assert res_values.dtype == self._ndarray.dtype return res_values def equals(self, other: object) -> bool: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 95363e598a06c..e57fa5a86a8c4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -948,7 +948,7 @@ def equals(self, other) -> bool: def _quantile( self: 
BaseMaskedArrayT, qs: npt.NDArray[np.float64], interpolation: str - ) -> BaseMaskedArrayT: + ) -> BaseMaskedArray: """ Dispatch to quantile_with_mask, needed because we do not have _from_factorized. @@ -957,29 +957,30 @@ def _quantile( ----- We assume that all impacted cases are 1D-only. """ - mask = np.atleast_2d(np.asarray(self.isna())) - npvalues: np.ndarray = np.atleast_2d(np.asarray(self)) - res = quantile_with_mask( - npvalues, - mask=mask, - fill_value=self.dtype.na_value, + self._data, + mask=self._mask, + # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype) + # instead of np.nan + fill_value=np.nan, qs=qs, interpolation=interpolation, ) - assert res.ndim == 2 - assert res.shape[0] == 1 - res = res[0] - try: - out = type(self)._from_sequence(res, dtype=self.dtype) - except TypeError: - # GH#42626: not able to safely cast Int64 - # for floating point output - # error: Incompatible types in assignment (expression has type - # "ndarray[Any, dtype[floating[_64Bit]]]", variable has type - # "BaseMaskedArrayT") - out = np.asarray(res, dtype=np.float64) # type: ignore[assignment] - return out + + if self._hasna: + # Our result mask is all-False unless we are all-NA, in which + # case it is all-True. 
+ if self.ndim == 2: + # I think this should be out_mask=self.isna().all(axis=1) + # but am holding off until we have tests + raise NotImplementedError + elif self.isna().all(): + out_mask = np.ones(res.shape, dtype=bool) + else: + out_mask = np.zeros(res.shape, dtype=bool) + else: + out_mask = np.zeros(res.shape, dtype=bool) + return self._maybe_mask_result(res, mask=out_mask) # ------------------------------------------------------------------ # Reductions diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 762f681f97e6d..56ecba861a284 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -699,11 +699,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray: return result.view(self.dtype) # type: ignore[return-value] return super().fillna(value=value, method=method, limit=limit) - # TODO: alternately could override _quantile like searchsorted - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # quantile_with_mask may return float64 instead of int64, in which - # case we need to cast back - return res_values.astype(np.int64, copy=False) + def _quantile( + self: PeriodArray, + qs: npt.NDArray[np.float64], + interpolation: str, + ) -> PeriodArray: + # dispatch to DatetimeArray implementation + dtres = self.view("M8[ns]")._quantile(qs, interpolation) + # error: Incompatible return value type (got "Union[ExtensionArray, + # ndarray[Any, Any]]", expected "PeriodArray") + return dtres.view(self.dtype) # type: ignore[return-value] # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 25c4a15127200..f8d7b4dca3d34 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -75,6 +75,7 @@ from pandas.core import arraylike import pandas.core.algorithms as algos +from pandas.core.array_algos.quantile import quantile_with_mask 
from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse.dtype import SparseDtype @@ -891,10 +892,27 @@ def value_counts(self, dropna: bool = True) -> Series: return Series(counts, index=keys) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str): + + if self._null_fill_value or self.sp_index.ngaps == 0: + # We can avoid densifying + npvalues = self.sp_values + mask = np.zeros(npvalues.shape, dtype=bool) + else: + npvalues = self.to_numpy() + mask = self.isna() + + fill_value = na_value_for_dtype(npvalues.dtype, compat=False) + res_values = quantile_with_mask( + npvalues, + mask, + fill_value, + qs, + interpolation, + ) + # Special case: the returned array isn't _really_ sparse, so we don't # wrap it in a SparseArray - result = super()._quantile(qs, interpolation) - return np.asarray(result) + return res_values # -------- # Indexing diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 040b981c41593..13026a2432ee3 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -643,9 +643,14 @@ def test_quantile_ea(self, obj, index): qs = [0.5, 0, 1] result = self.compute_quantile(obj, qs) + exp_dtype = index.dtype + if index.dtype == "Int64": + # match non-nullable casting behavior + exp_dtype = "Float64" + # expected here assumes len(index) == 9 expected = Series( - [index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A" + [index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A" ) expected = type(obj)(expected) @@ -704,7 +709,11 @@ def test_quantile_ea_scalar(self, obj, index): qs = 0.5 result = self.compute_quantile(obj, qs) - expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5) + exp_dtype = index.dtype + if index.dtype == "Int64": + exp_dtype = "Float64" + + expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5) if isinstance(obj, Series): 
expected = expected["A"] assert result == expected diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py index 63138d87bc72f..418966c754e0a 100644 --- a/pandas/tests/io/formats/style/test_highlight.py +++ b/pandas/tests/io/formats/style/test_highlight.py @@ -181,9 +181,20 @@ def test_highlight_between_inclusive(styler, inclusive, expected): ) def test_highlight_quantile(styler, kwargs): expected = { + (0, 1): [("background-color", "yellow")], (2, 0): [("background-color", "yellow")], (2, 1): [("background-color", "yellow")], } + if styler.data.dtypes["B"] != "Int64": + expected.pop((0, 1)) + else: + if kwargs.get("axis", -1) is None: + expected.pop((0, 1)) + elif kwargs.get("q_left", -1) == 0: + expected.pop((0, 1)) + elif "subset" in kwargs: + expected.pop((0, 1)) + result = styler.highlight_quantile(**kwargs)._compute().ctx assert result == expected diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 84bfe8524634b..aeff5b3adfe56 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -222,4 +222,6 @@ def test_quantile_empty(self): def test_quantile_dtypes(self, dtype): result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25)) expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25)) + if dtype == "Int64": + expected = expected.astype("Float64") tm.assert_series_equal(result, expected) From fe2d38e624efcf986ad6c56a41ab37f9df03da7b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 3 May 2022 09:39:14 -0700 Subject: [PATCH 2/2] fix incorrect dtype --- pandas/core/array_algos/quantile.py | 13 ++++++++++++- pandas/tests/frame/methods/test_quantile.py | 2 ++ pandas/tests/io/formats/style/test_highlight.py | 11 ----------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 
11dc9bf89ab3b..78e12fb3995fd 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -197,7 +197,18 @@ def _nanpercentile( _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation) for (val, m) in zip(list(values), list(mask)) ] - result = np.array(result, dtype=values.dtype, copy=False).T + if values.dtype.kind == "f": + # preserve itemsize + result = np.array(result, dtype=values.dtype, copy=False).T + else: + result = np.array(result, copy=False).T + if ( + result.dtype != values.dtype + and (result == result.astype(values.dtype, copy=False)).all() + ): + # e.g. values is integer dtype and result is floating dtype, + # only cast back to integer dtype if result values are all-integer. + result = result.astype(values.dtype, copy=False) return result else: return np.percentile( diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 31362ce5d2dd5..655624a4b59ff 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -717,6 +717,8 @@ def test_quantile_ea_all_na(self, obj, index): expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value) expected = Series(expected, index=qs, name="A") + if expected.dtype == "Int64": + expected = expected.astype("Float64") expected = type(obj)(expected) tm.assert_equal(result, expected) diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py index 418966c754e0a..63138d87bc72f 100644 --- a/pandas/tests/io/formats/style/test_highlight.py +++ b/pandas/tests/io/formats/style/test_highlight.py @@ -181,20 +181,9 @@ def test_highlight_between_inclusive(styler, inclusive, expected): ) def test_highlight_quantile(styler, kwargs): expected = { - (0, 1): [("background-color", "yellow")], (2, 0): [("background-color", "yellow")], (2, 1): [("background-color", "yellow")], } - if styler.data.dtypes["B"] != "Int64": - 
expected.pop((0, 1)) - else: - if kwargs.get("axis", -1) is None: - expected.pop((0, 1)) - elif kwargs.get("q_left", -1) == 0: - expected.pop((0, 1)) - elif "subset" in kwargs: - expected.pop((0, 1)) - result = styler.highlight_quantile(**kwargs)._compute().ctx assert result == expected