BUG: MaskedArray._quantile match non-nullable behavior (#46282)

jbrockmendel · web-flow · commit 6edb64c9ed6f · 2022-05-09T08:54:25.000-04:00
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
@@ -74,6 +74,14 @@ def quantile_with_mask(
 
     Quantile is computed along axis=1.
     """
+    assert values.shape == mask.shape
+    if values.ndim == 1:
+        # unsqueeze, operate, re-squeeze
+        values = np.atleast_2d(values)
+        mask = np.atleast_2d(mask)
+        res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
+        return res_values[0]
+
     assert values.ndim == 2
 
     is_empty = values.shape[1] == 0
@@ -189,7 +197,18 @@ def _nanpercentile(
             _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
             for (val, m) in zip(list(values), list(mask))
         ]
-        result = np.array(result, dtype=values.dtype, copy=False).T
+        if values.dtype.kind == "f":
+            # preserve itemsize
+            result = np.array(result, dtype=values.dtype, copy=False).T
+        else:
+            result = np.array(result, copy=False).T
+            if (
+                result.dtype != values.dtype
+                and (result == result.astype(values.dtype, copy=False)).all()
+            ):
+                # e.g. values id integer dtype and result is floating dtype,
+                #  only cast back to integer dtype if result values are all-integer.
+                result = result.astype(values.dtype, copy=False)
         return result
     else:
         return np.percentile(
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
@@ -473,21 +473,14 @@ def _quantile(
     ) -> NDArrayBackedExtensionArrayT:
         # TODO: disable for Categorical if not ordered?
 
-        # asarray needed for Sparse, see GH#24600
         mask = np.asarray(self.isna())
-        mask = np.atleast_2d(mask)
-
-        arr = np.atleast_2d(self._ndarray)
+        arr = self._ndarray
         fill_value = self._internal_fill_value
 
         res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-        res_values = self._cast_quantile_result(res_values)
-        result = self._from_backing_data(res_values)
-        if self.ndim == 1:
-            assert result.shape == (1, len(qs)), result.shape
-            result = result[0]
 
-        return result
+        res_values = self._cast_quantile_result(res_values)
+        return self._from_backing_data(res_values)
 
     # TODO: see if we can share this with other dispatch-wrapping methods
     def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1586,25 +1586,12 @@ def _quantile(
         -------
         same type as self
         """
-        # asarray needed for Sparse, see GH#24600
         mask = np.asarray(self.isna())
-        mask = np.atleast_2d(mask)
-
-        arr = np.atleast_2d(np.asarray(self))
+        arr = np.asarray(self)
         fill_value = np.nan
 
         res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-
-        if self.ndim == 2:
-            # i.e. DatetimeArray
-            result = type(self)._from_sequence(res_values)
-
-        else:
-            # shape[0] should be 1 as long as EAs are 1D
-            assert res_values.shape == (1, len(qs)), res_values.shape
-            result = type(self)._from_sequence(res_values[0])
-
-        return result
+        return type(self)._from_sequence(res_values)
 
     def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT:
         """
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -351,7 +351,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
     # For comparisons, so that numpy uses our implementation if the compare
     # ops, which raise
     __array_priority__ = 1000
-    _internal_fill_value = -1
     # tolist is not actually deprecated, just suppressed in the __dir__
     _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
     _typ = "categorical"
@@ -476,6 +475,13 @@ def dtype(self) -> CategoricalDtype:
         """
         return self._dtype
 
+    @property
+    def _internal_fill_value(self) -> int:
+        # using the specific numpy integer instead of python int to get
+        #  the correct dtype back from _quantile in the all-NA case
+        dtype = self._ndarray.dtype
+        return dtype.type(-1)
+
     @property
     def _constructor(self) -> type[Categorical]:
         return Categorical
@@ -2300,7 +2306,7 @@ def unique(self):
 
     def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
         # make sure we have correct itemsize for resulting codes
-        res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
+        assert res_values.dtype == self._ndarray.dtype
         return res_values
 
     def equals(self, other: object) -> bool:
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -958,7 +958,7 @@ def equals(self, other) -> bool:
 
     def _quantile(
         self: BaseMaskedArrayT, qs: npt.NDArray[np.float64], interpolation: str
-    ) -> BaseMaskedArrayT:
+    ) -> BaseMaskedArray:
         """
         Dispatch to quantile_with_mask, needed because we do not have
         _from_factorized.
@@ -967,29 +967,30 @@ def _quantile(
         -----
         We assume that all impacted cases are 1D-only.
         """
-        mask = np.atleast_2d(np.asarray(self.isna()))
-        npvalues: np.ndarray = np.atleast_2d(np.asarray(self))
-
         res = quantile_with_mask(
-            npvalues,
-            mask=mask,
-            fill_value=self.dtype.na_value,
+            self._data,
+            mask=self._mask,
+            # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
+            #  instead of np.nan
+            fill_value=np.nan,
             qs=qs,
             interpolation=interpolation,
         )
-        assert res.ndim == 2
-        assert res.shape[0] == 1
-        res = res[0]
-        try:
-            out = type(self)._from_sequence(res, dtype=self.dtype)
-        except TypeError:
-            # GH#42626: not able to safely cast Int64
-            # for floating point output
-            # error: Incompatible types in assignment (expression has type
-            # "ndarray[Any, dtype[floating[_64Bit]]]", variable has type
-            # "BaseMaskedArrayT")
-            out = np.asarray(res, dtype=np.float64)  # type: ignore[assignment]
-        return out
+
+        if self._hasna:
+            # Our result mask is all-False unless we are all-NA, in which
+            #  case it is all-True.
+            if self.ndim == 2:
+                # I think this should be out_mask=self.isna().all(axis=1)
+                #  but am holding off until we have tests
+                raise NotImplementedError
+            elif self.isna().all():
+                out_mask = np.ones(res.shape, dtype=bool)
+            else:
+                out_mask = np.zeros(res.shape, dtype=bool)
+        else:
+            out_mask = np.zeros(res.shape, dtype=bool)
+        return self._maybe_mask_result(res, mask=out_mask)
 
     # ------------------------------------------------------------------
     # Reductions
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -698,11 +698,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
             return result.view(self.dtype)  # type: ignore[return-value]
         return super().fillna(value=value, method=method, limit=limit)
 
-    # TODO: alternately could override _quantile like searchsorted
-    def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
-        # quantile_with_mask may return float64 instead of int64, in which
-        #  case we need to cast back
-        return res_values.astype(np.int64, copy=False)
+    def _quantile(
+        self: PeriodArray,
+        qs: npt.NDArray[np.float64],
+        interpolation: str,
+    ) -> PeriodArray:
+        # dispatch to DatetimeArray implementation
+        dtres = self.view("M8[ns]")._quantile(qs, interpolation)
+        # error: Incompatible return value type (got "Union[ExtensionArray,
+        # ndarray[Any, Any]]", expected "PeriodArray")
+        return dtres.view(self.dtype)  # type: ignore[return-value]
 
     # ------------------------------------------------------------------
     # Arithmetic Methods
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -75,6 +75,7 @@
 
 from pandas.core import arraylike
 import pandas.core.algorithms as algos
+from pandas.core.array_algos.quantile import quantile_with_mask
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse.dtype import SparseDtype
@@ -890,10 +891,27 @@ def value_counts(self, dropna: bool = True) -> Series:
         return Series(counts, index=keys)
 
     def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):
+
+        if self._null_fill_value or self.sp_index.ngaps == 0:
+            # We can avoid densifying
+            npvalues = self.sp_values
+            mask = np.zeros(npvalues.shape, dtype=bool)
+        else:
+            npvalues = self.to_numpy()
+            mask = self.isna()
+
+        fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
+        res_values = quantile_with_mask(
+            npvalues,
+            mask,
+            fill_value,
+            qs,
+            interpolation,
+        )
+
         # Special case: the returned array isn't _really_ sparse, so we don't
         #  wrap it in a SparseArray
-        result = super()._quantile(qs, interpolation)
-        return np.asarray(result)
+        return res_values
 
     # --------
     # Indexing
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -665,9 +665,14 @@ def test_quantile_ea(self, obj, index):
         qs = [0.5, 0, 1]
         result = self.compute_quantile(obj, qs)
 
+        exp_dtype = index.dtype
+        if index.dtype == "Int64":
+            # match non-nullable casting behavior
+            exp_dtype = "Float64"
+
         # expected here assumes len(index) == 9
         expected = Series(
-            [index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A"
+            [index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A"
         )
         expected = type(obj)(expected)
 
@@ -712,6 +717,8 @@ def test_quantile_ea_all_na(self, obj, index):
 
         expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
         expected = Series(expected, index=qs, name="A")
+        if expected.dtype == "Int64":
+            expected = expected.astype("Float64")
         expected = type(obj)(expected)
         tm.assert_equal(result, expected)
 
@@ -726,7 +733,11 @@ def test_quantile_ea_scalar(self, obj, index):
         qs = 0.5
         result = self.compute_quantile(obj, qs)
 
-        expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5)
+        exp_dtype = index.dtype
+        if index.dtype == "Int64":
+            exp_dtype = "Float64"
+
+        expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
         if isinstance(obj, Series):
             expected = expected["A"]
             assert result == expected
diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py
@@ -222,4 +222,6 @@ def test_quantile_empty(self):
     def test_quantile_dtypes(self, dtype):
         result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25))
         expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25))
+        if dtype == "Int64":
+            expected = expected.astype("Float64")
         tm.assert_series_equal(result, expected)