From de59af46524238f2ea7c97450fa03a7ed4a1f8fc Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 8 Mar 2022 14:50:01 -0800
Subject: [PATCH 1/2] BUG: MaskedArray._quantile match non-nullable behavior

---
 pandas/core/array_algos/quantile.py           |  8 ++++
 pandas/core/arrays/_mixins.py                 | 13 ++----
 pandas/core/arrays/base.py                    | 17 +-------
 pandas/core/arrays/categorical.py             | 10 ++++-
 pandas/core/arrays/masked.py                  | 41 ++++++++++---------
 pandas/core/arrays/period.py                  | 15 ++++---
 pandas/core/arrays/sparse/array.py            | 22 +++++++++-
 pandas/tests/frame/methods/test_quantile.py   | 13 +++++-
 .../tests/io/formats/style/test_highlight.py  | 11 +++++
 pandas/tests/series/methods/test_quantile.py  |  2 +
 10 files changed, 96 insertions(+), 56 deletions(-)

diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
index 12c6691fe6c63..11dc9bf89ab3b 100644
--- a/pandas/core/array_algos/quantile.py
+++ b/pandas/core/array_algos/quantile.py
@@ -74,6 +74,14 @@ def quantile_with_mask(
 
     Quantile is computed along axis=1.
     """
+    assert values.shape == mask.shape
+    if values.ndim == 1:
+        # unsqueeze, operate, re-squeeze
+        values = np.atleast_2d(values)
+        mask = np.atleast_2d(mask)
+        res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
+        return res_values[0]
+
     assert values.ndim == 2
 
     is_empty = values.shape[1] == 0
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index d86c60d78195b..c884a32ad4cec 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -476,21 +476,14 @@ def _quantile(
     ) -> NDArrayBackedExtensionArrayT:
         # TODO: disable for Categorical if not ordered?
 
-        # asarray needed for Sparse, see GH#24600
         mask = np.asarray(self.isna())
-        mask = np.atleast_2d(mask)
-
-        arr = np.atleast_2d(self._ndarray)
+        arr = self._ndarray
         fill_value = self._internal_fill_value
 
         res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-        res_values = self._cast_quantile_result(res_values)
-        result = self._from_backing_data(res_values)
-        if self.ndim == 1:
-            assert result.shape == (1, len(qs)), result.shape
-            result = result[0]
 
-        return result
+        res_values = self._cast_quantile_result(res_values)
+        return self._from_backing_data(res_values)
 
     # TODO: see if we can share this with other dispatch-wrapping methods
     def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 6392f819e3acf..ffc06afd6041c 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1586,25 +1586,12 @@ def _quantile(
         -------
         same type as self
         """
-        # asarray needed for Sparse, see GH#24600
         mask = np.asarray(self.isna())
-        mask = np.atleast_2d(mask)
-
-        arr = np.atleast_2d(np.asarray(self))
+        arr = np.asarray(self)
         fill_value = np.nan
 
         res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-
-        if self.ndim == 2:
-            # i.e. DatetimeArray
-            result = type(self)._from_sequence(res_values)
-
-        else:
-            # shape[0] should be 1 as long as EAs are 1D
-            assert res_values.shape == (1, len(qs)), res_values.shape
-            result = type(self)._from_sequence(res_values[0])
-
-        return result
+        return type(self)._from_sequence(res_values)
 
     def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT:
         """
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 9d649c533619e..b1c93aeaf8781 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -352,7 +352,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
     # For comparisons, so that numpy uses our implementation if the compare
     # ops, which raise
     __array_priority__ = 1000
-    _internal_fill_value = -1
     # tolist is not actually deprecated, just suppressed in the __dir__
     _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
     _typ = "categorical"
@@ -477,6 +476,13 @@ def dtype(self) -> CategoricalDtype:
         """
         return self._dtype
 
+    @property
+    def _internal_fill_value(self) -> int:
+        # using the specific numpy integer instead of python int to get
+        #  the correct dtype back from _quantile in the all-NA case
+        dtype = self._ndarray.dtype
+        return dtype.type(-1)
+
     @property
     def _constructor(self) -> type[Categorical]:
         return Categorical
@@ -2303,7 +2309,7 @@ def _values_for_factorize(self):
 
     def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
         # make sure we have correct itemsize for resulting codes
-        res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
+        assert res_values.dtype == self._ndarray.dtype
         return res_values
 
     def equals(self, other: object) -> bool:
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 95363e598a06c..e57fa5a86a8c4 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -948,7 +948,7 @@ def equals(self, other) -> bool:
 
     def _quantile(
         self: BaseMaskedArrayT, qs: npt.NDArray[np.float64], interpolation: str
-    ) -> BaseMaskedArrayT:
+    ) -> BaseMaskedArray:
         """
         Dispatch to quantile_with_mask, needed because we do not have
         _from_factorized.
@@ -957,29 +957,30 @@ def _quantile(
         -----
         We assume that all impacted cases are 1D-only.
         """
-        mask = np.atleast_2d(np.asarray(self.isna()))
-        npvalues: np.ndarray = np.atleast_2d(np.asarray(self))
-
         res = quantile_with_mask(
-            npvalues,
-            mask=mask,
-            fill_value=self.dtype.na_value,
+            self._data,
+            mask=self._mask,
+            # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
+            #  instead of np.nan
+            fill_value=np.nan,
             qs=qs,
             interpolation=interpolation,
         )
-        assert res.ndim == 2
-        assert res.shape[0] == 1
-        res = res[0]
-        try:
-            out = type(self)._from_sequence(res, dtype=self.dtype)
-        except TypeError:
-            # GH#42626: not able to safely cast Int64
-            # for floating point output
-            # error: Incompatible types in assignment (expression has type
-            # "ndarray[Any, dtype[floating[_64Bit]]]", variable has type
-            # "BaseMaskedArrayT")
-            out = np.asarray(res, dtype=np.float64)  # type: ignore[assignment]
-        return out
+
+        if self._hasna:
+            # Our result mask is all-False unless we are all-NA, in which
+            #  case it is all-True.
+            if self.ndim == 2:
+                # I think this should be out_mask=self.isna().all(axis=1)
+                #  but am holding off until we have tests
+                raise NotImplementedError
+            elif self.isna().all():
+                out_mask = np.ones(res.shape, dtype=bool)
+            else:
+                out_mask = np.zeros(res.shape, dtype=bool)
+        else:
+            out_mask = np.zeros(res.shape, dtype=bool)
+        return self._maybe_mask_result(res, mask=out_mask)
 
     # ------------------------------------------------------------------
     # Reductions
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 762f681f97e6d..56ecba861a284 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -699,11 +699,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
             return result.view(self.dtype)  # type: ignore[return-value]
         return super().fillna(value=value, method=method, limit=limit)
 
-    # TODO: alternately could override _quantile like searchsorted
-    def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
-        # quantile_with_mask may return float64 instead of int64, in which
-        #  case we need to cast back
-        return res_values.astype(np.int64, copy=False)
+    def _quantile(
+        self: PeriodArray,
+        qs: npt.NDArray[np.float64],
+        interpolation: str,
+    ) -> PeriodArray:
+        # dispatch to DatetimeArray implementation
+        dtres = self.view("M8[ns]")._quantile(qs, interpolation)
+        # error: Incompatible return value type (got "Union[ExtensionArray,
+        # ndarray[Any, Any]]", expected "PeriodArray")
+        return dtres.view(self.dtype)  # type: ignore[return-value]
 
     # ------------------------------------------------------------------
     # Arithmetic Methods
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 25c4a15127200..f8d7b4dca3d34 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -75,6 +75,7 @@
 
 from pandas.core import arraylike
 import pandas.core.algorithms as algos
+from pandas.core.array_algos.quantile import quantile_with_mask
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse.dtype import SparseDtype
@@ -891,10 +892,27 @@ def value_counts(self, dropna: bool = True) -> Series:
         return Series(counts, index=keys)
 
     def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):
+
+        if self._null_fill_value or self.sp_index.ngaps == 0:
+            # We can avoid densifying
+            npvalues = self.sp_values
+            mask = np.zeros(npvalues.shape, dtype=bool)
+        else:
+            npvalues = self.to_numpy()
+            mask = self.isna()
+
+        fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
+        res_values = quantile_with_mask(
+            npvalues,
+            mask,
+            fill_value,
+            qs,
+            interpolation,
+        )
+
         # Special case: the returned array isn't _really_ sparse, so we don't
         #  wrap it in a SparseArray
-        result = super()._quantile(qs, interpolation)
-        return np.asarray(result)
+        return res_values
 
     # --------
     # Indexing
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 040b981c41593..13026a2432ee3 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -643,9 +643,14 @@ def test_quantile_ea(self, obj, index):
         qs = [0.5, 0, 1]
         result = self.compute_quantile(obj, qs)
 
+        exp_dtype = index.dtype
+        if index.dtype == "Int64":
+            # match non-nullable casting behavior
+            exp_dtype = "Float64"
+
         # expected here assumes len(index) == 9
         expected = Series(
-            [index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A"
+            [index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A"
         )
         expected = type(obj)(expected)
 
@@ -704,7 +709,11 @@ def test_quantile_ea_scalar(self, obj, index):
         qs = 0.5
         result = self.compute_quantile(obj, qs)
 
-        expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5)
+        exp_dtype = index.dtype
+        if index.dtype == "Int64":
+            exp_dtype = "Float64"
+
+        expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
         if isinstance(obj, Series):
             expected = expected["A"]
             assert result == expected
diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py
index 63138d87bc72f..418966c754e0a 100644
--- a/pandas/tests/io/formats/style/test_highlight.py
+++ b/pandas/tests/io/formats/style/test_highlight.py
@@ -181,9 +181,20 @@ def test_highlight_between_inclusive(styler, inclusive, expected):
 )
 def test_highlight_quantile(styler, kwargs):
     expected = {
+        (0, 1): [("background-color", "yellow")],
         (2, 0): [("background-color", "yellow")],
         (2, 1): [("background-color", "yellow")],
     }
+    if styler.data.dtypes["B"] != "Int64":
+        expected.pop((0, 1))
+    else:
+        if kwargs.get("axis", -1) is None:
+            expected.pop((0, 1))
+        elif kwargs.get("q_left", -1) == 0:
+            expected.pop((0, 1))
+        elif "subset" in kwargs:
+            expected.pop((0, 1))
+
     result = styler.highlight_quantile(**kwargs)._compute().ctx
     assert result == expected
 
diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py
index 84bfe8524634b..aeff5b3adfe56 100644
--- a/pandas/tests/series/methods/test_quantile.py
+++ b/pandas/tests/series/methods/test_quantile.py
@@ -222,4 +222,6 @@ def test_quantile_empty(self):
     def test_quantile_dtypes(self, dtype):
         result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25))
         expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25))
+        if dtype == "Int64":
+            expected = expected.astype("Float64")
         tm.assert_series_equal(result, expected)

From fe2d38e624efcf986ad6c56a41ab37f9df03da7b Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 3 May 2022 09:39:14 -0700
Subject: [PATCH 2/2] fix incorrect dtype

---
 pandas/core/array_algos/quantile.py             | 13 ++++++++++++-
 pandas/tests/frame/methods/test_quantile.py     |  2 ++
 pandas/tests/io/formats/style/test_highlight.py | 11 -----------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
index 11dc9bf89ab3b..78e12fb3995fd 100644
--- a/pandas/core/array_algos/quantile.py
+++ b/pandas/core/array_algos/quantile.py
@@ -197,7 +197,18 @@ def _nanpercentile(
             _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
             for (val, m) in zip(list(values), list(mask))
         ]
-        result = np.array(result, dtype=values.dtype, copy=False).T
+        if values.dtype.kind == "f":
+            # preserve itemsize
+            result = np.array(result, dtype=values.dtype, copy=False).T
+        else:
+            result = np.array(result, copy=False).T
+            if (
+                result.dtype != values.dtype
+                and (result == result.astype(values.dtype, copy=False)).all()
+            ):
+                # e.g. values is integer dtype and result is floating dtype,
+                #  only cast back to integer dtype if result values are all-integer.
+                result = result.astype(values.dtype, copy=False)
         return result
     else:
         return np.percentile(
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 31362ce5d2dd5..655624a4b59ff 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -717,6 +717,8 @@ def test_quantile_ea_all_na(self, obj, index):
 
         expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
         expected = Series(expected, index=qs, name="A")
+        if expected.dtype == "Int64":
+            expected = expected.astype("Float64")
         expected = type(obj)(expected)
         tm.assert_equal(result, expected)
 
diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py
index 418966c754e0a..63138d87bc72f 100644
--- a/pandas/tests/io/formats/style/test_highlight.py
+++ b/pandas/tests/io/formats/style/test_highlight.py
@@ -181,20 +181,9 @@ def test_highlight_between_inclusive(styler, inclusive, expected):
 )
 def test_highlight_quantile(styler, kwargs):
     expected = {
-        (0, 1): [("background-color", "yellow")],
         (2, 0): [("background-color", "yellow")],
         (2, 1): [("background-color", "yellow")],
     }
-    if styler.data.dtypes["B"] != "Int64":
-        expected.pop((0, 1))
-    else:
-        if kwargs.get("axis", -1) is None:
-            expected.pop((0, 1))
-        elif kwargs.get("q_left", -1) == 0:
-            expected.pop((0, 1))
-        elif "subset" in kwargs:
-            expected.pop((0, 1))
-
     result = styler.highlight_quantile(**kwargs)._compute().ctx
     assert result == expected