BUG: quantile for ExtensionArray (#39606)

jbrockmendel · web-flow · commit 0a88eaaebe03 · 2021-02-11T20:24:39.000-05:00
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
@@ -0,0 +1,77 @@
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas.core.nanops import nanpercentile
+
+
+def quantile_with_mask(
+    values: np.ndarray,
+    mask: np.ndarray,
+    fill_value,
+    qs,
+    interpolation: str,
+    axis: int,
+) -> np.ndarray:
+    """
+    Compute the quantiles of the given values for each quantile in `qs`.
+
+    Parameters
+    ----------
+    values : np.ndarray
+        For ExtensionArray, this is _values_for_factorize()[0]
+    mask : np.ndarray[bool]
+        mask = isna(values)
+        For ExtensionArray, this is computed before calling _values_for_factorize
+    fill_value : Scalar
+        The value to fill NA entries with
+        For ExtensionArray, this is _values_for_factorize()[1]
+    qs : a scalar or list of the quantiles to be computed
+    interpolation : str
+        Type of interpolation
+    axis : int
+        Axis along which to compute quantiles.
+
+    Returns
+    -------
+    np.ndarray
+
+    Notes
+    -----
+    Assumes values is already 2D.  For ExtensionArray this means np.atleast_2d
+    has been called on _values_for_factorize()[0]
+    """
+    is_empty = values.shape[axis] == 0
+    orig_scalar = not is_list_like(qs)
+    if orig_scalar:
+        # make list-like, unpack later
+        qs = [qs]
+
+    if is_empty:
+        # create the array of na_values
+        # 2d len(values) * len(qs)
+        flat = np.array([fill_value] * len(qs))
+        result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
+    else:
+        # NB: asarray for Sparse (GH#24600) is applied to the mask by the caller
+        result = nanpercentile(
+            values,
+            np.array(qs) * 100,
+            axis=axis,
+            na_value=fill_value,
+            mask=mask,
+            ndim=values.ndim,
+            interpolation=interpolation,
+        )
+
+        result = np.array(result, copy=False)
+        result = result.T
+
+    if orig_scalar:
+        assert result.shape[-1] == 1, result.shape
+        result = result[..., 0]
+        result = lib.item_from_zerodim(result)
+
+    return result
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -422,7 +422,8 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT:
         return new_obj
 
     def _values_for_factorize(self):
-        return self._ndarray, iNaT
+        # int64 instead of int ensures we have a "view" method
+        return self._ndarray, np.int64(iNaT)
 
     @classmethod
     def _from_factorized(
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -56,6 +56,7 @@
     putmask_smart,
     putmask_without_repeat,
 )
+from pandas.core.array_algos.quantile import quantile_with_mask
 from pandas.core.array_algos.replace import (
     compare_or_regex_search,
     replace_regex,
@@ -79,7 +80,6 @@
     is_scalar_indexer,
 )
 import pandas.core.missing as missing
-from pandas.core.nanops import nanpercentile
 
 if TYPE_CHECKING:
     from pandas import Float64Index, Index
@@ -1405,31 +1405,11 @@ def quantile(
         assert axis == 1  # only ever called this way
         assert is_list_like(qs)  # caller is responsible for this
 
-        values = self.get_values()
-
-        is_empty = values.shape[axis] == 0
-
-        if is_empty:
-            # create the array of na_values
-            # 2d len(values) * len(qs)
-            result = np.repeat(
-                np.array([self.fill_value] * len(qs)), len(values)
-            ).reshape(len(values), len(qs))
-        else:
-            # asarray needed for Sparse, see GH#24600
-            mask = np.asarray(isna(values))
-            result = nanpercentile(
-                values,
-                np.array(qs) * 100,
-                axis=axis,
-                na_value=self.fill_value,
-                mask=mask,
-                ndim=values.ndim,
-                interpolation=interpolation,
-            )
+        fill_value = self.fill_value
+        values = self.values
+        mask = np.asarray(isna(values))
 
-            result = np.array(result, copy=False)
-            result = result.T
+        result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)
 
         return make_block(result, placement=self.mgr_locs, ndim=2)
 
@@ -1860,6 +1840,24 @@ def _unstack(self, unstacker, fill_value, new_placement):
         ]
         return blocks, mask
 
+    def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
+        # asarray needed for Sparse, see GH#24600
+        mask = np.asarray(isna(self.values))
+        mask = np.atleast_2d(mask)
+
+        values, fill_value = self.values._values_for_factorize()
+
+        values = np.atleast_2d(values)
+
+        result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)
+
+        if not is_sparse(self.dtype):
+            # shape[0] should be 1 as long as EAs are 1D
+            assert result.shape == (1, len(qs)), result.shape
+            result = type(self.values)._from_factorized(result[0], self.values)
+
+        return make_block(result, placement=self.mgr_locs, ndim=2)
+
 
 class HybridMixin:
     """
@@ -2191,22 +2189,6 @@ def fillna(
             value, limit=limit, inplace=inplace, downcast=downcast
         )
 
-    def quantile(
-        self, qs: Float64Index, interpolation="linear", axis: int = 0
-    ) -> Block:
-        assert axis == 1  # only ever called this way
-        naive = self.values.view("M8[ns]")
-
-        # TODO(EA2D): kludge for 2D block with 1D values
-        naive = naive.reshape(self.shape)
-
-        blk = self.make_block(naive)
-        res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis)
-
-        # TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like
-        aware = self._holder(res_blk.values.ravel(), dtype=self.dtype)
-        return self.make_block_same_class(aware, ndim=res_blk.ndim)
-
     def _check_ndim(self, values, ndim):
         """
         ndim inference and validation.
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -533,3 +533,113 @@ def test_quantile_item_cache(self):
         ser.values[0] = 99
 
         assert df.iloc[0, 0] == df["A"][0]
+
+
+class TestQuantileExtensionDtype:
+    # TODO: tests for axis=1?
+    # TODO: empty case?  might as well do dt64 and td64 here too
+
+    @pytest.fixture(
+        params=[
+            pytest.param(
+                pd.IntervalIndex.from_breaks(range(10)),
+                marks=pytest.mark.xfail(reason="raises when trying to add Intervals"),
+            ),
+            pd.period_range("2016-01-01", periods=9, freq="D"),
+            pd.date_range("2016-01-01", periods=9, tz="US/Pacific"),
+            pytest.param(
+                pd.array(np.arange(9), dtype="Int64"),
+                marks=pytest.mark.xfail(reason="doesnt implement from_factorized"),
+            ),
+            pytest.param(
+                pd.array(np.arange(9), dtype="Float64"),
+                marks=pytest.mark.xfail(reason="doesnt implement from_factorized"),
+            ),
+        ],
+        ids=lambda x: str(x.dtype),
+    )
+    def index(self, request):
+        idx = request.param
+        idx.name = "A"
+        return idx
+
+    def compute_quantile(self, obj, qs):
+        if isinstance(obj, Series):
+            result = obj.quantile(qs)
+        else:
+            result = obj.quantile(qs, numeric_only=False)
+        return result
+
+    def test_quantile_ea(self, index, frame_or_series):
+        obj = frame_or_series(index).copy()
+
+        # result should be invariant to shuffling
+        indexer = np.arange(len(index), dtype=np.intp)
+        np.random.shuffle(indexer)
+        obj = obj.iloc[indexer]
+
+        qs = [0.5, 0, 1]
+        result = self.compute_quantile(obj, qs)
+
+        # expected here assumes len(index) == 9
+        expected = Series([index[4], index[0], index[-1]], index=qs, name="A")
+        expected = frame_or_series(expected)
+
+        tm.assert_equal(result, expected)
+
+    def test_quantile_ea_with_na(self, index, frame_or_series):
+        obj = frame_or_series(index).copy()
+
+        obj.iloc[0] = index._na_value
+        obj.iloc[-1] = index._na_value
+
+        # result should be invariant to shuffling
+        indexer = np.arange(len(index), dtype=np.intp)
+        np.random.shuffle(indexer)
+        obj = obj.iloc[indexer]
+
+        qs = [0.5, 0, 1]
+        result = self.compute_quantile(obj, qs)
+
+        # expected here assumes len(index) == 9
+        expected = Series([index[4], index[1], index[-2]], index=qs, name="A")
+        expected = frame_or_series(expected)
+        tm.assert_equal(result, expected)
+
+    def test_quantile_ea_all_na(self, index, frame_or_series):
+
+        obj = frame_or_series(index).copy()
+
+        obj.iloc[:] = index._na_value
+
+        # result should be invariant to shuffling
+        indexer = np.arange(len(index), dtype=np.intp)
+        np.random.shuffle(indexer)
+        obj = obj.iloc[indexer]
+
+        qs = [0.5, 0, 1]
+        result = self.compute_quantile(obj, qs)
+
+        expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
+        expected = Series(expected, index=qs)
+        expected = frame_or_series(expected)
+        tm.assert_equal(result, expected)
+
+    def test_quantile_ea_scalar(self, index, frame_or_series):
+        # scalar qs
+        obj = frame_or_series(index).copy()
+
+        # result should be invariant to shuffling
+        indexer = np.arange(len(index), dtype=np.intp)
+        np.random.shuffle(indexer)
+        obj = obj.iloc[indexer]
+
+        qs = 0.5
+        result = self.compute_quantile(obj, qs)
+
+        expected = Series({"A": index[4]}, name=0.5)
+        if frame_or_series is Series:
+            expected = expected["A"]
+            assert result == expected
+        else:
+            tm.assert_series_equal(result, expected)