diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py new file mode 100644 index 0000000000000..8d4dd7be28839 --- /dev/null +++ b/pandas/core/array_algos/quantile.py @@ -0,0 +1,77 @@ +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.nanops import nanpercentile + + +def quantile_with_mask( + values: np.ndarray, + mask: np.ndarray, + fill_value, + qs, + interpolation: str, + axis: int, +) -> np.ndarray: + """ + Compute the quantiles of the given values for each quantile in `qs`. + + Parameters + ---------- + values : np.ndarray + For ExtensionArray, this is _values_for_factorize()[0] + mask : np.ndarray[bool] + mask = isna(values) + For ExtensionArray, this is computed before calling _values_for_factorize + fill_value : Scalar + The value to fill NA entries with + For ExtensionArray, this is _values_for_factorize()[1] + qs : a scalar or list of the quantiles to be computed + interpolation : str + Type of interpolation + axis : int + Axis along which to compute quantiles. + + Returns + ------- + np.ndarray + + Notes + ----- + Assumes values is already 2D. 
For ExtensionArray this means np.atleast_2d + has been called on _values_for_factorize()[0] + """ + is_empty = values.shape[axis] == 0 + orig_scalar = not is_list_like(qs) + if orig_scalar: + # make list-like, unpack later + qs = [qs] + + if is_empty: + # create the array of na_values + # 2d len(values) * len(qs) + flat = np.array([fill_value] * len(qs)) + result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) + else: + # asarray needed for Sparse, see GH#24600 + result = nanpercentile( + values, + np.array(qs) * 100, + axis=axis, + na_value=fill_value, + mask=mask, + ndim=values.ndim, + interpolation=interpolation, + ) + + result = np.array(result, copy=False) + result = result.T + + if orig_scalar: + assert result.shape[-1] == 1, result.shape + result = result[..., 0] + result = lib.item_from_zerodim(result) + + return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 48df98930244a..162a69370bc61 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -422,7 +422,8 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: return new_obj def _values_for_factorize(self): - return self._ndarray, iNaT + # int64 instead of int ensures we have a "view" method + return self._ndarray, np.int64(iNaT) @classmethod def _from_factorized( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 30e94b99b53c9..b38f3f3d7a87e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -56,6 +56,7 @@ putmask_smart, putmask_without_repeat, ) +from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.array_algos.replace import ( compare_or_regex_search, replace_regex, @@ -79,7 +80,6 @@ is_scalar_indexer, ) import pandas.core.missing as missing -from pandas.core.nanops import nanpercentile if TYPE_CHECKING: from pandas import Float64Index, Index @@ -1405,31 +1405,11 @@ def quantile( assert axis == 1 # only 
ever called this way assert is_list_like(qs) # caller is responsible for this - values = self.get_values() - - is_empty = values.shape[axis] == 0 - - if is_empty: - # create the array of na_values - # 2d len(values) * len(qs) - result = np.repeat( - np.array([self.fill_value] * len(qs)), len(values) - ).reshape(len(values), len(qs)) - else: - # asarray needed for Sparse, see GH#24600 - mask = np.asarray(isna(values)) - result = nanpercentile( - values, - np.array(qs) * 100, - axis=axis, - na_value=self.fill_value, - mask=mask, - ndim=values.ndim, - interpolation=interpolation, - ) + fill_value = self.fill_value + values = self.values + mask = np.asarray(isna(values)) - result = np.array(result, copy=False) - result = result.T + result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) return make_block(result, placement=self.mgr_locs, ndim=2) @@ -1860,6 +1840,24 @@ def _unstack(self, unstacker, fill_value, new_placement): ] return blocks, mask + def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: + # asarray needed for Sparse, see GH#24600 + mask = np.asarray(isna(self.values)) + mask = np.atleast_2d(mask) + + values, fill_value = self.values._values_for_factorize() + + values = np.atleast_2d(values) + + result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) + + if not is_sparse(self.dtype): + # shape[0] should be 1 as long as EAs are 1D + assert result.shape == (1, len(qs)), result.shape + result = type(self.values)._from_factorized(result[0], self.values) + + return make_block(result, placement=self.mgr_locs, ndim=2) + class HybridMixin: """ @@ -2191,22 +2189,6 @@ def fillna( value, limit=limit, inplace=inplace, downcast=downcast ) - def quantile( - self, qs: Float64Index, interpolation="linear", axis: int = 0 - ) -> Block: - assert axis == 1 # only ever called this way - naive = self.values.view("M8[ns]") - - # TODO(EA2D): kludge for 2D block with 1D values - naive = naive.reshape(self.shape) - - 
blk = self.make_block(naive) - res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis) - - # TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like - aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) - return self.make_block_same_class(aware, ndim=res_blk.ndim) - def _check_ndim(self, values, ndim): """ ndim inference and validation. diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 3f7f2e51add96..6d6016df52238 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -533,3 +533,113 @@ def test_quantile_item_cache(self): ser.values[0] = 99 assert df.iloc[0, 0] == df["A"][0] + + +class TestQuantileExtensionDtype: + # TODO: tests for axis=1? + # TODO: empty case? might as well do dt64 and td64 here too + + @pytest.fixture( + params=[ + pytest.param( + pd.IntervalIndex.from_breaks(range(10)), + marks=pytest.mark.xfail(reason="raises when trying to add Intervals"), + ), + pd.period_range("2016-01-01", periods=9, freq="D"), + pd.date_range("2016-01-01", periods=9, tz="US/Pacific"), + pytest.param( + pd.array(np.arange(9), dtype="Int64"), + marks=pytest.mark.xfail(reason="doesnt implement from_factorized"), + ), + pytest.param( + pd.array(np.arange(9), dtype="Float64"), + marks=pytest.mark.xfail(reason="doesnt implement from_factorized"), + ), + ], + ids=lambda x: str(x.dtype), + ) + def index(self, request): + idx = request.param + idx.name = "A" + return idx + + def compute_quantile(self, obj, qs): + if isinstance(obj, Series): + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + return result + + def test_quantile_ea(self, index, frame_or_series): + obj = frame_or_series(index).copy() + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = 
self.compute_quantile(obj, qs) + + # expected here assumes len(index) == 9 + expected = Series([index[4], index[0], index[-1]], index=qs, name="A") + expected = frame_or_series(expected) + + tm.assert_equal(result, expected) + + def test_quantile_ea_with_na(self, index, frame_or_series): + obj = frame_or_series(index).copy() + + obj.iloc[0] = index._na_value + obj.iloc[-1] = index._na_value + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + # expected here assumes len(index) == 9 + expected = Series([index[4], index[1], index[-2]], index=qs, name="A") + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + def test_quantile_ea_all_na(self, index, frame_or_series): + + obj = frame_or_series(index).copy() + + obj.iloc[:] = index._na_value + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value) + expected = Series(expected, index=qs) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + def test_quantile_ea_scalar(self, index, frame_or_series): + # scalar qs + obj = frame_or_series(index).copy() + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = 0.5 + result = self.compute_quantile(obj, qs) + + expected = Series({"A": index[4]}, name=0.5) + if frame_or_series is Series: + expected = expected["A"] + assert result == expected + else: + tm.assert_series_equal(result, expected)