Skip to content

Commit 0a88eaa

Browse files
authored
BUG: quantile for ExtensionArray (#39606)
1 parent d93d3a5 commit 0a88eaa

File tree

4 files changed

+212
-42
lines changed

4 files changed

+212
-42
lines changed

pandas/core/array_algos/quantile.py

+77
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
import numpy as np
2+
3+
from pandas._libs import lib
4+
5+
from pandas.core.dtypes.common import is_list_like
6+
7+
from pandas.core.nanops import nanpercentile
8+
9+
10+
def quantile_with_mask(
    values: np.ndarray,
    mask: np.ndarray,
    fill_value,
    qs,
    interpolation: str,
    axis: int,
) -> np.ndarray:
    """
    Compute the quantiles of the given values for each quantile in `qs`.

    Parameters
    ----------
    values : np.ndarray
        For ExtensionArray, this is _values_for_factorize()[0]
    mask : np.ndarray[bool]
        mask = isna(values)
        For ExtensionArray, this is computed before calling
        _values_for_factorize
    fill_value : Scalar
        The value used to fill the result when there is nothing to compute
        from; it is also forwarded to ``nanpercentile`` as ``na_value``.
        For ExtensionArray, this is _values_for_factorize()[1]
    qs : a scalar or list of the quantiles to be computed
    interpolation : str
        Type of interpolation
    axis : int
        Axis along which to compute quantiles.

    Returns
    -------
    np.ndarray
        When `qs` is a scalar, the trailing length-1 axis is dropped (and a
        zero-dimensional result is unboxed to a scalar).

    Notes
    -----
    Assumes values is already 2D.  For ExtensionArray this means np.atleast_2d
    has been called on _values_for_factorize()[0]
    """
    is_empty = values.shape[axis] == 0
    orig_scalar = not is_list_like(qs)
    if orig_scalar:
        # make list-like, unpack later
        qs = [qs]

    if is_empty:
        # Nothing to compute from: build a 2D array of fill_value with shape
        # (len(values), len(qs)).  No transpose is applied on this path.
        flat = np.array([fill_value] * len(qs))
        result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
    else:
        result = nanpercentile(
            values,
            np.array(qs) * 100,
            axis=axis,
            na_value=fill_value,
            mask=mask,
            ndim=values.ndim,
            interpolation=interpolation,
        )

        # np.asarray avoids a copy whenever possible, like the former
        # np.array(..., copy=False), but does not raise under NumPy 2 when a
        # copy turns out to be required.
        result = np.asarray(result)
        result = result.T

    if orig_scalar:
        # undo the list-wrapping of qs done above
        assert result.shape[-1] == 1, result.shape
        result = result[..., 0]
        result = lib.item_from_zerodim(result)

    return result

pandas/core/arrays/datetimelike.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -422,7 +422,8 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT:
422422
return new_obj
423423

424424
def _values_for_factorize(self):
425-
return self._ndarray, iNaT
425+
# int64 instead of int ensures we have a "view" method
426+
return self._ndarray, np.int64(iNaT)
426427

427428
@classmethod
428429
def _from_factorized(

pandas/core/internals/blocks.py

+23-41
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@
5656
putmask_smart,
5757
putmask_without_repeat,
5858
)
59+
from pandas.core.array_algos.quantile import quantile_with_mask
5960
from pandas.core.array_algos.replace import (
6061
compare_or_regex_search,
6162
replace_regex,
@@ -79,7 +80,6 @@
7980
is_scalar_indexer,
8081
)
8182
import pandas.core.missing as missing
82-
from pandas.core.nanops import nanpercentile
8383

8484
if TYPE_CHECKING:
8585
from pandas import Float64Index, Index
@@ -1405,31 +1405,11 @@ def quantile(
14051405
assert axis == 1 # only ever called this way
14061406
assert is_list_like(qs) # caller is responsible for this
14071407

1408-
values = self.get_values()
1409-
1410-
is_empty = values.shape[axis] == 0
1411-
1412-
if is_empty:
1413-
# create the array of na_values
1414-
# 2d len(values) * len(qs)
1415-
result = np.repeat(
1416-
np.array([self.fill_value] * len(qs)), len(values)
1417-
).reshape(len(values), len(qs))
1418-
else:
1419-
# asarray needed for Sparse, see GH#24600
1420-
mask = np.asarray(isna(values))
1421-
result = nanpercentile(
1422-
values,
1423-
np.array(qs) * 100,
1424-
axis=axis,
1425-
na_value=self.fill_value,
1426-
mask=mask,
1427-
ndim=values.ndim,
1428-
interpolation=interpolation,
1429-
)
1408+
fill_value = self.fill_value
1409+
values = self.values
1410+
mask = np.asarray(isna(values))
14301411

1431-
result = np.array(result, copy=False)
1432-
result = result.T
1412+
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)
14331413

14341414
return make_block(result, placement=self.mgr_locs, ndim=2)
14351415

@@ -1860,6 +1840,24 @@ def _unstack(self, unstacker, fill_value, new_placement):
18601840
]
18611841
return blocks, mask
18621842

1843+
def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
    """
    Compute quantiles of this block's extension-array values.

    The NA mask is taken from the array itself, then the array is swapped for
    the ndarray/fill-value pair returned by ``_values_for_factorize`` so the
    shared ``quantile_with_mask`` routine can operate on it.  For non-sparse
    dtypes the result is rebuilt into the original array type via
    ``_from_factorized``.
    """
    arr = self.values

    # asarray needed for Sparse, see GH#24600
    na_mask = np.atleast_2d(np.asarray(isna(arr)))

    raw, fill_value = arr._values_for_factorize()
    raw = np.atleast_2d(raw)

    result = quantile_with_mask(raw, na_mask, fill_value, qs, interpolation, axis)

    if not is_sparse(self.dtype):
        # shape[0] should be 1 as long as EAs are 1D
        assert result.shape == (1, len(qs)), result.shape
        result = type(arr)._from_factorized(result[0], arr)

    return make_block(result, placement=self.mgr_locs, ndim=2)
1860+
18631861

18641862
class HybridMixin:
18651863
"""
@@ -2191,22 +2189,6 @@ def fillna(
21912189
value, limit=limit, inplace=inplace, downcast=downcast
21922190
)
21932191

2194-
def quantile(
2195-
self, qs: Float64Index, interpolation="linear", axis: int = 0
2196-
) -> Block:
2197-
assert axis == 1 # only ever called this way
2198-
naive = self.values.view("M8[ns]")
2199-
2200-
# TODO(EA2D): kludge for 2D block with 1D values
2201-
naive = naive.reshape(self.shape)
2202-
2203-
blk = self.make_block(naive)
2204-
res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis)
2205-
2206-
# TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like
2207-
aware = self._holder(res_blk.values.ravel(), dtype=self.dtype)
2208-
return self.make_block_same_class(aware, ndim=res_blk.ndim)
2209-
22102192
def _check_ndim(self, values, ndim):
22112193
"""
22122194
ndim inference and validation.

pandas/tests/frame/methods/test_quantile.py

+110
Original file line number | Diff line number | Diff line change
@@ -533,3 +533,113 @@ def test_quantile_item_cache(self):
533533
ser.values[0] = 99
534534

535535
assert df.iloc[0, 0] == df["A"][0]
536+
537+
538+
class TestQuantileExtensionDtype:
    """Quantile tests for Series/DataFrame objects backed by extension dtypes."""

    # TODO: tests for axis=1?
    # TODO: empty case?  might as well do dt64 and td64 here too

    @pytest.fixture(
        params=[
            pytest.param(
                pd.IntervalIndex.from_breaks(range(10)),
                marks=pytest.mark.xfail(reason="raises when trying to add Intervals"),
            ),
            pd.period_range("2016-01-01", periods=9, freq="D"),
            pd.date_range("2016-01-01", periods=9, tz="US/Pacific"),
            pytest.param(
                pd.array(np.arange(9), dtype="Int64"),
                marks=pytest.mark.xfail(reason="doesnt implement from_factorized"),
            ),
            pytest.param(
                pd.array(np.arange(9), dtype="Float64"),
                marks=pytest.mark.xfail(reason="doesnt implement from_factorized"),
            ),
        ],
        ids=lambda x: str(x.dtype),
    )
    def index(self, request):
        """Nine-element extension-dtype container, named "A"."""
        idx = request.param
        idx.name = "A"
        return idx

    @staticmethod
    def _shuffled(obj, length):
        """Return ``obj`` with its rows in a random order."""
        order = np.arange(length, dtype=np.intp)
        np.random.shuffle(order)
        return obj.iloc[order]

    def compute_quantile(self, obj, qs):
        """Call ``quantile``; only the non-Series call passes numeric_only=False."""
        if isinstance(obj, Series):
            return obj.quantile(qs)
        return obj.quantile(qs, numeric_only=False)

    def test_quantile_ea(self, index, frame_or_series):
        """List-like qs with no missing values."""
        # result should be invariant to shuffling
        obj = self._shuffled(frame_or_series(index).copy(), len(index))

        qs = [0.5, 0, 1]
        result = self.compute_quantile(obj, qs)

        # expected here assumes len(index) == 9
        expected = frame_or_series(
            Series([index[4], index[0], index[-1]], index=qs, name="A")
        )

        tm.assert_equal(result, expected)

    def test_quantile_ea_with_na(self, index, frame_or_series):
        """List-like qs where the first and last entries are set to NA."""
        obj = frame_or_series(index).copy()

        na_value = index._na_value
        obj.iloc[0] = na_value
        obj.iloc[-1] = na_value

        # result should be invariant to shuffling
        obj = self._shuffled(obj, len(index))

        qs = [0.5, 0, 1]
        result = self.compute_quantile(obj, qs)

        # expected here assumes len(index) == 9
        expected = frame_or_series(
            Series([index[4], index[1], index[-2]], index=qs, name="A")
        )
        tm.assert_equal(result, expected)

    def test_quantile_ea_all_na(self, index, frame_or_series):
        """List-like qs where every entry is NA."""
        obj = frame_or_series(index).copy()

        obj.iloc[:] = index._na_value

        # result should be invariant to shuffling
        obj = self._shuffled(obj, len(index))

        qs = [0.5, 0, 1]
        result = self.compute_quantile(obj, qs)

        all_na = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
        expected = frame_or_series(Series(all_na, index=qs))
        tm.assert_equal(result, expected)

    def test_quantile_ea_scalar(self, index, frame_or_series):
        """Scalar qs."""
        # result should be invariant to shuffling
        obj = self._shuffled(frame_or_series(index).copy(), len(index))

        qs = 0.5
        result = self.compute_quantile(obj, qs)

        expected = Series({"A": index[4]}, name=0.5)
        if frame_or_series is Series:
            expected = expected["A"]
            assert result == expected
        else:
            tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)