Skip to content

Commit 56a0120

Browse files
committed
BUG: quantile for ExtensionArray
1 parent dae99c7 commit 56a0120

File tree

4 files changed

+199
-52
lines changed

4 files changed

+199
-52
lines changed

pandas/core/array_algos/quantile.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from typing import Sequence, Union
2+
3+
import numpy as np
4+
5+
from pandas._libs import lib
6+
7+
from pandas.core.dtypes.common import is_list_like
8+
9+
from pandas.core.nanops import nanpercentile
10+
11+
12+
def quantile_with_mask(
    values: np.ndarray,
    mask: np.ndarray,
    fill_value,
    qs: Union[float, Sequence[float]],
    interpolation: str,
    axis: int,
) -> np.ndarray:
    """
    Compute the quantiles of the given values for each quantile in `qs`.

    Parameters
    ----------
    values : np.ndarray
        For ExtensionArray, this is _values_for_factorize()[0]
    mask : np.ndarray[bool]
        mask = isna(values)
        For ExtensionArray, this is computed before calling
        _values_for_factorize
    fill_value : Scalar
        The value used to fill the result when `values` is empty along
        `axis`; it is also forwarded to nanpercentile as `na_value`.
        For ExtensionArray, this is _values_for_factorize()[1]
    qs : float or sequence of float
        A scalar or list of the quantiles to be computed.
    interpolation : str
        Type of interpolation
    axis : int
        Axis along which to compute quantiles.

    Returns
    -------
    np.ndarray
        When `values` is empty along `axis` the result has shape
        ``(len(values), len(qs))`` and is filled with `fill_value`.
        If `qs` was passed as a scalar, the trailing length-1 axis is
        dropped and a zero-dimensional result is unboxed to a scalar.

    Notes
    -----
    Assumes values is already 2D.  For ExtensionArray this means np.atleast_2d
    has been called on _values_for_factorize()[0]

    Callers are responsible for passing `mask` as a plain ndarray
    (e.g. ``np.asarray(isna(values))``, needed for Sparse, see GH#24600).
    """
    is_empty = values.shape[axis] == 0
    orig_scalar = not is_list_like(qs)
    if orig_scalar:
        # make list-like, unpack later
        qs = [qs]

    if is_empty:
        # create the array of na_values
        # 2d len(values) * len(qs)
        flat = np.array([fill_value] * len(qs))
        result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
    else:
        result = nanpercentile(
            values,
            np.array(qs) * 100,
            axis=axis,
            na_value=fill_value,
            mask=mask,
            ndim=values.ndim,
            interpolation=interpolation,
        )

        # np.asarray returns an existing ndarray unchanged and, unlike
        # np.array(..., copy=False), does not raise under NumPy >= 2 when a
        # copy turns out to be unavoidable.
        result = np.asarray(result)
        result = result.T

    if orig_scalar:
        # qs was wrapped in a one-element list above; undo that here
        assert result.shape[-1] == 1, result.shape
        result = result[..., 0]
        result = lib.item_from_zerodim(result)

    return result

pandas/core/arrays/datetimelike.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,8 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT:
425425
return new_obj
426426

427427
def _values_for_factorize(self):
428-
return self._ndarray, iNaT
428+
# int64 instead of int ensures we have a "view" method
429+
return self._ndarray, np.int64(iNaT)
429430

430431
@classmethod
431432
def _from_factorized(

pandas/core/internals/blocks.py

Lines changed: 42 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
putmask_smart,
5757
putmask_without_repeat,
5858
)
59+
from pandas.core.array_algos.quantile import quantile_with_mask
5960
from pandas.core.array_algos.replace import (
6061
compare_or_regex_search,
6162
replace_regex,
@@ -79,7 +80,6 @@
7980
is_scalar_indexer,
8081
)
8182
import pandas.core.missing as missing
82-
from pandas.core.nanops import nanpercentile
8383

8484
if TYPE_CHECKING:
8585
from pandas import Index
@@ -1390,8 +1390,10 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
13901390
Parameters
13911391
----------
13921392
qs: a scalar or list of the quantiles to be computed
1393-
interpolation: type of interpolation, default 'linear'
1394-
axis: axis to compute, default 0
1393+
interpolation : str, default "linear"
1394+
Type of interpolation
1395+
axis : int, default 0
1396+
Axis along which to compute quantiles.
13951397
13961398
Returns
13971399
-------
@@ -1400,44 +1402,16 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
14001402
# We should always have ndim == 2 because Series dispatches to DataFrame
14011403
assert self.ndim == 2
14021404

1403-
values = self.get_values()
1404-
1405-
is_empty = values.shape[axis] == 0
1406-
orig_scalar = not is_list_like(qs)
1407-
if orig_scalar:
1408-
# make list-like, unpack later
1409-
qs = [qs]
1410-
1411-
if is_empty:
1412-
# create the array of na_values
1413-
# 2d len(values) * len(qs)
1414-
result = np.repeat(
1415-
np.array([self.fill_value] * len(qs)), len(values)
1416-
).reshape(len(values), len(qs))
1417-
else:
1418-
# asarray needed for Sparse, see GH#24600
1419-
mask = np.asarray(isna(values))
1420-
result = nanpercentile(
1421-
values,
1422-
np.array(qs) * 100,
1423-
axis=axis,
1424-
na_value=self.fill_value,
1425-
mask=mask,
1426-
ndim=values.ndim,
1427-
interpolation=interpolation,
1428-
)
1405+
fill_value = self.fill_value
1406+
values = self.values
1407+
mask = np.asarray(isna(values))
14291408

1430-
result = np.array(result, copy=False)
1431-
result = result.T
1409+
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)
1410+
ndim = np.ndim(result)
14321411

1433-
if orig_scalar and not lib.is_scalar(result):
1434-
# result could be scalar in case with is_empty and self.ndim == 1
1435-
assert result.shape[-1] == 1, result.shape
1436-
result = result[..., 0]
1437-
result = lib.item_from_zerodim(result)
1412+
placement = np.arange(len(result))
14381413

1439-
ndim = np.ndim(result)
1440-
return make_block(result, placement=np.arange(len(result)), ndim=ndim)
1414+
return make_block(result, placement=placement, ndim=ndim)
14411415

14421416
def _replace_coerce(
14431417
self,
@@ -1866,6 +1840,36 @@ def _unstack(self, unstacker, fill_value, new_placement):
18661840
]
18671841
return blocks, mask
18681842

1843+
def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
    """
    Compute quantiles for a block whose values are an extension array.

    The array is reduced to its ``_values_for_factorize()`` ndarray and
    fill value, promoted to 2D, and handed to ``quantile_with_mask``.
    For non-sparse dtypes the ndarray result is converted back through
    ``_from_factorized``.

    Parameters
    ----------
    qs : a scalar or list of the quantiles to be computed
    interpolation : str, default "linear"
        Type of interpolation
    axis : int, default 0
        Axis along which to compute quantiles.

    Returns
    -------
    Block
    """
    arr = self.values

    # asarray needed for Sparse, see GH#24600
    na_mask = np.atleast_2d(np.asarray(isna(arr)))

    raw, na_fill = arr._values_for_factorize()
    raw = np.atleast_2d(raw)

    res = quantile_with_mask(raw, na_mask, na_fill, qs, interpolation, axis)
    # ndim of the raw ndarray result, taken before any re-wrapping below
    res_ndim = np.ndim(res)

    if is_sparse(self.dtype):
        locs = np.arange(len(res))
    elif res.ndim == 1:
        # shape[0] should be 1 as long as EAs are 1D
        # i.e. qs was originally a scalar
        assert res.shape == (1,), res.shape
        res = type(arr)._from_factorized(res, arr)
        locs = np.arange(len(res))
    else:
        # shape[0] should be 1 as long as EAs are 1D
        assert res.shape == (1, len(qs)), res.shape
        res = type(arr)._from_factorized(res[0], arr)
        locs = [0]

    return make_block(res, placement=locs, ndim=res_ndim)
1872+
18691873

18701874
class HybridMixin:
18711875
"""
@@ -2184,19 +2188,6 @@ def fillna(
21842188
value, limit=limit, inplace=inplace, downcast=downcast
21852189
)
21862190

2187-
def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
2188-
naive = self.values.view("M8[ns]")
2189-
2190-
# TODO(EA2D): kludge for 2D block with 1D values
2191-
naive = naive.reshape(self.shape)
2192-
2193-
blk = self.make_block(naive)
2194-
res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis)
2195-
2196-
# TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like
2197-
aware = self._holder(res_blk.values.ravel(), dtype=self.dtype)
2198-
return self.make_block_same_class(aware, ndim=res_blk.ndim)
2199-
22002191
def _check_ndim(self, values, ndim):
22012192
"""
22022193
ndim inference and validation.

pandas/tests/frame/methods/test_quantile.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,85 @@ def test_quantile(self, datetime_frame):
7878
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
7979
tm.assert_series_equal(result, expected)
8080

81+
@pytest.mark.parametrize("as_dt64tz", [True, False])
def test_quantile_period(self, frame_or_series, as_dt64tz):
    """Quantiles 0.5, 0 and 1 pick the middle, first and last element."""
    index = pd.period_range("2016-01-01", periods=9, freq="D", name="A")
    if as_dt64tz:
        index = index.to_timestamp("S").tz_localize("US/Central")

    quantiles = [0.5, 0, 1]
    # the non-Series case is called with numeric_only=False
    kwargs = {} if frame_or_series is Series else {"numeric_only": False}
    result = frame_or_series(index).quantile(quantiles, **kwargs)

    expected = frame_or_series(
        Series([index[4], index[0], index[-1]], index=quantiles, name="A")
    )

    tm.assert_equal(result, expected)
99+
100+
# TODO: tests for axis=1?
101+
# TODO: empty case? might as well do dt64 and td64 here too
102+
@pytest.mark.parametrize("as_dt64tz", [True, False])
def test_quantile_period_with_nat(self, frame_or_series, as_dt64tz):
    """NaT at both ends is skipped, so min/max move one element inwards."""
    index = pd.period_range("2016-01-01", periods=9, freq="D", name="A")
    if as_dt64tz:
        index = index.to_timestamp("S").tz_localize("US/Central")

    obj = frame_or_series(index)

    # blank out the first and last entries
    for pos in (0, -1):
        obj.iloc[pos] = pd.NaT

    quantiles = [0.5, 0, 1]
    # the non-Series case is called with numeric_only=False
    kwargs = {} if frame_or_series is Series else {"numeric_only": False}
    result = obj.quantile(quantiles, **kwargs)

    expected = frame_or_series(
        Series([index[4], index[1], index[-2]], index=quantiles, name="A")
    )
    tm.assert_equal(result, expected)
122+
123+
@pytest.mark.parametrize("as_dt64tz", [True, False])
def test_quantile_period_all_nat(self, frame_or_series, as_dt64tz):
    """With every entry NaT, every quantile is NaT and the dtype is kept."""
    index = pd.period_range("2016-01-01", periods=9, freq="D", name="A")
    if as_dt64tz:
        index = index.to_timestamp("S").tz_localize("US/Central")

    obj = frame_or_series(index)
    obj.iloc[:] = pd.NaT

    quantiles = [0.5, 0, 1]
    # the non-Series case is called with numeric_only=False
    kwargs = {} if frame_or_series is Series else {"numeric_only": False}
    result = obj.quantile(quantiles, **kwargs)

    all_nat = Series([pd.NaT] * 3, dtype=index.dtype, index=quantiles, name="A")
    expected = frame_or_series(all_nat)
    tm.assert_equal(result, expected)
141+
142+
def test_quantile_period_scalar(self, frame_or_series):
    """A scalar quantile yields a scalar for Series, a Series for DataFrame."""
    # scalar qs
    index = pd.period_range("2016-01-01", periods=9, freq="D", name="A")
    obj = frame_or_series(index)
    q = 0.5

    if frame_or_series is Series:
        result = obj.quantile(q)
        expected = Series({"A": index[4]}, name=0.5)["A"]
        assert result == expected
        return

    result = obj.quantile(q, numeric_only=False)
    expected = Series({"A": index[4]}, name=0.5)
    tm.assert_series_equal(result, expected)
159+
81160
def test_quantile_date_range(self):
82161
# GH 2460
83162

0 commit comments

Comments
 (0)