Skip to content

Commit 6edb64c

Browse files
authored
BUG: MaskedArray._quantile match non-nullable behavior (#46282)
1 parent dd6869f commit 6edb64c

File tree

9 files changed

+99
-57
lines changed

9 files changed

+99
-57
lines changed

pandas/core/array_algos/quantile.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,14 @@ def quantile_with_mask(
7474
7575
Quantile is computed along axis=1.
7676
"""
77+
assert values.shape == mask.shape
78+
if values.ndim == 1:
79+
# unsqueeze, operate, re-squeeze
80+
values = np.atleast_2d(values)
81+
mask = np.atleast_2d(mask)
82+
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
83+
return res_values[0]
84+
7785
assert values.ndim == 2
7886

7987
is_empty = values.shape[1] == 0
@@ -189,7 +197,18 @@ def _nanpercentile(
189197
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
190198
for (val, m) in zip(list(values), list(mask))
191199
]
192-
result = np.array(result, dtype=values.dtype, copy=False).T
200+
if values.dtype.kind == "f":
201+
# preserve itemsize
202+
result = np.array(result, dtype=values.dtype, copy=False).T
203+
else:
204+
result = np.array(result, copy=False).T
205+
if (
206+
result.dtype != values.dtype
207+
and (result == result.astype(values.dtype, copy=False)).all()
208+
):
209+
# e.g. values is integer dtype and result is floating dtype,
210+
# only cast back to integer dtype if result values are all-integer.
211+
result = result.astype(values.dtype, copy=False)
193212
return result
194213
else:
195214
return np.percentile(

pandas/core/arrays/_mixins.py

+3-10
Original file line numberDiff line numberDiff line change
@@ -473,21 +473,14 @@ def _quantile(
473473
) -> NDArrayBackedExtensionArrayT:
474474
# TODO: disable for Categorical if not ordered?
475475

476-
# asarray needed for Sparse, see GH#24600
477476
mask = np.asarray(self.isna())
478-
mask = np.atleast_2d(mask)
479-
480-
arr = np.atleast_2d(self._ndarray)
477+
arr = self._ndarray
481478
fill_value = self._internal_fill_value
482479

483480
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
484-
res_values = self._cast_quantile_result(res_values)
485-
result = self._from_backing_data(res_values)
486-
if self.ndim == 1:
487-
assert result.shape == (1, len(qs)), result.shape
488-
result = result[0]
489481

490-
return result
482+
res_values = self._cast_quantile_result(res_values)
483+
return self._from_backing_data(res_values)
491484

492485
# TODO: see if we can share this with other dispatch-wrapping methods
493486
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:

pandas/core/arrays/base.py

+2-15
Original file line numberDiff line numberDiff line change
@@ -1586,25 +1586,12 @@ def _quantile(
15861586
-------
15871587
same type as self
15881588
"""
1589-
# asarray needed for Sparse, see GH#24600
15901589
mask = np.asarray(self.isna())
1591-
mask = np.atleast_2d(mask)
1592-
1593-
arr = np.atleast_2d(np.asarray(self))
1590+
arr = np.asarray(self)
15941591
fill_value = np.nan
15951592

15961593
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
1597-
1598-
if self.ndim == 2:
1599-
# i.e. DatetimeArray
1600-
result = type(self)._from_sequence(res_values)
1601-
1602-
else:
1603-
# shape[0] should be 1 as long as EAs are 1D
1604-
assert res_values.shape == (1, len(qs)), res_values.shape
1605-
result = type(self)._from_sequence(res_values[0])
1606-
1607-
return result
1594+
return type(self)._from_sequence(res_values)
16081595

16091596
def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT:
16101597
"""

pandas/core/arrays/categorical.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
351351
# For comparisons, so that numpy uses our implementation of the compare
352352
# ops, which raise
353353
__array_priority__ = 1000
354-
_internal_fill_value = -1
355354
# tolist is not actually deprecated, just suppressed in the __dir__
356355
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
357356
_typ = "categorical"
@@ -476,6 +475,13 @@ def dtype(self) -> CategoricalDtype:
476475
"""
477476
return self._dtype
478477

478+
@property
479+
def _internal_fill_value(self) -> int:
480+
# using the specific numpy integer instead of python int to get
481+
# the correct dtype back from _quantile in the all-NA case
482+
dtype = self._ndarray.dtype
483+
return dtype.type(-1)
484+
479485
@property
480486
def _constructor(self) -> type[Categorical]:
481487
return Categorical
@@ -2300,7 +2306,7 @@ def unique(self):
23002306

23012307
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
23022308
# make sure we have correct itemsize for resulting codes
2303-
res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
2309+
assert res_values.dtype == self._ndarray.dtype
23042310
return res_values
23052311

23062312
def equals(self, other: object) -> bool:

pandas/core/arrays/masked.py

+21-20
Original file line numberDiff line numberDiff line change
@@ -958,7 +958,7 @@ def equals(self, other) -> bool:
958958

959959
def _quantile(
960960
self: BaseMaskedArrayT, qs: npt.NDArray[np.float64], interpolation: str
961-
) -> BaseMaskedArrayT:
961+
) -> BaseMaskedArray:
962962
"""
963963
Dispatch to quantile_with_mask, needed because we do not have
964964
_from_factorized.
@@ -967,29 +967,30 @@ def _quantile(
967967
-----
968968
We assume that all impacted cases are 1D-only.
969969
"""
970-
mask = np.atleast_2d(np.asarray(self.isna()))
971-
npvalues: np.ndarray = np.atleast_2d(np.asarray(self))
972-
973970
res = quantile_with_mask(
974-
npvalues,
975-
mask=mask,
976-
fill_value=self.dtype.na_value,
971+
self._data,
972+
mask=self._mask,
973+
# TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
974+
# instead of np.nan
975+
fill_value=np.nan,
977976
qs=qs,
978977
interpolation=interpolation,
979978
)
980-
assert res.ndim == 2
981-
assert res.shape[0] == 1
982-
res = res[0]
983-
try:
984-
out = type(self)._from_sequence(res, dtype=self.dtype)
985-
except TypeError:
986-
# GH#42626: not able to safely cast Int64
987-
# for floating point output
988-
# error: Incompatible types in assignment (expression has type
989-
# "ndarray[Any, dtype[floating[_64Bit]]]", variable has type
990-
# "BaseMaskedArrayT")
991-
out = np.asarray(res, dtype=np.float64) # type: ignore[assignment]
992-
return out
979+
980+
if self._hasna:
981+
# Our result mask is all-False unless we are all-NA, in which
982+
# case it is all-True.
983+
if self.ndim == 2:
984+
# I think this should be out_mask=self.isna().all(axis=1)
985+
# but am holding off until we have tests
986+
raise NotImplementedError
987+
elif self.isna().all():
988+
out_mask = np.ones(res.shape, dtype=bool)
989+
else:
990+
out_mask = np.zeros(res.shape, dtype=bool)
991+
else:
992+
out_mask = np.zeros(res.shape, dtype=bool)
993+
return self._maybe_mask_result(res, mask=out_mask)
993994

994995
# ------------------------------------------------------------------
995996
# Reductions

pandas/core/arrays/period.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -698,11 +698,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
698698
return result.view(self.dtype) # type: ignore[return-value]
699699
return super().fillna(value=value, method=method, limit=limit)
700700

701-
# TODO: alternately could override _quantile like searchsorted
702-
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
703-
# quantile_with_mask may return float64 instead of int64, in which
704-
# case we need to cast back
705-
return res_values.astype(np.int64, copy=False)
701+
def _quantile(
702+
self: PeriodArray,
703+
qs: npt.NDArray[np.float64],
704+
interpolation: str,
705+
) -> PeriodArray:
706+
# dispatch to DatetimeArray implementation
707+
dtres = self.view("M8[ns]")._quantile(qs, interpolation)
708+
# error: Incompatible return value type (got "Union[ExtensionArray,
709+
# ndarray[Any, Any]]", expected "PeriodArray")
710+
return dtres.view(self.dtype) # type: ignore[return-value]
706711

707712
# ------------------------------------------------------------------
708713
# Arithmetic Methods

pandas/core/arrays/sparse/array.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575

7676
from pandas.core import arraylike
7777
import pandas.core.algorithms as algos
78+
from pandas.core.array_algos.quantile import quantile_with_mask
7879
from pandas.core.arraylike import OpsMixin
7980
from pandas.core.arrays import ExtensionArray
8081
from pandas.core.arrays.sparse.dtype import SparseDtype
@@ -890,10 +891,27 @@ def value_counts(self, dropna: bool = True) -> Series:
890891
return Series(counts, index=keys)
891892

892893
def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):
894+
895+
if self._null_fill_value or self.sp_index.ngaps == 0:
896+
# We can avoid densifying
897+
npvalues = self.sp_values
898+
mask = np.zeros(npvalues.shape, dtype=bool)
899+
else:
900+
npvalues = self.to_numpy()
901+
mask = self.isna()
902+
903+
fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
904+
res_values = quantile_with_mask(
905+
npvalues,
906+
mask,
907+
fill_value,
908+
qs,
909+
interpolation,
910+
)
911+
893912
# Special case: the returned array isn't _really_ sparse, so we don't
894913
# wrap it in a SparseArray
895-
result = super()._quantile(qs, interpolation)
896-
return np.asarray(result)
914+
return res_values
897915

898916
# --------
899917
# Indexing

pandas/tests/frame/methods/test_quantile.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -665,9 +665,14 @@ def test_quantile_ea(self, obj, index):
665665
qs = [0.5, 0, 1]
666666
result = self.compute_quantile(obj, qs)
667667

668+
exp_dtype = index.dtype
669+
if index.dtype == "Int64":
670+
# match non-nullable casting behavior
671+
exp_dtype = "Float64"
672+
668673
# expected here assumes len(index) == 9
669674
expected = Series(
670-
[index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A"
675+
[index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A"
671676
)
672677
expected = type(obj)(expected)
673678

@@ -712,6 +717,8 @@ def test_quantile_ea_all_na(self, obj, index):
712717

713718
expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
714719
expected = Series(expected, index=qs, name="A")
720+
if expected.dtype == "Int64":
721+
expected = expected.astype("Float64")
715722
expected = type(obj)(expected)
716723
tm.assert_equal(result, expected)
717724

@@ -726,7 +733,11 @@ def test_quantile_ea_scalar(self, obj, index):
726733
qs = 0.5
727734
result = self.compute_quantile(obj, qs)
728735

729-
expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5)
736+
exp_dtype = index.dtype
737+
if index.dtype == "Int64":
738+
exp_dtype = "Float64"
739+
740+
expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
730741
if isinstance(obj, Series):
731742
expected = expected["A"]
732743
assert result == expected

pandas/tests/series/methods/test_quantile.py

+2
Original file line numberDiff line numberDiff line change
@@ -222,4 +222,6 @@ def test_quantile_empty(self):
222222
def test_quantile_dtypes(self, dtype):
223223
result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25))
224224
expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25))
225+
if dtype == "Int64":
226+
expected = expected.astype("Float64")
225227
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)