Skip to content

BUG: MaskedArray._quantile match non-nullable behavior #46282

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion pandas/core/array_algos/quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ def quantile_with_mask(

Quantile is computed along axis=1.
"""
assert values.shape == mask.shape
if values.ndim == 1:
# unsqueeze, operate, re-squeeze
values = np.atleast_2d(values)
mask = np.atleast_2d(mask)
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
return res_values[0]

assert values.ndim == 2

is_empty = values.shape[1] == 0
Expand Down Expand Up @@ -189,7 +197,18 @@ def _nanpercentile(
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
for (val, m) in zip(list(values), list(mask))
]
result = np.array(result, dtype=values.dtype, copy=False).T
if values.dtype.kind == "f":
# preserve itemsize
result = np.array(result, dtype=values.dtype, copy=False).T
else:
result = np.array(result, copy=False).T
if (
result.dtype != values.dtype
and (result == result.astype(values.dtype, copy=False)).all()
):
# e.g. values is integer dtype and result is floating dtype,
# only cast back to integer dtype if result values are all-integer.
result = result.astype(values.dtype, copy=False)
return result
else:
return np.percentile(
Expand Down
13 changes: 3 additions & 10 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,21 +473,14 @@ def _quantile(
) -> NDArrayBackedExtensionArrayT:
# TODO: disable for Categorical if not ordered?

# asarray needed for Sparse, see GH#24600
mask = np.asarray(self.isna())
mask = np.atleast_2d(mask)

arr = np.atleast_2d(self._ndarray)
arr = self._ndarray
fill_value = self._internal_fill_value

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
res_values = self._cast_quantile_result(res_values)
result = self._from_backing_data(res_values)
if self.ndim == 1:
assert result.shape == (1, len(qs)), result.shape
result = result[0]

return result
res_values = self._cast_quantile_result(res_values)
return self._from_backing_data(res_values)

# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
Expand Down
17 changes: 2 additions & 15 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1586,25 +1586,12 @@ def _quantile(
-------
same type as self
"""
# asarray needed for Sparse, see GH#24600
mask = np.asarray(self.isna())
mask = np.atleast_2d(mask)

arr = np.atleast_2d(np.asarray(self))
arr = np.asarray(self)
fill_value = np.nan

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)

if self.ndim == 2:
# i.e. DatetimeArray
result = type(self)._from_sequence(res_values)

else:
# shape[0] should be 1 as long as EAs are 1D
assert res_values.shape == (1, len(qs)), res_values.shape
result = type(self)._from_sequence(res_values[0])

return result
return type(self)._from_sequence(res_values)

def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT:
"""
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
# For comparisons, so that numpy uses our implementation of the compare
# ops, which raise
__array_priority__ = 1000
_internal_fill_value = -1
# tolist is not actually deprecated, just suppressed in the __dir__
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
_typ = "categorical"
Expand Down Expand Up @@ -476,6 +475,13 @@ def dtype(self) -> CategoricalDtype:
"""
return self._dtype

# Fill value -1, built with the scalar type of self._ndarray's dtype rather
# than as a plain Python int.
# NOTE(review): the annotation says ``int`` but the returned object is
# whatever ``dtype.type(-1)`` produces (presumably a numpy integer scalar).
@property
def _internal_fill_value(self) -> int:
# using the specific numpy integer instead of python int to get
# the correct dtype back from _quantile in the all-NA case
dtype = self._ndarray.dtype
return dtype.type(-1)

@property
def _constructor(self) -> type[Categorical]:
return Categorical
Expand Down Expand Up @@ -2300,7 +2306,7 @@ def unique(self):

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# make sure we have correct itemsize for resulting codes
res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
assert res_values.dtype == self._ndarray.dtype
return res_values

def equals(self, other: object) -> bool:
Expand Down
41 changes: 21 additions & 20 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ def equals(self, other) -> bool:

def _quantile(
self: BaseMaskedArrayT, qs: npt.NDArray[np.float64], interpolation: str
) -> BaseMaskedArrayT:
) -> BaseMaskedArray:
"""
Dispatch to quantile_with_mask, needed because we do not have
_from_factorized.
Expand All @@ -967,29 +967,30 @@ def _quantile(
-----
We assume that all impacted cases are 1D-only.
"""
mask = np.atleast_2d(np.asarray(self.isna()))
npvalues: np.ndarray = np.atleast_2d(np.asarray(self))

res = quantile_with_mask(
npvalues,
mask=mask,
fill_value=self.dtype.na_value,
self._data,
mask=self._mask,
# TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
# instead of np.nan
fill_value=np.nan,
qs=qs,
interpolation=interpolation,
)
assert res.ndim == 2
assert res.shape[0] == 1
res = res[0]
try:
out = type(self)._from_sequence(res, dtype=self.dtype)
except TypeError:
# GH#42626: not able to safely cast Int64
# for floating point output
# error: Incompatible types in assignment (expression has type
# "ndarray[Any, dtype[floating[_64Bit]]]", variable has type
# "BaseMaskedArrayT")
out = np.asarray(res, dtype=np.float64) # type: ignore[assignment]
return out

if self._hasna:
# Our result mask is all-False unless we are all-NA, in which
# case it is all-True.
if self.ndim == 2:
# I think this should be out_mask=self.isna().all(axis=1)
# but am holding off until we have tests
raise NotImplementedError
elif self.isna().all():
out_mask = np.ones(res.shape, dtype=bool)
else:
out_mask = np.zeros(res.shape, dtype=bool)
else:
out_mask = np.zeros(res.shape, dtype=bool)
return self._maybe_mask_result(res, mask=out_mask)

# ------------------------------------------------------------------
# Reductions
Expand Down
15 changes: 10 additions & 5 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,11 +698,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)

# TODO: alternately could override _quantile like searchsorted
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# quantile_with_mask may return float64 instead of int64, in which
# case we need to cast back
return res_values.astype(np.int64, copy=False)
# Compute quantiles by viewing this array as "M8[ns]", calling that view's
# _quantile with the same qs/interpolation, and viewing the result back as
# self.dtype.
def _quantile(
self: PeriodArray,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> PeriodArray:
# dispatch to DatetimeArray implementation
dtres = self.view("M8[ns]")._quantile(qs, interpolation)
# error: Incompatible return value type (got "Union[ExtensionArray,
# ndarray[Any, Any]]", expected "PeriodArray")
return dtres.view(self.dtype) # type: ignore[return-value]

# ------------------------------------------------------------------
# Arithmetic Methods
Expand Down
22 changes: 20 additions & 2 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@

from pandas.core import arraylike
import pandas.core.algorithms as algos
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse.dtype import SparseDtype
Expand Down Expand Up @@ -890,10 +891,27 @@ def value_counts(self, dropna: bool = True) -> Series:
return Series(counts, index=keys)

def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):

if self._null_fill_value or self.sp_index.ngaps == 0:
# We can avoid densifying
npvalues = self.sp_values
mask = np.zeros(npvalues.shape, dtype=bool)
else:
npvalues = self.to_numpy()
mask = self.isna()

fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
res_values = quantile_with_mask(
npvalues,
mask,
fill_value,
qs,
interpolation,
)

# Special case: the returned array isn't _really_ sparse, so we don't
# wrap it in a SparseArray
result = super()._quantile(qs, interpolation)
return np.asarray(result)
return res_values

# --------
# Indexing
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,9 +665,14 @@ def test_quantile_ea(self, obj, index):
qs = [0.5, 0, 1]
result = self.compute_quantile(obj, qs)

exp_dtype = index.dtype
if index.dtype == "Int64":
# match non-nullable casting behavior
exp_dtype = "Float64"

# expected here assumes len(index) == 9
expected = Series(
[index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A"
[index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A"
)
expected = type(obj)(expected)

Expand Down Expand Up @@ -712,6 +717,8 @@ def test_quantile_ea_all_na(self, obj, index):

expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
expected = Series(expected, index=qs, name="A")
if expected.dtype == "Int64":
expected = expected.astype("Float64")
expected = type(obj)(expected)
tm.assert_equal(result, expected)

Expand All @@ -726,7 +733,11 @@ def test_quantile_ea_scalar(self, obj, index):
qs = 0.5
result = self.compute_quantile(obj, qs)

expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5)
exp_dtype = index.dtype
if index.dtype == "Int64":
exp_dtype = "Float64"

expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
if isinstance(obj, Series):
expected = expected["A"]
assert result == expected
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/series/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,6 @@ def test_quantile_empty(self):
def test_quantile_dtypes(self, dtype):
result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25))
expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25))
if dtype == "Int64":
expected = expected.astype("Float64")
tm.assert_series_equal(result, expected)