Skip to content

BUG: MaskedArray._quantile match non-nullable behavior #46282

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/core/array_algos/quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ def quantile_with_mask(

Quantile is computed along axis=1.
"""
assert values.shape == mask.shape
if values.ndim == 1:
# unsqueeze, operate, re-squeeze
values = np.atleast_2d(values)
mask = np.atleast_2d(mask)
res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
return res_values[0]

assert values.ndim == 2

is_empty = values.shape[1] == 0
Expand Down
13 changes: 3 additions & 10 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,21 +476,14 @@ def _quantile(
) -> NDArrayBackedExtensionArrayT:
# TODO: disable for Categorical if not ordered?

# asarray needed for Sparse, see GH#24600
mask = np.asarray(self.isna())
mask = np.atleast_2d(mask)

arr = np.atleast_2d(self._ndarray)
arr = self._ndarray
fill_value = self._internal_fill_value

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
res_values = self._cast_quantile_result(res_values)
result = self._from_backing_data(res_values)
if self.ndim == 1:
assert result.shape == (1, len(qs)), result.shape
result = result[0]

return result
res_values = self._cast_quantile_result(res_values)
return self._from_backing_data(res_values)

# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
Expand Down
17 changes: 2 additions & 15 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1586,25 +1586,12 @@ def _quantile(
-------
same type as self
"""
# asarray needed for Sparse, see GH#24600
mask = np.asarray(self.isna())
mask = np.atleast_2d(mask)

arr = np.atleast_2d(np.asarray(self))
arr = np.asarray(self)
fill_value = np.nan

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)

if self.ndim == 2:
# i.e. DatetimeArray
result = type(self)._from_sequence(res_values)

else:
# shape[0] should be 1 as long as EAs are 1D
assert res_values.shape == (1, len(qs)), res_values.shape
result = type(self)._from_sequence(res_values[0])

return result
return type(self)._from_sequence(res_values)

def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT:
"""
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
# For comparisons, so that numpy uses our implementation if the compare
# ops, which raise
__array_priority__ = 1000
_internal_fill_value = -1
# tolist is not actually deprecated, just suppressed in the __dir__
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
_typ = "categorical"
Expand Down Expand Up @@ -477,6 +476,13 @@ def dtype(self) -> CategoricalDtype:
"""
return self._dtype

@property
def _internal_fill_value(self) -> int:
# using the specific numpy integer instead of python int to get
# the correct dtype back from _quantile in the all-NA case
dtype = self._ndarray.dtype
return dtype.type(-1)

@property
def _constructor(self) -> type[Categorical]:
return Categorical
Expand Down Expand Up @@ -2303,7 +2309,7 @@ def _values_for_factorize(self):

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# make sure we have correct itemsize for resulting codes
res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
assert res_values.dtype == self._ndarray.dtype
return res_values

def equals(self, other: object) -> bool:
Expand Down
41 changes: 21 additions & 20 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,7 @@ def equals(self, other) -> bool:

def _quantile(
self: BaseMaskedArrayT, qs: npt.NDArray[np.float64], interpolation: str
) -> BaseMaskedArrayT:
) -> BaseMaskedArray:
"""
Dispatch to quantile_with_mask, needed because we do not have
_from_factorized.
Expand All @@ -957,29 +957,30 @@ def _quantile(
-----
We assume that all impacted cases are 1D-only.
"""
mask = np.atleast_2d(np.asarray(self.isna()))
npvalues: np.ndarray = np.atleast_2d(np.asarray(self))

res = quantile_with_mask(
npvalues,
mask=mask,
fill_value=self.dtype.na_value,
self._data,
mask=self._mask,
# TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
# instead of np.nan
fill_value=np.nan,
qs=qs,
interpolation=interpolation,
)
assert res.ndim == 2
assert res.shape[0] == 1
res = res[0]
try:
out = type(self)._from_sequence(res, dtype=self.dtype)
except TypeError:
# GH#42626: not able to safely cast Int64
# for floating point output
# error: Incompatible types in assignment (expression has type
# "ndarray[Any, dtype[floating[_64Bit]]]", variable has type
# "BaseMaskedArrayT")
out = np.asarray(res, dtype=np.float64) # type: ignore[assignment]
return out

if self._hasna:
# Our result mask is all-False unless we are all-NA, in which
# case it is all-True.
if self.ndim == 2:
# I think this should be out_mask=self.isna().all(axis=1)
# but am holding off until we have tests
raise NotImplementedError
elif self.isna().all():
out_mask = np.ones(res.shape, dtype=bool)
else:
out_mask = np.zeros(res.shape, dtype=bool)
else:
out_mask = np.zeros(res.shape, dtype=bool)
return self._maybe_mask_result(res, mask=out_mask)

# ------------------------------------------------------------------
# Reductions
Expand Down
15 changes: 10 additions & 5 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,11 +699,16 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)

# TODO: alternately could override _quantile like searchsorted
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# quantile_with_mask may return float64 instead of int64, in which
# case we need to cast back
return res_values.astype(np.int64, copy=False)
def _quantile(
self: PeriodArray,
qs: npt.NDArray[np.float64],
interpolation: str,
) -> PeriodArray:
# dispatch to DatetimeArray implementation
dtres = self.view("M8[ns]")._quantile(qs, interpolation)
# error: Incompatible return value type (got "Union[ExtensionArray,
# ndarray[Any, Any]]", expected "PeriodArray")
return dtres.view(self.dtype) # type: ignore[return-value]

# ------------------------------------------------------------------
# Arithmetic Methods
Expand Down
22 changes: 20 additions & 2 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@

from pandas.core import arraylike
import pandas.core.algorithms as algos
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse.dtype import SparseDtype
Expand Down Expand Up @@ -891,10 +892,27 @@ def value_counts(self, dropna: bool = True) -> Series:
return Series(counts, index=keys)

def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str):

if self._null_fill_value or self.sp_index.ngaps == 0:
# We can avoid densifying
npvalues = self.sp_values
mask = np.zeros(npvalues.shape, dtype=bool)
else:
npvalues = self.to_numpy()
mask = self.isna()

fill_value = na_value_for_dtype(npvalues.dtype, compat=False)
res_values = quantile_with_mask(
npvalues,
mask,
fill_value,
qs,
interpolation,
)

# Special case: the returned array isn't _really_ sparse, so we don't
# wrap it in a SparseArray
result = super()._quantile(qs, interpolation)
return np.asarray(result)
return res_values

# --------
# Indexing
Expand Down
13 changes: 11 additions & 2 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,9 +643,14 @@ def test_quantile_ea(self, obj, index):
qs = [0.5, 0, 1]
result = self.compute_quantile(obj, qs)

exp_dtype = index.dtype
if index.dtype == "Int64":
# match non-nullable casting behavior
exp_dtype = "Float64"

# expected here assumes len(index) == 9
expected = Series(
[index[4], index[0], index[-1]], dtype=index.dtype, index=qs, name="A"
[index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A"
)
expected = type(obj)(expected)

Expand Down Expand Up @@ -704,7 +709,11 @@ def test_quantile_ea_scalar(self, obj, index):
qs = 0.5
result = self.compute_quantile(obj, qs)

expected = Series({"A": index[4]}, dtype=index.dtype, name=0.5)
exp_dtype = index.dtype
if index.dtype == "Int64":
exp_dtype = "Float64"

expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
if isinstance(obj, Series):
expected = expected["A"]
assert result == expected
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/io/formats/style/test_highlight.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,20 @@ def test_highlight_between_inclusive(styler, inclusive, expected):
)
def test_highlight_quantile(styler, kwargs):
expected = {
(0, 1): [("background-color", "yellow")],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
(0, 1): [("background-color", "yellow")],

(2, 0): [("background-color", "yellow")],
(2, 1): [("background-color", "yellow")],
}
if styler.data.dtypes["B"] != "Int64":
expected.pop((0, 1))
else:
if kwargs.get("axis", -1) is None:
expected.pop((0, 1))
elif kwargs.get("q_left", -1) == 0:
expected.pop((0, 1))
elif "subset" in kwargs:
expected.pop((0, 1))
Copy link
Contributor

@attack68 attack68 Mar 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if styler.data.dtypes["B"] != "Int64":
expected.pop((0, 1))
else:
if kwargs.get("axis", -1) is None:
expected.pop((0, 1))
elif kwargs.get("q_left", -1) == 0:
expected.pop((0, 1))
elif "subset" in kwargs:
expected.pop((0, 1))
if kwargs.get("axis", -1) == 0 and styler.data.dtypes["B"] == "Int64":
# pd.NA is interpreted as zero value in the quantile calculation meaning
# of values {1, <NA>, 2}, {1, 2} are returned within the [0.5, 1] quantile.
expected[(0, 1)] = [("background-color", "yellow")]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this fixes the test, although I would not intuitively expect the results noted in the comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[[ the alternative fix is to change the pytest fixture column values from {1, pd.NA, 2} to {1, pd.Na, 3}, since then the value 1 is not caught within the [0.5, 1] quantile where pd.NA = 0, and conveniently all other tests in the module still pass ]]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think the trouble is in the quantile behavior or in the Styler?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, further investigation shows that this is a rounding issue when preserving the dtype in the quantile function. With

>>> df
      A     B
0     0     1
1  <NA>  <NA>
2    10     2

Then calling df.quantile(axis=0, numeric_only=False, q=[0.5, 1], interpolation="linear") gives under dtype="Float64":

        A    B
0.5   5.0  1.5
1.0  10.0  2.0
dtypes=Float64

Whereas under this PR using dtype="Int64" the values are:

      A  B
0.5   5  1
1.0  10  2
dtypes=Int64

The value of 1 is included in the second, probably due to calling int(1.5), and so the styler highlighting is changed for cell index (0,1).

Prior to this PR the dtype of column B was upcast from "Int64" to "float64" and the leftmost quantile was calculated as 1.5. Since the interpolation style is linear and that is specifically documented to be fraction, I think it would be worth expanding the docs how dtypes are preserved and can affect calculations, or considering upcasting to "Float64".

It is possible to fix all tests in Styler by changing the fixture's column B to have 4 instead of 2 so that the rounding of the 50% quantile at 2.5 to 2 still excludes cell (0,1).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks @attack68 I think I've got a handle on whats going wrong, and it tentatively looks like its in the quantile code. ill try the fix i have in mind and let you know if that doesnt work out


result = styler.highlight_quantile(**kwargs)._compute().ctx
assert result == expected

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/series/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,6 @@ def test_quantile_empty(self):
def test_quantile_dtypes(self, dtype):
result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25))
expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25))
if dtype == "Int64":
expected = expected.astype("Float64")
tm.assert_series_equal(result, expected)