From 49b6d38b651b9e189ae17a5e3739eee549d2d07a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Feb 2022 20:27:08 -0800 Subject: [PATCH 1/2] BUG: PandasArray._quantile when empty --- pandas/core/array_algos/quantile.py | 7 ++++--- pandas/core/arrays/_mixins.py | 21 +++++++++++++++---- pandas/core/arrays/categorical.py | 6 ++++++ pandas/core/arrays/datetimes.py | 1 + pandas/core/arrays/numpy_.py | 1 + pandas/core/arrays/period.py | 7 +++++++ pandas/core/arrays/timedeltas.py | 1 + .../arrays/categorical/test_analytics.py | 10 +++++++++ pandas/tests/arrays/test_numpy.py | 11 ++++++++++ 9 files changed, 58 insertions(+), 7 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 64cd43a3e77cb..131bd34c1af69 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -2,7 +2,6 @@ import numpy as np -from pandas._libs import lib from pandas._typing import ( ArrayLike, Scalar, @@ -128,7 +127,9 @@ def _nanpercentile_1d( values = values[~mask] if len(values) == 0: - return np.array([na_value] * len(qs), dtype=values.dtype) + # Can't pass dtype=values.dtype here bc we might have na_value=np.nan + # with values.dtype=int64 see test_quantile_empty + return np.array([na_value] * len(qs)) return np.percentile(values, qs, **{np_percentile_argname: interpolation}) @@ -173,7 +174,7 @@ def _nanpercentile( # have float result at this point, not i8 return result.astype(values.dtype) - if not lib.is_scalar(mask) and mask.any(): + if mask.any(): # Caller is responsible for ensuring mask shape match assert mask.shape == values.shape result = [ diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 3446d5fc43a65..cb5df7bc541e7 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -99,6 +99,12 @@ class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray): _ndarray: np.ndarray + # scalar used to denote NA value inside our self._ndarray, e.g. -1 + # for Categorical, iNaT for Period. Outside of object dtype, + # self.isna() should be exactly locations in self._ndarray with + # _internal_fill_value. + _internal_fill_value: Any + def _box_func(self, x): """ Wrap numpy type in our dtype.type if necessary. @@ -462,18 +468,25 @@ def _quantile( mask = np.atleast_2d(mask) arr = np.atleast_2d(self._ndarray) - # TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]? - fill_value = self._values_for_factorize()[1] + fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - result = type(self)._from_factorized(res_values, self) + res_values = self._cast_quantile_result(res_values) + result = self._from_backing_data(res_values) if self.ndim == 1: assert result.shape == (1, len(qs)), result.shape result = result[0] return result + # TODO: see if we can share this with other dispatch-wrapping methods + def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: + """ + Cast the result of quantile_with_mask to an appropriate dtype + to pass to _from_backing_data in _quantile. + """ + return res_values + # ------------------------------------------------------------------------ # numpy-like methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3d9d3ef8a2557..0ef79465293f4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -356,6 +356,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 + _internal_fill_value = -1 # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" @@ -2316,6 +2317,11 @@ def _from_factorized(cls, uniques, original): original.categories.take(uniques), dtype=original.dtype ) + def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: + # make sure we have correct itemsize for resulting codes + res_values = coerce_indexer_dtype(res_values, self.dtype.categories) + return res_values + def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e21f2e9d7b46e..01347401c67c5 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -191,6 +191,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _typ = "datetimearray" _scalar_type = Timestamp + _internal_fill_value = np.datetime64("NaT", "ns") _recognized_scalars = (datetime, np.datetime64) _is_recognized_dtype = is_datetime64_any_dtype _infer_matches = ("datetime", "datetime64", "date") diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 09511b88b029a..aa5b9150acb67 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -61,6 +61,7 @@ class PandasArray( __array_priority__ = 1000 _ndarray: np.ndarray _dtype: PandasDtype + _internal_fill_value = np.nan # ------------------------------------------------------------------------ # Constructors diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 6189584dff7f1..88f6c5e5024c8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -163,6 +163,7 @@ class PeriodArray(dtl.DatelikeOps): __array_priority__ = 1000 _typ = "periodarray" # ABCPeriodArray _scalar_type = Period + _internal_fill_value = np.int64(iNaT) _recognized_scalars = (Period,) _is_recognized_dtype = is_period_dtype _infer_matches = ("period",) @@ -697,6 +698,12 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray: return result.view(self.dtype) # type: ignore[return-value] return super().fillna(value=value, method=method, limit=limit) + # TODO: alternately could override _quantile like searchsorted + def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: + # quantile_with_mask may return float64 instead of int64, in which + # case we need to cast back + return res_values.astype(np.int64, copy=False) + # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index eb195e4facf4a..2d17536c07a6e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -127,6 +127,7 @@ class TimedeltaArray(dtl.TimelikeOps): _typ = "timedeltaarray" _scalar_type = Timedelta + _internal_fill_value = np.timedelta64("NaT", "ns") _recognized_scalars = (timedelta, np.timedelta64, Tick) _is_recognized_dtype = is_timedelta64_dtype _infer_matches = ("timedelta", "timedelta64") diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 59bb34f4f4c3e..6b16ffffe9328 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -363,3 +363,13 @@ def test_validate_inplace_raises(self, value): with pytest.raises(ValueError, match=msg): cat.sort_values(inplace=value) + + def test_quantile_empty(self): + # make sure we have correct itemsize on resulting codes + cat = Categorical(["A", "B"]) + idx = Index([0.0, 0.5]) + result = cat[:0]._quantile(idx, interpolation="linear") + assert result._codes.dtype == np.int8 + + expected = cat.take([-1, -1], allow_fill=True) + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index dc2a7fedda713..beef5522ed1ab 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -298,3 +298,14 @@ def test_setitem_preserves_views(): arr[-1] = 2.5 view1[-1] = 5 assert arr[-1] == 5 + + +@pytest.mark.parametrize("dtype", [np.int64, np.uint64]) +def test_quantile_empty(dtype): + # we should get back np.nans, not -1s + arr = PandasArray(np.array([], dtype=dtype)) + idx = pd.Index([0.0, 0.5]) + + result = arr._quantile(idx, interpolation="linear") + expected = PandasArray(np.array([np.nan, np.nan])) + tm.assert_extension_array_equal(result, expected) From 2e025500366815e394de00050f8140734eabfef3 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Feb 2022 15:42:35 -0800 Subject: [PATCH 2/2] PERF: use np.full --- pandas/core/array_algos/quantile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 131bd34c1af69..6bfc2b63448ae 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -129,7 +129,8 @@ def _nanpercentile_1d( if len(values) == 0: # Can't pass dtype=values.dtype here bc we might have na_value=np.nan # with values.dtype=int64 see test_quantile_empty - return np.array([na_value] * len(qs)) + # equiv: 'np.array([na_value] * len(qs))' but much faster + return np.full(len(qs), na_value) return np.percentile(values, qs, **{np_percentile_argname: interpolation})