Skip to content

Commit 3f52f4e

Browse files
authored
BUG: PandasArray._quantile when empty (#46110)
1 parent c3abb52 commit 3f52f4e

File tree

9 files changed

+59
-7
lines changed

9 files changed

+59
-7
lines changed

pandas/core/array_algos/quantile.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import numpy as np
44

5-
from pandas._libs import lib
65
from pandas._typing import (
76
ArrayLike,
87
Scalar,
@@ -128,7 +127,10 @@ def _nanpercentile_1d(
128127
values = values[~mask]
129128

130129
if len(values) == 0:
131-
return np.array([na_value] * len(qs), dtype=values.dtype)
130+
# Can't pass dtype=values.dtype here because we might have na_value=np.nan
131+
# with values.dtype=int64; see test_quantile_empty
132+
# equiv: 'np.array([na_value] * len(qs))' but much faster
133+
return np.full(len(qs), na_value)
132134

133135
return np.percentile(values, qs, **{np_percentile_argname: interpolation})
134136

@@ -173,7 +175,7 @@ def _nanpercentile(
173175
# have float result at this point, not i8
174176
return result.astype(values.dtype)
175177

176-
if not lib.is_scalar(mask) and mask.any():
178+
if mask.any():
177179
# Caller is responsible for ensuring the mask shape matches
178180
assert mask.shape == values.shape
179181
result = [

pandas/core/arrays/_mixins.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,12 @@ class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
9999

100100
_ndarray: np.ndarray
101101

102+
# scalar used to denote NA value inside our self._ndarray, e.g. -1
103+
# for Categorical, iNaT for Period. Outside of object dtype,
104+
# self.isna() should mark exactly the locations in self._ndarray with
105+
# _internal_fill_value.
106+
_internal_fill_value: Any
107+
102108
def _box_func(self, x):
103109
"""
104110
Wrap numpy type in our dtype.type if necessary.
@@ -463,18 +469,25 @@ def _quantile(
463469
mask = np.atleast_2d(mask)
464470

465471
arr = np.atleast_2d(self._ndarray)
466-
# TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]?
467-
fill_value = self._values_for_factorize()[1]
472+
fill_value = self._internal_fill_value
468473

469474
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
470-
471-
result = type(self)._from_factorized(res_values, self)
475+
res_values = self._cast_quantile_result(res_values)
476+
result = self._from_backing_data(res_values)
472477
if self.ndim == 1:
473478
assert result.shape == (1, len(qs)), result.shape
474479
result = result[0]
475480

476481
return result
477482

483+
# TODO: see if we can share this with other dispatch-wrapping methods
484+
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
485+
"""
486+
Cast the result of quantile_with_mask to an appropriate dtype
487+
to pass to _from_backing_data in _quantile.
488+
"""
489+
return res_values
490+
478491
# ------------------------------------------------------------------------
479492
# numpy-like methods
480493

pandas/core/arrays/categorical.py

+6
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
356356
# For comparisons, so that numpy uses our implementation of the compare
357357
# ops, which raise
358358
__array_priority__ = 1000
359+
_internal_fill_value = -1
359360
# tolist is not actually deprecated, just suppressed in the __dir__
360361
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
361362
_typ = "categorical"
@@ -2316,6 +2317,11 @@ def _from_factorized(cls, uniques, original):
23162317
original.categories.take(uniques), dtype=original.dtype
23172318
)
23182319

2320+
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
2321+
# make sure we have correct itemsize for resulting codes
2322+
res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
2323+
return res_values
2324+
23192325
def equals(self, other: object) -> bool:
23202326
"""
23212327
Returns True if categorical arrays are equal.

pandas/core/arrays/datetimes.py

+1
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
191191

192192
_typ = "datetimearray"
193193
_scalar_type = Timestamp
194+
_internal_fill_value = np.datetime64("NaT", "ns")
194195
_recognized_scalars = (datetime, np.datetime64)
195196
_is_recognized_dtype = is_datetime64_any_dtype
196197
_infer_matches = ("datetime", "datetime64", "date")

pandas/core/arrays/numpy_.py

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class PandasArray(
6161
__array_priority__ = 1000
6262
_ndarray: np.ndarray
6363
_dtype: PandasDtype
64+
_internal_fill_value = np.nan
6465

6566
# ------------------------------------------------------------------------
6667
# Constructors

pandas/core/arrays/period.py

+7
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ class PeriodArray(dtl.DatelikeOps):
163163
__array_priority__ = 1000
164164
_typ = "periodarray" # ABCPeriodArray
165165
_scalar_type = Period
166+
_internal_fill_value = np.int64(iNaT)
166167
_recognized_scalars = (Period,)
167168
_is_recognized_dtype = is_period_dtype
168169
_infer_matches = ("period",)
@@ -697,6 +698,12 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
697698
return result.view(self.dtype) # type: ignore[return-value]
698699
return super().fillna(value=value, method=method, limit=limit)
699700

701+
# TODO: alternately could override _quantile like searchsorted
702+
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
703+
# quantile_with_mask may return float64 instead of int64, in which
704+
# case we need to cast back
705+
return res_values.astype(np.int64, copy=False)
706+
700707
# ------------------------------------------------------------------
701708
# Arithmetic Methods
702709

pandas/core/arrays/timedeltas.py

+1
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class TimedeltaArray(dtl.TimelikeOps):
127127

128128
_typ = "timedeltaarray"
129129
_scalar_type = Timedelta
130+
_internal_fill_value = np.timedelta64("NaT", "ns")
130131
_recognized_scalars = (timedelta, np.timedelta64, Tick)
131132
_is_recognized_dtype = is_timedelta64_dtype
132133
_infer_matches = ("timedelta", "timedelta64")

pandas/tests/arrays/categorical/test_analytics.py

+10
Original file line numberDiff line numberDiff line change
@@ -363,3 +363,13 @@ def test_validate_inplace_raises(self, value):
363363

364364
with pytest.raises(ValueError, match=msg):
365365
cat.sort_values(inplace=value)
366+
367+
def test_quantile_empty(self):
368+
# make sure we have correct itemsize on resulting codes
369+
cat = Categorical(["A", "B"])
370+
idx = Index([0.0, 0.5])
371+
result = cat[:0]._quantile(idx, interpolation="linear")
372+
assert result._codes.dtype == np.int8
373+
374+
expected = cat.take([-1, -1], allow_fill=True)
375+
tm.assert_extension_array_equal(result, expected)

pandas/tests/arrays/numpy_/test_numpy.py

+11
Original file line numberDiff line numberDiff line change
@@ -298,3 +298,14 @@ def test_setitem_preserves_views():
298298
arr[-1] = 2.5
299299
view1[-1] = 5
300300
assert arr[-1] == 5
301+
302+
303+
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
304+
def test_quantile_empty(dtype):
305+
# we should get back np.nans, not -1s
306+
arr = PandasArray(np.array([], dtype=dtype))
307+
idx = pd.Index([0.0, 0.5])
308+
309+
result = arr._quantile(idx, interpolation="linear")
310+
expected = PandasArray(np.array([np.nan, np.nan]))
311+
tm.assert_extension_array_equal(result, expected)

0 commit comments

Comments
 (0)