Skip to content

Commit 28f274c

Browse files
authored
BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes (#54964)
* REF: separate out _nbins_to_bins
* Cast x to Index early
* BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes
* GH ref
1 parent a98be06 commit 28f274c

File tree

5 files changed

+92
-113
lines changed

5 files changed

+92
-113
lines changed

doc/source/whatsnew/v2.2.0.rst

+3
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,8 @@ Strings
288288
Interval
289289
^^^^^^^^
290290
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
291+
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
292+
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
291293
-
292294

293295
Indexing
@@ -349,6 +351,7 @@ Styler
349351

350352
Other
351353
^^^^^
354+
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
352355

353356
.. ***DO NOT USE THIS SECTION***
354357

pandas/core/indexes/base.py

+1-5
Original file line number | Diff line number | Diff line change
@@ -3938,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool:
39383938
if isinstance(self.dtype, IntervalDtype):
39393939
if isinstance(target.dtype, IntervalDtype):
39403940
return False
3941-
# See https://github.com/pandas-dev/pandas/issues/47772 the commented
3942-
# out code can be restored (instead of hardcoding `return True`)
3943-
# once that issue is fixed
39443941
# "Index" has no attribute "left"
3945-
# return self.left._should_compare(target) # type: ignore[attr-defined]
3946-
return True
3942+
return self.left._should_compare(target) # type: ignore[attr-defined]
39473943
return False
39483944

39493945
@final

pandas/core/reshape/tile.py

+65-103
Original file line number | Diff line number | Diff line change
@@ -17,10 +17,8 @@
1717
Timestamp,
1818
lib,
1919
)
20-
from pandas._libs.lib import infer_dtype
2120

2221
from pandas.core.dtypes.common import (
23-
DT64NS_DTYPE,
2422
ensure_platform_int,
2523
is_bool_dtype,
2624
is_integer,
@@ -243,7 +241,7 @@ def cut(
243241

244242
original = x
245243
x_idx = _preprocess_for_cut(x)
246-
x_idx, dtype = _coerce_to_type(x_idx)
244+
x_idx, _ = _coerce_to_type(x_idx)
247245

248246
if not np.iterable(bins):
249247
bins = _nbins_to_bins(x_idx, bins, right)
@@ -254,16 +252,8 @@ def cut(
254252

255253
else:
256254
bins = Index(bins)
257-
if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype):
258-
bins = np.asarray(bins, dtype=DT64NS_DTYPE)
259-
else:
260-
bins = np.asarray(bins)
261-
bins = _convert_bin_to_numeric_type(bins, dtype)
262-
263-
# GH 26045: cast to float64 to avoid an overflow
264-
if (np.diff(bins.astype("float64")) < 0).any():
255+
if not bins.is_monotonic_increasing:
265256
raise ValueError("bins must increase monotonically.")
266-
bins = Index(bins)
267257

268258
fac, bins = _bins_to_cuts(
269259
x_idx,
@@ -272,12 +262,11 @@ def cut(
272262
labels=labels,
273263
precision=precision,
274264
include_lowest=include_lowest,
275-
dtype=dtype,
276265
duplicates=duplicates,
277266
ordered=ordered,
278267
)
279268

280-
return _postprocess_for_cut(fac, bins, retbins, dtype, original)
269+
return _postprocess_for_cut(fac, bins, retbins, original)
281270

282271

283272
def qcut(
@@ -343,25 +332,22 @@ def qcut(
343332
"""
344333
original = x
345334
x_idx = _preprocess_for_cut(x)
346-
x_idx, dtype = _coerce_to_type(x_idx)
335+
x_idx, _ = _coerce_to_type(x_idx)
347336

348337
quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
349338

350-
x_np = np.asarray(x_idx)
351-
x_np = x_np[~np.isnan(x_np)]
352-
bins = np.quantile(x_np, quantiles)
339+
bins = x_idx.to_series().dropna().quantile(quantiles)
353340

354341
fac, bins = _bins_to_cuts(
355342
x_idx,
356343
Index(bins),
357344
labels=labels,
358345
precision=precision,
359346
include_lowest=True,
360-
dtype=dtype,
361347
duplicates=duplicates,
362348
)
363349

364-
return _postprocess_for_cut(fac, bins, retbins, dtype, original)
350+
return _postprocess_for_cut(fac, bins, retbins, original)
365351

366352

367353
def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
@@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
378364
rng = (x_idx.min(), x_idx.max())
379365
mn, mx = rng
380366

381-
if np.isinf(mn) or np.isinf(mx):
367+
is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
368+
x_idx.dtype, DatetimeTZDtype
369+
)
370+
371+
if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
382372
# GH#24314
383373
raise ValueError(
384374
"cannot specify integer `bins` when input data contains infinity"
385375
)
386376

387377
if mn == mx: # adjust end points before binning
388-
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
389-
mx += 0.001 * abs(mx) if mx != 0 else 0.001
390-
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
378+
if is_dt_or_td:
379+
# using seconds=1 is pretty arbitrary here
380+
td = Timedelta(seconds=1)
381+
# Use DatetimeArray/TimedeltaArray method instead of linspace
382+
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
383+
# has no attribute "_generate_range"
384+
bins = x_idx._values._generate_range( # type: ignore[union-attr]
385+
start=mn - td, end=mx + td, periods=nbins + 1, freq=None
386+
)
387+
else:
388+
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
389+
mx += 0.001 * abs(mx) if mx != 0 else 0.001
390+
391+
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
391392
else: # adjust end points after binning
392-
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
393+
if is_dt_or_td:
394+
# Use DatetimeArray/TimedeltaArray method instead of linspace
395+
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
396+
# has no attribute "_generate_range"
397+
bins = x_idx._values._generate_range( # type: ignore[union-attr]
398+
start=mn, end=mx, periods=nbins + 1, freq=None
399+
)
400+
else:
401+
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
393402
adj = (mx - mn) * 0.001 # 0.1% of the range
394403
if right:
395404
bins[0] -= adj
@@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
400409

401410

402411
def _bins_to_cuts(
403-
x: Index,
412+
x_idx: Index,
404413
bins: Index,
405414
right: bool = True,
406415
labels=None,
407416
precision: int = 3,
408417
include_lowest: bool = False,
409-
dtype: DtypeObj | None = None,
410418
duplicates: str = "raise",
411419
ordered: bool = True,
412420
):
@@ -422,7 +430,7 @@ def _bins_to_cuts(
422430

423431
if isinstance(bins, IntervalIndex):
424432
# we have a fast-path here
425-
ids = bins.get_indexer(x)
433+
ids = bins.get_indexer(x_idx)
426434
cat_dtype = CategoricalDtype(bins, ordered=True)
427435
result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
428436
return result, bins
@@ -437,12 +445,29 @@ def _bins_to_cuts(
437445
bins = unique_bins
438446

439447
side: Literal["left", "right"] = "left" if right else "right"
440-
ids = ensure_platform_int(bins.searchsorted(x, side=side))
448+
449+
try:
450+
ids = bins.searchsorted(x_idx, side=side)
451+
except TypeError as err:
452+
# e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
453+
# is integers
454+
if x_idx.dtype.kind == "m":
455+
raise ValueError("bins must be of timedelta64 dtype") from err
456+
elif x_idx.dtype.kind == bins.dtype.kind == "M":
457+
raise ValueError(
458+
"Cannot use timezone-naive bins with timezone-aware values, "
459+
"or vice-versa"
460+
) from err
461+
elif x_idx.dtype.kind == "M":
462+
raise ValueError("bins must be of datetime64 dtype") from err
463+
else:
464+
raise
465+
ids = ensure_platform_int(ids)
441466

442467
if include_lowest:
443-
ids[np.asarray(x) == bins[0]] = 1
468+
ids[x_idx == bins[0]] = 1
444469

445-
na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
470+
na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
446471
has_nas = na_mask.any()
447472

448473
if labels is not False:
@@ -454,7 +479,7 @@ def _bins_to_cuts(
454479

455480
if labels is None:
456481
labels = _format_labels(
457-
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
482+
bins, precision, right=right, include_lowest=include_lowest
458483
)
459484
elif ordered and len(set(labels)) != len(labels):
460485
raise ValueError(
@@ -513,90 +538,28 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
513538
x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
514539
x = Index(x_arr)
515540

516-
if dtype is not None:
517-
# GH 19768: force NaT to NaN during integer conversion
518-
x_arr = np.where(x.notna(), x.view(np.int64), np.nan)
519-
x = Index(x_arr)
520-
521-
return x, dtype
522-
523-
524-
def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None):
525-
"""
526-
if the passed bin is of datetime/timedelta type,
527-
this method converts it to integer
528-
529-
Parameters
530-
----------
531-
bins : list-like of bins
532-
dtype : dtype of data
533-
534-
Raises
535-
------
536-
ValueError if bins are not of a compat dtype to dtype
537-
"""
538-
bins_dtype = infer_dtype(bins, skipna=False)
539-
if lib.is_np_dtype(dtype, "m"):
540-
if bins_dtype in ["timedelta", "timedelta64"]:
541-
bins = to_timedelta(bins).view(np.int64)
542-
else:
543-
raise ValueError("bins must be of timedelta64 dtype")
544-
elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
545-
if bins_dtype in ["datetime", "datetime64"]:
546-
bins = to_datetime(bins)
547-
if lib.is_np_dtype(bins.dtype, "M"):
548-
# As of 2.0, to_datetime may give non-nano, so we need to convert
549-
# here until the rest of this file recognizes non-nano
550-
bins = bins.astype("datetime64[ns]", copy=False)
551-
bins = bins.view(np.int64)
552-
else:
553-
raise ValueError("bins must be of datetime64 dtype")
554-
555-
return bins
556-
557-
558-
def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
559-
"""
560-
Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
561-
datelike
562-
563-
Parameters
564-
----------
565-
bins : list-like of bins
566-
dtype : dtype of data
567-
568-
Returns
569-
-------
570-
bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
571-
datelike
572-
"""
573-
if isinstance(dtype, DatetimeTZDtype):
574-
bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
575-
elif lib.is_np_dtype(dtype, "mM"):
576-
bins = Index(bins.astype(np.int64), dtype=dtype)
577-
return bins
541+
return Index(x), dtype
578542

579543

580544
def _format_labels(
581545
bins: Index,
582546
precision: int,
583547
right: bool = True,
584548
include_lowest: bool = False,
585-
dtype: DtypeObj | None = None,
586549
):
587550
"""based on the dtype, return our labels"""
588551
closed: IntervalLeftRight = "right" if right else "left"
589552

590553
formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
591554

592-
if isinstance(dtype, DatetimeTZDtype):
593-
formatter = lambda x: Timestamp(x, tz=dtype.tz)
555+
if isinstance(bins.dtype, DatetimeTZDtype):
556+
formatter = lambda x: x
594557
adjust = lambda x: x - Timedelta("1ns")
595-
elif lib.is_np_dtype(dtype, "M"):
596-
formatter = Timestamp
558+
elif lib.is_np_dtype(bins.dtype, "M"):
559+
formatter = lambda x: x
597560
adjust = lambda x: x - Timedelta("1ns")
598-
elif lib.is_np_dtype(dtype, "m"):
599-
formatter = Timedelta
561+
elif lib.is_np_dtype(bins.dtype, "m"):
562+
formatter = lambda x: x
600563
adjust = lambda x: x - Timedelta("1ns")
601564
else:
602565
precision = _infer_precision(precision, bins)
@@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index:
628591
return Index(x)
629592

630593

631-
def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original):
594+
def _postprocess_for_cut(fac, bins, retbins: bool, original):
632595
"""
633596
handles post processing for the cut method where
634597
we combine the index information if the originally passed
@@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
640603
if not retbins:
641604
return fac
642605

643-
bins = _convert_bin_to_datelike_type(bins, dtype)
644606
if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
645607
bins = bins._values
646608

pandas/tests/indexes/interval/test_indexing.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -307,9 +307,9 @@ def test_get_indexer_datetime(self):
307307
result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str))
308308
tm.assert_numpy_array_equal(result, expected)
309309

310-
# TODO this should probably be deprecated?
311310
# https://github.com/pandas-dev/pandas/issues/47772
312311
result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8)
312+
expected = np.array([-1], dtype=np.intp)
313313
tm.assert_numpy_array_equal(result, expected)
314314

315315
@pytest.mark.parametrize(

pandas/tests/reshape/test_cut.py

+22-4
Original file line number | Diff line number | Diff line change
@@ -495,15 +495,33 @@ def test_datetime_cut(data):
495495
tm.assert_series_equal(Series(result), expected)
496496

497497

498-
@pytest.mark.parametrize(
499-
"bins",
500-
[
501-
3,
498+
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
499+
def test_datetime_tz_cut_mismatched_tzawareness(box):
500+
# GH#54964
501+
bins = box(
502502
[
503503
Timestamp("2013-01-01 04:57:07.200000"),
504504
Timestamp("2013-01-01 21:00:00"),
505505
Timestamp("2013-01-02 13:00:00"),
506506
Timestamp("2013-01-03 05:00:00"),
507+
]
508+
)
509+
ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
510+
511+
msg = "Cannot use timezone-naive bins with timezone-aware values"
512+
with pytest.raises(ValueError, match=msg):
513+
cut(ser, bins)
514+
515+
516+
@pytest.mark.parametrize(
517+
"bins",
518+
[
519+
3,
520+
[
521+
Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
522+
Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
523+
Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
524+
Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
507525
],
508526
],
509527
)

0 commit comments

Comments
 (0)