BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes (#54964)

jbrockmendel · web-flow · commit 28f274cb107e · 2023-09-20T10:04:23.000-07:00
* REF: separate out _nbins_to_bins

* Cast x to Index early

* BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes

* GH ref
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -288,6 +288,8 @@ Strings
 Interval
 ^^^^^^^^
 - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
+- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
+- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
 -
 
 Indexing
@@ -349,6 +351,7 @@ Styler
 
 Other
 ^^^^^
+- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
 
 .. ***DO NOT USE THIS SECTION***
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3938,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool:
         if isinstance(self.dtype, IntervalDtype):
             if isinstance(target.dtype, IntervalDtype):
                 return False
-            # See https://github.com/pandas-dev/pandas/issues/47772 the commented
-            # out code can be restored (instead of hardcoding `return True`)
-            # once that issue is fixed
             # "Index" has no attribute "left"
-            # return self.left._should_compare(target)  # type: ignore[attr-defined]
-            return True
+            return self.left._should_compare(target)  # type: ignore[attr-defined]
         return False
 
     @final
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -17,10 +17,8 @@
     Timestamp,
     lib,
 )
-from pandas._libs.lib import infer_dtype
 
 from pandas.core.dtypes.common import (
-    DT64NS_DTYPE,
     ensure_platform_int,
     is_bool_dtype,
     is_integer,
@@ -243,7 +241,7 @@ def cut(
 
     original = x
     x_idx = _preprocess_for_cut(x)
-    x_idx, dtype = _coerce_to_type(x_idx)
+    x_idx, _ = _coerce_to_type(x_idx)
 
     if not np.iterable(bins):
         bins = _nbins_to_bins(x_idx, bins, right)
@@ -254,16 +252,8 @@ def cut(
 
     else:
         bins = Index(bins)
-        if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype):
-            bins = np.asarray(bins, dtype=DT64NS_DTYPE)
-        else:
-            bins = np.asarray(bins)
-        bins = _convert_bin_to_numeric_type(bins, dtype)
-
-        # GH 26045: cast to float64 to avoid an overflow
-        if (np.diff(bins.astype("float64")) < 0).any():
+        if not bins.is_monotonic_increasing:
             raise ValueError("bins must increase monotonically.")
-        bins = Index(bins)
 
     fac, bins = _bins_to_cuts(
         x_idx,
@@ -272,12 +262,11 @@ def cut(
         labels=labels,
         precision=precision,
         include_lowest=include_lowest,
-        dtype=dtype,
         duplicates=duplicates,
         ordered=ordered,
     )
 
-    return _postprocess_for_cut(fac, bins, retbins, dtype, original)
+    return _postprocess_for_cut(fac, bins, retbins, original)
 
 
 def qcut(
@@ -343,25 +332,22 @@ def qcut(
     """
     original = x
     x_idx = _preprocess_for_cut(x)
-    x_idx, dtype = _coerce_to_type(x_idx)
+    x_idx, _ = _coerce_to_type(x_idx)
 
     quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
 
-    x_np = np.asarray(x_idx)
-    x_np = x_np[~np.isnan(x_np)]
-    bins = np.quantile(x_np, quantiles)
+    bins = x_idx.to_series().dropna().quantile(quantiles)
 
     fac, bins = _bins_to_cuts(
         x_idx,
         Index(bins),
         labels=labels,
         precision=precision,
         include_lowest=True,
-        dtype=dtype,
         duplicates=duplicates,
     )
 
-    return _postprocess_for_cut(fac, bins, retbins, dtype, original)
+    return _postprocess_for_cut(fac, bins, retbins, original)
 
 
 def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
@@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
     rng = (x_idx.min(), x_idx.max())
     mn, mx = rng
 
-    if np.isinf(mn) or np.isinf(mx):
+    is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
+        x_idx.dtype, DatetimeTZDtype
+    )
+
+    if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
         # GH#24314
         raise ValueError(
             "cannot specify integer `bins` when input data contains infinity"
         )
 
     if mn == mx:  # adjust end points before binning
-        mn -= 0.001 * abs(mn) if mn != 0 else 0.001
-        mx += 0.001 * abs(mx) if mx != 0 else 0.001
-        bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
+        if is_dt_or_td:
+            # using seconds=1 is pretty arbitrary here
+            td = Timedelta(seconds=1)
+            # Use DatetimeArray/TimedeltaArray method instead of linspace
+            # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
+            # has no attribute "_generate_range"
+            bins = x_idx._values._generate_range(  # type: ignore[union-attr]
+                start=mn - td, end=mx + td, periods=nbins + 1, freq=None
+            )
+        else:
+            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
+            mx += 0.001 * abs(mx) if mx != 0 else 0.001
+
+            bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
     else:  # adjust end points after binning
-        bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
+        if is_dt_or_td:
+            # Use DatetimeArray/TimedeltaArray method instead of linspace
+            # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
+            # has no attribute "_generate_range"
+            bins = x_idx._values._generate_range(  # type: ignore[union-attr]
+                start=mn, end=mx, periods=nbins + 1, freq=None
+            )
+        else:
+            bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
         adj = (mx - mn) * 0.001  # 0.1% of the range
         if right:
             bins[0] -= adj
@@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
 
 
 def _bins_to_cuts(
-    x: Index,
+    x_idx: Index,
     bins: Index,
     right: bool = True,
     labels=None,
     precision: int = 3,
     include_lowest: bool = False,
-    dtype: DtypeObj | None = None,
     duplicates: str = "raise",
     ordered: bool = True,
 ):
@@ -422,7 +430,7 @@ def _bins_to_cuts(
 
     if isinstance(bins, IntervalIndex):
         # we have a fast-path here
-        ids = bins.get_indexer(x)
+        ids = bins.get_indexer(x_idx)
         cat_dtype = CategoricalDtype(bins, ordered=True)
         result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
         return result, bins
@@ -437,12 +445,29 @@ def _bins_to_cuts(
         bins = unique_bins
 
     side: Literal["left", "right"] = "left" if right else "right"
-    ids = ensure_platform_int(bins.searchsorted(x, side=side))
+
+    try:
+        ids = bins.searchsorted(x_idx, side=side)
+    except TypeError as err:
+        # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
+        #  is integers
+        if x_idx.dtype.kind == "m":
+            raise ValueError("bins must be of timedelta64 dtype") from err
+        elif x_idx.dtype.kind == bins.dtype.kind == "M":
+            raise ValueError(
+                "Cannot use timezone-naive bins with timezone-aware values, "
+                "or vice-versa"
+            ) from err
+        elif x_idx.dtype.kind == "M":
+            raise ValueError("bins must be of datetime64 dtype") from err
+        else:
+            raise
+    ids = ensure_platform_int(ids)
 
     if include_lowest:
-        ids[np.asarray(x) == bins[0]] = 1
+        ids[x_idx == bins[0]] = 1
 
-    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
+    na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0)
     has_nas = na_mask.any()
 
     if labels is not False:
@@ -454,7 +479,7 @@ def _bins_to_cuts(
 
         if labels is None:
             labels = _format_labels(
-                bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
+                bins, precision, right=right, include_lowest=include_lowest
             )
         elif ordered and len(set(labels)) != len(labels):
             raise ValueError(
@@ -513,90 +538,28 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
         x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
         x = Index(x_arr)
 
-    if dtype is not None:
-        # GH 19768: force NaT to NaN during integer conversion
-        x_arr = np.where(x.notna(), x.view(np.int64), np.nan)
-        x = Index(x_arr)
-
-    return x, dtype
-
-
-def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None):
-    """
-    if the passed bin is of datetime/timedelta type,
-    this method converts it to integer
-
-    Parameters
-    ----------
-    bins : list-like of bins
-    dtype : dtype of data
-
-    Raises
-    ------
-    ValueError if bins are not of a compat dtype to dtype
-    """
-    bins_dtype = infer_dtype(bins, skipna=False)
-    if lib.is_np_dtype(dtype, "m"):
-        if bins_dtype in ["timedelta", "timedelta64"]:
-            bins = to_timedelta(bins).view(np.int64)
-        else:
-            raise ValueError("bins must be of timedelta64 dtype")
-    elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
-        if bins_dtype in ["datetime", "datetime64"]:
-            bins = to_datetime(bins)
-            if lib.is_np_dtype(bins.dtype, "M"):
-                # As of 2.0, to_datetime may give non-nano, so we need to convert
-                #  here until the rest of this file recognizes non-nano
-                bins = bins.astype("datetime64[ns]", copy=False)
-            bins = bins.view(np.int64)
-        else:
-            raise ValueError("bins must be of datetime64 dtype")
-
-    return bins
-
-
-def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
-    """
-    Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
-    datelike
-
-    Parameters
-    ----------
-    bins : list-like of bins
-    dtype : dtype of data
-
-    Returns
-    -------
-    bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
-           datelike
-    """
-    if isinstance(dtype, DatetimeTZDtype):
-        bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
-    elif lib.is_np_dtype(dtype, "mM"):
-        bins = Index(bins.astype(np.int64), dtype=dtype)
-    return bins
+    return Index(x), dtype
 
 
 def _format_labels(
     bins: Index,
     precision: int,
     right: bool = True,
     include_lowest: bool = False,
-    dtype: DtypeObj | None = None,
 ):
     """based on the dtype, return our labels"""
     closed: IntervalLeftRight = "right" if right else "left"
 
     formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
 
-    if isinstance(dtype, DatetimeTZDtype):
-        formatter = lambda x: Timestamp(x, tz=dtype.tz)
+    if isinstance(bins.dtype, DatetimeTZDtype):
+        formatter = lambda x: x
         adjust = lambda x: x - Timedelta("1ns")
-    elif lib.is_np_dtype(dtype, "M"):
-        formatter = Timestamp
+    elif lib.is_np_dtype(bins.dtype, "M"):
+        formatter = lambda x: x
         adjust = lambda x: x - Timedelta("1ns")
-    elif lib.is_np_dtype(dtype, "m"):
-        formatter = Timedelta
+    elif lib.is_np_dtype(bins.dtype, "m"):
+        formatter = lambda x: x
         adjust = lambda x: x - Timedelta("1ns")
     else:
         precision = _infer_precision(precision, bins)
@@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index:
     return Index(x)
 
 
-def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original):
+def _postprocess_for_cut(fac, bins, retbins: bool, original):
     """
     handles post processing for the cut method where
     we combine the index information if the originally passed
@@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
     if not retbins:
         return fac
 
-    bins = _convert_bin_to_datelike_type(bins, dtype)
     if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
         bins = bins._values
 
diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
@@ -307,9 +307,9 @@ def test_get_indexer_datetime(self):
         result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str))
         tm.assert_numpy_array_equal(result, expected)
 
-        # TODO this should probably be deprecated?
         # https://github.com/pandas-dev/pandas/issues/47772
         result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8)
+        expected = np.array([-1], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
@@ -495,15 +495,33 @@ def test_datetime_cut(data):
     tm.assert_series_equal(Series(result), expected)
 
 
-@pytest.mark.parametrize(
-    "bins",
-    [
-        3,
+@pytest.mark.parametrize("box", [list, np.array, Index, Series])
+def test_datetime_tz_cut_mismatched_tzawareness(box):
+    # GH#54964
+    bins = box(
         [
             Timestamp("2013-01-01 04:57:07.200000"),
             Timestamp("2013-01-01 21:00:00"),
             Timestamp("2013-01-02 13:00:00"),
             Timestamp("2013-01-03 05:00:00"),
+        ]
+    )
+    ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
+
+    msg = "Cannot use timezone-naive bins with timezone-aware values"
+    with pytest.raises(ValueError, match=msg):
+        cut(ser, bins)
+
+
+@pytest.mark.parametrize(
+    "bins",
+    [
+        3,
+        [
+            Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
+            Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
+            Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
+            Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
         ],
     ],
 )