BUG: cut with non-nano (#56101)

jbrockmendel · web-flow · commit 65af77620857 · 2023-12-01T10:53:30.000-08:00
* BUG: IntervalIndex.factorize with non-nano

* GH ref

* BUG: cut with non-nano

* GH ref

* mypy fixup

* mypy fixup

* Update comment

* simplify
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -598,13 +598,15 @@ Styler
 Other
 ^^^^^
 - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`)
+- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
 - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
 - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
 - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
 - Bug in :meth:`Dataframe.from_dict` which would always sort the rows of the created :class:`DataFrame`.  (:issue:`55683`)
 - Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
 - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
 - Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`)
+-
 
 .. ***DO NOT USE THIS SECTION***
 
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -38,10 +38,9 @@
     Categorical,
     Index,
     IntervalIndex,
-    to_datetime,
-    to_timedelta,
 )
 import pandas.core.algorithms as algos
+from pandas.core.arrays.datetimelike import dtype_to_unit
 
 if TYPE_CHECKING:
     from pandas._typing import (
@@ -364,38 +363,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
     rng = (x_idx.min(), x_idx.max())
     mn, mx = rng
 
-    is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
-        x_idx.dtype, DatetimeTZDtype
-    )
-
     if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
         # GH#24314
         raise ValueError(
             "cannot specify integer `bins` when input data contains infinity"
         )
 
     if mn == mx:  # adjust end points before binning
-        if is_dt_or_td:
+        if _is_dt_or_td(x_idx.dtype):
             # using seconds=1 is pretty arbitrary here
-            td = Timedelta(seconds=1)
+            # error: Argument 1 to "dtype_to_unit" has incompatible type
+            # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
+            unit = dtype_to_unit(x_idx.dtype)  # type: ignore[arg-type]
+            td = Timedelta(seconds=1).as_unit(unit)
             # Use DatetimeArray/TimedeltaArray method instead of linspace
             # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
             # has no attribute "_generate_range"
             bins = x_idx._values._generate_range(  # type: ignore[union-attr]
-                start=mn - td, end=mx + td, periods=nbins + 1, freq=None
+                start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
             )
         else:
             mn -= 0.001 * abs(mn) if mn != 0 else 0.001
             mx += 0.001 * abs(mx) if mx != 0 else 0.001
 
             bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
     else:  # adjust end points after binning
-        if is_dt_or_td:
+        if _is_dt_or_td(x_idx.dtype):
             # Use DatetimeArray/TimedeltaArray method instead of linspace
+
+            # error: Argument 1 to "dtype_to_unit" has incompatible type
+            # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
+            unit = dtype_to_unit(x_idx.dtype)  # type: ignore[arg-type]
             # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
             # has no attribute "_generate_range"
             bins = x_idx._values._generate_range(  # type: ignore[union-attr]
-                start=mn, end=mx, periods=nbins + 1, freq=None
+                start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
             )
         else:
             bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
@@ -519,14 +521,8 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
     """
     dtype: DtypeObj | None = None
 
-    if isinstance(x.dtype, DatetimeTZDtype):
+    if _is_dt_or_td(x.dtype):
         dtype = x.dtype
-    elif lib.is_np_dtype(x.dtype, "M"):
-        x = to_datetime(x).astype("datetime64[ns]", copy=False)
-        dtype = np.dtype("datetime64[ns]")
-    elif lib.is_np_dtype(x.dtype, "m"):
-        x = to_timedelta(x)
-        dtype = np.dtype("timedelta64[ns]")
     elif is_bool_dtype(x.dtype):
         # GH 20303
         x = x.astype(np.int64)
@@ -541,6 +537,12 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
     return Index(x), dtype
 
 
+def _is_dt_or_td(dtype: DtypeObj) -> bool:
+    # Note: the dtype here comes from an Index.dtype, so we know that any
+    #  dt64/td64 dtype is of a supported unit.
+    return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")
+
+
 def _format_labels(
     bins: Index,
     precision: int,
@@ -552,15 +554,12 @@ def _format_labels(
 
     formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
 
-    if isinstance(bins.dtype, DatetimeTZDtype):
-        formatter = lambda x: x
-        adjust = lambda x: x - Timedelta("1ns")
-    elif lib.is_np_dtype(bins.dtype, "M"):
+    if _is_dt_or_td(bins.dtype):
+        # error: Argument 1 to "dtype_to_unit" has incompatible type
+        # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
+        unit = dtype_to_unit(bins.dtype)  # type: ignore[arg-type]
         formatter = lambda x: x
-        adjust = lambda x: x - Timedelta("1ns")
-    elif lib.is_np_dtype(bins.dtype, "m"):
-        formatter = lambda x: x
-        adjust = lambda x: x - Timedelta("1ns")
+        adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
     else:
         precision = _infer_precision(precision, bins)
         formatter = lambda x: _round_frac(x, precision)
@@ -571,6 +570,10 @@ def _format_labels(
         # adjust lhs of first interval by precision to account for being right closed
         breaks[0] = adjust(breaks[0])
 
+    if _is_dt_or_td(bins.dtype):
+        # error: "Index" has no attribute "as_unit"
+        breaks = type(bins)(breaks).as_unit(unit)  # type: ignore[attr-defined]
+
     return IntervalIndex.from_breaks(breaks, closed=closed)
 
 
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
@@ -452,46 +452,42 @@ def test_datetime_bin(conv):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize(
-    "data",
-    [
-        to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
-        [
-            np.datetime64("2013-01-01"),
-            np.datetime64("2013-01-02"),
-            np.datetime64("2013-01-03"),
-        ],
-        np.array(
-            [
-                np.datetime64("2013-01-01"),
-                np.datetime64("2013-01-02"),
-                np.datetime64("2013-01-03"),
-            ]
-        ),
-        DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]),
-    ],
-)
-def test_datetime_cut(data):
+@pytest.mark.parametrize("box", [Series, Index, np.array, list])
+def test_datetime_cut(unit, box):
     # see gh-14714
     #
     # Testing time data when it comes in various collection types.
+    data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
+    data = box(data)
     result, _ = cut(data, 3, retbins=True)
-    expected = Series(
-        IntervalIndex(
+
+    if box is list:
+        # We don't (yet) do inference on these, so get nanos
+        unit = "ns"
+
+    if unit == "s":
+        # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
+        # for why we round to 8 seconds instead of 7
+        left = DatetimeIndex(
+            ["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
+            dtype=f"M8[{unit}]",
+        )
+    else:
+        left = DatetimeIndex(
             [
-                Interval(
-                    Timestamp("2012-12-31 23:57:07.200000"),
-                    Timestamp("2013-01-01 16:00:00"),
-                ),
-                Interval(
-                    Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")
-                ),
-                Interval(
-                    Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00")
-                ),
-            ]
+                "2012-12-31 23:57:07.200000",
+                "2013-01-01 16:00:00",
+                "2013-01-02 08:00:00",
+            ],
+            dtype=f"M8[{unit}]",
         )
-    ).astype(CategoricalDtype(ordered=True))
+    right = DatetimeIndex(
+        ["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
+        dtype=f"M8[{unit}]",
+    )
+
+    exp_intervals = IntervalIndex.from_arrays(left, right)
+    expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
     tm.assert_series_equal(Series(result), expected)
 
 
@@ -576,17 +572,33 @@ def test_datetime_nan_mask():
 
 
 @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
-def test_datetime_cut_roundtrip(tz):
+def test_datetime_cut_roundtrip(tz, unit):
     # see gh-19891
-    ser = Series(date_range("20180101", periods=3, tz=tz))
+    ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
     result, result_bins = cut(ser, 2, retbins=True)
 
     expected = cut(ser, result_bins)
     tm.assert_series_equal(result, expected)
 
-    expected_bins = DatetimeIndex(
-        ["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"]
-    )
+    if unit == "s":
+        # TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
+        #  the first entry here raises in array_to_datetime. Should truncate
+        #  instead of raising?
+        # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
+        # for why we round to 8 seconds instead of 7
+        expected_bins = DatetimeIndex(
+            ["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
+            dtype=f"M8[{unit}]",
+        )
+    else:
+        expected_bins = DatetimeIndex(
+            [
+                "2017-12-31 23:57:07.200000",
+                "2018-01-02 00:00:00",
+                "2018-01-03 00:00:00",
+            ],
+            dtype=f"M8[{unit}]",
+        )
     expected_bins = expected_bins.tz_localize(tz)
     tm.assert_index_equal(result_bins, expected_bins)
 
@@ -759,7 +771,7 @@ def test_cut_bins_datetime_intervalindex():
     # https://github.com/pandas-dev/pandas/issues/46218
     bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
     # passing Series instead of list is important to trigger bug
-    result = cut(Series([Timestamp("2022-02-26")]), bins=bins)
+    result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
     expected = Categorical.from_codes([0], bins, ordered=True)
     tm.assert_categorical_equal(result.array, expected)
 
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
@@ -11,6 +11,7 @@
     IntervalIndex,
     NaT,
     Series,
+    Timedelta,
     TimedeltaIndex,
     Timestamp,
     cut,
@@ -22,10 +23,7 @@
 import pandas._testing as tm
 from pandas.api.types import CategoricalDtype
 
-from pandas.tseries.offsets import (
-    Day,
-    Nano,
-)
+from pandas.tseries.offsets import Day
 
 
 def test_qcut():
@@ -216,11 +214,14 @@ def test_single_quantile(data, start, end, length, labels):
     ],
     ids=lambda x: str(x.dtype),
 )
-def test_qcut_nat(ser):
+def test_qcut_nat(ser, unit):
     # see gh-19768
-    intervals = IntervalIndex.from_tuples(
-        [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])]
-    )
+    ser = ser.dt.as_unit(unit)
+    td = Timedelta(1, unit=unit).as_unit(unit)
+
+    left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
+    right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
+    intervals = IntervalIndex.from_arrays(left, right)
     expected = Series(Categorical(intervals, ordered=True))
 
     result = qcut(ser, 2)