Skip to content

Commit 65af776

Browse files
authored
BUG: cut with non-nano (#56101)
* BUG: IntervalIndex.factorize with non-nano * GH ref * BUG: cut with non-nano * GH ref * mypy fixup * mypy fixup * Update comment * simplify
1 parent 147d68a commit 65af776

File tree

4 files changed

+92
-74
lines changed

4 files changed

+92
-74
lines changed

doc/source/whatsnew/v2.2.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -598,13 +598,15 @@ Styler
598598
Other
599599
^^^^^
600600
- Bug in :meth:`DataFrame.describe` where the percentile 99.999% was rounded to 100% when formatting percentiles in the result (:issue:`55765`)
601+
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
601602
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
602603
- Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
603604
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
604605
- Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame` (:issue:`55683`)
605606
- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
606607
- Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
607608
- Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`)
609+
-
608610

609611
.. ***DO NOT USE THIS SECTION***
610612

pandas/core/reshape/tile.py

+29-26
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@
3838
Categorical,
3939
Index,
4040
IntervalIndex,
41-
to_datetime,
42-
to_timedelta,
4341
)
4442
import pandas.core.algorithms as algos
43+
from pandas.core.arrays.datetimelike import dtype_to_unit
4544

4645
if TYPE_CHECKING:
4746
from pandas._typing import (
@@ -364,38 +363,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
364363
rng = (x_idx.min(), x_idx.max())
365364
mn, mx = rng
366365

367-
is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
368-
x_idx.dtype, DatetimeTZDtype
369-
)
370-
371366
if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
372367
# GH#24314
373368
raise ValueError(
374369
"cannot specify integer `bins` when input data contains infinity"
375370
)
376371

377372
if mn == mx: # adjust end points before binning
378-
if is_dt_or_td:
373+
if _is_dt_or_td(x_idx.dtype):
379374
# using seconds=1 is pretty arbitrary here
380-
td = Timedelta(seconds=1)
375+
# error: Argument 1 to "dtype_to_unit" has incompatible type
376+
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
377+
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
378+
td = Timedelta(seconds=1).as_unit(unit)
381379
# Use DatetimeArray/TimedeltaArray method instead of linspace
382380
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
383381
# has no attribute "_generate_range"
384382
bins = x_idx._values._generate_range( # type: ignore[union-attr]
385-
start=mn - td, end=mx + td, periods=nbins + 1, freq=None
383+
start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
386384
)
387385
else:
388386
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
389387
mx += 0.001 * abs(mx) if mx != 0 else 0.001
390388

391389
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
392390
else: # adjust end points after binning
393-
if is_dt_or_td:
391+
if _is_dt_or_td(x_idx.dtype):
394392
# Use DatetimeArray/TimedeltaArray method instead of linspace
393+
394+
# error: Argument 1 to "dtype_to_unit" has incompatible type
395+
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
396+
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
395397
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
396398
# has no attribute "_generate_range"
397399
bins = x_idx._values._generate_range( # type: ignore[union-attr]
398-
start=mn, end=mx, periods=nbins + 1, freq=None
400+
start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
399401
)
400402
else:
401403
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
@@ -519,14 +521,8 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
519521
"""
520522
dtype: DtypeObj | None = None
521523

522-
if isinstance(x.dtype, DatetimeTZDtype):
524+
if _is_dt_or_td(x.dtype):
523525
dtype = x.dtype
524-
elif lib.is_np_dtype(x.dtype, "M"):
525-
x = to_datetime(x).astype("datetime64[ns]", copy=False)
526-
dtype = np.dtype("datetime64[ns]")
527-
elif lib.is_np_dtype(x.dtype, "m"):
528-
x = to_timedelta(x)
529-
dtype = np.dtype("timedelta64[ns]")
530526
elif is_bool_dtype(x.dtype):
531527
# GH 20303
532528
x = x.astype(np.int64)
@@ -541,6 +537,12 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
541537
return Index(x), dtype
542538

543539

540+
def _is_dt_or_td(dtype: DtypeObj) -> bool:
541+
# Note: the dtype here comes from an Index.dtype, so we know that any
542+
# dt64/td64 dtype is of a supported unit.
543+
return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")
544+
545+
544546
def _format_labels(
545547
bins: Index,
546548
precision: int,
@@ -552,15 +554,12 @@ def _format_labels(
552554

553555
formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
554556

555-
if isinstance(bins.dtype, DatetimeTZDtype):
556-
formatter = lambda x: x
557-
adjust = lambda x: x - Timedelta("1ns")
558-
elif lib.is_np_dtype(bins.dtype, "M"):
557+
if _is_dt_or_td(bins.dtype):
558+
# error: Argument 1 to "dtype_to_unit" has incompatible type
559+
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
560+
unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type]
559561
formatter = lambda x: x
560-
adjust = lambda x: x - Timedelta("1ns")
561-
elif lib.is_np_dtype(bins.dtype, "m"):
562-
formatter = lambda x: x
563-
adjust = lambda x: x - Timedelta("1ns")
562+
adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
564563
else:
565564
precision = _infer_precision(precision, bins)
566565
formatter = lambda x: _round_frac(x, precision)
@@ -571,6 +570,10 @@ def _format_labels(
571570
# adjust lhs of first interval by precision to account for being right closed
572571
breaks[0] = adjust(breaks[0])
573572

573+
if _is_dt_or_td(bins.dtype):
574+
# error: "Index" has no attribute "as_unit"
575+
breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined]
576+
574577
return IntervalIndex.from_breaks(breaks, closed=closed)
575578

576579

pandas/tests/reshape/test_cut.py

+52-40
Original file line numberDiff line numberDiff line change
@@ -452,46 +452,42 @@ def test_datetime_bin(conv):
452452
tm.assert_series_equal(result, expected)
453453

454454

455-
@pytest.mark.parametrize(
456-
"data",
457-
[
458-
to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
459-
[
460-
np.datetime64("2013-01-01"),
461-
np.datetime64("2013-01-02"),
462-
np.datetime64("2013-01-03"),
463-
],
464-
np.array(
465-
[
466-
np.datetime64("2013-01-01"),
467-
np.datetime64("2013-01-02"),
468-
np.datetime64("2013-01-03"),
469-
]
470-
),
471-
DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]),
472-
],
473-
)
474-
def test_datetime_cut(data):
455+
@pytest.mark.parametrize("box", [Series, Index, np.array, list])
456+
def test_datetime_cut(unit, box):
475457
# see gh-14714
476458
#
477459
# Testing time data when it comes in various collection types.
460+
data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
461+
data = box(data)
478462
result, _ = cut(data, 3, retbins=True)
479-
expected = Series(
480-
IntervalIndex(
463+
464+
if box is list:
465+
# We don't (yet) do inference on these, so get nanos
466+
unit = "ns"
467+
468+
if unit == "s":
469+
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
470+
# for why we round to 8 seconds instead of 7
471+
left = DatetimeIndex(
472+
["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
473+
dtype=f"M8[{unit}]",
474+
)
475+
else:
476+
left = DatetimeIndex(
481477
[
482-
Interval(
483-
Timestamp("2012-12-31 23:57:07.200000"),
484-
Timestamp("2013-01-01 16:00:00"),
485-
),
486-
Interval(
487-
Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")
488-
),
489-
Interval(
490-
Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00")
491-
),
492-
]
478+
"2012-12-31 23:57:07.200000",
479+
"2013-01-01 16:00:00",
480+
"2013-01-02 08:00:00",
481+
],
482+
dtype=f"M8[{unit}]",
493483
)
494-
).astype(CategoricalDtype(ordered=True))
484+
right = DatetimeIndex(
485+
["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
486+
dtype=f"M8[{unit}]",
487+
)
488+
489+
exp_intervals = IntervalIndex.from_arrays(left, right)
490+
expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
495491
tm.assert_series_equal(Series(result), expected)
496492

497493

@@ -576,17 +572,33 @@ def test_datetime_nan_mask():
576572

577573

578574
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
579-
def test_datetime_cut_roundtrip(tz):
575+
def test_datetime_cut_roundtrip(tz, unit):
580576
# see gh-19891
581-
ser = Series(date_range("20180101", periods=3, tz=tz))
577+
ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
582578
result, result_bins = cut(ser, 2, retbins=True)
583579

584580
expected = cut(ser, result_bins)
585581
tm.assert_series_equal(result, expected)
586582

587-
expected_bins = DatetimeIndex(
588-
["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"]
589-
)
583+
if unit == "s":
584+
# TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
585+
# the first entry here raises in array_to_datetime. Should truncate
586+
# instead of raising?
587+
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
588+
# for why we round to 8 seconds instead of 7
589+
expected_bins = DatetimeIndex(
590+
["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
591+
dtype=f"M8[{unit}]",
592+
)
593+
else:
594+
expected_bins = DatetimeIndex(
595+
[
596+
"2017-12-31 23:57:07.200000",
597+
"2018-01-02 00:00:00",
598+
"2018-01-03 00:00:00",
599+
],
600+
dtype=f"M8[{unit}]",
601+
)
590602
expected_bins = expected_bins.tz_localize(tz)
591603
tm.assert_index_equal(result_bins, expected_bins)
592604

@@ -759,7 +771,7 @@ def test_cut_bins_datetime_intervalindex():
759771
# https://github.com/pandas-dev/pandas/issues/46218
760772
bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
761773
# passing Series instead of list is important to trigger bug
762-
result = cut(Series([Timestamp("2022-02-26")]), bins=bins)
774+
result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
763775
expected = Categorical.from_codes([0], bins, ordered=True)
764776
tm.assert_categorical_equal(result.array, expected)
765777

pandas/tests/reshape/test_qcut.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
IntervalIndex,
1212
NaT,
1313
Series,
14+
Timedelta,
1415
TimedeltaIndex,
1516
Timestamp,
1617
cut,
@@ -22,10 +23,7 @@
2223
import pandas._testing as tm
2324
from pandas.api.types import CategoricalDtype
2425

25-
from pandas.tseries.offsets import (
26-
Day,
27-
Nano,
28-
)
26+
from pandas.tseries.offsets import Day
2927

3028

3129
def test_qcut():
@@ -216,11 +214,14 @@ def test_single_quantile(data, start, end, length, labels):
216214
],
217215
ids=lambda x: str(x.dtype),
218216
)
219-
def test_qcut_nat(ser):
217+
def test_qcut_nat(ser, unit):
220218
# see gh-19768
221-
intervals = IntervalIndex.from_tuples(
222-
[(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])]
223-
)
219+
ser = ser.dt.as_unit(unit)
220+
td = Timedelta(1, unit=unit).as_unit(unit)
221+
222+
left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
223+
right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
224+
intervals = IntervalIndex.from_arrays(left, right)
224225
expected = Series(Categorical(intervals, ordered=True))
225226

226227
result = qcut(ser, 2)

0 commit comments

Comments
 (0)