Skip to content

Commit a99c1ad

Browse files
authored
ENH: date_range support reso keyword (#49106)
* ENH: date_range support reso keyword * GH ref * pyright ignore * reso->unit * raise if endpoints cant cast losslessly * add assertions * mypy fixup * example with unit * typo fixup
1 parent 4f3c381 commit a99c1ad

File tree

8 files changed

+127
-8
lines changed

8 files changed

+127
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,10 @@ Other enhancements
6161
- :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
6262
- Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
6363
- :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
64+
- :func:`date_range` now supports a ``unit`` keyword ("s", "ms", "us", or "ns") to specify the desired resolution of the output index (:issue:`49106`)
6465
- :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
6566
- Added ``name`` parameter to :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_arrays` and :meth:`IntervalIndex.from_tuples` (:issue:`48911`)
67+
-
6668

6769
.. ---------------------------------------------------------------------------
6870
.. _whatsnew_200.notable_bug_fixes:

pandas/_libs/tslibs/dtypes.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
44

55

66
cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
7-
cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
7+
cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
88
cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
99
cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
1010
cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1

pandas/_libs/tslibs/dtypes.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def periods_per_second(reso: int) -> int: ...
1010
def is_supported_unit(reso: int) -> bool: ...
1111
def npy_unit_to_abbrev(reso: int) -> str: ...
1212
def get_supported_reso(reso: int) -> int: ...
13+
def abbrev_to_npy_unit(abbrev: str) -> int: ...
1314

1415
class PeriodDtypeBase:
1516
_dtype_code: int # PeriodDtypeCode

pandas/_libs/tslibs/dtypes.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
336336
raise NotImplementedError(unit)
337337

338338

339-
cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
339+
cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
340340
if abbrev == "Y":
341341
return NPY_DATETIMEUNIT.NPY_FR_Y
342342
elif abbrev == "M":

pandas/core/arrays/_ranges.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def generate_regular_range(
2222
end: Timestamp | Timedelta | None,
2323
periods: int | None,
2424
freq: BaseOffset,
25+
unit: str = "ns",
2526
) -> npt.NDArray[np.intp]:
2627
"""
2728
Generate a range of dates or timestamps with the spans between dates
@@ -37,14 +38,28 @@ def generate_regular_range(
3738
Number of periods in produced date range.
3839
freq : Tick
3940
Describes space between dates in produced date range.
41+
unit : str, default "ns"
42+
The resolution the output is meant to represent.
4043
4144
Returns
4245
-------
43-
ndarray[np.int64] Representing nanoseconds.
46+
ndarray[np.int64]
47+
Representing the given resolution.
4448
"""
4549
istart = start.value if start is not None else None
4650
iend = end.value if end is not None else None
47-
stride = freq.nanos
51+
freq.nanos # raises if non-fixed frequency
52+
td = Timedelta(freq)
53+
try:
54+
td = td.as_unit( # pyright: ignore[reportGeneralTypeIssues]
55+
unit, round_ok=False
56+
)
57+
except ValueError as err:
58+
raise ValueError(
59+
f"freq={freq} is incompatible with unit={unit}. "
60+
"Use a lower freq or a higher unit instead."
61+
) from err
62+
stride = int(td.value)
4863

4964
if periods is None and istart is not None and iend is not None:
5065
b = istart

pandas/core/arrays/datetimes.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
tz_convert_from_utc,
4343
tzconversion,
4444
)
45+
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
4546
from pandas._typing import (
4647
DateTimeErrorChoices,
4748
IntervalClosedType,
@@ -380,6 +381,8 @@ def _generate_range( # type: ignore[override]
380381
ambiguous: TimeAmbiguous = "raise",
381382
nonexistent: TimeNonexistent = "raise",
382383
inclusive: IntervalClosedType = "both",
384+
*,
385+
unit: str | None = None,
383386
) -> DatetimeArray:
384387

385388
periods = dtl.validate_periods(periods)
@@ -402,6 +405,17 @@ def _generate_range( # type: ignore[override]
402405
if start is NaT or end is NaT:
403406
raise ValueError("Neither `start` nor `end` can be NaT")
404407

408+
if unit is not None:
409+
if unit not in ["s", "ms", "us", "ns"]:
410+
raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'")
411+
else:
412+
unit = "ns"
413+
414+
if start is not None and unit is not None:
415+
start = start.as_unit(unit, round_ok=False)
416+
if end is not None and unit is not None:
417+
end = end.as_unit(unit, round_ok=False)
418+
405419
left_inclusive, right_inclusive = validate_inclusive(inclusive)
406420
start, end = _maybe_normalize_endpoints(start, end, normalize)
407421
tz = _infer_tz_from_endpoints(start, end, tz)
@@ -416,6 +430,7 @@ def _generate_range( # type: ignore[override]
416430
end = _maybe_localize_point(
417431
end, end_tz, end, freq, tz, ambiguous, nonexistent
418432
)
433+
419434
if freq is not None:
420435
# We break Day arithmetic (fixed 24 hour) here and opt for
421436
# Day to mean calendar day (23/24/25 hour). Therefore, strip
@@ -427,7 +442,7 @@ def _generate_range( # type: ignore[override]
427442
end = end.tz_localize(None)
428443

429444
if isinstance(freq, Tick):
430-
i8values = generate_regular_range(start, end, periods, freq)
445+
i8values = generate_regular_range(start, end, periods, freq, unit=unit)
431446
else:
432447
xdr = _generate_range(
433448
start=start, end=end, periods=periods, offset=freq
@@ -441,8 +456,13 @@ def _generate_range( # type: ignore[override]
441456
if not timezones.is_utc(tz):
442457
# short-circuit tz_localize_to_utc which would make
443458
# an unnecessary copy with UTC but be a no-op.
459+
creso = abbrev_to_npy_unit(unit)
444460
i8values = tzconversion.tz_localize_to_utc(
445-
i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent
461+
i8values,
462+
tz,
463+
ambiguous=ambiguous,
464+
nonexistent=nonexistent,
465+
creso=creso,
446466
)
447467

448468
# i8values is localized datetime64 array -> have to convert
@@ -477,8 +497,8 @@ def _generate_range( # type: ignore[override]
477497
if not right_inclusive and len(i8values) and i8values[-1] == end_i8:
478498
i8values = i8values[:-1]
479499

480-
dt64_values = i8values.view("datetime64[ns]")
481-
dtype = tz_to_dtype(tz)
500+
dt64_values = i8values.view(f"datetime64[{unit}]")
501+
dtype = tz_to_dtype(tz, unit=unit)
482502
return cls._simple_new(dt64_values, freq=freq, dtype=dtype)
483503

484504
# -----------------------------------------------------------------

pandas/core/indexes/datetimes.py

+15
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,8 @@ def date_range(
818818
normalize: bool = False,
819819
name: Hashable = None,
820820
inclusive: IntervalClosedType = "both",
821+
*,
822+
unit: str | None = None,
821823
**kwargs,
822824
) -> DatetimeIndex:
823825
"""
@@ -856,6 +858,10 @@ def date_range(
856858
Include boundaries; whether to set each bound as closed or open.
857859
858860
.. versionadded:: 1.4.0
861+
unit : str, default None
862+
Specify the desired resolution of the result. Must be one of 's', 'ms', 'us', or 'ns'; if None, nanosecond resolution ('ns') is used.
863+
864+
.. versionadded:: 2.0.0
859865
**kwargs
860866
For compatibility. Has no effect on the result.
861867
@@ -966,6 +972,14 @@ def date_range(
966972
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
967973
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
968974
dtype='datetime64[ns]', freq='D')
975+
976+
**Specify a unit**
977+
978+
>>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s")
979+
DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01',
980+
'2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01',
981+
'2817-01-01', '2917-01-01'],
982+
dtype='datetime64[s]', freq='100AS-JAN')
969983
"""
970984
if freq is None and com.any_none(periods, start, end):
971985
freq = "D"
@@ -978,6 +992,7 @@ def date_range(
978992
tz=tz,
979993
normalize=normalize,
980994
inclusive=inclusive,
995+
unit=unit,
981996
**kwargs,
982997
)
983998
return DatetimeIndex._simple_new(dtarr, name=name)

pandas/tests/indexes/datetimes/test_date_range.py

+66
Original file line numberDiff line numberDiff line change
@@ -1184,3 +1184,69 @@ def test_date_range_with_custom_holidays():
11841184
freq=freq,
11851185
)
11861186
tm.assert_index_equal(result, expected)
1187+
1188+
1189+
class TestDateRangeNonNano:
1190+
def test_date_range_reso_validation(self):
1191+
msg = "'unit' must be one of 's', 'ms', 'us', 'ns'"
1192+
with pytest.raises(ValueError, match=msg):
1193+
date_range("2016-01-01", "2016-03-04", periods=3, unit="h")
1194+
1195+
def test_date_range_freq_higher_than_reso(self):
1196+
# freq being higher-resolution than unit is a problem
1197+
msg = "Use a lower freq or a higher unit instead"
1198+
with pytest.raises(ValueError, match=msg):
1199+
# TODO: give a more useful or informative message?
1200+
date_range("2016-01-01", "2016-01-02", freq="ns", unit="ms")
1201+
1202+
def test_date_range_freq_matches_reso(self):
1203+
# GH#49106 matching reso is OK
1204+
dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="ms", unit="ms")
1205+
rng = np.arange(1_451_606_400_000, 1_451_606_401_001, dtype=np.int64)
1206+
expected = DatetimeIndex(rng.view("M8[ms]"), freq="ms")
1207+
tm.assert_index_equal(dti, expected)
1208+
1209+
dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="us", unit="us")
1210+
rng = np.arange(1_451_606_400_000_000, 1_451_606_401_000_001, dtype=np.int64)
1211+
expected = DatetimeIndex(rng.view("M8[us]"), freq="us")
1212+
tm.assert_index_equal(dti, expected)
1213+
1214+
dti = date_range("2016-01-01", "2016-01-01 00:00:00.001", freq="ns", unit="ns")
1215+
rng = np.arange(
1216+
1_451_606_400_000_000_000, 1_451_606_400_001_000_001, dtype=np.int64
1217+
)
1218+
expected = DatetimeIndex(rng.view("M8[ns]"), freq="ns")
1219+
tm.assert_index_equal(dti, expected)
1220+
1221+
def test_date_range_freq_lower_than_endpoints(self):
1222+
start = Timestamp("2022-10-19 11:50:44.719781")
1223+
end = Timestamp("2022-10-19 11:50:47.066458")
1224+
1225+
# start and end cannot be cast to "s" unit without lossy rounding,
1226+
# so we do not allow this in date_range
1227+
with pytest.raises(ValueError, match="Cannot losslessly convert units"):
1228+
date_range(start, end, periods=3, unit="s")
1229+
1230+
# but we can losslessly cast to "us"
1231+
dti = date_range(start, end, periods=2, unit="us")
1232+
rng = np.array(
1233+
[start.as_unit("us").value, end.as_unit("us").value], dtype=np.int64
1234+
)
1235+
expected = DatetimeIndex(rng.view("M8[us]"))
1236+
tm.assert_index_equal(dti, expected)
1237+
1238+
def test_date_range_non_nano(self):
1239+
start = np.datetime64("1066-10-14") # Battle of Hastings
1240+
end = np.datetime64("2305-07-13") # Jean-Luc Picard's birthday
1241+
1242+
dti = date_range(start, end, freq="D", unit="s")
1243+
assert dti.freq == "D"
1244+
assert dti.dtype == "M8[s]"
1245+
1246+
exp = np.arange(
1247+
start.astype("M8[s]").view("i8"),
1248+
(end + 1).astype("M8[s]").view("i8"),
1249+
24 * 3600,
1250+
).view("M8[s]")
1251+
1252+
tm.assert_numpy_array_equal(dti.to_numpy(), exp)

0 commit comments

Comments
 (0)