Skip to content

Commit cf0056d

Browse files
authored
API: retain non-nano timedelta64 dtype in DataFrame/Series/Index constructors (#49014)
* API: retain non-nano timedelta64 dtype in constructors * GH ref * troubleshoot CI * fix doctest * xfail on ArrayManager
1 parent a015631 commit cf0056d

File tree

17 files changed

+161
-59
lines changed

17 files changed

+161
-59
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ Other API changes
121121
- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`)
122122
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
123123
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
124+
- Passing data with dtype of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; timedelta64 data with lower resolution will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`)
125+
- Passing ``dtype`` of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; passing a dtype with lower resolution to :class:`Series` or :class:`DataFrame` will result in the data being cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`)
124126
- Passing a ``np.datetime64`` object with non-nanosecond resolution to :class:`Timestamp` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49008`)
125127
-
126128

pandas/_libs/tslibs/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,17 @@
3131
"periods_per_day",
3232
"periods_per_second",
3333
"is_supported_unit",
34+
"npy_unit_to_abbrev",
35+
"get_supported_reso",
3436
]
3537

3638
from pandas._libs.tslibs import dtypes
3739
from pandas._libs.tslibs.conversion import localize_pydatetime
3840
from pandas._libs.tslibs.dtypes import (
3941
Resolution,
42+
get_supported_reso,
4043
is_supported_unit,
44+
npy_unit_to_abbrev,
4145
periods_per_day,
4246
periods_per_second,
4347
)

pandas/core/arrays/timedeltas.py

+27-11
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@
2020
Tick,
2121
Timedelta,
2222
astype_overflowsafe,
23+
get_supported_reso,
2324
get_unit_from_dtype,
2425
iNaT,
2526
is_supported_unit,
27+
npy_unit_to_abbrev,
2628
periods_per_second,
2729
to_offset,
2830
)
@@ -197,28 +199,29 @@ def _simple_new( # type: ignore[override]
197199
return result
198200

199201
@classmethod
200-
def _from_sequence(
201-
cls, data, *, dtype=TD64NS_DTYPE, copy: bool = False
202-
) -> TimedeltaArray:
202+
def _from_sequence(cls, data, *, dtype=None, copy: bool = False) -> TimedeltaArray:
203203
if dtype:
204-
_validate_td64_dtype(dtype)
204+
dtype = _validate_td64_dtype(dtype)
205205

206206
data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None)
207207
freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False)
208208

209+
if dtype is not None:
210+
data = astype_overflowsafe(data, dtype=dtype, copy=False)
211+
209212
return cls._simple_new(data, dtype=data.dtype, freq=freq)
210213

211214
@classmethod
212215
def _from_sequence_not_strict(
213216
cls,
214217
data,
215-
dtype=TD64NS_DTYPE,
218+
dtype=None,
216219
copy: bool = False,
217220
freq=lib.no_default,
218221
unit=None,
219222
) -> TimedeltaArray:
220223
if dtype:
221-
_validate_td64_dtype(dtype)
224+
dtype = _validate_td64_dtype(dtype)
222225

223226
assert unit not in ["Y", "y", "M"] # caller is responsible for checking
224227

@@ -232,6 +235,9 @@ def _from_sequence_not_strict(
232235
if explicit_none:
233236
freq = None
234237

238+
if dtype is not None:
239+
data = astype_overflowsafe(data, dtype=dtype, copy=False)
240+
235241
result = cls._simple_new(data, dtype=data.dtype, freq=freq)
236242

237243
if inferred_freq is None and freq is not None:
@@ -944,9 +950,13 @@ def sequence_to_td64ns(
944950
copy = False
945951

946952
elif is_timedelta64_dtype(data.dtype):
947-
if data.dtype != TD64NS_DTYPE:
948-
# non-nano unit
949-
data = astype_overflowsafe(data, dtype=TD64NS_DTYPE)
953+
data_unit = get_unit_from_dtype(data.dtype)
954+
if not is_supported_unit(data_unit):
955+
# cast to closest supported unit, i.e. s or ns
956+
new_reso = get_supported_reso(data_unit)
957+
new_unit = npy_unit_to_abbrev(new_reso)
958+
new_dtype = np.dtype(f"m8[{new_unit}]")
959+
data = astype_overflowsafe(data, dtype=new_dtype, copy=False)
950960
copy = False
951961

952962
else:
@@ -955,7 +965,9 @@ def sequence_to_td64ns(
955965

956966
data = np.array(data, copy=copy)
957967

958-
assert data.dtype == "m8[ns]", data
968+
assert data.dtype.kind == "m"
969+
assert data.dtype != "m8" # i.e. not unit-less
970+
959971
return data, inferred_freq
960972

961973

@@ -1045,7 +1057,11 @@ def _validate_td64_dtype(dtype) -> DtypeObj:
10451057
)
10461058
raise ValueError(msg)
10471059

1048-
if not is_dtype_equal(dtype, TD64NS_DTYPE):
1060+
if (
1061+
not isinstance(dtype, np.dtype)
1062+
or dtype.kind != "m"
1063+
or not is_supported_unit(get_unit_from_dtype(dtype))
1064+
):
10491065
raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]")
10501066

10511067
return dtype

pandas/core/dtypes/cast.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@
3131
Timedelta,
3232
Timestamp,
3333
astype_overflowsafe,
34+
get_supported_reso,
35+
get_unit_from_dtype,
36+
is_supported_unit,
37+
npy_unit_to_abbrev,
3438
)
3539
from pandas._libs.tslibs.timedeltas import array_to_timedelta64
3640
from pandas._typing import (
@@ -1456,8 +1460,11 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
14561460
"""
14571461
Convert dtypes with granularity less than nanosecond to nanosecond
14581462
1459-
>>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
1460-
dtype('<M8[ns]')
1463+
>>> _ensure_nanosecond_dtype(np.dtype("M8[D]"))
1464+
dtype('<M8[s]')
1465+
1466+
>>> _ensure_nanosecond_dtype(np.dtype("M8[us]"))
1467+
dtype('<M8[us]')
14611468
14621469
>>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
14631470
Traceback (most recent call last):
@@ -1476,13 +1483,15 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
14761483
# i.e. datetime64tz
14771484
pass
14781485

1479-
elif dtype.kind == "M" and dtype != DT64NS_DTYPE:
1486+
elif dtype.kind == "M" and not is_supported_unit(get_unit_from_dtype(dtype)):
14801487
# pandas supports dtype whose granularity is less than [ns]
14811488
# e.g., [ps], [fs], [as]
14821489
if dtype <= np.dtype("M8[ns]"):
14831490
if dtype.name == "datetime64":
14841491
raise ValueError(msg)
1485-
dtype = DT64NS_DTYPE
1492+
reso = get_supported_reso(get_unit_from_dtype(dtype))
1493+
unit = npy_unit_to_abbrev(reso)
1494+
dtype = np.dtype(f"M8[{unit}]")
14861495
else:
14871496
raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]")
14881497

@@ -1492,7 +1501,9 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
14921501
if dtype <= np.dtype("m8[ns]"):
14931502
if dtype.name == "timedelta64":
14941503
raise ValueError(msg)
1495-
dtype = TD64NS_DTYPE
1504+
reso = get_supported_reso(get_unit_from_dtype(dtype))
1505+
unit = npy_unit_to_abbrev(reso)
1506+
dtype = np.dtype(f"m8[{unit}]")
14961507
else:
14971508
raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")
14981509
return dtype

pandas/core/indexes/timedeltas.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from pandas._typing import DtypeObj
1313

1414
from pandas.core.dtypes.common import (
15-
TD64NS_DTYPE,
1615
is_scalar,
1716
is_timedelta64_dtype,
1817
)
@@ -121,7 +120,7 @@ def __new__(
121120
unit=None,
122121
freq=lib.no_default,
123122
closed=None,
124-
dtype=TD64NS_DTYPE,
123+
dtype=None,
125124
copy: bool = False,
126125
name=None,
127126
):

pandas/tests/arithmetic/test_numeric.py

+10-7
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,11 @@ def test_mul_td64arr(self, left, box_cls):
152152
right = np.array([1, 2, 3], dtype="m8[s]")
153153
right = box_cls(right)
154154

155-
expected = TimedeltaIndex(["10s", "40s", "90s"])
155+
expected = TimedeltaIndex(["10s", "40s", "90s"], dtype=right.dtype)
156+
156157
if isinstance(left, Series) or box_cls is Series:
157158
expected = Series(expected)
159+
assert expected.dtype == right.dtype
158160

159161
result = left * right
160162
tm.assert_equal(result, expected)
@@ -171,9 +173,10 @@ def test_div_td64arr(self, left, box_cls):
171173
right = np.array([10, 40, 90], dtype="m8[s]")
172174
right = box_cls(right)
173175

174-
expected = TimedeltaIndex(["1s", "2s", "3s"])
176+
expected = TimedeltaIndex(["1s", "2s", "3s"], dtype=right.dtype)
175177
if isinstance(left, Series) or box_cls is Series:
176178
expected = Series(expected)
179+
assert expected.dtype == right.dtype
177180

178181
result = right / left
179182
tm.assert_equal(result, expected)
@@ -206,12 +209,12 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array):
206209
box = box_with_array
207210
index = numeric_idx
208211
expected = TimedeltaIndex([Timedelta(days=n) for n in range(len(index))])
209-
if isinstance(scalar_td, np.timedelta64) and box not in [Index, Series]:
212+
if isinstance(scalar_td, np.timedelta64):
210213
# TODO(2.0): once TDA.astype converts to m8, just do expected.astype
211214
tda = expected._data
212215
dtype = scalar_td.dtype
213216
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
214-
elif type(scalar_td) is timedelta and box not in [Index, Series]:
217+
elif type(scalar_td) is timedelta:
215218
# TODO(2.0): once TDA.astype converts to m8, just do expected.astype
216219
tda = expected._data
217220
dtype = np.dtype("m8[us]")
@@ -247,7 +250,7 @@ def test_numeric_arr_mul_tdscalar_numexpr_path(
247250
obj = tm.box_expected(arr, box, transpose=False)
248251

249252
expected = arr_i8.view("timedelta64[D]").astype("timedelta64[ns]")
250-
if type(scalar_td) is timedelta and box is array:
253+
if type(scalar_td) is timedelta:
251254
# TODO(2.0): this shouldn't depend on 'box'
252255
expected = expected.astype("timedelta64[us]")
253256
# TODO(2.0): won't be necessary to construct TimedeltaArray
@@ -268,15 +271,15 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array
268271
index = numeric_idx[1:3]
269272

270273
expected = TimedeltaIndex(["3 Days", "36 Hours"])
271-
if isinstance(three_days, np.timedelta64) and box not in [Index, Series]:
274+
if isinstance(three_days, np.timedelta64):
272275
# TODO(2.0): just use expected.astype
273276
tda = expected._data
274277
dtype = three_days.dtype
275278
if dtype < np.dtype("m8[s]"):
276279
# i.e. resolution is lower -> use lowest supported resolution
277280
dtype = np.dtype("m8[s]")
278281
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
279-
elif type(three_days) is timedelta and box not in [Index, Series]:
282+
elif type(three_days) is timedelta:
280283
# TODO(2.0): just use expected.astype
281284
tda = expected._data
282285
dtype = np.dtype("m8[us]")

pandas/tests/arrays/test_array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ def test_array_copy():
271271
),
272272
(
273273
np.array([1, 2], dtype="m8[us]"),
274-
TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")),
274+
TimedeltaArray(np.array([1, 2], dtype="m8[us]")),
275275
),
276276
# integer
277277
([1, 2], IntegerArray._from_sequence([1, 2])),

pandas/tests/frame/methods/test_astype.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -484,15 +484,21 @@ def test_astype_to_timedelta_unit(self, unit):
484484
dtype = f"m8[{unit}]"
485485
arr = np.array([[1, 2, 3]], dtype=dtype)
486486
df = DataFrame(arr)
487-
result = df.astype(dtype)
487+
if unit in ["us", "ms", "s"]:
488+
assert (df.dtypes == dtype).all()
489+
else:
490+
# We get the nearest supported unit, i.e. "s"
491+
assert (df.dtypes == "m8[s]").all()
488492

493+
result = df.astype(dtype)
489494
if unit in ["m", "h", "D"]:
490-
# We don't support these, so we use the old logic to convert to float
495+
# We don't support these, so we use the pre-2.0 logic to convert to float
496+
# (xref GH#48979)
497+
491498
expected = DataFrame(df.values.astype(dtype).astype(float))
492499
else:
493-
tda = pd.core.arrays.TimedeltaArray._simple_new(arr, dtype=arr.dtype)
494-
expected = DataFrame(tda)
495-
assert (expected.dtypes == dtype).all()
500+
# The conversion is a no-op, so we just get a copy
501+
expected = df
496502

497503
tm.assert_frame_equal(result, expected)
498504

pandas/tests/frame/test_block_internals.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -215,14 +215,15 @@ def test_construction_with_mixed(self, float_string_frame):
215215

216216
def test_construction_with_conversions(self):
217217

218-
# convert from a numpy array of non-ns timedelta64
218+
# convert from a numpy array of non-ns timedelta64; as of 2.0 this does
219+
# *not* convert
219220
arr = np.array([1, 2, 3], dtype="timedelta64[s]")
220221
df = DataFrame(index=range(3))
221222
df["A"] = arr
222223
expected = DataFrame(
223224
{"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
224225
)
225-
tm.assert_frame_equal(df, expected)
226+
tm.assert_numpy_array_equal(df["A"].to_numpy(), arr)
226227

227228
expected = DataFrame(
228229
{

pandas/tests/frame/test_constructors.py

+20-10
Original file line numberDiff line numberDiff line change
@@ -2075,18 +2075,19 @@ def test_constructor_datetimes_non_ns(self, order, dtype):
20752075

20762076
@pytest.mark.parametrize("order", ["K", "A", "C", "F"])
20772077
@pytest.mark.parametrize(
2078-
"dtype",
2078+
"unit",
20792079
[
2080-
"timedelta64[D]",
2081-
"timedelta64[h]",
2082-
"timedelta64[m]",
2083-
"timedelta64[s]",
2084-
"timedelta64[ms]",
2085-
"timedelta64[us]",
2086-
"timedelta64[ns]",
2080+
"D",
2081+
"h",
2082+
"m",
2083+
"s",
2084+
"ms",
2085+
"us",
2086+
"ns",
20872087
],
20882088
)
2089-
def test_constructor_timedelta_non_ns(self, order, dtype):
2089+
def test_constructor_timedelta_non_ns(self, order, unit):
2090+
dtype = f"timedelta64[{unit}]"
20902091
na = np.array(
20912092
[
20922093
[np.timedelta64(1, "D"), np.timedelta64(2, "D")],
@@ -2095,13 +2096,22 @@ def test_constructor_timedelta_non_ns(self, order, dtype):
20952096
dtype=dtype,
20962097
order=order,
20972098
)
2098-
df = DataFrame(na).astype("timedelta64[ns]")
2099+
df = DataFrame(na)
2100+
if unit in ["D", "h", "m"]:
2101+
# we get the nearest supported unit, i.e. "s"
2102+
exp_unit = "s"
2103+
else:
2104+
exp_unit = unit
2105+
exp_dtype = np.dtype(f"m8[{exp_unit}]")
20992106
expected = DataFrame(
21002107
[
21012108
[Timedelta(1, "D"), Timedelta(2, "D")],
21022109
[Timedelta(4, "D"), Timedelta(5, "D")],
21032110
],
2111+
dtype=exp_dtype,
21042112
)
2113+
# TODO(2.0): ideally we should get the same 'expected' without passing
2114+
# dtype=exp_dtype.
21052115
tm.assert_frame_equal(df, expected)
21062116

21072117
def test_constructor_for_list_with_dtypes(self):

0 commit comments

Comments
 (0)