Skip to content

Commit 82a2b3d

Browse files
jbrockmendel authored and noatamir committed
API: retain non-nano dtype in DatetimeArray constructor (pandas-dev#49058)
* API: retain non-nano dtype in DatetimeArray constructor
* update test
* un-xfail
* un-xfail
* un-xfail
1 parent bcc1f96 commit 82a2b3d

File tree

14 files changed

+125
-68
lines changed

14 files changed

+125
-68
lines changed

pandas/core/arrays/datetimes.py

+45-13
Original file line number | Diff line number | Diff line change
@@ -29,12 +29,14 @@
2929
astype_overflowsafe,
3030
fields,
3131
get_resolution,
32+
get_supported_reso,
3233
get_unit_from_dtype,
3334
ints_to_pydatetime,
3435
is_date_array_normalized,
3536
is_supported_unit,
3637
is_unitless,
3738
normalize_i8_timestamps,
39+
npy_unit_to_abbrev,
3840
timezones,
3941
to_offset,
4042
tz_convert_from_utc,
@@ -321,6 +323,14 @@ def _from_sequence_not_strict(
321323
# if dtype has an embedded tz, capture it
322324
tz = validate_tz_from_dtype(dtype, tz, explicit_tz_none)
323325

326+
unit = None
327+
if dtype is not None:
328+
if isinstance(dtype, np.dtype):
329+
unit = np.datetime_data(dtype)[0]
330+
else:
331+
# DatetimeTZDtype
332+
unit = dtype.unit
333+
324334
subarr, tz, inferred_freq = _sequence_to_dt64ns(
325335
data,
326336
copy=copy,
@@ -341,8 +351,12 @@ def _from_sequence_not_strict(
341351
if explicit_none:
342352
freq = None
343353

344-
dtype = tz_to_dtype(tz)
345-
result = cls._simple_new(subarr, freq=freq, dtype=dtype)
354+
data_unit = np.datetime_data(subarr.dtype)[0]
355+
data_dtype = tz_to_dtype(tz, data_unit)
356+
result = cls._simple_new(subarr, freq=freq, dtype=data_dtype)
357+
if unit is not None and unit != result._unit:
358+
# If unit was specified in user-passed dtype, cast to it here
359+
result = result._as_unit(unit)
346360

347361
if inferred_freq is None and freq is not None:
348362
# this condition precludes `freq_infer`
@@ -2004,7 +2018,8 @@ def sequence_to_datetimes(data, require_iso8601: bool = False) -> DatetimeArray:
20042018
require_iso8601=require_iso8601,
20052019
)
20062020

2007-
dtype = tz_to_dtype(tz)
2021+
unit = np.datetime_data(result.dtype)[0]
2022+
dtype = tz_to_dtype(tz, unit)
20082023
dta = DatetimeArray._simple_new(result, freq=freq, dtype=dtype)
20092024
return dta
20102025

@@ -2110,20 +2125,33 @@ def _sequence_to_dt64ns(
21102125
elif is_datetime64_dtype(data_dtype):
21112126
# tz-naive DatetimeArray or ndarray[datetime64]
21122127
data = getattr(data, "_ndarray", data)
2113-
if data.dtype != DT64NS_DTYPE:
2114-
data = astype_overflowsafe(data, dtype=DT64NS_DTYPE)
2128+
new_dtype = data.dtype
2129+
data_unit = get_unit_from_dtype(new_dtype)
2130+
if not is_supported_unit(data_unit):
2131+
# Cast to the nearest supported unit, generally "s"
2132+
new_reso = get_supported_reso(data_unit)
2133+
new_unit = npy_unit_to_abbrev(new_reso)
2134+
new_dtype = np.dtype(f"M8[{new_unit}]")
2135+
data = astype_overflowsafe(data, dtype=new_dtype, copy=False)
2136+
copy = False
2137+
2138+
if data.dtype.byteorder == ">":
2139+
# TODO: better way to handle this? non-copying alternative?
2140+
# without this, test_constructor_datetime64_bigendian fails
2141+
data = data.astype(data.dtype.newbyteorder("<"))
2142+
new_dtype = data.dtype
21152143
copy = False
21162144

21172145
if tz is not None:
21182146
# Convert tz-naive to UTC
21192147
# TODO: if tz is UTC, are there situations where we *don't* want a
21202148
# copy? tz_localize_to_utc always makes one.
21212149
data = tzconversion.tz_localize_to_utc(
2122-
data.view("i8"), tz, ambiguous=ambiguous
2150+
data.view("i8"), tz, ambiguous=ambiguous, reso=data_unit
21232151
)
2124-
data = data.view(DT64NS_DTYPE)
2152+
data = data.view(new_dtype)
21252153

2126-
assert data.dtype == DT64NS_DTYPE, data.dtype
2154+
assert data.dtype == new_dtype, data.dtype
21272155
result = data
21282156

21292157
else:
@@ -2137,7 +2165,9 @@ def _sequence_to_dt64ns(
21372165
result = result.copy()
21382166

21392167
assert isinstance(result, np.ndarray), type(result)
2140-
assert result.dtype == "M8[ns]", result.dtype
2168+
assert result.dtype.kind == "M"
2169+
assert result.dtype != "M8"
2170+
assert is_supported_unit(get_unit_from_dtype(result.dtype))
21412171
return result, tz, inferred_freq
21422172

21432173

@@ -2358,12 +2388,14 @@ def _validate_dt64_dtype(dtype):
23582388
)
23592389
raise ValueError(msg)
23602390

2361-
if (isinstance(dtype, np.dtype) and dtype != DT64NS_DTYPE) or not isinstance(
2362-
dtype, (np.dtype, DatetimeTZDtype)
2363-
):
2391+
if (
2392+
isinstance(dtype, np.dtype)
2393+
and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype)))
2394+
) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)):
23642395
raise ValueError(
23652396
f"Unexpected value for 'dtype': '{dtype}'. "
2366-
"Must be 'datetime64[ns]' or DatetimeTZDtype'."
2397+
"Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
2398+
"'datetime64[ns]' or DatetimeTZDtype'."
23672399
)
23682400

23692401
if getattr(dtype, "tz", None):

pandas/core/dtypes/cast.py

+3
Original file line number | Diff line number | Diff line change
@@ -1373,6 +1373,9 @@ def maybe_cast_to_datetime(
13731373
# Note: NOT equivalent to dta.astype(dtype)
13741374
dta = dta.tz_localize(None)
13751375

1376+
# TODO(2.0): Do this astype in sequence_to_datetimes to
1377+
# avoid potential extra copy?
1378+
dta = dta.astype(dtype, copy=False)
13761379
value = dta
13771380
elif is_datetime64tz:
13781381
dtype = cast(DatetimeTZDtype, dtype)

pandas/tests/arrays/categorical/test_constructors.py

-9
Original file line number | Diff line number | Diff line change
@@ -6,11 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
from pandas.compat import (
10-
IS64,
11-
is_platform_windows,
12-
)
13-
149
from pandas.core.dtypes.common import (
1510
is_float_dtype,
1611
is_integer_dtype,
@@ -749,10 +744,6 @@ def test_from_sequence_copy(self):
749744

750745
assert not tm.shares_memory(result, cat)
751746

752-
@pytest.mark.xfail(
753-
not IS64 or is_platform_windows(),
754-
reason="Incorrectly raising in astype_overflowsafe",
755-
)
756747
def test_constructor_datetime64_non_nano(self):
757748
categories = np.arange(10).view("M8[D]")
758749
values = categories[::2].copy()

pandas/tests/arrays/test_array.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -242,7 +242,9 @@ def test_array_copy():
242242
),
243243
(
244244
np.array([1, 2], dtype="M8[us]"),
245-
DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")),
245+
DatetimeArray._simple_new(
246+
np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
247+
),
246248
),
247249
# datetimetz
248250
(

pandas/tests/base/test_constructors.py

+7-4
Original file line number | Diff line number | Diff line change
@@ -146,9 +146,9 @@ def test_constructor_datetime_outofbound(self, a, constructor):
146146
# datetime64[non-ns] raise error, other cases result in object dtype
147147
# and preserve original data
148148
if a.dtype.kind == "M":
149-
msg = "Out of bounds"
150-
with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
151-
constructor(a)
149+
# Can't fit in nanosecond bounds -> get the nearest supported unit
150+
result = constructor(a)
151+
assert result.dtype == "M8[s]"
152152
else:
153153
result = constructor(a)
154154
assert result.dtype == "object"
@@ -162,7 +162,10 @@ def test_constructor_datetime_outofbound(self, a, constructor):
162162

163163
def test_constructor_datetime_nonns(self, constructor):
164164
arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]")
165-
expected = constructor(pd.to_datetime(["2020-01-01"]))
165+
dta = pd.core.arrays.DatetimeArray._simple_new(arr, dtype=arr.dtype)
166+
expected = constructor(dta)
167+
assert expected.dtype == arr.dtype
168+
166169
result = constructor(arr)
167170
tm.assert_equal(result, expected)
168171

pandas/tests/frame/constructors/test_from_records.py

+1
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ def test_from_records_with_datetimes(self):
4444
dtypes = [("EXPIRY", "<M8[m]")]
4545
recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
4646
result = DataFrame.from_records(recarray)
47+
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[m]")
4748
tm.assert_frame_equal(result, expected)
4849

4950
def test_from_records_sequencelike(self):

pandas/tests/frame/indexing/test_setitem.py

+10-5
Original file line number | Diff line number | Diff line change
@@ -277,11 +277,11 @@ def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
277277
expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
278278
tm.assert_frame_equal(result, expected)
279279

280-
# OutOfBoundsDatetime error shouldn't occur
280+
# OutOfBoundsDatetime error shouldn't occur; as of 2.0 we preserve "M8[s]"
281281
data_s = np.array([1, "nat"], dtype="datetime64[s]")
282282
result["new"] = data_s
283-
expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
284-
tm.assert_frame_equal(result, expected)
283+
tm.assert_series_equal(result[0], expected[0])
284+
tm.assert_numpy_array_equal(result["new"].to_numpy(), data_s)
285285

286286
@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
287287
def test_frame_setitem_datetime64_col_other_units(self, unit):
@@ -291,12 +291,17 @@ def test_frame_setitem_datetime64_col_other_units(self, unit):
291291

292292
dtype = np.dtype(f"M8[{unit}]")
293293
vals = np.arange(n, dtype=np.int64).view(dtype)
294-
ex_vals = vals.astype("datetime64[ns]")
294+
if unit in ["s", "ms"]:
295+
# supported unit
296+
ex_vals = vals
297+
else:
298+
# we get the nearest supported units, i.e. "s"
299+
ex_vals = vals.astype("datetime64[s]")
295300

296301
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
297302
df[unit] = vals
298303

299-
assert df[unit].dtype == np.dtype("M8[ns]")
304+
assert df[unit].dtype == ex_vals.dtype
300305
assert (df[unit].values == ex_vals).all()
301306

302307
@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])

pandas/tests/frame/methods/test_astype.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -424,9 +424,8 @@ def test_astype_to_datetime_unit(self, unit):
424424
# GH#48928
425425
exp_dtype = dtype
426426
else:
427-
# TODO(2.0): use the nearest supported dtype (i.e. M8[s]) instead
428-
# of nanos
429-
exp_dtype = "M8[ns]"
427+
# we use the nearest supported dtype (i.e. M8[s])
428+
exp_dtype = "M8[s]"
430429
# TODO(2.0): once DataFrame constructor doesn't cast ndarray inputs.
431430
# can simplify this
432431
exp_values = arr.astype(exp_dtype)

pandas/tests/frame/test_block_internals.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -228,11 +228,13 @@ def test_construction_with_conversions(self):
228228
expected = DataFrame(
229229
{
230230
"dt1": Timestamp("20130101"),
231-
"dt2": date_range("20130101", periods=3),
231+
"dt2": date_range("20130101", periods=3).astype("M8[s]"),
232232
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
233233
},
234234
index=range(3),
235235
)
236+
assert expected.dtypes["dt1"] == "M8[ns]"
237+
assert expected.dtypes["dt2"] == "M8[s]"
236238

237239
df = DataFrame(index=range(3))
238240
df["dt1"] = np.datetime64("2013-01-01")

pandas/tests/indexes/datetimes/test_constructors.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -913,9 +913,9 @@ def test_constructor_no_precision_raises(self):
913913
Index(["2000"], dtype="datetime64")
914914

915915
def test_constructor_wrong_precision_raises(self):
916-
msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'"
917-
with pytest.raises(ValueError, match=msg):
918-
DatetimeIndex(["2000"], dtype="datetime64[us]")
916+
dti = DatetimeIndex(["2000"], dtype="datetime64[us]")
917+
assert dti.dtype == "M8[us]"
918+
assert dti[0] == Timestamp(2000, 1, 1)
919919

920920
def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(self):
921921
# GH 27011

pandas/tests/reshape/merge/test_merge.py

+13-3
Original file line number | Diff line number | Diff line change
@@ -731,18 +731,28 @@ def test_other_datetime_unit(self, unit):
731731

732732
dtype = f"datetime64[{unit}]"
733733
df2 = ser.astype(dtype).to_frame("days")
734-
# coerces to datetime64[ns], thus should not be affected
735-
assert df2["days"].dtype == "datetime64[ns]"
734+
735+
if unit in ["D", "h", "m"]:
736+
# not supported so we cast to the nearest supported unit, seconds
737+
# TODO(2.0): cast to nearest (second) instead of ns
738+
# coerces to datetime64[ns], thus should not be affected
739+
exp_dtype = "datetime64[s]"
740+
else:
741+
exp_dtype = dtype
742+
assert df2["days"].dtype == exp_dtype
736743

737744
result = df1.merge(df2, left_on="entity_id", right_index=True)
738745

746+
days = np.array(["nat", "nat"], dtype=exp_dtype)
747+
days = pd.core.arrays.DatetimeArray._simple_new(days, dtype=days.dtype)
739748
exp = DataFrame(
740749
{
741750
"entity_id": [101, 102],
742-
"days": np.array(["nat", "nat"], dtype="datetime64[ns]"),
751+
"days": days,
743752
},
744753
columns=["entity_id", "days"],
745754
)
755+
assert exp["days"].dtype == exp_dtype
746756
tm.assert_frame_equal(result, exp)
747757

748758
@pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])

0 commit comments

Comments
 (0)