Skip to content

Commit b04a889

Browse files
jbrockmendel
authored and
im-vinicius
committed
API/BUG: infer_dtype_from_scalar with non-nano (pandas-dev#52212)
* API/BUG: infer_dtype_from_scalar with non-nano
* update test
* xfail on 32bit
* fix xfail condition
* whatsnew
* xfail on windows
1 parent 9116a2b commit b04a889

23 files changed

+166
-91
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ Datetimelike
317317
- Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`)
318318
- Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`)
319319
- Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
320+
- Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`)
320321
- Bug in parsing datetime strings with weekday but no day e.g. "2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`)
321322
-
322323

pandas/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -931,7 +931,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
931931
(Period("2012-02-01", freq="D"), "period[D]"),
932932
(
933933
Timestamp("2011-01-01", tz="US/Eastern"),
934-
DatetimeTZDtype(tz="US/Eastern"),
934+
DatetimeTZDtype(unit="s", tz="US/Eastern"),
935935
),
936936
(Timedelta(seconds=500), "timedelta64[ns]"),
937937
]

pandas/core/dtypes/cast.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,18 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
645645
if inferred == dtype:
646646
return dtype, fv
647647

648-
return np.dtype("object"), fill_value
648+
elif inferred.kind == "m":
649+
# different unit, e.g. passed np.timedelta64(24, "h") with dtype=m8[ns]
650+
# see if we can losslessly cast it to our dtype
651+
unit = np.datetime_data(dtype)[0]
652+
try:
653+
td = Timedelta(fill_value).as_unit(unit, round_ok=False)
654+
except OutOfBoundsTimedelta:
655+
return _dtype_obj, fill_value
656+
else:
657+
return dtype, td.asm8
658+
659+
return _dtype_obj, fill_value
649660

650661
elif is_float(fill_value):
651662
if issubclass(dtype.type, np.bool_):
@@ -775,8 +786,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
775786
elif isinstance(val, (np.datetime64, dt.datetime)):
776787
try:
777788
val = Timestamp(val)
778-
if val is not NaT:
779-
val = val.as_unit("ns")
780789
except OutOfBoundsDatetime:
781790
return _dtype_obj, val
782791

@@ -785,16 +794,19 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
785794
dtype = val.dtype
786795
# TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
787796
else:
788-
dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
797+
dtype = DatetimeTZDtype(unit=val.unit, tz=val.tz)
789798

790799
elif isinstance(val, (np.timedelta64, dt.timedelta)):
791800
try:
792801
val = Timedelta(val)
793802
except (OutOfBoundsTimedelta, OverflowError):
794803
dtype = _dtype_obj
795804
else:
796-
dtype = np.dtype("m8[ns]")
797-
val = np.timedelta64(val.value, "ns")
805+
if val is NaT:
806+
val = np.timedelta64("NaT", "ns")
807+
else:
808+
val = val.asm8
809+
dtype = val.dtype
798810

799811
elif is_bool(val):
800812
dtype = np.dtype(np.bool_)

pandas/tests/dtypes/cast/test_infer_dtype.py

+25-11
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,31 @@ def test_infer_dtype_from_complex(complex_dtype):
6161
assert dtype == np.complex_
6262

6363

64-
@pytest.mark.parametrize(
65-
"data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)]
66-
)
67-
def test_infer_dtype_from_datetime(data):
68-
dtype, val = infer_dtype_from_scalar(data)
64+
def test_infer_dtype_from_datetime():
65+
dt64 = np.datetime64(1, "ns")
66+
dtype, val = infer_dtype_from_scalar(dt64)
6967
assert dtype == "M8[ns]"
7068

69+
ts = Timestamp(1)
70+
dtype, val = infer_dtype_from_scalar(ts)
71+
assert dtype == "M8[ns]"
7172

72-
@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)])
73-
def test_infer_dtype_from_timedelta(data):
74-
dtype, val = infer_dtype_from_scalar(data)
73+
dt = datetime(2000, 1, 1, 0, 0)
74+
dtype, val = infer_dtype_from_scalar(dt)
75+
assert dtype == "M8[us]"
76+
77+
78+
def test_infer_dtype_from_timedelta():
79+
td64 = np.timedelta64(1, "ns")
80+
dtype, val = infer_dtype_from_scalar(td64)
81+
assert dtype == "m8[ns]"
82+
83+
pytd = timedelta(1)
84+
dtype, val = infer_dtype_from_scalar(pytd)
85+
assert dtype == "m8[us]"
86+
87+
td = Timedelta(1)
88+
dtype, val = infer_dtype_from_scalar(td)
7589
assert dtype == "m8[ns]"
7690

7791

@@ -140,9 +154,9 @@ def test_infer_dtype_from_scalar_errors():
140154
(b"foo", np.object_),
141155
(1, np.int64),
142156
(1.5, np.float_),
143-
(np.datetime64("2016-01-01"), np.dtype("M8[ns]")),
144-
(Timestamp("20160101"), np.dtype("M8[ns]")),
145-
(Timestamp("20160101", tz="UTC"), "datetime64[ns, UTC]"),
157+
(np.datetime64("2016-01-01"), np.dtype("M8[s]")),
158+
(Timestamp("20160101"), np.dtype("M8[s]")),
159+
(Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"),
146160
],
147161
)
148162
def test_infer_dtype_from_scalar(value, expected):

pandas/tests/frame/indexing/test_indexing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -819,7 +819,7 @@ def test_setitem_single_column_mixed_datetime(self):
819819
# check our dtypes
820820
result = df.dtypes
821821
expected = Series(
822-
[np.dtype("float64")] * 3 + [np.dtype("datetime64[ns]")],
822+
[np.dtype("float64")] * 3 + [np.dtype("datetime64[s]")],
823823
index=["foo", "bar", "baz", "timestamp"],
824824
)
825825
tm.assert_series_equal(result, expected)

pandas/tests/frame/indexing/test_setitem.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def test_setitem_dt64_index_empty_columns(self):
154154
def test_setitem_timestamp_empty_columns(self):
155155
# GH#19843
156156
df = DataFrame(index=range(3))
157-
df["now"] = Timestamp("20130101", tz="UTC")
157+
df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")
158158

159159
expected = DataFrame(
160160
[[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
@@ -234,7 +234,7 @@ def test_setitem_dict_preserves_dtypes(self):
234234
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
235235
(
236236
Timestamp("2011-01-01", tz="US/Eastern"),
237-
DatetimeTZDtype(tz="US/Eastern"),
237+
DatetimeTZDtype(unit="s", tz="US/Eastern"),
238238
),
239239
],
240240
)

pandas/tests/frame/methods/test_get_numeric_data.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_get_numeric_data_preserve_dtype(self):
2121
tm.assert_frame_equal(result, expected)
2222

2323
def test_get_numeric_data(self):
24-
datetime64name = np.dtype("M8[ns]").name
24+
datetime64name = np.dtype("M8[s]").name
2525
objectname = np.dtype(np.object_).name
2626

2727
df = DataFrame(

pandas/tests/frame/methods/test_reindex.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
import pytest
99

1010
from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
11+
from pandas.compat import (
12+
IS64,
13+
is_platform_windows,
14+
)
1115
import pandas.util._test_decorators as td
1216

1317
import pandas as pd
@@ -118,15 +122,21 @@ class TestDataFrameSelectReindex:
118122
# These are specific reindex-based tests; other indexing tests should go in
119123
# test_indexing
120124

125+
@pytest.mark.xfail(
126+
not IS64 or is_platform_windows(),
127+
reason="Passes int32 values to DatetimeArray in make_na_array on "
128+
"windows, 32bit linux builds",
129+
)
121130
@td.skip_array_manager_not_yet_implemented
122131
def test_reindex_tzaware_fill_value(self):
123132
# GH#52586
124133
df = DataFrame([[1]])
125134

126135
ts = pd.Timestamp("2023-04-10 17:32", tz="US/Pacific")
127136
res = df.reindex([0, 1], axis=1, fill_value=ts)
128-
assert res.dtypes[1] == pd.DatetimeTZDtype(tz="US/Pacific")
137+
assert res.dtypes[1] == pd.DatetimeTZDtype(unit="s", tz="US/Pacific")
129138
expected = DataFrame({0: [1], 1: [ts]})
139+
expected[1] = expected[1].astype(res.dtypes[1])
130140
tm.assert_frame_equal(res, expected)
131141

132142
per = ts.tz_localize(None).to_period("s")
@@ -137,8 +147,9 @@ def test_reindex_tzaware_fill_value(self):
137147

138148
interval = pd.Interval(ts, ts + pd.Timedelta(seconds=1))
139149
res = df.reindex([0, 1], axis=1, fill_value=interval)
140-
assert res.dtypes[1] == pd.IntervalDtype("datetime64[ns, US/Pacific]", "right")
150+
assert res.dtypes[1] == pd.IntervalDtype("datetime64[s, US/Pacific]", "right")
141151
expected = DataFrame({0: [1], 1: [interval]})
152+
expected[1] = expected[1].astype(res.dtypes[1])
142153
tm.assert_frame_equal(res, expected)
143154

144155
def test_reindex_copies(self):

pandas/tests/frame/methods/test_to_csv.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -656,14 +656,17 @@ def create_cols(name):
656656
"foo", index=df_float.index, columns=create_cols("object")
657657
)
658658
df_dt = DataFrame(
659-
Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
659+
Timestamp("20010101").as_unit("ns"),
660+
index=df_float.index,
661+
columns=create_cols("date"),
660662
)
661663

662664
# add in some nans
663665
df_float.iloc[30:50, 1:3] = np.nan
664666

665667
# ## this is a bug in read_csv right now ####
666668
# df_dt.loc[30:50,1:3] = np.nan
669+
# FIXME: don't leave commented-out
667670

668671
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
669672

@@ -702,7 +705,9 @@ def test_to_csv_dups_cols(self):
702705
df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
703706
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
704707
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
705-
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
708+
df_dt = DataFrame(
709+
Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3)
710+
)
706711
df = pd.concat(
707712
[df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
708713
)

pandas/tests/frame/test_block_internals.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -191,20 +191,20 @@ def test_construction_with_mixed(self, float_string_frame):
191191

192192
# check dtypes
193193
result = df.dtypes
194-
expected = Series({"datetime64[ns]": 3})
194+
expected = Series({"datetime64[us]": 3})
195195

196196
# mixed-type frames
197197
float_string_frame["datetime"] = datetime.now()
198198
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
199-
assert float_string_frame["datetime"].dtype == "M8[ns]"
200-
assert float_string_frame["timedelta"].dtype == "m8[ns]"
199+
assert float_string_frame["datetime"].dtype == "M8[us]"
200+
assert float_string_frame["timedelta"].dtype == "m8[us]"
201201
result = float_string_frame.dtypes
202202
expected = Series(
203203
[np.dtype("float64")] * 4
204204
+ [
205205
np.dtype("object"),
206-
np.dtype("datetime64[ns]"),
207-
np.dtype("timedelta64[ns]"),
206+
np.dtype("datetime64[us]"),
207+
np.dtype("timedelta64[us]"),
208208
],
209209
index=list("ABCD") + ["foo", "datetime", "timedelta"],
210210
)
@@ -230,7 +230,7 @@ def test_construction_with_conversions(self):
230230
},
231231
index=range(3),
232232
)
233-
assert expected.dtypes["dt1"] == "M8[ns]"
233+
assert expected.dtypes["dt1"] == "M8[s]"
234234
assert expected.dtypes["dt2"] == "M8[s]"
235235

236236
df = DataFrame(index=range(3))

pandas/tests/frame/test_constructors.py

+28-15
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def test_constructor_from_2d_datetimearray(self, using_array_manager):
9797
def test_constructor_dict_with_tzaware_scalar(self):
9898
# GH#42505
9999
dt = Timestamp("2019-11-03 01:00:00-0700").tz_convert("America/Los_Angeles")
100+
dt = dt.as_unit("ns")
100101

101102
df = DataFrame({"dt": dt}, index=[0])
102103
expected = DataFrame({"dt": [dt]})
@@ -926,7 +927,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
926927
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
927928
(
928929
Timestamp("2011-01-01", tz="US/Eastern"),
929-
DatetimeTZDtype(tz="US/Eastern"),
930+
DatetimeTZDtype(unit="s", tz="US/Eastern"),
930931
),
931932
],
932933
)
@@ -1323,7 +1324,7 @@ def test_constructor_unequal_length_nested_list_column(self):
13231324
[[Timestamp("2021-01-01")]],
13241325
[{"x": Timestamp("2021-01-01")}],
13251326
{"x": [Timestamp("2021-01-01")]},
1326-
{"x": Timestamp("2021-01-01")},
1327+
{"x": Timestamp("2021-01-01").as_unit("ns")},
13271328
],
13281329
)
13291330
def test_constructor_one_element_data_list(self, data):
@@ -1814,7 +1815,6 @@ def test_constructor_single_value(self):
18141815
def test_constructor_with_datetimes(self):
18151816
intname = np.dtype(np.int_).name
18161817
floatname = np.dtype(np.float_).name
1817-
datetime64name = np.dtype("M8[ns]").name
18181818
objectname = np.dtype(np.object_).name
18191819

18201820
# single item
@@ -1832,7 +1832,7 @@ def test_constructor_with_datetimes(self):
18321832
expected = Series(
18331833
[np.dtype("int64")]
18341834
+ [np.dtype(objectname)] * 2
1835-
+ [np.dtype(datetime64name)] * 2,
1835+
+ [np.dtype("M8[s]"), np.dtype("M8[us]")],
18361836
index=list("ABCDE"),
18371837
)
18381838
tm.assert_series_equal(result, expected)
@@ -1912,7 +1912,7 @@ def test_constructor_with_datetimes3(self):
19121912
df = DataFrame({"End Date": dt}, index=[0])
19131913
assert df.iat[0, 0] == dt
19141914
tm.assert_series_equal(
1915-
df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"})
1915+
df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"})
19161916
)
19171917

19181918
df = DataFrame([{"End Date": dt}])
@@ -3047,15 +3047,22 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
30473047
with pytest.raises(TypeError, match=msg):
30483048
constructor(scalar, dtype=dtype)
30493049

3050-
@pytest.mark.xfail(
3051-
reason="Timestamp constructor has been updated to cast dt64 to non-nano, "
3052-
"but DatetimeArray._from_sequence has not"
3053-
)
30543050
@pytest.mark.parametrize("cls", [datetime, np.datetime64])
3055-
def test_from_out_of_bounds_ns_datetime(self, constructor, cls):
3051+
def test_from_out_of_bounds_ns_datetime(
3052+
self, constructor, cls, request, box, frame_or_series
3053+
):
30563054
# scalar that won't fit in nanosecond dt64, but will fit in microsecond
3055+
if box is list or (frame_or_series is Series and box is dict):
3056+
mark = pytest.mark.xfail(
3057+
reason="Timestamp constructor has been updated to cast dt64 to "
3058+
"non-nano, but DatetimeArray._from_sequence has not",
3059+
strict=True,
3060+
)
3061+
request.node.add_marker(mark)
3062+
30573063
scalar = datetime(9999, 1, 1)
30583064
exp_dtype = "M8[us]" # pydatetime objects default to this reso
3065+
30593066
if cls is np.datetime64:
30603067
scalar = np.datetime64(scalar, "D")
30613068
exp_dtype = "M8[s]" # closest reso to input
@@ -3076,13 +3083,19 @@ def test_out_of_s_bounds_datetime64(self, constructor):
30763083
dtype = tm.get_dtype(result)
30773084
assert dtype == object
30783085

3079-
@pytest.mark.xfail(
3080-
reason="TimedeltaArray constructor has been updated to cast td64 to non-nano, "
3081-
"but TimedeltaArray._from_sequence has not"
3082-
)
30833086
@pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
3084-
def test_from_out_of_bounds_ns_timedelta(self, constructor, cls):
3087+
def test_from_out_of_bounds_ns_timedelta(
3088+
self, constructor, cls, request, box, frame_or_series
3089+
):
30853090
# scalar that won't fit in nanosecond td64, but will fit in microsecond
3091+
if box is list or (frame_or_series is Series and box is dict):
3092+
mark = pytest.mark.xfail(
3093+
reason="TimedeltaArray constructor has been updated to cast td64 "
3094+
"to non-nano, but TimedeltaArray._from_sequence has not",
3095+
strict=True,
3096+
)
3097+
request.node.add_marker(mark)
3098+
30863099
scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
30873100
exp_dtype = "m8[us]" # smallest reso that fits
30883101
if cls is np.timedelta64:

pandas/tests/groupby/test_apply.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,9 @@ def func_with_date(batch):
721721
dfg_no_conversion_expected.index.name = "a"
722722

723723
dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
724-
dfg_conversion_expected = DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1])
724+
dfg_conversion_expected = DataFrame(
725+
{"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1]
726+
)
725727
dfg_conversion_expected.index.name = "a"
726728

727729
tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)

pandas/tests/groupby/test_groupby_shift_diff.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def test_group_shift_with_fill_value():
6262

6363
def test_group_shift_lose_timezone():
6464
# GH 30134
65-
now_dt = Timestamp.utcnow()
65+
now_dt = Timestamp.utcnow().as_unit("ns")
6666
df = DataFrame({"a": [1, 1], "date": now_dt})
6767
result = df.groupby("a").shift(0).iloc[0]
6868
expected = Series({"date": now_dt}, name=result.name)

0 commit comments

Comments
 (0)