Skip to content

Commit b8ee68b

Browse files
authored
DEPR: datetimelike inference with strings (#41731)
1 parent 7b8f638 commit b8ee68b

File tree

13 files changed

+113
-60
lines changed

13 files changed

+113
-60
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ Deprecations
698698
- Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`)
699699
- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`)
700700
- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
701+
- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
701702
- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
702703
- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)
703704
- Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`)

pandas/_libs/lib.pyi

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def ensure_string_array(
153153

154154
def infer_datetimelike_array(
155155
arr: np.ndarray # np.ndarray[object]
156-
) -> str: ...
156+
) -> tuple[str, bool]: ...
157157

158158
def astype_intsafe(
159159
arr: np.ndarray, # np.ndarray[object]

pandas/_libs/lib.pyx

+12-10
Original file line numberDiff line numberDiff line change
@@ -1558,7 +1558,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
15581558
return "mixed"
15591559

15601560

1561-
def infer_datetimelike_array(arr: ndarray[object]) -> str:
1561+
def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
15621562
"""
15631563
Infer if we have a datetime or timedelta array.
15641564
- date: we have *only* date and maybe strings, nulls
@@ -1576,19 +1576,21 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
15761576
Returns
15771577
-------
15781578
str: {datetime, timedelta, date, nat, mixed}
1579+
bool: True if at least one ``str`` element was encountered while scanning ``arr``
15791580
"""
15801581
cdef:
15811582
Py_ssize_t i, n = len(arr)
15821583
bint seen_timedelta = False, seen_date = False, seen_datetime = False
15831584
bint seen_tz_aware = False, seen_tz_naive = False
1584-
bint seen_nat = False
1585+
bint seen_nat = False, seen_str = False
15851586
list objs = []
15861587
object v
15871588

15881589
for i in range(n):
15891590
v = arr[i]
15901591
if isinstance(v, str):
15911592
objs.append(v)
1593+
seen_str = True
15921594

15931595
if len(objs) == 3:
15941596
break
@@ -1609,7 +1611,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
16091611
seen_tz_aware = True
16101612

16111613
if seen_tz_naive and seen_tz_aware:
1612-
return "mixed"
1614+
return "mixed", seen_str
16131615
elif util.is_datetime64_object(v):
16141616
# np.datetime64
16151617
seen_datetime = True
@@ -1619,16 +1621,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
16191621
# timedelta, or timedelta64
16201622
seen_timedelta = True
16211623
else:
1622-
return "mixed"
1624+
return "mixed", seen_str
16231625

16241626
if seen_date and not (seen_datetime or seen_timedelta):
1625-
return "date"
1627+
return "date", seen_str
16261628
elif seen_datetime and not seen_timedelta:
1627-
return "datetime"
1629+
return "datetime", seen_str
16281630
elif seen_timedelta and not seen_datetime:
1629-
return "timedelta"
1631+
return "timedelta", seen_str
16301632
elif seen_nat:
1631-
return "nat"
1633+
return "nat", seen_str
16321634

16331635
# short-circuit by trying to
16341636
# actually convert these strings
@@ -1637,14 +1639,14 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str:
16371639
if len(objs):
16381640
try:
16391641
array_to_datetime(objs, errors="raise")
1640-
return "datetime"
1642+
return "datetime", seen_str
16411643
except (ValueError, TypeError):
16421644
pass
16431645

16441646
# we are *not* going to infer from strings
16451647
# for timedelta as too much ambiguity
16461648

1647-
return 'mixed'
1649+
return "mixed", seen_str
16481650

16491651

16501652
cdef inline bint is_timedelta(object o):

pandas/core/dtypes/cast.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -1543,7 +1543,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
15431543
else:
15441544
return td_values.reshape(shape)
15451545

1546-
inferred_type = lib.infer_datetimelike_array(ensure_object(v))
1546+
inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
15471547

15481548
if inferred_type == "datetime":
15491549
# error: Incompatible types in assignment (expression has type "ExtensionArray",
@@ -1572,6 +1572,15 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
15721572
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
15731573
value = try_datetime(v) # type: ignore[assignment]
15741574

1575+
if value.dtype.kind in ["m", "M"] and seen_str:
1576+
warnings.warn(
1577+
f"Inferring {value.dtype} from data containing strings is deprecated "
1578+
"and will be removed in a future version. To retain the old behavior "
1579+
"explicitly pass Series(data, dtype={value.dtype})",
1580+
FutureWarning,
1581+
stacklevel=find_stack_level(),
1582+
)
1583+
# return v.reshape(shape)
15751584
return value
15761585

15771586

pandas/tests/apply/test_series_apply.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,9 @@ def test_apply_to_timedelta():
859859
list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
860860

861861
a = pd.to_timedelta(list_of_strings) # noqa
862-
b = Series(list_of_strings).apply(pd.to_timedelta) # noqa
862+
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
863+
ser = Series(list_of_strings)
864+
b = ser.apply(pd.to_timedelta) # noqa
863865
# Can't compare until apply on a Series gives the correct dtype
864866
# assert_series_equal(a, b)
865867

pandas/tests/arithmetic/test_datetime64.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,7 @@ def test_dt64arr_timestamp_equality(self, box_with_array):
328328
box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray
329329
)
330330

331-
ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), "NaT"])
331+
ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), NaT])
332332
ser = tm.box_expected(ser, box_with_array)
333333

334334
result = ser != ser

pandas/tests/dtypes/test_inference.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -1169,7 +1169,7 @@ def test_infer_dtype_period_with_na(self, na_value):
11691169
],
11701170
)
11711171
def test_infer_datetimelike_array_datetime(self, data):
1172-
assert lib.infer_datetimelike_array(data) == "datetime"
1172+
assert lib.infer_datetimelike_array(data) == ("datetime", False)
11731173

11741174
@pytest.mark.parametrize(
11751175
"data",
@@ -1181,11 +1181,11 @@ def test_infer_datetimelike_array_datetime(self, data):
11811181
],
11821182
)
11831183
def test_infer_datetimelike_array_timedelta(self, data):
1184-
assert lib.infer_datetimelike_array(data) == "timedelta"
1184+
assert lib.infer_datetimelike_array(data) == ("timedelta", False)
11851185

11861186
def test_infer_datetimelike_array_date(self):
11871187
arr = [date(2017, 6, 12), date(2017, 3, 11)]
1188-
assert lib.infer_datetimelike_array(arr) == "date"
1188+
assert lib.infer_datetimelike_array(arr) == ("date", False)
11891189

11901190
@pytest.mark.parametrize(
11911191
"data",
@@ -1200,7 +1200,7 @@ def test_infer_datetimelike_array_date(self):
12001200
],
12011201
)
12021202
def test_infer_datetimelike_array_mixed(self, data):
1203-
assert lib.infer_datetimelike_array(data) == "mixed"
1203+
assert lib.infer_datetimelike_array(data)[0] == "mixed"
12041204

12051205
@pytest.mark.parametrize(
12061206
"first, expected",
@@ -1218,7 +1218,7 @@ def test_infer_datetimelike_array_mixed(self, data):
12181218
@pytest.mark.parametrize("second", [None, np.nan])
12191219
def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected):
12201220
first.append(second)
1221-
assert lib.infer_datetimelike_array(first) == expected
1221+
assert lib.infer_datetimelike_array(first) == (expected, False)
12221222

12231223
def test_infer_dtype_all_nan_nat_like(self):
12241224
arr = np.array([np.nan, np.nan])

pandas/tests/resample/test_time_grouper.py

+24-21
Original file line numberDiff line numberDiff line change
@@ -305,27 +305,30 @@ def test_groupby_resample_interpolate():
305305
.resample("1D")
306306
.interpolate(method="linear")
307307
)
308-
expected_ind = pd.MultiIndex.from_tuples(
309-
[
310-
(50, "2018-01-07"),
311-
(50, Timestamp("2018-01-08")),
312-
(50, Timestamp("2018-01-09")),
313-
(50, Timestamp("2018-01-10")),
314-
(50, Timestamp("2018-01-11")),
315-
(50, Timestamp("2018-01-12")),
316-
(50, Timestamp("2018-01-13")),
317-
(50, Timestamp("2018-01-14")),
318-
(50, Timestamp("2018-01-15")),
319-
(50, Timestamp("2018-01-16")),
320-
(50, Timestamp("2018-01-17")),
321-
(50, Timestamp("2018-01-18")),
322-
(50, Timestamp("2018-01-19")),
323-
(50, Timestamp("2018-01-20")),
324-
(50, Timestamp("2018-01-21")),
325-
(60, Timestamp("2018-01-14")),
326-
],
327-
names=["volume", "week_starting"],
328-
)
308+
309+
msg = "containing strings is deprecated"
310+
with tm.assert_produces_warning(FutureWarning, match=msg):
311+
expected_ind = pd.MultiIndex.from_tuples(
312+
[
313+
(50, "2018-01-07"),
314+
(50, Timestamp("2018-01-08")),
315+
(50, Timestamp("2018-01-09")),
316+
(50, Timestamp("2018-01-10")),
317+
(50, Timestamp("2018-01-11")),
318+
(50, Timestamp("2018-01-12")),
319+
(50, Timestamp("2018-01-13")),
320+
(50, Timestamp("2018-01-14")),
321+
(50, Timestamp("2018-01-15")),
322+
(50, Timestamp("2018-01-16")),
323+
(50, Timestamp("2018-01-17")),
324+
(50, Timestamp("2018-01-18")),
325+
(50, Timestamp("2018-01-19")),
326+
(50, Timestamp("2018-01-20")),
327+
(50, Timestamp("2018-01-21")),
328+
(60, Timestamp("2018-01-14")),
329+
],
330+
names=["volume", "week_starting"],
331+
)
329332
expected = DataFrame(
330333
data={
331334
"price": [

pandas/tests/series/accessors/test_dt_accessor.py

+1
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,7 @@ def test_dt_timetz_accessor(self, tz_naive_fixture):
679679
[["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]],
680680
],
681681
)
682+
@pytest.mark.filterwarnings("ignore:Inferring datetime64:FutureWarning")
682683
def test_isocalendar(self, input_series, expected_output):
683684
result = pd.to_datetime(Series(input_series)).dt.isocalendar()
684685
expected_frame = DataFrame(

pandas/tests/series/methods/test_combine_first.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,11 @@ def test_combine_first_dt64(self):
7878
s0 = to_datetime(Series(["2010", np.NaN]))
7979
s1 = Series([np.NaN, "2011"])
8080
rs = s0.combine_first(s1)
81-
xp = Series([datetime(2010, 1, 1), "2011"])
81+
82+
msg = "containing strings is deprecated"
83+
with tm.assert_produces_warning(FutureWarning, match=msg):
84+
xp = Series([datetime(2010, 1, 1), "2011"])
85+
8286
tm.assert_series_equal(rs, xp)
8387

8488
def test_combine_first_dt_tz_values(self, tz_naive_fixture):

pandas/tests/series/methods/test_fillna.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -319,8 +319,11 @@ def test_datetime64_fillna(self):
319319

320320
# GH#6587
321321
# make sure that we are treating as integer when filling
322-
# this also tests inference of a datetime-like with NaT's
323-
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
322+
msg = "containing strings is deprecated"
323+
with tm.assert_produces_warning(FutureWarning, match=msg):
324+
# this also tests inference of a datetime-like with NaT's
325+
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
326+
324327
expected = Series(
325328
[
326329
"2013-08-05 15:30:00.000001",

pandas/tests/series/test_constructors.py

+33-16
Original file line numberDiff line numberDiff line change
@@ -900,14 +900,23 @@ def test_constructor_dtype_datetime64_7(self):
900900

901901
def test_constructor_dtype_datetime64_6(self):
902902
# these will correctly infer a datetime
903-
s = Series([None, NaT, "2013-08-05 15:30:00.000001"])
904-
assert s.dtype == "datetime64[ns]"
905-
s = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
906-
assert s.dtype == "datetime64[ns]"
907-
s = Series([NaT, None, "2013-08-05 15:30:00.000001"])
908-
assert s.dtype == "datetime64[ns]"
909-
s = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
910-
assert s.dtype == "datetime64[ns]"
903+
msg = "containing strings is deprecated"
904+
905+
with tm.assert_produces_warning(FutureWarning, match=msg):
906+
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
907+
assert ser.dtype == "datetime64[ns]"
908+
909+
with tm.assert_produces_warning(FutureWarning, match=msg):
910+
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
911+
assert ser.dtype == "datetime64[ns]"
912+
913+
with tm.assert_produces_warning(FutureWarning, match=msg):
914+
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
915+
assert ser.dtype == "datetime64[ns]"
916+
917+
with tm.assert_produces_warning(FutureWarning, match=msg):
918+
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
919+
assert ser.dtype == "datetime64[ns]"
911920

912921
def test_constructor_dtype_datetime64_5(self):
913922
# tz-aware (UTC and other tz's)
@@ -1379,14 +1388,22 @@ def test_constructor_dtype_timedelta64(self):
13791388
assert td.dtype == "object"
13801389

13811390
# these will correctly infer a timedelta
1382-
s = Series([None, NaT, "1 Day"])
1383-
assert s.dtype == "timedelta64[ns]"
1384-
s = Series([np.nan, NaT, "1 Day"])
1385-
assert s.dtype == "timedelta64[ns]"
1386-
s = Series([NaT, None, "1 Day"])
1387-
assert s.dtype == "timedelta64[ns]"
1388-
s = Series([NaT, np.nan, "1 Day"])
1389-
assert s.dtype == "timedelta64[ns]"
1391+
msg = "containing strings is deprecated"
1392+
with tm.assert_produces_warning(FutureWarning, match=msg):
1393+
ser = Series([None, NaT, "1 Day"])
1394+
assert ser.dtype == "timedelta64[ns]"
1395+
1396+
with tm.assert_produces_warning(FutureWarning, match=msg):
1397+
ser = Series([np.nan, NaT, "1 Day"])
1398+
assert ser.dtype == "timedelta64[ns]"
1399+
1400+
with tm.assert_produces_warning(FutureWarning, match=msg):
1401+
ser = Series([NaT, None, "1 Day"])
1402+
assert ser.dtype == "timedelta64[ns]"
1403+
1404+
with tm.assert_produces_warning(FutureWarning, match=msg):
1405+
ser = Series([NaT, np.nan, "1 Day"])
1406+
assert ser.dtype == "timedelta64[ns]"
13901407

13911408
# GH 16406
13921409
def test_constructor_mixed_tz(self):

pandas/tests/tools/test_to_timedelta.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,16 @@ def test_to_timedelta_via_apply(self):
187187
result = Series([to_timedelta("00:00:01")])
188188
tm.assert_series_equal(result, expected)
189189

190+
def test_to_timedelta_inference_without_warning(self):
191+
# GH#41731 inference produces a warning in the Series constructor,
192+
# but _not_ in to_timedelta
193+
vals = ["00:00:01", pd.NaT]
194+
with tm.assert_produces_warning(None):
195+
result = to_timedelta(vals)
196+
197+
expected = TimedeltaIndex([pd.Timedelta(seconds=1), pd.NaT])
198+
tm.assert_index_equal(result, expected)
199+
190200
def test_to_timedelta_on_missing_values(self):
191201
# GH5438
192202
timedelta_NaT = np.timedelta64("NaT")
@@ -197,7 +207,8 @@ def test_to_timedelta_on_missing_values(self):
197207
)
198208
tm.assert_series_equal(actual, expected)
199209

200-
actual = to_timedelta(Series(["00:00:01", pd.NaT]))
210+
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
211+
actual = to_timedelta(Series(["00:00:01", pd.NaT]))
201212
tm.assert_series_equal(actual, expected)
202213

203214
actual = to_timedelta(np.nan)

0 commit comments

Comments
 (0)