Skip to content

Commit 6ee0acb

Browse files
authored
DEPR: stop inferring dt64/td64 from strings in Series constructor (#49319)
* DEPR: stop inferring dt64/td64 from strings in Series constructor * update pyi
1 parent 218ab09 commit 6ee0acb

File tree

12 files changed

+72
-116
lines changed

12 files changed

+72
-116
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ Removal of prior version deprecations/changes
238238
- Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
239239
- Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
240240
- Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
241+
- Changed the behavior of the :class:`Series` constructor: it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
241242
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
242243

243244
.. ---------------------------------------------------------------------------

pandas/_libs/lib.pyi

+1-1
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def ensure_string_array(
158158
) -> npt.NDArray[np.object_]: ...
159159
def infer_datetimelike_array(
160160
arr: npt.NDArray[np.object_],
161-
) -> tuple[str, bool]: ...
161+
) -> str: ...
162162
def convert_nans_to_NA(
163163
arr: npt.NDArray[np.object_],
164164
) -> npt.NDArray[np.object_]: ...

pandas/_libs/lib.pyx

+15-35
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ from pandas._libs.util cimport (
9595
is_nan,
9696
)
9797

98-
from pandas._libs.tslib import array_to_datetime
9998
from pandas._libs.tslibs import (
10099
OutOfBoundsDatetime,
101100
OutOfBoundsTimedelta,
@@ -1583,25 +1582,19 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
15831582
Returns
15841583
-------
15851584
str: {datetime, timedelta, date, nat, mixed}
1586-
bool
15871585
"""
15881586
cdef:
15891587
Py_ssize_t i, n = len(arr)
15901588
bint seen_timedelta = False, seen_date = False, seen_datetime = False
15911589
bint seen_tz_aware = False, seen_tz_naive = False
1592-
bint seen_nat = False, seen_str = False
1590+
bint seen_nat = False
15931591
bint seen_period = False, seen_interval = False
1594-
list objs = []
15951592
object v
15961593

15971594
for i in range(n):
15981595
v = arr[i]
15991596
if isinstance(v, str):
1600-
objs.append(v)
1601-
seen_str = True
1602-
1603-
if len(objs) == 3:
1604-
break
1597+
return "mixed"
16051598

16061599
elif v is None or util.is_nan(v):
16071600
# nan or None
@@ -1619,7 +1612,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
16191612
seen_tz_aware = True
16201613

16211614
if seen_tz_naive and seen_tz_aware:
1622-
return "mixed", seen_str
1615+
return "mixed"
16231616
elif util.is_datetime64_object(v):
16241617
# np.datetime64
16251618
seen_datetime = True
@@ -1635,43 +1628,30 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
16351628
seen_interval = True
16361629
break
16371630
else:
1638-
return "mixed", seen_str
1631+
return "mixed"
16391632

16401633
if seen_period:
16411634
if is_period_array(arr):
1642-
return "period", seen_str
1643-
return "mixed", seen_str
1635+
return "period"
1636+
return "mixed"
16441637

16451638
if seen_interval:
16461639
if is_interval_array(arr):
1647-
return "interval", seen_str
1648-
return "mixed", seen_str
1640+
return "interval"
1641+
return "mixed"
16491642

16501643
if seen_date and not (seen_datetime or seen_timedelta):
1651-
return "date", seen_str
1644+
return "date"
16521645
elif seen_datetime and not seen_timedelta:
1653-
return "datetime", seen_str
1646+
return "datetime"
16541647
elif seen_timedelta and not seen_datetime:
1655-
return "timedelta", seen_str
1648+
return "timedelta"
1649+
elif seen_datetime and seen_timedelta:
1650+
return "mixed"
16561651
elif seen_nat:
1657-
return "nat", seen_str
1652+
return "nat"
16581653

1659-
# short-circuit by trying to
1660-
# actually convert these strings
1661-
# this is for performance as we don't need to try
1662-
# convert *every* string array
1663-
if len(objs):
1664-
try:
1665-
# require_iso8601 as in maybe_infer_to_datetimelike
1666-
array_to_datetime(objs, errors="raise", require_iso8601=True)
1667-
return "datetime", seen_str
1668-
except (ValueError, TypeError):
1669-
pass
1670-
1671-
# we are *not* going to infer from strings
1672-
# for timedelta as too much ambiguity
1673-
1674-
return "mixed", seen_str
1654+
return "mixed"
16751655

16761656

16771657
cdef inline bint is_timedelta(object o):

pandas/core/dtypes/cast.py

+4-11
Original file line numberDiff line numberDiff line change
@@ -1264,7 +1264,9 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
12641264
else:
12651265
return td_values.reshape(shape)
12661266

1267-
inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
1267+
# TODO: can we just do lib.maybe_convert_objects for this entire function?
1268+
inferred_type = lib.infer_datetimelike_array(ensure_object(v))
1269+
12681270
if inferred_type in ["period", "interval"]:
12691271
# Incompatible return value type (got "Union[ExtensionArray, ndarray]",
12701272
# expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray,
@@ -1280,14 +1282,14 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
12801282
elif inferred_type == "timedelta":
12811283
value = try_timedelta(v)
12821284
elif inferred_type == "nat":
1285+
# only reached if we have at least 1 NaT and the rest are NaT, None, or np.nan
12831286

12841287
# if all NaT, return as datetime
12851288
if isna(v).all():
12861289
# error: Incompatible types in assignment (expression has type
12871290
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
12881291
value = try_datetime(v) # type: ignore[assignment]
12891292
else:
1290-
12911293
# We have at least a NaT and a string
12921294
# try timedelta first to avoid spurious datetime conversions
12931295
# e.g. '00:00:01' is a timedelta but technically is also a datetime
@@ -1300,15 +1302,6 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
13001302
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
13011303
value = try_datetime(v) # type: ignore[assignment]
13021304

1303-
if value.dtype.kind in ["m", "M"] and seen_str:
1304-
# TODO(2.0): enforcing this deprecation should close GH#40111
1305-
warnings.warn(
1306-
f"Inferring {value.dtype} from data containing strings is deprecated "
1307-
"and will be removed in a future version. To retain the old behavior "
1308-
f"explicitly pass Series(data, dtype={value.dtype})",
1309-
FutureWarning,
1310-
stacklevel=find_stack_level(),
1311-
)
13121305
return value
13131306

13141307

pandas/io/parsers/base_parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -775,7 +775,7 @@ def _infer_types(
775775
result = BooleanArray(result, bool_mask)
776776
elif result.dtype == np.object_ and use_nullable_dtypes:
777777
# read_excel sends array of datetime objects
778-
inferred_type, _ = lib.infer_datetimelike_array(result)
778+
inferred_type = lib.infer_datetimelike_array(result)
779779
if inferred_type != "datetime":
780780
result = StringDtype().construct_array_type()._from_sequence(values)
781781

pandas/tests/apply/test_series_apply.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -859,8 +859,7 @@ def test_apply_to_timedelta():
859859
list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
860860

861861
a = pd.to_timedelta(list_of_strings)
862-
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
863-
ser = Series(list_of_strings)
862+
ser = Series(list_of_strings)
864863
b = ser.apply(pd.to_timedelta)
865864
tm.assert_series_equal(Series(a), b)
866865

pandas/tests/dtypes/test_inference.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -1346,7 +1346,7 @@ def test_infer_dtype_period_with_na(self, na_value):
13461346
],
13471347
)
13481348
def test_infer_datetimelike_array_datetime(self, data):
1349-
assert lib.infer_datetimelike_array(data) == ("datetime", False)
1349+
assert lib.infer_datetimelike_array(data) == "datetime"
13501350

13511351
@pytest.mark.parametrize(
13521352
"data",
@@ -1358,11 +1358,11 @@ def test_infer_datetimelike_array_datetime(self, data):
13581358
],
13591359
)
13601360
def test_infer_datetimelike_array_timedelta(self, data):
1361-
assert lib.infer_datetimelike_array(data) == ("timedelta", False)
1361+
assert lib.infer_datetimelike_array(data) == "timedelta"
13621362

13631363
def test_infer_datetimelike_array_date(self):
13641364
arr = [date(2017, 6, 12), date(2017, 3, 11)]
1365-
assert lib.infer_datetimelike_array(arr) == ("date", False)
1365+
assert lib.infer_datetimelike_array(arr) == "date"
13661366

13671367
@pytest.mark.parametrize(
13681368
"data",
@@ -1377,7 +1377,7 @@ def test_infer_datetimelike_array_date(self):
13771377
],
13781378
)
13791379
def test_infer_datetimelike_array_mixed(self, data):
1380-
assert lib.infer_datetimelike_array(data)[0] == "mixed"
1380+
assert lib.infer_datetimelike_array(data) == "mixed"
13811381

13821382
@pytest.mark.parametrize(
13831383
"first, expected",
@@ -1395,7 +1395,7 @@ def test_infer_datetimelike_array_mixed(self, data):
13951395
@pytest.mark.parametrize("second", [None, np.nan])
13961396
def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected):
13971397
first.append(second)
1398-
assert lib.infer_datetimelike_array(first) == (expected, False)
1398+
assert lib.infer_datetimelike_array(first) == expected
13991399

14001400
def test_infer_dtype_all_nan_nat_like(self):
14011401
arr = np.array([np.nan, np.nan])

pandas/tests/resample/test_time_grouper.py

+21-23
Original file line numberDiff line numberDiff line change
@@ -321,29 +321,27 @@ def test_groupby_resample_interpolate():
321321
.interpolate(method="linear")
322322
)
323323

324-
msg = "containing strings is deprecated"
325-
with tm.assert_produces_warning(FutureWarning, match=msg):
326-
expected_ind = pd.MultiIndex.from_tuples(
327-
[
328-
(50, "2018-01-07"),
329-
(50, Timestamp("2018-01-08")),
330-
(50, Timestamp("2018-01-09")),
331-
(50, Timestamp("2018-01-10")),
332-
(50, Timestamp("2018-01-11")),
333-
(50, Timestamp("2018-01-12")),
334-
(50, Timestamp("2018-01-13")),
335-
(50, Timestamp("2018-01-14")),
336-
(50, Timestamp("2018-01-15")),
337-
(50, Timestamp("2018-01-16")),
338-
(50, Timestamp("2018-01-17")),
339-
(50, Timestamp("2018-01-18")),
340-
(50, Timestamp("2018-01-19")),
341-
(50, Timestamp("2018-01-20")),
342-
(50, Timestamp("2018-01-21")),
343-
(60, Timestamp("2018-01-14")),
344-
],
345-
names=["volume", "week_starting"],
346-
)
324+
expected_ind = pd.MultiIndex.from_tuples(
325+
[
326+
(50, Timestamp("2018-01-07")),
327+
(50, Timestamp("2018-01-08")),
328+
(50, Timestamp("2018-01-09")),
329+
(50, Timestamp("2018-01-10")),
330+
(50, Timestamp("2018-01-11")),
331+
(50, Timestamp("2018-01-12")),
332+
(50, Timestamp("2018-01-13")),
333+
(50, Timestamp("2018-01-14")),
334+
(50, Timestamp("2018-01-15")),
335+
(50, Timestamp("2018-01-16")),
336+
(50, Timestamp("2018-01-17")),
337+
(50, Timestamp("2018-01-18")),
338+
(50, Timestamp("2018-01-19")),
339+
(50, Timestamp("2018-01-20")),
340+
(50, Timestamp("2018-01-21")),
341+
(60, Timestamp("2018-01-14")),
342+
],
343+
names=["volume", "week_starting"],
344+
)
347345

348346
expected = DataFrame(
349347
data={

pandas/tests/series/methods/test_combine_first.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,7 @@ def test_combine_first_dt64(self):
7979
s1 = Series([np.NaN, "2011"])
8080
rs = s0.combine_first(s1)
8181

82-
msg = "containing strings is deprecated"
83-
with tm.assert_produces_warning(FutureWarning, match=msg):
84-
xp = Series([datetime(2010, 1, 1), "2011"])
82+
xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]")
8583

8684
tm.assert_series_equal(rs, xp)
8785

pandas/tests/series/methods/test_fillna.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -365,10 +365,7 @@ def test_datetime64_fillna(self):
365365
def test_datetime64_fillna_backfill(self):
366366
# GH#6587
367367
# make sure that we are treating as integer when filling
368-
msg = "containing strings is deprecated"
369-
with tm.assert_produces_warning(FutureWarning, match=msg):
370-
# this also tests inference of a datetime-like with NaT's
371-
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
368+
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"], dtype="M8[ns]")
372369

373370
expected = Series(
374371
[

pandas/tests/series/test_constructors.py

+20-28
Original file line numberDiff line numberDiff line change
@@ -1018,24 +1018,20 @@ def test_constructor_dtype_datetime64_7(self):
10181018
assert series1.dtype == object
10191019

10201020
def test_constructor_dtype_datetime64_6(self):
1021-
# these will correctly infer a datetime
1022-
msg = "containing strings is deprecated"
1021+
# as of 2.0, these no longer infer datetime64 based on the strings,
1022+
# matching the Index behavior
10231023

1024-
with tm.assert_produces_warning(FutureWarning, match=msg):
1025-
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
1026-
assert ser.dtype == "datetime64[ns]"
1024+
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
1025+
assert ser.dtype == object
10271026

1028-
with tm.assert_produces_warning(FutureWarning, match=msg):
1029-
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
1030-
assert ser.dtype == "datetime64[ns]"
1027+
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
1028+
assert ser.dtype == object
10311029

1032-
with tm.assert_produces_warning(FutureWarning, match=msg):
1033-
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
1034-
assert ser.dtype == "datetime64[ns]"
1030+
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
1031+
assert ser.dtype == object
10351032

1036-
with tm.assert_produces_warning(FutureWarning, match=msg):
1037-
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
1038-
assert ser.dtype == "datetime64[ns]"
1033+
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
1034+
assert ser.dtype == object
10391035

10401036
def test_constructor_dtype_datetime64_5(self):
10411037
# tz-aware (UTC and other tz's)
@@ -1517,23 +1513,19 @@ def test_constructor_dtype_timedelta64(self):
15171513
td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
15181514
assert td.dtype == "object"
15191515

1520-
# these will correctly infer a timedelta
1521-
msg = "containing strings is deprecated"
1522-
with tm.assert_produces_warning(FutureWarning, match=msg):
1523-
ser = Series([None, NaT, "1 Day"])
1524-
assert ser.dtype == "timedelta64[ns]"
1516+
# as of 2.0, these no longer infer timedelta64 based on the strings,
1517+
# matching Index behavior
1518+
ser = Series([None, NaT, "1 Day"])
1519+
assert ser.dtype == object
15251520

1526-
with tm.assert_produces_warning(FutureWarning, match=msg):
1527-
ser = Series([np.nan, NaT, "1 Day"])
1528-
assert ser.dtype == "timedelta64[ns]"
1521+
ser = Series([np.nan, NaT, "1 Day"])
1522+
assert ser.dtype == object
15291523

1530-
with tm.assert_produces_warning(FutureWarning, match=msg):
1531-
ser = Series([NaT, None, "1 Day"])
1532-
assert ser.dtype == "timedelta64[ns]"
1524+
ser = Series([NaT, None, "1 Day"])
1525+
assert ser.dtype == object
15331526

1534-
with tm.assert_produces_warning(FutureWarning, match=msg):
1535-
ser = Series([NaT, np.nan, "1 Day"])
1536-
assert ser.dtype == "timedelta64[ns]"
1527+
ser = Series([NaT, np.nan, "1 Day"])
1528+
assert ser.dtype == object
15371529

15381530
# GH 16406
15391531
def test_constructor_mixed_tz(self):

pandas/tests/tools/test_to_timedelta.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -207,9 +207,7 @@ def test_to_timedelta_on_missing_values(self):
207207
)
208208
tm.assert_series_equal(actual, expected)
209209

210-
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
211-
ser = Series(["00:00:01", pd.NaT])
212-
assert ser.dtype == "m8[ns]"
210+
ser = Series(["00:00:01", pd.NaT], dtype="m8[ns]")
213211
actual = to_timedelta(ser)
214212
tm.assert_series_equal(actual, expected)
215213

0 commit comments

Comments
 (0)