Skip to content

BUG: to_datetime with mixed-string-and-numeric #55780

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,9 @@ Categorical

Datetimelike
^^^^^^^^^^^^
- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`)
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
Expand Down
12 changes: 6 additions & 6 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -700,15 +700,15 @@ def array_to_datetime_with_tz(ndarray values, tzinfo tz, NPY_DATETIMEUNIT creso)
ival = NPY_NAT

else:
ts = Timestamp(item)
if PyDateTime_Check(item) and item.tzinfo is not None:
# We can't call Timestamp constructor with a tz arg, have to
# do 2-step
ts = Timestamp(item).tz_convert(tz)
else:
ts = Timestamp(item, tz=tz)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Should we use tz_localize here to be more explicit?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The point is to match the behavior of calling the constructor this way, so I prefer this.

But now that #55712 is merged, we can replace all of this with convert_to_tsobject, which will move the ball down the field on a bunch of fronts. I will update the PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like moving to convert_to_tsobject introduces its own set of issues, so I'll do that in a separate PR.

if ts is NaT:
ival = NPY_NAT
else:
if ts.tzinfo is not None:
ts = ts.tz_convert(tz)
else:
# datetime64, tznaive pydatetime, int, float
ts = ts.tz_localize(tz)
ts = (<_Timestamp>ts)._as_creso(creso)
ival = ts._value

Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
is_all_strings,
is_integer_dtype,
Expand Down Expand Up @@ -2358,7 +2359,8 @@ def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str):
if not isinstance(data, (list, tuple)) and np.ndim(data) == 0:
# i.e. generator
data = list(data)
data = np.asarray(data)

data = construct_1d_object_array_from_listlike(data)
copy = False
elif isinstance(data, ABCMultiIndex):
raise TypeError(f"Cannot create a {cls_name} from a MultiIndex.")
Expand Down
20 changes: 14 additions & 6 deletions pandas/tests/dtypes/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,12 +418,10 @@ def test_array_equivalent(dtype_equal):
assert not array_equivalent(
Index([0, np.nan]), Index([1, np.nan]), dtype_equal=dtype_equal
)
assert array_equivalent(
DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal
)
assert not array_equivalent(
DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal
)


@pytest.mark.parametrize("dtype_equal", [True, False])
def test_array_equivalent_tdi(dtype_equal):
assert array_equivalent(
TimedeltaIndex([0, np.nan]),
TimedeltaIndex([0, np.nan]),
Expand All @@ -435,6 +433,16 @@ def test_array_equivalent(dtype_equal):
dtype_equal=dtype_equal,
)


@pytest.mark.parametrize("dtype_equal", [True, False])
def test_array_equivalent_dti(dtype_equal):
assert array_equivalent(
DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal
)
assert not array_equivalent(
DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal
)

dti1 = DatetimeIndex([0, np.nan], tz="US/Eastern")
dti2 = DatetimeIndex([0, np.nan], tz="CET")
dti3 = DatetimeIndex([1, np.nan], tz="US/Eastern")
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3154,9 +3154,9 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls]

if cls is np.datetime64:
msg1 = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]"
msg1 = "Invalid type for timedelta scalar: <class 'numpy.datetime64'>"
else:
msg1 = r"dtype timedelta64\[ns\] cannot be converted to datetime64\[ns\]"
msg1 = "<class 'numpy.timedelta64'> is not convertible to datetime"
msg = "|".join(["Cannot cast", msg1])

with pytest.raises(TypeError, match=msg):
Expand Down
16 changes: 14 additions & 2 deletions pandas/tests/indexes/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,8 +1024,11 @@ def test_dti_constructor_with_non_nano_dtype(self, tz):
# to 2 microseconds
vals = [ts, "2999-01-02 03:04:05.678910", 2500]
result = DatetimeIndex(vals, dtype=dtype)
exp_arr = np.array([ts.asm8, vals[1], 2], dtype="M8[us]")
expected = DatetimeIndex(exp_arr, dtype="M8[us]").tz_localize(tz)
exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals]
exp_arr = np.array(exp_vals, dtype="M8[us]")
expected = DatetimeIndex(exp_arr, dtype="M8[us]")
if tz is not None:
expected = expected.tz_localize("UTC").tz_convert(tz)
tm.assert_index_equal(result, expected)

result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype)
Expand All @@ -1050,6 +1053,15 @@ def test_dti_constructor_with_non_nano_now_today(self):
assert diff1 >= pd.Timedelta(0)
assert diff1 < tolerance

def test_dti_constructor_object_float_matches_float_dtype(self):
# GH#55780
# Constructing a tz-aware DatetimeIndex from an object-dtype array of
# floats must give the same index as constructing it from the
# equivalent float64 array.
arr = np.array([0, np.nan], dtype=np.float64)
# Same values as ``arr``, but stored as an object-dtype array.
arr2 = arr.astype(object)

dti1 = DatetimeIndex(arr, tz="CET")
dti2 = DatetimeIndex(arr2, tz="CET")
tm.assert_index_equal(dti1, dti2)


class TestTimeSeries:
def test_dti_constructor_preserve_dti_freq(self):
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/series/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,11 @@ def test_astype_object_to_dt64_non_nano(self, tz):
ser = Series(vals, dtype=object)
result = ser.astype(dtype)

exp_arr = np.array([ts.asm8, vals[1], 2], dtype="M8[us]")
expected = Series(exp_arr, dtype="M8[us]").dt.tz_localize(tz)
exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals]
exp_arr = np.array(exp_vals, dtype="M8[us]")
expected = Series(exp_arr, dtype="M8[us]")
if tz is not None:
expected = expected.dt.tz_localize("UTC").dt.tz_convert(tz)
tm.assert_series_equal(result, expected)

def test_astype_mixed_object_to_dt64tz(self):
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,20 @@ def test_to_datetime_mixed_datetime_and_string(self):
expected = to_datetime([d1, d2]).tz_convert(timezone(timedelta(minutes=-60)))
tm.assert_index_equal(res, expected)

def test_to_datetime_mixed_string_and_numeric(self):
# GH#55780 np.array(vals) would incorrectly cast the number to str
vals = ["2016-01-01", 0]
# The expected index is built by converting each element on its own
# with Timestamp.
expected = DatetimeIndex([Timestamp(x) for x in vals])
result = to_datetime(vals, format="mixed")
# Reverse the input and then reverse the output, so the numeric element
# is seen first but the result can be compared against ``expected``.
result2 = to_datetime(vals[::-1], format="mixed")[::-1]
# Repeat both orderings through the DatetimeIndex constructor.
result3 = DatetimeIndex(vals)
result4 = DatetimeIndex(vals[::-1])[::-1]

tm.assert_index_equal(result, expected)
tm.assert_index_equal(result2, expected)
tm.assert_index_equal(result3, expected)
tm.assert_index_equal(result4, expected)

@pytest.mark.parametrize(
"format", ["%Y-%m-%d", "%Y-%d-%m"], ids=["ISO8601", "non-ISO8601"]
)
Expand Down