Skip to content

BUG: to_datetime(..., infer_datetime_format=True) fails with np.str_ input #48970

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -963,10 +963,6 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
datetime format string (for `strftime` or `strptime`),
or None if it can't be guessed.
"""

if not isinstance(dt_str, str):
return None
Comment on lines -967 to -968
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No longer necessary, as the input is typed as `str` in the function signature (`dt_str: str`).


day_attribute_and_format = (('day',), '%d', 2)

# attr name, format, padding (if any)
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,14 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
# ---------------------------------------------------------------------


def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False):
# Try to guess the format based on the first non-NaN element
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
# Try to guess the format based on the first non-NaN element, return None if can't
non_nan_elements = notna(arr).nonzero()[0]
if len(non_nan_elements):
return guess_datetime_format(arr[non_nan_elements[0]], dayfirst=dayfirst)
if type(first_non_nan_element := arr[non_nan_elements[0]]) is str:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the fallback path for this less performant? Could we instead cast np.str_ objects to str here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call, have opened #48974 for now, will get back to it when I get a chance

# GH#32264 np.str_ object
return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst)
return None


def should_cache(
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,8 +468,10 @@ def test_to_datetime_mixed_datetime_and_string(self):
expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60))
tm.assert_index_equal(res, expected)

def test_to_datetime_np_str(self):
@pytest.mark.parametrize("infer_datetime_format", [True, False])
def test_to_datetime_np_str(self, infer_datetime_format):
# GH#32264
# GH#48969
value = np.str_("2019-02-04 10:18:46.297000+0000")

ser = Series([value])
Expand All @@ -479,11 +481,11 @@ def test_to_datetime_np_str(self):
assert to_datetime(value) == exp
assert to_datetime(ser.iloc[0]) == exp

res = to_datetime([value])
res = to_datetime([value], infer_datetime_format=infer_datetime_format)
expected = Index([exp])
tm.assert_index_equal(res, expected)

res = to_datetime(ser)
res = to_datetime(ser, infer_datetime_format=infer_datetime_format)
expected = Series(expected)
tm.assert_series_equal(res, expected)

Expand Down