Skip to content

BUG: _guess_datetime_format_for_array doesn't guess if first element is '' or 'NaT' #49120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def array_with_unit_to_datetime(
unit: str,
errors: str = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...
# Position of the first non-null element of `values`, or -1 if there isn't one.
def first_non_null(values: np.ndarray) -> int: ...
def array_to_datetime(
values: npt.NDArray[np.object_],
errors: str = ...,
Expand Down
17 changes: 17 additions & 0 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,23 @@ def array_with_unit_to_datetime(

return oresult, tz

@cython.wraparound(False)
@cython.boundscheck(False)
def first_non_null(values: ndarray) -> int:
    """
    Find position of first non-null value, return -1 if there isn't one.

    An element is treated as null when ``checknull_with_nat_and_na`` accepts
    it, or when it is a string that is either empty or contained in
    ``nat_strings``.

    Parameters
    ----------
    values : ndarray
        Array that is scanned from index 0 upwards.

    Returns
    -------
    int
        Index of the first element that is not null, or -1 when every
        element is null (which includes the case of an empty array).
    """
    cdef:
        Py_ssize_t n = len(values)
        Py_ssize_t i

    for i in range(n):
        val = values[i]
        if checknull_with_nat_and_na(val):
            continue
        # Empty strings and NaT-like strings carry no format information,
        # so they are skipped just like real nulls.
        if isinstance(val, str) and (len(val) == 0 or val in nat_strings):
            continue
        return i

    # The loop ran to completion without returning: nothing usable was found.
    return -1

@cython.wraparound(False)
@cython.boundscheck(False)
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,8 @@ class FulldatetimeDict(YearMonthDayDict, total=False):

# Guess a datetime format string for `arr` from its first usable element;
# returns None when no format can be guessed.
# NOTE(review): this is a diff hunk with indentation and +/- markers stripped.
# Going by the hunk header (2 additions & 3 deletions), the three lines built
# around `non_nan_elements` look like the removed version and the two lines
# built around `tslib.first_non_null` look like their replacement -- confirm
# against the merged file before treating this text as runnable code.
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
# Try to guess the format based on the first non-NaN element, return None if can't
non_nan_elements = notna(arr).nonzero()[0]
if len(non_nan_elements):
if type(first_non_nan_element := arr[non_nan_elements[0]]) is str:
# tslib.first_non_null reports -1 when it finds no non-null element.
if (first_non_null := tslib.first_non_null(arr)) != -1:
# Exact `type(...) is str` check: instances of str subclasses do not match
# and fall through to `return None`.
if type(first_non_nan_element := arr[first_non_null]) is str:
# GH#32264 np.str_ object
return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst)
return None
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2093,21 +2093,23 @@ def test_to_datetime_dta_tz(self, klass):


class TestGuessDatetimeFormat:
# Checks that the format is guessed from the first non-null element of an
# object array, for several placements of null-like values.
# NOTE(review): this is a diff hunk with indentation and +/- markers stripped.
# The hunk header says 5 additions & 3 deletions; "test_array" and the first
# `def` line are presumably the old parameter name and signature, with
# "test_list" and the second `def` line their replacements -- confirm against
# the merged file.
@td.skip_if_not_us_locale
@pytest.mark.parametrize(
"test_array",
"test_list",
[
[
"2011-12-30 00:00:00.000000",
"2011-12-30 00:00:00.000000",
"2011-12-30 00:00:00.000000",
],
# Leading NaN values come before the first datetime string.
[np.nan, np.nan, "2011-12-30 00:00:00.000000"],
# First element is '' or 'NaT': the situation named in the issue title.
["", "2011-12-30 00:00:00.000000"],
["NaT", "2011-12-30 00:00:00.000000"],
# The guess uses only the first non-null element, so the trailing
# non-date string does not affect the expected format.
["2011-12-30 00:00:00.000000", "random_string"],
],
)
def test_guess_datetime_format_for_array(self, test_array):
def test_guess_datetime_format_for_array(self, test_list):
expected_format = "%Y-%m-%d %H:%M:%S.%f"
# Build an object-dtype ndarray from the parametrized list before calling the helper.
test_array = np.array(test_list, dtype=object)
assert tools._guess_datetime_format_for_array(test_array) == expected_format

@td.skip_if_not_us_locale
Expand Down