Skip to content

Commit 9049179

Browse files
MarcoGorelli authored and abkosar committed
BUG: _guess_datetime_format_for_array doesn't guess if first element is '' or 'NaT' (#49120)
1 parent 73b85f0 commit 9049179

File tree

4 files changed

+25
-6
lines changed

4 files changed

+25
-6
lines changed

pandas/_libs/tslib.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ def array_with_unit_to_datetime(
1616
unit: str,
1717
errors: str = ...,
1818
) -> tuple[np.ndarray, tzinfo | None]: ...
19+
def first_non_null(values: np.ndarray) -> int: ...
1920
def array_to_datetime(
2021
values: npt.NDArray[np.object_],
2122
errors: str = ...,

pandas/_libs/tslib.pyx

+17
Original file line number | Diff line number | Diff line change
@@ -421,6 +421,23 @@ def array_with_unit_to_datetime(
421421

422422
return oresult, tz
423423

424+
@cython.wraparound(False)
425+
@cython.boundscheck(False)
426+
def first_non_null(values: ndarray) -> int:
427+
"""Find position of first non-null value, return -1 if there isn't one."""
428+
cdef:
429+
Py_ssize_t n = len(values)
430+
Py_ssize_t i
431+
int result
432+
for i in range(n):
433+
val = values[i]
434+
if checknull_with_nat_and_na(val):
435+
continue
436+
if isinstance(val, str) and (len(val) == 0 or val in nat_strings):
437+
continue
438+
return i
439+
else:
440+
return -1
424441

425442
@cython.wraparound(False)
426443
@cython.boundscheck(False)

pandas/core/tools/datetimes.py

+2 -3
Original file line number | Diff line number | Diff line change
@@ -126,9 +126,8 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
126126

127127
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
128128
# Try to guess the format based on the first non-NaN element, return None if can't
129-
non_nan_elements = notna(arr).nonzero()[0]
130-
if len(non_nan_elements):
131-
if type(first_non_nan_element := arr[non_nan_elements[0]]) is str:
129+
if (first_non_null := tslib.first_non_null(arr)) != -1:
130+
if type(first_non_nan_element := arr[first_non_null]) is str:
132131
# GH#32264 np.str_ object
133132
return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst)
134133
return None

pandas/tests/tools/test_to_datetime.py

+5 -3
Original file line number | Diff line number | Diff line change
@@ -2093,21 +2093,23 @@ def test_to_datetime_dta_tz(self, klass):
20932093

20942094

20952095
class TestGuessDatetimeFormat:
2096-
@td.skip_if_not_us_locale
20972096
@pytest.mark.parametrize(
2098-
"test_array",
2097+
"test_list",
20992098
[
21002099
[
21012100
"2011-12-30 00:00:00.000000",
21022101
"2011-12-30 00:00:00.000000",
21032102
"2011-12-30 00:00:00.000000",
21042103
],
21052104
[np.nan, np.nan, "2011-12-30 00:00:00.000000"],
2105+
["", "2011-12-30 00:00:00.000000"],
2106+
["NaT", "2011-12-30 00:00:00.000000"],
21062107
["2011-12-30 00:00:00.000000", "random_string"],
21072108
],
21082109
)
2109-
def test_guess_datetime_format_for_array(self, test_array):
2110+
def test_guess_datetime_format_for_array(self, test_list):
21102111
expected_format = "%Y-%m-%d %H:%M:%S.%f"
2112+
test_array = np.array(test_list, dtype=object)
21112113
assert tools._guess_datetime_format_for_array(test_array) == expected_format
21122114

21132115
@td.skip_if_not_us_locale

0 commit comments

Comments
 (0)