From 9d736ad94af2cfa530df6339f8661ee593fb80ad Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 15 Oct 2022 18:23:21 +0100 Subject: [PATCH 1/2] check for nat_strings when finding first null --- pandas/_libs/tslib.pyx | 17 +++++++++++++++++ pandas/core/tools/datetimes.py | 5 ++--- pandas/tests/tools/test_to_datetime.py | 8 +++++--- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a1271ef0d897e..03331f54db892 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -421,6 +421,23 @@ def array_with_unit_to_datetime( return oresult, tz +@cython.wraparound(False) +@cython.boundscheck(False) +def first_non_null(values: ndarray) -> int: + """Find position of first non-null value, return -1 if there isn't one.""" + cdef: + Py_ssize_t n = len(values) + Py_ssize_t i + int result + for i in range(n): + val = values[i] + if checknull_with_nat_and_na(val): + continue + if isinstance(val, str) and (len(val) == 0 or val in nat_strings): + continue + return i + else: + return -1 @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8566468d4e23f..7791ea804a52a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -126,9 +126,8 @@ class FulldatetimeDict(YearMonthDayDict, total=False): def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't - non_nan_elements = notna(arr).nonzero()[0] - if len(non_nan_elements): - if type(first_non_nan_element := arr[non_nan_elements[0]]) is str: + if (first_non_null := tslib.first_non_null(arr)) != -1: + if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst) return None diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 2b5457fc9f7b3..263f2b597947a 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2093,9 +2093,8 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: - @td.skip_if_not_us_locale @pytest.mark.parametrize( - "test_array", + "test_list", [ [ "2011-12-30 00:00:00.000000", @@ -2103,11 +2102,14 @@ class TestGuessDatetimeFormat: "2011-12-30 00:00:00.000000", ], [np.nan, np.nan, "2011-12-30 00:00:00.000000"], + ["", "2011-12-30 00:00:00.000000"], + ["NaT", "2011-12-30 00:00:00.000000"], ["2011-12-30 00:00:00.000000", "random_string"], ], ) - def test_guess_datetime_format_for_array(self, test_array): + def test_guess_datetime_format_for_array(self, test_list): expected_format = "%Y-%m-%d %H:%M:%S.%f" + test_array = np.array(test_list, dtype=object) assert tools._guess_datetime_format_for_array(test_array) == expected_format @td.skip_if_not_us_locale From d8550eff08f40556bf95b624e424f395be6245a0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 16 Oct 2022 09:44:07 +0100 Subject: [PATCH 2/2] pyi file --- pandas/_libs/tslib.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 2212f8db8ea1e..8fec9ecf27f30 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -16,6 +16,7 @@ def array_with_unit_to_datetime( unit: str, errors: str = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... +def first_non_null(values: np.ndarray) -> int: ... def array_to_datetime( values: npt.NDArray[np.object_], errors: str = ...,