From 11739a846c332ebbd50b28ab59948a7be0357ce5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Oct 2023 18:20:28 -0700 Subject: [PATCH 1/3] ENH: infer resolution in array_to_datetime --- pandas/_libs/tslib.pyx | 75 +++++++++++++++++-- pandas/_libs/tslibs/strptime.pxd | 5 ++ pandas/_libs/tslibs/strptime.pyx | 14 +++- pandas/tests/tslibs/test_array_to_datetime.py | 63 ++++++++++++++++ 4 files changed, 148 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8fdbd9affa58a..e260a6613792c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -31,10 +31,15 @@ import numpy as np cnp.import_array() +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, +) from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, check_dts_bounds, + get_datetime64_unit, import_pandas_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, @@ -277,6 +282,7 @@ def array_with_unit_to_datetime( result, tz = array_to_datetime( values.astype(object, copy=False), errors=errors, + creso=NPY_FR_ns, ) return result, tz @@ -408,6 +414,7 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, + NPY_DATETIMEUNIT creso=NPY_FR_ns, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -434,6 +441,8 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC + creso : NPY_DATETIMEUNIT, default NPY_FR_ns + Set to NPY_FR_GENERIC to infer a resolution. Returns ------- @@ -457,13 +466,18 @@ cpdef array_to_datetime( set out_tzoffset_vals = set() tzinfo tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) - NPY_DATETIMEUNIT creso = NPY_FR_ns - DatetimeParseState state = DatetimeParseState() + NPY_DATETIMEUNIT item_reso + bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC + DatetimeParseState state = DatetimeParseState(creso) # specify error conditions assert is_raise or is_ignore or is_coerce - result = np.empty((values).shape, dtype="M8[ns]") + if infer_reso: + abbrev = "ns" + else: + abbrev = npy_unit_to_abbrev(creso) + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() for i in range(n): @@ -476,24 +490,43 @@ cpdef array_to_datetime( iresult[i] = NPY_NAT elif PyDateTime_Check(val): + if isinstance(val, _Timestamp): + item_reso = val._creso + else: + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso tz_out = state.process_datetime(val, tz_out, utc_convert) iresult[i] = parse_pydatetime(val, &dts, utc_convert, creso=creso) elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) + item_reso = NPY_DATETIMEUNIT.NPY_FR_s + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + iresult[i] = pydate_to_dt64(val, &dts, reso=creso) + check_dts_bounds(&dts, creso) elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + item_reso = get_supported_reso(get_datetime64_unit(val)) + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + iresult[i] = get_datetime64_nanos(val, creso) elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition + item_reso = NPY_FR_ns + state.update_creso(item_reso) + if infer_reso: + creso = state.creso if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: # we now need to parse this as if unit='ns' - iresult[i] = cast_from_unit(val, "ns") + iresult[i] = cast_from_unit(val, "ns", out_reso=creso) elif isinstance(val, str): # string @@ -504,12 +537,20 @@ cpdef array_to_datetime( if parse_today_now(val, &iresult[i], utc): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso continue _ts = convert_str_to_tsobject( val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst ) - _ts.ensure_reso(NPY_FR_ns, val) + item_reso = _ts.creso + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + _ts.ensure_reso(creso, val) iresult[i] = _ts.value @@ -554,6 +595,24 @@ cpdef array_to_datetime( else: tz_offset = out_tzoffset_vals.pop() tz_out = timezone(timedelta(seconds=tz_offset)) + + if infer_reso: + if state.creso_changed: + # We encountered mismatched resolutions, need to re-parse with + # the correct one. + return array_to_datetime( + values, + errors=errors, + yearfirst=yearfirst, + dayfirst=dayfirst, + utc=utc, + creso=state.creso, + ) + + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.view(f"M8[{abbrev}]") return result, tz_out diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index d09612f4fbf7d..2631d12c7db7f 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -4,6 +4,8 @@ from cpython.datetime cimport ( ) from numpy cimport int64_t +from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT + cdef bint parse_today_now(str val, int64_t* iresult, bint utc) @@ -12,5 +14,8 @@ cdef class DatetimeParseState: cdef: bint found_tz bint found_naive + bint creso_changed + NPY_DATETIMEUNIT creso + cdef bint update_creso(self, NPY_DATETIMEUNIT creso) cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d1c6bab180576..e893bd861bcdc 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -155,9 +155,21 @@ cdef dict _parse_code_table = {"y": 0, cdef class DatetimeParseState: - def __cinit__(self): + def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC): self.found_tz = False self.found_naive = False + self.creso = creso + self.creso_changed = False + + cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso): + # Return a bool to indicate we bumped to a higher resolution + if self.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + self.creso = item_reso + elif item_reso > self.creso: + self.creso = item_reso + self.creso_changed = True + return True + return False cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert): if dt.tzinfo is not None: diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 829bb140e6e96..91363e137ec6c 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -13,10 +13,13 @@ iNaT, tslib, ) +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas import Timestamp import pandas._testing as tm +creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value + @pytest.mark.parametrize( "data,expected", @@ -203,3 +206,63 @@ def test_datetime_subclass(data, expected): expected = np.array(expected, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) + + +class TestArrayToDatetimeResolutionInference: + # TODO: tests that include tzs, ints + + def test_infer_homogeoneous_datetimes(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + arr = np.array([dt, dt, dt], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([dt, dt, dt], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_date_objects(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + dt2 = dt.date() + arr = np.array([None, dt2, dt2, dt2], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), dt2, dt2, dt2], dtype="M8[s]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_dt64(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + dt64 = np.datetime64(dt, "ms") + arr = np.array([None, dt64, dt64, dt64], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), dt64, dt64, dt64], dtype="M8[ms]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_timestamps(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + ts = Timestamp(dt).as_unit("ns") + arr = np.array([None, ts, ts, ts], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT")] + [ts.asm8] * 3, dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_datetimes_strings(self): + item = "2023-10-27 18:03:05.678000" + arr = np.array([None, item, item, item], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), item, item, item], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_heterogeneous(self): + dtstr = "2023-10-27 18:03:05.678000" + + arr = np.array([dtstr, dtstr[:-3], dtstr[:-7], None], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array(arr, dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + result, tz = tslib.array_to_datetime(arr[::-1], creso=creso_infer) + assert tz is None + tm.assert_numpy_array_equal(result, expected[::-1]) From 3b88a1ebe2bf378d65a550ca50db95cdaf4d371b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Nov 2023 16:58:54 -0700 Subject: [PATCH 2/3] post-merge fixup --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d2c67b4d0d229..ec214e882f6d3 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -601,7 +601,7 @@ cpdef array_to_datetime( tz_out = timezone(timedelta(seconds=tz_offset)) if infer_reso: - if state.creso_changed: + if state.creso_ever_changed: # We encountered mismatched resolutions, need to re-parse with # the correct one. return array_to_datetime( From 8d2b5f383344dbbc029c26c944c6ff3bcb3856c9 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Nov 2023 09:37:06 -0700 Subject: [PATCH 3/3] post-merge fixup --- pandas/_libs/tslibs/strptime.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index 85cc5e418f8c0..64db2b59dfcff 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -17,6 +17,5 @@ cdef class DatetimeParseState: bint creso_ever_changed NPY_DATETIMEUNIT creso - cdef bint update_creso(self, NPY_DATETIMEUNIT creso) cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert) cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept