diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index 8eaf815eaa962..a0f3bfa6298f3 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -87,7 +87,7 @@ def time_microsecond(self, tz, freq): class TimestampOps(object): - params = [None, 'US/Eastern'] + params = [None, 'US/Eastern', 'UTC'] param_names = ['tz'] def setup(self, tz): @@ -102,6 +102,9 @@ def time_replace_None(self, tz): def time_to_pydatetime(self, tz): self.ts.to_pydatetime() + def time_normalize(self, tz): + self.ts.normalize() + class TimestampAcrossDst(object): def setup(self): diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 98941b6d353bb..7a7f75d6e0eaa 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1139,6 +1139,7 @@ Performance Improvements - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) - Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`) +- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) .. 
_whatsnew_0240.docs: diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 07c146c06b510..587213049af85 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -49,6 +49,9 @@ DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', int_to_weekday = {num: name for num, name in enumerate(DAYS)} weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday} +DAY_SECONDS = 86400 +HOUR_SECONDS = 3600 + # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f88671b41a16a..c2897e1d0e8c8 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -16,6 +16,8 @@ from cpython.datetime cimport (datetime, tzinfo, PyDateTime_CheckExact, PyDateTime_IMPORT) PyDateTime_IMPORT +from ccalendar import DAY_SECONDS, HOUR_SECONDS + from np_datetime cimport (check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, _string_to_dts, @@ -41,8 +43,6 @@ from nattype cimport NPY_NAT, checknull_with_nat # ---------------------------------------------------------------------- # Constants -cdef int64_t DAY_NS = 86400000000000LL -cdef int64_t HOURS_NS = 3600000000000 NS_DTYPE = np.dtype('M8[ns]') TD_DTYPE = np.dtype('m8[ns]') @@ -875,6 +875,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right int64_t *tdata int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins + int64_t HOURS_NS = HOUR_SECONDS * 1000000000 ndarray[int64_t] result, result_a, result_b, dst_hours npy_datetimestruct dts bint infer_dst = False, is_dst = False, fill = False @@ -931,10 +932,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_b[:] = NPY_NAT idx_shifted_left = (np.maximum(0, trans.searchsorted( - vals - DAY_NS, side='right') - 
1)).astype(np.int64) + vals - DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64) idx_shifted_right = (np.maximum(0, trans.searchsorted( - vals + DAY_NS, side='right') - 1)).astype(np.int64) + vals + DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64) for i in range(n): val = vals[i] @@ -1116,9 +1117,9 @@ def normalize_date(dt: object) -> datetime: @cython.boundscheck(False) def normalize_i8_timestamps(int64_t[:] stamps, object tz=None): """ - Normalize each of the (nanosecond) timestamps in the given array by - rounding down to the beginning of the day (i.e. midnight). If `tz` - is not None, then this is midnight for this timezone. + Normalize each of the (nanosecond) timezone-aware timestamps in the given + array by rounding down to the beginning of the day (i.e. midnight). + This is midnight in the timezone `tz`. Parameters ---------- @@ -1130,21 +1131,11 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz=None): result : int64 ndarray of converted of normalized nanosecond timestamps """ cdef: - Py_ssize_t i, n = len(stamps) - npy_datetimestruct dts + Py_ssize_t n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - if tz is not None: - tz = maybe_get_tz(tz) - result = _normalize_local(stamps, tz) - else: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = _normalized_stamp(&dts) + tz = maybe_get_tz(tz) + result = _normalize_local(stamps, tz) return result.base # .base to access underlying np.ndarray diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 4c0af69d72517..72157c2fcb2f3 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -12,7 +12,7 @@ cimport numpy as cnp from numpy cimport ndarray, int64_t, int32_t, int8_t cnp.import_array() -from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL +from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL, 
DAY_SECONDS from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek, get_week_of_year, get_day_of_year) from np_datetime cimport (npy_datetimestruct, pandas_timedeltastruct, @@ -36,7 +36,8 @@ def get_time_micros(ndarray[int64_t] dtindex): cdef: ndarray[int64_t] micros - micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL + micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64) + micros //= 1000LL return micros diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 61d0f697e7fe4..dbbe9da381f0a 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -37,6 +37,7 @@ cdef extern from "src/datetime/np_datetime_strings.h": npy_datetimestruct *out, int *out_local, int *out_tzoffset) + # ---------------------------------------------------------------------- # numpy object inspection diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index ca8491726a5f7..4d612a6f43107 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -28,6 +28,8 @@ from util cimport (is_timedelta64_object, is_datetime64_object, is_integer_object, is_float_object, is_string_object) +from ccalendar import DAY_SECONDS + from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) @@ -38,8 +40,6 @@ from offsets cimport to_offset # ---------------------------------------------------------------------- # Constants -cdef int64_t DAY_NS = 86400000000000LL - # components named tuple Components = collections.namedtuple('Components', [ 'days', 'hours', 'minutes', 'seconds', @@ -266,10 +266,10 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? 
-1: m = 1000000000L * 2629746 p = 9 elif unit == 'W': - m = 1000000000L * 86400 * 7 + m = 1000000000L * DAY_SECONDS * 7 p = 9 elif unit == 'D' or unit == 'd': - m = 1000000000L * 86400 + m = 1000000000L * DAY_SECONDS p = 9 elif unit == 'h': m = 1000000000L * 3600 diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index eaaa170a387e9..0ee20d84b9010 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -21,6 +21,7 @@ from util cimport (is_datetime64_object, is_timedelta64_object, is_offset_object) cimport ccalendar +from ccalendar import DAY_SECONDS from conversion import tz_localize_to_utc, normalize_i8_timestamps from conversion cimport (tz_convert_single, _TSObject, convert_to_tsobject, convert_datetime_to_tsobject) @@ -1285,6 +1286,10 @@ class Timestamp(_Timestamp): Normalize Timestamp to midnight, preserving tz information. """ + if self.tz is None or is_utc(self.tz): + DAY_NS = DAY_SECONDS * 1000000000 + normalized_value = self.value - (self.value % DAY_NS) + return Timestamp(normalized_value).tz_localize(self.tz) normalized_value = normalize_i8_timestamps( np.array([self.value], dtype='i8'), tz=self.tz)[0] return Timestamp(normalized_value).tz_localize(self.tz) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 8965b46f747c4..50c4a41f97a82 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -cdef bint is_utc(object tz) +cpdef bint is_utc(object tz) cdef bint is_tzlocal(object tz) cdef bint treat_tz_as_pytz(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 19a00fbd37dd5..a2a40a8aa1ca4 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -27,7 +27,7 @@ cdef int64_t NPY_NAT = get_nat() # ---------------------------------------------------------------------- -cdef inline bint is_utc(object tz): 
+cpdef inline bint is_utc(object tz): return tz is UTC or isinstance(tz, _dateutil_tzutc) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index ce1ca01cd3234..b3a9973e50ff0 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -8,7 +8,7 @@ from pandas._libs import lib, tslib from pandas._libs.tslib import Timestamp, NaT, iNaT from pandas._libs.tslibs import ( - normalize_date, + ccalendar, normalize_date, conversion, fields, timezones, resolution as libresolution) @@ -853,7 +853,14 @@ def normalize(self): '2014-08-01 00:00:00+05:30'], dtype='datetime64[ns, Asia/Calcutta]', freq=None) """ - new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) + if self.tz is None or timezones.is_utc(self.tz): + not_null = self.notna() + DAY_NS = ccalendar.DAY_SECONDS * 1000000000 + new_values = self.asi8.copy() + adjustment = (new_values[not_null] % DAY_NS) + new_values[not_null] = new_values[not_null] - adjustment + else: + new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) return type(self)(new_values, freq='infer').tz_localize(self.tz) def to_period(self, freq=None): diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 81f4c77009ce4..eb4169639482f 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -235,6 +235,12 @@ def test_normalize(self): assert result.is_normalized assert not rng.is_normalized + def test_normalize_nat(self): + dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')]) + result = dti.normalize() + expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')]) + tm.assert_index_equal(result, expected) + class TestDateTimeIndexToJulianDate(object): diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index a9a60c4119605..21404bf7ef76f 100644 --- 
a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -328,6 +328,17 @@ def test_replace_dst_border(self): expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago') assert result == expected + # -------------------------------------------------------------- + # Timestamp.normalize + + @pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00']) + def test_normalize(self, tz_naive_fixture, arg): + tz = tz_naive_fixture + ts = Timestamp(arg, tz=tz) + result = ts.normalize() + expected = Timestamp('2013-11-30', tz=tz) + assert result == expected + # -------------------------------------------------------------- @td.skip_if_windows