Skip to content

Commit 3702de2

Browse files
mroeschke authored and jreback committed
PERF: Datetime/Timestamp.normalize for timezone naive datetimes (#23634)
\
1 parent e6eec3e commit 3702de2

File tree

13 files changed

+60
-31
lines changed

13 files changed

+60
-31
lines changed

asv_bench/benchmarks/timestamp.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def time_microsecond(self, tz, freq):
8787

8888

8989
class TimestampOps(object):
90-
params = [None, 'US/Eastern']
90+
params = [None, 'US/Eastern', 'UTC']
9191
param_names = ['tz']
9292

9393
def setup(self, tz):
@@ -102,6 +102,9 @@ def time_replace_None(self, tz):
102102
def time_to_pydatetime(self, tz):
103103
self.ts.to_pydatetime()
104104

105+
def time_normalize(self, tz):
106+
self.ts.normalize()
107+
105108

106109
class TimestampAcrossDst(object):
107110
def setup(self):

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,7 @@ Performance Improvements
11391139
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
11401140
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
11411141
- Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
1142+
- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)
11421143

11431144

11441145
.. _whatsnew_0240.docs:

pandas/_libs/tslibs/ccalendar.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
4949
int_to_weekday = {num: name for num, name in enumerate(DAYS)}
5050
weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday}
5151

52+
DAY_SECONDS = 86400
53+
HOUR_SECONDS = 3600
54+
5255
# ----------------------------------------------------------------------
5356

5457

pandas/_libs/tslibs/conversion.pyx

+11-20
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ from cpython.datetime cimport (datetime, tzinfo,
1616
PyDateTime_CheckExact, PyDateTime_IMPORT)
1717
PyDateTime_IMPORT
1818

19+
from ccalendar import DAY_SECONDS, HOUR_SECONDS
20+
1921
from np_datetime cimport (check_dts_bounds,
2022
npy_datetimestruct,
2123
pandas_datetime_to_datetimestruct, _string_to_dts,
@@ -41,8 +43,6 @@ from nattype cimport NPY_NAT, checknull_with_nat
4143
# ----------------------------------------------------------------------
4244
# Constants
4345

44-
cdef int64_t DAY_NS = 86400000000000LL
45-
cdef int64_t HOURS_NS = 3600000000000
4646
NS_DTYPE = np.dtype('M8[ns]')
4747
TD_DTYPE = np.dtype('m8[ns]')
4848

@@ -875,6 +875,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
875875
Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
876876
int64_t *tdata
877877
int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
878+
int64_t HOURS_NS = HOUR_SECONDS * 1000000000
878879
ndarray[int64_t] result, result_a, result_b, dst_hours
879880
npy_datetimestruct dts
880881
bint infer_dst = False, is_dst = False, fill = False
@@ -931,10 +932,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
931932
result_b[:] = NPY_NAT
932933

933934
idx_shifted_left = (np.maximum(0, trans.searchsorted(
934-
vals - DAY_NS, side='right') - 1)).astype(np.int64)
935+
vals - DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)
935936

936937
idx_shifted_right = (np.maximum(0, trans.searchsorted(
937-
vals + DAY_NS, side='right') - 1)).astype(np.int64)
938+
vals + DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)
938939

939940
for i in range(n):
940941
val = vals[i]
@@ -1116,9 +1117,9 @@ def normalize_date(dt: object) -> datetime:
11161117
@cython.boundscheck(False)
11171118
def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
11181119
"""
1119-
Normalize each of the (nanosecond) timestamps in the given array by
1120-
rounding down to the beginning of the day (i.e. midnight). If `tz`
1121-
is not None, then this is midnight for this timezone.
1120+
Normalize each of the (nanosecond) timezone aware timestamps in the given
1121+
array by rounding down to the beginning of the day (i.e. midnight).
1122+
Midnight is evaluated in the timezone given by `tz`.
11221123
11231124
Parameters
11241125
----------
@@ -1130,21 +1131,11 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
11301131
result : int64 ndarray of normalized nanosecond timestamps
11311132
"""
11321133
cdef:
1133-
Py_ssize_t i, n = len(stamps)
1134-
npy_datetimestruct dts
1134+
Py_ssize_t n = len(stamps)
11351135
int64_t[:] result = np.empty(n, dtype=np.int64)
11361136

1137-
if tz is not None:
1138-
tz = maybe_get_tz(tz)
1139-
result = _normalize_local(stamps, tz)
1140-
else:
1141-
with nogil:
1142-
for i in range(n):
1143-
if stamps[i] == NPY_NAT:
1144-
result[i] = NPY_NAT
1145-
continue
1146-
dt64_to_dtstruct(stamps[i], &dts)
1147-
result[i] = _normalized_stamp(&dts)
1137+
tz = maybe_get_tz(tz)
1138+
result = _normalize_local(stamps, tz)
11481139

11491140
return result.base # .base to access underlying np.ndarray
11501141

pandas/_libs/tslibs/fields.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ cimport numpy as cnp
1212
from numpy cimport ndarray, int64_t, int32_t, int8_t
1313
cnp.import_array()
1414

15-
from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL
15+
from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS
1616
from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek,
1717
get_week_of_year, get_day_of_year)
1818
from np_datetime cimport (npy_datetimestruct, pandas_timedeltastruct,
@@ -36,7 +36,8 @@ def get_time_micros(ndarray[int64_t] dtindex):
3636
cdef:
3737
ndarray[int64_t] micros
3838

39-
micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL
39+
micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64)
40+
micros //= 1000LL
4041
return micros
4142

4243

pandas/_libs/tslibs/np_datetime.pyx

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ cdef extern from "src/datetime/np_datetime_strings.h":
3737
npy_datetimestruct *out,
3838
int *out_local, int *out_tzoffset)
3939

40+
4041
# ----------------------------------------------------------------------
4142
# numpy object inspection
4243

pandas/_libs/tslibs/timedeltas.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ from util cimport (is_timedelta64_object, is_datetime64_object,
2828
is_integer_object, is_float_object,
2929
is_string_object)
3030

31+
from ccalendar import DAY_SECONDS
32+
3133
from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct,
3234
pandas_timedeltastruct)
3335

@@ -38,8 +40,6 @@ from offsets cimport to_offset
3840
# ----------------------------------------------------------------------
3941
# Constants
4042

41-
cdef int64_t DAY_NS = 86400000000000LL
42-
4343
# components named tuple
4444
Components = collections.namedtuple('Components', [
4545
'days', 'hours', 'minutes', 'seconds',
@@ -266,10 +266,10 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
266266
m = 1000000000L * 2629746
267267
p = 9
268268
elif unit == 'W':
269-
m = 1000000000L * 86400 * 7
269+
m = 1000000000L * DAY_SECONDS * 7
270270
p = 9
271271
elif unit == 'D' or unit == 'd':
272-
m = 1000000000L * 86400
272+
m = 1000000000L * DAY_SECONDS
273273
p = 9
274274
elif unit == 'h':
275275
m = 1000000000L * 3600

pandas/_libs/tslibs/timestamps.pyx

+5
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ from util cimport (is_datetime64_object, is_timedelta64_object,
2121
is_offset_object)
2222

2323
cimport ccalendar
24+
from ccalendar import DAY_SECONDS
2425
from conversion import tz_localize_to_utc, normalize_i8_timestamps
2526
from conversion cimport (tz_convert_single, _TSObject,
2627
convert_to_tsobject, convert_datetime_to_tsobject)
@@ -1285,6 +1286,10 @@ class Timestamp(_Timestamp):
12851286
Normalize Timestamp to midnight, preserving
12861287
tz information.
12871288
"""
1289+
if self.tz is None or is_utc(self.tz):
1290+
DAY_NS = DAY_SECONDS * 1000000000
1291+
normalized_value = self.value - (self.value % DAY_NS)
1292+
return Timestamp(normalized_value).tz_localize(self.tz)
12881293
normalized_value = normalize_i8_timestamps(
12891294
np.array([self.value], dtype='i8'), tz=self.tz)[0]
12901295
return Timestamp(normalized_value).tz_localize(self.tz)

pandas/_libs/tslibs/timezones.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3-
cdef bint is_utc(object tz)
3+
cpdef bint is_utc(object tz)
44
cdef bint is_tzlocal(object tz)
55

66
cdef bint treat_tz_as_pytz(object tz)

pandas/_libs/tslibs/timezones.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ cdef int64_t NPY_NAT = get_nat()
2727

2828
# ----------------------------------------------------------------------
2929

30-
cdef inline bint is_utc(object tz):
30+
cpdef inline bint is_utc(object tz):
3131
return tz is UTC or isinstance(tz, _dateutil_tzutc)
3232

3333

pandas/core/arrays/datetimes.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pandas._libs import lib, tslib
99
from pandas._libs.tslib import Timestamp, NaT, iNaT
1010
from pandas._libs.tslibs import (
11-
normalize_date,
11+
ccalendar, normalize_date,
1212
conversion, fields, timezones,
1313
resolution as libresolution)
1414

@@ -853,7 +853,14 @@ def normalize(self):
853853
'2014-08-01 00:00:00+05:30'],
854854
dtype='datetime64[ns, Asia/Calcutta]', freq=None)
855855
"""
856-
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
856+
if self.tz is None or timezones.is_utc(self.tz):
857+
not_null = self.notna()
858+
DAY_NS = ccalendar.DAY_SECONDS * 1000000000
859+
new_values = self.asi8.copy()
860+
adjustment = (new_values[not_null] % DAY_NS)
861+
new_values[not_null] = new_values[not_null] - adjustment
862+
else:
863+
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
857864
return type(self)(new_values, freq='infer').tz_localize(self.tz)
858865

859866
def to_period(self, freq=None):

pandas/tests/indexes/datetimes/test_scalar_compat.py

+6
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,12 @@ def test_normalize(self):
235235
assert result.is_normalized
236236
assert not rng.is_normalized
237237

238+
def test_normalize_nat(self):
239+
dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')])
240+
result = dti.normalize()
241+
expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')])
242+
tm.assert_index_equal(result, expected)
243+
238244

239245
class TestDateTimeIndexToJulianDate(object):
240246

pandas/tests/scalar/timestamp/test_unary_ops.py

+11
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,17 @@ def test_replace_dst_border(self):
328328
expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago')
329329
assert result == expected
330330

331+
# --------------------------------------------------------------
332+
# Timestamp.normalize
333+
334+
@pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00'])
335+
def test_normalize(self, tz_naive_fixture, arg):
336+
tz = tz_naive_fixture
337+
ts = Timestamp(arg, tz=tz)
338+
result = ts.normalize()
339+
expected = Timestamp('2013-11-30', tz=tz)
340+
assert result == expected
341+
331342
# --------------------------------------------------------------
332343

333344
@td.skip_if_windows

0 commit comments

Comments
 (0)