-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
API/ENH: tz_localize handling of nonexistent times: rename keyword + add shift option #22644
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
bf5e7bf
36d13c7
a5ea445
8753d00
e1a6c6a
a7c86c8
1884c7b
a6a05df
c4dc8aa
1bc81db
c81d58c
b2c8429
a65987d
710014c
93159e5
a0ffcdd
219256f
d435481
7c849b6
56ac4fe
b7b09bd
94a72a5
39b769e
18664d8
8852d43
38b95e9
c88b0d8
1bae682
d30f891
f337692
6a12a7e
a7b8357
7ad87ec
abad726
6be1c25
f8be4b6
c192c9f
8909f38
49f203f
01678c7
707fdde
ae27a50
85ed25e
9041ebe
a4cdac2
0a9c1db
efb382e
61c73ca
20cc925
394a0db
a5253ee
5185683
ba1bfed
8b06c96
42ae923
fe575fe
3482f92
f0e43e2
b98d4cf
e6c5b2d
83423ad
1ca0ab2
5bcc977
8cf16e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2305,6 +2305,31 @@ constructor as well as ``tz_localize``. | |
# tz_convert(None) is identical with tz_convert('UTC').tz_localize(None) | ||
didx.tz_convert('UCT').tz_localize(None) | ||
|
||
.. _timeseries.timezone_nonexistent: | |
|
||
Nonexistent Times when Localizing | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
|
||
A DST transition may also shift the local time ahead by 1 hour, creating nonexistent | |
local times. The behavior of localizing a timeseries with nonexistent times | |
can be controlled by the ``nonexistent`` argument. The following options are available: | |
|
||
* ``shift``: Shifts nonexistent times forward to the closest real time | ||
* ``NaT``: Replaces nonexistent times with ``NaT`` | ||
* ``raise``: Raises a ``pytz.NonExistentTimeError`` (the default behavior) | ||
|
||
.. ipython:: python | ||
# 2:30 is a nonexistent time | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you put this comment one line below? |
||
dti = date_range(start='2015-03-29 01:30:00', periods=3, freq='H') | ||
dti | ||
dti.tz_localize('Europe/Warsaw', nonexistent='shift') | ||
dti.tz_localize('Europe/Warsaw', nonexistent='NaT') | ||
|
||
.. code-block:: ipython | ||
|
||
In [2]: dti.tz_localize('Europe/Warsaw') | ||
NonExistentTimeError: 2015-03-29 02:30:00 | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
.. _timeseries.timezone_series: | ||
|
||
TZ Aware Dtypes | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -185,6 +185,7 @@ Other Enhancements | |
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). | ||
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). | ||
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). | ||
- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times (:issue:`8917`). | |
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a pointer to the docs? |
||
|
||
.. _whatsnew_0240.api_breaking: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,7 @@ from nattype cimport NPY_NAT, checknull_with_nat | |
# Constants | ||
|
||
cdef int64_t DAY_NS = 86400000000000LL | ||
cdef int64_t HOURS_NS = 3600000000000 | ||
NS_DTYPE = np.dtype('M8[ns]') | ||
TD_DTYPE = np.dtype('m8[ns]') | ||
|
||
|
@@ -826,7 +827,7 @@ def tz_convert(int64_t[:] vals, object tz1, object tz2): | |
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | ||
object errors='raise'): | ||
object nonexistent=None, object errors='raise'): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we even need the errors path here? IOW, doesn't that translate directly to a combination of nonexistent / ambiguous? E.g. if it's raise then both are raise? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think it's actually used in here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah it looks like |
||
""" | ||
Localize tzinfo-naive i8 to given time zone (using pytz). If | ||
there are ambiguities in the values, raise AmbiguousTimeError. | ||
|
@@ -837,6 +838,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | |
tz : tzinfo or None | ||
ambiguous : str, bool, or arraylike | ||
If arraylike, must have the same length as vals | ||
nonexistent : str, bool, or arraylike | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. versionadded |
||
If arraylike, must have the same length as vals | ||
errors : {"raise", "coerce"}, default "raise" | ||
|
||
Returns | ||
|
@@ -849,10 +852,11 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | |
ndarray ambiguous_array | ||
Py_ssize_t i, idx, pos, ntrans, n = len(vals) | ||
int64_t *tdata | ||
int64_t v, left, right | ||
int64_t v, left, right, val, v_left, v_right | ||
ndarray[int64_t] result, result_a, result_b, dst_hours | ||
npy_datetimestruct dts | ||
bint infer_dst = False, is_dst = False, fill = False | ||
bint shift = False, fill_nonexist = False | ||
bint is_coerce = errors == 'coerce', is_raise = errors == 'raise' | ||
|
||
# Vectorized version of DstTzInfo.localize | ||
|
@@ -888,39 +892,43 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | |
"the same size as vals") | ||
ambiguous_array = np.asarray(ambiguous) | ||
|
||
if is_string_object(nonexistent): | ||
if nonexistent == 'NaT': | ||
fill_nonexist = True | ||
elif nonexistent == 'shift': | ||
shift = True | ||
|
||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
trans, deltas, typ = get_dst_info(tz) | ||
|
||
tdata = <int64_t*> cnp.PyArray_DATA(trans) | ||
ntrans = len(trans) | ||
|
||
# Determine whether each date lies left of the DST transition (store in | ||
# result_a) or right of the DST transition (store in result_b) | ||
result_a = np.empty(n, dtype=np.int64) | ||
result_b = np.empty(n, dtype=np.int64) | ||
result_a.fill(NPY_NAT) | ||
result_b.fill(NPY_NAT) | ||
|
||
# left side | ||
idx_shifted = (np.maximum(0, trans.searchsorted( | ||
idx_shifted_left = (np.maximum(0, trans.searchsorted( | ||
vals - DAY_NS, side='right') - 1)).astype(np.int64) | ||
|
||
for i in range(n): | ||
v = vals[i] - deltas[idx_shifted[i]] | ||
pos = bisect_right_i8(tdata, v, ntrans) - 1 | ||
|
||
# timestamp falls to the left side of the DST transition | ||
if v + deltas[pos] == vals[i]: | ||
result_a[i] = v | ||
|
||
# right side | ||
idx_shifted = (np.maximum(0, trans.searchsorted( | ||
idx_shifted_right = (np.maximum(0, trans.searchsorted( | ||
vals + DAY_NS, side='right') - 1)).astype(np.int64) | ||
|
||
for i in range(n): | ||
v = vals[i] - deltas[idx_shifted[i]] | ||
pos = bisect_right_i8(tdata, v, ntrans) - 1 | ||
val = vals[i] | ||
v_left = val - deltas[idx_shifted_left[i]] | ||
pos_left = bisect_right_i8(tdata, v_left, ntrans) - 1 | ||
# timestamp falls to the left side of the DST transition | ||
if v_left + deltas[pos_left] == val: | ||
result_a[i] = v_left | ||
|
||
v_right = val - deltas[idx_shifted_right[i]] | ||
pos_right = bisect_right_i8(tdata, v_right, ntrans) - 1 | ||
# timestamp falls to the right side of the DST transition | ||
if v + deltas[pos] == vals[i]: | ||
result_b[i] = v | ||
if v_right + deltas[pos_right] == val: | ||
result_b[i] = v_right | ||
|
||
if infer_dst: | ||
dst_hours = np.empty(n, dtype=np.int64) | ||
|
@@ -935,7 +943,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | |
stamp = _render_tstamp(vals[trans_idx]) | ||
raise pytz.AmbiguousTimeError( | ||
"Cannot infer dst time from %s as there " | ||
"are no repeated times" % stamp) | ||
"are no repeated times".format(stamp)) | ||
# Split the array into contiguous chunks (where the difference between | ||
# indices is 1). These are effectively dst transitions in different | ||
# years which is useful for checking that there is not an ambiguous | ||
|
@@ -960,18 +968,19 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | |
if switch_idx.size > 1: | ||
raise pytz.AmbiguousTimeError( | ||
"There are %i dst switches when " | ||
"there should only be 1." % switch_idx.size) | ||
"there should only be 1.".format(switch_idx.size)) | ||
switch_idx = switch_idx[0] + 1 | ||
# Pull the only index and adjust | ||
a_idx = grp[:switch_idx] | ||
b_idx = grp[switch_idx:] | ||
dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) | ||
|
||
for i in range(n): | ||
val = vals[i] | ||
left = result_a[i] | ||
right = result_b[i] | ||
if vals[i] == NPY_NAT: | ||
result[i] = vals[i] | ||
if val == NPY_NAT: | ||
result[i] = val | ||
elif left != NPY_NAT and right != NPY_NAT: | ||
if left == right: | ||
result[i] = left | ||
|
@@ -986,19 +995,27 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, | |
elif fill: | ||
result[i] = NPY_NAT | ||
else: | ||
stamp = _render_tstamp(vals[i]) | ||
stamp = _render_tstamp(val) | ||
raise pytz.AmbiguousTimeError( | ||
"Cannot infer dst time from %r, try using the " | ||
"'ambiguous' argument" % stamp) | ||
"'ambiguous' argument".format(stamp)) | ||
elif left != NPY_NAT: | ||
result[i] = left | ||
elif right != NPY_NAT: | ||
result[i] = right | ||
else: | ||
if is_coerce: | ||
# Handle nonexistent times | ||
if shift: | ||
# Shift the nonexistent time forward to the closest existing | ||
# time | ||
remaining_minutes = val % HOURS_NS | ||
new_local = val + (HOURS_NS - remaining_minutes) | ||
delta_idx = trans.searchsorted(new_local, side='right') - 1 | ||
result[i] = new_local - deltas[delta_idx] | ||
elif fill_nonexist or is_coerce: | ||
result[i] = NPY_NAT | ||
else: | ||
stamp = _render_tstamp(vals[i]) | ||
stamp = _render_tstamp(val) | ||
raise pytz.NonExistentTimeError(stamp) | ||
|
||
return result | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -535,6 +535,13 @@ class NaTType(_NaT): | |
- 'NaT' will return NaT for an ambiguous time | ||
- 'raise' will raise an AmbiguousTimeError for an ambiguous time | ||
|
||
nonexistent : 'shift', 'NaT', default 'raise' | ||
- 'shift' will shift the nonexistent time forward to the closest | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, rst formatting nitpick: there needs to be a blank line between the first sentences and the start of this list ... (getting rst right can be annoying ..) |
||
existing time | ||
- 'NaT' will return NaT where there are nonexistent times | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
- 'raise' will raise a NonExistentTimeError if there are | |
nonexistent times | ||
|
||
errors : 'raise', 'coerce', default 'raise' | ||
- 'raise' will raise a NonExistentTimeError if a timestamp is not | ||
valid in the specified timezone (e.g. due to a transition from | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -616,7 +616,8 @@ def tz_convert(self, tz): | |
# No conversion since timestamps are all UTC to begin with | ||
return self._shallow_copy(tz=tz) | ||
|
||
def tz_localize(self, tz, ambiguous='raise', errors='raise'): | ||
def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', | ||
errors='raise'): | ||
""" | ||
Localize tz-naive Datetime Array/Index to tz-aware | ||
Datetime Array/Index. | ||
|
@@ -632,8 +633,7 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): | |
tz : string, pytz.timezone, dateutil.tz.tzfile or None | ||
Time zone to convert timestamps to. Passing ``None`` will | ||
remove the time zone information preserving local time. | ||
ambiguous : str {'infer', 'NaT', 'raise'} or bool array, | ||
default 'raise' | ||
ambiguous : 'infer', 'NaT', bool array, default 'raise' | ||
|
||
- 'infer' will attempt to infer fall dst-transition hours based on | ||
order | ||
|
@@ -644,6 +644,13 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): | |
- 'raise' will raise an AmbiguousTimeError if there are ambiguous | ||
times | ||
|
||
nonexistent : 'shift', 'NaT', default 'raise' | |
- 'shift' will shift the nonexistent times forward to the closest | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you reuse the explanation about what a non-existent time is that is in the (and idem for the other occurrences of this in docstrings) |
||
existing time | ||
- 'NaT' will return NaT where there are nonexistent times | ||
- 'raise' will raise a NonExistentTimeError if there are | |
nonexistent times | ||
|
||
errors : {'raise', 'coerce'}, default 'raise' | ||
|
||
- 'raise' will raise a NonExistentTimeError if a timestamp is not | ||
|
@@ -703,9 +710,10 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): | |
tz = timezones.maybe_get_tz(tz) | ||
# Convert to UTC | ||
|
||
new_dates = conversion.tz_localize_to_utc(self.asi8, tz, | ||
ambiguous=ambiguous, | ||
errors=errors) | ||
new_dates = conversion.tz_localize_to_utc( | ||
self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent, | ||
errors=errors | ||
) | ||
new_dates = new_dates.view(_NS_DTYPE) | ||
return self._shallow_copy(new_dates, tz=tz) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -574,6 +574,24 @@ def test_dti_tz_localize_bdate_range(self): | |
localized = dr.tz_localize(pytz.utc) | ||
tm.assert_index_equal(dr_utc, localized) | ||
|
||
@pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) | ||
@pytest.mark.parametrize('method, exp', [ | ||
['shift', '2015-03-29 03:00:00'], | ||
['NaT', pd.NaT], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you have tests that exercise the assertion when you pass a There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just added a test for an invalid |
||
['raise', None] | ||
]) | ||
def test_dti_tz_localize_nonexistent(self, tz, method, exp): | ||
# GH 8917 | ||
n = 60 | ||
dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') | ||
if method == 'raise': | ||
with pytest.raises(pytz.NonExistentTimeError): | ||
dti.tz_localize(tz, nonexistent=method) | ||
else: | ||
result = dti.tz_localize(tz, nonexistent=method) | ||
expected = DatetimeIndex([exp] * n, tz=tz) | ||
tm.assert_index_equal(result, expected) | ||
|
||
# ------------------------------------------------------------- | ||
# DatetimeIndex.normalize | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is too short, needs to be same length as title