Skip to content

Commit e5ea6c8

Browse files
committed
ENH: Ability to tz localize when index is implicitly in tz
Fixes issue #4230: an index that is implicitly in a time zone (e.g., one read from a file) can now be localized by passing infer_dst=True to tz_localize.
1 parent 354f10a commit e5ea6c8

File tree

9 files changed

+144
-16
lines changed

9 files changed

+144
-16
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ Improvements to existing features
160160
:issue:`4998`)
161161
- ``to_dict`` now takes ``records`` as a possible outtype. Returns an array
162162
of column-keyed dictionaries. (:issue:`4936`)
163+
- ``tz_localize`` can infer a fall daylight savings transition based on the
164+
structure of unlocalized data (:issue:`4230`)
165+
- DatetimeIndex is now in the API documentation
163166

164167
API Changes
165168
~~~~~~~~~~~

doc/source/timeseries.rst

+14
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,20 @@ TimeSeries, aligning the data on the UTC timestamps:
11081108
11091109
.. _timeseries.timedeltas:
11101110

1111+
In some cases, ``tz_localize`` cannot distinguish the DST and non-DST hours when there are
1112+
duplicates. This often happens when reading files that simply duplicate the hours.
1113+
Passing ``infer_dst=True`` to ``tz_localize`` will attempt
1114+
to determine the right offset.
1115+
1116+
.. ipython:: python
1117+
1118+
rng_hourly = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00',
1119+
'11/06/2011 01:00', '11/06/2011 02:00',
1120+
'11/06/2011 03:00'])
1121+
rng_hourly.tz_localize('US/Eastern')
1122+
rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', infer_dst=True)
1123+
rng_hourly_eastern.values
1124+
11111125
Time Deltas
11121126
-----------
11131127

doc/source/v0.13.0.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ enhancements along with a large number of bug fixes.
88

99
.. warning::
1010

11-
In 0.13.0 ``Series`` has internaly been refactored to no longer sub-class ``ndarray``
11+
In 0.13.0 ``Series`` has internally been refactored to no longer sub-class ``ndarray``
1212
but instead subclass ``NDFrame``, similarly to the rest of the pandas containers. This should be
1313
a transparent change with only very limited API implications. See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
1414

@@ -481,6 +481,10 @@ Enhancements
481481

482482
:ref:`See the docs<indexing.basics.indexing_isin>` for more.
483483

484+
- ``tz_localize`` can infer a fall daylight savings transition based on the structure
485+
of the unlocalized data (:issue:`4230`), see :ref:`here<timeseries.timezone>`
486+
- DatetimeIndex is now in the API documentation, see :ref:`here<api.datetimeindex>`
487+
484488
.. _whatsnew_0130.experimental:
485489

486490
Experimental

pandas/core/generic.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -2752,7 +2752,7 @@ def tz_convert(self, tz, axis=0, copy=True):
27522752

27532753
return new_obj
27542754

2755-
def tz_localize(self, tz, axis=0, copy=True):
2755+
def tz_localize(self, tz, axis=0, copy=True, infer_dst=False):
27562756
"""
27572757
Localize tz-naive TimeSeries to target time zone
27582758
@@ -2761,6 +2761,8 @@ def tz_localize(self, tz, axis=0, copy=True):
27612761
tz : string or pytz.timezone object
27622762
copy : boolean, default True
27632763
Also make a copy of the underlying data
2764+
infer_dst : boolean, default False
2765+
Attempt to infer fall dst-transition times based on order
27642766
27652767
Returns
27662768
-------
@@ -2778,7 +2780,7 @@ def tz_localize(self, tz, axis=0, copy=True):
27782780
new_data = new_data.copy()
27792781

27802782
new_obj = self._constructor(new_data)
2781-
new_ax = ax.tz_localize(tz)
2783+
new_ax = ax.tz_localize(tz, infer_dst=infer_dst)
27822784

27832785
if axis == 0:
27842786
new_obj._set_axis(1, new_ax)

pandas/core/series.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -2331,7 +2331,7 @@ def tz_convert(self, tz, copy=True):
23312331

23322332
return self._constructor(new_values, index=new_index, name=self.name)
23332333

2334-
def tz_localize(self, tz, copy=True):
2334+
def tz_localize(self, tz, copy=True, infer_dst=False):
23352335
"""
23362336
Localize tz-naive TimeSeries to target time zone
23372337
Entries will retain their "naive" value but will be annotated as
@@ -2345,6 +2345,8 @@ def tz_localize(self, tz, copy=True):
23452345
tz : string or pytz.timezone object
23462346
copy : boolean, default True
23472347
Also make a copy of the underlying data
2348+
infer_dst : boolean, default False
2349+
Attempt to infer fall dst-transition hours based on order
23482350
23492351
Returns
23502352
-------
@@ -2358,7 +2360,7 @@ def tz_localize(self, tz, copy=True):
23582360

23592361
new_index = DatetimeIndex([], tz=tz)
23602362
else:
2361-
new_index = self.index.tz_localize(tz)
2363+
new_index = self.index.tz_localize(tz, infer_dst=infer_dst)
23622364

23632365
new_values = self.values
23642366
if copy:

pandas/tseries/index.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def __new__(cls, data=None,
147147

148148
dayfirst = kwds.pop('dayfirst', None)
149149
yearfirst = kwds.pop('yearfirst', None)
150+
infer_dst = kwds.pop('infer_dst', False)
150151
warn = False
151152
if 'offset' in kwds and kwds['offset']:
152153
freq = kwds['offset']
@@ -183,7 +184,8 @@ def __new__(cls, data=None,
183184

184185
if data is None:
185186
return cls._generate(start, end, periods, name, offset,
186-
tz=tz, normalize=normalize)
187+
tz=tz, normalize=normalize,
188+
infer_dst=infer_dst)
187189

188190
if not isinstance(data, np.ndarray):
189191
if np.isscalar(data):
@@ -209,7 +211,7 @@ def __new__(cls, data=None,
209211
data.name = name
210212

211213
if tz is not None:
212-
return data.tz_localize(tz)
214+
return data.tz_localize(tz, infer_dst=infer_dst)
213215

214216
return data
215217

@@ -261,7 +263,8 @@ def __new__(cls, data=None,
261263
getattr(data, 'tz', None) is None):
262264
# Convert tz-naive to UTC
263265
ints = subarr.view('i8')
264-
subarr = tslib.tz_localize_to_utc(ints, tz)
266+
subarr = tslib.tz_localize_to_utc(ints, tz,
267+
infer_dst=infer_dst)
265268

266269
subarr = subarr.view(_NS_DTYPE)
267270

@@ -286,7 +289,7 @@ def __new__(cls, data=None,
286289

287290
@classmethod
288291
def _generate(cls, start, end, periods, name, offset,
289-
tz=None, normalize=False):
292+
tz=None, normalize=False, infer_dst=False):
290293
if com._count_not_none(start, end, periods) != 2:
291294
raise ValueError('Must specify two of start, end, or periods')
292295

@@ -375,7 +378,8 @@ def _generate(cls, start, end, periods, name, offset,
375378
index = _generate_regular_range(start, end, periods, offset)
376379

377380
if tz is not None and getattr(index, 'tz', None) is None:
378-
index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz)
381+
index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz,
382+
infer_dst=infer_dst)
379383
index = index.view(_NS_DTYPE)
380384

381385
index = index.view(cls)
@@ -1537,9 +1541,17 @@ def tz_convert(self, tz):
15371541
# No conversion since timestamps are all UTC to begin with
15381542
return self._simple_new(self.values, self.name, self.offset, tz)
15391543

1540-
def tz_localize(self, tz):
1544+
def tz_localize(self, tz, infer_dst=False):
15411545
"""
15421546
Localize tz-naive DatetimeIndex to given time zone (using pytz)
1547+
1548+
Parameters
1549+
----------
1550+
tz : string or pytz.timezone
1551+
Time zone for time. Corresponding timestamps would be converted to
1552+
time zone of the TimeSeries
1553+
infer_dst : boolean, default False
1554+
Attempt to infer fall dst-transition hours based on order
15431555
15441556
Returns
15451557
-------
@@ -1550,7 +1562,7 @@ def tz_localize(self, tz):
15501562
tz = tools._maybe_get_tz(tz)
15511563

15521564
# Convert to UTC
1553-
new_dates = tslib.tz_localize_to_utc(self.asi8, tz)
1565+
new_dates = tslib.tz_localize_to_utc(self.asi8, tz, infer_dst=infer_dst)
15541566
new_dates = new_dates.view(_NS_DTYPE)
15551567

15561568
return self._simple_new(new_dates, self.name, self.offset, tz)

pandas/tseries/tests/test_timezones.py

+26
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,32 @@ def test_with_tz_ambiguous_times(self):
360360
dr = date_range(datetime(2011, 3, 13), periods=48,
361361
freq=datetools.Minute(30), tz=pytz.utc)
362362

363+
def test_infer_dst(self):
364+
# November 6, 2011, fall back: the 1 AM hour is repeated
365+
# With no repeated hours, we cannot infer the transition
366+
tz = pytz.timezone('US/Eastern')
367+
dr = date_range(datetime(2011, 11, 6, 0), periods=5,
368+
freq=datetools.Hour())
369+
self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize,
370+
tz, infer_dst=True)
371+
372+
# With repeated hours, we can infer the transition
373+
dr = date_range(datetime(2011, 11, 6, 0), periods=5,
374+
freq=datetools.Hour(), tz=tz)
375+
di = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00',
376+
'11/06/2011 01:00', '11/06/2011 02:00',
377+
'11/06/2011 03:00'])
378+
localized = di.tz_localize(tz, infer_dst=True)
379+
self.assert_(np.array_equal(dr, localized))
380+
381+
# When there is no dst transition, nothing special happens
382+
dr = date_range(datetime(2011, 6, 1, 0), periods=10,
383+
freq=datetools.Hour())
384+
localized = dr.tz_localize(tz)
385+
localized_infer = dr.tz_localize(tz, infer_dst=True)
386+
self.assert_(np.array_equal(localized, localized_infer))
387+
388+
363389
# test utility methods
364390
def test_infer_tz(self):
365391
eastern = pytz.timezone('US/Eastern')

pandas/tslib.pyx

+51-4
Original file line numberDiff line numberDiff line change
@@ -1630,7 +1630,7 @@ cpdef ndarray _unbox_utcoffsets(object transinfo):
16301630

16311631
@cython.boundscheck(False)
16321632
@cython.wraparound(False)
1633-
def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
1633+
def tz_localize_to_utc(ndarray[int64_t] vals, object tz, bint infer_dst=False):
16341634
"""
16351635
Localize tzinfo-naive DateRange to given time zone (using pytz). If
16361636
there are ambiguities in the values, raise AmbiguousTimeError.
@@ -1644,7 +1644,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
16441644
Py_ssize_t i, idx, pos, ntrans, n = len(vals)
16451645
int64_t *tdata
16461646
int64_t v, left, right
1647-
ndarray[int64_t] result, result_a, result_b
1647+
ndarray[int64_t] result, result_a, result_b, dst_hours
16481648
pandas_datetimestruct dts
16491649

16501650
# Vectorized version of DstTzInfo.localize
@@ -1701,6 +1701,48 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
17011701
# timestamp falls to the right side of the DST transition
17021702
if v + deltas[pos] == vals[i]:
17031703
result_b[i] = v
1704+
1705+
1706+
if infer_dst:
1707+
dst_hours = np.empty(n, dtype=np.int64)
1708+
dst_hours.fill(NPY_NAT)
1709+
1710+
# Get the ambiguous hours (given the above, these are the hours
1711+
# where result_a != result_b and neither of them are NAT)
1712+
both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT)
1713+
both_eq = result_a == result_b
1714+
trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq)))
1715+
if trans_idx.size == 1:
1716+
stamp = Timestamp(vals[trans_idx])
1717+
raise pytz.AmbiguousTimeError("Cannot infer dst time from %s as"
1718+
"there are no repeated times" % stamp)
1719+
# Split the array into contiguous chunks (where the difference between
1720+
# indices is 1). These are effectively dst transitions in different years
1721+
# which is useful for checking that there is not an ambiguous transition
1722+
# in an individual year.
1723+
if trans_idx.size > 0:
1724+
one_diff = np.where(np.diff(trans_idx)!=1)[0]+1
1725+
trans_grp = np.array_split(trans_idx, one_diff)
1726+
1727+
# Iterate through each day, if there are no hours where the delta is negative
1728+
# (indicates a repeat of hour) the switch cannot be inferred
1729+
for grp in trans_grp:
1730+
1731+
delta = np.diff(result_a[grp])
1732+
if grp.size == 1 or np.all(delta>0):
1733+
stamp = Timestamp(vals[grp[0]])
1734+
raise pytz.AmbiguousTimeError(stamp)
1735+
1736+
# Find the index for the switch and pull from a for dst and b for standard
1737+
switch_idx = (delta<=0).nonzero()[0]
1738+
if switch_idx.size > 1:
1739+
raise pytz.AmbiguousTimeError("There are %i dst switches "
1740+
"when there should only be 1."
1741+
% switch_idx.size)
1742+
switch_idx = switch_idx[0]+1 # Pull the only index and adjust
1743+
a_idx = grp[:switch_idx]
1744+
b_idx = grp[switch_idx:]
1745+
dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))
17041746

17051747
for i in range(n):
17061748
left = result_a[i]
@@ -1709,8 +1751,13 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
17091751
if left == right:
17101752
result[i] = left
17111753
else:
1712-
stamp = Timestamp(vals[i])
1713-
raise pytz.AmbiguousTimeError(stamp)
1754+
if infer_dst and dst_hours[i] != NPY_NAT:
1755+
result[i] = dst_hours[i]
1756+
else:
1757+
stamp = Timestamp(vals[i])
1758+
raise pytz.AmbiguousTimeError("Cannot infer dst time from %r, "\
1759+
"try using the 'infer_dst' argument"
1760+
% stamp)
17141761
elif left != NPY_NAT:
17151762
result[i] = left
17161763
elif right != NPY_NAT:

vb_suite/timeseries.py

+18
Original file line numberDiff line numberDiff line change
@@ -225,3 +225,21 @@ def date_range(start=None, end=None, periods=None, freq=None):
225225

226226
datetimeindex_unique = Benchmark('index.unique()', setup,
227227
start_date=datetime(2012, 7, 1))
228+
229+
# tz_localize with infer argument. This is an attempt to emulate the results
230+
# of read_csv with duplicated data. Not passing infer_dst will fail
231+
setup = common_setup + """
232+
dst_rng = date_range('10/29/2000 1:00:00',
233+
'10/29/2000 1:59:59', freq='S')
234+
index = date_range('10/29/2000', '10/29/2000 00:59:59', freq='S')
235+
index = index.append(dst_rng)
236+
index = index.append(dst_rng)
237+
index = index.append(date_range('10/29/2000 2:00:00',
238+
'10/29/2000 3:00:00', freq='S'))
239+
"""
240+
241+
datetimeindex_infer_dst = \
242+
Benchmark('index.tz_localize("US/Eastern", infer_dst=True)',
243+
setup, start_date=datetime(2013, 9, 30))
244+
245+

0 commit comments

Comments
 (0)