diff --git a/doc/source/api.rst b/doc/source/api.rst index f74f5f0d28a58..5ad36b3c8b45c 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -868,3 +868,150 @@ Serialization / IO / Conversion Panel.to_frame Panel.to_clipboard +.. currentmodule:: pandas.core.index + +.. _api.index: + +Index +----- + +**Many of these methods or variants thereof are available on the objects that contain an index (Series/DataFrame) +and those should most likely be used before calling these methods directly.** + + * **values** + +Modifying and Computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.copy + Index.delete + Index.diff + Index.drop + Index.equals + Index.identical + Index.insert + Index.order + Index.reindex + Index.repeat + Index.set_names + Index.unique + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.astype + Index.tolist + Index.to_datetime + Index.to_series + +Sorting +~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.argsort + Index.order + Index.sort + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.shift + +Combining / joining / merging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.append + Index.intersection + Index.join + Index.union + +Selecting +~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.get_indexer + Index.get_indexer_non_unique + Index.get_level_values + Index.get_loc + Index.get_value + Index.isin + Index.slice_indexer + Index.slice_locs + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.is_monotonic + Index.is_numeric + +.. currentmodule:: pandas.tseries.index + +.. 
_api.datetimeindex: + +DatetimeIndex +------------- + +Time/Date Components +~~~~~~~~~~~~~~~~~~~~ + * **year** + * **month** + * **day** + * **hour** + * **minute** + * **second** + * **microsecond** + * **nanosecond** + + * **weekofyear** + * **week**: Same as weekofyear + * **dayofweek**: (0=Monday, 6=Sunday) + * **weekday**: (0=Monday, 6=Sunday) + * **dayofyear** + * **quarter** + + * **date**: Returns date component of Timestamps + * **time**: Returns time component of Timestamps + + +Selecting +~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DatetimeIndex.indexer_at_time + DatetimeIndex.indexer_between_time + + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DatetimeIndex.normalize + DatetimeIndex.snap + DatetimeIndex.tz_convert + DatetimeIndex.tz_localize + + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DatetimeIndex.to_datetime + DatetimeIndex.to_period + DatetimeIndex.to_pydatetime + + diff --git a/doc/source/release.rst b/doc/source/release.rst index 058ea165120a6..fe21aa4326a07 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -160,6 +160,9 @@ Improvements to existing features :issue:`4998`) - ``to_dict`` now takes ``records`` as a possible outtype. Returns an array of column-keyed dictionaries. (:issue:`4936`) + - ``tz_localize`` can infer a fall daylight savings transition based on the + structure of unlocalized data (:issue:`4230`) + - DatetimeIndex is now in the API documentation API Changes ~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 85ac48c379aad..33b16c1448ed8 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1108,6 +1108,20 @@ TimeSeries, aligning the data on the UTC timestamps: .. _timeseries.timedeltas: +In some cases, ``tz_localize`` cannot tell the DST hours from the non-DST hours when there are +duplicates. This often happens when reading files that simply duplicate the hours. 
+The ``infer_dst`` argument of ``tz_localize`` will attempt +to determine the right offset. + +.. ipython:: python + + rng_hourly = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', + '11/06/2011 01:00', '11/06/2011 02:00', + '11/06/2011 03:00']) + rng_hourly.tz_localize('US/Eastern') + rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', infer_dst=True) + rng_hourly_eastern.values + Time Deltas ----------- diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 90d2989de65c2..9cf944895d64f 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -8,7 +8,7 @@ enhancements along with a large number of bug fixes. .. warning:: - In 0.13.0 ``Series`` has internaly been refactored to no longer sub-class ``ndarray`` + In 0.13.0 ``Series`` has internally been refactored to no longer sub-class ``ndarray`` but instead subclass ``NDFrame``, similarly to the rest of the pandas containers. This should be a transparent change with only very limited API implications. See :ref:`Internal Refactoring` @@ -481,6 +481,10 @@ Enhancements :ref:`See the docs` for more. + - ``tz_localize`` can infer a fall daylight savings transition based on the structure + of the unlocalized data (:issue:`4230`), see :ref:`here <timeseries.timezone>` + - DatetimeIndex is now in the API documentation, see :ref:`here <api.datetimeindex>` + .. 
_whatsnew_0130.experimental: Experimental diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3142f74f2f5c5..5ac9d12de8a9a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2752,7 +2752,7 @@ def tz_convert(self, tz, axis=0, copy=True): return new_obj - def tz_localize(self, tz, axis=0, copy=True): + def tz_localize(self, tz, axis=0, copy=True, infer_dst=False): """ Localize tz-naive TimeSeries to target time zone @@ -2761,6 +2761,8 @@ def tz_localize(self, tz, axis=0, copy=True): tz : string or pytz.timezone object copy : boolean, default True Also make a copy of the underlying data + infer_dst : boolean, default False + Attempt to infer fall dst-transition times based on order Returns ------- @@ -2778,7 +2780,7 @@ def tz_localize(self, tz, axis=0, copy=True): new_data = new_data.copy() new_obj = self._constructor(new_data) - new_ax = ax.tz_localize(tz) + new_ax = ax.tz_localize(tz, infer_dst=infer_dst) if axis == 0: new_obj._set_axis(1, new_ax) diff --git a/pandas/core/series.py b/pandas/core/series.py index 79faad93ff1c1..ddbb67cc0c323 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2331,7 +2331,7 @@ def tz_convert(self, tz, copy=True): return self._constructor(new_values, index=new_index, name=self.name) - def tz_localize(self, tz, copy=True): + def tz_localize(self, tz, copy=True, infer_dst=False): """ Localize tz-naive TimeSeries to target time zone Entries will retain their "naive" value but will be annotated as @@ -2345,6 +2345,8 @@ def tz_localize(self, tz, copy=True): tz : string or pytz.timezone object copy : boolean, default True Also make a copy of the underlying data + infer_dst : boolean, default False + Attempt to infer fall dst-transition hours based on order Returns ------- @@ -2358,7 +2360,7 @@ def tz_localize(self, tz, copy=True): new_index = DatetimeIndex([], tz=tz) else: - new_index = self.index.tz_localize(tz) + new_index = self.index.tz_localize(tz, infer_dst=infer_dst) new_values = 
self.values if copy: diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 24e94f4c2d482..281ac0cc8a35a 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -147,6 +147,7 @@ def __new__(cls, data=None, dayfirst = kwds.pop('dayfirst', None) yearfirst = kwds.pop('yearfirst', None) + infer_dst = kwds.pop('infer_dst', False) warn = False if 'offset' in kwds and kwds['offset']: freq = kwds['offset'] @@ -183,7 +184,8 @@ def __new__(cls, data=None, if data is None: return cls._generate(start, end, periods, name, offset, - tz=tz, normalize=normalize) + tz=tz, normalize=normalize, + infer_dst=infer_dst) if not isinstance(data, np.ndarray): if np.isscalar(data): @@ -209,7 +211,7 @@ def __new__(cls, data=None, data.name = name if tz is not None: - return data.tz_localize(tz) + return data.tz_localize(tz, infer_dst=infer_dst) return data @@ -261,7 +263,8 @@ def __new__(cls, data=None, getattr(data, 'tz', None) is None): # Convert tz-naive to UTC ints = subarr.view('i8') - subarr = tslib.tz_localize_to_utc(ints, tz) + subarr = tslib.tz_localize_to_utc(ints, tz, + infer_dst=infer_dst) subarr = subarr.view(_NS_DTYPE) @@ -286,7 +289,7 @@ def __new__(cls, data=None, @classmethod def _generate(cls, start, end, periods, name, offset, - tz=None, normalize=False): + tz=None, normalize=False, infer_dst=False): if com._count_not_none(start, end, periods) != 2: raise ValueError('Must specify two of start, end, or periods') @@ -375,7 +378,8 @@ def _generate(cls, start, end, periods, name, offset, index = _generate_regular_range(start, end, periods, offset) if tz is not None and getattr(index, 'tz', None) is None: - index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz) + index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz, + infer_dst=infer_dst) index = index.view(_NS_DTYPE) index = index.view(cls) @@ -1537,9 +1541,17 @@ def tz_convert(self, tz): # No conversion since timestamps are all UTC to begin with return self._simple_new(self.values, 
self.name, self.offset, tz) - def tz_localize(self, tz): + def tz_localize(self, tz, infer_dst=False): """ Localize tz-naive DatetimeIndex to given time zone (using pytz) + + Parameters + ---------- + tz : string or pytz.timezone + Time zone that the tz-naive timestamps are interpreted in and + localized to + infer_dst : boolean, default False + Attempt to infer fall dst-transition hours based on order Returns ------- @@ -1550,7 +1562,7 @@ def tz_localize(self, tz): tz = tools._maybe_get_tz(tz) # Convert to UTC - new_dates = tslib.tz_localize_to_utc(self.asi8, tz) + new_dates = tslib.tz_localize_to_utc(self.asi8, tz, infer_dst=infer_dst) new_dates = new_dates.view(_NS_DTYPE) return self._simple_new(new_dates, self.name, self.offset, tz) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 80d85241ae0ff..083de95895d18 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -360,6 +360,32 @@ def test_with_tz_ambiguous_times(self): dr = date_range(datetime(2011, 3, 13), periods=48, freq=datetools.Minute(30), tz=pytz.utc) + def test_infer_dst(self): + # November 6, 2011, fall back: the 1 AM hour is repeated + # With no repeated hours, we cannot infer the transition + tz = pytz.timezone('US/Eastern') + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=datetools.Hour()) + self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, + tz, infer_dst=True) + + # With repeated hours, we can infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=datetools.Hour(), tz=tz) + di = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', + '11/06/2011 01:00', '11/06/2011 02:00', + '11/06/2011 03:00']) + localized = di.tz_localize(tz, infer_dst=True) + self.assert_(np.array_equal(dr, localized)) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, + freq=datetools.Hour()) + localized = 
dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, infer_dst=True) + self.assert_(np.array_equal(localized, localized_infer)) + + # test utility methods def test_infer_tz(self): eastern = pytz.timezone('US/Eastern') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index a8c27806c2c1e..5f81389f318f8 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1630,7 +1630,7 @@ cpdef ndarray _unbox_utcoffsets(object transinfo): @cython.boundscheck(False) @cython.wraparound(False) -def tz_localize_to_utc(ndarray[int64_t] vals, object tz): +def tz_localize_to_utc(ndarray[int64_t] vals, object tz, bint infer_dst=False): """ Localize tzinfo-naive DateRange to given time zone (using pytz). If there are ambiguities in the values, raise AmbiguousTimeError. @@ -1644,7 +1644,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz): Py_ssize_t i, idx, pos, ntrans, n = len(vals) int64_t *tdata int64_t v, left, right - ndarray[int64_t] result, result_a, result_b + ndarray[int64_t] result, result_a, result_b, dst_hours pandas_datetimestruct dts # Vectorized version of DstTzInfo.localize @@ -1701,6 +1701,48 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz): # timestamp falls to the right side of the DST transition if v + deltas[pos] == vals[i]: result_b[i] = v + + + if infer_dst: + dst_hours = np.empty(n, dtype=np.int64) + dst_hours.fill(NPY_NAT) + + # Get the ambiguous hours (given the above, these are the hours + # where result_a != result_b and neither of them is NAT) + neither_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT) + both_eq = result_a == result_b + trans_idx = np.squeeze(np.nonzero(np.logical_and(neither_nat, ~both_eq))) + if trans_idx.size == 1: + stamp = Timestamp(vals[trans_idx]) + raise pytz.AmbiguousTimeError("Cannot infer dst time from %s as " + "there are no repeated times" % stamp) + # Split the array into contiguous chunks (where the difference between + # indices is 1). 
These are effectively dst transitions in different years + # which is useful for checking that there is not an ambiguous transition + # in an individual year. + if trans_idx.size > 0: + one_diff = np.where(np.diff(trans_idx)!=1)[0]+1 + trans_grp = np.array_split(trans_idx, one_diff) + + # Iterate through each day, if there are no hours where the delta is negative + # (indicates a repeat of hour) the switch cannot be inferred + for grp in trans_grp: + + delta = np.diff(result_a[grp]) + if grp.size == 1 or np.all(delta>0): + stamp = Timestamp(vals[grp[0]]) + raise pytz.AmbiguousTimeError(stamp) + + # Find the index for the switch and pull from a for dst and b for standard + switch_idx = (delta<=0).nonzero()[0] + if switch_idx.size > 1: + raise pytz.AmbiguousTimeError("There are %i dst switches " + "when there should only be 1." + % switch_idx.size) + switch_idx = switch_idx[0]+1 # Pull the only index and adjust + a_idx = grp[:switch_idx] + b_idx = grp[switch_idx:] + dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) for i in range(n): left = result_a[i] @@ -1709,8 +1751,13 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz): if left == right: result[i] = left else: - stamp = Timestamp(vals[i]) - raise pytz.AmbiguousTimeError(stamp) + if infer_dst and dst_hours[i] != NPY_NAT: + result[i] = dst_hours[i] + else: + stamp = Timestamp(vals[i]) + raise pytz.AmbiguousTimeError("Cannot infer dst time from %r, "\ + "try using the 'infer_dst' argument" + % stamp) elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 353d7afc63cb3..a990a9873cea0 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -225,3 +225,21 @@ def date_range(start=None, end=None, periods=None, freq=None): datetimeindex_unique = Benchmark('index.unique()', setup, start_date=datetime(2012, 7, 1)) + +# tz_localize with infer argument. 
This is an attempt to emulate the results +# of read_csv with duplicated data. Not passing infer_dst will fail +setup = common_setup + """ +dst_rng = date_range('10/29/2000 1:00:00', + '10/29/2000 1:59:59', freq='S') +index = date_range('10/29/2000', '10/29/2000 00:59:59', freq='S') +index = index.append(dst_rng) +index = index.append(dst_rng) +index = index.append(date_range('10/29/2000 2:00:00', + '10/29/2000 3:00:00', freq='S')) +""" + +datetimeindex_infer_dst = \ +Benchmark('index.tz_localize("US/Eastern", infer_dst=True)', + setup, start_date=datetime(2013, 9, 30)) + +