From 68e862c8430cb7b647394358827d880cd54c7771 Mon Sep 17 00:00:00 2001
From: roch
Date: Mon, 9 Mar 2015 21:17:00 -0400
Subject: [PATCH] Fixes to resample over DST boundaries. This requires changes
 to offset classes that were not working correctly across such boundaries, as
 well as adding normalize() on Timestamp.

---
 doc/source/whatsnew/v0.16.0.txt          |   2 +
 pandas/tseries/offsets.py                |  35 ++--
 pandas/tseries/resample.py               |  18 +-
 .../tests/data/dateoffset_0_15_2.pickle  | 187 ++++++++++++++++++
 pandas/tseries/tests/test_offsets.py     |  39 +++-
 pandas/tseries/tests/test_resample.py    |  57 +++++-
 pandas/tslib.pyx                         |   8 +
 7 files changed, 326 insertions(+), 20 deletions(-)
 create mode 100644 pandas/tseries/tests/data/dateoffset_0_15_2.pickle

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index 2e910e32d4dfd..4a68567842711 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -69,6 +69,7 @@ New features
 - Added ``StringMethods.zfill()`` which behave as the same as standard ``str`` (:issue:`9387`)
 - Added ``days_in_month`` (compatibility alias ``daysinmonth``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex``, and ``Series.dt`` (:issue:`9572`)
 - Added ``decimal`` option in ``to_csv`` to provide formatting for non-'.' decimal separators (:issue:`781`)
+- Added ``normalize()`` method to ``Timestamp`` to normalize to midnight (:issue:`8794`)
 
 .. _whatsnew_0160.enhancements.assign:
 
@@ -461,6 +462,7 @@ Bug Fixes
   To reproduce the old behavior, simply add more precision to the label (e.g., use ``2000-02-01`` instead of ``2000-02``).
 - Bug in adding ``offsets.Nano`` to other offets raises ``TypeError`` (:issue:`9284`)
 - Bug in ``DatetimeIndex`` iteration, related to (:issue:`8890`), fixed in (:issue:`9100`)
+- Bug in ``resample`` around DST transitions (:issue:`5172`, :issue:`8744`, :issue:`8653`, :issue:`9173`, :issue:`9468`). This required fixing offset classes so they behave correctly on DST transitions.
 - Bug in binary operator method (eg ``.mul()``) alignment with integer levels (:issue:`9463`).
- Bug in boxplot, scatter and hexbin plot may show an unnecessary warning (:issue:`8877`) - Bug in subplot with ``layout`` kw may show unnecessary warning (:issue:`9464`) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 84449cd2fad98..cb6bd2fb2b250 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -54,12 +54,16 @@ def wrapper(self, other): nano = getattr(other, 'nanosecond', 0) try: - result = func(self, other) + if self._adjust_dst and isinstance(other, Timestamp): + other = other.tz_localize(None) - if self.normalize: - # normalize_date returns normal datetime - result = tslib.normalize_date(result) + result = func(self, other) + if self._adjust_dst: + result = tslib._localize_pydatetime(result, tz) + result = Timestamp(result) + if self.normalize: + result = result.normalize() # nanosecond may be deleted depending on offset process if not self.normalize and nano != 0: @@ -79,7 +83,7 @@ def wrapper(self, other): if self.normalize: # normalize_date returns normal datetime - result = tslib.normalize_date(result) + result = normalize_date(result) if tz is not None and result.tzinfo is None: result = tslib._localize_pydatetime(result, tz) @@ -158,6 +162,7 @@ def __add__(date): 'hour', 'minute', 'second', 'microsecond' ) _use_relativedelta = False + _adjust_dst = False # default for prior pickles normalize = False @@ -380,8 +385,8 @@ def freqstr(self): return fstr - class SingleConstructorOffset(DateOffset): + @classmethod def _from_name(cls, suffix=None): # default _from_name calls cls with no args @@ -389,7 +394,6 @@ def _from_name(cls, suffix=None): raise ValueError("Bad freq suffix %s" % suffix) return cls() - class BusinessMixin(object): """ mixin to business types to provide related functions """ @@ -425,6 +429,7 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): DateOffset subclass representing possibly n business days """ _prefix = 'B' + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = int(n) @@ -685,6 +690,8 @@ def onOffset(self, dt): class MonthOffset(SingleConstructorOffset): + _adjust_dst = True + @property def name(self): if self.isAnchored: @@ -925,7 +932,7 @@ class Week(DateOffset): weekday : int, default None Always generate specific day of week. 
0 for Monday """ - + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = n self.normalize = normalize @@ -1031,7 +1038,9 @@ class WeekOfMonth(DateOffset): 5: Saturdays 6: Sundays """ - + + _adjust_dst = True + def __init__(self, n=1, normalize=False, **kwds): self.n = n self.normalize = normalize @@ -1190,7 +1199,7 @@ class QuarterOffset(DateOffset): _default_startingMonth = None #: default month in _from_name _from_name_startingMonth = None - + _adjust_dst = True # TODO: Consider combining QuarterOffset and YearOffset __init__ at some # point def __init__(self, n=1, normalize=False, **kwds): @@ -1395,7 +1404,7 @@ def apply(self, other): class YearOffset(DateOffset): """DateOffset that just needs a month""" - + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.month = kwds.get('month', self._default_month) @@ -1627,6 +1636,7 @@ class FY5253(DateOffset): _prefix = 'RE' _suffix_prefix_last = 'L' _suffix_prefix_nearest = 'N' + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = n @@ -1848,6 +1858,7 @@ class FY5253Quarter(DateOffset): """ _prefix = 'REQ' + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = n @@ -1966,6 +1977,8 @@ class Easter(DateOffset): the revised method which is valid in years 1583-4099. ''' + _adjust_dst = True + def __init__(self, n=1, **kwds): super(Easter, self).__init__(n, **kwds) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 95d3ff015394a..7607bef0f1d71 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -395,8 +395,8 @@ def _get_range_edges(first, last, offset, closed='left', base=0): if not isinstance(offset, Tick): # and first.time() != last.time(): # hack! - first = tools.normalize_date(first) - last = tools.normalize_date(last) + first = first.normalize() + last = last.normalize() if closed == 'left': first = Timestamp(offset.rollback(first)) @@ -409,7 +409,7 @@ def _get_range_edges(first, last, offset, closed='left', base=0): def _adjust_dates_anchored(first, last, offset, closed='right', base=0): - from pandas.tseries.tools import normalize_date +# from pandas.tseries.tools import normalize_date # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is @@ -417,7 +417,10 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): # # See https://github.com/pydata/pandas/issues/8683 - start_day_nanos = Timestamp(normalize_date(first)).value + first_tzinfo = first.tzinfo + first = first.tz_localize(None) + last = last.tz_localize(None) + start_day_nanos = first.normalize().value base_nanos = (base % offset.n) * offset.nanos // offset.n start_day_nanos += base_nanos @@ -451,8 +454,11 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): else: lresult = last.value + offset.nanos - return (Timestamp(fresult, tz=first.tz), - Timestamp(lresult, tz=last.tz)) +# return (Timestamp(fresult, tz=first.tz), +# Timestamp(lresult, tz=last.tz)) + + return (Timestamp(fresult).tz_localize(first_tzinfo), + Timestamp(lresult).tz_localize(first_tzinfo)) def asfreq(obj, freq, method=None, how=None, normalize=False): diff --git a/pandas/tseries/tests/data/dateoffset_0_15_2.pickle b/pandas/tseries/tests/data/dateoffset_0_15_2.pickle new file mode 100644 index 0000000000000..0dc28fba3d800 --- /dev/null +++ b/pandas/tseries/tests/data/dateoffset_0_15_2.pickle @@ -0,0 +1,187 @@ +(dp0 +S'YearBegin' 
+p1 +ccopy_reg +_reconstructor +p2 +(cpandas.tseries.offsets +YearBegin +p3 +c__builtin__ +object +p4 +Ntp5 +Rp6 +(dp7 +S'normalize' +p8 +I00 +sS'kwds' +p9 +(dp10 +sS'n' +p11 +I1 +sS'_offset' +p12 +cdatetime +timedelta +p13 +(I1 +I0 +I0 +tp14 +Rp15 +sS'month' +p16 +I1 +sS'_use_relativedelta' +p17 +I00 +sbsS'Week' +p18 +g2 +(cpandas.tseries.offsets +Week +p19 +g4 +Ntp20 +Rp21 +(dp22 +g8 +I00 +sS'_inc' +p23 +g13 +(I7 +I0 +I0 +tp24 +Rp25 +sg9 +(dp26 +sS'weekday' +p27 +Nsg11 +I1 +sbsS'MonthBegin' +p28 +g2 +(cpandas.tseries.offsets +MonthBegin +p29 +g4 +Ntp30 +Rp31 +(dp32 +g8 +I00 +sg12 +g13 +(I1 +I0 +I0 +tp33 +Rp34 +sg17 +I00 +sg9 +(dp35 +sg11 +I1 +sbsS'Day' +p36 +g2 +(cpandas.tseries.offsets +Day +p37 +g4 +Ntp38 +Rp39 +(dp40 +g8 +I00 +sg12 +g13 +(I1 +I0 +I0 +tp41 +Rp42 +sg17 +I00 +sg9 +(dp43 +sg11 +I1 +sbsS'DateOffset' +p44 +g2 +(cpandas.tseries.offsets +DateOffset +p45 +g4 +Ntp46 +Rp47 +(dp48 +g8 +I00 +sg12 +g2 +(cdateutil.relativedelta +relativedelta +p49 +g4 +Ntp50 +Rp51 +(dp52 +S'_has_time' +p53 +I0 +sS'hour' +p54 +NsS'seconds' +p55 +I0 +sS'months' +p56 +I0 +sS'year' +p57 +NsS'days' +p58 +I0 +sS'years' +p59 +I1 +sS'hours' +p60 +I0 +sS'second' +p61 +NsS'microsecond' +p62 +Nsg16 +NsS'microseconds' +p63 +I0 +sS'leapdays' +p64 +I0 +sS'minutes' +p65 +I0 +sS'day' +p66 +NsS'minute' +p67 +Nsg27 +Nsbsg17 +I01 +sg9 +(dp68 +g59 +I1 +ssg11 +I1 +sbs. \ No newline at end of file diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 9e49cccc2f218..e4533d0ff9476 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1,7 +1,7 @@ import os from datetime import date, datetime, timedelta from dateutil.relativedelta import relativedelta -from pandas.compat import range +from pandas.compat import range, iteritems from pandas import compat import nose from nose.tools import assert_raises @@ -410,6 +410,18 @@ def test_add(self): self.assertTrue(isinstance(result, Timestamp)) self.assertEqual(result, expected_localize) + def test_pickle_v0_15_2(self): + offsets = {'DateOffset': DateOffset(years=1), + 'MonthBegin': MonthBegin(1), + 'Day': Day(1), + 'YearBegin': YearBegin(1), + 'Week': Week(1)} + pickle_path = os.path.join(tm.get_data_path(), + 'dateoffset_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) + # + self.assertEqual(offsets, read_pickle(pickle_path)) class TestDateOffset(Base): _multiprocess_can_split_ = True @@ -3298,7 +3310,30 @@ def test_springforward_singular(self): tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=None ) - + + def test_all_offset_classes(self): + tests = {MonthBegin: ['11/2/2012', '12/1/2012'], + MonthEnd: ['11/2/2012', '11/30/2012'], + BMonthBegin: ['11/2/2012', '12/3/2012'], + BMonthEnd: ['11/2/2012', '11/30/2012'], + CBMonthBegin: ['11/2/2012', '12/3/2012'], + CBMonthEnd: ['11/2/2012', '11/30/2012'], + Week: ['11/2/2012', '11/9/2012'], + YearBegin: ['11/2/2012', '1/1/2013'], + YearEnd: ['11/2/2012', '12/31/2012'], + BYearBegin: ['11/2/2012', '1/1/2013'], + BYearEnd: ['11/2/2012', '12/31/2012'], + QuarterBegin: ['11/2/2012', '12/1/2012'], + QuarterEnd: ['11/2/2012', '12/31/2012'], + BQuarterBegin: ['11/2/2012', '12/3/2012'], + BQuarterEnd: ['11/2/2012', '12/31/2012'], + Day: ['11/4/2012', '11/4/2012 23:00'] + } + + for offset, test_values in iteritems(tests): + first = Timestamp(test_values[0], tz='US/Eastern') + offset() + second = Timestamp(test_values[1], 
tz='US/Eastern')
+            self.assertEqual(first, second, str(offset))
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 42b09b699b919..3e565d5764fe2 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -871,7 +871,62 @@ def test_resample_timegrouper(self):
         result = df.groupby(pd.Grouper(freq='M', key='A')).count()
         assert_frame_equal(result, expected)
 
-
+    def test_resample_dst_anchor(self):
+        # 5172
+        dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
+        df = DataFrame([5], index=dti)
+        assert_frame_equal(df.resample(rule='D', how='sum'),
+                           DataFrame([5], index=df.index.normalize()))
+        df.resample(rule='MS', how='sum')
+        assert_frame_equal(df.resample(rule='MS', how='sum'),
+                           DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
+                                                              tz='US/Eastern')))
+
+        dti = date_range('2013-09-30', '2013-11-02', freq='30Min', tz='Europe/Paris')
+        values = range(dti.size)
+        df = DataFrame({"a": values, "b": values, "c": values}, index=dti)
+        how = {"a": "min", "b": "max", "c": "count"}
+
+        assert_frame_equal(df.resample("W-MON", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 48, 384, 720, 1056, 1394],
+                                      "b": [47, 383, 719, 1055, 1393, 1586],
+                                      "c": [48, 336, 336, 336, 338, 193]},
+                                     index=date_range('9/30/2013', '11/4/2013',
+                                                      freq='W-MON', tz='Europe/Paris')),
+                           'W-MON Frequency')
+
+        assert_frame_equal(df.resample("2W-MON", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 48, 720, 1394],
+                                      "b": [47, 719, 1393, 1586],
+                                      "c": [48, 672, 674, 193]},
+                                     index=date_range('9/30/2013', '11/11/2013',
+                                                      freq='2W-MON', tz='Europe/Paris')),
+                           '2W-MON Frequency')
+
+        assert_frame_equal(df.resample("MS", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 48, 1538],
+                                      "b": [47, 1537, 1586],
+                                      "c": [48, 1490, 49]},
+                                     index=date_range('9/1/2013', '11/1/2013',
+                                                      freq='MS', tz='Europe/Paris')),
+                           'MS Frequency')
+
+        assert_frame_equal(df.resample("2MS", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 1538],
+                                      "b": [1537, 1586],
+                                      "c": [1538, 49]},
+                                     index=date_range('9/1/2013', '11/1/2013',
+                                                      freq='2MS', tz='Europe/Paris')),
+                           '2MS Frequency')
+
+        df_daily = df['10/26/2013':'10/29/2013']
+        assert_frame_equal(df_daily.resample("D", how={"a": "min", "b": "max", "c": "count"})[["a", "b", "c"]],
+                           DataFrame({"a": [1248, 1296, 1346, 1394],
+                                      "b": [1295, 1345, 1393, 1441],
+                                      "c": [48, 50, 48, 48]},
+                                     index=date_range('10/26/2013', '10/29/2013',
+                                                      freq='D', tz='Europe/Paris')),
+                           'D Frequency')
 
 def _simple_ts(start, end, freq='D'):
     rng = date_range(start, end, freq=freq)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index eee72f268036a..763c06c7d4e7c 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -564,6 +564,14 @@ class Timestamp(_Timestamp):
                           self.nanosecond/3600.0/1e+9
                           )/24.0)
 
+    def normalize(self):
+        """
+        Normalize Timestamp to midnight, preserving
+        tz information.
+        """
+        normalized_value = date_normalize(np.array([self.value]), tz=self.tz)[0]
+        return Timestamp(normalized_value).tz_localize(self.tz)
+
     def __radd__(self, other):
         # __radd__ on cython extension types like _Timestamp is not used, so
         # define it here instead
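
Illustrative usage (editor's sketch, not part of the patch): the snippet below shows the behaviour this change targets, reusing the data from the new test_resample_dst_anchor test and the pandas 0.16-era resample(..., how=...) API; exact reprs may differ between versions.

    import pandas as pd
    from datetime import datetime

    # New Timestamp.normalize(): snap to midnight while keeping the timezone.
    ts = pd.Timestamp('2012-11-04 23:00', tz='US/Eastern')
    ts.normalize()  # -> 2012-11-04 00:00:00-04:00 (midnight of the same day, tz preserved)

    # Daily resampling across the US/Eastern DST transition on 2012-11-04
    # now anchors the bin on the normalized midnight edge.
    dti = pd.DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
    df = pd.DataFrame([5], index=dti)
    df.resample('D', how='sum')  # single bin labelled 2012-11-04 00:00 US/Eastern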