From 68e862c8430cb7b647394358827d880cd54c7771 Mon Sep 17 00:00:00 2001
From: roch
Date: Mon, 9 Mar 2015 21:17:00 -0400
Subject: [PATCH] Fixes to resample over DST boundaries. This requires changes
 to offset classes that were not working correctly across such boundaries, as
 well as adding normalize() on Timestamp.

---
 doc/source/whatsnew/v0.16.0.txt          |   2 +
 pandas/tseries/offsets.py                |  35 ++--
 pandas/tseries/resample.py               |  18 +-
 .../tests/data/dateoffset_0_15_2.pickle  | 187 ++++++++++++++++++
 pandas/tseries/tests/test_offsets.py     |  39 +++-
 pandas/tseries/tests/test_resample.py    |  57 +++++-
 pandas/tslib.pyx                         |   8 +
 7 files changed, 326 insertions(+), 20 deletions(-)
 create mode 100644 pandas/tseries/tests/data/dateoffset_0_15_2.pickle

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index 2e910e32d4dfd..4a68567842711 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -69,6 +69,7 @@ New features
 - Added ``StringMethods.zfill()`` which behave as the same as standard ``str`` (:issue:`9387`)
 - Added ``days_in_month`` (compatibility alias ``daysinmonth``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex``, and ``Series.dt`` (:issue:`9572`)
 - Added ``decimal`` option in ``to_csv`` to provide formatting for non-'.' decimal separators (:issue:`781`)
+- Added ``normalize()`` method to ``Timestamp`` to normalize to midnight (:issue:`8794`)
 
 .. _whatsnew_0160.enhancements.assign:
 
@@ -461,6 +462,7 @@ Bug Fixes
   To reproduce the old behavior, simply add more precision to the label (e.g., use ``2000-02-01`` instead of ``2000-02``).
 - Bug in adding ``offsets.Nano`` to other offets raises ``TypeError`` (:issue:`9284`)
 - Bug in ``DatetimeIndex`` iteration, related to (:issue:`8890`), fixed in (:issue:`9100`)
+- Bug in ``resample`` around DST transitions (:issue:`5172`, :issue:`8744`, :issue:`8653`, :issue:`9173`, :issue:`9468`). This required fixing offset classes so they behave correctly on DST transitions.
 - Bug in binary operator method (eg ``.mul()``) alignment with integer levels (:issue:`9463`).
- Bug in boxplot, scatter and hexbin plot may show an unnecessary warning (:issue:`8877`) - Bug in subplot with ``layout`` kw may show unnecessary warning (:issue:`9464`) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 84449cd2fad98..cb6bd2fb2b250 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -54,12 +54,16 @@ def wrapper(self, other): nano = getattr(other, 'nanosecond', 0) try: - result = func(self, other) + if self._adjust_dst and isinstance(other, Timestamp): + other = other.tz_localize(None) - if self.normalize: - # normalize_date returns normal datetime - result = tslib.normalize_date(result) + result = func(self, other) + if self._adjust_dst: + result = tslib._localize_pydatetime(result, tz) + result = Timestamp(result) + if self.normalize: + result = result.normalize() # nanosecond may be deleted depending on offset process if not self.normalize and nano != 0: @@ -79,7 +83,7 @@ def wrapper(self, other): if self.normalize: # normalize_date returns normal datetime - result = tslib.normalize_date(result) + result = normalize_date(result) if tz is not None and result.tzinfo is None: result = tslib._localize_pydatetime(result, tz) @@ -158,6 +162,7 @@ def __add__(date): 'hour', 'minute', 'second', 'microsecond' ) _use_relativedelta = False + _adjust_dst = False # default for prior pickles normalize = False @@ -380,8 +385,8 @@ def freqstr(self): return fstr - class SingleConstructorOffset(DateOffset): + @classmethod def _from_name(cls, suffix=None): # default _from_name calls cls with no args @@ -389,7 +394,6 @@ def _from_name(cls, suffix=None): raise ValueError("Bad freq suffix %s" % suffix) return cls() - class BusinessMixin(object): """ mixin to business types to provide related functions """ @@ -425,6 +429,7 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): DateOffset subclass representing possibly n business days """ _prefix = 'B' + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = int(n) @@ -685,6 +690,8 @@ def onOffset(self, dt): class MonthOffset(SingleConstructorOffset): + _adjust_dst = True + @property def name(self): if self.isAnchored: @@ -925,7 +932,7 @@ class Week(DateOffset): weekday : int, default None Always generate specific day of week. 
0 for Monday """ - + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = n self.normalize = normalize @@ -1031,7 +1038,9 @@ class WeekOfMonth(DateOffset): 5: Saturdays 6: Sundays """ - + + _adjust_dst = True + def __init__(self, n=1, normalize=False, **kwds): self.n = n self.normalize = normalize @@ -1190,7 +1199,7 @@ class QuarterOffset(DateOffset): _default_startingMonth = None #: default month in _from_name _from_name_startingMonth = None - + _adjust_dst = True # TODO: Consider combining QuarterOffset and YearOffset __init__ at some # point def __init__(self, n=1, normalize=False, **kwds): @@ -1395,7 +1404,7 @@ def apply(self, other): class YearOffset(DateOffset): """DateOffset that just needs a month""" - + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.month = kwds.get('month', self._default_month) @@ -1627,6 +1636,7 @@ class FY5253(DateOffset): _prefix = 'RE' _suffix_prefix_last = 'L' _suffix_prefix_nearest = 'N' + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = n @@ -1848,6 +1858,7 @@ class FY5253Quarter(DateOffset): """ _prefix = 'REQ' + _adjust_dst = True def __init__(self, n=1, normalize=False, **kwds): self.n = n @@ -1966,6 +1977,8 @@ class Easter(DateOffset): the revised method which is valid in years 1583-4099. ''' + _adjust_dst = True + def __init__(self, n=1, **kwds): super(Easter, self).__init__(n, **kwds) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 95d3ff015394a..7607bef0f1d71 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -395,8 +395,8 @@ def _get_range_edges(first, last, offset, closed='left', base=0): if not isinstance(offset, Tick): # and first.time() != last.time(): # hack! - first = tools.normalize_date(first) - last = tools.normalize_date(last) + first = first.normalize() + last = last.normalize() if closed == 'left': first = Timestamp(offset.rollback(first)) @@ -409,7 +409,7 @@ def _get_range_edges(first, last, offset, closed='left', base=0): def _adjust_dates_anchored(first, last, offset, closed='right', base=0): - from pandas.tseries.tools import normalize_date +# from pandas.tseries.tools import normalize_date # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is @@ -417,7 +417,10 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): # # See https://github.com/pydata/pandas/issues/8683 - start_day_nanos = Timestamp(normalize_date(first)).value + first_tzinfo = first.tzinfo + first = first.tz_localize(None) + last = last.tz_localize(None) + start_day_nanos = first.normalize().value base_nanos = (base % offset.n) * offset.nanos // offset.n start_day_nanos += base_nanos @@ -451,8 +454,11 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): else: lresult = last.value + offset.nanos - return (Timestamp(fresult, tz=first.tz), - Timestamp(lresult, tz=last.tz)) +# return (Timestamp(fresult, tz=first.tz), +# Timestamp(lresult, tz=last.tz)) + + return (Timestamp(fresult).tz_localize(first_tzinfo), + Timestamp(lresult).tz_localize(first_tzinfo)) def asfreq(obj, freq, method=None, how=None, normalize=False): diff --git a/pandas/tseries/tests/data/dateoffset_0_15_2.pickle b/pandas/tseries/tests/data/dateoffset_0_15_2.pickle new file mode 100644 index 0000000000000..0dc28fba3d800 --- /dev/null +++ b/pandas/tseries/tests/data/dateoffset_0_15_2.pickle @@ -0,0 +1,187 @@ +(dp0 +S'YearBegin' 
+p1 +ccopy_reg +_reconstructor +p2 +(cpandas.tseries.offsets +YearBegin +p3 +c__builtin__ +object +p4 +Ntp5 +Rp6 +(dp7 +S'normalize' +p8 +I00 +sS'kwds' +p9 +(dp10 +sS'n' +p11 +I1 +sS'_offset' +p12 +cdatetime +timedelta +p13 +(I1 +I0 +I0 +tp14 +Rp15 +sS'month' +p16 +I1 +sS'_use_relativedelta' +p17 +I00 +sbsS'Week' +p18 +g2 +(cpandas.tseries.offsets +Week +p19 +g4 +Ntp20 +Rp21 +(dp22 +g8 +I00 +sS'_inc' +p23 +g13 +(I7 +I0 +I0 +tp24 +Rp25 +sg9 +(dp26 +sS'weekday' +p27 +Nsg11 +I1 +sbsS'MonthBegin' +p28 +g2 +(cpandas.tseries.offsets +MonthBegin +p29 +g4 +Ntp30 +Rp31 +(dp32 +g8 +I00 +sg12 +g13 +(I1 +I0 +I0 +tp33 +Rp34 +sg17 +I00 +sg9 +(dp35 +sg11 +I1 +sbsS'Day' +p36 +g2 +(cpandas.tseries.offsets +Day +p37 +g4 +Ntp38 +Rp39 +(dp40 +g8 +I00 +sg12 +g13 +(I1 +I0 +I0 +tp41 +Rp42 +sg17 +I00 +sg9 +(dp43 +sg11 +I1 +sbsS'DateOffset' +p44 +g2 +(cpandas.tseries.offsets +DateOffset +p45 +g4 +Ntp46 +Rp47 +(dp48 +g8 +I00 +sg12 +g2 +(cdateutil.relativedelta +relativedelta +p49 +g4 +Ntp50 +Rp51 +(dp52 +S'_has_time' +p53 +I0 +sS'hour' +p54 +NsS'seconds' +p55 +I0 +sS'months' +p56 +I0 +sS'year' +p57 +NsS'days' +p58 +I0 +sS'years' +p59 +I1 +sS'hours' +p60 +I0 +sS'second' +p61 +NsS'microsecond' +p62 +Nsg16 +NsS'microseconds' +p63 +I0 +sS'leapdays' +p64 +I0 +sS'minutes' +p65 +I0 +sS'day' +p66 +NsS'minute' +p67 +Nsg27 +Nsbsg17 +I01 +sg9 +(dp68 +g59 +I1 +ssg11 +I1 +sbs. \ No newline at end of file diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 9e49cccc2f218..e4533d0ff9476 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1,7 +1,7 @@ import os from datetime import date, datetime, timedelta from dateutil.relativedelta import relativedelta -from pandas.compat import range +from pandas.compat import range, iteritems from pandas import compat import nose from nose.tools import assert_raises @@ -410,6 +410,18 @@ def test_add(self): self.assertTrue(isinstance(result, Timestamp)) self.assertEqual(result, expected_localize) + def test_pickle_v0_15_2(self): + offsets = {'DateOffset': DateOffset(years=1), + 'MonthBegin': MonthBegin(1), + 'Day': Day(1), + 'YearBegin': YearBegin(1), + 'Week': Week(1)} + pickle_path = os.path.join(tm.get_data_path(), + 'dateoffset_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) + # + self.assertEqual(offsets, read_pickle(pickle_path)) class TestDateOffset(Base): _multiprocess_can_split_ = True @@ -3298,7 +3310,30 @@ def test_springforward_singular(self): tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=None ) - + + def test_all_offset_classes(self): + tests = {MonthBegin: ['11/2/2012', '12/1/2012'], + MonthEnd: ['11/2/2012', '11/30/2012'], + BMonthBegin: ['11/2/2012', '12/3/2012'], + BMonthEnd: ['11/2/2012', '11/30/2012'], + CBMonthBegin: ['11/2/2012', '12/3/2012'], + CBMonthEnd: ['11/2/2012', '11/30/2012'], + Week: ['11/2/2012', '11/9/2012'], + YearBegin: ['11/2/2012', '1/1/2013'], + YearEnd: ['11/2/2012', '12/31/2012'], + BYearBegin: ['11/2/2012', '1/1/2013'], + BYearEnd: ['11/2/2012', '12/31/2012'], + QuarterBegin: ['11/2/2012', '12/1/2012'], + QuarterEnd: ['11/2/2012', '12/31/2012'], + BQuarterBegin: ['11/2/2012', '12/3/2012'], + BQuarterEnd: ['11/2/2012', '12/31/2012'], + Day: ['11/4/2012', '11/4/2012 23:00'] + } + + for offset, test_values in iteritems(tests): + first = Timestamp(test_values[0], tz='US/Eastern') + offset() + second = Timestamp(test_values[1], 
tz='US/Eastern')
+            self.assertEqual(first, second, str(offset))
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 42b09b699b919..3e565d5764fe2 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -871,7 +871,62 @@ def test_resample_timegrouper(self):
         result = df.groupby(pd.Grouper(freq='M', key='A')).count()
         assert_frame_equal(result, expected)
 
-
+    def test_resample_dst_anchor(self):
+        # 5172
+        dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
+        df = DataFrame([5], index=dti)
+        assert_frame_equal(df.resample(rule='D', how='sum'),
+                           DataFrame([5], index=df.index.normalize()))
+        df.resample(rule='MS', how='sum')
+        assert_frame_equal(df.resample(rule='MS', how='sum'),
+                           DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
+                                                              tz='US/Eastern')))
+
+        dti = date_range('2013-09-30', '2013-11-02', freq='30Min', tz='Europe/Paris')
+        values = range(dti.size)
+        df = DataFrame({"a": values, "b": values, "c": values}, index=dti)
+        how = {"a": "min", "b": "max", "c": "count"}
+
+        assert_frame_equal(df.resample("W-MON", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 48, 384, 720, 1056, 1394],
+                                      "b": [47, 383, 719, 1055, 1393, 1586],
+                                      "c": [48, 336, 336, 336, 338, 193]},
+                                     index=date_range('9/30/2013', '11/4/2013',
+                                                      freq='W-MON', tz='Europe/Paris')),
+                           'W-MON Frequency')
+
+        assert_frame_equal(df.resample("2W-MON", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 48, 720, 1394],
+                                      "b": [47, 719, 1393, 1586],
+                                      "c": [48, 672, 674, 193]},
+                                     index=date_range('9/30/2013', '11/11/2013',
+                                                      freq='2W-MON', tz='Europe/Paris')),
+                           '2W-MON Frequency')
+
+        assert_frame_equal(df.resample("MS", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 48, 1538],
+                                      "b": [47, 1537, 1586],
+                                      "c": [48, 1490, 49]},
+                                     index=date_range('9/1/2013', '11/1/2013',
+                                                      freq='MS', tz='Europe/Paris')),
+                           'MS Frequency')
+
+        assert_frame_equal(df.resample("2MS", how=how)[["a", "b", "c"]],
+                           DataFrame({"a": [0, 1538],
+                                      "b": [1537, 1586],
+                                      "c": [1538, 49]},
+                                     index=date_range('9/1/2013', '11/1/2013',
+                                                      freq='2MS', tz='Europe/Paris')),
+                           '2MS Frequency')
+
+        df_daily = df['10/26/2013':'10/29/2013']
+        assert_frame_equal(df_daily.resample("D", how={"a": "min", "b": "max", "c": "count"})[["a", "b", "c"]],
+                           DataFrame({"a": [1248, 1296, 1346, 1394],
+                                      "b": [1295, 1345, 1393, 1441],
+                                      "c": [48, 50, 48, 48]},
+                                     index=date_range('10/26/2013', '10/29/2013',
+                                                      freq='D', tz='Europe/Paris')),
+                           'D Frequency')
 
 def _simple_ts(start, end, freq='D'):
     rng = date_range(start, end, freq=freq)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index eee72f268036a..763c06c7d4e7c 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -564,6 +564,14 @@ class Timestamp(_Timestamp):
                           self.nanosecond/3600.0/1e+9
                           )/24.0)
 
+    def normalize(self):
+        """
+        Normalize Timestamp to midnight, preserving
+        tz information.
+        """
+        normalized_value = date_normalize(np.array([self.value]), tz=self.tz)[0]
+        return Timestamp(normalized_value).tz_localize(self.tz)
+
     def __radd__(self, other):
         # __radd__ on cython extension types like _Timestamp is not used, so
         # define it here instead
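
Illustrative usage (editor's sketch, not part of the patch): the snippet below shows the behaviour this change targets, reusing the data from the new test_resample_dst_anchor test and the pandas 0.16-era resample(..., how=...) API; exact reprs may differ between versions.

    import pandas as pd
    from datetime import datetime

    # New Timestamp.normalize(): snap to midnight while keeping the timezone.
    ts = pd.Timestamp('2012-11-04 23:00', tz='US/Eastern')
    ts.normalize()  # -> 2012-11-04 00:00:00-04:00 (midnight of the same day, tz preserved)

    # Daily resampling across the US/Eastern DST transition on 2012-11-04
    # now anchors the bin on the normalized midnight edge.
    dti = pd.DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
    df = pd.DataFrame([5], index=dti)
    df.resample('D', how='sum')  # single bin labelled 2012-11-04 00:00 US/Eastern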