From f868ee884f01c8430a9439a92f5cb4e8a849a5af Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 19 Feb 2018 09:50:06 -0800 Subject: [PATCH 1/3] BUG: DataFrame.diff(axis=0) with DatetimeTZ data add whatsnew clarify comment Add additional tests move diff into its own function in DatetimeTZBlock Use correct placement fix failing test formatting --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/internals.py | 12 ++++++++++++ pandas/tests/frame/test_timeseries.py | 23 +++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 90ce6b47728fb..ac527387ae6a1 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -833,6 +833,7 @@ Timezones - Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) - Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) - Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`) +- Bug in :func:`DataFrame.diff` that raised an ``IndexError`` with tz-aware values (:issue:`18578`) Offsets ^^^^^^^ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 00ef8f9cef598..9ba60f540cf0b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2905,6 +2905,18 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] + def diff(self, n, axis=0, mgr=None): + """1st discrete difference""" + if axis == 0: + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + # 
Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype('timedelta64[ns]') + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index e1bc310e1e934..5114815ddf178 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -57,6 +57,29 @@ def test_diff(self): 1), 'z': pd.Series(1)}).astype('float64') assert_frame_equal(result, expected) + @pytest.mark.parametrize('axis', [0, 1]) + @pytest.mark.parametrize('tz', [None, 'UTC']) + def test_diff_datetime(self, axis, tz): + # GH 18578 + df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), + 1: date_range('2010', freq='D', periods=2, tz=tz)}) + if axis == 1: + if tz is None: + result = df.diff(axis=axis) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), + 1: pd.TimedeltaIndex(['0 days', + '0 days'])}) + assert_frame_equal(result, expected) + else: + with pytest.raises(NotImplementedError): + result = df.diff(axis=axis) + + else: + result = df.diff(axis=axis) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), + 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + assert_frame_equal(result, expected) + def test_diff_timedelta(self): # GH 4533 df = DataFrame(dict(time=[Timestamp('20130101 9:01'), From 63c10015bc3e03ce7675fc46ebf1d43ff40278bc Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 26 Feb 2018 15:06:51 -0800 Subject: [PATCH 2/3] Split test and add docstring --- pandas/core/internals.py | 18 +++++++++++++- pandas/tests/frame/test_timeseries.py | 35 +++++++++++++++------------ 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9ba60f540cf0b..a08e78ad00a0d 
100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2906,7 +2906,23 @@ def shift(self, periods, axis=0, mgr=None): placement=self.mgr_locs)] def diff(self, n, axis=0, mgr=None): - """1st discrete difference""" + """1st discrete difference + + Parameters + ---------- + n : int, number of periods to diff + axis : int, axis to diff upon. default 0 + mgr : default None + + Return + ------ + A list with a new TimeDeltaBlock. + + Note + ---- + The arguments here are mimicking shift so they are called correctly + by apply. + """ if axis == 0: # Cannot currently calculate diff across multiple blocks since this # function is invoked via apply diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 5114815ddf178..ceb6c942c81b1 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -57,28 +57,31 @@ def test_diff(self): 1), 'z': pd.Series(1)}).astype('float64') assert_frame_equal(result, expected) - @pytest.mark.parametrize('axis', [0, 1]) @pytest.mark.parametrize('tz', [None, 'UTC']) - def test_diff_datetime(self, axis, tz): + def test_diff_datetime_axis0(self, tz): # GH 18578 df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), 1: date_range('2010', freq='D', periods=2, tz=tz)}) - if axis == 1: - if tz is None: - result = df.diff(axis=axis) - expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), - 1: pd.TimedeltaIndex(['0 days', - '0 days'])}) - assert_frame_equal(result, expected) - else: - with pytest.raises(NotImplementedError): - result = df.diff(axis=axis) - else: - result = df.diff(axis=axis) - expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), - 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + result = df.diff(axis=0) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), + 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('tz', [None, 'UTC']) + def 
test_diff_datetime_axis1(self, tz): + # GH 18578 + df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), + 1: date_range('2010', freq='D', periods=2, tz=tz)}) + if tz is None: + result = df.diff(axis=1) + expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), + 1: pd.TimedeltaIndex(['0 days', + '0 days'])}) assert_frame_equal(result, expected) + else: + with pytest.raises(NotImplementedError): + result = df.diff(axis=1) def test_diff_timedelta(self): # GH 4533 From c77f5aa4338d87a4ed9df77752ecf8cbb8b1a82c Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 28 Feb 2018 10:49:23 -0800 Subject: [PATCH 3/3] add blank line --- pandas/core/internals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a08e78ad00a0d..240c9b1f3377c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2928,6 +2928,7 @@ def diff(self, n, axis=0, mgr=None): # function is invoked via apply raise NotImplementedError new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + # Reshape the new_values like how algos.diff does for timedelta data new_values = new_values.reshape(1, len(new_values)) new_values = new_values.astype('timedelta64[ns]')