From 36a251e432ae48b371043478136ff6d65aedab29 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 4 Dec 2018 09:48:53 -0800 Subject: [PATCH 1/6] fix mutation of DTI backing Series/DataFrame --- pandas/core/internals/blocks.py | 9 ++++++- pandas/tests/frame/test_block_internals.py | 31 ++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 198e832ca4603..90db2bb95c0f6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -860,6 +860,11 @@ def setitem(self, indexer, value): b = self.astype(dtype) return b.setitem(indexer, value) + if (self._holder is not None and + issubclass(self._holder, ABCIndexClass)): + # avoid alterting Index objects in place + values = values.copy() + # value must be storeable at this moment arr_value = np.array(value) @@ -2923,7 +2928,9 @@ def _try_coerce_result(self, result): # allow passing of > 1dim if its trivial if result.ndim > 1: result = result.reshape(np.prod(result.shape)) - result = self.values._shallow_copy(result) + + # new values invalidates a frequency + result = self.values._shallow_copy(result, freq=None) return result diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 224e56777f6b4..631d63dc28f11 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -28,6 +28,37 @@ class TestDataFrameBlockInternals(): + def test_setitem_invalidates_datetime_index_freq(self): + # altering a datetime64tz column inplace invalidates the `freq` + # attribute on the underlying DatetimeIndex + + df = DataFrame({'B': date_range('20130101', periods=3, + tz='US/Eastern')}) + assert df['B']._values.freq == 'D' + + df.iloc[1, 0] = pd.NaT + assert df['B']._values.freq is None + + ser = Series(date_range('20130101', periods=3, + tz='US/Eastern')) + ts = ser[1] + dti = ser._values + assert dti.freq == 'D' + ser.iloc[1] = pd.NaT + assert ser._values.freq is None + + # check that the DatetimeIndex was not altered in place + assert ser._values is not dti + assert dti[1] == ts + + def test_dt64tz_setitem_does_not_mutate_dti(self): + # GH#21907 + dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') + ts = dti[0] + ser = pd.Series(dti) + ser[::3] = pd.NaT + assert ser[0] is pd.NaT + assert dti[0] == ts def test_cast_internals(self, float_frame): casted = DataFrame(float_frame._data, dtype=int) From 03f4eb4e14111cdf1ee541a92bea0d65ef9659bc Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 4 Dec 2018 09:50:15 -0800 Subject: [PATCH 2/6] typo fixup --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 90db2bb95c0f6..d897c4b3c5509 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -862,7 +862,7 @@ def setitem(self, indexer, value): if (self._holder is not None and issubclass(self._holder, ABCIndexClass)): - # avoid alterting Index objects in place + # avoid altering Index objects in place values = values.copy() # value must be storeable at this moment From 616aaf78f0389531faedf4d6e6e678c66904dae1 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 4 Dec 2018 09:53:31 -0800 Subject: [PATCH 3/6] GH references --- pandas/core/internals/blocks.py | 4 ++-- pandas/tests/frame/test_block_internals.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d897c4b3c5509..b323fad197911 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -862,7 +862,7 @@ def setitem(self, indexer, value): if (self._holder is not None and issubclass(self._holder, ABCIndexClass)): - # avoid altering Index objects in place + # GH#24096 avoid altering Index objects in place values = values.copy() # value must be storeable at this moment @@ -2929,7 +2929,7 @@ def _try_coerce_result(self, result): if result.ndim > 1: result = result.reshape(np.prod(result.shape)) - # new values invalidates a frequency + # GH#24096 new values invalidates a frequency result = self.values._shallow_copy(result, freq=None) return result diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 631d63dc28f11..b3da1665b04eb 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -29,8 +29,8 @@ class TestDataFrameBlockInternals(): def test_setitem_invalidates_datetime_index_freq(self): - # altering a datetime64tz column inplace invalidates the `freq` - # attribute on the underlying DatetimeIndex + # GH#24096 altering a datetime64tz column inplace invalidates the + # `freq` attribute on the underlying DatetimeIndex df = DataFrame({'B': date_range('20130101', periods=3, tz='US/Eastern')}) @@ -52,7 +52,7 @@ def test_setitem_invalidates_datetime_index_freq(self): assert dti[1] == ts def test_dt64tz_setitem_does_not_mutate_dti(self): - # GH#21907 + # GH#21907, GH#24096 dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') ts = dti[0] ser = pd.Series(dti) From 841c4a5f8b33507c7e3ee7e1ca0ca3439785a65e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 4 Dec 2018 11:54:42 -0800 Subject: [PATCH 4/6] only copy at init-time --- pandas/core/frame.py | 7 +++++- pandas/core/internals/blocks.py | 5 ----- pandas/core/series.py | 6 ++++- pandas/tests/frame/test_block_internals.py | 26 +++++++++++++++++----- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b9f32042924b9..c24386cabb3ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -470,6 +470,7 @@ def _init_dict(self, data, index, columns, dtype=None): Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ + if columns is not None: arrays = Series(data, index=columns, dtype=object) data_names = arrays.index @@ -494,6 +495,11 @@ def _init_dict(self, data, index, columns, dtype=None): arrays.loc[missing] = [v] * missing.sum() else: + for key in data: + if isinstance(data[key], ABCIndexClass): + # GH#24096 need copy to be deep for datetime64tz case + data[key] = data[key].copy(deep=True) + keys = com.dict_keys_to_ordered_list(data) columns = data_names = Index(keys) arrays = [data[k] for k in keys] @@ -556,7 +562,6 @@ def _get_axes(N, K, index=index, columns=columns): # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) - if dtype is not None: if not is_dtype_equal(values.dtype, dtype): try: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b323fad197911..9c2d4cd5729d2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -860,11 +860,6 @@ def setitem(self, indexer, value): b = self.astype(dtype) return b.setitem(indexer, value) - if (self._holder is not None and - issubclass(self._holder, ABCIndexClass)): - # GH#24096 avoid altering Index objects in place - values = values.copy() - # value must be storeable at this moment arr_value = np.array(value) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8d4d7677cca44..4ead10636c777 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,8 @@ is_integer, is_integer_dtype, is_iterator, is_list_like, is_object_dtype, is_scalar, is_string_like, is_timedelta64_dtype, pandas_dtype) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) + ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCSeries, ABCSparseArray, + ABCSparseSeries) from pandas.core.dtypes.missing import ( isna, na_value_for_dtype, notna, remove_na_arraylike) @@ -189,6 +190,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, else: # need to copy to avoid aliasing issues data = data._values.copy() + if isinstance(data, ABCDatetimeIndex): + # GH#24096 need copy to be deep for datetime64tz case + data = data._values.copy(deep=True) copy = False elif isinstance(data, np.ndarray): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b3da1665b04eb..16f49c11e2216 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -32,30 +32,44 @@ def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 altering a datetime64tz column inplace invalidates the # `freq` attribute on the underlying DatetimeIndex - df = DataFrame({'B': date_range('20130101', periods=3, - tz='US/Eastern')}) + dti = date_range('20130101', periods=3, tz='US/Eastern') + ts = dti[1] + + df = DataFrame({'B': dti}) assert df['B']._values.freq == 'D' df.iloc[1, 0] = pd.NaT assert df['B']._values.freq is None - ser = Series(date_range('20130101', periods=3, - tz='US/Eastern')) - ts = ser[1] - dti = ser._values + # check that the DatetimeIndex was not altered in place + assert dti.freq == 'D' + assert dti[1] == ts + + dti = date_range('20130101', periods=3, tz='US/Eastern') + ts = dti[1] + ser = Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data.base assert dti.freq == 'D' ser.iloc[1] = pd.NaT assert ser._values.freq is None # check that the DatetimeIndex was not altered in place assert ser._values is not dti + assert ser._values._data.base is not dti._data.base assert dti[1] == ts + assert dti.freq == 'D' def test_dt64tz_setitem_does_not_mutate_dti(self): # GH#21907, GH#24096 dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') ts = dti[0] ser = pd.Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data.base + assert ser._data.blocks[0].values is not dti + assert ser._data.blocks[0].values._data.base is not dti._data.base + ser[::3] = pd.NaT assert ser[0] is pd.NaT assert dti[0] == ts From a66ee4e7b5f962cf16aa0d2628c18767eeece526 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 5 Dec 2018 10:14:45 -0800 Subject: [PATCH 5/6] implement test_block_internals for series --- pandas/tests/frame/test_block_internals.py | 29 -------------- pandas/tests/series/test_block_internals.py | 42 +++++++++++++++++++++ 2 files changed, 42 insertions(+), 29 deletions(-) create mode 100644 pandas/tests/series/test_block_internals.py diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 16f49c11e2216..647077a0428f3 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -45,35 +45,6 @@ def test_setitem_invalidates_datetime_index_freq(self): assert dti.freq == 'D' assert dti[1] == ts - dti = date_range('20130101', periods=3, tz='US/Eastern') - ts = dti[1] - ser = Series(dti) - assert ser._values is not dti - assert ser._values._data.base is not dti._data.base - assert dti.freq == 'D' - ser.iloc[1] = pd.NaT - assert ser._values.freq is None - - # check that the DatetimeIndex was not altered in place - assert ser._values is not dti - assert ser._values._data.base is not dti._data.base - assert dti[1] == ts - assert dti.freq == 'D' - - def test_dt64tz_setitem_does_not_mutate_dti(self): - # GH#21907, GH#24096 - dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') - ts = dti[0] - ser = pd.Series(dti) - assert ser._values is not dti - assert ser._values._data.base is not dti._data.base - assert ser._data.blocks[0].values is not dti - assert ser._data.blocks[0].values._data.base is not dti._data.base - - ser[::3] = pd.NaT - assert ser[0] is pd.NaT - assert dti[0] == ts - def test_cast_internals(self, float_frame): casted = DataFrame(float_frame._data, dtype=int) expected = DataFrame(float_frame._series, dtype=int) diff --git a/pandas/tests/series/test_block_internals.py b/pandas/tests/series/test_block_internals.py new file mode 100644 index 0000000000000..ccfb169cc2f8d --- /dev/null +++ b/pandas/tests/series/test_block_internals.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- + +import pandas as pd + +# Segregated collection of methods that require the BlockManager internal data +# structure + + +class TestSeriesBlockInternals(object): + + def test_setitem_invalidates_datetime_index_freq(self): + # GH#24096 altering a datetime64tz Series inplace invalidates the + # `freq` attribute on the underlying DatetimeIndex + + dti = pd.date_range('20130101', periods=3, tz='US/Eastern') + ts = dti[1] + ser = pd.Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data.base + assert dti.freq == 'D' + ser.iloc[1] = pd.NaT + assert ser._values.freq is None + + # check that the DatetimeIndex was not altered in place + assert ser._values is not dti + assert ser._values._data.base is not dti._data.base + assert dti[1] == ts + assert dti.freq == 'D' + + def test_dt64tz_setitem_does_not_mutate_dti(self): + # GH#21907, GH#24096 + dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') + ts = dti[0] + ser = pd.Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data.base + assert ser._data.blocks[0].values is not dti + assert ser._data.blocks[0].values._data.base is not dti._data.base + + ser[::3] = pd.NaT + assert ser[0] is pd.NaT + assert dti[0] == ts From 4851ffa3981c41c3f8bf5832a522771f906ff763 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 5 Dec 2018 10:17:30 -0800 Subject: [PATCH 6/6] only copy if tz is not None, TODO comments --- pandas/core/internals/construction.py | 4 +++- pandas/core/series.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5e2e3effc1083..910690a986c1c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -198,8 +198,10 @@ def init_dict(data, index, columns, dtype=None): else: for key in data: - if isinstance(data[key], ABCIndexClass): + if (isinstance(data[key], ABCDatetimeIndex) and + data[key].tz is not None): # GH#24096 need copy to be deep for datetime64tz case + # TODO: See if we can avoid these copies data[key] = data[key].copy(deep=True) keys = com.dict_keys_to_ordered_list(data) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7ad983f983c31..6f5ab43ff6756 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -182,8 +182,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None, else: # need to copy to avoid aliasing issues data = data._values.copy() - if isinstance(data, ABCDatetimeIndex): + if (isinstance(data, ABCDatetimeIndex) and + data.tz is not None): # GH#24096 need copy to be deep for datetime64tz case + # TODO: See if we can avoid these copies data = data._values.copy(deep=True) copy = False