BUG: fix mutation of DTI backing Series/DataFrame #24096

Merged · 8 commits · Dec 5, 2018
7 changes: 6 additions & 1 deletion pandas/core/frame.py
@@ -470,6 +470,7 @@ def _init_dict(self, data, index, columns, dtype=None):
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
"""

if columns is not None:
arrays = Series(data, index=columns, dtype=object)
data_names = arrays.index
@@ -494,6 +495,11 @@ arrays.loc[missing] = [v] * missing.sum()
arrays.loc[missing] = [v] * missing.sum()

else:
for key in data:
if isinstance(data[key], ABCIndexClass):
Contributor

really? we need to deep copy everything?

Member Author

I can be more specific and just make this ABCDatetimeIndex, but I don't see any solution that doesn't involve making a copy here.

The first thing I tried was to make a copy in Block.setitem (circa line 923 in internals.blocks) if we had a DatetimeIndex. But that breaks a test because it means when we call ser.__setitem__ (or ser.loc.__setitem__, or...), we fail to change other Series that are views on ser.

AFAICT the only way to do this is to avoid ever having a DatetimeIndex and a Series/column share data. This may be simpler to resolve after Blocks are backed by DatetimeArray instead of DatetimeIndex (I'm not 100% sure on this), but ATM this is a blocker to #24074, which is itself a blocker to DatetimeArray. We could avoid having this be a blocker for #24074 if you were OK with the pickle workaround currently there.
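
For reference, the view behavior described here looks like this from the user side (a minimal sketch assuming the pre-copy-on-write semantics pandas had at the time; not code from this PR):

```python
import numpy as np
import pandas as pd

ser = pd.Series(np.arange(5))
view = ser[:3]          # basic slicing shares the underlying buffer
ser.iloc[0] = 99        # in-place write through ser
print(view.iloc[0])     # 99 -- views on ser must see the change,
                        # which a copy in Block.setitem would break
```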

Contributor

No, the pickle workaround is much worse. For now let's just do this for DTI w/tz only; otherwise this may be a lot of copying. Please add a TODO around this, though.

# GH#24096 need copy to be deep for datetime64tz case
data[key] = data[key].copy(deep=True)

keys = com.dict_keys_to_ordered_list(data)
columns = data_names = Index(keys)
arrays = [data[k] for k in keys]
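
A minimal sketch of the narrower check the reviewer asks for above (deep-copying only tz-aware DatetimeIndex). This is hypothetical, not part of the diff; the standalone helper and its name are mine and only illustrate the condition that would replace the blanket ABCIndexClass check:

```python
import pandas as pd
from pandas.core.dtypes.generic import ABCDatetimeIndex

def needs_deep_copy(obj):
    # Per the review, the mutation problem (GH#24096) is specific to
    # datetime64tz, so only tz-aware DatetimeIndex would need copy(deep=True).
    return isinstance(obj, ABCDatetimeIndex) and obj.tz is not None

naive = pd.date_range("2018-01-01", periods=3)
aware = pd.date_range("2018-01-01", periods=3, tz="US/Eastern")
print(needs_deep_copy(naive))   # False
print(needs_deep_copy(aware))   # True
```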
@@ -556,7 +562,6 @@ def _get_axes(N, K, index=index, columns=columns):
# by definition an array here
# the dtypes will be coerced to a single dtype
values = _prep_ndarray(values, copy=copy)

if dtype is not None:
if not is_dtype_equal(values.dtype, dtype):
try:
4 changes: 3 additions & 1 deletion pandas/core/internals/blocks.py
@@ -2923,7 +2923,9 @@ def _try_coerce_result(self, result):
# allow passing of > 1dim if its trivial
if result.ndim > 1:
result = result.reshape(np.prod(result.shape))
result = self.values._shallow_copy(result)

# GH#24096 new values invalidates a frequency
result = self.values._shallow_copy(result, freq=None)

return result
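
The freq=None in the new call matters because coerced results are generally no longer evenly spaced. A small user-level illustration of the same principle, using standard DatetimeIndex behavior rather than code from this PR:

```python
import pandas as pd

dti = pd.date_range("2013-01-01", periods=4, freq="D")
print(dti.freq)            # <Day>

# Dropping a middle element breaks the regular daily spacing,
# so the resulting index no longer advertises a freq.
print(dti.delete(1).freq)  # None
```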

6 changes: 5 additions & 1 deletion pandas/core/series.py
@@ -29,7 +29,8 @@
is_integer, is_integer_dtype, is_iterator, is_list_like, is_object_dtype,
is_scalar, is_string_like, is_timedelta64_dtype, pandas_dtype)
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries)
ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCSeries, ABCSparseArray,
ABCSparseSeries)
from pandas.core.dtypes.missing import (
isna, na_value_for_dtype, notna, remove_na_arraylike)

@@ -189,6 +190,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
else:
# need to copy to avoid aliasing issues
data = data._values.copy()
if isinstance(data, ABCDatetimeIndex):
Contributor

really?? this is a specific problem to Datetime w/tz, this is too heavy handed

# GH#24096 need copy to be deep for datetime64tz case
data = data._values.copy(deep=True)
copy = False

elif isinstance(data, np.ndarray):
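
The "need to copy to avoid aliasing issues" comment in this hunk refers to the usual shared-buffer semantics: without a copy, later in-place setitem calls can leak back into the caller's data. A quick sketch of that behavior with a plain ndarray (assuming pandas without copy-on-write; with copy-on-write enabled the original is left untouched):

```python
import numpy as np
import pandas as pd

arr = np.arange(5, dtype="int64")
ser = pd.Series(arr, copy=False)   # wraps arr without copying
ser.iloc[0] = 99
print(arr[0])                      # 99 -- the write leaked back into arr
```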
45 changes: 45 additions & 0 deletions pandas/tests/frame/test_block_internals.py
@@ -28,6 +28,51 @@


class TestDataFrameBlockInternals():
def test_setitem_invalidates_datetime_index_freq(self):
# GH#24096 altering a datetime64tz column inplace invalidates the
# `freq` attribute on the underlying DatetimeIndex

dti = date_range('20130101', periods=3, tz='US/Eastern')
ts = dti[1]

df = DataFrame({'B': dti})
assert df['B']._values.freq == 'D'

df.iloc[1, 0] = pd.NaT
assert df['B']._values.freq is None

# check that the DatetimeIndex was not altered in place
assert dti.freq == 'D'
assert dti[1] == ts

dti = date_range('20130101', periods=3, tz='US/Eastern')
ts = dti[1]
ser = Series(dti)
assert ser._values is not dti
assert ser._values._data.base is not dti._data.base
assert dti.freq == 'D'
ser.iloc[1] = pd.NaT
assert ser._values.freq is None

# check that the DatetimeIndex was not altered in place
assert ser._values is not dti
assert ser._values._data.base is not dti._data.base
assert dti[1] == ts
assert dti.freq == 'D'

def test_dt64tz_setitem_does_not_mutate_dti(self):
Member Author

there is not a series.test_block_internals; not sure if there is somewhere else this might belong

Contributor

pls create one

Member Author

sure

# GH#21907, GH#24096
dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific')
ts = dti[0]
ser = pd.Series(dti)
assert ser._values is not dti
assert ser._values._data.base is not dti._data.base
assert ser._data.blocks[0].values is not dti
assert ser._data.blocks[0].values._data.base is not dti._data.base

ser[::3] = pd.NaT
assert ser[0] is pd.NaT
assert dti[0] == ts

def test_cast_internals(self, float_frame):
casted = DataFrame(float_frame._data, dtype=int)