Skip to content

Backport PR #30784: API: DataFrame.take always returns a copy #31362

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ Deprecations
- The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`)
- The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`).
- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`)
- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- The parameter ``is_copy`` of :meth:`Series.take` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2809,7 +2809,7 @@ def __getitem__(self, key):
if getattr(indexer, "dtype", None) == bool:
indexer = np.where(indexer)[0]

data = self.take(indexer, axis=1)
data = self._take_with_is_copy(indexer, axis=1)

if is_single_key:
# What does looking for a single key in a non-unique index return?
Expand Down Expand Up @@ -2842,7 +2842,7 @@ def _getitem_bool_array(self, key):
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self.take(indexer, axis=0)
return self._take_with_is_copy(indexer, axis=0)

def _getitem_multilevel(self, key):
# self.columns is a MultiIndex
Expand Down
43 changes: 26 additions & 17 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3313,8 +3313,11 @@ def take(
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
is_copy : bool, default True
Whether to return a copy of the original object or not.
is_copy : bool
Before pandas 1.0, ``is_copy=False`` can be specified to ensure
that the return value is an actual copy. Starting with pandas 1.0,
``take`` always returns a copy, and the keyword is therefore
deprecated.

.. deprecated:: 1.0.0
**kwargs
Expand Down Expand Up @@ -3378,12 +3381,10 @@ class max_speed
if is_copy is not None:
warnings.warn(
"is_copy is deprecated and will be removed in a future version. "
"take will always return a copy in the future.",
"'take' always returns a copy, so there is no need to specify this.",
FutureWarning,
stacklevel=2,
)
else:
is_copy = True

nv.validate_take(tuple(), kwargs)

Expand All @@ -3392,13 +3393,22 @@ class max_speed
new_data = self._data.take(
indices, axis=self._get_block_manager_axis(axis), verify=True
)
result = self._constructor(new_data).__finalize__(self)
return self._constructor(new_data).__finalize__(self)

# Maybe set copy if we didn't actually change the index.
if is_copy:
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
def _take_with_is_copy(
self: FrameOrSeries, indices, axis=0, **kwargs
) -> FrameOrSeries:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).

See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis, **kwargs)
# Maybe set copy if we didn't actually change the index.
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result

def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
Expand Down Expand Up @@ -3528,9 +3538,9 @@ class animal locomotion
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self.take(inds, axis=axis)
return self._take_with_is_copy(inds, axis=axis)
else:
return self.take(loc, axis=axis)
return self._take_with_is_copy(loc, axis=axis)

if not is_scalar(loc):
new_index = self.index[loc]
Expand Down Expand Up @@ -3587,7 +3597,7 @@ def _iget_item_cache(self, item):
if ax.is_unique:
lower = self._get_item_cache(ax[item])
else:
lower = self.take(item, axis=self._info_axis_number)
lower = self._take_with_is_copy(item, axis=self._info_axis_number)
return lower

def _box_item_values(self, key, values):
Expand Down Expand Up @@ -7180,8 +7190,7 @@ def asof(self, where, subset=None):

# mask the missing
missing = locs == -1
d = self.take(locs)
data = d.copy()
data = self.take(locs)
data.index = where
data.loc[missing] = np.nan
return data if is_list else data.iloc[-1]
Expand Down Expand Up @@ -7727,7 +7736,7 @@ def at_time(
except AttributeError:
raise TypeError("Index must be DatetimeIndex")

return self.take(indexer, axis=axis)
return self._take_with_is_copy(indexer, axis=axis)

def between_time(
self: FrameOrSeries,
Expand Down Expand Up @@ -7809,7 +7818,7 @@ def between_time(
except AttributeError:
raise TypeError("Index must be DatetimeIndex")

return self.take(indexer, axis=axis)
return self._take_with_is_copy(indexer, axis=axis)

def resample(
self,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ def get_group(self, name, obj=None):
if not len(inds):
raise KeyError(name)

return obj.take(inds, axis=self.axis)
return obj._take_with_is_copy(inds, axis=self.axis)

def __iter__(self):
"""
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1588,7 +1588,7 @@ def _getitem_iterable(self, key, axis: int):
# A boolean indexer
key = check_bool_indexer(labels, key)
(inds,) = key.nonzero()
return self.obj.take(inds, axis=axis)
return self.obj._take_with_is_copy(inds, axis=axis)
else:
# A collection of keys
keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
Expand Down Expand Up @@ -1780,7 +1780,7 @@ def _getbool_axis(self, key, axis: int):
labels = self.obj._get_axis(axis)
key = check_bool_indexer(labels, key)
inds = key.nonzero()[0]
return self.obj.take(inds, axis=axis)
return self.obj._take_with_is_copy(inds, axis=axis)

def _get_slice_axis(self, slice_obj: slice, axis: int):
"""
Expand All @@ -1801,7 +1801,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int):
else:
# DatetimeIndex overrides Index.slice_indexer and may
# return a DatetimeIndex instead of a slice object.
return self.obj.take(indexer, axis=axis)
return self.obj._take_with_is_copy(indexer, axis=axis)


@Appender(IndexingMixin.loc.__doc__)
Expand Down Expand Up @@ -2107,7 +2107,7 @@ def _get_list_axis(self, key, axis: int):
`axis` can only be zero.
"""
try:
return self.obj.take(key, axis=axis)
return self.obj._take_with_is_copy(key, axis=axis)
except IndexError:
# re-raise with different error message
raise IndexError("positional indexers are out-of-bounds")
Expand Down
25 changes: 18 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,14 @@ def axes(self):
# Indexing Methods

@Appender(generic.NDFrame.take.__doc__)
def take(self, indices, axis=0, is_copy=False, **kwargs):
def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series":
if is_copy is not None:
warnings.warn(
"is_copy is deprecated and will be removed in a future version. "
"'take' always returns a copy, so there is no need to specify this.",
FutureWarning,
stacklevel=2,
)
nv.validate_take(tuple(), kwargs)

indices = ensure_platform_int(indices)
Expand All @@ -819,16 +826,20 @@ def take(self, indices, axis=0, is_copy=False, **kwargs):
kwargs = {}
new_values = self._values.take(indices, **kwargs)

result = self._constructor(
return self._constructor(
new_values, index=new_index, fastpath=True
).__finalize__(self)

# Maybe set copy if we didn't actually change the index.
if is_copy:
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
def _take_with_is_copy(self, indices, axis=0, **kwargs):
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning). For Series this does the same
as the public take (it never sets `_is_copy`).

return result
See the docstring of `take` for full explanation of the parameters.
"""
return self.take(indices=indices, axis=axis, **kwargs)

def _ixs(self, i: int, axis: int = 0):
"""
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/frame/methods/test_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,16 @@ def test_time_zone_aware_index(self, stamp, expected):

result = df.asof(stamp)
tm.assert_series_equal(result, expected)

def test_is_copy(self, date_range_frame):
# GH-27357, GH-30784: ensure the result of asof is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = date_range_frame
N = 50
df.loc[15:30, "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")

result = df.asof(dates)

with tm.assert_produces_warning(None):
result["C"] = 1
20 changes: 17 additions & 3 deletions pandas/tests/generic/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,15 @@ def test_sample_upsampling_without_replacement(self):
with pytest.raises(ValueError, match=msg):
df.sample(frac=2, replace=False)

def test_sample_is_copy(self):
# GH-27357, GH-30784: ensure the result of sample is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = pd.DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
df2 = df.sample(3)

with tm.assert_produces_warning(None):
df2["d"] = 1

def test_size_compat(self):
# GH8846
# size property should be defined
Expand Down Expand Up @@ -820,18 +829,23 @@ def test_take_invalid_kwargs(self):
with pytest.raises(ValueError, match=msg):
obj.take(indices, mode="clip")

def test_depr_take_kwarg_is_copy(self):
@pytest.mark.parametrize("is_copy", [True, False])
def test_depr_take_kwarg_is_copy(self, is_copy):
# GH 27357
df = DataFrame({"A": [1, 2, 3]})
msg = (
"is_copy is deprecated and will be removed in a future version. "
"take will always return a copy in the future."
"'take' always returns a copy, so there is no need to specify this."
)
with tm.assert_produces_warning(FutureWarning) as w:
df.take([0, 1], is_copy=True)
df.take([0, 1], is_copy=is_copy)

assert w[0].message.args[0] == msg

s = Series([1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
s.take([0, 1], is_copy=is_copy)

def test_equals(self):
s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
s2 = s1.copy()
Expand Down