Skip to content

API / CoW: Copy arrays by default in Series constructor #52022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 29, 2023
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ def _create_series(index):
"""Helper for the _series dict"""
size = len(index)
data = np.random.randn(size)
return Series(data, index=index, name="a")
return Series(data, index=index, name="a", copy=False)


_series = {
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
validate_percentile,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
LossySetitemError,
convert_dtypes,
Expand Down Expand Up @@ -376,9 +377,15 @@ def __init__(
index=None,
dtype: Dtype | None = None,
name=None,
copy: bool = False,
copy: bool | None = None,
fastpath: bool = False,
) -> None:
if copy is None:
default_cow_copy = True
copy = False
else:
default_cow_copy = copy

if (
isinstance(data, (SingleBlockManager, SingleArrayManager))
and index is None
Expand All @@ -394,6 +401,11 @@ def __init__(
self.name = name
return

if isinstance(data, (ExtensionArray, np.ndarray)):
if default_cow_copy and not copy and using_copy_on_write():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this difficult to reason about (in part because I'm distracted by being on a call). If a user explicitly passes copy=False, we should never ignore that. Does this (or any of the other CoW-constructor PRs that I'm having trouble keeping track of) respect that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you pass an array, then we respect copy=False, with the drawback that your Series gets modified when you modify the array. If you pass a Series/DataFrame, we make a lazy copy to set up references.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually the reason why I changed the default to None: so that we can respect a user explicitly passing copy=False.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also don't find it easy to read, but I think it can be simplified.
Wouldn't it be enough to check whether copy was originally None? If copy=True, I would expect the copy to happen later on anyway (assuming the constructor currently honors that keyword).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to avoid calling using_copy_on_write before hitting the fastpath, but checking only copy is certainly more readable. Changed.

if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
data = data.copy()

# we are called internally, so short-circuit
if fastpath:
# data is a ndarray, index is defined
Expand Down Expand Up @@ -6087,7 +6099,7 @@ def _construct_result(
# TODO: result should always be ArrayLike, but this fails for some
# JSONArray tests
dtype = getattr(result, "dtype", None)
out = self._constructor(result, index=self.index, dtype=dtype)
out = self._constructor(result, index=self.index, dtype=dtype, copy=False)
out = out.__finalize__(self)

# Set the result's name after __finalize__ is called because __finalize__
Expand All @@ -6106,7 +6118,7 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0
elif isinstance(other, (np.ndarray, list, tuple)):
if len(other) != len(self):
raise ValueError("Lengths must be equal")
other = self._constructor(other, self.index)
other = self._constructor(other, self.index, copy=False)
result = self._binop(other, op, level=level, fill_value=fill_value)
result.name = res_name
return result
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
result = pd.Series(cat).replace(to_replace, value)._values
result = pd.Series(cat, copy=False).replace(to_replace, value)._values

tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)

pd.Series(cat).replace(to_replace, value, inplace=True)
pd.Series(cat, copy=False).replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,9 @@ def test_astype_arrow_timestamp(using_copy_on_write):
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(
# TODO(CoW): arrow is not setting copy=False in the Series constructor
# under the hood
assert not np.shares_memory(
get_array(df, "a").asi8, get_array(result, "a")._pa_array
)

Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
DataFrame,
Series,
Expand Down Expand Up @@ -82,6 +83,36 @@ def test_series_from_series_with_reindex(using_copy_on_write):
assert not result._mgr.blocks[0].refs.has_reference()


@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
"arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
ser = Series(arr, dtype=dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should idx and fastpath be passed here as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, thanks. Yes.

ser_orig = ser.copy()
data = getattr(arr, "_data", arr)
if using_copy_on_write:
assert not np.shares_memory(get_array(ser), data)
else:
assert np.shares_memory(get_array(ser), data)

arr[0] = 100
if using_copy_on_write:
tm.assert_series_equal(ser, ser_orig)
else:
expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
tm.assert_series_equal(ser, expected)


@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(using_copy_on_write, copy):
arr = np.array([1, 2, 3], dtype="int64")
ser = Series(arr, dtype="int32", copy=copy)
assert not np.shares_memory(get_array(ser), arr)


@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1364,7 +1364,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool):

def check_series_setitem(self, elem, index: Index, inplace: bool):
arr = index._data.copy()
ser = Series(arr)
ser = Series(arr, copy=False)

self.check_can_hold_element(ser, elem, inplace)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ def test_setitem_scalar_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

for n in series.index:
msg = "assignment destination is read-only"
Expand All @@ -585,7 +585,7 @@ def test_setitem_slice_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

msg = "assignment destination is read-only"
with pytest.raises(ValueError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self):
# however, copy is False by default
# so this WILL change values
cat = Categorical(["a", "b", "c", "a"])
s = Series(cat)
s = Series(cat, copy=False)
assert s.values is cat
s = s.cat.rename_categories([1, 2, 3])
assert s.values is not cat
Expand Down