Skip to content

Commit 396ef60

Browse files
authored
CoW: Avoid copying Index in Series constructor (#52008)
1 parent d534007 commit 396ef60

File tree

14 files changed

+108
-32
lines changed

14 files changed

+108
-32
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1219,6 +1219,7 @@ Conversion
12191219
- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
12201220
- Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`)
12211221
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`)
1222+
- Bug in :class:`Series` not copying data when created from :class:`Index` and ``dtype`` is equal to the ``dtype`` of the :class:`Index` (:issue:`52008`)
12221223

12231224
Strings
12241225
^^^^^^^

pandas/_libs/internals.pyi

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ class BlockManager:
9696

9797
class BlockValuesRefs:
9898
referenced_blocks: list[weakref.ref]
99-
def __init__(self, blk: SharedBlock) -> None: ...
99+
def __init__(self, blk: SharedBlock | None = ...) -> None: ...
100100
def add_reference(self, blk: SharedBlock) -> None: ...
101+
def add_index_reference(self, index: object) -> None: ...
101102
def has_reference(self) -> bool: ...

pandas/_libs/internals.pyx

+5-2
Original file line numberDiff line numberDiff line change
@@ -877,8 +877,11 @@ cdef class BlockValuesRefs:
877877
cdef:
878878
public list referenced_blocks
879879

880-
def __cinit__(self, blk: SharedBlock) -> None:
881-
self.referenced_blocks = [weakref.ref(blk)]
880+
def __cinit__(self, blk: SharedBlock | None = None) -> None:
881+
if blk is not None:
882+
self.referenced_blocks = [weakref.ref(blk)]
883+
else:
884+
self.referenced_blocks = []
882885

883886
def add_reference(self, blk: SharedBlock) -> None:
884887
"""Adds a new reference to our reference collection.

pandas/core/indexes/base.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
index as libindex,
3131
lib,
3232
)
33+
from pandas._libs.internals import BlockValuesRefs
3334
import pandas._libs.join as libjoin
3435
from pandas._libs.lib import (
3536
is_datetime_array,
@@ -652,9 +653,11 @@ def _simple_new(cls, values: ArrayLike, name: Hashable = None, refs=None) -> Sel
652653
result._name = name
653654
result._cache = {}
654655
result._reset_identity()
655-
result._references = refs
656656
if refs is not None:
657-
refs.add_index_reference(result)
657+
result._references = refs
658+
else:
659+
result._references = BlockValuesRefs()
660+
result._references.add_index_reference(result)
658661

659662
return result
660663

pandas/core/indexes/interval.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,8 @@ def inferred_type(self) -> str:
399399
"""Return a string of the type inferred from the values"""
400400
return "interval"
401401

402-
@Appender(Index.memory_usage.__doc__)
402+
# Cannot determine type of "memory_usage"
403+
@Appender(Index.memory_usage.__doc__) # type: ignore[has-type]
403404
def memory_usage(self, deep: bool = False) -> int:
404405
# we don't use an explicit engine
405406
# so return the bytes here

pandas/core/indexes/multi.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1255,7 +1255,8 @@ def f(level) -> bool:
12551255

12561256
return any(f(level) for level in self._inferred_type_levels)
12571257

1258-
@doc(Index.memory_usage)
1258+
# Cannot determine type of "memory_usage"
1259+
@doc(Index.memory_usage) # type: ignore[has-type]
12591260
def memory_usage(self, deep: bool = False) -> int:
12601261
# we are overwriting our base class to avoid
12611262
# computing .values here which could materialize

pandas/core/internals/managers.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
internals as libinternals,
2323
lib,
2424
)
25-
from pandas._libs.internals import BlockPlacement
25+
from pandas._libs.internals import (
26+
BlockPlacement,
27+
BlockValuesRefs,
28+
)
2629
from pandas.errors import PerformanceWarning
2730
from pandas.util._decorators import cache_readonly
2831
from pandas.util._exceptions import find_stack_level
@@ -1877,11 +1880,13 @@ def from_blocks(
18771880
return cls(blocks[0], axes[0], verify_integrity=False)
18781881

18791882
@classmethod
1880-
def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1883+
def from_array(
1884+
cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
1885+
) -> SingleBlockManager:
18811886
"""
18821887
Constructor for when we have an array that is not yet a Block.
18831888
"""
1884-
block = new_block(array, placement=slice(0, len(index)), ndim=1)
1889+
block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
18851890
return cls(block, index)
18861891

18871892
def to_2d_mgr(self, columns: Index) -> BlockManager:

pandas/core/series.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -429,10 +429,15 @@ def __init__(
429429
raise NotImplementedError(
430430
"initializing a Series from a MultiIndex is not supported"
431431
)
432+
433+
refs = None
432434
if isinstance(data, Index):
433435
if dtype is not None:
434-
# astype copies
435-
data = data.astype(dtype)
436+
data = data.astype(dtype, copy=False)
437+
438+
if using_copy_on_write():
439+
refs = data._references
440+
data = data._values
436441
else:
437442
# GH#24096 we need to ensure the index remains immutable
438443
data = data._values.copy()
@@ -496,7 +501,7 @@ def __init__(
496501

497502
manager = get_option("mode.data_manager")
498503
if manager == "block":
499-
data = SingleBlockManager.from_array(data, index)
504+
data = SingleBlockManager.from_array(data, index, refs=refs)
500505
elif manager == "array":
501506
data = SingleArrayManager.from_array(data, index)
502507

pandas/tests/copy_view/test_astype.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def test_astype_different_timezones(using_copy_on_write):
173173
result = df.astype("datetime64[ns, Europe/Berlin]")
174174
if using_copy_on_write:
175175
assert not result._mgr._has_no_reference(0)
176-
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8)
176+
assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
177177

178178

179179
def test_astype_different_timezones_different_reso(using_copy_on_write):
@@ -183,9 +183,7 @@ def test_astype_different_timezones_different_reso(using_copy_on_write):
183183
result = df.astype("datetime64[ms, Europe/Berlin]")
184184
if using_copy_on_write:
185185
assert result._mgr._has_no_reference(0)
186-
assert not np.shares_memory(
187-
get_array(df, "a").asi8, get_array(result, "a").asi8
188-
)
186+
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
189187

190188

191189
@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
@@ -202,9 +200,7 @@ def test_astype_arrow_timestamp(using_copy_on_write):
202200
result = df.astype("timestamp[ns][pyarrow]")
203201
if using_copy_on_write:
204202
assert not result._mgr._has_no_reference(0)
205-
assert np.shares_memory(
206-
get_array(df, "a").asi8, get_array(result, "a")._pa_array
207-
)
203+
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
208204

209205

210206
def test_convert_dtypes_infer_objects(using_copy_on_write):

pandas/tests/copy_view/test_constructors.py

+36
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,14 @@
33

44
from pandas import (
55
DataFrame,
6+
DatetimeIndex,
7+
Index,
8+
Period,
9+
PeriodIndex,
610
Series,
11+
Timedelta,
12+
TimedeltaIndex,
13+
Timestamp,
714
)
815
import pandas._testing as tm
916
from pandas.tests.copy_view.util import get_array
@@ -82,6 +89,35 @@ def test_series_from_series_with_reindex(using_copy_on_write):
8289
assert not result._mgr.blocks[0].refs.has_reference()
8390

8491

92+
@pytest.mark.parametrize(
93+
"idx",
94+
[
95+
Index([1, 2]),
96+
DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
97+
PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
98+
TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
99+
],
100+
)
101+
def test_series_from_index(using_copy_on_write, idx):
102+
ser = Series(idx)
103+
expected = idx.copy(deep=True)
104+
if using_copy_on_write:
105+
assert np.shares_memory(get_array(ser), get_array(idx))
106+
assert not ser._mgr._has_no_reference(0)
107+
else:
108+
assert not np.shares_memory(get_array(ser), get_array(idx))
109+
ser.iloc[0] = ser.iloc[1]
110+
tm.assert_index_equal(idx, expected)
111+
112+
113+
def test_series_from_index_different_dtypes(using_copy_on_write):
114+
idx = Index([1, 2, 3], dtype="int64")
115+
ser = Series(idx, dtype="int32")
116+
assert not np.shares_memory(get_array(ser), get_array(idx))
117+
if using_copy_on_write:
118+
assert ser._mgr._has_no_reference(0)
119+
120+
85121
@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
86122
@pytest.mark.parametrize("columns", [None, ["a"]])
87123
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):

pandas/tests/copy_view/util.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
from pandas import Series
1+
from pandas import (
2+
Categorical,
3+
Index,
4+
Series,
5+
)
26
from pandas.core.arrays import BaseMaskedArray
37

48

@@ -10,7 +14,9 @@ def get_array(obj, col=None):
1014
which triggers tracking references / CoW (and we might be testing that
1115
this is done by some other operation).
1216
"""
13-
if isinstance(obj, Series) and (col is None or obj.name == col):
17+
if isinstance(obj, Index):
18+
arr = obj._values
19+
elif isinstance(obj, Series) and (col is None or obj.name == col):
1420
arr = obj._values
1521
else:
1622
assert col is not None
@@ -19,4 +25,6 @@ def get_array(obj, col=None):
1925
arr = obj._get_column_array(icol)
2026
if isinstance(arr, BaseMaskedArray):
2127
return arr._data
22-
return arr
28+
elif isinstance(arr, Categorical):
29+
return arr
30+
return getattr(arr, "_ndarray", arr)

pandas/tests/indexing/test_indexing.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
849849
tz = tz_naive_fixture
850850

851851
dti = date_range("2016-01-01", periods=3, tz=tz)
852-
ser = Series(dti)
852+
ser = Series(dti.copy(deep=True))
853853

854854
values = ser._values
855855

@@ -877,7 +877,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
877877
key = slice(0, 1)
878878

879879
dti = date_range("2016-01-01", periods=3, tz=tz)
880-
ser = Series(dti)
880+
ser = Series(dti.copy(deep=True))
881881

882882
values = ser._values
883883

@@ -897,7 +897,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
897897
def test_setitem_td64_scalar(self, indexer_sli, scalar):
898898
# dispatching _can_hold_element to underlying TimedeltaArray
899899
tdi = timedelta_range("1 Day", periods=3)
900-
ser = Series(tdi)
900+
ser = Series(tdi.copy(deep=True))
901901

902902
values = ser._values
903903
values._validate_setitem_value(scalar)
@@ -915,7 +915,7 @@ def test_setitem_td64_string_values(self, indexer_sli, key, box):
915915
key = slice(0, 1)
916916

917917
tdi = timedelta_range("1 Day", periods=3)
918-
ser = Series(tdi)
918+
ser = Series(tdi.copy(deep=True))
919919

920920
values = ser._values
921921

pandas/tests/series/indexing/test_setitem.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -404,15 +404,18 @@ def test_setitem_mask_smallint_no_upcast(self):
404404

405405

406406
class TestSetitemViewCopySemantics:
407-
def test_setitem_invalidates_datetime_index_freq(self):
407+
def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write):
408408
# GH#24096 altering a datetime64tz Series inplace invalidates the
409409
# `freq` attribute on the underlying DatetimeIndex
410410

411411
dti = date_range("20130101", periods=3, tz="US/Eastern")
412412
ts = dti[1]
413413
ser = Series(dti)
414414
assert ser._values is not dti
415-
assert ser._values._ndarray.base is not dti._data._ndarray.base
415+
if using_copy_on_write:
416+
assert ser._values._ndarray.base is dti._data._ndarray.base
417+
else:
418+
assert ser._values._ndarray.base is not dti._data._ndarray.base
416419
assert dti.freq == "D"
417420
ser.iloc[1] = NaT
418421
assert ser._values.freq is None
@@ -423,15 +426,20 @@ def test_setitem_invalidates_datetime_index_freq(self):
423426
assert dti[1] == ts
424427
assert dti.freq == "D"
425428

426-
def test_dt64tz_setitem_does_not_mutate_dti(self):
429+
def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write):
427430
# GH#21907, GH#24096
428431
dti = date_range("2016-01-01", periods=10, tz="US/Pacific")
429432
ts = dti[0]
430433
ser = Series(dti)
431434
assert ser._values is not dti
432-
assert ser._values._ndarray.base is not dti._data._ndarray.base
435+
if using_copy_on_write:
436+
assert ser._values._ndarray.base is dti._data._ndarray.base
437+
assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base
438+
else:
439+
assert ser._values._ndarray.base is not dti._data._ndarray.base
440+
assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base
441+
433442
assert ser._mgr.arrays[0] is not dti
434-
assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base
435443

436444
ser[::3] = NaT
437445
assert ser[0] is NaT

pandas/tests/series/test_constructors.py

+8
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,14 @@ def test_series_constructor_ea_all_na(self):
20562056
)
20572057
tm.assert_series_equal(result, expected)
20582058

2059+
def test_series_from_index_dtype_equal_does_not_copy(self):
2060+
# GH#52008
2061+
idx = Index([1, 2, 3])
2062+
expected = idx.copy(deep=True)
2063+
ser = Series(idx, dtype="int64")
2064+
ser.iloc[0] = 100
2065+
tm.assert_index_equal(idx, expected)
2066+
20592067

20602068
class TestSeriesConstructorIndexCoercion:
20612069
def test_series_constructor_datetimelike_index_coercion(self):

0 commit comments

Comments
 (0)