Skip to content

Commit 065a320

Browse files
authored
Backport PR #52008 on branch 2.0.x (CoW: Avoid copying Index in Series constructor) (#52048)
CoW: Avoid copying Index in Series constructor (#52008)
1 parent a498448 commit 065a320

File tree

14 files changed

+108
-30
lines changed

14 files changed

+108
-30
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1212,6 +1212,7 @@ Conversion
12121212
- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
12131213
- Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`)
12141214
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`)
1215+
- Bug in the :class:`Series` constructor not copying data when created from an :class:`Index` and the passed ``dtype`` is equal to the ``dtype`` of the :class:`Index` (:issue:`52008`)
12151216

12161217
Strings
12171218
^^^^^^^

pandas/_libs/internals.pyi

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ class BlockManager:
9696

9797
class BlockValuesRefs:
9898
referenced_blocks: list[weakref.ref]
99-
def __init__(self, blk: SharedBlock) -> None: ...
99+
def __init__(self, blk: SharedBlock | None = ...) -> None: ...
100100
def add_reference(self, blk: SharedBlock) -> None: ...
101+
def add_index_reference(self, index: object) -> None: ...
101102
def has_reference(self) -> bool: ...

pandas/_libs/internals.pyx

+5-2
Original file line numberDiff line numberDiff line change
@@ -877,8 +877,11 @@ cdef class BlockValuesRefs:
877877
cdef:
878878
public list referenced_blocks
879879

880-
def __cinit__(self, blk: SharedBlock) -> None:
881-
self.referenced_blocks = [weakref.ref(blk)]
880+
def __cinit__(self, blk: SharedBlock | None = None) -> None:
881+
if blk is not None:
882+
self.referenced_blocks = [weakref.ref(blk)]
883+
else:
884+
self.referenced_blocks = []
882885

883886
def add_reference(self, blk: SharedBlock) -> None:
884887
"""Adds a new reference to our reference collection.

pandas/core/indexes/base.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
index as libindex,
3232
lib,
3333
)
34+
from pandas._libs.internals import BlockValuesRefs
3435
import pandas._libs.join as libjoin
3536
from pandas._libs.lib import (
3637
is_datetime_array,
@@ -653,9 +654,11 @@ def _simple_new(
653654
result._name = name
654655
result._cache = {}
655656
result._reset_identity()
656-
result._references = refs
657657
if refs is not None:
658-
refs.add_index_reference(result)
658+
result._references = refs
659+
else:
660+
result._references = BlockValuesRefs()
661+
result._references.add_index_reference(result)
659662

660663
return result
661664

pandas/core/indexes/interval.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,8 @@ def inferred_type(self) -> str:
390390
"""Return a string of the type inferred from the values"""
391391
return "interval"
392392

393-
@Appender(Index.memory_usage.__doc__)
393+
# Cannot determine type of "memory_usage"
394+
@Appender(Index.memory_usage.__doc__) # type: ignore[has-type]
394395
def memory_usage(self, deep: bool = False) -> int:
395396
# we don't use an explicit engine
396397
# so return the bytes here

pandas/core/indexes/multi.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,8 @@ def f(level) -> bool:
12331233

12341234
return any(f(level) for level in self._inferred_type_levels)
12351235

1236-
@doc(Index.memory_usage)
1236+
# Cannot determine type of "memory_usage"
1237+
@doc(Index.memory_usage) # type: ignore[has-type]
12371238
def memory_usage(self, deep: bool = False) -> int:
12381239
# we are overriding our base class to avoid
12391240
# computing .values here which could materialize

pandas/core/internals/managers.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
internals as libinternals,
2323
lib,
2424
)
25-
from pandas._libs.internals import BlockPlacement
25+
from pandas._libs.internals import (
26+
BlockPlacement,
27+
BlockValuesRefs,
28+
)
2629
from pandas._typing import (
2730
ArrayLike,
2831
AxisInt,
@@ -1868,11 +1871,13 @@ def from_blocks(
18681871
return cls(blocks[0], axes[0], verify_integrity=False)
18691872

18701873
@classmethod
1871-
def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1874+
def from_array(
1875+
cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
1876+
) -> SingleBlockManager:
18721877
"""
18731878
Constructor for when we have an array that is not yet a Block.
18741879
"""
1875-
block = new_block(array, placement=slice(0, len(index)), ndim=1)
1880+
block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
18761881
return cls(block, index)
18771882

18781883
def to_2d_mgr(self, columns: Index) -> BlockManager:

pandas/core/series.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -426,10 +426,15 @@ def __init__(
426426
raise NotImplementedError(
427427
"initializing a Series from a MultiIndex is not supported"
428428
)
429+
430+
refs = None
429431
if isinstance(data, Index):
430432
if dtype is not None:
431-
# astype copies
432-
data = data.astype(dtype)
433+
data = data.astype(dtype, copy=False)
434+
435+
if using_copy_on_write():
436+
refs = data._references
437+
data = data._values
433438
else:
434439
# GH#24096 we need to ensure the index remains immutable
435440
data = data._values.copy()
@@ -493,7 +498,7 @@ def __init__(
493498

494499
manager = get_option("mode.data_manager")
495500
if manager == "block":
496-
data = SingleBlockManager.from_array(data, index)
501+
data = SingleBlockManager.from_array(data, index, refs=refs)
497502
elif manager == "array":
498503
data = SingleArrayManager.from_array(data, index)
499504

pandas/tests/copy_view/test_astype.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def test_astype_different_timezones(using_copy_on_write):
173173
result = df.astype("datetime64[ns, Europe/Berlin]")
174174
if using_copy_on_write:
175175
assert not result._mgr._has_no_reference(0)
176-
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8)
176+
assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
177177

178178

179179
def test_astype_different_timezones_different_reso(using_copy_on_write):
@@ -183,9 +183,7 @@ def test_astype_different_timezones_different_reso(using_copy_on_write):
183183
result = df.astype("datetime64[ms, Europe/Berlin]")
184184
if using_copy_on_write:
185185
assert result._mgr._has_no_reference(0)
186-
assert not np.shares_memory(
187-
get_array(df, "a").asi8, get_array(result, "a").asi8
188-
)
186+
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
189187

190188

191189
@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
@@ -202,7 +200,7 @@ def test_astype_arrow_timestamp(using_copy_on_write):
202200
result = df.astype("timestamp[ns][pyarrow]")
203201
if using_copy_on_write:
204202
assert not result._mgr._has_no_reference(0)
205-
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
203+
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)
206204

207205

208206
def test_convert_dtypes_infer_objects(using_copy_on_write):

pandas/tests/copy_view/test_constructors.py

+36
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,14 @@
33

44
from pandas import (
55
DataFrame,
6+
DatetimeIndex,
7+
Index,
8+
Period,
9+
PeriodIndex,
610
Series,
11+
Timedelta,
12+
TimedeltaIndex,
13+
Timestamp,
714
)
815
import pandas._testing as tm
916
from pandas.tests.copy_view.util import get_array
@@ -82,6 +89,35 @@ def test_series_from_series_with_reindex(using_copy_on_write):
8289
assert not result._mgr.blocks[0].refs.has_reference()
8390

8491

92+
@pytest.mark.parametrize(
93+
"idx",
94+
[
95+
Index([1, 2]),
96+
DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
97+
PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
98+
TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
99+
],
100+
)
101+
def test_series_from_index(using_copy_on_write, idx):
102+
ser = Series(idx)
103+
expected = idx.copy(deep=True)
104+
if using_copy_on_write:
105+
assert np.shares_memory(get_array(ser), get_array(idx))
106+
assert not ser._mgr._has_no_reference(0)
107+
else:
108+
assert not np.shares_memory(get_array(ser), get_array(idx))
109+
ser.iloc[0] = ser.iloc[1]
110+
tm.assert_index_equal(idx, expected)
111+
112+
113+
def test_series_from_index_different_dtypes(using_copy_on_write):
114+
idx = Index([1, 2, 3], dtype="int64")
115+
ser = Series(idx, dtype="int32")
116+
assert not np.shares_memory(get_array(ser), get_array(idx))
117+
if using_copy_on_write:
118+
assert ser._mgr._has_no_reference(0)
119+
120+
85121
@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
86122
@pytest.mark.parametrize("columns", [None, ["a"]])
87123
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):

pandas/tests/copy_view/util.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
from pandas import Series
1+
from pandas import (
2+
Categorical,
3+
Index,
4+
Series,
5+
)
26
from pandas.core.arrays import BaseMaskedArray
37

48

@@ -10,7 +14,9 @@ def get_array(obj, col=None):
1014
which triggers tracking references / CoW (and we might be testing that
1115
this is done by some other operation).
1216
"""
13-
if isinstance(obj, Series) and (col is None or obj.name == col):
17+
if isinstance(obj, Index):
18+
arr = obj._values
19+
elif isinstance(obj, Series) and (col is None or obj.name == col):
1420
arr = obj._values
1521
else:
1622
assert col is not None
@@ -19,4 +25,6 @@ def get_array(obj, col=None):
1925
arr = obj._get_column_array(icol)
2026
if isinstance(arr, BaseMaskedArray):
2127
return arr._data
22-
return arr
28+
elif isinstance(arr, Categorical):
29+
return arr
30+
return getattr(arr, "_ndarray", arr)

pandas/tests/indexing/test_indexing.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
849849
tz = tz_naive_fixture
850850

851851
dti = date_range("2016-01-01", periods=3, tz=tz)
852-
ser = Series(dti)
852+
ser = Series(dti.copy(deep=True))
853853

854854
values = ser._values
855855

@@ -877,7 +877,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
877877
key = slice(0, 1)
878878

879879
dti = date_range("2016-01-01", periods=3, tz=tz)
880-
ser = Series(dti)
880+
ser = Series(dti.copy(deep=True))
881881

882882
values = ser._values
883883

@@ -897,7 +897,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
897897
def test_setitem_td64_scalar(self, indexer_sli, scalar):
898898
# dispatching _can_hold_element to underlying TimedeltaArray
899899
tdi = timedelta_range("1 Day", periods=3)
900-
ser = Series(tdi)
900+
ser = Series(tdi.copy(deep=True))
901901

902902
values = ser._values
903903
values._validate_setitem_value(scalar)
@@ -915,7 +915,7 @@ def test_setitem_td64_string_values(self, indexer_sli, key, box):
915915
key = slice(0, 1)
916916

917917
tdi = timedelta_range("1 Day", periods=3)
918-
ser = Series(tdi)
918+
ser = Series(tdi.copy(deep=True))
919919

920920
values = ser._values
921921

pandas/tests/series/indexing/test_setitem.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -404,15 +404,18 @@ def test_setitem_mask_smallint_no_upcast(self):
404404

405405

406406
class TestSetitemViewCopySemantics:
407-
def test_setitem_invalidates_datetime_index_freq(self):
407+
def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write):
408408
# GH#24096 altering a datetime64tz Series inplace invalidates the
409409
# `freq` attribute on the underlying DatetimeIndex
410410

411411
dti = date_range("20130101", periods=3, tz="US/Eastern")
412412
ts = dti[1]
413413
ser = Series(dti)
414414
assert ser._values is not dti
415-
assert ser._values._ndarray.base is not dti._data._ndarray.base
415+
if using_copy_on_write:
416+
assert ser._values._ndarray.base is dti._data._ndarray.base
417+
else:
418+
assert ser._values._ndarray.base is not dti._data._ndarray.base
416419
assert dti.freq == "D"
417420
ser.iloc[1] = NaT
418421
assert ser._values.freq is None
@@ -423,15 +426,20 @@ def test_setitem_invalidates_datetime_index_freq(self):
423426
assert dti[1] == ts
424427
assert dti.freq == "D"
425428

426-
def test_dt64tz_setitem_does_not_mutate_dti(self):
429+
def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write):
427430
# GH#21907, GH#24096
428431
dti = date_range("2016-01-01", periods=10, tz="US/Pacific")
429432
ts = dti[0]
430433
ser = Series(dti)
431434
assert ser._values is not dti
432-
assert ser._values._ndarray.base is not dti._data._ndarray.base
435+
if using_copy_on_write:
436+
assert ser._values._ndarray.base is dti._data._ndarray.base
437+
assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base
438+
else:
439+
assert ser._values._ndarray.base is not dti._data._ndarray.base
440+
assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base
441+
433442
assert ser._mgr.arrays[0] is not dti
434-
assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base
435443

436444
ser[::3] = NaT
437445
assert ser[0] is NaT

pandas/tests/series/test_constructors.py

+8
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,14 @@ def test_series_constructor_ea_all_na(self):
20562056
)
20572057
tm.assert_series_equal(result, expected)
20582058

2059+
def test_series_from_index_dtype_equal_does_not_copy(self):
2060+
# GH#52008
2061+
idx = Index([1, 2, 3])
2062+
expected = idx.copy(deep=True)
2063+
ser = Series(idx, dtype="int64")
2064+
ser.iloc[0] = 100
2065+
tm.assert_index_equal(idx, expected)
2066+
20592067

20602068
class TestSeriesConstructorIndexCoercion:
20612069
def test_series_constructor_datetimelike_index_coercion(self):

0 commit comments

Comments
 (0)