pandas-dev · jorisvandenbossche · Mar 17, 2023 · Mar 15, 2023 · Mar 16, 2023 · Mar 16, 2023
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1209,6 +1209,7 @@ Conversion
 - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
 - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`)
+- Bug in :meth:`Series` not copying data when created from :class:`Index` and ``dtype`` is equal to ``dtype`` from :class:`Index` (:issue:`52008`)
 
 Strings
 ^^^^^^^

diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
@@ -96,6 +96,7 @@ class BlockManager:
 
 class BlockValuesRefs:
     referenced_blocks: list[weakref.ref]
-    def __init__(self, blk: SharedBlock) -> None: ...
+    def __init__(self, blk: SharedBlock | None = ...) -> None: ...
     def add_reference(self, blk: SharedBlock) -> None: ...
+    def add_index_reference(self, index: object) -> None: ...
     def has_reference(self) -> bool: ...
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
@@ -877,8 +877,11 @@ cdef class BlockValuesRefs:
     cdef:
         public list referenced_blocks
 
-    def __cinit__(self, blk: SharedBlock) -> None:
-        self.referenced_blocks = [weakref.ref(blk)]
+    def __cinit__(self, blk: SharedBlock | None = None) -> None:
+        if blk is not None:
+            self.referenced_blocks = [weakref.ref(blk)]
+        else:
+            self.referenced_blocks = []
 
     def add_reference(self, blk: SharedBlock) -> None:
         """Adds a new reference to our reference collection.

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -30,6 +30,7 @@
     index as libindex,
     lib,
 )
+from pandas._libs.internals import BlockValuesRefs
 import pandas._libs.join as libjoin
 from pandas._libs.lib import (
     is_datetime_array,
@@ -652,9 +653,11 @@ def _simple_new(cls, values: ArrayLike, name: Hashable = None, refs=None) -> Sel
         result._name = name
         result._cache = {}
         result._reset_identity()
-        result._references = refs
         if refs is not None:
-            refs.add_index_reference(result)
+            result._references = refs
+        else:
+            result._references = BlockValuesRefs()
+        result._references.add_index_reference(result)
 
         return result
 

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -399,7 +399,8 @@ def inferred_type(self) -> str:
         """Return a string of the type inferred from the values"""
         return "interval"
 
-    @Appender(Index.memory_usage.__doc__)
+    # Cannot determine type of "memory_usage"
+    @Appender(Index.memory_usage.__doc__)  # type: ignore[has-type]
     def memory_usage(self, deep: bool = False) -> int:
         # we don't use an explicit engine
         # so return the bytes here

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1255,7 +1255,8 @@ def f(level) -> bool:
 
         return any(f(level) for level in self._inferred_type_levels)
 
-    @doc(Index.memory_usage)
+    # Cannot determine type of "memory_usage"
+    @doc(Index.memory_usage)  # type: ignore[has-type]
     def memory_usage(self, deep: bool = False) -> int:
         # we are overwriting our base class to avoid
         # computing .values here which could materialize

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -22,7 +22,10 @@
     internals as libinternals,
     lib,
 )
-from pandas._libs.internals import BlockPlacement
+from pandas._libs.internals import (
+    BlockPlacement,
+    BlockValuesRefs,
+)
 from pandas.errors import PerformanceWarning
 from pandas.util._decorators import cache_readonly
 from pandas.util._exceptions import find_stack_level
@@ -1877,11 +1880,13 @@ def from_blocks(
         return cls(blocks[0], axes[0], verify_integrity=False)
 
     @classmethod
-    def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
+    def from_array(
+        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
+    ) -> SingleBlockManager:
         """
         Constructor for if we have an array that is not yet a Block.
         """
-        block = new_block(array, placement=slice(0, len(index)), ndim=1)
+        block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
         return cls(block, index)
 
     def to_2d_mgr(self, columns: Index) -> BlockManager:

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -429,10 +429,15 @@ def __init__(
             raise NotImplementedError(
                 "initializing a Series from a MultiIndex is not supported"
             )
+
+        refs = None
         if isinstance(data, Index):
             if dtype is not None:
-                # astype copies
-                data = data.astype(dtype)
+                data = data.astype(dtype, copy=False)
+
+            if using_copy_on_write():
+                refs = data._references
+                data = data._values
             else:
                 # GH#24096 we need to ensure the index remains immutable
                 data = data._values.copy()
@@ -496,7 +501,7 @@ def __init__(
 
             manager = get_option("mode.data_manager")
             if manager == "block":
-                data = SingleBlockManager.from_array(data, index)
+                data = SingleBlockManager.from_array(data, index, refs=refs)
             elif manager == "array":
                 data = SingleArrayManager.from_array(data, index)
 

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -173,7 +173,7 @@ def test_astype_different_timezones(using_copy_on_write):
     result = df.astype("datetime64[ns, Europe/Berlin]")
     if using_copy_on_write:
         assert not result._mgr._has_no_reference(0)
-        assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8)
+        assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
 
 
 def test_astype_different_timezones_different_reso(using_copy_on_write):
@@ -183,9 +183,7 @@ def test_astype_different_timezones_different_reso(using_copy_on_write):
     result = df.astype("datetime64[ms, Europe/Berlin]")
     if using_copy_on_write:
         assert result._mgr._has_no_reference(0)
-        assert not np.shares_memory(
-            get_array(df, "a").asi8, get_array(result, "a").asi8
-        )
+        assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
 
 
 @pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
@@ -202,9 +200,7 @@ def test_astype_arrow_timestamp(using_copy_on_write):
     result = df.astype("timestamp[ns][pyarrow]")
     if using_copy_on_write:
         assert not result._mgr._has_no_reference(0)
-        assert np.shares_memory(
-            get_array(df, "a").asi8, get_array(result, "a")._pa_array
-        )
+        assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
 
 
 def test_convert_dtypes_infer_objects(using_copy_on_write):

diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
@@ -3,7 +3,14 @@
 
 from pandas import (
     DataFrame,
+    DatetimeIndex,
+    Index,
+    Period,
+    PeriodIndex,
     Series,
+    Timedelta,
+    TimedeltaIndex,
+    Timestamp,
 )
 import pandas._testing as tm
 from pandas.tests.copy_view.util import get_array
@@ -82,6 +89,35 @@ def test_series_from_series_with_reindex(using_copy_on_write):
         assert not result._mgr.blocks[0].refs.has_reference()
 
 
+@pytest.mark.parametrize(
+    "idx",
+    [
+        Index([1, 2]),
+        DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
+        PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
+        TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
+    ],
+)
+def test_series_from_index(using_copy_on_write, idx):
+    ser = Series(idx)
+    expected = idx.copy(deep=True)
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(ser), get_array(idx))
+        assert not ser._mgr._has_no_reference(0)
+    else:
+        assert not np.shares_memory(get_array(ser), get_array(idx))
+    ser.iloc[0] = ser.iloc[1]
+    tm.assert_index_equal(idx, expected)
+
+
+def test_series_from_index_different_dtypes(using_copy_on_write):
+    idx = Index([1, 2, 3], dtype="int64")
+    ser = Series(idx, dtype="int32")
+    assert not np.shares_memory(get_array(ser), get_array(idx))
+    if using_copy_on_write:
+        assert ser._mgr._has_no_reference(0)
+
+
 @pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
 @pytest.mark.parametrize("columns", [None, ["a"]])
 def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):

diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py
@@ -1,4 +1,8 @@
-from pandas import Series
+from pandas import (
+    Categorical,
+    Index,
+    Series,
+)
 from pandas.core.arrays import BaseMaskedArray
 
 
@@ -10,7 +14,9 @@ def get_array(obj, col=None):
     which triggers tracking references / CoW (and we might be testing that
     this is done by some other operation).
     """
-    if isinstance(obj, Series) and (col is None or obj.name == col):
+    if isinstance(obj, Index):
+        arr = obj._values
+    elif isinstance(obj, Series) and (col is None or obj.name == col):
         arr = obj._values
     else:
         assert col is not None
@@ -19,4 +25,6 @@ def get_array(obj, col=None):
         arr = obj._get_column_array(icol)
     if isinstance(arr, BaseMaskedArray):
         return arr._data
-    return arr
+    elif isinstance(arr, Categorical):
+        return arr
+    return getattr(arr, "_ndarray", arr)
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -849,7 +849,7 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
         tz = tz_naive_fixture
 
         dti = date_range("2016-01-01", periods=3, tz=tz)
-        ser = Series(dti)
+        ser = Series(dti.copy(deep=True))
 
         values = ser._values
 
@@ -877,7 +877,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
             key = slice(0, 1)
 
         dti = date_range("2016-01-01", periods=3, tz=tz)
-        ser = Series(dti)
+        ser = Series(dti.copy(deep=True))
 
         values = ser._values
 
@@ -897,7 +897,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
     def test_setitem_td64_scalar(self, indexer_sli, scalar):
         # dispatching _can_hold_element to underling TimedeltaArray
         tdi = timedelta_range("1 Day", periods=3)
-        ser = Series(tdi)
+        ser = Series(tdi.copy(deep=True))
 
         values = ser._values
         values._validate_setitem_value(scalar)
@@ -915,7 +915,7 @@ def test_setitem_td64_string_values(self, indexer_sli, key, box):
             key = slice(0, 1)
 
         tdi = timedelta_range("1 Day", periods=3)
-        ser = Series(tdi)
+        ser = Series(tdi.copy(deep=True))
 
         values = ser._values
 

diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
@@ -404,15 +404,18 @@ def test_setitem_mask_smallint_no_upcast(self):
 
 
 class TestSetitemViewCopySemantics:
-    def test_setitem_invalidates_datetime_index_freq(self):
+    def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write):
         # GH#24096 altering a datetime64tz Series inplace invalidates the
         #  `freq` attribute on the underlying DatetimeIndex
 
         dti = date_range("20130101", periods=3, tz="US/Eastern")
         ts = dti[1]
         ser = Series(dti)
         assert ser._values is not dti
-        assert ser._values._ndarray.base is not dti._data._ndarray.base
+        if using_copy_on_write:
+            assert ser._values._ndarray.base is dti._data._ndarray.base
+        else:
+            assert ser._values._ndarray.base is not dti._data._ndarray.base
         assert dti.freq == "D"
         ser.iloc[1] = NaT
         assert ser._values.freq is None
@@ -423,15 +426,20 @@ def test_setitem_invalidates_datetime_index_freq(self):
         assert dti[1] == ts
         assert dti.freq == "D"
 
-    def test_dt64tz_setitem_does_not_mutate_dti(self):
+    def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write):
         # GH#21907, GH#24096
         dti = date_range("2016-01-01", periods=10, tz="US/Pacific")
         ts = dti[0]
         ser = Series(dti)
         assert ser._values is not dti
-        assert ser._values._ndarray.base is not dti._data._ndarray.base
+        if using_copy_on_write:
+            assert ser._values._ndarray.base is dti._data._ndarray.base
+            assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base
+        else:
+            assert ser._values._ndarray.base is not dti._data._ndarray.base
+            assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base
+
         assert ser._mgr.arrays[0] is not dti
-        assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base
 
         ser[::3] = NaT
         assert ser[0] is NaT

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -2056,6 +2056,14 @@ def test_series_constructor_ea_all_na(self):
         )
         tm.assert_series_equal(result, expected)
 
+    def test_series_from_index_dtype_equal_does_not_copy(self):
+        # GH#52008
+        idx = Index([1, 2, 3])
+        expected = idx.copy(deep=True)
+        ser = Series(idx, dtype="int64")
+        ser.iloc[0] = 100
+        tm.assert_index_equal(idx, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):