From 8be558eaffa1f249a171287ec8a48ed6af9ab3a5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Mar 2023 17:49:34 +0100 Subject: [PATCH 01/10] API-CoW: Copy arrays in Series constructor --- pandas/conftest.py | 2 +- pandas/core/series.py | 18 +++++++++-- .../tests/arrays/categorical/test_replace.py | 4 +-- pandas/tests/copy_view/test_astype.py | 4 ++- pandas/tests/copy_view/test_constructors.py | 30 +++++++++++++++++++ pandas/tests/internals/test_internals.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 4 +-- pandas/tests/series/test_constructors.py | 2 +- 8 files changed, 55 insertions(+), 11 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 95bb2078d151c..8f8164228d3b8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -708,7 +708,7 @@ def _create_series(index): """Helper for the _series dict""" size = len(index) data = np.random.randn(size) - return Series(data, index=index, name="a") + return Series(data, index=index, name="a", copy=False) _series = { diff --git a/pandas/core/series.py b/pandas/core/series.py index b0958869c67f3..c254ebc4bb303 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -58,6 +58,7 @@ validate_percentile, ) +from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, convert_dtypes, @@ -376,9 +377,15 @@ def __init__( index=None, dtype: Dtype | None = None, name=None, - copy: bool = False, + copy: bool | None = None, fastpath: bool = False, ) -> None: + if copy is None: + default_cow_copy = True + copy = False + else: + default_cow_copy = copy + if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None @@ -394,6 +401,11 @@ def __init__( self.name = name return + if isinstance(data, (ExtensionArray, np.ndarray)): + if default_cow_copy and not copy and using_copy_on_write(): + if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): + data = data.copy() + # we are called internally, so 
short-circuit if fastpath: # data is a ndarray, index is defined @@ -6087,7 +6099,7 @@ def _construct_result( # TODO: result should always be ArrayLike, but this fails for some # JSONArray tests dtype = getattr(result, "dtype", None) - out = self._constructor(result, index=self.index, dtype=dtype) + out = self._constructor(result, index=self.index, dtype=dtype, copy=False) out = out.__finalize__(self) # Set the result's name after __finalize__ is called because __finalize__ @@ -6106,7 +6118,7 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0 elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError("Lengths must be equal") - other = self._constructor(other, self.index) + other = self._constructor(other, self.index, copy=False) result = self._binop(other, op, level=level, fill_value=fill_value) result.name = res_name return result diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index ee9e1dbc81e12..d38f0b8719de0 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -60,7 +60,7 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # GH#26988 cat = Categorical(["a", "b"]) expected = Categorical(result) - result = pd.Series(cat).replace(to_replace, value)._values + result = pd.Series(cat, copy=False).replace(to_replace, value)._values tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged @@ -68,7 +68,7 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # ensure non-inplace call does not affect original tm.assert_categorical_equal(cat, expected) - pd.Series(cat).replace(to_replace, value, inplace=True) + pd.Series(cat, copy=False).replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) diff --git a/pandas/tests/copy_view/test_astype.py 
b/pandas/tests/copy_view/test_astype.py index 16c060d004bc7..cd444fa8a18df 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -202,7 +202,9 @@ def test_astype_arrow_timestamp(using_copy_on_write): result = df.astype("timestamp[ns][pyarrow]") if using_copy_on_write: assert not result._mgr._has_no_reference(0) - assert np.shares_memory( + # TODO(CoW): arrow is not setting copy=False in the Series constructor + # under the hood + assert not np.shares_memory( get_array(df, "a").asi8, get_array(result, "a")._pa_array ) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 6cf45c194707e..fc579b7b7b0be 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( DataFrame, Series, @@ -82,6 +83,35 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert not result._mgr.blocks[0].refs.has_reference() +@pytest.mark.parametrize("fastpath", [False, True]) +@pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) +@pytest.mark.parametrize( + "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")] +) +def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): + ser = Series(arr, dtype=dtype) + ser_orig = ser.copy() + data = getattr(arr, "_data", arr) + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), data) + else: + assert np.shares_memory(get_array(ser), data) + + arr[0] = 100 + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + else: + expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype) + tm.assert_series_equal(ser, expected) + + +def test_series_from_array_different_dtype(using_copy_on_write): + arr = np.array([1, 2, 3], dtype="int64") + ser = 
Series(arr, dtype="int64") + assert not np.shares_memory(get_array(ser), arr) + + @pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr]) @pytest.mark.parametrize("columns", [None, ["a"]]) def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 964a4c4982481..c14178a3e122e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1364,7 +1364,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool): def check_series_setitem(self, elem, index: Index, inplace: bool): arr = index._data.copy() - ser = Series(arr) + ser = Series(arr, copy=False) self.check_can_hold_element(ser, elem, inplace) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index c36831ba60b89..00d10e9e33a13 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -570,7 +570,7 @@ def test_setitem_scalar_into_readonly_backing_data(): array = np.zeros(5) array.flags.writeable = False # make the array immutable - series = Series(array) + series = Series(array, copy=False) for n in series.index: msg = "assignment destination is read-only" @@ -585,7 +585,7 @@ def test_setitem_slice_into_readonly_backing_data(): array = np.zeros(5) array.flags.writeable = False # make the array immutable - series = Series(array) + series = Series(array, copy=False) msg = "assignment destination is read-only" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index bb1926ca9bfb7..e21b1a647971f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self): # however, copy is False by default # so this WILL change values cat = Categorical(["a", 
"b", "c", "a"]) - s = Series(cat) + s = Series(cat, copy=False) assert s.values is cat s = s.cat.rename_categories([1, 2, 3]) assert s.values is not cat From 38c7f373653fb606fc4cf1a037a7e354a913e4d7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Mar 2023 17:52:15 +0100 Subject: [PATCH 02/10] Fix --- pandas/tests/copy_view/test_constructors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index fc579b7b7b0be..d11389b6fb07c 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -106,9 +106,10 @@ def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): tm.assert_series_equal(ser, expected) -def test_series_from_array_different_dtype(using_copy_on_write): +@pytest.mark.parametrize("copy", [True, False, None]) +def test_series_from_array_different_dtype(using_copy_on_write, copy): arr = np.array([1, 2, 3], dtype="int64") - ser = Series(arr, dtype="int64") + ser = Series(arr, dtype="int32", copy=copy) assert not np.shares_memory(get_array(ser), arr) From b657561e833c3591a033e757d3a56ee093901ae1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Mar 2023 20:09:29 +0100 Subject: [PATCH 03/10] Fix tests --- pandas/tests/extension/base/constructors.py | 2 +- pandas/tests/extension/base/methods.py | 2 +- pandas/tests/extension/test_sparse.py | 2 +- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/reductions/test_stat_reductions.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 29766ff392296..1f85c89ef38be 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -22,7 +22,7 @@ def test_array_from_scalars(self, data): assert isinstance(result, type(data)) def test_series_constructor(self, 
data): - result = pd.Series(data) + result = pd.Series(data, copy=False) assert result.dtype == data.dtype assert len(result) == len(data) if hasattr(result._mgr, "blocks"): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 9f556b47937f7..d3f090fab075d 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -271,7 +271,7 @@ def test_fillna_copy_frame(self, data_missing): def test_fillna_copy_series(self, data_missing): arr = data_missing.take([1, 1]) - ser = pd.Series(arr) + ser = pd.Series(arr, copy=False) ser_orig = ser.copy() filled_val = ser[0] diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index e14de81d6fbd6..bdff5d67f884a 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write): def test_fillna_copy_series(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) - ser = pd.Series(arr) + ser = pd.Series(arr, copy=False) filled_val = ser[0] result = ser.fillna(filled_val) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index fccb053e73d4b..175c2478808b9 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2821,7 +2821,7 @@ def __getitem__(self, ix): def dtype(self): return DtypeStub() - series = Series(ExtTypeStub()) + series = Series(ExtTypeStub(), copy=False) res = repr(series) # This line crashed before #33770 was fixed. 
expected = "\n".join( ["0 [False True]", "1 [ True False]", "dtype: DtypeStub"] diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index dd6aef04a2e6a..70cb173483692 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -70,7 +70,7 @@ def test_td64_mean(self, box): tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D") tdarr = tdi._data - obj = box(tdarr) + obj = box(tdarr, copy=False) result = obj.mean() expected = np.array(tdarr).mean() From 6ec58650d25bb5f3d623f33ee575db5b09199350 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Mar 2023 16:01:54 +0100 Subject: [PATCH 04/10] Address review --- pandas/core/series.py | 10 +++++----- pandas/tests/copy_view/test_constructors.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c254ebc4bb303..ed518d0de0a9f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -381,10 +381,10 @@ def __init__( fastpath: bool = False, ) -> None: if copy is None: - default_cow_copy = True - copy = False - else: - default_cow_copy = copy + if using_copy_on_write(): + copy = True + else: + copy = False if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) @@ -402,7 +402,7 @@ def __init__( return if isinstance(data, (ExtensionArray, np.ndarray)): - if default_cow_copy and not copy and using_copy_on_write(): + if copy and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): data = data.copy() diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index d11389b6fb07c..1f5f9f1b550b9 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -90,7 +90,9 @@ def test_series_from_series_with_reindex(using_copy_on_write): "arr", [np.array([1, 2, 3], dtype="int64"), 
pd.array([1, 2, 3], dtype="Int64")] ) def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): - ser = Series(arr, dtype=dtype) + if idx is None or dtype is not None: + fastpath = False + ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) ser_orig = ser.copy() data = getattr(arr, "_data", arr) if using_copy_on_write: From 48339d76de2290c400737b888e826a55c1ed4d11 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Mar 2023 16:03:52 +0100 Subject: [PATCH 05/10] Merge remote-tracking branch 'upstream/main' into cow_series_from_array # Conflicts: # pandas/tests/copy_view/test_astype.py # pandas/tests/copy_view/test_constructors.py --- ci/code_checks.sh | 2 - doc/source/development/community.rst | 14 ++- doc/source/whatsnew/v2.0.0.rst | 9 ++ pandas/_libs/internals.pyi | 3 +- pandas/_libs/internals.pyx | 7 +- pandas/_libs/lib.pyi | 19 +-- pandas/_typing.py | 6 + pandas/compat/numpy/function.py | 5 +- pandas/core/arrays/arrow/array.py | 8 +- pandas/core/arrays/datetimelike.py | 1 - pandas/core/arrays/sparse/accessor.py | 13 ++ pandas/core/dtypes/cast.py | 10 +- pandas/core/dtypes/inference.py | 15 ++- pandas/core/frame.py | 17 ++- pandas/core/generic.py | 3 +- pandas/core/indexes/api.py | 6 +- pandas/core/indexes/base.py | 7 +- pandas/core/indexes/interval.py | 3 +- pandas/core/indexes/multi.py | 7 +- pandas/core/indexes/period.py | 4 +- pandas/core/indexing.py | 4 +- pandas/core/internals/managers.py | 11 +- pandas/core/reshape/concat.py | 6 +- pandas/core/reshape/merge.py | 3 +- pandas/core/series.py | 11 +- pandas/core/window/rolling.py | 2 +- pandas/io/parquet.py | 22 ---- pandas/io/parsers/python_parser.py | 43 +++++-- pandas/io/sql.py | 3 +- pandas/tests/copy_view/util.py | 14 ++- pandas/tests/extension/test_arrow.py | 17 +++ pandas/tests/frame/methods/test_fillna.py | 3 +- pandas/tests/frame/methods/test_to_numpy.py | 15 ++- pandas/tests/frame/methods/test_transpose.py | 7 +- pandas/tests/frame/methods/test_values.py | 18 ++- 
pandas/tests/frame/test_constructors.py | 13 +- pandas/tests/indexing/test_indexing.py | 8 +- .../io/parser/test_python_parser_only.py | 69 +++++++++++ pandas/tests/io/test_clipboard.py | 16 +-- pandas/tests/io/test_parquet.py | 115 ++++++++++++------ pandas/tests/io/test_sql.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 18 ++- pandas/tests/series/test_constructors.py | 8 ++ pandas/util/_validators.py | 6 +- 44 files changed, 416 insertions(+), 177 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8fefa47c16bab..eeaa277b1ab2c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -547,8 +547,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.DataFrame.last_valid_index \ pandas.DataFrame.attrs \ pandas.DataFrame.plot \ - pandas.DataFrame.sparse.density \ - pandas.DataFrame.sparse.to_coo \ pandas.DataFrame.to_gbq \ pandas.DataFrame.style \ pandas.DataFrame.__dataframe__ diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst index c536cafce3367..9a4de3c2580ab 100644 --- a/doc/source/development/community.rst +++ b/doc/source/development/community.rst @@ -111,9 +111,11 @@ contributing to pandas. The slack is a private space, specifically meant for people who are hesitant to bring up their questions or ideas on a large public mailing list or GitHub. -If this sounds like the right place for you, you are welcome to join! Email us -at `slack@pandas.pydata.org `_ and let us -know that you read and agree to our `Code of Conduct `_ -😉 to get an invite. And please remember that slack is not meant to replace the -mailing list or issue tracker - all important announcements and conversations -should still happen there. +If this sounds like the right place for you, you are welcome to join using +`this link `_! 
+Please remember to follow our `Code of Conduct `_, +and be aware that our admins are monitoring for irrelevant messages and will remove folks who use +our +slack for spam, advertisements and messages not related to the pandas contributing community. And +please remember that slack is not meant to replace the mailing list or issue tracker - all important +announcements and conversations should still happen there. diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 55185afc0a098..cdc9cbe0d7261 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -190,6 +190,13 @@ Copy-on-Write improvements of Series objects and specifying ``copy=False``, will now use a lazy copy of those Series objects for the columns of the DataFrame (:issue:`50777`) +- The :class:`DataFrame` constructor, when constructing from a NumPy array, + will now copy the array by default to avoid mutating the :class:`DataFrame` + when mutating the array. Specify ``copy=False`` to get the old behavior. + When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write + behavior when the NumPy array is modified after creation of the + :class:`DataFrame`. + - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an warning when Copy-on-Write is enabled. 
In this mode, chained assignment can never work because we are always setting into a temporary @@ -1212,6 +1219,7 @@ Conversion - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`) +- Bug in :class:`Series` constructor not copying data when created from :class:`Index` and the ``dtype`` argument is equal to the ``dtype`` of the :class:`Index` (:issue:`52008`) Strings ^^^^^^^ @@ -1295,6 +1303,7 @@ I/O - Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`) - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`) - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`) +- Bug in :meth:`DataFrame.to_parquet` where non-string index or columns were raising a ``ValueError`` when ``engine="pyarrow"`` (:issue:`52036`) Period ^^^^^^ diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 5dfcc3726c84f..cee96801290b4 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -96,6 +96,7 @@ class BlockManager: class BlockValuesRefs: referenced_blocks: list[weakref.ref] - def __init__(self, blk: SharedBlock) -> None: ... + def __init__(self, blk: SharedBlock | None = ...) -> None: ... def add_reference(self, blk: SharedBlock) -> None: ... + def add_index_reference(self, index: object) -> None: ... def has_reference(self) -> bool: ... 
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 7323bdfc4c6d7..533727f8f2d42 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -877,8 +877,11 @@ cdef class BlockValuesRefs: cdef: public list referenced_blocks - def __cinit__(self, blk: SharedBlock) -> None: - self.referenced_blocks = [weakref.ref(blk)] + def __cinit__(self, blk: SharedBlock | None = None) -> None: + if blk is not None: + self.referenced_blocks = [weakref.ref(blk)] + else: + self.referenced_blocks = [] def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index fbc577712d294..31d4274bb5f8d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -1,6 +1,6 @@ # TODO(npdtypes): Many types specified here can be made more specific/accurate; # the more specific versions are specified in comments - +from decimal import Decimal from typing import ( Any, Callable, @@ -13,9 +13,12 @@ from typing import ( import numpy as np +from pandas._libs.interval import Interval +from pandas._libs.tslibs import Period from pandas._typing import ( ArrayLike, DtypeObj, + TypeGuard, npt, ) @@ -38,13 +41,13 @@ def infer_dtype(value: object, skipna: bool = ...) -> str: ... def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... -def is_period(val: object) -> bool: ... -def is_interval(val: object) -> bool: ... -def is_decimal(val: object) -> bool: ... -def is_complex(val: object) -> bool: ... -def is_bool(val: object) -> bool: ... -def is_integer(val: object) -> bool: ... -def is_float(val: object) -> bool: ... +def is_period(val: object) -> TypeGuard[Period]: ... +def is_interval(val: object) -> TypeGuard[Interval]: ... +def is_decimal(val: object) -> TypeGuard[Decimal]: ... +def is_complex(val: object) -> TypeGuard[complex]: ... 
+def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ... +def is_integer(val: object) -> TypeGuard[int | np.integer]: ... +def is_float(val: object) -> TypeGuard[float]: ... def is_interval_array(values: np.ndarray) -> bool: ... def is_datetime64_array(values: np.ndarray) -> bool: ... def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... diff --git a/pandas/_typing.py b/pandas/_typing.py index 3d9872c55ca2d..de02a549856ab 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -84,6 +84,11 @@ # Name "npt._ArrayLikeInt_co" is not defined [name-defined] NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] + if sys.version_info >= (3, 10): + from typing import TypeGuard + else: + from typing_extensions import TypeGuard # pyright: reportUnusedImport = false + if sys.version_info >= (3, 11): from typing import Self else: @@ -91,6 +96,7 @@ else: npt: Any = None Self: Any = None + TypeGuard: Any = None HashableT = TypeVar("HashableT", bound=Hashable) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index bdd26b315ed83..8b2916bf1ded9 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -25,6 +25,7 @@ overload, ) +import numpy as np from numpy import ndarray from pandas._libs.lib import ( @@ -215,7 +216,7 @@ def validate_clip_with_axis( ) -def validate_cum_func_with_skipna(skipna, args, kwargs, name) -> bool: +def validate_cum_func_with_skipna(skipna: bool, args, kwargs, name) -> bool: """ If this function is called via the 'numpy' library, the third parameter in its signature is 'dtype', which takes either a 'numpy' dtype or 'None', so @@ -224,6 +225,8 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name) -> bool: if not is_bool(skipna): args = (skipna,) + args skipna = True + elif isinstance(skipna, np.bool_): + skipna = bool(skipna) validate_cum_func(args, kwargs, fname=name) return skipna diff --git a/pandas/core/arrays/arrow/array.py 
b/pandas/core/arrays/arrow/array.py index c07aee737934b..551b925f42579 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -433,11 +433,15 @@ def __abs__(self) -> Self: # https://issues.apache.org/jira/browse/ARROW-10739 is addressed def __getstate__(self): state = self.__dict__.copy() - state["_data"] = self._pa_array.combine_chunks() + state["_pa_array"] = self._pa_array.combine_chunks() return state def __setstate__(self, state) -> None: - state["_pa_array"] = pa.chunked_array(state["_data"]) + if "_data" in state: + data = state.pop("_data") + else: + data = state["_pa_array"] + state["_pa_array"] = pa.chunked_array(data) self.__dict__.update(state) def _cmp_method(self, other, op): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index eb5a8a52ab6f5..7a4a6cd0269ad 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2172,7 +2172,6 @@ def validate_periods(periods: int | float | None) -> int | None: periods = int(periods) elif not lib.is_integer(periods): raise TypeError(f"periods must be a number, got {periods}") - periods = cast(int, periods) return periods diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 7980638deb438..ca1e73d3e6865 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -329,6 +329,13 @@ def to_coo(self): e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. By numpy.find_common_type convention, mixing int64 and and uint64 will result in a float64 dtype. 
+ + Examples + -------- + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) + >>> df.sparse.to_coo() + <4x1 sparse matrix of type '' + with 2 stored elements in COOrdinate format> """ import_optional_dependency("scipy") from scipy.sparse import coo_matrix @@ -357,6 +364,12 @@ def to_coo(self): def density(self) -> float: """ Ratio of non-sparse points to total (dense) data points. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) + >>> df.sparse.density + 0.5 """ tmp = np.mean([column.array.density for _, column in self._parent.items()]) return tmp diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 39ef8aad52bb7..c81ebc06ba753 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -191,15 +191,9 @@ def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType: scalar or Series """ if is_float(value): - # error: Argument 1 to "float" has incompatible type - # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]"; - # expected "Union[SupportsFloat, _SupportsIndex, str]" - value = float(value) # type: ignore[arg-type] + value = float(value) elif is_integer(value): - # error: Argument 1 to "int" has incompatible type - # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]"; - # expected "Union[str, SupportsInt, _SupportsIndex, _SupportsTrunc]" - value = int(value) # type: ignore[arg-type] + value = int(value) elif is_bool(value): value = bool(value) elif isinstance(value, (np.datetime64, np.timedelta64)): diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 28e034de869f4..af4f0a1c0aa05 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -5,12 +5,19 @@ from collections import abc from numbers import Number import re -from typing import Pattern +from typing import ( + TYPE_CHECKING, + Hashable, + Pattern, +) import numpy as np from 
pandas._libs import lib +if TYPE_CHECKING: + from pandas._typing import TypeGuard + is_bool = lib.is_bool is_integer = lib.is_integer @@ -30,7 +37,7 @@ is_iterator = lib.is_iterator -def is_number(obj) -> bool: +def is_number(obj) -> TypeGuard[Number | np.number]: """ Check if the object is a number. @@ -132,7 +139,7 @@ def is_file_like(obj) -> bool: return bool(hasattr(obj, "__iter__")) -def is_re(obj) -> bool: +def is_re(obj) -> TypeGuard[Pattern]: """ Check if the object is a regex pattern instance. @@ -325,7 +332,7 @@ def is_named_tuple(obj) -> bool: return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields") -def is_hashable(obj) -> bool: +def is_hashable(obj) -> TypeGuard[Hashable]: """ Return True if hash(obj) will succeed, False otherwise. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c2ee684751b5f..881993a6a4f5d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -696,6 +696,10 @@ def __init__( # INFO(ArrayManager) by default copy the 2D input array to get # contiguous 1D arrays copy = True + elif using_copy_on_write() and not isinstance( + data, (Index, DataFrame, Series) + ): + copy = True else: copy = False @@ -9516,11 +9520,7 @@ def melt( ) def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: if not lib.is_integer(periods): - if not ( - is_float(periods) - # error: "int" has no attribute "is_integer" - and periods.is_integer() # type: ignore[attr-defined] - ): + if not (is_float(periods) and periods.is_integer()): raise ValueError("periods must be an integer") periods = int(periods) @@ -10412,8 +10412,13 @@ def _series_round(ser: Series, decimals: int) -> Series: new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Block.round + # Argument "decimals" to "round" of "BaseBlockManager" has incompatible + # type "Union[int, integer[Any]]"; expected "int" return self._constructor( - self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()), + self._mgr.round( + 
decimals=decimals, # type: ignore[arg-type] + using_cow=using_copy_on_write(), + ), ).__finalize__(self, method="round") else: raise TypeError("decimals must be an integer, a dict-like or a Series") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 365f0f3eedc43..060197e337f41 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4096,7 +4096,8 @@ class animal locomotion loc, new_index = index._get_loc_level(key, level=0) if not drop_level: if lib.is_integer(loc): - new_index = index[loc : loc + 1] + # Slice index must be an integer or None + new_index = index[loc : loc + 1] # type: ignore[misc] else: new_index = index[loc] else: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f880e1f10106d..4070b25767912 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -70,7 +70,11 @@ def get_objs_combined_axis( - objs, intersect: bool = False, axis: Axis = 0, sort: bool = True, copy: bool = False + objs, + intersect: bool = False, + axis: Axis = 0, + sort: bool = True, + copy: bool = False, ) -> Index: """ Extract combined index: return intersection or union (depending on the diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 87fed03a73daf..684d8607a1464 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -30,6 +30,7 @@ index as libindex, lib, ) +from pandas._libs.internals import BlockValuesRefs import pandas._libs.join as libjoin from pandas._libs.lib import ( is_datetime_array, @@ -652,9 +653,11 @@ def _simple_new(cls, values: ArrayLike, name: Hashable = None, refs=None) -> Sel result._name = name result._cache = {} result._reset_identity() - result._references = refs if refs is not None: - refs.add_index_reference(result) + result._references = refs + else: + result._references = BlockValuesRefs() + result._references.add_index_reference(result) return result diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py 
index c3b7ba72b2e46..1740c5c368a94 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -399,7 +399,8 @@ def inferred_type(self) -> str: """Return a string of the type inferred from the values""" return "interval" - @Appender(Index.memory_usage.__doc__) + # Cannot determine type of "memory_usage" + @Appender(Index.memory_usage.__doc__) # type: ignore[has-type] def memory_usage(self, deep: bool = False) -> int: # we don't use an explicit engine # so return the bytes here diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0acb99fdd9071..580a1901fc2da 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1255,7 +1255,8 @@ def f(level) -> bool: return any(f(level) for level in self._inferred_type_levels) - @doc(Index.memory_usage) + # Cannot determine type of "memory_usage" + @doc(Index.memory_usage) # type: ignore[has-type] def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize @@ -2700,6 +2701,7 @@ def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left" for k, (lab, lev, level_codes) in enumerate(zipped): section = level_codes[start:end] + loc: npt.NDArray[np.intp] | np.intp | int if lab not in lev and not isna(lab): # short circuit try: @@ -2931,7 +2933,8 @@ def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True): loc, mi = self._get_loc_level(key, level=level) if not drop_level: if lib.is_integer(loc): - mi = self[loc : loc + 1] + # Slice index must be an integer or None + mi = self[loc : loc + 1] # type: ignore[misc] else: mi = self[loc] return loc, mi diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 3ac68b861a8bc..3b4a6b2e5dfde 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -303,9 +303,7 @@ def _maybe_convert_timedelta(self, other) -> int | npt.NDArray[np.int64]: raise 
raise_on_incompatible(self, other) elif is_integer(other): - # integer is passed to .shift via - # _add_datetimelike_methods basically - # but ufunc may pass integer to _add_delta + assert isinstance(other, int) return other # raise when input doesn't have freq diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 665ced8190e9f..5d8196b778788 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1565,7 +1565,7 @@ def _is_scalar_access(self, key: tuple) -> bool: return all(is_integer(k) for k in key) - def _validate_integer(self, key: int, axis: AxisInt) -> None: + def _validate_integer(self, key: int | np.integer, axis: AxisInt) -> None: """ Check that 'key' is a valid position in the desired axis. @@ -2174,7 +2174,7 @@ def _ensure_iterable_column_indexer(self, column_indexer): """ Ensure that our column indexer is something that can be iterated over. """ - ilocs: Sequence[int] | np.ndarray + ilocs: Sequence[int | np.integer] | np.ndarray if is_integer(column_indexer): ilocs = [column_indexer] elif isinstance(column_indexer, slice): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c6b770942919f..70d7920ac5bb2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -22,7 +22,10 @@ internals as libinternals, lib, ) -from pandas._libs.internals import BlockPlacement +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -1877,11 +1880,13 @@ def from_blocks( return cls(blocks[0], axes[0], verify_integrity=False) @classmethod - def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: + def from_array( + cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None + ) -> SingleBlockManager: """ Constructor for if we have an array that is not yet a Block. 
""" - block = new_block(array, placement=slice(0, len(index)), ndim=1) + block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs) return cls(block, index) def to_2d_mgr(self, columns: Index) -> BlockManager: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 650d51b896dc5..395db8060ce0e 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -391,6 +391,8 @@ class _Concatenator: Orchestrates a concatenation operation for BlockManagers """ + sort: bool + def __init__( self, objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], @@ -555,7 +557,9 @@ def __init__( raise ValueError( f"The 'sort' keyword only accepts boolean values; {sort} was passed." ) - self.sort = sort + # Incompatible types in assignment (expression has type "Union[bool, bool_]", + # variable has type "bool") + self.sort = sort # type: ignore[assignment] self.ignore_index = ignore_index self.verify_integrity = verify_integrity diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 21ce1d3c96379..d2b022214167f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2026,7 +2026,8 @@ def _get_merge_keys( elif is_float_dtype(lt): if not is_number(self.tolerance): raise MergeError(msg) - if self.tolerance < 0: + # error: Unsupported operand types for > ("int" and "Number") + if self.tolerance < 0: # type: ignore[operator] raise MergeError("tolerance must be positive") else: diff --git a/pandas/core/series.py b/pandas/core/series.py index ed518d0de0a9f..98ec026cb74c2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -441,10 +441,15 @@ def __init__( raise NotImplementedError( "initializing a Series from a MultiIndex is not supported" ) + + refs = None if isinstance(data, Index): if dtype is not None: - # astype copies - data = data.astype(dtype) + data = data.astype(dtype, copy=False) + + if using_copy_on_write(): + refs = data._references + data = data._values else: 
# GH#24096 we need to ensure the index remains immutable data = data._values.copy() @@ -508,7 +513,7 @@ def __init__( manager = get_option("mode.data_manager") if manager == "block": - data = SingleBlockManager.from_array(data, index) + data = SingleBlockManager.from_array(data, index, refs=refs) elif manager == "array": data = SingleArrayManager.from_array(data, index) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b11ff11421ed4..ed0de80e381c3 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1402,7 +1402,7 @@ def _generate_cython_apply_func( self, args: tuple[Any, ...], kwargs: dict[str, Any], - raw: bool, + raw: bool | np.bool_, function: Callable[..., Any], ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]: from pandas import Series diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a606cb9287d16..7791ca53a6447 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -21,7 +21,6 @@ import pandas as pd from pandas import ( DataFrame, - MultiIndex, get_option, ) from pandas.core.shared_docs import _shared_docs @@ -152,27 +151,6 @@ def validate_dataframe(df: DataFrame) -> None: if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") - # must have value column names for all index levels (strings only) - if isinstance(df.columns, MultiIndex): - if not all( - x.inferred_type in {"string", "empty"} for x in df.columns.levels - ): - raise ValueError( - """ - parquet must have string column names for all values in - each level of the MultiIndex - """ - ) - elif df.columns.inferred_type not in {"string", "empty"}: - raise ValueError("parquet must have string column names") - - # index level names must be strings - valid_names = all( - isinstance(name, str) for name in df.index.names if name is not None - ) - if not valid_names: - raise ValueError("Index level names must be strings") - def write(self, df: DataFrame, path, 
compression, **kwargs): raise AbstractMethodError(self) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 315d18d052d9f..0a39d7299d1bf 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -29,7 +29,11 @@ ParserError, ) -from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer, + is_numeric_dtype, +) from pandas.core.dtypes.inference import is_dict_like from pandas.io.common import ( @@ -155,12 +159,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self._col_indices = list(range(len(self.columns))) self._parse_date_cols = self._validate_parse_dates_presence(self.columns) - no_thousands_columns: set[int] | None = None - if self.parse_dates: - no_thousands_columns = self._set_noconvert_dtype_columns( - self._col_indices, self.columns - ) - self._no_thousands_columns = no_thousands_columns + self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: raise ValueError("Only length-1 decimal markers supported") @@ -889,7 +888,7 @@ def _search_replace_num_columns( if ( not isinstance(x, str) or search not in x - or (self._no_thousands_columns and i in self._no_thousands_columns) + or i in self._no_thousands_columns or not self.num.search(x.strip()) ): rl.append(x) @@ -1162,6 +1161,31 @@ def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar ] return new_rows + def _set_no_thousand_columns(self) -> set[int]: + no_thousands_columns: set[int] = set() + if self.columns and self.parse_dates: + assert self._col_indices is not None + no_thousands_columns = self._set_noconvert_dtype_columns( + self._col_indices, self.columns + ) + if self.columns and self.dtype: + assert self._col_indices is not None + for i in self._col_indices: + if not isinstance(self.dtype, dict) and not is_numeric_dtype( + self.dtype + ): + no_thousands_columns.add(i) + if ( + 
isinstance(self.dtype, dict) + and self.columns[i] in self.dtype + and ( + not is_numeric_dtype(self.dtype[self.columns[i]]) + or is_bool_dtype(self.dtype[self.columns[i]]) + ) + ): + no_thousands_columns.add(i) + return no_thousands_columns + class FixedWidthReader(abc.Iterator): """ @@ -1348,4 +1372,5 @@ def _validate_skipfooter_arg(skipfooter: int) -> int: if skipfooter < 0: raise ValueError("skipfooter cannot be negative") - return skipfooter + # Incompatible return value type (got "Union[int, integer[Any]]", expected "int") + return skipfooter # type: ignore[return-value] diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8d48d04c738e8..ec04a9ce81d92 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -44,7 +44,6 @@ from pandas.core.dtypes.common import ( is_datetime64tz_dtype, is_dict_like, - is_integer, is_list_like, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1022,7 +1021,7 @@ def insert( chunk_iter = zip(*(arr[start_i:end_i] for arr in data_list)) num_inserted = exec_insert(conn, keys, chunk_iter) # GH 46891 - if is_integer(num_inserted): + if num_inserted is not None: if total_inserted is None: total_inserted = num_inserted else: diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py index f15560f91ae01..9693344249365 100644 --- a/pandas/tests/copy_view/util.py +++ b/pandas/tests/copy_view/util.py @@ -1,4 +1,8 @@ -from pandas import Series +from pandas import ( + Categorical, + Index, + Series, +) from pandas.core.arrays import BaseMaskedArray @@ -10,7 +14,9 @@ def get_array(obj, col=None): which triggers tracking references / CoW (and we might be testing that this is done by some other operation). 
""" - if isinstance(obj, Series) and (col is None or obj.name == col): + if isinstance(obj, Index): + arr = obj._values + elif isinstance(obj, Series) and (col is None or obj.name == col): arr = obj._values else: assert col is not None @@ -19,4 +25,6 @@ def get_array(obj, col=None): arr = obj._get_column_array(icol) if isinstance(arr, BaseMaskedArray): return arr._data - return arr + elif isinstance(arr, Categorical): + return arr + return getattr(arr, "_ndarray", arr) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fcca2e4fa8e8e..2378710555340 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2351,3 +2351,20 @@ def test_concat_empty_arrow_backed_series(dtype): expected = ser.copy() result = pd.concat([ser[np.array([], dtype=np.bool_)]]) tm.assert_series_equal(result, expected) + + +# _data was renamed to _pa_data +class OldArrowExtensionArray(ArrowExtensionArray): + def __getstate__(self): + state = super().__getstate__() + state["_data"] = state.pop("_pa_array") + return state + + +def test_pickle_old_arrowextensionarray(): + data = pa.array([1]) + expected = OldArrowExtensionArray(data) + result = pickle.loads(pickle.dumps(expected)) + tm.assert_extension_array_equal(result, expected) + assert result._pa_array == pa.chunked_array(data) + assert not hasattr(result, "_data") diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index f161cf7b3c525..d80c3c0da9935 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -47,8 +47,9 @@ def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write): def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) - df = DataFrame(arr) + df = DataFrame(arr, copy=False) + # TODO(CoW): This should raise a chained assignment error df[0].fillna(-1, inplace=True) if 
using_copy_on_write: assert np.isnan(arr[:, 0]).all() diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 20e2a63cc793c..ae0eafb0bf348 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -23,16 +23,23 @@ def test_to_numpy_dtype(self): tm.assert_numpy_array_equal(result, expected) @td.skip_array_manager_invalid_test - def test_to_numpy_copy(self): + def test_to_numpy_copy(self, using_copy_on_write): arr = np.random.randn(4, 3) df = DataFrame(arr) - assert df.values.base is arr - assert df.to_numpy(copy=False).base is arr + if using_copy_on_write: + assert df.values.base is not arr + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr # we still don't want a copy when na_value=np.nan is passed, # and that can be respected because we are already numpy-float - assert df.to_numpy(copy=False, na_value=np.nan).base is arr + if using_copy_on_write: + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.to_numpy(copy=False, na_value=np.nan).base is arr def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 881af8a41f82c..8ff6ea37eae18 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -122,7 +122,7 @@ def test_transpose_get_view(self, float_frame, using_copy_on_write): assert (float_frame.values[5:10] == 5).all() @td.skip_array_manager_invalid_test - def test_transpose_get_view_dt64tzget_view(self): + def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): dti = date_range("2016-01-01", periods=6, tz="US/Pacific") arr = dti._data.reshape(3, 2) df = DataFrame(arr) @@ -132,7 +132,10 @@ def 
test_transpose_get_view_dt64tzget_view(self): assert result._mgr.nblocks == 1 rtrip = result._mgr.blocks[0].values - assert np.shares_memory(arr._ndarray, rtrip._ndarray) + if using_copy_on_write: + assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray) + else: + assert np.shares_memory(arr._ndarray, rtrip._ndarray) def test_transpose_not_inferring_dt(self): # GH#51546 diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 71f0bf6e24832..5728a849262ee 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -230,14 +230,17 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: @td.skip_array_manager_invalid_test - def test_private_values_dt64tz(self): + def test_private_values_dt64tz(self, using_copy_on_write): dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) df = DataFrame(dta, columns=["A"]) tm.assert_equal(df._values, dta) - # we have a view - assert np.shares_memory(df._values._ndarray, dta._ndarray) + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) # TimedeltaArray tda = dta - dta @@ -245,14 +248,17 @@ def test_private_values_dt64tz(self): tm.assert_equal(df2._values, tda) @td.skip_array_manager_invalid_test - def test_private_values_dt64tz_multicol(self): + def test_private_values_dt64tz_multicol(self, using_copy_on_write): dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2) df = DataFrame(dta, columns=["A", "B"]) tm.assert_equal(df._values, dta) - # we have a view - assert np.shares_memory(df._values._ndarray, dta._ndarray) + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) # TimedeltaArray tda = dta - dta 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 711eb924925d6..db7e97d672b4d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -309,14 +309,14 @@ def test_constructor_dtype_nocast_view_2d_array( def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") - df = DataFrame(arr) + df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") - df = DataFrame(arr) + df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -2107,13 +2107,18 @@ def test_constructor_frame_shallow_copy(self, float_frame): cop.index = np.arange(len(cop)) tm.assert_frame_equal(float_frame, orig) - def test_constructor_ndarray_copy(self, float_frame, using_array_manager): + def test_constructor_ndarray_copy( + self, float_frame, using_array_manager, using_copy_on_write + ): if not using_array_manager: arr = float_frame.values.copy() df = DataFrame(arr) arr[5] = 5 - assert (df.values[5] == 5).all() + if using_copy_on_write: + assert not (df.values[5] == 5).all() + else: + assert (df.values[5] == 5).all() df = DataFrame(arr, copy=True) arr[6] = 6 diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 3d45b238017ca..bf817a1db73c4 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -849,7 +849,7 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli): tz = tz_naive_fixture dti = date_range("2016-01-01", periods=3, tz=tz) - ser = Series(dti) + ser = Series(dti.copy(deep=True)) values = ser._values @@ -877,7 +877,7 @@ def 
test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo key = slice(0, 1) dti = date_range("2016-01-01", periods=3, tz=tz) - ser = Series(dti) + ser = Series(dti.copy(deep=True)) values = ser._values @@ -897,7 +897,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo def test_setitem_td64_scalar(self, indexer_sli, scalar): # dispatching _can_hold_element to underling TimedeltaArray tdi = timedelta_range("1 Day", periods=3) - ser = Series(tdi) + ser = Series(tdi.copy(deep=True)) values = ser._values values._validate_setitem_value(scalar) @@ -915,7 +915,7 @@ def test_setitem_td64_string_values(self, indexer_sli, key, box): key = slice(0, 1) tdi = timedelta_range("1 Day", periods=3) - ser = Series(tdi) + ser = Series(tdi.copy(deep=True)) values = ser._values diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index ca5a757328ba7..b22953fedd6af 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -14,6 +14,7 @@ ) from typing import Iterator +import numpy as np import pytest from pandas.errors import ( @@ -488,3 +489,71 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse ) expected = DataFrame({"a": ["a", "c", "f"]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] +) +def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype): + # GH#50270 + parser = python_parser_only + data = """\ +a;b;c +0000.7995;16.000;0 +3.03.001.00514;0;4.000 +4923.600.041;23.000;131""" + result = parser.read_csv( + StringIO(data), + sep=";", + dtype=dtype, + thousands=".", + ) + expected = DataFrame( + { + "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], + "b": [16000, 0, 23000], + "c": [0, 4000, 131], + } + ) + tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize( + "dtype,expected", + [ + ( + {"a": str, "b": np.float64, "c": np.int64}, + DataFrame( + { + "b": [16000.1, 0, 23000], + "c": [0, 4001, 131], + } + ), + ), + ( + str, + DataFrame( + { + "b": ["16,000.1", "0", "23,000"], + "c": ["0", "4,001", "131"], + } + ), + ), + ], +) +def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected): + # GH#50270 + parser = python_parser_only + data = """a;b;c +0000,7995;16,000.1;0 +3,03,001,00514;0;4,001 +4923,600,041;23,000;131 +""" + result = parser.read_csv( + StringIO(data), + sep=";", + dtype=dtype, + thousands=",", + ) + expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index baf2bcdc9386f..3bcf5b823647e 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -1,15 +1,10 @@ import os -import subprocess from textwrap import dedent import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_linux, - is_platform_mac, -) +from pandas.compat import is_ci_environment from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -406,17 +401,14 @@ def test_round_trip_valid_encodings(self, enc, df): @pytest.mark.single_cpu @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) @pytest.mark.xfail( - os.environ.get("DISPLAY") is None and not is_platform_mac(), - reason="Cannot be runed if a headless system is not put in place with Xvfb", - strict=True, + os.environ.get("DISPLAY") is None or is_ci_environment(), + reason="Cannot pass if a headless system is not put in place with Xvfb", + strict=not is_ci_environment(), # Flaky failures in the CI ) def test_raw_roundtrip(self, data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) assert data == clipboard_get() - if is_ci_environment() and 
is_platform_linux(): - # Clipboard can sometimes keep previous param causing flaky CI failures - subprocess.run(["xsel", "--delete", "--clipboard"], check=True) @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4ba3776bf6063..b55e97a4fe0ae 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine): df.columns = ["foo", "bar"] check_round_trip(df, engine) - def test_columns_dtypes_invalid(self, engine): - df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - - msg = "parquet must have string column names" - # numeric - df.columns = [0, 1] - self.check_error_on_write(df, engine, ValueError, msg) - - # bytes - df.columns = [b"foo", b"bar"] - self.check_error_on_write(df, engine, ValueError, msg) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] - self.check_error_on_write(df, engine, ValueError, msg) - @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): if compression == "snappy": @@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine): # Not able to write column multi-indexes with non-string column names. 
mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) - msg = ( - r"\s*parquet must have string column names for all values in\s*" - "each level of the MultiIndex" - ) - self.check_error_on_write(df, engine, ValueError, msg) - def test_write_column_multiindex_nonstring(self, pa): + if engine == "fastparquet": + self.check_error_on_write( + df, engine, TypeError, "Column name must be a string" + ) + elif engine == "pyarrow": + check_round_trip(df, engine) + + def test_write_column_multiindex_nonstring(self, engine): # GH #34777 - # Not supported in fastparquet as of 0.1.3 - engine = pa # Not able to write column multi-indexes with non-string column names arrays = [ @@ -546,11 +527,14 @@ def test_write_column_multiindex_nonstring(self, pa): ] df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) df.columns.names = ["Level1", "Level2"] - msg = ( - r"\s*parquet must have string column names for all values in\s*" - "each level of the MultiIndex" - ) - self.check_error_on_write(df, engine, ValueError, msg) + if engine == "fastparquet": + if Version(fastparquet.__version__) < Version("0.7.0"): + err = TypeError + else: + err = ValueError + self.check_error_on_write(df, engine, err, "Column name") + elif engine == "pyarrow": + check_round_trip(df, engine) def test_write_column_multiindex_string(self, pa): # GH #34777 @@ -579,17 +563,19 @@ def test_write_column_index_string(self, pa): check_round_trip(df, engine) - def test_write_column_index_nonstring(self, pa): + def test_write_column_index_nonstring(self, engine): # GH #34777 - # Not supported in fastparquet as of 0.1.3 - engine = pa # Write column indexes with string column names arrays = [1, 2, 3, 4] df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) df.columns.name = "NonStringCol" - msg = r"parquet must have string column names" - self.check_error_on_write(df, engine, ValueError, msg) + if engine == "fastparquet": + 
self.check_error_on_write( + df, engine, TypeError, "Column name must be a string" + ) + else: + check_round_trip(df, engine) @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") def test_dtype_backend(self, engine, request): @@ -1041,6 +1027,31 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + def test_columns_dtypes_not_invalid(self, pa): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + # numeric + df.columns = [0, 1] + check_round_trip(df, pa) + + # bytes + df.columns = [b"foo", b"bar"] + with pytest.raises(NotImplementedError, match="|S3"): + # Bytes fails on read_parquet + check_round_trip(df, pa) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + check_round_trip(df, pa) + + def test_empty_columns(self, pa): + # GH 52034 + df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) + check_round_trip(df, pa) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): @@ -1052,6 +1063,27 @@ def test_basic(self, fp, df_full): df["timedelta"] = pd.timedelta_range("1 day", periods=3) check_round_trip(df, fp) + def test_columns_dtypes_invalid(self, fp): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + err = TypeError + msg = "Column name must be a string" + + # numeric + df.columns = [0, 1] + self.check_error_on_write(df, fp, err, msg) + + # bytes + df.columns = [b"foo", b"bar"] + self.check_error_on_write(df, fp, err, msg) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + self.check_error_on_write(df, fp, err, msg) + def test_duplicate_columns(self, fp): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() @@ -1281,3 +1313,12 @@ def test_invalid_dtype_backend(self, engine): df.to_parquet(path) with 
pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") + + def test_empty_columns(self, fp): + # GH 52034 + df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) + expected = pd.DataFrame( + columns=pd.Index([], dtype=object), + index=pd.Index(["a", "b", "c"], name="custom name"), + ) + check_round_trip(df, fp, expected=expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index dc51a5b0a77fb..3d79d483038ee 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -775,7 +775,7 @@ def psql_insert_copy(table, conn, keys, data_iter): "test_frame", conn, index=False, method=psql_insert_copy ) # GH 46891 - if not isinstance(expected_count, int): + if expected_count is None: assert result_count is None else: assert result_count == expected_count diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 00d10e9e33a13..39cbf2b7bac10 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -404,7 +404,7 @@ def test_setitem_mask_smallint_no_upcast(self): class TestSetitemViewCopySemantics: - def test_setitem_invalidates_datetime_index_freq(self): + def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write): # GH#24096 altering a datetime64tz Series inplace invalidates the # `freq` attribute on the underlying DatetimeIndex @@ -412,7 +412,10 @@ def test_setitem_invalidates_datetime_index_freq(self): ts = dti[1] ser = Series(dti) assert ser._values is not dti - assert ser._values._ndarray.base is not dti._data._ndarray.base + if using_copy_on_write: + assert ser._values._ndarray.base is dti._data._ndarray.base + else: + assert ser._values._ndarray.base is not dti._data._ndarray.base assert dti.freq == "D" ser.iloc[1] = NaT assert ser._values.freq is None @@ -423,15 +426,20 @@ def test_setitem_invalidates_datetime_index_freq(self): assert dti[1] == ts assert dti.freq 
== "D" - def test_dt64tz_setitem_does_not_mutate_dti(self): + def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write): # GH#21907, GH#24096 dti = date_range("2016-01-01", periods=10, tz="US/Pacific") ts = dti[0] ser = Series(dti) assert ser._values is not dti - assert ser._values._ndarray.base is not dti._data._ndarray.base + if using_copy_on_write: + assert ser._values._ndarray.base is dti._data._ndarray.base + assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base + else: + assert ser._values._ndarray.base is not dti._data._ndarray.base + assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base + assert ser._mgr.arrays[0] is not dti - assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base ser[::3] = NaT assert ser[0] is NaT diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index e21b1a647971f..2d91a4ef6c58e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2056,6 +2056,14 @@ def test_series_constructor_ea_all_na(self): ) tm.assert_series_equal(result, expected) + def test_series_from_index_dtype_equal_does_not_copy(self): + # GH#52008 + idx = Index([1, 2, 3]) + expected = idx.copy(deep=True) + ser = Series(idx, dtype="int64") + ser.iloc[0] = 100 + tm.assert_index_equal(idx, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 17ef583febc24..f03d1ceb507fd 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -250,7 +250,7 @@ def validate_bool_kwarg( """ good_value = is_bool(value) if none_allowed: - good_value = good_value or value is None + good_value = good_value or (value is None) if int_allowed: good_value = good_value or isinstance(value, int) @@ -260,7 +260,7 @@ def validate_bool_kwarg( f'For argument "{arg_name}" expected type bool, received ' 
f"type {type(value).__name__}." ) - return value + return value # pyright: ignore[reportGeneralTypeIssues] def validate_fillna_kwargs(value, method, validate_scalar_dict_value: bool = True): @@ -438,7 +438,7 @@ def validate_insert_loc(loc: int, length: int) -> int: loc += length if not 0 <= loc <= length: raise IndexError(f"loc must be an integer between -{length} and {length}") - return loc + return loc # pyright: ignore[reportGeneralTypeIssues] def check_dtype_backend(dtype_backend) -> None: From 0f98a2d9ab8434ab64df4d422de580ca7d6a910a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Mar 2023 16:05:23 +0100 Subject: [PATCH 06/10] Update whatsnew --- doc/source/whatsnew/v2.0.0.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index cdc9cbe0d7261..4c5ffcc34102b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -190,12 +190,13 @@ Copy-on-Write improvements of Series objects and specifying ``copy=False``, will now use a lazy copy of those Series objects for the columns of the DataFrame (:issue:`50777`) -- The :class:`DataFrame` constructor, when constructing from a NumPy array, - will now copy the array by default to avoid mutating the :class:`DataFrame` +- The :class:`DataFrame` and :class:`Series` constructors, when constructing from + a NumPy array, will now copy the array by default to avoid mutating + the :class:`DataFrame` / :class:`Series` when mutating the array. Specify ``copy=False`` to get the old behavior. When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write behavior when the NumPy array is modified after creation of the - :class:`DataFrame`. + :class:`DataFrame` / :class:`Series`. - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an warning when Copy-on-Write is enabled. 
In this mode, From bfd6595cd9ddf33b0c52d92a3b920f034393bf82 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Mar 2023 16:25:04 +0100 Subject: [PATCH 07/10] Move --- pandas/core/series.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 98ec026cb74c2..1208ec01ef4e1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -380,17 +380,11 @@ def __init__( copy: bool | None = None, fastpath: bool = False, ) -> None: - if copy is None: - if using_copy_on_write(): - copy = True - else: - copy = False - if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None and dtype is None - and copy is False + and (copy is False or copy is None) ): # GH#33357 called with just the SingleBlockManager NDFrame.__init__(self, data) @@ -401,6 +395,12 @@ def __init__( self.name = name return + if copy is None: + if using_copy_on_write(): + copy = True + else: + copy = False + if isinstance(data, (ExtensionArray, np.ndarray)): if copy and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): From d99c4e924d78c30b39f7246c3a5945cc8cc28149 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 19 Mar 2023 18:47:44 +0100 Subject: [PATCH 08/10] Fix --- pandas/conftest.py | 1 + pandas/core/series.py | 9 ++------- pandas/tests/copy_view/test_constructors.py | 6 +++--- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 8f8164228d3b8..2b9da5dafdc2e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1921,6 +1921,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. 
""" + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/series.py b/pandas/core/series.py index 1208ec01ef4e1..a024212d23574 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -395,16 +395,11 @@ def __init__( self.name = name return - if copy is None: - if using_copy_on_write(): - copy = True - else: - copy = False - if isinstance(data, (ExtensionArray, np.ndarray)): - if copy and using_copy_on_write(): + if copy is not False and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): data = data.copy() + copy = False # we are called internally, so short-circuit if fastpath: diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 44d1fa184e851..408867f60da48 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -30,7 +30,7 @@ def test_series_from_series(dtype, using_copy_on_write): result = Series(ser, dtype=dtype) # the shallow copy still shares memory - assert np.shares_memory(ser.values, result.values) + assert np.shares_memory(get_array(ser), get_array(result)) if using_copy_on_write: assert result._mgr.blocks[0].refs.has_reference() @@ -40,13 +40,13 @@ def test_series_from_series(dtype, using_copy_on_write): result.iloc[0] = 0 assert ser.iloc[0] == 1 # mutating triggered a copy-on-write -> no longer shares memory - assert not np.shares_memory(ser.values, result.values) + assert not np.shares_memory(get_array(ser), get_array(result)) else: # mutating shallow copy does mutate original result.iloc[0] = 0 assert ser.iloc[0] == 0 # and still shares memory - assert np.shares_memory(ser.values, result.values) + assert np.shares_memory(get_array(ser), get_array(result)) # the same when modifying the parent result = Series(ser, dtype=dtype) From 0e2c11ec9337c1c8d3092bebc56b8a7604131089 Mon Sep 17 00:00:00 2001 From: 
Patrick Hoefler Date: Sun, 19 Mar 2023 18:48:12 +0100 Subject: [PATCH 09/10] Fix --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2b9da5dafdc2e..8f8164228d3b8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1921,7 +1921,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From 3e7151a41efb7ce30ca99f0c9f8e55d62b43c6ed Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Mar 2023 19:18:06 +0100 Subject: [PATCH 10/10] Update series.py --- pandas/core/series.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a024212d23574..7bb0e89c0de26 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -399,7 +399,8 @@ def __init__( if copy is not False and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): data = data.copy() - copy = False + if copy is None: + copy = False # we are called internally, so short-circuit if fastpath: