From a644b1a87057e5817cbc8ba53d33404278d65c9e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 23:36:35 +0100 Subject: [PATCH 1/3] ENH: Implement CoW for convert_dtypes --- pandas/core/generic.py | 2 +- pandas/core/series.py | 4 ++-- pandas/tests/copy_view/test_astype.py | 24 ++++++++++++++++++++++++ pandas/tests/copy_view/util.py | 5 +++-- 4 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/copy_view/test_astype.py diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e4f7bb43b23dc..56f5187cff140 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6642,7 +6642,7 @@ def convert_dtypes( # https://github.com/python/mypy/issues/8354 return cast(NDFrameT, result) else: - return self.copy() + return self.copy(deep=None) # ---------------------------------------------------------------------- # Filling NA's diff --git a/pandas/core/series.py b/pandas/core/series.py index 80dd0dd19f96f..a88dd224068ac 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5468,7 +5468,7 @@ def _convert_dtypes( if infer_objects: input_series = input_series.infer_objects() if is_object_dtype(input_series): - input_series = input_series.copy() + input_series = input_series.copy(deep=None) if convert_string or convert_integer or convert_boolean or convert_floating: dtype_backend = get_option("mode.dtype_backend") @@ -5483,7 +5483,7 @@ def _convert_dtypes( ) result = input_series.astype(inferred_dtype) else: - result = input_series.copy() + result = input_series.copy(deep=None) return result # error: Cannot determine type of 'isna' diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py new file mode 100644 index 0000000000000..4bbc83b1bd650 --- /dev/null +++ b/pandas/tests/copy_view/test_astype.py @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import Series +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def 
test_convert_dtypes_infer_objects(using_copy_on_write): + ser = Series(["a", "b", "c"]) + ser_orig = ser.copy() + result = ser.convert_dtypes( + convert_integer=False, + convert_boolean=False, + convert_floating=False, + convert_string=False, + ) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not np.shares_memory(get_array(ser), get_array(result)) + + result.iloc[0] = "x" + tm.assert_series_equal(ser, ser_orig) diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py index b5f4f74fc7e5e..8e53c1644b0a2 100644 --- a/pandas/tests/copy_view/util.py +++ b/pandas/tests/copy_view/util.py @@ -2,7 +2,7 @@ from pandas.core.arrays import BaseMaskedArray -def get_array(obj, col): +def get_array(obj, col=None): """ Helper method to get array for a DataFrame column or a Series. @@ -10,8 +10,9 @@ def get_array(obj, col): which triggers tracking references / CoW (and we might be testing that this is done by some other operation). 
""" - if isinstance(obj, Series) and obj.name == col: + if isinstance(obj, Series) and (col is None or obj.name == col): return obj._values + assert col is not None icol = obj.columns.get_loc(col) assert isinstance(icol, int) arr = obj._get_column_array(icol) From a9c91264c150df046382f26693de8e49de4b25f8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 17:39:21 +0100 Subject: [PATCH 2/3] Integrate astype change --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/tests/copy_view/test_astype.py | 39 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b10f201e79318..a3a0bce6e5e37 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -225,6 +225,7 @@ Copy-on-Write improvements - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :meth:`DataFrame.astype` / :meth:`Series.astype` + - :meth:`DataFrame.infer_dtypes` / :meth:`Series.infer_dtypes` - :func:`concat` These methods return views when Copy-on-Write is enabled, which provides a significant diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index a485275a28ac4..73343976e92fb 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -193,3 +193,42 @@ def test_astype_arrow_timestamp(using_copy_on_write): if using_copy_on_write: assert not result._mgr._has_no_reference(0) assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data) + + +def test_convert_dtypes_infer_objects(using_copy_on_write): + ser = Series(["a", "b", "c"]) + ser_orig = ser.copy() + result = ser.convert_dtypes( + convert_integer=False, + convert_boolean=False, + convert_floating=False, + convert_string=False, + ) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not 
np.shares_memory(get_array(ser), get_array(result)) + + result.iloc[0] = "x" + tm.assert_series_equal(ser, ser_orig) + + +def test_convert_dtypes(using_copy_on_write): + df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) + df_orig = df.copy() + df2 = df.convert_dtypes() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d")) + + df2.iloc[0, 0] = "x" + tm.assert_frame_equal(df, df_orig) From 02cd055027f8fbdb037a743fcb95bd83eedf5244 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Feb 2023 09:00:00 +0100 Subject: [PATCH 3/3] Update doc/source/whatsnew/v2.0.0.rst --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a3a0bce6e5e37..850391522dbff 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -225,7 +225,7 @@ Copy-on-Write improvements - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :meth:`DataFrame.astype` / :meth:`Series.astype` - - :meth:`DataFrame.infer_dtypes` / :meth:`Series.infer_dtypes` + - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes` - :func:`concat` These methods return views when Copy-on-Write is enabled, which provides a significant