diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7d028935ad175..70c60401f29fb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -223,6 +223,7 @@ Copy-on-Write improvements - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` + - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :func:`concat` These methods return views when Copy-on-Write is enabled, which provides a significant diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ad291c39cbc5..eae4ed038d692 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6426,7 +6426,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT: return self.copy(deep=True) @final - def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT: + def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT: """ Attempt to infer better dtypes for object columns. diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 76da973e110bf..2823c355955ee 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -369,7 +369,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) - def convert(self: T, copy: bool) -> T: + def convert(self: T, copy: bool | None) -> T: + if copy is None: + copy = True + def _convert(arr): if is_object_dtype(arr.dtype): # extract PandasArray for tests that patch PandasArray._typ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 15abc143cd081..115ae5dc6bb9d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -207,7 +207,9 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None: self._mgr_locs = new_mgr_locs @final - def make_block(self, values, placement=None) -> Block: + def make_block( + self, values, placement=None, refs: BlockValuesRefs | None = None + ) -> Block: """ Create a new block, with type inference propagate any values that are not specified @@ -219,7 +221,7 @@ def make_block(self, values, placement=None) -> Block: # TODO: perf by not going through new_block # We assume maybe_coerce_values has already been called - return new_block(values, placement=placement, ndim=self.ndim) + return new_block(values, placement=placement, ndim=self.ndim, refs=refs) @final def make_block_same_class( @@ -372,7 +374,7 @@ def _split(self) -> list[Block]: vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) - nb = type(self)(vals, placement=bp, ndim=2) + nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) new_blocks.append(nb) return new_blocks @@ -448,12 +450,15 @@ def convert( self, *, copy: bool = True, + using_cow: bool = False, ) -> list[Block]: """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! """ + if not copy and using_cow: + return [self.copy(deep=False)] return [self.copy()] if copy else [self] # --------------------------------------------------------------------- @@ -2040,6 +2045,7 @@ def convert( self, *, copy: bool = True, + using_cow: bool = False, ) -> list[Block]: """ attempt to cast any object types to better types return a copy of @@ -2048,6 +2054,8 @@ def convert( if self.dtype != _dtype_obj: # GH#50067 this should be impossible in ObjectBlock, but until # that is fixed, we short-circuit here. + if using_cow: + return [self.copy(deep=False)] return [self] values = self.values @@ -2063,10 +2071,14 @@ def convert( convert_period=True, convert_interval=True, ) + refs = None if copy and res_values is values: res_values = values.copy() + elif res_values is values and using_cow: + refs = self.refs + res_values = ensure_block_shape(res_values, self.ndim) - return [self.make_block(res_values)] + return [self.make_block(res_values, refs=refs)] # ----------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 74116dd855e3e..5d45b33871900 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -424,11 +424,14 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: return self.apply("astype", dtype=dtype, copy=copy, errors=errors) - def convert(self: T, copy: bool) -> T: - return self.apply( - "convert", - copy=copy, - ) + def convert(self: T, copy: bool | None) -> T: + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + + return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) def replace(self: T, to_replace, value, inplace: bool) -> T: inplace = validate_bool_kwarg(inplace, "inplace") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index b814b9089aabd..6b54345723118 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -748,6 +748,82 @@ def test_head_tail(method, using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def test_infer_objects(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) + df_orig = df.copy() + df2 = df.infer_objects() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + df2.iloc[0, 0] = 0 + df2.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + +def test_infer_objects_no_reference(using_copy_on_write): + df = DataFrame( + { + "a": [1, 2], + "b": "c", + "c": 1, + "d": Series( + [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" + ), + "e": "b", + } + ) + df = df.infer_objects() + + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + arr_d = get_array(df, "d") + + df.iloc[0, 0] = 0 + df.iloc[0, 1] = "d" + df.iloc[0, 3] = Timestamp("2018-12-31") + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + # TODO(CoW): Block splitting causes references here + assert not np.shares_memory(arr_b, get_array(df, "b")) + assert np.shares_memory(arr_d, get_array(df, "d")) + + +def test_infer_objects_reference(using_copy_on_write): + df = DataFrame( + { + "a": [1, 2], + "b": "c", + "c": 1, + "d": Series( + [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" + ), + } + ) + view = df[:] # noqa: F841 + df = df.infer_objects() + + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + arr_d = get_array(df, "d") + + df.iloc[0, 0] = 0 + df.iloc[0, 1] = "d" + df.iloc[0, 3] = Timestamp("2018-12-31") + if using_copy_on_write: + assert not np.shares_memory(arr_a, get_array(df, "a")) + assert not np.shares_memory(arr_b, get_array(df, "b")) + assert np.shares_memory(arr_d, get_array(df, "d")) + + @pytest.mark.parametrize( "kwargs", [