Skip to content

Commit 7e88122

Browse files
authored
ENH: Use lazy copy in infer objects (#50428)
1 parent 0d842ea commit 7e88122

File tree

6 files changed

+106
-11
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ Copy-on-Write improvements
223223
- :meth:`DataFrame.to_period` / :meth:`Series.to_period`
224224
- :meth:`DataFrame.truncate`
225225
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
226+
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
226227
- :func:`concat`
227228

228229
These methods return views when Copy-on-Write is enabled, which provides a significant

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6426,7 +6426,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
64266426
return self.copy(deep=True)
64276427

64286428
@final
6429-
def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT:
6429+
def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT:
64306430
"""
64316431
Attempt to infer better dtypes for object columns.
64326432

pandas/core/internals/array_manager.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
369369
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
370370
return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)
371371

372-
def convert(self: T, copy: bool) -> T:
372+
def convert(self: T, copy: bool | None) -> T:
373+
if copy is None:
374+
copy = True
375+
373376
def _convert(arr):
374377
if is_object_dtype(arr.dtype):
375378
# extract PandasArray for tests that patch PandasArray._typ

pandas/core/internals/blocks.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,9 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None:
207207
self._mgr_locs = new_mgr_locs
208208

209209
@final
210-
def make_block(self, values, placement=None) -> Block:
210+
def make_block(
211+
self, values, placement=None, refs: BlockValuesRefs | None = None
212+
) -> Block:
211213
"""
212214
Create a new block, with type inference propagate any values that are
213215
not specified
@@ -219,7 +221,7 @@ def make_block(self, values, placement=None) -> Block:
219221

220222
# TODO: perf by not going through new_block
221223
# We assume maybe_coerce_values has already been called
222-
return new_block(values, placement=placement, ndim=self.ndim)
224+
return new_block(values, placement=placement, ndim=self.ndim, refs=refs)
223225

224226
@final
225227
def make_block_same_class(
@@ -372,7 +374,7 @@ def _split(self) -> list[Block]:
372374
vals = self.values[slice(i, i + 1)]
373375

374376
bp = BlockPlacement(ref_loc)
375-
nb = type(self)(vals, placement=bp, ndim=2)
377+
nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs)
376378
new_blocks.append(nb)
377379
return new_blocks
378380

@@ -448,12 +450,15 @@ def convert(
448450
self,
449451
*,
450452
copy: bool = True,
453+
using_cow: bool = False,
451454
) -> list[Block]:
452455
"""
453456
attempt to coerce any object types to better types return a copy
454457
of the block (if copy = True) by definition we are not an ObjectBlock
455458
here!
456459
"""
460+
if not copy and using_cow:
461+
return [self.copy(deep=False)]
457462
return [self.copy()] if copy else [self]
458463

459464
# ---------------------------------------------------------------------
@@ -2040,6 +2045,7 @@ def convert(
20402045
self,
20412046
*,
20422047
copy: bool = True,
2048+
using_cow: bool = False,
20432049
) -> list[Block]:
20442050
"""
20452051
attempt to cast any object types to better types return a copy of
@@ -2048,6 +2054,8 @@ def convert(
20482054
if self.dtype != _dtype_obj:
20492055
# GH#50067 this should be impossible in ObjectBlock, but until
20502056
# that is fixed, we short-circuit here.
2057+
if using_cow:
2058+
return [self.copy(deep=False)]
20512059
return [self]
20522060

20532061
values = self.values
@@ -2063,10 +2071,14 @@ def convert(
20632071
convert_period=True,
20642072
convert_interval=True,
20652073
)
2074+
refs = None
20662075
if copy and res_values is values:
20672076
res_values = values.copy()
2077+
elif res_values is values and using_cow:
2078+
refs = self.refs
2079+
20682080
res_values = ensure_block_shape(res_values, self.ndim)
2069-
return [self.make_block(res_values)]
2081+
return [self.make_block(res_values, refs=refs)]
20702082

20712083

20722084
# -----------------------------------------------------------------

pandas/core/internals/managers.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -424,11 +424,14 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
424424
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
425425
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
426426

427-
def convert(self: T, copy: bool) -> T:
428-
return self.apply(
429-
"convert",
430-
copy=copy,
431-
)
427+
def convert(self: T, copy: bool | None) -> T:
428+
if copy is None:
429+
if using_copy_on_write():
430+
copy = False
431+
else:
432+
copy = True
433+
434+
return self.apply("convert", copy=copy, using_cow=using_copy_on_write())
432435

433436
def replace(self: T, to_replace, value, inplace: bool) -> T:
434437
inplace = validate_bool_kwarg(inplace, "inplace")

pandas/tests/copy_view/test_methods.py

+76
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,82 @@ def test_head_tail(method, using_copy_on_write):
748748
tm.assert_frame_equal(df, df_orig)
749749

750750

751+
def test_infer_objects(using_copy_on_write):
752+
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
753+
df_orig = df.copy()
754+
df2 = df.infer_objects()
755+
756+
if using_copy_on_write:
757+
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
758+
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
759+
760+
else:
761+
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
762+
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
763+
764+
df2.iloc[0, 0] = 0
765+
df2.iloc[0, 1] = "d"
766+
if using_copy_on_write:
767+
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
768+
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
769+
tm.assert_frame_equal(df, df_orig)
770+
771+
772+
def test_infer_objects_no_reference(using_copy_on_write):
773+
df = DataFrame(
774+
{
775+
"a": [1, 2],
776+
"b": "c",
777+
"c": 1,
778+
"d": Series(
779+
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
780+
),
781+
"e": "b",
782+
}
783+
)
784+
df = df.infer_objects()
785+
786+
arr_a = get_array(df, "a")
787+
arr_b = get_array(df, "b")
788+
arr_d = get_array(df, "d")
789+
790+
df.iloc[0, 0] = 0
791+
df.iloc[0, 1] = "d"
792+
df.iloc[0, 3] = Timestamp("2018-12-31")
793+
if using_copy_on_write:
794+
assert np.shares_memory(arr_a, get_array(df, "a"))
795+
# TODO(CoW): Block splitting causes references here
796+
assert not np.shares_memory(arr_b, get_array(df, "b"))
797+
assert np.shares_memory(arr_d, get_array(df, "d"))
798+
799+
800+
def test_infer_objects_reference(using_copy_on_write):
801+
df = DataFrame(
802+
{
803+
"a": [1, 2],
804+
"b": "c",
805+
"c": 1,
806+
"d": Series(
807+
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
808+
),
809+
}
810+
)
811+
view = df[:] # noqa: F841
812+
df = df.infer_objects()
813+
814+
arr_a = get_array(df, "a")
815+
arr_b = get_array(df, "b")
816+
arr_d = get_array(df, "d")
817+
818+
df.iloc[0, 0] = 0
819+
df.iloc[0, 1] = "d"
820+
df.iloc[0, 3] = Timestamp("2018-12-31")
821+
if using_copy_on_write:
822+
assert not np.shares_memory(arr_a, get_array(df, "a"))
823+
assert not np.shares_memory(arr_b, get_array(df, "b"))
824+
assert np.shares_memory(arr_d, get_array(df, "d"))
825+
826+
751827
@pytest.mark.parametrize(
752828
"kwargs",
753829
[

0 commit comments

Comments
 (0)