Skip to content

ENH: Use lazy copy in infer objects #50428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Feb 8, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3c4cee3
ENH: Use lazy copy in infer objects
phofl Dec 24, 2022
db5dba1
Update test_methods.py
phofl Dec 24, 2022
9eb4d85
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Jan 8, 2023
081dd29
Convert copy
phofl Jan 8, 2023
76c443b
Convert copy
phofl Jan 8, 2023
f7724ff
Remove setting
phofl Jan 8, 2023
a7b4e27
Fix
phofl Jan 8, 2023
1d4f726
Fix typing
phofl Jan 8, 2023
018cfe6
Revert unrelated change
phofl Jan 8, 2023
1696d8a
Fix
phofl Jan 8, 2023
1782fbd
Fix array manager
phofl Jan 8, 2023
8cb6355
Fix type
phofl Jan 8, 2023
cead228
Merge branch 'main' into cow_infer_objects
phofl Jan 12, 2023
47d85b3
Fix copy array manager
phofl Jan 13, 2023
a3d0a2b
Fix manager
phofl Jan 13, 2023
2e2ed0f
Merge remote-tracking branch 'upstream/main' into cow_inf_obj
phofl Jan 13, 2023
3a84382
Merge branch 'main' into cow_infer_objects
phofl Jan 17, 2023
64d550b
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Jan 27, 2023
716cef8
Refactor
phofl Jan 27, 2023
f693829
Fix mypy
phofl Jan 31, 2023
97fa214
Move to cython
phofl Jan 31, 2023
c3e4e66
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Feb 7, 2023
5d687dd
Merge CoW ref tracking
phofl Feb 7, 2023
f5bf65f
Refactor for new ref tracking logic
phofl Feb 7, 2023
af8af95
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Feb 8, 2023
f47f6dd
Fix array manager diff
phofl Feb 8, 2023
e357b64
Fix merge conflicts
phofl Feb 8, 2023
bf1bb3b
Add todo
phofl Feb 8, 2023
5624983
Add test
phofl Feb 8, 2023
9a6d516
Adjust test
phofl Feb 8, 2023
8b0e2b0
Adjust test
phofl Feb 8, 2023
3f832ba
Adjust test
phofl Feb 8, 2023
9f11f0a
Fixup
phofl Feb 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ Copy-on-Write improvements
- :meth:`DataFrame.to_period` / :meth:`Series.to_period`
- :meth:`DataFrame.truncate`
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
- :func:`concat`

These methods return views when Copy-on-Write is enabled, which provides a significant
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6426,7 +6426,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
return self.copy(deep=True)

@final
def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT:
def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT:
"""
Attempt to infer better dtypes for object columns.

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)

def convert(self: T, copy: bool) -> T:
def convert(self: T, copy: bool | None) -> T:
if copy is None:
copy = True

def _convert(arr):
if is_object_dtype(arr.dtype):
# extract PandasArray for tests that patch PandasArray._typ
Expand Down
20 changes: 16 additions & 4 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None:
self._mgr_locs = new_mgr_locs

@final
def make_block(self, values, placement=None) -> Block:
def make_block(
self, values, placement=None, refs: BlockValuesRefs | None = None
) -> Block:
"""
Create a new block, with type inference propagate any values that are
not specified
Expand All @@ -219,7 +221,7 @@ def make_block(self, values, placement=None) -> Block:

# TODO: perf by not going through new_block
# We assume maybe_coerce_values has already been called
return new_block(values, placement=placement, ndim=self.ndim)
return new_block(values, placement=placement, ndim=self.ndim, refs=refs)

@final
def make_block_same_class(
Expand Down Expand Up @@ -372,7 +374,7 @@ def _split(self) -> list[Block]:
vals = self.values[slice(i, i + 1)]

bp = BlockPlacement(ref_loc)
nb = type(self)(vals, placement=bp, ndim=2)
nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs)
new_blocks.append(nb)
return new_blocks

Expand Down Expand Up @@ -448,12 +450,15 @@ def convert(
self,
*,
copy: bool = True,
using_cow: bool = False,
) -> list[Block]:
"""
attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
"""
if not copy and using_cow:
return [self.copy(deep=False)]
return [self.copy()] if copy else [self]

# ---------------------------------------------------------------------
Expand Down Expand Up @@ -2040,6 +2045,7 @@ def convert(
self,
*,
copy: bool = True,
using_cow: bool = False,
) -> list[Block]:
"""
attempt to cast any object types to better types return a copy of
Expand All @@ -2048,6 +2054,8 @@ def convert(
if self.dtype != _dtype_obj:
# GH#50067 this should be impossible in ObjectBlock, but until
# that is fixed, we short-circuit here.
if using_cow:
return [self.copy(deep=False)]
return [self]

values = self.values
Expand All @@ -2063,10 +2071,14 @@ def convert(
convert_period=True,
convert_interval=True,
)
refs = None
if copy and res_values is values:
res_values = values.copy()
elif res_values is values and using_cow:
refs = self.refs

res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values)]
return [self.make_block(res_values, refs=refs)]


# -----------------------------------------------------------------
Expand Down
13 changes: 8 additions & 5 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,11 +424,14 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

def convert(self: T, copy: bool) -> T:
return self.apply(
"convert",
copy=copy,
)
def convert(self: T, copy: bool | None) -> T:
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

return self.apply("convert", copy=copy, using_cow=using_copy_on_write())

def replace(self: T, to_replace, value, inplace: bool) -> T:
inplace = validate_bool_kwarg(inplace, "inplace")
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,57 @@ def test_head_tail(method, using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


def test_infer_objects(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth adding a test where one of the object dtype columns actually gets converted? (eg change d to "d": np.array([0, 1], dtype=object)) To ensure this column owns its memory (and cover that part of the code additions)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, adjusted d in the two tests below to cover both cases.

Good to go apart from that? convert needs cow for a couple of other things

df_orig = df.copy()
df2 = df.infer_objects()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 0] = 0
df2.iloc[0, 1] = "d"
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)


def test_infer_objects_no_reference(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
df = df.infer_objects()

arr_a = get_array(df, "a")
arr_b = get_array(df, "b")

df.iloc[0, 0] = 0
df.iloc[0, 1] = "d"
if using_copy_on_write:
assert np.shares_memory(arr_a, get_array(df, "a"))
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))


def test_infer_objects_reference(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
view = df[:] # noqa: F841
df = df.infer_objects()

arr_a = get_array(df, "a")
arr_b = get_array(df, "b")

df.iloc[0, 0] = 0
df.iloc[0, 1] = "d"
if using_copy_on_write:
assert not np.shares_memory(arr_a, get_array(df, "a"))
assert not np.shares_memory(arr_b, get_array(df, "b"))


@pytest.mark.parametrize(
"kwargs",
[
Expand Down