Skip to content

REF: implement NDFrame._from_mgr #52132

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jun 25, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ def apply(self) -> DataFrame | Series:
with np.errstate(all="ignore"):
results = self.obj._mgr.apply("apply", func=self.f)
# _constructor will retain self.index and self.columns
return self.obj._constructor(data=results)
return self.obj._constructor_from_mgr(results, axes=results.axes)

# broadcasting
if self.result_type == "broadcast":
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arraylike.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def _reconstruct(result):
return result
if isinstance(result, BlockManager):
# we went through BlockManager.apply e.g. np.sqrt
result = self._constructor(result, **reconstruct_kwargs, copy=False)
result = self._constructor_from_mgr(result, axes=result.axes)
else:
# we converted an array, lost our axes
result = self._constructor(
Expand Down
71 changes: 49 additions & 22 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,30 @@ class DataFrame(NDFrame, OpsMixin):
def _constructor(self) -> Callable[..., DataFrame]:
return DataFrame

def _constructor_from_mgr(self, mgr, axes):
if self._constructor is DataFrame:
# we are pandas.DataFrame (or a subclass that doesn't override _constructor)
return self._from_mgr(mgr, axes=axes)
else:
assert axes is mgr.axes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this assert needed here? (it's also not done in _from_mgr)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not strictly. ATM _from_mgr is documented as requiring them to match, so this assertion seemed like an easy way of making it required along this path too. could remove the assertion and document the requirement in the docstring

return self._constructor(mgr)

_constructor_sliced: Callable[..., Series] = Series

def _sliced_from_mgr(self, mgr, axes) -> Series:
# https://github.com/pandas-dev/pandas/pull/52132#issuecomment-1481491828
# This is a short-term implementation that will be replaced
# with self._constructor_sliced._from_mgr(...)
# once downstream packages (geopandas) have had a chance to implement
# their own overrides.
return self._constructor_sliced(mgr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be something like Series._from_mgr(mgr) or self._constructor_sliced._from_mgr(mgr) ?
Because now this is calling the same as the fall-back in _constructor_sliced_from_mgr, and thus the if block there is not doing anything different.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. i think this is leftover from a previous round of editing. will update.


def _constructor_sliced_from_mgr(self, mgr, axes):
if self._constructor_sliced is Series:
return self._sliced_from_mgr(mgr, axes)
assert axes is mgr.axes
return self._constructor_sliced(mgr)

# ----------------------------------------------------------------------
# Constructors

Expand Down Expand Up @@ -3667,9 +3689,9 @@ def _ixs(self, i: int, axis: AxisInt = 0) -> Series:

# if we are a copy, mark as such
copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
self
)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[i]
result = result.__finalize__(self)
result._set_is_copy(self, copy=copy)
return result

Expand Down Expand Up @@ -3722,7 +3744,7 @@ def _getitem_nocopy(self, key: list):
copy=False,
only_slice=True,
)
return self._constructor(new_mgr)
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)

def __getitem__(self, key):
check_dict_or_set_indexers(key)
Expand Down Expand Up @@ -4261,9 +4283,10 @@ def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
# Lookup in columns so that if e.g. a str datetime was passed
# we attach the Timestamp object as the name.
name = self.columns[loc]
klass = self._constructor_sliced
# We get index=self.index bc values is a SingleDataManager
return klass(values, name=name, fastpath=True).__finalize__(self)
obj = self._constructor_sliced_from_mgr(values, axes=values.axes)
obj._name = name
return obj.__finalize__(self)

# ----------------------------------------------------------------------
# Lookup Caching
Expand Down Expand Up @@ -4737,7 +4760,7 @@ def predicate(arr: ArrayLike) -> bool:
return True

mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
return type(self)(mgr).__finalize__(self)
return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self)

def insert(
self,
Expand Down Expand Up @@ -5551,7 +5574,7 @@ def shift(
fill_value=fill_value,
allow_dups=True,
)
res_df = self._constructor(mgr)
res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
return res_df.__finalize__(self, method="shift")

return super().shift(
Expand Down Expand Up @@ -6079,7 +6102,8 @@ class max type

@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
def isna(self) -> DataFrame:
result = self._constructor(self._mgr.isna(func=isna))
res_mgr = self._mgr.isna(func=isna)
result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
return result.__finalize__(self, method="isna")

@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
Expand Down Expand Up @@ -6791,7 +6815,7 @@ def sort_values(
self._get_block_manager_axis(axis), default_index(len(indexer))
)

result = self._constructor(new_data)
result = self._constructor_from_mgr(new_data, axes=new_data.axes)
if inplace:
return self._update_inplace(result)
else:
Expand Down Expand Up @@ -7485,7 +7509,7 @@ def _dispatch_frame_op(
if not is_list_like(right):
# i.e. scalar, faster than checking np.ndim(right) == 0
bm = self._mgr.apply(array_op, right=right)
return self._constructor(bm)
return self._constructor_from_mgr(bm, axes=bm.axes)

elif isinstance(right, DataFrame):
assert self.index.equals(right.index)
Expand All @@ -7505,7 +7529,7 @@ def _dispatch_frame_op(
right._mgr, # type: ignore[arg-type]
array_op,
)
return self._constructor(bm)
return self._constructor_from_mgr(bm, axes=bm.axes)

elif isinstance(right, Series) and axis == 1:
# axis=1 means we want to operate row-by-row
Expand Down Expand Up @@ -9474,7 +9498,9 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
axis = 0

new_data = self._mgr.diff(n=periods, axis=axis)
return self._constructor(new_data).__finalize__(self, "diff")
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, "diff"
)

# ----------------------------------------------------------------------
# Function application
Expand Down Expand Up @@ -10330,12 +10356,13 @@ def _series_round(ser: Series, decimals: int) -> Series:
# Dispatch to Block.round
# Argument "decimals" to "round" of "BaseBlockManager" has incompatible
# type "Union[int, integer[Any]]"; expected "int"
return self._constructor(
self._mgr.round(
decimals=decimals, # type: ignore[arg-type]
using_cow=using_copy_on_write(),
),
).__finalize__(self, method="round")
new_mgr = self._mgr.round(
decimals=decimals, # type: ignore[arg-type]
using_cow=using_copy_on_write(),
)
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
self, method="round"
)
else:
raise TypeError("decimals must be an integer, a dict-like or a Series")

Expand Down Expand Up @@ -10888,7 +10915,7 @@ def _get_data() -> DataFrame:
# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
elif (df._mgr.get_dtypes() == object).any():
Expand Down Expand Up @@ -11502,7 +11529,7 @@ def quantile(
res = data._mgr.take(indexer[q_idx], verify=False)
res.axes[1] = q

result = self._constructor(res)
result = self._constructor_from_mgr(res, axes=res.axes)
return result.__finalize__(self, method="quantile")

def to_timestamp(
Expand Down Expand Up @@ -11824,7 +11851,7 @@ def _to_dict_of_blocks(self, copy: bool = True):
mgr = mgr_to_mgr(mgr, "block")
mgr = cast(BlockManager, mgr)
return {
k: self._constructor(v).__finalize__(self)
k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
for k, v, in mgr.to_dict(copy=copy).items()
}

Expand Down
Loading