Skip to content

REF: simplify _from_mgr constructors, use in more places #53871

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 16 additions & 13 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,23 +638,28 @@ def _constructor(self) -> Callable[..., DataFrame]:
return DataFrame

def _constructor_from_mgr(self, mgr, axes):
if self._constructor is DataFrame:
# we are pandas.DataFrame (or a subclass that doesn't override _constructor)
return self._from_mgr(mgr, axes=axes)
df = self._from_mgr(mgr, axes=axes)

if type(self) is DataFrame:
# fastpath avoiding constructor call
return df
else:
assert axes is mgr.axes
return self._constructor(mgr)
return self._constructor(df, copy=False)

_constructor_sliced: Callable[..., Series] = Series

def _sliced_from_mgr(self, mgr, axes) -> Series:
return Series._from_mgr(mgr, axes)

def _constructor_sliced_from_mgr(self, mgr, axes):
if self._constructor_sliced is Series:
return self._sliced_from_mgr(mgr, axes)
ser = self._sliced_from_mgr(mgr, axes=axes)
ser._name = None # caller is responsible for setting real name
if type(self) is DataFrame:
# fastpath avoiding constructor call
return ser
assert axes is mgr.axes
return self._constructor_sliced(mgr)
return self._constructor_sliced(ser, copy=False)

# ----------------------------------------------------------------------
# Constructors
Expand Down Expand Up @@ -11763,12 +11768,10 @@ def isin_(x):
)
return result.reshape(x.shape)

res_values = self._mgr.apply(isin_)
result = self._constructor(
res_values,
self.index,
self.columns,
copy=False,
res_mgr = self._mgr.apply(isin_)
result = self._constructor_from_mgr(
res_mgr,
axes=res_mgr.axes,
)
return result.__finalize__(self, method="isin")

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor(new_mgr).__finalize__(self)
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)

@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
Expand Down Expand Up @@ -6919,7 +6919,7 @@ def _fillna_with_method(
inplace=inplace,
downcast=downcast,
)
result = self._constructor(new_mgr)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
if inplace:
return self._update_inplace(result)
else:
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1519,11 +1519,11 @@ def _numba_agg_general(
aggregator = executor.generate_shared_aggregator(
func, dtype_mapping, **get_jit_arguments(engine_kwargs)
)
result = sorted_df._mgr.apply(
res_mgr = sorted_df._mgr.apply(
aggregator, start=starts, end=ends, **aggregator_kwargs
)
result.axes[1] = self.grouper.result_index
result = df._constructor(result)
res_mgr.axes[1] = self.grouper.result_index
result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes)

if data.ndim == 1:
result = result.squeeze("columns")
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,8 @@ def get_result(self):

mgr = type(sample._mgr).from_array(res, index=new_index)

result = cons(mgr, name=name, fastpath=True)
result = sample._constructor_from_mgr(mgr, axes=mgr.axes)
result._name = name
return result.__finalize__(self, method="concat")

# combine as columns in a frame
Expand Down
24 changes: 14 additions & 10 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,12 +575,14 @@ def _constructor(self) -> Callable[..., Series]:
return Series

def _constructor_from_mgr(self, mgr, axes):
if self._constructor is Series:
# we are pandas.Series (or a subclass that doesn't override _constructor)
return self._from_mgr(mgr, axes=axes)
ser = self._from_mgr(mgr, axes=axes)
Copy link
Member

@jorisvandenbossche jorisvandenbossche Jul 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel this changes the logic from what we discussed in #52132, and I am afraid this isn't exactly equivalent for subclasses implementing _constructor.

(before, we passed the Manager object to _constructor, while with this PR we pass a Series object)

Now, what was the rationale for changing this?
The PR title says "simplify _from_mgr constructors", but don't see how this is a simplification here? (it actually got longer, and added _name handling)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC i went to use the new constructor_from_mgr in more places and found that broke some tests, and the edits here were the best option I found to keep things working (in particular pinning ser._name)

I also very much like that in this version subclasses don't have to worry about handling Manager objects, just Series/DataFrame objects, which they should already be able to handle.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why or when the pinning of ser._name is needed? (which case is it solving?)

In geopandas, it is a feature that we can do something different for blockmanager vs generic array-like (we use it to decide whether we want to try to infer objects being passed or not). It also adds some overhead to go through the constructor again with a dataframe/series.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why or when the pinning of ser._name is needed? (which case is it solving?)

My recollection is that without that, when I changed the usages to use constructor_from_mgr, a handful of tests broke with AttributeError bc _name was not defined.

ser._name = None # caller is responsible for setting real name
if type(self) is Series:
# fastpath avoiding constructor call
return ser
else:
assert axes is mgr.axes
return self._constructor(mgr)
return self._constructor(ser, copy=False)

@property
def _constructor_expanddim(self) -> Callable[..., DataFrame]:
Expand All @@ -599,15 +601,17 @@ def _expanddim_from_mgr(self, mgr, axes) -> DataFrame:
# once downstream packages (geopandas) have had a chance to implement
# their own overrides.
# error: "Callable[..., DataFrame]" has no attribute "_from_mgr" [attr-defined]
return self._constructor_expanddim._from_mgr( # type: ignore[attr-defined]
mgr, axes=mgr.axes
)
from pandas import DataFrame

return DataFrame._from_mgr(mgr, axes=mgr.axes)

def _constructor_expanddim_from_mgr(self, mgr, axes):
if self._constructor is Series:
return self._expanddim_from_mgr(mgr, axes)
df = self._expanddim_from_mgr(mgr, axes)
if type(self) is Series:
# fastpath avoiding constructor
return df
assert axes is mgr.axes
return self._constructor_expanddim(mgr)
return self._constructor_expanddim(df, copy=False)

# types
@property
Expand Down