Skip to content

PERF: fastpath DataFrame constructor from BlockManager #33357

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 9, 2020
4 changes: 1 addition & 3 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,7 @@ def get_result(self):
elif isinstance(self.f, np.ufunc):
with np.errstate(all="ignore"):
results = self.obj._mgr.apply("apply", func=self.f)
return self.obj._constructor(
data=results, index=self.index, columns=self.columns, copy=False
)
return type(self.obj)._from_mgr(results)

# broadcasting
if self.result_type == "broadcast":
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4469,7 +4469,7 @@ def _maybe_casted_values(index, labels=None):

@Appender(_shared_docs["isna"] % _shared_doc_kwargs)
def isna(self) -> "DataFrame":
result = self._constructor(self._data.isna(func=isna))
result = type(self)._from_mgr(self._mgr.isna(func=isna))
return result.__finalize__(self, method="isna")

@Appender(_shared_docs["isna"] % _shared_doc_kwargs)
Expand Down Expand Up @@ -4794,7 +4794,7 @@ def sort_values(
if ignore_index:
new_data.axes[1] = ibase.default_index(len(indexer))

result = self._constructor(new_data)
result = type(self)._from_mgr(new_data)
if inplace:
return self._update_inplace(result)
else:
Expand Down Expand Up @@ -4930,7 +4930,7 @@ def sort_index(
if ignore_index:
new_data.axes[1] = ibase.default_index(len(indexer))

result = self._constructor(new_data)
result = type(self)._from_mgr(new_data)
if inplace:
return self._update_inplace(result)
else:
Expand Down Expand Up @@ -6662,7 +6662,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame":
"""
bm_axis = self._get_block_manager_axis(axis)
new_data = self._mgr.diff(n=periods, axis=bm_axis)
return self._constructor(new_data)
return type(self)._from_mgr(new_data)

# ----------------------------------------------------------------------
# Function application
Expand Down Expand Up @@ -8426,7 +8426,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
)

if result.ndim == 2:
result = self._constructor(result)
result = type(self)._from_mgr(result)
else:
result = self._constructor_sliced(result, name=q)

Expand Down
88 changes: 54 additions & 34 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,13 +213,13 @@ def __init__(
object.__setattr__(self, "_attrs", attrs)

@classmethod
def _init_mgr(cls, mgr, axes=None, dtype=None, copy=False):
def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager:
""" passed a manager and a axes dict """
for a, axe in axes.items():
if axe is not None:
mgr = mgr.reindex_axis(
axe, axis=cls._get_block_manager_axis(a), copy=False
)
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis, copy=False)

# make a copy if explicitly requested
if copy:
Expand All @@ -230,6 +230,15 @@ def _init_mgr(cls, mgr, axes=None, dtype=None, copy=False):
mgr = mgr.astype(dtype=dtype)
return mgr

@classmethod
def _from_mgr(cls, mgr: BlockManager):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add the return type annotation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be NDFrame or FrameOrSeries?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think

Suggested change
def _from_mgr(cls, mgr: BlockManager):
def _from_mgr(cls: Type[FrameOrSeries], mgr: BlockManager) -> FrameOrSeries:

though I have not checked this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we would need to change `_from_mgr` from a classmethod in order to get `_constructor`, unless we want to do something like

constructor = cls._constructor.fget(cls)

which seems sketchy.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does changing `obj._constructor` -> `obj._constructor._from_mgr` not work with the classmethod? `obj._constructor` returns a class, not an instance, right?

"""
Fastpath constructor for if we have only a BlockManager.
"""
obj = object.__new__(cls)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might consider calling `__finalize__` here (we would have to pass the method as an argument).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the use cases I have in mind is in groupby `_chop`, where we are potentially creating a very large number of Series/DataFrame objects; I am hoping we can avoid the extra call there.

NDFrame.__init__(obj, mgr)
return obj

# ----------------------------------------------------------------------

@property
Expand Down Expand Up @@ -1359,8 +1368,8 @@ def __invert__(self):
return self

new_data = self._mgr.apply(operator.invert)
result = self._constructor(new_data).__finalize__(self, method="__invert__")
return result
result = type(self)._from_mgr(new_data)
return result.__finalize__(self, method="__invert__")

def __nonzero__(self):
raise ValueError(
Expand Down Expand Up @@ -3365,7 +3374,7 @@ class max_speed
new_data = self._mgr.take(
indices, axis=self._get_block_manager_axis(axis), verify=True
)
return self._constructor(new_data).__finalize__(self, method="take")
return type(self)._from_mgr(new_data).__finalize__(self, method="take")

def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries:
"""
Expand Down Expand Up @@ -3571,7 +3580,7 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries:
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
result = type(self)._from_mgr(self._mgr.get_slice(slobj, axis=axis))
result = result.__finalize__(self)

# this could be a view
Expand Down Expand Up @@ -4508,7 +4517,7 @@ def _reindex_with_indexers(
if copy and new_data is self._mgr:
new_data = new_data.copy()

return self._constructor(new_data).__finalize__(self)
return type(self)._from_mgr(new_data).__finalize__(self)

def filter(
self: FrameOrSeries,
Expand Down Expand Up @@ -5274,7 +5283,7 @@ def _consolidate(self, inplace: bool_t = False):
else:
f = lambda: self._mgr.consolidate()
cons_data = self._protect_consolidate(f)
return self._constructor(cons_data).__finalize__(self)
return type(self)._from_mgr(cons_data).__finalize__(self)

@property
def _is_mixed_type(self) -> bool_t:
Expand Down Expand Up @@ -5303,10 +5312,10 @@ def _check_inplace_setting(self, value) -> bool_t:
return True

def _get_numeric_data(self):
return self._constructor(self._mgr.get_numeric_data()).__finalize__(self,)
return type(self)._from_mgr(self._mgr.get_numeric_data()).__finalize__(self)

def _get_bool_data(self):
return self._constructor(self._mgr.get_bool_data()).__finalize__(self,)
return type(self)._from_mgr(self._mgr.get_bool_data()).__finalize__(self)

# ----------------------------------------------------------------------
# Internal Interface Methods
Expand Down Expand Up @@ -5573,7 +5582,7 @@ def astype(
else:
# else, only a single dtype is given
new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
return self._constructor(new_data).__finalize__(self, method="astype")
return type(self)._from_mgr(new_data).__finalize__(self, method="astype")

# GH 19920: retain column metadata after concat
result = pd.concat(results, axis=1, copy=False)
Expand Down Expand Up @@ -5687,7 +5696,7 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
"""
data = self._mgr.copy(deep=deep)
self._clear_item_cache()
return self._constructor(data).__finalize__(self, method="copy")
return type(self)._from_mgr(data).__finalize__(self, method="copy")

def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
return self.copy(deep=deep)
Expand Down Expand Up @@ -5738,15 +5747,19 @@ def _convert(
validate_bool_kwarg(timedelta, "timedelta")
validate_bool_kwarg(coerce, "coerce")
validate_bool_kwarg(copy, "copy")
return self._constructor(
self._mgr.convert(
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
coerce=coerce,
copy=copy,
return (
type(self)
._from_mgr(
self._mgr.convert(
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
coerce=coerce,
copy=copy,
)
)
).__finalize__(self)
.__finalize__(self)
)

def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
"""
Expand Down Expand Up @@ -5789,11 +5802,19 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
# numeric=False necessary to only soft convert;
# python objects will still be converted to
# native numpy numeric types
return self._constructor(
self._mgr.convert(
datetime=True, numeric=False, timedelta=True, coerce=False, copy=True
return (
type(self)
._from_mgr(
self._mgr.convert(
datetime=True,
numeric=False,
timedelta=True,
coerce=False,
copy=True,
)
)
).__finalize__(self, method="infer_objects")
.__finalize__(self, method="infer_objects")
)

def convert_dtypes(
self: FrameOrSeries,
Expand Down Expand Up @@ -6896,7 +6917,7 @@ def interpolate(
**kwargs,
)

result = self._constructor(new_data)
result = type(self)._from_mgr(new_data)
if axis == 1:
result = result.T
if inplace:
Expand Down Expand Up @@ -8511,7 +8532,7 @@ def _align_series(
if copy and fdata is self._mgr:
fdata = fdata.copy()

left = self._constructor(fdata)
left = type(self)._from_mgr(fdata)

if ridx is None:
right = other
Expand Down Expand Up @@ -8663,7 +8684,7 @@ def _where(
new_data = self._mgr.putmask(
mask=cond, new=other, align=align, axis=block_axis,
)
result = self._constructor(new_data)
result = type(self)._from_mgr(new_data)
return self._update_inplace(result)

else:
Expand All @@ -8675,7 +8696,7 @@ def _where(
try_cast=try_cast,
axis=block_axis,
)
result = self._constructor(new_data)
result = type(self)._from_mgr(new_data)
return result.__finalize__(self)

_shared_docs[
Expand Down Expand Up @@ -8948,7 +8969,7 @@ def shift(
else:
return self.tshift(periods, freq)

return self._constructor(new_data).__finalize__(self, method="shift")
return type(self)._from_mgr(new_data).__finalize__(self, method="shift")

def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries:
"""
Expand Down Expand Up @@ -11202,9 +11223,8 @@ def block_accum_func(blk_values):

result = self._mgr.apply(block_accum_func)

d = self._construct_axes_dict()
d["copy"] = False
return self._constructor(result, **d).__finalize__(self, method=name)
obj = type(self)._from_mgr(result)
return obj.__finalize__(self, method=name)

return set_function_name(cum_func, name, cls)

Expand Down