Skip to content

REF/PERF: do maybe_coerce_values before Block.__init__ #40385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/core/internals/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
check_ndim,
extract_pandas_array,
get_block_type,
maybe_coerce_values,
)


Expand Down Expand Up @@ -58,6 +59,7 @@ def make_block(

ndim = _maybe_infer_ndim(values, placement, ndim)
check_ndim(values, placement, ndim)
values = maybe_coerce_values(values)
return klass(values, ndim=ndim, placement=placement)


Expand Down
113 changes: 49 additions & 64 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,28 +160,14 @@ def __init__(self, values, placement, ndim: int):
Parameters
----------
values : np.ndarray or ExtensionArray
We assume maybe_coerce_values has already been called.
placement : BlockPlacement (or castable)
ndim : int
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
"""
self.ndim = ndim
self.mgr_locs = placement
self.values = self._maybe_coerce_values(values)

@classmethod
def _maybe_coerce_values(cls, values):
    """
    Hook for coercing block values; this implementation is a no-op.

    The input is handed back unchanged (same object, no copy).

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray or ExtensionArray
        The very object that was passed in.
    """
    return values
self.values = values

@property
def _holder(self):
Expand Down Expand Up @@ -280,13 +266,17 @@ def make_block(self, values, placement=None) -> Block:
if self.is_extension:
values = ensure_block_shape(values, ndim=self.ndim)

# TODO: perf by not going through new_block
# We assume maybe_coerce_values has already been called
return new_block(values, placement=placement, ndim=self.ndim)

@final
def make_block_same_class(self, values, placement=None) -> Block:
    """
    Wrap given values in a block of same type as self.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Passed to the block constructor as-is; we assume
        maybe_coerce_values has already been called on them.
    placement : optional
        Placement for the new block. When None, ``self.mgr_locs`` is reused.

    Returns
    -------
    Block
        A new instance of ``type(self)`` with the same ``ndim`` as self.
    """
    mgr_locs = self.mgr_locs if placement is None else placement
    # The class is instantiated directly, so no coercion happens here.
    return type(self)(values, placement=mgr_locs, ndim=self.ndim)

@final
Expand Down Expand Up @@ -418,6 +408,7 @@ def _split_op_result(self, result) -> List[Block]:
return nbs

if not isinstance(result, Block):
result = maybe_coerce_values(result)
result = self.make_block(result)

return [result]
Expand Down Expand Up @@ -629,6 +620,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
values, dtype, copy=copy, errors=errors # type: ignore[type-var]
)

new_values = maybe_coerce_values(new_values)
newb = self.make_block(new_values)
if newb.shape != self.shape:
raise TypeError(
Expand Down Expand Up @@ -687,6 +679,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
values = np.array(values, dtype="object")

values[mask] = na_rep
values = values.astype(object, copy=False)
return self.make_block(values)

# block actions #
Expand Down Expand Up @@ -1540,24 +1533,6 @@ def putmask(self, mask, new) -> List[Block]:
new_values[mask] = new
return [self.make_block(values=new_values)]

@classmethod
def _maybe_coerce_values(cls, values):
    """
    Unwrap the underlying array via ``extract_array``.

    An ExtensionArray held inside an Index or Series is unboxed;
    an ExtensionArray passed directly comes back as-is. The dtype
    is never changed here.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    ExtensionArray
    """
    unboxed = extract_array(values)
    return unboxed

@property
def _holder(self):
# For extension blocks, the holder is values-dependent.
Expand Down Expand Up @@ -1891,6 +1866,7 @@ def to_native_types(
values = np.array(values, dtype="object")

values[mask] = na_rep
values = values.astype(object, copy=False)
return self.make_block(values)

from pandas.io.formats.format import FloatArrayFormatter
Expand All @@ -1904,6 +1880,7 @@ def to_native_types(
fixed_width=False,
)
res = formatter.get_result_as_array()
res = res.astype(object, copy=False)
return self.make_block(res)


Expand Down Expand Up @@ -1957,6 +1934,7 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]:

# TODO(EA2D): reshape not needed with 2D EAs
res_values = res_values.reshape(self.values.shape)
res_values = maybe_coerce_values(res_values)
nb = self.make_block_same_class(res_values)
return [nb]

Expand Down Expand Up @@ -1984,12 +1962,14 @@ def diff(self, n: int, axis: int = 0) -> List[Block]:
values = self.array_values().reshape(self.shape)

new_values = values - values.shift(n, axis=axis)
new_values = maybe_coerce_values(new_values)
return [self.make_block(new_values)]

def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]:
    """
    Shift the block's values by ``periods`` along ``axis``.

    Parameters
    ----------
    periods : int
    axis : int, default 0
    fill_value : Any, default None
        Forwarded to the underlying array's ``shift``.

    Returns
    -------
    List[Block]
        A single-element list holding a new block of the same class.
    """
    # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
    reshaped = self.array_values().reshape(self.shape)
    shifted = reshaped.shift(periods, fill_value=fill_value, axis=axis)
    # make_block_same_class does not coerce, so do it before wrapping.
    return [self.make_block_same_class(maybe_coerce_values(shifted))]

def fillna(
Expand All @@ -2005,6 +1985,7 @@ def fillna(
values = self.array_values()
values = values if inplace else values.copy()
new_values = values.fillna(value=value, limit=limit)
new_values = maybe_coerce_values(new_values)
return [self.make_block_same_class(values=new_values)]


Expand All @@ -2014,30 +1995,6 @@ class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock):
is_numeric = False
_can_hold_na = True

@classmethod
def _maybe_coerce_values(cls, values):
    """
    Validate and normalize values handed to __init__.

    The input is unboxed with ``extract_array(..., extract_numpy=True)``.
    A resulting ndarray is passed through ``sanitize_to_nanoseconds``;
    any other array whose dtype is a numpy dtype (i.e. not datetime64tz)
    is replaced by its ``_data`` attribute. Everything else is returned
    as extracted.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Must be convertible to datetime64/timedelta64

    Returns
    -------
    values : ndarray[datetime64ns/timedelta64ns]
    """
    arr = extract_array(values, extract_numpy=True)

    if isinstance(arr, np.ndarray):
        return sanitize_to_nanoseconds(arr)

    if isinstance(arr.dtype, np.dtype):
        # i.e. not datetime64tz
        return arr._data

    return arr

def array_values(self):
    """
    Return ``self.values`` passed through ``ensure_wrapped_if_datetimelike``.
    """
    raw = self.values
    return ensure_wrapped_if_datetimelike(raw)

Expand All @@ -2054,6 +2011,7 @@ def to_native_types(self, na_rep="NaT", **kwargs):
arr = self.array_values()

result = arr._format_native_types(na_rep=na_rep, **kwargs)
result = result.astype(object, copy=False)
return self.make_block(result)


Expand Down Expand Up @@ -2111,12 +2069,6 @@ class ObjectBlock(Block):
is_object = True
_can_hold_na = True

@classmethod
def _maybe_coerce_values(cls, values):
    """
    Convert string-dtype arrays to object dtype.

    Parameters
    ----------
    values : np.ndarray
        Anything exposing ``dtype.type``.

    Returns
    -------
    np.ndarray
        A new object-dtype array when the scalar type of ``values`` is a
        ``str`` subclass; otherwise ``values`` itself, untouched.
    """
    has_str_dtype = issubclass(values.dtype.type, str)
    return np.array(values, dtype=object) if has_str_dtype else values

@property
def is_bool(self):
"""
Expand Down Expand Up @@ -2242,6 +2194,38 @@ def replace(
# Constructor Helpers


def maybe_coerce_values(values) -> ArrayLike:
    """
    Normalize values before they are handed to a Block constructor.

    Guarantees that datetime64/timedelta64 ndarrays are in nanoseconds
    and that no numpy string dtype survives (those become object).
    Datetime-like arrays backed by a numpy dtype are unwrapped to their
    underlying ndarray.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    values : np.ndarray or ExtensionArray
    """
    # Note: the only test that needs extract_array here is one where we
    #  pass PandasDtype to Series.astype, then need to extract PandasArray here.
    unboxed = extract_array(values, extract_numpy=True)

    if not isinstance(unboxed, np.ndarray):
        if isinstance(unboxed.dtype, np.dtype):
            # i.e. not datetime64tz, extract DTA/TDA -> ndarray
            return unboxed._data
        return unboxed

    coerced = sanitize_to_nanoseconds(unboxed)
    # The string check runs on the sanitized array, as in the nested original.
    if issubclass(coerced.dtype.type, str):
        coerced = np.array(coerced, dtype=object)
    return coerced


def get_block_type(values, dtype: Optional[Dtype] = None):
"""
Find the appropriate Block subclass to use for the given values and dtype.
Expand Down Expand Up @@ -2300,6 +2284,7 @@ def new_block(values, placement, *, ndim: int, klass=None) -> Block:
if klass is None:
klass = get_block_type(values, values.dtype)

values = maybe_coerce_values(values)
return klass(values, ndim=ndim, placement=placement)


Expand Down
3 changes: 3 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
ensure_block_shape,
extend_blocks,
get_block_type,
maybe_coerce_values,
new_block,
)
from pandas.core.internals.ops import (
Expand Down Expand Up @@ -1057,6 +1058,7 @@ def iget(self, i: int) -> SingleBlockManager:
values = block.iget(self.blklocs[i])

# shortcut for select a single-dim from a 2-dim BM
values = maybe_coerce_values(values)
nb = type(block)(values, placement=slice(0, len(values)), ndim=1)
return SingleBlockManager(nb, self.axes[1])

Expand Down Expand Up @@ -1650,6 +1652,7 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
if array.ndim > blk.values.ndim:
# This will be caught by Series._get_values
raise ValueError("dimension-expanding indexing not allowed")

block = blk.make_block_same_class(array, placement=slice(0, len(array)))
return type(self)(block, self.index[indexer])

Expand Down