Skip to content

CLN: de-duplicate Block.should_store and related #33028

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 27, 2020
88 changes: 33 additions & 55 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,20 @@ def _can_hold_element(self, element: Any) -> bool:
return issubclass(tipo.type, dtype)
return isinstance(element, dtype)

def should_store(self, value: ArrayLike) -> bool:
"""
Should we set self.values[indexer] = value inplace or do we need to cast?

Parameters
----------
value : np.ndarray or ExtensionArray

Returns
-------
bool
"""
return is_dtype_equal(value.dtype, self.dtype)

def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
Expand Down Expand Up @@ -1747,10 +1761,7 @@ def setitem(self, indexer, value):

def get_values(self, dtype=None):
# ExtensionArrays must be iterable, so this works.
values = np.asarray(self.values)
if values.ndim == self.ndim - 1:
values = values.reshape((1,) + values.shape)
return values
return np.asarray(self.values).reshape(self.shape)

def array_values(self) -> ExtensionArray:
return self.values
Expand Down Expand Up @@ -2016,11 +2027,6 @@ def to_native_types(
)
return formatter.get_result_as_array()

def should_store(self, value: ArrayLike) -> bool:
# when inserting a column should not coerce integers to floats
# unnecessarily
return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype


class ComplexBlock(FloatOrComplexBlock):
__slots__ = ()
Expand Down Expand Up @@ -2053,9 +2059,6 @@ def _can_hold_element(self, element: Any) -> bool:
)
return is_integer(element)

def should_store(self, value: ArrayLike) -> bool:
return is_integer_dtype(value) and value.dtype == self.dtype


class DatetimeLikeBlockMixin:
"""Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock."""
Expand All @@ -2064,9 +2067,6 @@ class DatetimeLikeBlockMixin:
def _holder(self):
return DatetimeArray

def should_store(self, value):
return is_dtype_equal(self.dtype, value.dtype)

@property
def fill_value(self):
return np.datetime64("NaT", "ns")
Expand All @@ -2076,15 +2076,17 @@ def get_values(self, dtype=None):
return object dtype as boxed values, such as Timestamps/Timedelta
"""
if is_object_dtype(dtype):
values = self.values.ravel()
result = self._holder(values).astype(object)
return result.reshape(self.values.shape)
# DTA/TDA constructor and astype can handle 2D
return self._holder(self.values).astype(object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment here on why _holder can handle 2D?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do

return self.values

def internal_values(self):
# Override to return DatetimeArray and TimedeltaArray
return self.array_values()

def array_values(self):
return self._holder._simple_new(self.values)

def iget(self, key):
# GH#31649 we need to wrap scalars in Timestamp/Timedelta
# TODO(EA2D): this can be removed if we ever have 2D EA
Expand Down Expand Up @@ -2211,12 +2213,6 @@ def set(self, locs, values):

self.values[locs] = values

def external_values(self):
return np.asarray(self.values.astype("datetime64[ns]", copy=False))

def array_values(self) -> ExtensionArray:
return DatetimeArray._simple_new(self.values)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is no the same as self._holder(self.values) as you did above, this was calling _simple_new on purpose for efficience

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True. My thought was that since the DTA constructor is relatively strict, the overhead wouldn't be too bad. Can revert if you have a strong opinion. (and/or we could cache_readonly array_values)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would need to look back or time it again to be sure about the exact impact, but as far as I remember, I did this change when optimizing extract_array, and this was done for a reason



class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
Expand All @@ -2229,7 +2225,8 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
_can_hold_element = DatetimeBlock._can_hold_element
to_native_types = DatetimeBlock.to_native_types
fill_value = np.datetime64("NaT", "ns")
should_store = DatetimeBlock.should_store
should_store = Block.should_store
array_values = ExtensionBlock.array_values

@property
def _holder(self):
Expand Down Expand Up @@ -2288,14 +2285,16 @@ def get_values(self, dtype=None):
if is_object_dtype(dtype):
values = values.astype(object)

values = np.asarray(values)
# TODO(EA2D): reshape unnecessary with 2D EAs
# Ensure that our shape is correct for DataFrame.
# ExtensionArrays are always 1-D, even in a DataFrame when
# the analogous NumPy-backed column would be a 2-D ndarray.
return np.asarray(values).reshape(self.shape)

if self.ndim == 2:
# Ensure that our shape is correct for DataFrame.
# ExtensionArrays are always 1-D, even in a DataFrame when
# the analogous NumPy-backed column would be a 2-D ndarray.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you keep this comment?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do

values = values.reshape(1, -1)
return values
def external_values(self):
# NB: this is different from np.asarray(self.values), since that
# return an object-dtype ndarray of Timestamps.
return np.asarray(self.values.astype("datetime64[ns]", copy=False))

def _slice(self, slicer):
""" return a slice of my values """
Expand Down Expand Up @@ -2462,12 +2461,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
)
return rvalues

def external_values(self):
return np.asarray(self.values.astype("timedelta64[ns]", copy=False))

def array_values(self) -> ExtensionArray:
return TimedeltaArray._simple_new(self.values)


class BoolBlock(NumericBlock):
__slots__ = ()
Expand All @@ -2480,11 +2473,6 @@ def _can_hold_element(self, element: Any) -> bool:
return issubclass(tipo.type, np.bool_)
return isinstance(element, (bool, np.bool_))

def should_store(self, value: ArrayLike) -> bool:
return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype(
value
)

def replace(
self, to_replace, value, inplace=False, filter=None, regex=False, convert=True
):
Expand Down Expand Up @@ -2572,15 +2560,6 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]
def _can_hold_element(self, element: Any) -> bool:
return True

def should_store(self, value: ArrayLike) -> bool:
return not (
issubclass(
value.dtype.type,
(np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_),
)
or is_extension_array_dtype(value)
)

def replace(
self, to_replace, value, inplace=False, filter=None, regex=False, convert=True
):
Expand Down Expand Up @@ -2811,6 +2790,8 @@ class CategoricalBlock(ExtensionBlock):
_can_hold_na = True
_concatenator = staticmethod(concat_categorical)

should_store = Block.should_store

def __init__(self, values, placement, ndim=None):
# coerce to categorical if we can
values = extract_array(values)
Expand All @@ -2821,9 +2802,6 @@ def __init__(self, values, placement, ndim=None):
def _holder(self):
return Categorical

def should_store(self, arr: ArrayLike):
return isinstance(arr, self._holder) and is_dtype_equal(self.dtype, arr.dtype)

def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
Expand Down