Skip to content

Commit e708b40

Browse files
authored
REF/PERF: do maybe_coerce_values before Block.__init__ (#40385)
1 parent cd07813 commit e708b40

File tree

3 files changed

+54
-64
lines changed

3 files changed

+54
-64
lines changed

pandas/core/internals/api.py

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
check_ndim,
2323
extract_pandas_array,
2424
get_block_type,
25+
maybe_coerce_values,
2526
)
2627

2728

@@ -58,6 +59,7 @@ def make_block(
5859

5960
ndim = _maybe_infer_ndim(values, placement, ndim)
6061
check_ndim(values, placement, ndim)
62+
values = maybe_coerce_values(values)
6163
return klass(values, ndim=ndim, placement=placement)
6264

6365

pandas/core/internals/blocks.py

+49-64
Original file line numberDiff line numberDiff line change
@@ -158,28 +158,14 @@ def __init__(self, values, placement, ndim: int):
158158
Parameters
159159
----------
160160
values : np.ndarray or ExtensionArray
161+
We assume maybe_coerce_values has already been called.
161162
placement : BlockPlacement (or castable)
162163
ndim : int
163164
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
164165
"""
165166
self.ndim = ndim
166167
self.mgr_locs = placement
167-
self.values = self._maybe_coerce_values(values)
168-
169-
@classmethod
170-
def _maybe_coerce_values(cls, values):
171-
"""
172-
Ensure we have correctly-typed values.
173-
174-
Parameters
175-
----------
176-
values : np.ndarray or ExtensionArray
177-
178-
Returns
179-
-------
180-
np.ndarray or ExtensionArray
181-
"""
182-
return values
168+
self.values = values
183169

184170
@property
185171
def _holder(self):
@@ -278,13 +264,17 @@ def make_block(self, values, placement=None) -> Block:
278264
if self.is_extension:
279265
values = ensure_block_shape(values, ndim=self.ndim)
280266

267+
# TODO: perf by not going through new_block
268+
# We assume maybe_coerce_values has already been called
281269
return new_block(values, placement=placement, ndim=self.ndim)
282270

283271
@final
284272
def make_block_same_class(self, values, placement=None) -> Block:
285273
""" Wrap given values in a block of same type as self. """
286274
if placement is None:
287275
placement = self.mgr_locs
276+
# Calls type(self) directly rather than new_block, so no coercion happens here
277+
# We assume maybe_coerce_values has already been called
288278
return type(self)(values, placement=placement, ndim=self.ndim)
289279

290280
@final
@@ -416,6 +406,7 @@ def _split_op_result(self, result) -> List[Block]:
416406
return nbs
417407

418408
if not isinstance(result, Block):
409+
result = maybe_coerce_values(result)
419410
result = self.make_block(result)
420411

421412
return [result]
@@ -619,6 +610,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
619610

620611
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
621612

613+
new_values = maybe_coerce_values(new_values)
622614
newb = self.make_block(new_values)
623615
if newb.shape != self.shape:
624616
raise TypeError(
@@ -677,6 +669,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
677669
values = np.array(values, dtype="object")
678670

679671
values[mask] = na_rep
672+
values = values.astype(object, copy=False)
680673
return self.make_block(values)
681674

682675
# block actions #
@@ -1501,24 +1494,6 @@ def putmask(self, mask, new) -> List[Block]:
15011494
new_values[mask] = new
15021495
return [self.make_block(values=new_values)]
15031496

1504-
@classmethod
1505-
def _maybe_coerce_values(cls, values):
1506-
"""
1507-
Unbox to an extension array.
1508-
1509-
This will unbox an ExtensionArray stored in an Index or Series.
1510-
ExtensionArrays pass through. No dtype coercion is done.
1511-
1512-
Parameters
1513-
----------
1514-
values : np.ndarray or ExtensionArray
1515-
1516-
Returns
1517-
-------
1518-
ExtensionArray
1519-
"""
1520-
return extract_array(values)
1521-
15221497
@property
15231498
def _holder(self):
15241499
# For extension blocks, the holder is values-dependent.
@@ -1847,6 +1822,7 @@ def to_native_types(
18471822
values = np.array(values, dtype="object")
18481823

18491824
values[mask] = na_rep
1825+
values = values.astype(object, copy=False)
18501826
return self.make_block(values)
18511827

18521828
from pandas.io.formats.format import FloatArrayFormatter
@@ -1860,6 +1836,7 @@ def to_native_types(
18601836
fixed_width=False,
18611837
)
18621838
res = formatter.get_result_as_array()
1839+
res = res.astype(object, copy=False)
18631840
return self.make_block(res)
18641841

18651842

@@ -1913,6 +1890,7 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]:
19131890

19141891
# TODO(EA2D): reshape not needed with 2D EAs
19151892
res_values = res_values.reshape(self.values.shape)
1893+
res_values = maybe_coerce_values(res_values)
19161894
nb = self.make_block_same_class(res_values)
19171895
return [nb]
19181896

@@ -1940,12 +1918,14 @@ def diff(self, n: int, axis: int = 0) -> List[Block]:
19401918
values = self.array_values().reshape(self.shape)
19411919

19421920
new_values = values - values.shift(n, axis=axis)
1921+
new_values = maybe_coerce_values(new_values)
19431922
return [self.make_block(new_values)]
19441923

19451924
def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]:
19461925
# TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
19471926
values = self.array_values().reshape(self.shape)
19481927
new_values = values.shift(periods, fill_value=fill_value, axis=axis)
1928+
new_values = maybe_coerce_values(new_values)
19491929
return [self.make_block_same_class(new_values)]
19501930

19511931
def fillna(
@@ -1961,6 +1941,7 @@ def fillna(
19611941
values = self.array_values()
19621942
values = values if inplace else values.copy()
19631943
new_values = values.fillna(value=value, limit=limit)
1944+
new_values = maybe_coerce_values(new_values)
19641945
return [self.make_block_same_class(values=new_values)]
19651946

19661947

@@ -1970,30 +1951,6 @@ class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock):
19701951
is_numeric = False
19711952
_can_hold_na = True
19721953

1973-
@classmethod
1974-
def _maybe_coerce_values(cls, values):
1975-
"""
1976-
Input validation for values passed to __init__. Ensure that
1977-
we have nanosecond datetime64/timedelta64, coercing if necessary.
1978-
1979-
Parameters
1980-
----------
1981-
values : np.ndarray or ExtensionArray
1982-
Must be convertible to datetime64/timedelta64
1983-
1984-
Returns
1985-
-------
1986-
values : ndarray[datetime64ns/timedelta64ns]
1987-
"""
1988-
values = extract_array(values, extract_numpy=True)
1989-
if isinstance(values, np.ndarray):
1990-
values = sanitize_to_nanoseconds(values)
1991-
elif isinstance(values.dtype, np.dtype):
1992-
# i.e. not datetime64tz
1993-
values = values._data
1994-
1995-
return values
1996-
19971954
def array_values(self):
19981955
return ensure_wrapped_if_datetimelike(self.values)
19991956

@@ -2010,6 +1967,7 @@ def to_native_types(self, na_rep="NaT", **kwargs):
20101967
arr = self.array_values()
20111968

20121969
result = arr._format_native_types(na_rep=na_rep, **kwargs)
1970+
result = result.astype(object, copy=False)
20131971
return self.make_block(result)
20141972

20151973

@@ -2067,12 +2025,6 @@ class ObjectBlock(Block):
20672025
is_object = True
20682026
_can_hold_na = True
20692027

2070-
@classmethod
2071-
def _maybe_coerce_values(cls, values):
2072-
if issubclass(values.dtype.type, str):
2073-
values = np.array(values, dtype=object)
2074-
return values
2075-
20762028
@property
20772029
def is_bool(self):
20782030
"""
@@ -2198,6 +2150,38 @@ def replace(
21982150
# Constructor Helpers
21992151

22002152

2153+
def maybe_coerce_values(values) -> ArrayLike:
2154+
"""
2155+
Input validation for values passed to Block.__init__. Ensure that
2156+
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
2157+
that we do not have string dtypes.
2158+
2159+
Parameters
2160+
----------
2161+
values : np.ndarray or ExtensionArray
2162+
2163+
Returns
2164+
-------
2165+
values : np.ndarray or ExtensionArray
2166+
"""
2167+
2168+
# Note: extract_array is needed here for only one test, in which we
2169+
# pass PandasDtype to Series.astype and must then unwrap the PandasArray.
2170+
values = extract_array(values, extract_numpy=True)
2171+
2172+
if isinstance(values, np.ndarray):
2173+
values = sanitize_to_nanoseconds(values)
2174+
2175+
if issubclass(values.dtype.type, str):
2176+
values = np.array(values, dtype=object)
2177+
2178+
elif isinstance(values.dtype, np.dtype):
2179+
# i.e. not datetime64tz, extract DTA/TDA -> ndarray
2180+
values = values._data
2181+
2182+
return values
2183+
2184+
22012185
def get_block_type(values, dtype: Optional[Dtype] = None):
22022186
"""
22032187
Find the appropriate Block subclass to use for the given values and dtype.
@@ -2256,6 +2240,7 @@ def new_block(values, placement, *, ndim: int, klass=None) -> Block:
22562240
if klass is None:
22572241
klass = get_block_type(values, values.dtype)
22582242

2243+
values = maybe_coerce_values(values)
22592244
return klass(values, ndim=ndim, placement=placement)
22602245

22612246

pandas/core/internals/managers.py

+3
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
ensure_block_shape,
7373
extend_blocks,
7474
get_block_type,
75+
maybe_coerce_values,
7576
new_block,
7677
)
7778
from pandas.core.internals.ops import (
@@ -1051,6 +1052,7 @@ def iget(self, i: int) -> SingleBlockManager:
10511052
values = block.iget(self.blklocs[i])
10521053

10531054
# shortcut for select a single-dim from a 2-dim BM
1055+
values = maybe_coerce_values(values)
10541056
nb = type(block)(values, placement=slice(0, len(values)), ndim=1)
10551057
return SingleBlockManager(nb, self.axes[1])
10561058

@@ -1648,6 +1650,7 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
16481650
if array.ndim > blk.values.ndim:
16491651
# This will be caught by Series._get_values
16501652
raise ValueError("dimension-expanding indexing not allowed")
1653+
16511654
block = blk.make_block_same_class(array, placement=slice(0, len(array)))
16521655
return type(self)(block, self.index[indexer])
16531656

0 commit comments

Comments
 (0)