Skip to content

Commit f290b65

Browse files
authored
REF: prelims for 2D DTA/TDA (#40331)
1 parent 2d0dbf3 commit f290b65

File tree

9 files changed

+59
-27
lines changed

9 files changed

+59
-27
lines changed

asv_bench/benchmarks/reshape.py

+36
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,42 @@ def time_unstack(self):
5353
self.df.unstack(1)
5454

5555

56+
class ReshapeExtensionDtype:
57+
58+
params = ["datetime64[ns, US/Pacific]", "Period[s]"]
59+
param_names = ["dtype"]
60+
61+
def setup(self, dtype):
62+
lev = pd.Index(list("ABCDEFGHIJ"))
63+
ri = pd.Index(range(1000))
64+
mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"])
65+
66+
index = date_range("2016-01-01", periods=10000, freq="s", tz="US/Pacific")
67+
if dtype == "Period[s]":
68+
index = index.tz_localize(None).to_period("s")
69+
70+
ser = pd.Series(index, index=mi)
71+
df = ser.unstack("bar")
72+
# roundtrips -> df.stack().equals(ser)
73+
74+
self.ser = ser
75+
self.df = df
76+
77+
def time_stack(self, dtype):
78+
self.df.stack()
79+
80+
def time_unstack_fast(self, dtype):
81+
# last level -> doesn't have to make copies
82+
self.ser.unstack("bar")
83+
84+
def time_unstack_slow(self, dtype):
85+
# first level -> must make copies
86+
self.ser.unstack("foo")
87+
88+
def time_transpose(self, dtype):
89+
self.df.T
90+
91+
5692
class Unstack:
5793

5894
params = ["int", "category"]

pandas/core/groupby/generic.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,7 @@
8181
validate_func_kwargs,
8282
)
8383
from pandas.core.apply import GroupByApply
84-
from pandas.core.arrays import (
85-
Categorical,
86-
ExtensionArray,
87-
)
84+
from pandas.core.arrays import Categorical
8885
from pandas.core.base import (
8986
DataError,
9087
SpecificationError,
@@ -1123,8 +1120,7 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
11231120
obj: FrameOrSeriesUnion
11241121

11251122
# call our grouper again with only this block
1126-
if isinstance(values, ExtensionArray) or values.ndim == 1:
1127-
# TODO(EA2D): special case not needed with 2D EAs
1123+
if values.ndim == 1:
11281124
obj = Series(values)
11291125
else:
11301126
# TODO special case not needed with ArrayManager

pandas/core/internals/array_manager.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@
8585
DataManager,
8686
SingleDataManager,
8787
)
88-
from pandas.core.internals.blocks import new_block
88+
from pandas.core.internals.blocks import (
89+
ensure_block_shape,
90+
new_block,
91+
)
8992

9093
if TYPE_CHECKING:
9194
from pandas import Float64Index
@@ -497,10 +500,7 @@ def quantile(
497500
interpolation="linear",
498501
) -> ArrayManager:
499502

500-
arrs = [
501-
x if not isinstance(x, np.ndarray) else np.atleast_2d(x)
502-
for x in self.arrays
503-
]
503+
arrs = [ensure_block_shape(x, 2) for x in self.arrays]
504504
assert axis == 1
505505
new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs]
506506
for i, arr in enumerate(new_arrs):

pandas/core/internals/blocks.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@
118118
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
119119

120120

121+
_dtype_obj = np.dtype(object) # comparison is faster than is_object_dtype
122+
123+
121124
class Block(PandasObject):
122125
"""
123126
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
@@ -281,8 +284,8 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray:
281284
return an internal format, currently just the ndarray
282285
this is often overridden to handle to_dense like operations
283286
"""
284-
if is_object_dtype(dtype):
285-
return self.values.astype(object)
287+
if dtype == _dtype_obj:
288+
return self.values.astype(_dtype_obj)
286289
return self.values
287290

288291
@final

pandas/core/internals/concat.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@
4242
ExtensionArray,
4343
)
4444
from pandas.core.internals.array_manager import ArrayManager
45-
from pandas.core.internals.blocks import new_block
45+
from pandas.core.internals.blocks import (
46+
ensure_block_shape,
47+
new_block,
48+
)
4649
from pandas.core.internals.managers import BlockManager
4750

4851
if TYPE_CHECKING:
@@ -420,12 +423,8 @@ def _concatenate_join_units(
420423
# the non-EA values are 2D arrays with shape (1, n)
421424
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
422425
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
423-
if not is_extension_array_dtype(concat_values.dtype):
424-
# if the result of concat is not an EA but an ndarray, reshape to
425-
# 2D to put it in a non-EA Block
426-
# special case DatetimeArray/TimedeltaArray, which *is* an EA, but
427-
# is put in a consolidated 2D block
428-
concat_values = np.atleast_2d(concat_values)
426+
concat_values = ensure_block_shape(concat_values, 2)
427+
429428
else:
430429
concat_values = concat_compat(to_concat, axis=concat_axis)
431430

pandas/core/internals/managers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1226,7 +1226,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False
12261226

12271227
if value.ndim == 2:
12281228
value = value.T
1229-
elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype):
1229+
elif value.ndim == self.ndim - 1:
12301230
# TODO(EA2D): special case not needed with 2D EAs
12311231
value = ensure_block_shape(value, ndim=2)
12321232

pandas/core/internals/ops.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
Tuple,
99
)
1010

11-
import numpy as np
12-
1311
from pandas._typing import ArrayLike
1412

1513
if TYPE_CHECKING:
@@ -32,7 +30,7 @@ def _iter_block_pairs(
3230
locs = blk.mgr_locs
3331
blk_vals = blk.values
3432

35-
left_ea = not isinstance(blk_vals, np.ndarray)
33+
left_ea = blk_vals.ndim == 1
3634

3735
rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True)
3836

@@ -43,7 +41,7 @@ def _iter_block_pairs(
4341
# assert rblks[0].shape[0] == 1, rblks[0].shape
4442

4543
for k, rblk in enumerate(rblks):
46-
right_ea = not isinstance(rblk.values, np.ndarray)
44+
right_ea = rblk.values.ndim == 1
4745

4846
lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
4947
info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)

pandas/core/nanops.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1741,7 +1741,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
17411741
# restore NaT elements
17421742
y[mask] = iNaT # TODO: could try/finally for this?
17431743

1744-
if isinstance(values, np.ndarray):
1744+
if isinstance(values.dtype, np.dtype):
17451745
result = result.view(orig_dtype)
17461746
else:
17471747
# DatetimeArray/TimedeltaArray

pandas/core/reshape/reshape.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ def unstack(obj, level, fill_value=None):
440440
obj.index, level=level, constructor=obj._constructor_expanddim
441441
)
442442
return unstacker.get_result(
443-
obj.values, value_columns=None, fill_value=fill_value
443+
obj._values, value_columns=None, fill_value=fill_value
444444
)
445445

446446

0 commit comments

Comments
 (0)