REF: prelims for 2D DTA/TDA #40331

Merged: 1 commit, Mar 9, 2021
36 changes: 36 additions & 0 deletions asv_bench/benchmarks/reshape.py
@@ -53,6 +53,42 @@ def time_unstack(self):
         self.df.unstack(1)
 
 
+class ReshapeExtensionDtype:
+
+    params = ["datetime64[ns, US/Pacific]", "Period[s]"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        lev = pd.Index(list("ABCDEFGHIJ"))
+        ri = pd.Index(range(1000))
+        mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"])
+
+        index = date_range("2016-01-01", periods=10000, freq="s", tz="US/Pacific")
+        if dtype == "Period[s]":
+            index = index.tz_localize(None).to_period("s")
+
+        ser = pd.Series(index, index=mi)
+        df = ser.unstack("bar")
+        # roundtrips -> df.stack().equals(ser)
+
+        self.ser = ser
+        self.df = df
+
+    def time_stack(self, dtype):
+        self.df.stack()
+
+    def time_unstack_fast(self, dtype):
+        # last level -> doesn't have to make copies
+        self.ser.unstack("bar")
+
+    def time_unstack_slow(self, dtype):
+        # first level -> must make copies
+        self.ser.unstack("foo")
+
+    def time_transpose(self, dtype):
+        self.df.T
+
+
 class Unstack:
 
     params = ["int", "category"]
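For context, the "roundtrips" comment in setup refers to the stack/unstack inverse relationship the benchmark exercises. A minimal sketch, assuming a pandas version where extension dtypes survive the roundtrip (which is what this series of PRs works toward):

    import pandas as pd
    from pandas import MultiIndex, date_range

    lev = pd.Index(list("AB"))
    ri = pd.Index(range(3))
    mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"])

    # tz-aware values, as in the benchmark's datetime64[ns, US/Pacific] case
    ser = pd.Series(
        date_range("2016-01-01", periods=6, freq="s", tz="US/Pacific"), index=mi
    )

    df = ser.unstack("bar")        # the "bar" level becomes the columns
    print(df.stack().equals(ser))  # True: the roundtrip noted in setup()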
8 changes: 2 additions & 6 deletions pandas/core/groupby/generic.py
@@ -81,10 +81,7 @@
     validate_func_kwargs,
 )
 from pandas.core.apply import GroupByApply
-from pandas.core.arrays import (
-    Categorical,
-    ExtensionArray,
-)
+from pandas.core.arrays import Categorical
 from pandas.core.base import (
     DataError,
     SpecificationError,
@@ -1123,8 +1120,7 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
             obj: FrameOrSeriesUnion
 
             # call our grouper again with only this block
-            if isinstance(values, ExtensionArray) or values.ndim == 1:
-                # TODO(EA2D): special case not needed with 2D EAs
+            if values.ndim == 1:
                 obj = Series(values)
             else:
                 # TODO special case not needed with ArrayManager
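The change keys the Series/DataFrame dispatch on values.ndim rather than an isinstance(values, ExtensionArray) check, so a future 2D DatetimeArray would take the DataFrame path. A rough sketch of just the dispatch (illustrative only, not the actual py_fallback body):

    import numpy as np
    import pandas as pd

    vals_1d = pd.array([1, 2, None], dtype="Int64")  # 1D ExtensionArray
    vals_2d = np.arange(6).reshape(2, 3)             # 2D ndarray

    # ndim == 1 covers both 1D ndarrays and (today's) 1D EAs
    obj1 = pd.Series(vals_1d) if vals_1d.ndim == 1 else pd.DataFrame(vals_1d)
    obj2 = pd.Series(vals_2d) if vals_2d.ndim == 1 else pd.DataFrame(vals_2d)
    print(type(obj1).__name__, type(obj2).__name__)  # Series DataFrame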
10 changes: 5 additions & 5 deletions pandas/core/internals/array_manager.py
@@ -85,7 +85,10 @@
     DataManager,
     SingleDataManager,
 )
-from pandas.core.internals.blocks import new_block
+from pandas.core.internals.blocks import (
+    ensure_block_shape,
+    new_block,
+)
 
 if TYPE_CHECKING:
     from pandas import Float64Index
@@ -497,10 +500,7 @@ def quantile(
         interpolation="linear",
     ) -> ArrayManager:
 
-        arrs = [
-            x if not isinstance(x, np.ndarray) else np.atleast_2d(x)
-            for x in self.arrays
-        ]
+        arrs = [ensure_block_shape(x, 2) for x in self.arrays]
         assert axis == 1
         new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs]
         for i, arr in enumerate(new_arrs):
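ensure_block_shape replaces the open-coded np.atleast_2d here. A simplified sketch of its behavior (a hypothetical reimplementation; the real helper in pandas.core.internals.blocks also has to account for 2D-capable ExtensionArrays such as DatetimeArray/TimedeltaArray as that support lands):

    import numpy as np

    def ensure_block_shape_sketch(values, ndim: int = 2):
        # Reshape a 1D ndarray of N values into the (1, N) layout a
        # consolidated Block expects; leave 1D-only EAs untouched.
        if isinstance(values, np.ndarray) and values.ndim < ndim:
            values = values.reshape(1, -1)
        return values

    print(ensure_block_shape_sketch(np.arange(5)).shape)  # (1, 5)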
7 changes: 5 additions & 2 deletions pandas/core/internals/blocks.py
@@ -118,6 +118,9 @@
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
 
 
+_dtype_obj = np.dtype(object)  # comparison is faster than is_object_dtype
+
+
 class Block(PandasObject):
     """
     Canonical n-dimensional unit of homogeneous dtype contained in a pandas

Contributor review comment on the _dtype_obj line: wonder if you could actually just put this in is_object_dtype itself
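On the reviewer's point: the "comparison is faster" claim can be sanity-checked with a quick timing. The numbers below are illustrative and machine-dependent, not from the PR; the idea is that direct np.dtype equality skips the dispatch inside is_object_dtype:

    import timeit
    import numpy as np
    from pandas.api.types import is_object_dtype

    _dtype_obj = np.dtype(object)
    dt = np.dtype("float64")

    print(timeit.timeit(lambda: dt == _dtype_obj, number=1_000_000))
    print(timeit.timeit(lambda: is_object_dtype(dt), number=1_000_000))
    # expect the direct dtype comparison to be noticeably cheaper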
@@ -278,8 +281,8 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray:
         return an internal format, currently just the ndarray
         this is often overridden to handle to_dense like operations
         """
-        if is_object_dtype(dtype):
-            return self.values.astype(object)
+        if dtype == _dtype_obj:
+            return self.values.astype(_dtype_obj)
         return self.values
 
     @final
13 changes: 6 additions & 7 deletions pandas/core/internals/concat.py
@@ -42,7 +42,10 @@
     ExtensionArray,
 )
 from pandas.core.internals.array_manager import ArrayManager
-from pandas.core.internals.blocks import new_block
+from pandas.core.internals.blocks import (
+    ensure_block_shape,
+    new_block,
+)
 from pandas.core.internals.managers import BlockManager
 
 if TYPE_CHECKING:
@@ -420,12 +423,8 @@ def _concatenate_join_units(
         # the non-EA values are 2D arrays with shape (1, n)
         to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
         concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
-        if not is_extension_array_dtype(concat_values.dtype):
-            # if the result of concat is not an EA but an ndarray, reshape to
-            # 2D to put it a non-EA Block
-            # special case DatetimeArray/TimedeltaArray, which *is* an EA, but
-            # is put in a consolidated 2D block
-            concat_values = np.atleast_2d(concat_values)
+        concat_values = ensure_block_shape(concat_values, 2)
 
     else:
         concat_values = concat_compat(to_concat, axis=concat_axis)
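In plain numpy terms, the non-EA branch above does the following (a sketch using np.concatenate in place of pandas' concat_compat):

    import numpy as np

    # each join unit contributes a 1D slice taken from a (1, n) block
    parts = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    concat_values = np.concatenate(parts, axis=0)  # shape (4,)
    # reshape back to the (1, n) layout a consolidated Block expects
    concat_values = concat_values.reshape(1, -1)   # shape (1, 4)
    print(concat_values.shape)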
2 changes: 1 addition & 1 deletion pandas/core/internals/managers.py
@@ -1226,7 +1226,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False
 
         if value.ndim == 2:
             value = value.T
-        elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype):
+        elif value.ndim == self.ndim - 1:
             # TODO(EA2D): special case not needed with 2D EAs
             value = ensure_block_shape(value, ndim=2)
 
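This path runs when a column is added to a DataFrame; with the dtype check gone, the extension-array special-casing lives in ensure_block_shape instead. Through the public API the behavior is unchanged, e.g.:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # a tz-aware column arrives as a 1D array (ndim == mgr.ndim - 1)
    df.insert(1, "ts", pd.date_range("2016-01-01", periods=3, tz="US/Pacific"))
    print(df.dtypes)  # a: int64, ts: datetime64[ns, US/Pacific]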
6 changes: 2 additions & 4 deletions pandas/core/internals/ops.py
@@ -8,8 +8,6 @@
     Tuple,
 )
 
-import numpy as np
-
 from pandas._typing import ArrayLike
 
 if TYPE_CHECKING:
@@ -32,7 +30,7 @@ def _iter_block_pairs(
         locs = blk.mgr_locs
         blk_vals = blk.values
 
-        left_ea = not isinstance(blk_vals, np.ndarray)
+        left_ea = blk_vals.ndim == 1
 
         rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True)
 
@@ -43,7 +41,7 @@ def _iter_block_pairs(
             # assert rblks[0].shape[0] == 1, rblks[0].shape
 
             for k, rblk in enumerate(rblks):
-                right_ea = not isinstance(rblk.values, np.ndarray)
+                right_ea = rblk.values.ndim == 1
 
                 lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
                 info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
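The ndim == 1 test works because, in the consolidated BlockManager, ndarray-backed blocks are stored 2D while extension-array blocks are 1D; once DatetimeArray/TimedeltaArray become 2D-backed, they will stop being treated as the EA case here, which is the point of these prelims. An illustrative peek at the internals (_mgr and .blocks are private and version-dependent):

    import pandas as pd

    df = pd.DataFrame(
        {
            "a": [1, 2, 3],                              # int64 -> 2D ndarray block
            "b": pd.array([1, 2, None], dtype="Int64"),  # 1D ExtensionArray block
        }
    )
    for blk in df._mgr.blocks:
        print(blk.values.ndim, type(blk.values).__name__)
    # expected: 2 ndarray / 1 IntegerArray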
2 changes: 1 addition & 1 deletion pandas/core/nanops.py
@@ -1741,7 +1741,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
 
         # restore NaT elements
         y[mask] = iNaT  # TODO: could try/finally for this?
 
-        if isinstance(values, np.ndarray):
+        if isinstance(values.dtype, np.dtype):
             result = result.view(orig_dtype)
         else:
             # DatetimeArray/TimedeltaArray
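The switch from isinstance(values, np.ndarray) to isinstance(values.dtype, np.dtype) matters because a tz-naive DatetimeArray has a numpy dtype even though it is not an ndarray, so a future 2D DTA would take the ndarray-style view path. A quick demonstration:

    import numpy as np
    import pandas as pd

    naive = pd.array(pd.date_range("2016-01-01", periods=3))  # DatetimeArray
    aware = pd.array(pd.date_range("2016-01-01", periods=3, tz="US/Pacific"))

    print(isinstance(naive, np.ndarray))      # False
    print(isinstance(naive.dtype, np.dtype))  # True: datetime64[ns]
    print(isinstance(aware.dtype, np.dtype))  # False: DatetimeTZDtype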
2 changes: 1 addition & 1 deletion pandas/core/reshape/reshape.py
@@ -440,7 +440,7 @@ def unstack(obj, level, fill_value=None):
             obj.index, level=level, constructor=obj._constructor_expanddim
         )
         return unstacker.get_result(
-            obj.values, value_columns=None, fill_value=fill_value
+            obj._values, value_columns=None, fill_value=fill_value
         )
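Passing obj._values instead of obj.values hands the unstacker the underlying array without conversion: for tz-aware data, .values densifies to a UTC datetime64[ns] ndarray and drops the timezone, while ._values (an internal accessor) keeps the DatetimeArray and its dtype:

    import pandas as pd

    ser = pd.Series(pd.date_range("2016-01-01", periods=3, tz="US/Pacific"))

    print(type(ser.values).__name__)   # ndarray, datetime64[ns] in UTC
    print(type(ser._values).__name__)  # DatetimeArray
    print(ser._values.dtype)           # datetime64[ns, US/Pacific]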