From fae2a97881614dc8968fd407af530d3ad2d80c59 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 Mar 2021 18:14:32 -0700 Subject: [PATCH 1/6] REF: back DatetimeBlock, TimedeltaBlock by DTA/TDA --- pandas/core/array_algos/quantile.py | 4 + pandas/core/array_algos/take.py | 9 ++- pandas/core/arrays/_mixins.py | 22 +++++- pandas/core/frame.py | 3 + pandas/core/groupby/ops.py | 5 +- pandas/core/internals/array_manager.py | 9 ++- pandas/core/internals/blocks.py | 85 +++++++++++----------- pandas/core/internals/concat.py | 11 ++- pandas/core/sorting.py | 12 ++- pandas/tests/arithmetic/test_datetime64.py | 1 + pandas/tests/extension/json/array.py | 9 +-- pandas/tests/groupby/test_apply.py | 5 +- pandas/tests/groupby/test_groupby.py | 6 +- pandas/tests/internals/test_internals.py | 21 +++++- 14 files changed, 132 insertions(+), 70 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index f140ee08aef05..3e2f32ea78e97 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -162,6 +162,10 @@ def quantile_ea_compat( assert result.shape == (1,), result.shape result = type(orig)._from_factorized(result, orig) + elif orig.ndim == 2: + # i.e. DatetimeArray + result = type(orig)._from_factorized(result, orig) + else: assert result.shape == (1, len(qs)), result.shape result = type(orig)._from_factorized(result[0], orig) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index b1311efa718ae..72a8bf71a4521 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -4,6 +4,7 @@ from typing import ( TYPE_CHECKING, Optional, + cast, overload, ) @@ -25,6 +26,7 @@ from pandas.core.construction import ensure_wrapped_if_datetimelike if TYPE_CHECKING: + from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionArray @@ -100,7 +102,12 @@ def take_nd( if not isinstance(arr, np.ndarray): # i.e. ExtensionArray, - # includes for EA to catch DatetimeArray, TimedeltaArray + if arr.ndim == 2: + # e.g. DatetimeArray, TimedeltArray + arr = cast("NDArrayBackedExtensionArray", arr) + return arr.take( + indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis + ) return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) arr = np.asarray(arr) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 848e467afb7b6..2a51bf00ba122 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -20,7 +20,10 @@ cache_readonly, doc, ) -from pandas.util._validators import validate_fillna_kwargs +from pandas.util._validators import ( + validate_bool_kwarg, + validate_fillna_kwargs, +) from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.missing import array_equivalent @@ -35,6 +38,7 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer +from pandas.core.sorting import nargminmax NDArrayBackedExtensionArrayT = TypeVar( "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray" @@ -185,6 +189,22 @@ def equals(self, other) -> bool: def _values_for_argsort(self): return self._ndarray + # Signature of "argmin" incompatible with supertype "ExtensionArray" + def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] + # override base class by adding axis keyword + validate_bool_kwarg(skipna, "skipna") + if not skipna and self.isna().any(): + raise NotImplementedError + return nargminmax(self, "argmin", axis=axis) + + # Signature of "argmax" incompatible with supertype "ExtensionArray" + def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override] + # override base class by adding axis keyword + validate_bool_kwarg(skipna, "skipna") + if not skipna and self.isna().any(): + raise NotImplementedError + return nargminmax(self, "argmax", axis=axis) + def copy(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.copy() return self._from_backing_data(new_data) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 99e91e01845ee..1e4c7131bb8a5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9481,6 +9481,9 @@ def func(values: np.ndarray): def blk_func(values, axis=1): if isinstance(values, ExtensionArray): + if values.ndim == 2: + # i.e. DatetimeArray, TimedeltaArray + return values._reduce(name, axis=1, skipna=skipna, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74e96015b4544..ce6f883ef8225 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -72,6 +72,7 @@ ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -267,7 +268,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): group_keys = self._get_group_keys() result_values = None - if data.ndim == 2 and np.any(data.dtypes.apply(is_extension_array_dtype)): + if data.ndim == 2 and any( + isinstance(x, ExtensionArray) for x in data._iter_column_arrays() + ): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray # TODO: can we have a workaround for EAs backed by ndarray? diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1dae3d586a0a9..e18ed2828eccc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -488,9 +488,12 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T if isinstance(applied, list): applied = applied[0] arr = applied.values - if self.ndim == 2: - if isinstance(arr, np.ndarray): - arr = arr[0, :] + if self.ndim == 2 and arr.ndim == 2: + assert len(arr) == 1 + # error: Invalid index type "Tuple[int, slice]" for + # "Union[ndarray, ExtensionArray]"; expected type + # "Union[int, slice, ndarray]" + arr = arr[0, :] # type: ignore[index] result_arrays.append(arr) return type(self)(result_arrays, self._axes) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 83a7c224060a8..5540b4772fe6c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -25,7 +25,6 @@ writers, ) from pandas._libs.internals import BlockPlacement -from pandas._libs.tslibs import conversion from pandas._typing import ( ArrayLike, Dtype, @@ -43,7 +42,6 @@ maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_upcast, - sanitize_to_nanoseconds, soft_convert_objects, ) from pandas.core.dtypes.common import ( @@ -115,6 +113,7 @@ Float64Index, Index, ) + from pandas.core.arrays import TimedeltaArray from pandas.core.arrays._mixins import NDArrayBackedExtensionArray # comparison is faster than is_object_dtype @@ -934,7 +933,11 @@ def setitem(self, indexer, value): return self.coerce_to_target_dtype(value).setitem(indexer, value) if self.dtype.kind in ["m", "M"]: - arr = self.array_values().T + arr = self.values + if self.ndim > 1: + # Dont transpose with ndim=1 bc we would fail to invalidate + # arr.freq + arr = arr.T arr[indexer] = value return self @@ -1168,6 +1171,7 @@ def _interpolate_with_fill( limit_area=limit_area, ) + values = maybe_coerce_values(values) blocks = [self.make_block_same_class(values)] return self._maybe_downcast(blocks, downcast) @@ -1223,6 +1227,7 @@ def func(yvalues: np.ndarray) -> np.ndarray: # interp each column independently interp_values = np.apply_along_axis(func, axis, data) + interp_values = maybe_coerce_values(interp_values) blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) @@ -1845,15 +1850,23 @@ class NDArrayBackedExtensionBlock(HybridMixin, Block): Block backed by an NDArrayBackedExtensionArray """ + values: NDArrayBackedExtensionArray + + @property + def is_view(self) -> bool: + """ return a boolean if I am possibly a view """ + # check the ndarray values of the DatetimeIndex values + return self.values._ndarray.base is not None + def internal_values(self): # Override to return DatetimeArray and TimedeltaArray - return self.array_values() + return self.values def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ return object dtype as boxed values, such as Timestamps/Timedelta """ - values = self.array_values() + values = self.values if is_object_dtype(dtype): # DTA/TDA constructor and astype can handle 2D values = values.astype(object) @@ -1863,7 +1876,7 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: def iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta # TODO(EA2D): this can be removed if we ever have 2D EA - return self.array_values().reshape(self.shape)[key] + return self.values.reshape(self.shape)[key] def putmask(self, mask, new) -> List[Block]: mask = extract_bool_array(mask) @@ -1872,14 +1885,13 @@ def putmask(self, mask, new) -> List[Block]: return self.astype(object).putmask(mask, new) # TODO(EA2D): reshape unnecessary with 2D EAs - arr = self.array_values().reshape(self.shape) - arr = cast("NDArrayBackedExtensionArray", arr) + arr = self.values.reshape(self.shape) arr.T.putmask(mask, new) return [self] def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: # TODO(EA2D): reshape unnecessary with 2D EAs - arr = self.array_values().reshape(self.shape) + arr = self.values.reshape(self.shape) cond = extract_bool_array(cond) @@ -1890,7 +1902,6 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: # TODO(EA2D): reshape not needed with 2D EAs res_values = res_values.reshape(self.values.shape) - res_values = maybe_coerce_values(res_values) nb = self.make_block_same_class(res_values) return [nb] @@ -1915,17 +1926,15 @@ def diff(self, n: int, axis: int = 0) -> List[Block]: by apply. """ # TODO(EA2D): reshape not necessary with 2D EAs - values = self.array_values().reshape(self.shape) + values = self.values.reshape(self.shape) new_values = values - values.shift(n, axis=axis) - new_values = maybe_coerce_values(new_values) return [self.make_block(new_values)] def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]: # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs - values = self.array_values().reshape(self.shape) + values = self.values.reshape(self.shape) new_values = values.shift(periods, fill_value=fill_value, axis=axis) - new_values = maybe_coerce_values(new_values) return [self.make_block_same_class(new_values)] def fillna( @@ -1938,25 +1947,31 @@ def fillna( # TODO: don't special-case td64 return self.astype(object).fillna(value, limit, inplace, downcast) - values = self.array_values() + values = self.values values = values if inplace else values.copy() new_values = values.fillna(value=value, limit=limit) - new_values = maybe_coerce_values(new_values) return [self.make_block_same_class(values=new_values)] class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" + values: Union[DatetimeArray, TimedeltaArray] + is_numeric = False _can_hold_na = True def array_values(self): - return ensure_wrapped_if_datetimelike(self.values) + return self.values + + def external_values(self): + # NB: for dt64tz this is different from np.asarray(self.values), + # since that return an object-dtype ndarray of Timestamps. + return self.values._ndarray @property def _holder(self): - return type(self.array_values()) + return type(self.values) @property def fill_value(self): @@ -1964,7 +1979,7 @@ def fill_value(self): def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ - arr = self.array_values() + arr = self.values result = arr._format_native_types(na_rep=na_rep, **kwargs) result = result.astype(object, copy=False) @@ -1974,14 +1989,6 @@ def to_native_types(self, na_rep="NaT", **kwargs): class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () - def set_inplace(self, locs, values): - """ - See Block.set.__doc__ - """ - values = conversion.ensure_datetime64ns(values, copy=False) - - self.values[locs] = values - class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ @@ -2000,20 +2007,14 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): where = DatetimeBlock.where putmask = DatetimeLikeBlockMixin.putmask fillna = DatetimeLikeBlockMixin.fillna + external_values = DatetimeLikeBlockMixin.external_values - array_values = ExtensionBlock.array_values + # error: Incompatible types in assignment (expression has type + # "Callable[[NDArrayBackedExtensionBlock], bool]", base class "ExtensionBlock" + # defined the type as "bool") [assignment] + is_view = NDArrayBackedExtensionBlock.is_view # type: ignore[assignment] - @property - def is_view(self) -> bool: - """ return a boolean if I am possibly a view """ - # check the ndarray values of the DatetimeIndex values - return self.values._data.base is not None - - def external_values(self): - # NB: this is different from np.asarray(self.values), since that - # return an object-dtype ndarray of Timestamps. - # Avoid FutureWarning in .astype in casting from dt64tz to dt64 - return self.values._data + array_values = ExtensionBlock.array_values class TimeDeltaBlock(DatetimeLikeBlockMixin): @@ -2170,15 +2171,11 @@ def maybe_coerce_values(values) -> ArrayLike: values = extract_array(values, extract_numpy=True) if isinstance(values, np.ndarray): - values = sanitize_to_nanoseconds(values) + values = ensure_wrapped_if_datetimelike(values) if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) - elif isinstance(values.dtype, np.dtype): - # i.e. not datetime64tz, extract DTA/TDA -> ndarray - values = values._data - return values diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b82ab807562f4..07438bf3bc66b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -422,10 +422,17 @@ def _concatenate_join_units( concat_values = concat_values.copy() else: concat_values = concat_values.copy() - elif any(isinstance(t, ExtensionArray) for t in to_concat): + elif any(isinstance(t, ExtensionArray) and t.ndim == 1 for t in to_concat): # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) - to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] + # error: Invalid index type "Tuple[int, slice]" for + # "Union[ExtensionArray, ndarray]"; expected type "Union[int, slice, ndarray]" + to_concat = [ + t + if (isinstance(t, ExtensionArray) and t.ndim == 1) + else t[0, :] # type: ignore[index] + for t in to_concat + ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) concat_values = ensure_block_shape(concat_values, 2) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 720643d3d98aa..b81483cbf2e93 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -397,7 +397,7 @@ def nargsort( return indexer -def nargminmax(values, method: str): +def nargminmax(values, method: str, axis: int = 0): """ Implementation of np.argmin/argmax but for ExtensionArray and which handles missing values. @@ -406,6 +406,7 @@ def nargminmax(values, method: str): ---------- values : ExtensionArray method : {"argmax", "argmin"} + axis: int, default 0 Returns ------- @@ -417,11 +418,16 @@ def nargminmax(values, method: str): mask = np.asarray(isna(values)) values = values._values_for_argsort() - idx = np.arange(len(values)) + idx = np.arange(values.shape[axis]) + if values.ndim > 1: + if mask.any(): + raise NotImplementedError + return func(values, axis=axis) + non_nans = values[~mask] non_nan_idx = idx[~mask] - return non_nan_idx[func(non_nans)] + return non_nan_idx[func(non_nans, axis=axis)] def _ensure_key_mapped_multiindex( diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f75b3800f623f..215b51dd88ef4 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1968,6 +1968,7 @@ def test_operators_datetimelike_with_timezones(self): td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) td2 = td1.copy() td2.iloc[1] = np.nan + assert td2._values.freq is None result = dt1 + td1[0] exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ca593da6d97bc..525a337b63f74 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -30,6 +30,7 @@ import numpy as np +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import pandas_dtype import pandas as pd @@ -207,11 +208,9 @@ def _values_for_factorize(self): return frozen, () def _values_for_argsort(self): - # Disable NumPy's shape inference by including an empty tuple... - # If all the elements of self are the same size P, NumPy will - # cast them to an (N, P) array, instead of an (N,) array of tuples. - frozen = [()] + [tuple(x.items()) for x in self] - return np.array(frozen, dtype=object)[1:] + # Bypass NumPy's shape inference to get a (N,) array of tuples. + frozen = [tuple(x.items()) for x in self] + return construct_1d_object_array_from_listlike(frozen) def make_data(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index eb54887cea277..117612696df11 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1008,11 +1008,8 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) -def test_apply_with_timezones_aware(using_array_manager, request): +def test_apply_with_timezones_aware(): # GH: 27212 - if not using_array_manager: - request.node.add_marker(pytest.mark.xfail(reason="GH-34998")) - dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2 index_no_tz = pd.DatetimeIndex(dates) index_tz = pd.DatetimeIndex(dates, tz="UTC") diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index afde1daca74c1..50572665db6b5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -820,7 +820,7 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_omit_nuisance(df): +def test_omit_nuisance(df, using_array_manager): grouped = df.groupby("A") result = grouped.mean() @@ -840,7 +840,9 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = "reduction operation 'sum' not allowed for this dtype" + msg = "'DatetimeArray' does not implement reduction 'sum'" + if using_array_manager: + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3c37d827c0778..f352f9ce77574 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -431,13 +431,26 @@ def test_copy(self, mgr): cp = mgr.copy(deep=True) for blk, cp_blk in zip(mgr.blocks, cp.blocks): + bvals = blk.values + cpvals = cp_blk.values + + tm.assert_equal(cpvals, bvals) + + if isinstance(cpvals, np.ndarray): + lbase = cpvals.base + rbase = bvals.base + else: + lbase = cpvals._ndarray.base + rbase = bvals._ndarray.base + # copy assertion we either have a None for a base or in case of # some blocks it is an array (e.g. datetimetz), but was copied - tm.assert_equal(cp_blk.values, blk.values) - if not isinstance(cp_blk.values, np.ndarray): - assert cp_blk.values._data.base is not blk.values._data.base + if isinstance(cpvals, DatetimeArray): + assert (lbase is None and rbase is None) or (lbase is not rbase) + elif not isinstance(cpvals, np.ndarray): + assert lbase is not rbase else: - assert cp_blk.values.base is None and blk.values.base is None + assert lbase is None and rbase is None def test_sparse(self): mgr = create_mgr("a: sparse-1; b: sparse-2") From c776390628ebd5bf304884ddb2630e1b7c1aec69 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 10:25:27 -0700 Subject: [PATCH 2/6] fix pytables tests --- pandas/io/pytables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 02a723902271e..bbc3ee141db3d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3963,7 +3963,9 @@ def _create_axes( typ = klass._get_atom(data_converted) kind = _dtype_to_kind(data_converted.dtype.name) - tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None + tz = None + if getattr(data_converted, "tz", None) is not None: + tz = _get_tz(data_converted.tz) meta = metadata = ordered = None if is_categorical_dtype(data_converted.dtype): From c3ce5039c975a95114bb56d1e8e97e0b95cb9abf Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 14:47:15 -0700 Subject: [PATCH 3/6] exception message shared AM/BM --- pandas/tests/groupby/test_groupby.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 50572665db6b5..de508b8cd78ec 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -820,7 +820,7 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_omit_nuisance(df, using_array_manager): +def test_omit_nuisance(df): grouped = df.groupby("A") result = grouped.mean() @@ -841,8 +841,6 @@ def test_omit_nuisance(df, using_array_manager): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) msg = "'DatetimeArray' does not implement reduction 'sum'" - if using_array_manager: - msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) From d7f271809cdcbe373de685146ca9ed12ea82a820 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Mar 2021 07:39:29 -0700 Subject: [PATCH 4/6] nanargminmax with ndim=2 and mask.any() --- pandas/core/sorting.py | 17 +++++++++--- pandas/tests/frame/test_reductions.py | 37 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 7efe5e6947d3d..5e10c5ca1fef7 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -417,16 +417,27 @@ def nargminmax(values, method: str, axis: int = 0): mask = np.asarray(isna(values)) values = values._values_for_argsort() - idx = np.arange(values.shape[axis]) if values.ndim > 1: if mask.any(): - raise NotImplementedError + if axis == 1: + zipped = zip(values, mask) + else: + zipped = zip(values.T, mask.T) + return np.array([_nanargminmax(v, m, func) for v, m in zipped]) return func(values, axis=axis) + return _nanargminmax(values, mask, func) + + +def _nanargminmax(values, mask, func) -> int: + """ + See nanargminmax.__doc__. + """ + idx = np.arange(values.shape[0]) non_nans = values[~mask] non_nan_idx = idx[~mask] - return non_nan_idx[func(non_nans, axis=axis)] + return non_nan_idx[func(non_nans)] def _ensure_key_mapped_multiindex( diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d24320ad17709..3b04722463b7f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1002,6 +1002,43 @@ def test_idxmax_mixed_dtype(self): expected = Series([0, 2, 0], index=[1, 2, 3]) tm.assert_series_equal(result, expected) + # with NaTs + df.loc[0, 3] = pd.NaT + result = df.idxmax() + expected = Series([1, 0, 2], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([0, 2, 1], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + # with multi-column dt64 block + df[4] = dti[::-1] + df._consolidate_inplace() + + result = df.idxmax() + expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) + tm.assert_series_equal(result, expected) + + def test_idxmax_dt64_multicolumn_axis1(self): + dti = date_range("2016-01-01", periods=3) + df = DataFrame({3: dti, 4: dti[::-1]}) + df.iloc[0, 0] = pd.NaT + + df._consolidate_inplace() + + result = df.idxmax(axis=1) + expected = Series([4, 3, 3]) + tm.assert_series_equal(result, expected) + + result = df.idxmin(axis=1) + expected = Series([4, 3, 4]) + tm.assert_series_equal(result, expected) + # ---------------------------------------------------------------------- # Logical reductions From c7ff2372cab5c5d1ab1590b25196127c58111951 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 10:48:37 -0700 Subject: [PATCH 5/6] REF: remove DatetimeBlock, TimeDeltaBlock --- pandas/core/internals/__init__.py | 4 -- pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/blocks.py | 53 +++++++-------------- pandas/core/internals/concat.py | 8 +++- pandas/core/internals/managers.py | 13 ++--- pandas/tests/frame/methods/test_quantile.py | 2 +- pandas/tests/internals/test_api.py | 2 - pandas/tests/series/methods/test_dropna.py | 4 +- pandas/tests/series/methods/test_fillna.py | 4 +- 9 files changed, 34 insertions(+), 58 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index f0018928255e6..af1350f088b7a 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -9,12 +9,10 @@ ) from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, - DatetimeBlock, DatetimeTZBlock, ExtensionBlock, NumericBlock, ObjectBlock, - TimeDeltaBlock, ) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( @@ -28,11 +26,9 @@ "Block", "CategoricalBlock", "NumericBlock", - "DatetimeBlock", "DatetimeTZBlock", "ExtensionBlock", "ObjectBlock", - "TimeDeltaBlock", "make_block", "DataManager", "ArrayManager", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 070fcccc7f97f..5878220ff27e7 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -471,7 +471,7 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "tz" if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] - # DatetimeArray needs to be converted to ndarray for DatetimeBlock + # DatetimeArray needs to be converted to ndarray for DatetimeLikeBlock # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "_data" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index aed622a41580e..4dafe24c2a4c4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -52,7 +52,6 @@ is_dtype_equal, is_extension_array_dtype, is_list_like, - is_object_dtype, is_sparse, pandas_dtype, ) @@ -209,12 +208,6 @@ def is_bool(self) -> bool: def external_values(self): return external_values(self.values) - def internal_values(self): - """ - The array that Series._values returns (internal values). - """ - return self.values - def array_values(self) -> ExtensionArray: """ The array that Series.array returns. Always an ExtensionArray. @@ -1767,8 +1760,7 @@ def is_view(self) -> bool: # check the ndarray values of the DatetimeIndex values return self.values._ndarray.base is not None - def internal_values(self): - # Override to return DatetimeArray and TimedeltaArray + def array_values(self) -> ExtensionArray: return self.values def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: @@ -1776,7 +1768,7 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: return object dtype as boxed values, such as Timestamps/Timedelta """ values = self.values - if is_object_dtype(dtype): + if dtype == _dtype_obj: # DTA/TDA constructor and astype can handle 2D values = values.astype(object) # TODO(EA2D): reshape not needed with 2D EAs @@ -1827,7 +1819,7 @@ def diff(self, n: int, axis: int = 0) -> List[Block]: Returns ------- - A list with a new TimeDeltaBlock. + A list with a new Block. Notes ----- @@ -1862,22 +1854,17 @@ def fillna( return [self.make_block_same_class(values=new_values)] -class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): - """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" +class DatetimeLikeBlock(NDArrayBackedExtensionBlock): + """Mixin class for DatetimeLikeBlock, DatetimeTZBlock.""" + + __slots__ = () values: Union[DatetimeArray, TimedeltaArray] is_numeric = False - def array_values(self): - return self.values - -class DatetimeBlock(DatetimeLikeBlockMixin): - __slots__ = () - - -class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): +class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlock): """ implement a datetime64 block with a tz attribute """ values: DatetimeArray @@ -1886,12 +1873,14 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): is_extension = True is_numeric = False - internal_values = Block.internal_values - _can_hold_element = DatetimeBlock._can_hold_element - diff = DatetimeBlock.diff - where = DatetimeBlock.where - putmask = DatetimeLikeBlockMixin.putmask - fillna = DatetimeLikeBlockMixin.fillna + _can_hold_element = NDArrayBackedExtensionBlock._can_hold_element + diff = NDArrayBackedExtensionBlock.diff + where = NDArrayBackedExtensionBlock.where + putmask = NDArrayBackedExtensionBlock.putmask + fillna = NDArrayBackedExtensionBlock.fillna + + array_values = NDArrayBackedExtensionBlock.array_values + get_values = NDArrayBackedExtensionBlock.get_values # error: Incompatible types in assignment (expression has type # "Callable[[NDArrayBackedExtensionBlock], bool]", base class "ExtensionBlock" @@ -1899,10 +1888,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): is_view = NDArrayBackedExtensionBlock.is_view # type: ignore[assignment] -class TimeDeltaBlock(DatetimeLikeBlockMixin): - __slots__ = () - - class ObjectBlock(Block): __slots__ = () is_object = True @@ -2033,10 +2018,8 @@ def get_block_type(values, dtype: Optional[Dtype] = None): # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock - elif kind == "M": - cls = DatetimeBlock - elif kind == "m": - cls = TimeDeltaBlock + elif kind in ["M", "m"]: + cls = DatetimeLikeBlock elif kind in ["f", "c", "i", "u", "b"]: cls = NumericBlock else: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 02d582c439ea2..7e637b84c8f7d 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -505,12 +505,16 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: _concatenate_join_units (which uses `concat_compat`). """ - # TODO: require dtype match in addition to same type? e.g. DatetimeTZBlock - # cannot necessarily join return ( # all blocks need to have the same type all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa and + # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform + all( + is_dtype_equal(ju.block.dtype, join_units[0].block.dtype) + for ju in join_units + ) + and # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. all(not ju.is_na or ju.block.is_extension for ju in join_units) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e4cce731b7b56..b941119d0be4e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -35,7 +35,6 @@ from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( - DT64NS_DTYPE, ensure_int64, is_dtype_equal, is_extension_array_dtype, @@ -1676,7 +1675,7 @@ def external_values(self): def internal_values(self): """The array that Series._values returns""" - return self._block.internal_values() + return self._block.values def array_values(self): """The array that Series.array returns""" @@ -1823,13 +1822,9 @@ def _form_blocks( numeric_blocks = _multi_blockify(items_dict["NumericBlock"]) blocks.extend(numeric_blocks) - if len(items_dict["TimeDeltaBlock"]): - timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"]) - blocks.extend(timedelta_blocks) - - if len(items_dict["DatetimeBlock"]): - datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE) - blocks.extend(datetime_blocks) + if len(items_dict["DatetimeLikeBlock"]): + dtlike_blocks = _multi_blockify(items_dict["DatetimeLikeBlock"]) + blocks.extend(dtlike_blocks) if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index bc1d4605e985a..dbb5cb357de47 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -336,7 +336,7 @@ def test_quantile_box(self): ) tm.assert_frame_equal(res, exp) - # DatetimeBlock may be consolidated and contain NaT in different loc + # DatetimeLikeBlock may be consolidated and contain NaT in different loc df = DataFrame( { "A": [ diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 60fbd2da70e79..0062d5aa34319 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -27,11 +27,9 @@ def test_namespace(): expected = [ "Block", "NumericBlock", - "DatetimeBlock", "DatetimeTZBlock", "ExtensionBlock", "ObjectBlock", - "TimeDeltaBlock", "make_block", "DataManager", "ArrayManager", diff --git a/pandas/tests/series/methods/test_dropna.py b/pandas/tests/series/methods/test_dropna.py index 1c7c52d228cfa..5bff7306fac33 100644 --- a/pandas/tests/series/methods/test_dropna.py +++ b/pandas/tests/series/methods/test_dropna.py @@ -70,7 +70,7 @@ def test_dropna_period_dtype(self): tm.assert_series_equal(result, expected) def test_datetime64_tz_dropna(self): - # DatetimeBlock + # DatetimeLikeBlock ser = Series( [ Timestamp("2011-01-01 10:00"), @@ -85,7 +85,7 @@ def test_datetime64_tz_dropna(self): ) tm.assert_series_equal(result, expected) - # DatetimeBlockTZ + # DatetimeTZBlock idx = DatetimeIndex( ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz="Asia/Tokyo" ) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index cf6b357d0a418..51864df915f8c 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -334,7 +334,7 @@ def test_datetime64_fillna(self): @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) def test_datetime64_tz_fillna(self, tz): - # DatetimeBlock + # DatetimeLikeBlock ser = Series( [ Timestamp("2011-01-01 10:00"), @@ -414,7 +414,7 @@ def test_datetime64_tz_fillna(self, tz): tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) - # DatetimeBlockTZ + # DatetimeTZBlock idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) ser = Series(idx) assert ser.dtype == f"datetime64[ns, {tz}]" From fb18444307d877a2e1b57cd3e575ff5968d11c66 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 06:55:01 -0700 Subject: [PATCH 6/6] mypy fixup --- pandas/core/internals/blocks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fc70fdfe01331..d1d0db913f854 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1880,13 +1880,11 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlock): is_extension = True is_numeric = False - _can_hold_element = NDArrayBackedExtensionBlock._can_hold_element diff = NDArrayBackedExtensionBlock.diff where = NDArrayBackedExtensionBlock.where putmask = NDArrayBackedExtensionBlock.putmask fillna = NDArrayBackedExtensionBlock.fillna - array_values = NDArrayBackedExtensionBlock.array_values get_values = NDArrayBackedExtensionBlock.get_values # error: Incompatible types in assignment (expression has type