From 882ffba70ab92eed3092bf83cb9dbef9baacd602 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 16:03:56 -0700 Subject: [PATCH 1/2] PERF: concat_same_type for PeriodDtype --- pandas/_libs/arrays.pyx | 7 ++++ pandas/core/arrays/_mixins.py | 10 ++--- pandas/core/arrays/datetimelike.py | 65 +++++++++++++----------------- pandas/core/dtypes/dtypes.py | 9 ++--- 4 files changed, 43 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index df2730512c9c9..718fb358e26bc 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -182,3 +182,10 @@ cdef class NDArrayBacked: def transpose(self, *axes): res_values = self._ndarray.transpose(*axes) return self._from_backing_data(res_values) + + @classmethod + def _concat_same_type(cls, to_concat, axis=0): + # NB: We are assuming at this point that dtypes all match + new_values = [obj._ndarray for obj in to_concat] + new_arr = cnp.PyArray_Concatenate(new_values, axis) + return to_concat[0]._from_backing_data(new_arr) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index acf8cbc8fd545..74a8d90e0de4f 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -224,13 +224,11 @@ def _concat_same_type( to_concat: Sequence[Self], axis: AxisInt = 0, ) -> Self: - dtypes = {str(x.dtype) for x in to_concat} - if len(dtypes) != 1: - raise ValueError("to_concat must have the same dtype (tz)", dtypes) + if not lib.dtypes_all_equal([x.dtype for x in to_concat]): + dtypes = {str(x.dtype) for x in to_concat} + raise ValueError("to_concat must have the same dtype", dtypes) - new_values = [x._ndarray for x in to_concat] - new_arr = np.concatenate(new_values, axis=axis) - return to_concat[0]._from_backing_data(new_arr) + return super()._concat_same_type(to_concat, axis=axis) @doc(ExtensionArray.searchsorted) def searchsorted( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 995fec9963bdb..d7ed105d04233 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -505,42 +505,6 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: # are present in this file. return super().view(dtype) - # ------------------------------------------------------------------ - # ExtensionArray Interface - - @classmethod - def _concat_same_type( - cls, - to_concat: Sequence[Self], - axis: AxisInt = 0, - ) -> Self: - new_obj = super()._concat_same_type(to_concat, axis) - - obj = to_concat[0] - dtype = obj.dtype - - new_freq = None - if isinstance(dtype, PeriodDtype): - new_freq = obj.freq - elif axis == 0: - # GH 3232: If the concat result is evenly spaced, we can retain the - # original frequency - to_concat = [x for x in to_concat if len(x)] - - if obj.freq is not None and all(x.freq == obj.freq for x in to_concat): - pairs = zip(to_concat[:-1], to_concat[1:]) - if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): - new_freq = obj.freq - - new_obj._freq = new_freq - return new_obj - - def copy(self, order: str = "C") -> Self: - # error: Unexpected keyword argument "order" for "copy" - new_obj = super().copy(order=order) # type: ignore[call-arg] - new_obj._freq = self.freq - return new_obj - # ------------------------------------------------------------------ # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior @@ -2085,6 +2049,7 @@ def _with_freq(self, freq) -> Self: return arr # -------------------------------------------------------------- + # ExtensionArray Interface def factorize( self, @@ -2102,6 +2067,34 @@ def factorize( # FIXME: shouldn't get here; we are ignoring sort return super().factorize(use_na_sentinel=use_na_sentinel) + @classmethod + def _concat_same_type( + cls, + to_concat: Sequence[Self], + axis: AxisInt = 0, + ) -> Self: + new_obj = super()._concat_same_type(to_concat, axis) + + obj = to_concat[0] + + if axis == 0: + # GH 3232: If the concat result is evenly spaced, we can retain the + # original frequency + to_concat = [x for x in to_concat if len(x)] + + if obj.freq is not None and all(x.freq == obj.freq for x in to_concat): + pairs = zip(to_concat[:-1], to_concat[1:]) + if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): + new_freq = obj.freq + new_obj._freq = new_freq + return new_obj + + def copy(self, order: str = "C") -> Self: + # error: Unexpected keyword argument "order" for "copy" + new_obj = super().copy(order=order) # type: ignore[call-arg] + new_obj._freq = self.freq + return new_obj + # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 26a23f59d7dc6..4698aec5d5312 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -956,12 +956,9 @@ def __eq__(self, other: Any) -> bool: elif isinstance(other, PeriodDtype): # For freqs that can be held by a PeriodDtype, this check is # equivalent to (and much faster than) self.freq == other.freq - sfreq = self.freq - ofreq = other.freq - return ( - sfreq.n == ofreq.n - and sfreq._period_dtype_code == ofreq._period_dtype_code - ) + sfreq = self._freq + ofreq = other._freq + return sfreq.n == ofreq.n and self._dtype_code == other._dtype_code return False From 8292c9f2cb5073125c1a0ac437d6e42705000015 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 17:30:56 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/_libs/arrays.pyi | 6 ++++++ pandas/core/arrays/datetimelike.py | 6 +++++- pandas/core/arrays/datetimes.py | 4 +++- pandas/core/arrays/numpy_.py | 4 +++- pandas/core/arrays/period.py | 9 +++++++-- pandas/core/arrays/string_.py | 4 +++- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index c9350ed9b8a75..78fee8f01319c 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -3,7 +3,9 @@ from typing import Sequence import numpy as np from pandas._typing import ( + AxisInt, DtypeObj, + Self, Shape, ) @@ -32,3 +34,7 @@ class NDArrayBacked: def ravel(self, order=...): ... @property def T(self): ... + @classmethod + def _concat_same_type( + cls, to_concat: Sequence[Self], axis: AxisInt = ... + ) -> Self: ... diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index d7ed105d04233..c036dc09948d8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -182,7 +182,11 @@ def new_meth(self, *args, **kwargs): return cast(F, new_meth) -class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class DatetimeLikeArrayMixin( # type: ignore[misc] + OpsMixin, NDArrayBackedExtensionArray +): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index af270ae040125..0143b68c3bd3a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -151,7 +151,9 @@ def f(self): return property(f) -class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index c97305e65e38b..b8ba461d2806a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -40,7 +40,9 @@ ) -class PandasArray( +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class PandasArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, ObjectStringArrayMixin, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 83d50f2d0832a..a6ba738bd5983 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -109,7 +109,9 @@ def f(self): return property(f) -class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin): +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin): # type: ignore[misc] """ Pandas ExtensionArray for storing Period data. @@ -263,7 +265,10 @@ def _from_sequence( validate_dtype_freq(scalars.dtype, freq) if copy: scalars = scalars.copy() - return scalars + # error: Incompatible return value type + # (got "Union[Sequence[Optional[Period]], Union[Union[ExtensionArray, + # ndarray[Any, Any]], Index, Series]]", expected "PeriodArray") + return scalars # type: ignore[return-value] periods = np.asarray(scalars, dtype=object) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2508bad80dc26..3bafc1d7b4977 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -228,7 +228,9 @@ def tolist(self): return list(self.to_numpy()) -class StringArray(BaseStringArray, PandasArray): +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class StringArray(BaseStringArray, PandasArray): # type: ignore[misc] """ Extension array for string data.