PERF: make DTA/TDA/PA _ndarray the attribute, _data the property (#40007)

jbrockmendel · web-flow · commit 8ceec8b178d7 · 2021-02-24T13:08:21.000-05:00
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -156,7 +156,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
     _infer_matches: Tuple[str, ...]
     _is_recognized_dtype: Callable[[DtypeObj], bool]
     _recognized_scalars: Tuple[Type, ...]
-    _data: np.ndarray
+    _ndarray: np.ndarray
 
     def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False):
         raise AbstractMethodError(self)
@@ -253,9 +253,24 @@ def _check_compatible_with(
     # ------------------------------------------------------------------
     # NDArrayBackedExtensionArray compat
 
+    def __setstate__(self, state):
+        if isinstance(state, dict):
+            if "_data" in state and "_ndarray" not in state:
+                # backward compat: "_data" became a property; store under "_ndarray"
+                state["_ndarray"] = state.pop("_data")
+            for key, value in state.items():
+                setattr(self, key, value)
+        else:
+            # PeriodArray, because it mixes in a cython class; unwrap a 1-tuple state
+            if isinstance(state, tuple) and len(state) == 1:
+                state = state[0]
+                self.__setstate__(state)
+            else:
+                raise TypeError(state)
+
     @cache_readonly
-    def _ndarray(self) -> np.ndarray:
-        return self._data
+    def _data(self) -> np.ndarray:
+        return self._ndarray
 
     def _from_backing_data(
         self: DatetimeLikeArrayT, arr: np.ndarray
@@ -294,7 +309,7 @@ def asi8(self) -> np.ndarray:
             An ndarray with int64 dtype.
         """
         # do not cache or you'll create a memory leak
-        return self._data.view("i8")
+        return self._ndarray.view("i8")
 
     # ----------------------------------------------------------------
     # Rendering Methods
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -261,7 +261,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
 
             if freq is None:
                 freq = values.freq
-            values = values._data
+            values = values._ndarray
 
         if not isinstance(values, np.ndarray):
             raise ValueError(
@@ -303,7 +303,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
             # be incorrect(ish?) for the array as a whole
             dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))
 
-        self._data = values
+        self._ndarray = values
         self._dtype = dtype
         self._freq = freq
 
@@ -320,7 +320,7 @@ def _simple_new(
             values = values.view(DT64NS_DTYPE)
 
         result = object.__new__(cls)
-        result._data = values
+        result._ndarray = values
         result._freq = freq
         result._dtype = dtype
         return result
@@ -618,7 +618,7 @@ def astype(self, dtype, copy=True):
 
         elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype:
             # unit conversion e.g. datetime64[s]
-            return self._data.astype(dtype)
+            return self._ndarray.astype(dtype)
 
         elif is_period_dtype(dtype):
             return self.to_period(freq=dtype.freq)
@@ -1138,7 +1138,7 @@ def to_period(self, freq=None):
 
             freq = res
 
-        return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)
+        return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz)
 
     def to_perioddelta(self, freq):
         """
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -181,6 +181,8 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps):
     _datetimelike_ops = _field_ops + _object_ops + _bool_ops
     _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"]
 
+    __setstate__ = dtl.DatelikeOps.__setstate__
+
     # --------------------------------------------------------------------
     # Constructors
 
@@ -201,10 +203,10 @@ def __init__(self, values, dtype: Optional[Dtype] = None, freq=None, copy=False)
         if isinstance(values, type(self)):
             if freq is not None and freq != values.freq:
                 raise raise_on_incompatible(values, freq)
-            values, freq = values._data, values.freq
+            values, freq = values._ndarray, values.freq
 
         values = np.array(values, dtype="int64", copy=copy)
-        self._data = values
+        self._ndarray = values
         if freq is None:
             raise ValueError("freq is not specified and cannot be inferred")
         self._dtype = PeriodDtype(freq)
@@ -347,7 +349,7 @@ def __arrow_array__(self, type=None):
 
         if type is not None:
             if pyarrow.types.is_integer(type):
-                return pyarrow.array(self._data, mask=self.isna(), type=type)
+                return pyarrow.array(self._ndarray, mask=self.isna(), type=type)
             elif isinstance(type, ArrowPeriodType):
                 # ensure we have the same freq
                 if self.freqstr != type.freq:
@@ -361,7 +363,7 @@ def __arrow_array__(self, type=None):
                 )
 
         period_type = ArrowPeriodType(self.freqstr)
-        storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64")
+        storage_array = pyarrow.array(self._ndarray, mask=self.isna(), type="int64")
         return pyarrow.ExtensionArray.from_storage(period_type, storage_array)
 
     # --------------------------------------------------------------------
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -166,6 +166,8 @@ def dtype(self) -> np.dtype:
     # ----------------------------------------------------------------
     # Constructors
 
+    _freq = None
+
     def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
         values = extract_array(values)
 
@@ -182,7 +184,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
             elif freq and values.freq:
                 freq = to_offset(freq)
                 freq, _ = dtl.validate_inferred_freq(freq, values.freq, False)
-            values = values._data
+            values = values._ndarray
 
         if not isinstance(values, np.ndarray):
             msg = (
@@ -214,7 +216,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
         if freq:
             freq = to_offset(freq)
 
-        self._data = values
+        self._ndarray = values
         self._dtype = dtype
         self._freq = freq
 
@@ -232,7 +234,7 @@ def _simple_new(
             values = values.view(TD64NS_DTYPE)
 
         result = object.__new__(cls)
-        result._data = values
+        result._ndarray = values
         result._freq = to_offset(freq)
         result._dtype = TD64NS_DTYPE
         return result
@@ -344,7 +346,7 @@ def astype(self, dtype, copy: bool = True):
         dtype = pandas_dtype(dtype)
 
         if dtype.kind == "m":
-            return astype_td64_unit_conversion(self._data, dtype, copy=copy)
+            return astype_td64_unit_conversion(self._ndarray, dtype, copy=copy)
 
         return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)
 
@@ -418,8 +420,8 @@ def _formatter(self, boxed=False):
     def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         from pandas.io.formats.format import get_format_timedelta64
 
-        formatter = get_format_timedelta64(self._data, na_rep)
-        return np.array([formatter(x) for x in self._data])
+        formatter = get_format_timedelta64(self._ndarray, na_rep)
+        return np.array([formatter(x) for x in self._ndarray])
 
     # ----------------------------------------------------------------
     # Arithmetic Methods
@@ -488,7 +490,7 @@ def _addsub_object_array(self, other, op):
     def __mul__(self, other) -> TimedeltaArray:
         if is_scalar(other):
             # numpy will accept float and int, raise TypeError for others
-            result = self._data * other
+            result = self._ndarray * other
             freq = None
             if self.freq is not None and not isna(other):
                 freq = self.freq * other
@@ -511,7 +513,7 @@ def __mul__(self, other) -> TimedeltaArray:
             return type(self)(result)
 
         # numpy will accept float or int dtype, raise TypeError for others
-        result = self._data * other
+        result = self._ndarray * other
         return type(self)(result)
 
     __rmul__ = __mul__
@@ -529,11 +531,11 @@ def __truediv__(self, other):
                 return result
 
             # otherwise, dispatch to Timedelta implementation
-            return self._data / other
+            return self._ndarray / other
 
         elif lib.is_scalar(other):
             # assume it is numeric
-            result = self._data / other
+            result = self._ndarray / other
             freq = None
             if self.freq is not None:
                 # Tick division is not implemented, so operate on Timedelta
@@ -549,7 +551,7 @@ def __truediv__(self, other):
 
         elif is_timedelta64_dtype(other.dtype):
             # let numpy handle it
-            return self._data / other
+            return self._ndarray / other
 
         elif is_object_dtype(other.dtype):
             # We operate on raveled arrays to avoid problems in inference
@@ -571,7 +573,7 @@ def __truediv__(self, other):
             return result
 
         else:
-            result = self._data / other
+            result = self._ndarray / other
             return type(self)(result)
 
     @unpack_zerodim_and_defer("__rtruediv__")
@@ -586,7 +588,7 @@ def __rtruediv__(self, other):
                 return result
 
             # otherwise, dispatch to Timedelta implementation
-            return other / self._data
+            return other / self._ndarray
 
         elif lib.is_scalar(other):
             raise TypeError(
@@ -602,7 +604,7 @@ def __rtruediv__(self, other):
 
         elif is_timedelta64_dtype(other.dtype):
             # let numpy handle it
-            return other / self._data
+            return other / self._ndarray
 
         elif is_object_dtype(other.dtype):
             # Note: unlike in __truediv__, we do not _need_ to do type
@@ -629,7 +631,7 @@ def __floordiv__(self, other):
                     return result
 
                 # dispatch to Timedelta implementation
-                result = other.__rfloordiv__(self._data)
+                result = other.__rfloordiv__(self._ndarray)
                 return result
 
             # at this point we should only have numeric scalars; anything
@@ -673,7 +675,7 @@ def __floordiv__(self, other):
             return result
 
         elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
-            result = self._data // other
+            result = self._ndarray // other
             return type(self)(result)
 
         else:
@@ -693,7 +695,7 @@ def __rfloordiv__(self, other):
                     return result
 
                 # dispatch to Timedelta implementation
-                result = other.__floordiv__(self._data)
+                result = other.__floordiv__(self._ndarray)
                 return result
 
             raise TypeError(
@@ -763,15 +765,15 @@ def __rdivmod__(self, other):
 
     def __neg__(self) -> TimedeltaArray:
         if self.freq is not None:
-            return type(self)(-self._data, freq=-self.freq)
-        return type(self)(-self._data)
+            return type(self)(-self._ndarray, freq=-self.freq)
+        return type(self)(-self._ndarray)
 
     def __pos__(self) -> TimedeltaArray:
-        return type(self)(self._data, freq=self.freq)
+        return type(self)(self._ndarray, freq=self.freq)
 
     def __abs__(self) -> TimedeltaArray:
         # Note: freq is not preserved
-        return type(self)(np.abs(self._data))
+        return type(self)(np.abs(self._ndarray))
 
     # ----------------------------------------------------------------
     # Conversion Methods - Vectorized analogues of Timedelta methods
@@ -949,9 +951,12 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"):
         data = np.array(data, copy=False)
     elif isinstance(data, ABCSeries):
         data = data._values
-    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
+    elif isinstance(data, ABCTimedeltaIndex):
+        inferred_freq = data.freq
+        data = data._data._ndarray
+    elif isinstance(data, TimedeltaArray):
         inferred_freq = data.freq
-        data = data._data
+        data = data._ndarray
     elif isinstance(data, IntegerArray):
         data = data.to_numpy("int64", na_value=tslibs.iNaT)
     elif is_categorical_dtype(data.dtype):
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -150,7 +150,7 @@ def _simple_new(
         result._cache = {}
 
         # For groupby perf. See note in indexes/base about _index_data
-        result._index_data = values._data
+        result._index_data = values._ndarray
 
         result._reset_identity()
         return result
@@ -165,7 +165,7 @@ def _is_all_dates(self) -> bool:
     @property
     def values(self) -> np.ndarray:
         # Note: PeriodArray overrides this to return an ndarray of objects.
-        return self._data._data
+        return self._data._ndarray
 
     def __array_wrap__(self, result, context=None):
         """