REF: make DatetimeIndex._simple_new actually simple (pandas-dev#32282)

jbrockmendel · SeeminSyed · commit 74a178ad1b8c · 2020-03-22T19:55:07.000-04:00
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
@@ -114,7 +114,18 @@ def apply_index_wraps(func):
     # Note: normally we would use `@functools.wraps(func)`, but this does
     # not play nicely with cython class methods
     def wrapper(self, other):
-        result = func(self, other)
+
+        is_index = getattr(other, "_typ", "") == "datetimeindex"
+
+        # operate on DatetimeArray
+        arr = other._data if is_index else other
+
+        result = func(self, arr)
+
+        if is_index:
+            # Wrap DatetimeArray result back to DatetimeIndex
+            result = type(other)._simple_new(result, name=other.name)
+
         if self.normalize:
             result = result.to_period('D').to_timestamp()
         return result
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3281,13 +3281,11 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
         target = _ensure_has_len(target)  # target may be an iterator
 
         if not isinstance(target, Index) and len(target) == 0:
-            attrs = self._get_attributes_dict()
-            attrs.pop("freq", None)  # don't preserve freq
             if isinstance(self, ABCRangeIndex):
                 values = range(0)
             else:
                 values = self._data[:0]  # appropriately-dtyped empty array
-            target = self._simple_new(values, **attrs)
+            target = self._simple_new(values, name=self.name)
         else:
             target = ensure_index(target)
 
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -622,21 +622,11 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default):
         if values is None:
             values = self._data
 
-        if isinstance(values, type(self)):
-            values = values._data
         if isinstance(values, np.ndarray):
             # TODO: We would rather not get here
             values = type(self._data)(values, dtype=self.dtype)
 
-        attributes = self._get_attributes_dict()
-
-        if self.freq is not None:
-            if isinstance(values, (DatetimeArray, TimedeltaArray)):
-                if values.freq is None:
-                    del attributes["freq"]
-
-        attributes["name"] = name
-        result = self._simple_new(values, **attributes)
+        result = type(self)._simple_new(values, name=name)
         result._cache = cache
         return result
 
@@ -780,7 +770,10 @@ def _fast_union(self, other, sort=None):
             loc = right.searchsorted(left_start, side="left")
             right_chunk = right.values[:loc]
             dates = concat_compat((left.values, right_chunk))
-            return self._shallow_copy(dates)
+            result = self._shallow_copy(dates)
+            result._set_freq("infer")
+            # TODO: can we infer that it has self.freq?
+            return result
         else:
             left, right = other, self
 
@@ -792,7 +785,10 @@ def _fast_union(self, other, sort=None):
             loc = right.searchsorted(left_end, side="right")
             right_chunk = right.values[loc:]
             dates = concat_compat((left.values, right_chunk))
-            return self._shallow_copy(dates)
+            result = self._shallow_copy(dates)
+            result._set_freq("infer")
+            # TODO: can we infer that it has self.freq?
+            return result
         else:
             return left
 
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -7,17 +7,13 @@
 
 from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts
 from pandas._libs.tslibs import fields, parsing, timezones
+from pandas._typing import Label
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype
 
-from pandas.core.arrays.datetimes import (
-    DatetimeArray,
-    tz_to_dtype,
-    validate_tz_from_dtype,
-)
+from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype
 import pandas.core.common as com
 from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name
 from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin
@@ -36,7 +32,20 @@ def _new_DatetimeIndex(cls, d):
     if "data" in d and not isinstance(d["data"], DatetimeIndex):
         # Avoid need to verify integrity by calling simple_new directly
         data = d.pop("data")
-        result = cls._simple_new(data, **d)
+        if not isinstance(data, DatetimeArray):
+            # For backward compat with older pickles, we may need to construct
+            #  a DatetimeArray to adapt to the newer _simple_new signature
+            tz = d.pop("tz")
+            freq = d.pop("freq")
+            dta = DatetimeArray._simple_new(data, dtype=tz_to_dtype(tz), freq=freq)
+        else:
+            dta = data
+            for key in ["tz", "freq"]:
+                # These are already stored in our DatetimeArray; if they are
+                #  also in the pickle and don't match, we have a problem.
+                if key in d:
+                    assert d.pop(key) == getattr(dta, key)
+        result = cls._simple_new(dta, **d)
     else:
         with warnings.catch_warnings():
             # TODO: If we knew what was going in to **d, we might be able to
@@ -244,34 +253,16 @@ def __new__(
         return subarr
 
     @classmethod
-    def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):
-        """
-        We require the we have a dtype compat for the values
-        if we are passed a non-dtype compat, then coerce using the constructor
-        """
-        if isinstance(values, DatetimeArray):
-            if tz:
-                tz = validate_tz_from_dtype(dtype, tz)
-                dtype = DatetimeTZDtype(tz=tz)
-            elif dtype is None:
-                dtype = _NS_DTYPE
-
-            values = DatetimeArray(values, freq=freq, dtype=dtype)
-            tz = values.tz
-            freq = values.freq
-            values = values._data
-
-        dtype = tz_to_dtype(tz)
-        dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype)
-        assert isinstance(dtarr, DatetimeArray)
+    def _simple_new(cls, values: DatetimeArray, name: Label = None):
+        assert isinstance(values, DatetimeArray), type(values)
 
         result = object.__new__(cls)
-        result._data = dtarr
+        result._data = values
         result.name = name
         result._cache = {}
         result._no_setting_name = False
         # For groupby perf. See note in indexes/base about _index_data
-        result._index_data = dtarr._data
+        result._index_data = values._data
         result._reset_identity()
         return result
 
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -1,6 +1,7 @@
 """ implement the TimedeltaIndex """
 
 from pandas._libs import NaT, Timedelta, index as libindex
+from pandas._typing import Label
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import (
@@ -154,7 +155,7 @@ def __new__(
         if isinstance(data, TimedeltaArray) and freq is None:
             if copy:
                 data = data.copy()
-            return cls._simple_new(data, name=name, freq=freq)
+            return cls._simple_new(data, name=name)
 
         if isinstance(data, TimedeltaIndex) and freq is None and name is None:
             if copy:
@@ -170,12 +171,8 @@ def __new__(
         return cls._simple_new(tdarr, name=name)
 
     @classmethod
-    def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE):
-        # `dtype` is passed by _shallow_copy in corner cases, should always
-        #  be timedelta64[ns] if present
-        assert dtype == _TD_DTYPE, dtype
+    def _simple_new(cls, values: TimedeltaArray, name: Label = None):
         assert isinstance(values, TimedeltaArray)
-        assert freq is None or values.freq == freq
 
         result = object.__new__(cls)
         result._data = values
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
@@ -65,8 +65,8 @@ def test_compare_len1_raises(self):
         #  to the case where one has length-1, which numpy would broadcast
         data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9
 
-        idx = self.array_cls._simple_new(data, freq="D")
-        arr = self.index_cls(idx)
+        arr = self.array_cls._simple_new(data, freq="D")
+        idx = self.index_cls(arr)
 
         with pytest.raises(ValueError, match="Lengths must match"):
             arr == arr[:1]
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
@@ -363,7 +363,7 @@ def test_equals(self):
         assert not idx.equals(pd.Series(idx2))
 
         # same internal, different tz
-        idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz="US/Pacific")
+        idx3 = pd.DatetimeIndex(idx.asi8, tz="US/Pacific")
         tm.assert_numpy_array_equal(idx.asi8, idx3.asi8)
         assert not idx.equals(idx3)
         assert not idx.equals(idx3.copy())
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
@@ -337,9 +337,6 @@ def apply_index(self, i):
                 # integer addition on PeriodIndex is deprecated,
                 #   so we directly use _time_shift instead
                 asper = i.to_period("W")
-                if not isinstance(asper._data, np.ndarray):
-                    # unwrap PeriodIndex --> PeriodArray
-                    asper = asper._data
                 shifted = asper._time_shift(weeks)
                 i = shifted.to_timestamp() + i.to_perioddelta("W")
 
@@ -629,9 +626,6 @@ def apply_index(self, i):
         # to_period rolls forward to next BDay; track and
         # reduce n where it does when rolling forward
         asper = i.to_period("B")
-        if not isinstance(asper._data, np.ndarray):
-            # unwrap PeriodIndex --> PeriodArray
-            asper = asper._data
 
         if self.n > 0:
             shifted = (i.to_perioddelta("B") - time).asi8 != 0
@@ -1384,9 +1378,6 @@ def apply_index(self, i):
         # integer-array addition on PeriodIndex is deprecated,
         #  so we use _addsub_int_array directly
         asper = i.to_period("M")
-        if not isinstance(asper._data, np.ndarray):
-            # unwrap PeriodIndex --> PeriodArray
-            asper = asper._data
 
         shifted = asper._addsub_int_array(roll // 2, operator.add)
         i = type(dti)(shifted.to_timestamp())
@@ -1582,9 +1573,6 @@ def apply_index(self, i):
             # integer addition on PeriodIndex is deprecated,
             #  so we use _time_shift directly
             asper = i.to_period("W")
-            if not isinstance(asper._data, np.ndarray):
-                # unwrap PeriodIndex --> PeriodArray
-                asper = asper._data
 
             shifted = asper._time_shift(self.n)
             return shifted.to_timestamp() + i.to_perioddelta("W")
@@ -1608,9 +1596,6 @@ def _end_apply_index(self, dtindex):
 
         base, mult = libfrequencies.get_freq_code(self.freqstr)
         base_period = dtindex.to_period(base)
-        if not isinstance(base_period._data, np.ndarray):
-            # unwrap PeriodIndex --> PeriodArray
-            base_period = base_period._data
 
         if self.n > 0:
             # when adding, dates on end roll to next