REF: strictness/simplification in DatetimeArray/Index _simple_new (pandas-dev#23431)

jbrockmendel · Pingviinituutti · commit fa0ed2ed432c · 2019-02-28T10:19:10.000+02:00
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -177,16 +177,14 @@ def _simple_new(cls, values, freq=None, tz=None, **kwargs):
         we require the we have a dtype compat for the values
         if we are passed a non-dtype compat, then coerce using the constructor
         """
+        assert isinstance(values, np.ndarray), type(values)
+        if values.dtype == 'i8':
+            # for compat with datetime/timedelta/period shared methods,
+            #  we can sometimes get here with int64 values.  These represent
+            #  nanosecond UTC (or tz-naive) unix timestamps
+            values = values.view('M8[ns]')
 
-        if getattr(values, 'dtype', None) is None:
-            # empty, but with dtype compat
-            if values is None:
-                values = np.empty(0, dtype=_NS_DTYPE)
-                return cls(values, freq=freq, tz=tz, **kwargs)
-            values = np.array(values, copy=False)
-
-        if not is_datetime64_dtype(values):
-            values = ensure_int64(values).view(_NS_DTYPE)
+        assert values.dtype == 'M8[ns]', values.dtype
 
         result = object.__new__(cls)
         result._data = values
@@ -209,6 +207,16 @@ def __new__(cls, values, freq=None, tz=None, dtype=None):
         # if dtype has an embedded tz, capture it
         tz = dtl.validate_tz_from_dtype(dtype, tz)
 
+        if isinstance(values, DatetimeArrayMixin):
+            # extract nanosecond unix timestamps
+            values = values.asi8
+        if values.dtype == 'i8':
+            values = values.view('M8[ns]')
+
+        assert isinstance(values, np.ndarray), type(values)
+        assert is_datetime64_dtype(values)  # not yet assured nanosecond
+        values = conversion.ensure_datetime64ns(values, copy=False)
+
         result = cls._simple_new(values, freq=freq, tz=tz)
         if freq_infer:
             inferred = result.inferred_freq
@@ -271,7 +279,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
             # TODO: consider re-implementing _cached_range; GH#17914
             index = _generate_regular_range(cls, start, end, periods, freq)
 
-            if tz is not None and getattr(index, 'tz', None) is None:
+            if tz is not None and index.tz is None:
                 arr = conversion.tz_localize_to_utc(
                     ensure_int64(index.values),
                     tz, ambiguous=ambiguous)
@@ -843,7 +851,8 @@ def to_perioddelta(self, freq):
         # TODO: consider privatizing (discussion in GH#23113)
         from pandas.core.arrays.timedeltas import TimedeltaArrayMixin
         i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8
-        return TimedeltaArrayMixin(i8delta)
+        m8delta = i8delta.view('m8[ns]')
+        return TimedeltaArrayMixin(m8delta)
 
     # -----------------------------------------------------------------
     # Properties - Vectorized Timestamp Properties/Methods
@@ -1320,6 +1329,27 @@ def to_julian_date(self):
 
 
 def _generate_regular_range(cls, start, end, periods, freq):
+    """
+    Generate a range of dates with the spans between dates described by
+    the given `freq` DateOffset.
+
+    Parameters
+    ----------
+    cls : class
+    start : Timestamp or None
+        first point of produced date range
+    end : Timestamp or None
+        last point of produced date range
+    periods : int
+        number of periods in produced date range
+    freq : DateOffset
+        describes space between dates in produced date range
+
+    Returns
+    -------
+    ndarray[np.int64] representing nanosecond unix timestamps
+
+    """
     if isinstance(freq, Tick):
         stride = freq.nanos
         if periods is None:
@@ -1342,22 +1372,22 @@ def _generate_regular_range(cls, start, end, periods, freq):
             raise ValueError("at least 'start' or 'end' should be specified "
                              "if a 'period' is given.")
 
-        data = np.arange(b, e, stride, dtype=np.int64)
-        data = cls._simple_new(data.view(_NS_DTYPE), None, tz=tz)
+        values = np.arange(b, e, stride, dtype=np.int64)
+
     else:
         tz = None
         # start and end should have the same timezone by this point
-        if isinstance(start, Timestamp):
+        if start is not None:
             tz = start.tz
-        elif isinstance(end, Timestamp):
+        elif end is not None:
             tz = end.tz
 
         xdr = generate_range(start=start, end=end,
                              periods=periods, offset=freq)
 
-        values = np.array([x.value for x in xdr])
-        data = cls._simple_new(values, freq=freq, tz=tz)
+        values = np.array([x.value for x in xdr], dtype=np.int64)
 
+    data = cls._simple_new(values, freq=freq, tz=tz)
     return data
 
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -18,7 +18,7 @@
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCDataFrame,
     ABCMultiIndex,
-    ABCPeriodIndex, ABCTimedeltaIndex,
+    ABCPeriodIndex, ABCTimedeltaIndex, ABCDatetimeIndex,
     ABCDateOffset)
 from pandas.core.dtypes.missing import isna, array_equivalent
 from pandas.core.dtypes.cast import maybe_cast_to_integer_array
@@ -545,6 +545,10 @@ def _shallow_copy(self, values=None, **kwargs):
 
         # _simple_new expects an ndarray
         values = getattr(values, 'values', values)
+        if isinstance(values, ABCDatetimeIndex):
+            # `self.values` returns `self` for tz-aware, so we need to unwrap
+            #  more specifically
+            values = values.asi8
 
         return self._simple_new(values, **attributes)
 
@@ -2947,7 +2951,8 @@ def difference(self, other):
         self._assert_can_do_setop(other)
 
         if self.equals(other):
-            return self._shallow_copy([])
+            # pass an empty np.ndarray with the appropriate dtype
+            return self._shallow_copy(self._data[:0])
 
         other, result_name = self._convert_can_do_setop(other)
 
@@ -3715,7 +3720,8 @@ def reindex(self, target, method=None, level=None, limit=None,
         if not isinstance(target, Index) and len(target) == 0:
             attrs = self._get_attributes_dict()
             attrs.pop('freq', None)  # don't preserve freq
-            target = self._simple_new(None, dtype=self.dtype, **attrs)
+            values = self._data[:0]  # appropriately-dtyped empty array
+            target = self._simple_new(values, dtype=self.dtype, **attrs)
         else:
             target = ensure_index(target)
 
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -175,6 +175,7 @@ class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps,
     pandas.to_datetime : Convert argument to datetime
     """
     _resolution = cache_readonly(DatetimeArrayMixin._resolution.fget)
+    _shallow_copy = Index._shallow_copy
 
     _typ = 'datetimeindex'
     _join_precedence = 10
@@ -298,6 +299,9 @@ def __new__(cls, data=None,
                 data = data.astype(np.int64, copy=False)
             subarr = data.view(_NS_DTYPE)
 
+        assert isinstance(subarr, np.ndarray), type(subarr)
+        assert subarr.dtype == 'M8[ns]', subarr.dtype
+
         subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz)
         if dtype is not None:
             if not is_dtype_equal(subarr.dtype, dtype):
@@ -329,22 +333,8 @@ def _simple_new(cls, values, name=None, freq=None, tz=None,
         we require the we have a dtype compat for the values
         if we are passed a non-dtype compat, then coerce using the constructor
         """
-
-        if getattr(values, 'dtype', None) is None:
-            # empty, but with dtype compat
-            if values is None:
-                values = np.empty(0, dtype=_NS_DTYPE)
-                return cls(values, name=name, freq=freq, tz=tz,
-                           dtype=dtype, **kwargs)
-            values = np.array(values, copy=False)
-
-        if not is_datetime64_dtype(values):
-            values = ensure_int64(values).view(_NS_DTYPE)
-
-        values = getattr(values, 'values', values)
-
-        assert isinstance(values, np.ndarray), "values is not an np.ndarray"
-        assert is_datetime64_dtype(values)
+        # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes
+        assert isinstance(values, np.ndarray), type(values)
 
         result = super(DatetimeIndex, cls)._simple_new(values, freq, tz,
                                                        **kwargs)
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -371,7 +371,7 @@ def _nat_new(self, box=True):
     def to_timestamp(self, freq=None, how='start'):
         from pandas import DatetimeIndex
         result = self._data.to_timestamp(freq=freq, how=how)
-        return DatetimeIndex._simple_new(result,
+        return DatetimeIndex._simple_new(result.asi8,
                                          name=self.name,
                                          freq=result.freq)