ENH: preserve non-nano DTA/TDA in Index/Series/DataFrame (pandas-dev#47230)

jbrockmendel · yehoshuadimarsky · commit f2dff263367d · 2022-07-13T10:18:04.000-04:00
* ENH: preserve non-nano DTA/TDA in Index/Series/DataFrame

* tighten xfail

* _prep_ndarray->_prep_ndarraylike

* xfail non-strict
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -326,6 +326,18 @@ def __new__(
 
         name = maybe_extract_name(name, data, cls)
 
+        if (
+            isinstance(data, DatetimeArray)
+            and freq is lib.no_default
+            and tz is None
+            and dtype is None
+        ):
+            # fastpath, similar logic in TimedeltaIndex.__new__;
+            # Note in this particular case we retain non-nano.
+            if copy:
+                data = data.copy()
+            return cls._simple_new(data, name=name)
+
         dtarr = DatetimeArray._from_sequence_not_strict(
             data,
             dtype=dtype,
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -132,6 +132,7 @@ def __new__(
                 "represent unambiguous timedelta values durations."
             )
 
+        # FIXME: need to check for dtype/data match
         if isinstance(data, TimedeltaArray) and freq is lib.no_default:
             if copy:
                 data = data.copy()
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -326,7 +326,7 @@ def ndarray_to_mgr(
     else:
         # by definition an array here
         # the dtypes will be coerced to a single dtype
-        values = _prep_ndarray(values, copy=copy_on_sanitize)
+        values = _prep_ndarraylike(values, copy=copy_on_sanitize)
 
     if dtype is not None and not is_dtype_equal(values.dtype, dtype):
         # GH#40110 see similar check inside sanitize_array
@@ -341,7 +341,7 @@ def ndarray_to_mgr(
             allow_2d=True,
         )
 
-    # _prep_ndarray ensures that values.ndim == 2 at this point
+    # _prep_ndarraylike ensures that values.ndim == 2 at this point
     index, columns = _get_axes(
         values.shape[0], values.shape[1], index=index, columns=columns
     )
@@ -537,15 +537,16 @@ def treat_as_nested(data) -> bool:
 # ---------------------------------------------------------------------
 
 
-def _prep_ndarray(values, copy: bool = True) -> np.ndarray:
+def _prep_ndarraylike(
+    values, copy: bool = True
+) -> np.ndarray | DatetimeArray | TimedeltaArray:
     if isinstance(values, TimedeltaArray) or (
         isinstance(values, DatetimeArray) and values.tz is None
     ):
-        # On older numpy, np.asarray below apparently does not call __array__,
-        #  so nanoseconds get dropped.
-        values = values._ndarray
+        # By retaining DTA/TDA instead of unpacking, we end up retaining non-nano
+        pass
 
-    if not isinstance(values, (np.ndarray, ABCSeries, Index)):
+    elif not isinstance(values, (np.ndarray, ABCSeries, Index)):
         if len(values) == 0:
             return np.empty((0, 0), dtype=object)
         elif isinstance(values, range):
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -52,6 +52,7 @@
     IntervalArray,
     PeriodArray,
     SparseArray,
+    TimedeltaArray,
 )
 from pandas.core.api import Int64Index
 
@@ -2665,6 +2666,12 @@ def test_from_dict_with_missing_copy_false(self):
         )
         tm.assert_frame_equal(df, expected)
 
+    def test_construction_empty_array_multi_column_raises(self):
+        # GH#46822
+        msg = "Empty data passed with indices specified."
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(data=np.array([]), columns=["a", "b"])
+
 
 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
@@ -3086,8 +3093,50 @@ def test_tzaware_data_tznaive_dtype(self, constructor):
         assert np.all(result.dtypes == "M8[ns]")
         assert np.all(result == ts_naive)
 
-    def test_construction_empty_array_multi_column_raises(self):
-        # GH#46822
-        msg = "Empty data passed with indices specified."
-        with pytest.raises(ValueError, match=msg):
-            DataFrame(data=np.array([]), columns=["a", "b"])
+
+# TODO: better location for this test?
+class TestAllowNonNano:
+    # Until 2.0, we do not preserve non-nano dt64/td64 when passed as ndarray,
+    #  but do preserve it when passed as DTA/TDA
+
+    @pytest.fixture(params=[True, False])
+    def as_td(self, request):
+        return request.param
+
+    @pytest.fixture
+    def arr(self, as_td):
+        values = np.arange(5).astype(np.int64).view("M8[s]")
+        if as_td:
+            values = values - values[0]
+            return TimedeltaArray._simple_new(values, dtype=values.dtype)
+        else:
+            return DatetimeArray._simple_new(values, dtype=values.dtype)
+
+    def test_index_allow_non_nano(self, arr):
+        idx = Index(arr)
+        assert idx.dtype == arr.dtype
+
+    def test_dti_tdi_allow_non_nano(self, arr, as_td):
+        if as_td:
+            idx = pd.TimedeltaIndex(arr)
+        else:
+            idx = DatetimeIndex(arr)
+        assert idx.dtype == arr.dtype
+
+    def test_series_allow_non_nano(self, arr):
+        ser = Series(arr)
+        assert ser.dtype == arr.dtype
+
+    def test_frame_allow_non_nano(self, arr):
+        df = DataFrame(arr)
+        assert df.dtypes[0] == arr.dtype
+
+    @pytest.mark.xfail(
+        # TODO(2.0): xfail should become unnecessary
+        strict=False,
+        reason="stack_arrays converts TDA to ndarray, then goes "
+        "through ensure_wrapped_if_datetimelike",
+    )
+    def test_frame_from_dict_allow_non_nano(self, arr):
+        df = DataFrame({0: arr})
+        assert df.dtypes[0] == arr.dtype

Original file line number	Diff line number	Diff line change
`@@ -132,6 +132,7 @@ def __new__(`
`132`	`132`	`"represent unambiguous timedelta values durations."`
`133`	`133`	`)`
`134`	`134`
	`135`	`+ # FIXME: need to check for dtype/data match`
`135`	`136`	`if isinstance(data, TimedeltaArray) and freq is lib.no_default:`
`136`	`137`	`if copy:`
`137`	`138`	`data = data.copy()`