Skip to content

Commit bcf2406

Browse files
authored
BUG: DataFrame.append with timedelta64 (#39574)
1 parent 934fc81 commit bcf2406

File tree

5 files changed

+103
-132
lines changed

5 files changed

+103
-132
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ Reshaping
431431
- Bug in :meth:`DataFrame.apply` that would give incorrect results when used with a string argument and an unsupported ``axis=1``; it now raises a ``ValueError`` instead (:issue:`39211`)
432432
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
433433
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
434+
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
434435

435436
Sparse
436437
^^^^^^

pandas/core/dtypes/concat.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
6161
return arr.astype(dtype, copy=False)
6262

6363

64-
def concat_compat(to_concat, axis: int = 0):
64+
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
6565
"""
6666
provide concatenation of an array of arrays each of which is a single
6767
'normalized' dtypes (in that for example, if it's object, then it is a
@@ -72,6 +72,9 @@ def concat_compat(to_concat, axis: int = 0):
7272
----------
7373
to_concat : array of arrays
7474
axis : axis to provide concatenation
75+
ea_compat_axis : bool, default False
76+
For ExtensionArray compat, behave as if axis == 1 when determining
77+
whether to drop empty arrays.
7578
7679
Returns
7780
-------
@@ -91,7 +94,8 @@ def is_nonempty(x) -> bool:
9194
# marginal given that it would still require shape & dtype calculation and
9295
# np.concatenate which has them both implemented is compiled.
9396
non_empties = [x for x in to_concat if is_nonempty(x)]
94-
if non_empties and axis == 0:
97+
if non_empties and axis == 0 and not ea_compat_axis:
98+
# ea_compat_axis see GH#39574
9599
to_concat = non_empties
96100

97101
kinds = {obj.dtype.kind for obj in to_concat}

pandas/core/internals/concat.py

+44-118
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from __future__ import annotations
22

3-
from collections import defaultdict
43
import copy
54
import itertools
6-
from typing import TYPE_CHECKING, Dict, List, Sequence, cast
5+
from typing import TYPE_CHECKING, Dict, List, Sequence
76

87
import numpy as np
98

@@ -14,16 +13,13 @@
1413
from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type
1514
from pandas.core.dtypes.common import (
1615
is_categorical_dtype,
17-
is_datetime64_dtype,
1816
is_datetime64tz_dtype,
17+
is_dtype_equal,
1918
is_extension_array_dtype,
20-
is_float_dtype,
21-
is_numeric_dtype,
2219
is_sparse,
23-
is_timedelta64_dtype,
2420
)
2521
from pandas.core.dtypes.concat import concat_compat
26-
from pandas.core.dtypes.missing import isna_all
22+
from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all
2723

2824
import pandas.core.algorithms as algos
2925
from pandas.core.arrays import DatetimeArray, ExtensionArray
@@ -33,7 +29,6 @@
3329

3430
if TYPE_CHECKING:
3531
from pandas import Index
36-
from pandas.core.arrays.sparse.dtype import SparseDtype
3732

3833

3934
def concatenate_block_managers(
@@ -232,6 +227,29 @@ def dtype(self):
232227
return blk.dtype
233228
return ensure_dtype_can_hold_na(blk.dtype)
234229

230+
def is_valid_na_for(self, dtype: DtypeObj) -> bool:
231+
"""
232+
Check that we are all-NA of a type/dtype that is compatible with this dtype.
233+
Augments `self.is_na` with an additional check of the type of NA values.
234+
"""
235+
if not self.is_na:
236+
return False
237+
if self.block is None:
238+
return True
239+
240+
if self.dtype == object:
241+
values = self.block.values
242+
return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
243+
244+
if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal(
245+
self.dtype, dtype
246+
):
247+
# fill_values match but we should not cast self.block.values to dtype
248+
return False
249+
250+
na_value = self.block.fill_value
251+
return is_valid_na_for_dtype(na_value, dtype)
252+
235253
@cache_readonly
236254
def is_na(self) -> bool:
237255
if self.block is None:
@@ -262,7 +280,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
262280
else:
263281
fill_value = upcasted_na
264282

265-
if self.is_na:
283+
if self.is_valid_na_for(empty_dtype):
266284
blk_dtype = getattr(self.block, "dtype", None)
267285

268286
if blk_dtype == np.dtype(object):
@@ -276,10 +294,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
276294
if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype(
277295
empty_dtype
278296
):
279-
if self.block is None:
280-
# TODO(EA2D): special case unneeded with 2D EAs
281-
i8values = np.full(self.shape[1], fill_value.value)
282-
return DatetimeArray(i8values, dtype=empty_dtype)
297+
# TODO(EA2D): special case unneeded with 2D EAs
298+
i8values = np.full(self.shape[1], fill_value.value)
299+
return DatetimeArray(i8values, dtype=empty_dtype)
283300
elif is_categorical_dtype(blk_dtype):
284301
pass
285302
elif is_extension_array_dtype(blk_dtype):
@@ -295,6 +312,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
295312
empty_arr, allow_fill=True, fill_value=fill_value
296313
)
297314
else:
315+
# NB: we should never get here with empty_dtype integer or bool;
316+
# if we did, the missing_arr.fill would cast to gibberish
298317
missing_arr = np.empty(self.shape, dtype=empty_dtype)
299318
missing_arr.fill(fill_value)
300319
return missing_arr
@@ -362,14 +381,12 @@ def _concatenate_join_units(
362381
# concatting with at least one EA means we are concatting a single column
363382
# the non-EA values are 2D arrays with shape (1, n)
364383
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
365-
concat_values = concat_compat(to_concat, axis=0)
366-
if not isinstance(concat_values, ExtensionArray) or (
367-
isinstance(concat_values, DatetimeArray) and concat_values.tz is None
368-
):
384+
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
385+
if not is_extension_array_dtype(concat_values.dtype):
369386
# if the result of concat is not an EA but an ndarray, reshape to
370387
# 2D to put it a non-EA Block
371-
# special case DatetimeArray, which *is* an EA, but is put in a
372-
# consolidated 2D block
388+
# special case DatetimeArray/TimedeltaArray, which *is* an EA, but
389+
# is put in a consolidated 2D block
373390
concat_values = np.atleast_2d(concat_values)
374391
else:
375392
concat_values = concat_compat(to_concat, axis=concat_axis)
@@ -419,108 +436,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
419436
return empty_dtype
420437

421438
has_none_blocks = any(unit.block is None for unit in join_units)
422-
dtypes = [None if unit.block is None else unit.dtype for unit in join_units]
423439

424-
filtered_dtypes = [
440+
dtypes = [
425441
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
426442
]
427-
if not len(filtered_dtypes):
428-
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
429-
dtype_alt = find_common_type(filtered_dtypes)
430-
431-
upcast_classes = _get_upcast_classes(join_units, dtypes)
432-
433-
if is_extension_array_dtype(dtype_alt):
434-
return dtype_alt
435-
elif dtype_alt == object:
436-
return dtype_alt
437-
438-
# TODO: de-duplicate with maybe_promote?
439-
# create the result
440-
if "extension" in upcast_classes:
441-
return np.dtype("object")
442-
elif "bool" in upcast_classes:
443-
if has_none_blocks:
444-
return np.dtype(np.object_)
445-
else:
446-
return np.dtype(np.bool_)
447-
elif "datetimetz" in upcast_classes:
448-
# GH-25014. We use NaT instead of iNaT, since this eventually
449-
# ends up in DatetimeArray.take, which does not allow iNaT.
450-
dtype = upcast_classes["datetimetz"]
451-
return dtype[0]
452-
elif "datetime" in upcast_classes:
453-
return np.dtype("M8[ns]")
454-
elif "timedelta" in upcast_classes:
455-
return np.dtype("m8[ns]")
456-
else:
457-
try:
458-
common_dtype = np.find_common_type(upcast_classes, [])
459-
except TypeError:
460-
# At least one is an ExtensionArray
461-
return np.dtype(np.object_)
462-
else:
463-
if is_float_dtype(common_dtype):
464-
return common_dtype
465-
elif is_numeric_dtype(common_dtype):
466-
if has_none_blocks:
467-
return np.dtype(np.float64)
468-
else:
469-
return common_dtype
470-
471-
msg = "invalid dtype determination in get_concat_dtype"
472-
raise AssertionError(msg)
473-
474-
475-
def _get_upcast_classes(
476-
join_units: Sequence[JoinUnit],
477-
dtypes: Sequence[DtypeObj],
478-
) -> Dict[str, List[DtypeObj]]:
479-
"""Create mapping between upcast class names and lists of dtypes."""
480-
upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
481-
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
482-
for dtype, unit in zip(dtypes, join_units):
483-
if dtype is None:
484-
continue
485-
486-
upcast_cls = _select_upcast_cls_from_dtype(dtype)
487-
# Null blocks should not influence upcast class selection, unless there
488-
# are only null blocks, when same upcasting rules must be applied to
489-
# null upcast classes.
490-
if unit.is_na:
491-
null_upcast_classes[upcast_cls].append(dtype)
492-
else:
493-
upcast_classes[upcast_cls].append(dtype)
494-
495-
if not upcast_classes:
496-
upcast_classes = null_upcast_classes
497-
498-
return upcast_classes
499-
500-
501-
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
502-
"""Select upcast class name based on dtype."""
503-
if is_categorical_dtype(dtype):
504-
return "extension"
505-
elif is_datetime64tz_dtype(dtype):
506-
return "datetimetz"
507-
elif is_extension_array_dtype(dtype):
508-
return "extension"
509-
elif issubclass(dtype.type, np.bool_):
510-
return "bool"
511-
elif issubclass(dtype.type, np.object_):
512-
return "object"
513-
elif is_datetime64_dtype(dtype):
514-
return "datetime"
515-
elif is_timedelta64_dtype(dtype):
516-
return "timedelta"
517-
elif is_sparse(dtype):
518-
dtype = cast("SparseDtype", dtype)
519-
return dtype.subtype.name
520-
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
521-
return dtype.name
522-
else:
523-
return "float"
443+
if not len(dtypes):
444+
dtypes = [unit.dtype for unit in join_units if unit.block is not None]
445+
446+
dtype = find_common_type(dtypes)
447+
if has_none_blocks:
448+
dtype = ensure_dtype_can_hold_na(dtype)
449+
return dtype
524450

525451

526452
def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:

pandas/tests/reshape/concat/test_append.py

+21-7
Original file line numberDiff line numberDiff line change
@@ -334,9 +334,9 @@ def test_append_missing_column_proper_upcast(self, sort):
334334
def test_append_empty_frame_to_series_with_dateutil_tz(self):
335335
# GH 23682
336336
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
337-
s = Series({"date": date, "a": 1.0, "b": 2.0})
337+
ser = Series({"date": date, "a": 1.0, "b": 2.0})
338338
df = DataFrame(columns=["c", "d"])
339-
result_a = df.append(s, ignore_index=True)
339+
result_a = df.append(ser, ignore_index=True)
340340
expected = DataFrame(
341341
[[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
342342
)
@@ -350,13 +350,12 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
350350
)
351351
expected["c"] = expected["c"].astype(object)
352352
expected["d"] = expected["d"].astype(object)
353-
354-
result_b = result_a.append(s, ignore_index=True)
353+
result_b = result_a.append(ser, ignore_index=True)
355354
tm.assert_frame_equal(result_b, expected)
356355

357356
# column order is different
358357
expected = expected[["c", "d", "date", "a", "b"]]
359-
result = df.append([s, s], ignore_index=True)
358+
result = df.append([ser, ser], ignore_index=True)
360359
tm.assert_frame_equal(result, expected)
361360

362361
def test_append_empty_tz_frame_with_datetime64ns(self):
@@ -378,12 +377,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
378377
@pytest.mark.parametrize(
379378
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
380379
)
381-
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
380+
@pytest.mark.parametrize("val", [1, "NaT"])
381+
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
382382
# https://github.com/pandas-dev/pandas/issues/35460
383383
df = DataFrame(columns=["a"]).astype(dtype_str)
384384

385-
other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
385+
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
386386
result = df.append(other, ignore_index=True)
387387

388388
expected = other.astype(object)
389389
tm.assert_frame_equal(result, expected)
390+
391+
@pytest.mark.parametrize(
392+
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
393+
)
394+
@pytest.mark.parametrize("val", [1, "NaT"])
395+
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
396+
# https://github.com/pandas-dev/pandas/issues/35460
397+
df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
398+
399+
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
400+
result = df.append(other, ignore_index=True)
401+
402+
expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
403+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)