DEPR: concat ignoring all-NA entries (#58314)

jbrockmendel · web-flow · commit 2dbfbbe7edb6 · 2024-04-19T14:35:31.000-07:00
* DEPR: concat ignoring all-NA entries

* fixup
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -206,6 +206,7 @@ Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 - :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
 - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
+- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
 - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
 - :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
 - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
@@ -4,7 +4,6 @@
     TYPE_CHECKING,
     cast,
 )
-import warnings
 
 import numpy as np
 
@@ -16,27 +15,18 @@
 )
 from pandas._libs.missing import NA
 from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.cast import (
     ensure_dtype_can_hold_na,
     find_common_type,
 )
 from pandas.core.dtypes.common import (
     is_1d_only_ea_dtype,
-    is_scalar,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import (
-    ExtensionDtype,
-    SparseDtype,
-)
-from pandas.core.dtypes.missing import (
-    is_valid_na_for_dtype,
-    isna,
-    isna_all,
-)
+from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.missing import is_valid_na_for_dtype
 
 from pandas.core.construction import ensure_wrapped_if_datetimelike
 from pandas.core.internals.blocks import (
@@ -100,6 +90,7 @@ def concatenate_managers(
         if first_dtype in [np.float64, np.float32]:
             # TODO: support more dtypes here.  This will be simpler once
             #  JoinUnit.is_na behavior is deprecated.
+            #  (update 2024-04-13: that deprecation has now been enforced)
             if (
                 all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
                 and len(mgrs_indexers) > 1
@@ -351,41 +342,6 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
 
     @cache_readonly
     def is_na(self) -> bool:
-        blk = self.block
-        if blk.dtype.kind == "V":
-            return True
-
-        if not blk._can_hold_na:
-            return False
-
-        values = blk.values
-        if values.size == 0:
-            # GH#39122 this case will return False once deprecation is enforced
-            return True
-
-        if isinstance(values.dtype, SparseDtype):
-            return False
-
-        if values.ndim == 1:
-            # TODO(EA2D): no need for special case with 2D EAs
-            val = values[0]
-            if not is_scalar(val) or not isna(val):
-                # ideally isna_all would do this short-circuiting
-                return False
-            return isna_all(values)
-        else:
-            val = values[0][0]
-            if not is_scalar(val) or not isna(val):
-                # ideally isna_all would do this short-circuiting
-                return False
-            return all(isna_all(row) for row in values)
-
-    @cache_readonly
-    def is_na_after_size_and_isna_all_deprecation(self) -> bool:
-        """
-        Will self.is_na be True after values.size == 0 deprecation and isna_all
-        deprecation are enforced?
-        """
         blk = self.block
         if blk.dtype.kind == "V":
             return True
@@ -421,7 +377,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
     """
     Concatenate values from several join units along axis=1.
     """
-    empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)
+    empty_dtype = _get_empty_dtype(join_units)
 
     has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
     upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
@@ -446,18 +402,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
     else:
         concat_values = concat_compat(to_concat, axis=1)
 
-    if empty_dtype != empty_dtype_future:
-        if empty_dtype == concat_values.dtype:
-            # GH#39122, GH#40893
-            warnings.warn(
-                "The behavior of DataFrame concatenation with empty or all-NA "
-                "entries is deprecated. In a future version, this will no longer "
-                "exclude empty or all-NA columns when determining the result dtypes. "
-                "To retain the old behavior, exclude the relevant entries before "
-                "the concat operation.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
     return concat_values
 
 
@@ -484,7 +428,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
     raise NotImplementedError
 
 
-def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
+def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
     """
     Return dtype and N/A values to use when concatenating specified units.
 
@@ -496,38 +440,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj
     """
     if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
         empty_dtype = join_units[0].block.dtype
-        return empty_dtype, empty_dtype
+        return empty_dtype
 
     has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
 
     dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
-    if not len(dtypes):
-        dtypes = [
-            unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
-        ]
 
     dtype = find_common_type(dtypes)
     if has_none_blocks:
         dtype = ensure_dtype_can_hold_na(dtype)
 
-    dtype_future = dtype
-    if len(dtypes) != len(join_units):
-        dtypes_future = [
-            unit.block.dtype
-            for unit in join_units
-            if not unit.is_na_after_size_and_isna_all_deprecation
-        ]
-        if not len(dtypes_future):
-            dtypes_future = [
-                unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
-            ]
-
-        if len(dtypes) != len(dtypes_future):
-            dtype_future = find_common_type(dtypes_future)
-            if has_none_blocks:
-                dtype_future = ensure_dtype_can_hold_na(dtype_future)
-
-    return dtype, dtype_future
+    return dtype
 
 
 def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py
@@ -332,7 +332,7 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
 
         # pd.NaT gets inferred as tz-naive, so append result is tz-naive
         result = df._append({"a": pd.NaT}, ignore_index=True)
-        expected = DataFrame({"a": [np.nan]}, dtype=object)
+        expected = DataFrame({"a": [pd.NaT]}, dtype=object)
         tm.assert_frame_equal(result, expected)
 
         # also test with typed value to append
@@ -359,12 +359,6 @@ def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
         result = df._append(other, ignore_index=True)
 
         expected = other.astype(object)
-        if isinstance(val, str) and dtype_str != "int64":
-            # TODO: expected used to be `other.astype(object)` which is a more
-            #  reasonable result.  This was changed when tightening
-            #  assert_frame_equal's treatment of mismatched NAs to match the
-            #  existing behavior.
-            expected = DataFrame({"a": [np.nan]}, dtype=object)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
@@ -789,21 +789,24 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
     df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
     empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
 
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    warn = None
+    needs_update = False
     if df_dtype == "datetime64[ns]" or (
         df_dtype == "float64" and empty_dtype != "float64"
     ):
-        warn = FutureWarning
-    with tm.assert_produces_warning(warn, match=msg):
-        result = concat([empty, df])
+        needs_update = True
+
+    result = concat([empty, df])
     expected = df
     if df_dtype == "int64":
         # TODO what exact behaviour do we want for integer eventually?
         if empty_dtype == "float64":
             expected = df.astype("float64")
         else:
             expected = df.astype("object")
+
+    if needs_update:
+        # GH#40893 the result dtype now also depends on the dtype of `empty`
+        expected = expected.astype(object)
     tm.assert_frame_equal(result, expected)
 
 
@@ -820,17 +823,19 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
         else:
             df_dtype = "float64"
 
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    warn = None
+    needs_update = False
     if empty_dtype != df_dtype and empty_dtype is not None:
-        warn = FutureWarning
+        needs_update = True
     elif df_dtype == "datetime64[ns]":
-        warn = FutureWarning
+        needs_update = True
 
-    with tm.assert_produces_warning(warn, match=msg):
-        result = concat([empty, df], ignore_index=True)
+    result = concat([empty, df], ignore_index=True)
 
     expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
+    if needs_update:
+        # GH#40893 the result dtype now also depends on the dtype of `empty`
+        expected = expected.astype(object)
+        expected.iloc[0] = np.nan
     tm.assert_frame_equal(result, expected)
 
 
@@ -841,10 +846,16 @@ def test_concat_ignore_empty_from_reindex():
 
     aligned = df2.reindex(columns=df1.columns)
 
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = concat([df1, aligned], ignore_index=True)
-    expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
+    result = concat([df1, aligned], ignore_index=True)
+
+    expected = DataFrame(
+        {
+            "a": [1, 2],
+            "b": pd.array([pd.Timestamp("2012-01-01"), np.nan], dtype=object),
+        },
+        dtype=object,
+    )
+    expected["a"] = expected["a"].astype("int64")
     tm.assert_frame_equal(result, expected)
 
 
@@ -907,10 +918,10 @@ def test_concat_none_with_timezone_timestamp():
     # GH#52093
     df1 = DataFrame([{"A": None}])
     df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = concat([df1, df2], ignore_index=True)
-    expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
+    result = concat([df1, df2], ignore_index=True)
+    expected = DataFrame(
+        {"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}, dtype=object
+    )
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py
@@ -226,15 +226,6 @@ def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item):
         expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
         if tz1 != tz2:
             expected = expected.astype(object)
-            if item is pd.NaT:
-                # GH#18463
-                # TODO: setting nan here is to keep the test passing as we
-                #  make assert_frame_equal stricter, but is nan really the
-                #  ideal behavior here?
-                if tz1 is not None:
-                    expected.iloc[-1, 0] = np.nan
-                else:
-                    expected.iloc[:-1, 0] = np.nan
 
         tm.assert_frame_equal(result, expected)
 
@@ -590,8 +581,9 @@ def test_concat_float_datetime64():
     result = concat([df_time.iloc[:0], df_float])
     tm.assert_frame_equal(result, expected)
 
-    expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = concat([df_time, df_float.iloc[:0]])
+    expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
+        object
+    )
+
+    result = concat([df_time, df_float.iloc[:0]])
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -709,16 +709,14 @@ def test_join_append_timedeltas(self):
             {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
         )
         df = DataFrame(columns=list("dt"))
-        msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-        warn = FutureWarning
-        with tm.assert_produces_warning(warn, match=msg):
-            df = concat([df, d], ignore_index=True)
-            result = concat([df, d], ignore_index=True)
+        df = concat([df, d], ignore_index=True)
+        result = concat([df, d], ignore_index=True)
         expected = DataFrame(
             {
                 "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
                 "t": [timedelta(0, 22500), timedelta(0, 22500)],
-            }
+            },
+            dtype=object,
         )
         tm.assert_frame_equal(result, expected)