From c213e6423aac5995b16df97f72eae7b963a58837 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 11 Apr 2023 17:07:49 -0700 Subject: [PATCH 1/6] DEPR: concat ignoring all-NA columns --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/internals/concat.py | 54 ++++++++++++++++++++-- pandas/tests/reshape/concat/test_concat.py | 20 +++++++- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index fca355069ae74..21775a4a93cca 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -219,6 +219,7 @@ Deprecations - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) - Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) +- Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) - Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) @@ -228,7 +229,6 @@ Deprecations - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) - Deprecated :func:`is_categorical_dtype`, use ``isinstance(obj.dtype, pd.CategoricalDtype)`` instead (:issue:`52527`) - Deprecated :func:`is_int64_dtype`, check ``dtype == np.dtype(np.int64)`` instead (:issue:`52564`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index a17b45cdc7bcb..b6ccff9f49f45 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -5,6 +5,7 @@ TYPE_CHECKING, Sequence, ) +import warnings import numpy as np @@ -15,6 +16,7 @@ ) from pandas._libs.missing import NA from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( @@ -448,6 +450,19 @@ def is_na(self) -> bool: return False return all(isna_all(row) for row in values) + @cache_readonly + def is_na_without_isna_all(self) -> bool: + blk = self.block + if blk.dtype.kind == "V": + return True + if not blk._can_hold_na: + return False + + values = blk.values + if values.size == 0: + return True + return False + def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike @@ -496,7 +511,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike """ Concatenate values from several join units along axis=1. """ - empty_dtype = _get_empty_dtype(join_units) + empty_dtype, empty_dtype_future = _get_empty_dtype(join_units) has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) @@ -535,6 +550,19 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike else: concat_values = concat_compat(to_concat, axis=1) + if empty_dtype != empty_dtype_future: + if empty_dtype == concat_values.dtype: + # GH#40893 + warnings.warn( + "The behavior of DataFrame concatenation with all-NA entries is " + "deprecated. In a future version, this will no longer exclude " + "all-NA columns when determining the result dtypes. " + "To retain the old behavior, cast the all-NA columns to the " + "desired dtype before the concat operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return concat_values @@ -561,7 +589,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): raise NotImplementedError -def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: +def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]: """ Return dtype and N/A values to use when concatenating specified units. @@ -573,11 +601,11 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ if len(join_units) == 1: blk = join_units[0].block - return blk.dtype + return blk.dtype, blk.dtype if _is_uniform_reindex(join_units): empty_dtype = join_units[0].block.dtype - return empty_dtype + return empty_dtype, empty_dtype has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) @@ -590,7 +618,23 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: dtype = find_common_type(dtypes) if has_none_blocks: dtype = ensure_dtype_can_hold_na(dtype) - return dtype + + dtype_future = dtype + if len(dtypes) != len(join_units): + dtypes_future = [ + unit.block.dtype for unit in join_units if not unit.is_na_without_isna_all + ] + if not len(dtypes_future): + dtypes_future = [ + unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V" + ] + + if len(dtypes) != len(dtypes_future): + dtype_future = find_common_type(dtypes_future) + if has_none_blocks: + dtype_future = ensure_dtype_can_hold_na(dtype_future) + + return dtype, dtype_future def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 244fe6a7927fe..41492d7df53da 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -747,7 +747,9 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): # https://github.com/pandas-dev/pandas/issues/45637 df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype) empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype) + result = concat([empty, df]) + expected = df if df_dtype == "int64": # TODO what exact behaviour do we want for integer eventually? @@ -764,7 +766,6 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype) empty = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype) - result = concat([empty, df], ignore_index=True) if df_dtype == "int64": # TODO what exact behaviour do we want for integer eventually? @@ -772,6 +773,17 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): df_dtype = "object" else: df_dtype = "float64" + + msg = "The behavior of DataFrame concatenation with all-NA entries" + warn = None + if empty_dtype != df_dtype and empty_dtype is not None: + warn = FutureWarning + elif df_dtype == "datetime64[ns]": + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + result = concat([empty, df], ignore_index=True) + expected = DataFrame({"foo": [None, 1, 2], "bar": [None, 1, 2]}, dtype=df_dtype) tm.assert_frame_equal(result, expected) @@ -782,7 +794,11 @@ def test_concat_ignore_empty_from_reindex(): df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]}) df2 = DataFrame({"a": [2]}) - result = concat([df1, df2.reindex(columns=df1.columns)], ignore_index=True) + aligned = df2.reindex(columns=df1.columns) + + msg = "The behavior of DataFrame concatenation with all-NA entries" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([df1, aligned], ignore_index=True) expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]}) tm.assert_frame_equal(result, expected) From 78acbe463f6946f865c0a427c9312f8093a215c5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 11 Apr 2023 19:57:53 -0700 Subject: [PATCH 2/6] silence warning --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9dbe450261e54..9a584a46a66dc 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -280,6 +280,7 @@ if one of the DataFrames was empty or had all-NA values, its dtype was consistently *not* ignored (:issue:`43507`). .. ipython:: python + :okwarning: df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1)) df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2)) From 88143a4eb21020b5a79aca611b70c6ddd7720e3b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 14 Apr 2023 18:06:15 -0700 Subject: [PATCH 3/6] use code-block --- doc/source/whatsnew/v1.4.0.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9a584a46a66dc..ec0b04f8b3b77 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -279,12 +279,11 @@ if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes* ignored when finding the concatenated dtype. These are now consistently *not* ignored (:issue:`43507`). -.. ipython:: python - :okwarning: +.. code-block:: ipython - df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1)) - df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2)) - res = pd.concat([df1, df2]) + In [3]: df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1)) + In [4]: df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2)) + In [5]: res = pd.concat([df1, df2]) Previously, the float-dtype in ``df2`` would be ignored so the result dtype would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to @@ -294,8 +293,8 @@ would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to .. code-block:: ipython - In [4]: res - Out[4]: + In [6]: res + Out[6]: bar 0 2013-01-01 1 NaT @@ -307,8 +306,8 @@ object, the ``np.nan`` is retained. .. code-block:: ipython - In [4]: res - Out[4]: + In [6]: res + Out[6]: bar 0 2013-01-01 00:00:00 1 NaN From 0eaec56835ae9e3a2fa2907c61ca71fc4279034c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 14 Apr 2023 18:06:34 -0700 Subject: [PATCH 4/6] fix duplicate whatsnew entry --- doc/source/whatsnew/v2.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2dba23cf03944..dc789b7a08287 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -227,7 +227,6 @@ Deprecations - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) -- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) From 1f51062fb2a8a313a7b1f9b231ad3a32bfc92f65 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 14 Apr 2023 18:13:57 -0700 Subject: [PATCH 5/6] remove duplicate whatsnew entry --- doc/source/whatsnew/v2.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index dc789b7a08287..13a2a346f69a1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -225,7 +225,6 @@ Deprecations - Deprecated :func:`is_int64_dtype`, check ``dtype == np.dtype(np.int64)`` instead (:issue:`52564`) - Deprecated :func:`is_interval_dtype`, check ``isinstance(dtype, pd.IntervalDtype)`` instead (:issue:`52607`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) -- Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) From edf2e1681243e051ec6704e0501db499ddffc989 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 Apr 2023 12:23:54 -0700 Subject: [PATCH 6/6] Fix duplicates in whatsnew --- doc/source/whatsnew/v2.1.0.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f0fe98387a9d5..af33afb701872 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -231,19 +231,12 @@ Deprecations - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) -- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) -- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) - Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`) -- Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`) -- Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) -- Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) -- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) - Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) - Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) -- Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) - Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`) - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`) -