Skip to content

Commit 2dbfbbe

Browse files
authored
DEPR: concat ignoring all-NA entries (#58314)
* DEPR: concat ignoring all-NA entries
* fixup
1 parent e293dd8 commit 2dbfbbe

File tree

6 files changed

+48
-129
lines changed

6 files changed

+48
-129
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ Removal of prior version deprecations/changes
206206
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
207207
- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
208208
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
209+
- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
209210
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
210211
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
211212
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)

pandas/core/internals/concat.py

+7-84
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
TYPE_CHECKING,
55
cast,
66
)
7-
import warnings
87

98
import numpy as np
109

@@ -16,27 +15,18 @@
1615
)
1716
from pandas._libs.missing import NA
1817
from pandas.util._decorators import cache_readonly
19-
from pandas.util._exceptions import find_stack_level
2018

2119
from pandas.core.dtypes.cast import (
2220
ensure_dtype_can_hold_na,
2321
find_common_type,
2422
)
2523
from pandas.core.dtypes.common import (
2624
is_1d_only_ea_dtype,
27-
is_scalar,
2825
needs_i8_conversion,
2926
)
3027
from pandas.core.dtypes.concat import concat_compat
31-
from pandas.core.dtypes.dtypes import (
32-
ExtensionDtype,
33-
SparseDtype,
34-
)
35-
from pandas.core.dtypes.missing import (
36-
is_valid_na_for_dtype,
37-
isna,
38-
isna_all,
39-
)
28+
from pandas.core.dtypes.dtypes import ExtensionDtype
29+
from pandas.core.dtypes.missing import is_valid_na_for_dtype
4030

4131
from pandas.core.construction import ensure_wrapped_if_datetimelike
4232
from pandas.core.internals.blocks import (
@@ -100,6 +90,7 @@ def concatenate_managers(
10090
if first_dtype in [np.float64, np.float32]:
10191
# TODO: support more dtypes here. This will be simpler once
10292
# JoinUnit.is_na behavior is deprecated.
93+
# (update 2024-04-13: that deprecation has now been enforced)
10394
if (
10495
all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
10596
and len(mgrs_indexers) > 1
@@ -351,41 +342,6 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
351342

352343
@cache_readonly
353344
def is_na(self) -> bool:
354-
blk = self.block
355-
if blk.dtype.kind == "V":
356-
return True
357-
358-
if not blk._can_hold_na:
359-
return False
360-
361-
values = blk.values
362-
if values.size == 0:
363-
# GH#39122 this case will return False once deprecation is enforced
364-
return True
365-
366-
if isinstance(values.dtype, SparseDtype):
367-
return False
368-
369-
if values.ndim == 1:
370-
# TODO(EA2D): no need for special case with 2D EAs
371-
val = values[0]
372-
if not is_scalar(val) or not isna(val):
373-
# ideally isna_all would do this short-circuiting
374-
return False
375-
return isna_all(values)
376-
else:
377-
val = values[0][0]
378-
if not is_scalar(val) or not isna(val):
379-
# ideally isna_all would do this short-circuiting
380-
return False
381-
return all(isna_all(row) for row in values)
382-
383-
@cache_readonly
384-
def is_na_after_size_and_isna_all_deprecation(self) -> bool:
385-
"""
386-
Will self.is_na be True after values.size == 0 deprecation and isna_all
387-
deprecation are enforced?
388-
"""
389345
blk = self.block
390346
if blk.dtype.kind == "V":
391347
return True
@@ -421,7 +377,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
421377
"""
422378
Concatenate values from several join units along axis=1.
423379
"""
424-
empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)
380+
empty_dtype = _get_empty_dtype(join_units)
425381

426382
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
427383
upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
@@ -446,18 +402,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
446402
else:
447403
concat_values = concat_compat(to_concat, axis=1)
448404

449-
if empty_dtype != empty_dtype_future:
450-
if empty_dtype == concat_values.dtype:
451-
# GH#39122, GH#40893
452-
warnings.warn(
453-
"The behavior of DataFrame concatenation with empty or all-NA "
454-
"entries is deprecated. In a future version, this will no longer "
455-
"exclude empty or all-NA columns when determining the result dtypes. "
456-
"To retain the old behavior, exclude the relevant entries before "
457-
"the concat operation.",
458-
FutureWarning,
459-
stacklevel=find_stack_level(),
460-
)
461405
return concat_values
462406

463407

@@ -484,7 +428,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
484428
raise NotImplementedError
485429

486430

487-
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
431+
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
488432
"""
489433
Return the dtype to use when concatenating the specified units.
490434
@@ -496,38 +440,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj
496440
"""
497441
if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
498442
empty_dtype = join_units[0].block.dtype
499-
return empty_dtype, empty_dtype
443+
return empty_dtype
500444

501445
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
502446

503447
dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
504-
if not len(dtypes):
505-
dtypes = [
506-
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
507-
]
508448

509449
dtype = find_common_type(dtypes)
510450
if has_none_blocks:
511451
dtype = ensure_dtype_can_hold_na(dtype)
512452

513-
dtype_future = dtype
514-
if len(dtypes) != len(join_units):
515-
dtypes_future = [
516-
unit.block.dtype
517-
for unit in join_units
518-
if not unit.is_na_after_size_and_isna_all_deprecation
519-
]
520-
if not len(dtypes_future):
521-
dtypes_future = [
522-
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
523-
]
524-
525-
if len(dtypes) != len(dtypes_future):
526-
dtype_future = find_common_type(dtypes_future)
527-
if has_none_blocks:
528-
dtype_future = ensure_dtype_can_hold_na(dtype_future)
529-
530-
return dtype, dtype_future
453+
return dtype
531454

532455

533456
def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:

pandas/tests/reshape/concat/test_append.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
332332

333333
# pd.NaT gets inferred as tz-naive, so append result is tz-naive
334334
result = df._append({"a": pd.NaT}, ignore_index=True)
335-
expected = DataFrame({"a": [np.nan]}, dtype=object)
335+
expected = DataFrame({"a": [pd.NaT]}, dtype=object)
336336
tm.assert_frame_equal(result, expected)
337337

338338
# also test with typed value to append
@@ -359,12 +359,6 @@ def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
359359
result = df._append(other, ignore_index=True)
360360

361361
expected = other.astype(object)
362-
if isinstance(val, str) and dtype_str != "int64":
363-
# TODO: expected used to be `other.astype(object)` which is a more
364-
# reasonable result. This was changed when tightening
365-
# assert_frame_equal's treatment of mismatched NAs to match the
366-
# existing behavior.
367-
expected = DataFrame({"a": [np.nan]}, dtype=object)
368362
tm.assert_frame_equal(result, expected)
369363

370364
@pytest.mark.parametrize(

pandas/tests/reshape/concat/test_concat.py

+30-19
Original file line numberDiff line numberDiff line change
@@ -789,21 +789,24 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
789789
df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
790790
empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
791791

792-
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
793-
warn = None
792+
needs_update = False
794793
if df_dtype == "datetime64[ns]" or (
795794
df_dtype == "float64" and empty_dtype != "float64"
796795
):
797-
warn = FutureWarning
798-
with tm.assert_produces_warning(warn, match=msg):
799-
result = concat([empty, df])
796+
needs_update = True
797+
798+
result = concat([empty, df])
800799
expected = df
801800
if df_dtype == "int64":
802801
# TODO what exact behaviour do we want for integer eventually?
803802
if empty_dtype == "float64":
804803
expected = df.astype("float64")
805804
else:
806805
expected = df.astype("object")
806+
807+
if needs_update:
808+
# GH#40893: the expected dtype now depends on the dtype of `empty`
809+
expected = expected.astype(object)
807810
tm.assert_frame_equal(result, expected)
808811

809812

@@ -820,17 +823,19 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
820823
else:
821824
df_dtype = "float64"
822825

823-
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
824-
warn = None
826+
needs_update = False
825827
if empty_dtype != df_dtype and empty_dtype is not None:
826-
warn = FutureWarning
828+
needs_update = True
827829
elif df_dtype == "datetime64[ns]":
828-
warn = FutureWarning
830+
needs_update = True
829831

830-
with tm.assert_produces_warning(warn, match=msg):
831-
result = concat([empty, df], ignore_index=True)
832+
result = concat([empty, df], ignore_index=True)
832833

833834
expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
835+
if needs_update:
836+
# GH#40893: the expected dtype now depends on the dtype of `empty`
837+
expected = expected.astype(object)
838+
expected.iloc[0] = np.nan
834839
tm.assert_frame_equal(result, expected)
835840

836841

@@ -841,10 +846,16 @@ def test_concat_ignore_empty_from_reindex():
841846

842847
aligned = df2.reindex(columns=df1.columns)
843848

844-
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
845-
with tm.assert_produces_warning(FutureWarning, match=msg):
846-
result = concat([df1, aligned], ignore_index=True)
847-
expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
849+
result = concat([df1, aligned], ignore_index=True)
850+
851+
expected = DataFrame(
852+
{
853+
"a": [1, 2],
854+
"b": pd.array([pd.Timestamp("2012-01-01"), np.nan], dtype=object),
855+
},
856+
dtype=object,
857+
)
858+
expected["a"] = expected["a"].astype("int64")
848859
tm.assert_frame_equal(result, expected)
849860

850861

@@ -907,10 +918,10 @@ def test_concat_none_with_timezone_timestamp():
907918
# GH#52093
908919
df1 = DataFrame([{"A": None}])
909920
df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
910-
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
911-
with tm.assert_produces_warning(FutureWarning, match=msg):
912-
result = concat([df1, df2], ignore_index=True)
913-
expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
921+
result = concat([df1, df2], ignore_index=True)
922+
expected = DataFrame(
923+
{"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}, dtype=object
924+
)
914925
tm.assert_frame_equal(result, expected)
915926

916927

pandas/tests/reshape/concat/test_datetimes.py

+5-13
Original file line numberDiff line numberDiff line change
@@ -226,15 +226,6 @@ def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item):
226226
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
227227
if tz1 != tz2:
228228
expected = expected.astype(object)
229-
if item is pd.NaT:
230-
# GH#18463
231-
# TODO: setting nan here is to keep the test passing as we
232-
# make assert_frame_equal stricter, but is nan really the
233-
# ideal behavior here?
234-
if tz1 is not None:
235-
expected.iloc[-1, 0] = np.nan
236-
else:
237-
expected.iloc[:-1, 0] = np.nan
238229

239230
tm.assert_frame_equal(result, expected)
240231

@@ -590,8 +581,9 @@ def test_concat_float_datetime64():
590581
result = concat([df_time.iloc[:0], df_float])
591582
tm.assert_frame_equal(result, expected)
592583

593-
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
594-
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
595-
with tm.assert_produces_warning(FutureWarning, match=msg):
596-
result = concat([df_time, df_float.iloc[:0]])
584+
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
585+
object
586+
)
587+
588+
result = concat([df_time, df_float.iloc[:0]])
597589
tm.assert_frame_equal(result, expected)

pandas/tests/reshape/merge/test_merge.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -709,16 +709,14 @@ def test_join_append_timedeltas(self):
709709
{"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
710710
)
711711
df = DataFrame(columns=list("dt"))
712-
msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
713-
warn = FutureWarning
714-
with tm.assert_produces_warning(warn, match=msg):
715-
df = concat([df, d], ignore_index=True)
716-
result = concat([df, d], ignore_index=True)
712+
df = concat([df, d], ignore_index=True)
713+
result = concat([df, d], ignore_index=True)
717714
expected = DataFrame(
718715
{
719716
"d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
720717
"t": [timedelta(0, 22500), timedelta(0, 22500)],
721-
}
718+
},
719+
dtype=object,
722720
)
723721
tm.assert_frame_equal(result, expected)
724722

0 commit comments

Comments
 (0)