Skip to content

DEPR: ignoring empty entries in pd.concat #58056

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ Other Deprecations
Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
Expand Down
20 changes: 0 additions & 20 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@
TYPE_CHECKING,
cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -101,28 +99,10 @@ def concat_compat(
# Creating an empty array directly is tempting, but the winnings would be
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.
orig = to_concat
non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
if non_empties and axis == 0 and not ea_compat_axis:
# ea_compat_axis see GH#39574
to_concat = non_empties

any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)

if len(to_concat) < len(orig):
_, _, alt_dtype = _get_result_dtype(orig, non_empties)
if alt_dtype != target_dtype:
# GH#39122
warnings.warn(
"The behavior of array concatenation with empty entries is "
"deprecated. In a future version, this will no longer exclude "
"empty items when determining the result dtype. "
"To retain the old behavior, exclude the empty entries before "
"the concat operation.",
FutureWarning,
stacklevel=find_stack_level(),
)

if target_dtype is not None:
to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]

Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/dtypes/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,9 @@ def test_concat_mismatched_categoricals_with_empty():
ser1 = Series(["a", "b", "c"], dtype="category")
ser2 = Series([], dtype="category")

msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = _concat.concat_compat([ser1._values, ser2._values])
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = pd.concat([ser1, ser2])._values
tm.assert_categorical_equal(result, expected)
result = _concat.concat_compat([ser1._values, ser2._values])
expected = pd.concat([ser1, ser2])._values
tm.assert_numpy_array_equal(result, expected)


def test_concat_single_dataframe_tz_aware():
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,6 @@ def f3(x):

df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

depr_msg = "The behavior of array concatenation with empty entries is deprecated"

# correct result
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
Expand All @@ -245,8 +243,7 @@ def f3(x):
with pytest.raises(AssertionError, match=msg):
df.groupby("a").apply(f3)
with pytest.raises(AssertionError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
df2.groupby("a").apply(f3)
df2.groupby("a").apply(f3)


def test_attr_wrapper(ts):
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,7 @@ def test_append_empty_preserve_name(self, name, expected):
left = Index([], name="foo")
right = Index([1, 2, 3], name=name)

msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = left.append(right)
result = left.append(right)
assert result.name == expected

@pytest.mark.parametrize(
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/reshape/concat/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,7 @@ def test_append_preserve_index_name(self):
df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
df2 = df2.set_index(["A"])

msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df1._append(df2)
result = df1._append(df2)
assert result.index.name == "A"

indexes_can_append = [
Expand Down
26 changes: 10 additions & 16 deletions pandas/tests/reshape/concat/test_append_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -691,15 +691,12 @@ def test_concat_categorical_empty(self):

s1 = Series([], dtype="category")
s2 = Series([1, 2], dtype="category")
exp = s2.astype(object)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)

with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

s1 = Series([], dtype="category")
s2 = Series([], dtype="category")
Expand All @@ -719,15 +716,12 @@ def test_concat_categorical_empty(self):
s1 = Series([], dtype="category")
s2 = Series([np.nan, np.nan])

# empty Series is ignored
exp = Series([np.nan, np.nan])
with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
exp = Series([np.nan, np.nan], dtype=object)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

with tm.assert_produces_warning(FutureWarning, match=msg):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

def test_categorical_concat_append(self):
cat = Categorical(["a", "b"], categories=["a", "b"])
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/reshape/concat/test_empty.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,9 @@ def test_concat_empty_series(self):

s1 = Series([1, 2, 3], name="x")
s2 = Series(name="y", dtype="float64")
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = concat([s1, s2], axis=0)
res = concat([s1, s2], axis=0)
# name will be reset
exp = Series([1, 2, 3])
exp = Series([1, 2, 3], dtype="float64")
tm.assert_series_equal(res, exp)

# empty Series with no name
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/reshape/concat/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,8 @@ def test_concat_empty_and_non_empty_series_regression(self):
s1 = Series([1])
s2 = Series([], dtype=object)

expected = s1
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = concat([s1, s2])
expected = s1.astype(object)
result = concat([s1, s2])
tm.assert_series_equal(result, expected)

def test_concat_series_axis1(self):
Expand Down
12 changes: 4 additions & 8 deletions pandas/tests/series/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,9 @@ def test_combine_first(self):
# corner case
ser = Series([1.0, 2, 3], index=[0, 1, 2])
empty = Series([], index=[], dtype=object)
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.combine_first(empty)
result = ser.combine_first(empty)
ser.index = ser.index.astype("O")
tm.assert_series_equal(ser, result)
tm.assert_series_equal(result, ser.astype(object))

def test_combine_first_dt64(self, unit):
s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit)
Expand Down Expand Up @@ -112,10 +110,8 @@ def test_combine_first_timezone_series_with_empty_series(self):
)
s1 = Series(range(10), index=time_index)
s2 = Series(index=time_index)
msg = "The behavior of array concatenation with empty entries is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = s1.combine_first(s2)
tm.assert_series_equal(result, s1)
result = s1.combine_first(s2)
tm.assert_series_equal(result, s1.astype(np.float64))

def test_combine_first_preserves_dtype(self):
# GH51764
Expand Down