Skip to content

BUG/API: concat with empty DataFrames or all-NA columns #43507

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,44 @@ The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead t
Now, a warning will be raised if a date string cannot be parsed in accordance with the given ``dayfirst`` value when
the value is a delimited date string (e.g. ``31-12-2012``).

.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2:
.. _whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na:

notable_bug_fix2
Ignoring dtypes in concat with empty or all-NA columns
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When using :func:`concat` to concatenate two or more :class:`DataFrame` objects,
if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes*
ignored when finding the concatenated dtype. These are now consistently *not* ignored (:issue:`43507`).

.. ipython:: python

df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1))
df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2))
res = df1.append(df2)

Previously, the float-dtype in ``df2`` would be ignored so the result dtype would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to ``NaT``.

*Previous behavior*:

.. code-block:: ipython

In [4]: res
Out[4]:
bar
0 2013-01-01
1 NaT

Now the float-dtype is respected. Since the common dtype for these DataFrames is object, the ``np.nan`` is retained.

*New behavior*:

.. ipython:: python

res

.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3:

notable_bug_fix3
^^^^^^^^^^^^^^^^

.. ---------------------------------------------------------------------------
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,6 +1921,7 @@ def _setitem_with_indexer_missing(self, indexer, value):
# no columns and scalar
raise ValueError("cannot set a frame with no defined columns")

has_dtype = hasattr(value, "dtype")
if isinstance(value, ABCSeries):
# append a Series
value = value.reindex(index=self.obj.columns, copy=True)
Expand All @@ -1938,7 +1939,18 @@ def _setitem_with_indexer_missing(self, indexer, value):

value = Series(value, index=self.obj.columns, name=indexer)

self.obj._mgr = self.obj.append(value)._mgr
if not len(self.obj):
# We will ignore the existing dtypes instead of using
# internals.concat logic
df = value.to_frame().T
df.index = [indexer]
if not has_dtype:
# i.e. if we already had a Series or ndarray, keep that
# dtype. But if we had a list or dict, then do inference
df = df.infer_objects()
self.obj._mgr = df._mgr
else:
self.obj._mgr = self.obj.append(value)._mgr
self.obj._maybe_update_cacher(clear=True)

def _ensure_iterable_column_indexer(self, column_indexer):
Expand Down
35 changes: 2 additions & 33 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,20 @@
is_1d_only_ea_obj,
is_datetime64tz_dtype,
is_dtype_equal,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import (
cast_to_common_type,
concat_compat,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
isna_all,
)
from pandas.core.dtypes.missing import is_valid_na_for_dtype

import pandas.core.algorithms as algos
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
)
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
ArrayManager,
Expand Down Expand Up @@ -422,29 +416,7 @@ def is_na(self) -> bool:
blk = self.block
if blk.dtype.kind == "V":
return True

if not blk._can_hold_na:
return False

values = blk.values
if values.size == 0:
return True
if isinstance(values.dtype, SparseDtype):
return False

if values.ndim == 1:
# TODO(EA2D): no need for special case with 2D EAs
val = values[0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return isna_all(values)
else:
val = values[0][0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return all(isna_all(row) for row in values)
return False

def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
values: ArrayLike
Expand Down Expand Up @@ -590,9 +562,6 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
# different from missing.na_value_for_dtype
return None
elif dtype.kind in ["i", "u"]:
if not has_none_blocks:
# different from missing.na_value_for_dtype
return None
return np.nan
elif dtype.kind == "O":
return np.nan
Expand Down
15 changes: 4 additions & 11 deletions pandas/tests/frame/methods/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def test_append_empty_dataframe(self):
expected = df1.copy()
tm.assert_frame_equal(result, expected)

def test_append_dtypes(self, using_array_manager):
def test_append_dtypes(self):

# GH 5754
# row appends of different dtypes (so need to do by-item)
Expand All @@ -164,10 +164,7 @@ def test_append_dtypes(self, using_array_manager):
expected = DataFrame(
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
)
if using_array_manager:
# TODO(ArrayManager) decide on exact casting rules in concat
# With ArrayManager, all-NaN float is not ignored
expected = expected.astype(object)
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
Expand All @@ -176,9 +173,7 @@ def test_append_dtypes(self, using_array_manager):
expected = DataFrame(
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
)
if using_array_manager:
# With ArrayManager, all-NaN float is not ignored
expected = expected.astype(object)
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

df1 = DataFrame({"bar": np.nan}, index=range(1))
Expand All @@ -187,9 +182,7 @@ def test_append_dtypes(self, using_array_manager):
expected = DataFrame(
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
)
if using_array_manager:
# With ArrayManager, all-NaN float is not ignored
expected = expected.astype(object)
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/indexing/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ def test_partial_setting_mixed_dtype(self):
# columns will align
df = DataFrame(columns=["A", "B"])
df.loc[0] = Series(1, index=range(4))
tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0]))
expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64)
tm.assert_frame_equal(df, expected)

# columns will align
# TODO: it isn't great that this behavior depends on consolidation
Expand All @@ -185,11 +186,10 @@ def test_partial_setting_mixed_dtype(self):
with pytest.raises(ValueError, match=msg):
df.loc[0] = [1, 2, 3]

# TODO: #15657, these are left as object and not coerced
df = DataFrame(columns=["A", "B"])
df.loc[3] = [6, 7]

exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object")
exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64)
tm.assert_frame_equal(df, exp)

def test_series_partial_set(self):
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,7 +695,7 @@ def _constructor(self):

assert isinstance(result, NotADataFrame)

def test_join_append_timedeltas(self, using_array_manager):
def test_join_append_timedeltas(self):
# timedelta64 issues with join/merge
# GH 5695

Expand All @@ -707,11 +707,9 @@ def test_join_append_timedeltas(self, using_array_manager):
{
"d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
"t": [timedelta(0, 22500), timedelta(0, 22500)],
}
},
dtype=object,
)
if using_array_manager:
# TODO(ArrayManager) decide on exact casting rules in concat
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

def test_join_append_timedeltas2(self):
Expand Down