diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3bdcab4fbcfea..eb1bc270a7d1b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -150,9 +150,44 @@ The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead t Now, a warning will be raised if a date string cannot be parsed accordance to the given ``dayfirst`` value when the value is a delimited date string (e.g. ``31-12-2012``). -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na: -notable_bug_fix2 +Ignoring dtypes in concat with empty or all-NA columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When using :func:`concat` to concatenate two or more :class:`DataFrame` objects, +if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes* +ignored when finding the concatenated dtype. These are now consistently *not* ignored (:issue:`43507`). + +.. ipython:: python + + df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1)) + df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2)) + res = df1.append(df2) + +Previously, the float-dtype in ``df2`` would be ignored so the result dtype would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to ``NaT``. + +*Previous behavior*: + +.. code-block:: ipython + + In [4]: res + Out[4]: + bar + 0 2013-01-01 + 1 NaT + +Now the float-dtype is respected. Since the common dtype for these DataFrames is object, the ``np.nan`` is retained. + +*New behavior*: + +.. ipython:: python + + res + +.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3: + +notable_bug_fix3 ^^^^^^^^^^^^^^^^ ..
--------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 26a2996af6b2a..bbb3cb3391dfa 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1921,6 +1921,7 @@ def _setitem_with_indexer_missing(self, indexer, value): # no columns and scalar raise ValueError("cannot set a frame with no defined columns") + has_dtype = hasattr(value, "dtype") if isinstance(value, ABCSeries): # append a Series value = value.reindex(index=self.obj.columns, copy=True) @@ -1938,7 +1939,18 @@ def _setitem_with_indexer_missing(self, indexer, value): value = Series(value, index=self.obj.columns, name=indexer) - self.obj._mgr = self.obj.append(value)._mgr + if not len(self.obj): + # We will ignore the existing dtypes instead of using + # internals.concat logic + df = value.to_frame().T + df.index = [indexer] + if not has_dtype: + # i.e. if we already had a Series or ndarray, keep that + # dtype. But if we had a list or dict, then do inference + df = df.infer_objects() + self.obj._mgr = df._mgr + else: + self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) def _ensure_iterable_column_indexer(self, column_indexer): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d0e017b06ffbc..1360f1d1a508a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -32,7 +32,6 @@ is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, - is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.concat import ( @@ -40,18 +39,13 @@ concat_compat, ) from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.missing import ( - is_valid_na_for_dtype, - isna, - isna_all, -) +from pandas.core.dtypes.missing import is_valid_na_for_dtype import pandas.core.algorithms as algos from pandas.core.arrays import ( DatetimeArray, ExtensionArray, ) -from pandas.core.arrays.sparse import SparseDtype from 
pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ( ArrayManager, @@ -422,29 +416,7 @@ def is_na(self) -> bool: blk = self.block if blk.dtype.kind == "V": return True - - if not blk._can_hold_na: - return False - - values = blk.values - if values.size == 0: - return True - if isinstance(values.dtype, SparseDtype): - return False - - if values.ndim == 1: - # TODO(EA2D): no need for special case with 2D EAs - val = values[0] - if not is_scalar(val) or not isna(val): - # ideally isna_all would do this short-circuiting - return False - return isna_all(values) - else: - val = values[0][0] - if not is_scalar(val) or not isna(val): - # ideally isna_all would do this short-circuiting - return False - return all(isna_all(row) for row in values) + return False def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike @@ -590,9 +562,6 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): # different from missing.na_value_for_dtype return None elif dtype.kind in ["i", "u"]: - if not has_none_blocks: - # different from missing.na_value_for_dtype - return None return np.nan elif dtype.kind == "O": return np.nan diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 80f97ecaee121..fae901b7ba303 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -140,7 +140,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) - def test_append_dtypes(self, using_array_manager): + def test_append_dtypes(self): # GH 5754 # row appends of different dtypes (so need to do by-item) @@ -164,10 +164,7 @@ def test_append_dtypes(self, using_array_manager): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) - if using_array_manager: - # TODO(ArrayManager) decide on exact casting rules in concat - # With 
ArrayManager, all-NaN float is not ignored - expected = expected.astype(object) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -176,9 +173,7 @@ def test_append_dtypes(self, using_array_manager): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) - if using_array_manager: - # With ArrayManager, all-NaN float is not ignored - expected = expected.astype(object) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) @@ -187,9 +182,7 @@ def test_append_dtypes(self, using_array_manager): expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} ) - if using_array_manager: - # With ArrayManager, all-NaN float is not ignored - expected = expected.astype(object) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 693e67652c912..172301a2fde84 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -168,7 +168,8 @@ def test_partial_setting_mixed_dtype(self): # columns will align df = DataFrame(columns=["A", "B"]) df.loc[0] = Series(1, index=range(4)) - tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) + expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64) + tm.assert_frame_equal(df, expected) # columns will align # TODO: it isn't great that this behavior depends on consolidation @@ -185,11 +186,10 @@ def test_partial_setting_mixed_dtype(self): with pytest.raises(ValueError, match=msg): df.loc[0] = [1, 2, 3] - # TODO: #15657, these are left as object and not coerced df = DataFrame(columns=["A", "B"]) df.loc[3] = [6, 7] - exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], 
dtype="object") + exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64) tm.assert_frame_equal(df, exp) def test_series_partial_set(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 060c37c207a2a..3462fa486d936 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -695,7 +695,7 @@ def _constructor(self): assert isinstance(result, NotADataFrame) - def test_join_append_timedeltas(self, using_array_manager): + def test_join_append_timedeltas(self): # timedelta64 issues with join/merge # GH 5695 @@ -707,11 +707,9 @@ def test_join_append_timedeltas(self, using_array_manager): { "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500), timedelta(0, 22500)], - } + }, + dtype=object, ) - if using_array_manager: - # TODO(ArrayManager) decide on exact casting rules in concat - expected = expected.astype(object) tm.assert_frame_equal(result, expected) def test_join_append_timedeltas2(self):