From bef208bfc4a981afc3f2928097e05da8a3b79114 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Sep 2021 10:11:17 -0700 Subject: [PATCH 1/5] REF: remove no-longer reachable cases from internals.concat --- pandas/core/internals/concat.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d0e017b06ffbc..be648d392ccdf 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import itertools from typing import ( TYPE_CHECKING, @@ -679,20 +678,15 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: Extra items that didn't fit are returned as a separate block. """ - if 0 not in join_unit.indexers: - extra_indexers = join_unit.indexers + assert 0 not in join_unit.indexers - if join_unit.block is None: - extra_block = None - else: - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) - else: - extra_block = join_unit.block + extra_indexers = join_unit.indexers - extra_indexers = copy.copy(join_unit.indexers) - extra_indexers[0] = extra_indexers[0][length:] - join_unit.indexers[0] = join_unit.indexers[0][:length] + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] From 80fcb9adc67ddf40461fa62e7be6d79d47c4c73c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Sep 2021 16:25:05 -0700 Subject: [PATCH 2/5] fix incorrect assertion --- pandas/core/internals/concat.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 
be648d392ccdf..97e51532de5ca 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import itertools from typing import ( TYPE_CHECKING, @@ -678,9 +679,20 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: Extra items that didn't fit are returned as a separate block. """ - assert 0 not in join_unit.indexers + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers - extra_indexers = join_unit.indexers + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block + + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] if join_unit.block is None: extra_block = None From d50f5d418f615b75968a95fede23ba66f44921d5 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Sep 2021 13:08:56 -0700 Subject: [PATCH 3/5] typo fixup --- pandas/core/internals/concat.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 97e51532de5ca..d0e017b06ffbc 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -694,12 +694,6 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: extra_indexers[0] = extra_indexers[0][length:] join_unit.indexers[0] = join_unit.indexers[0][:length] - if join_unit.block is None: - extra_block = None - else: - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) - extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] From ce0dcaa0f4d93d86bf67f8bdbc5aa4823928104b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Sep 2021 
19:23:02 -0700 Subject: [PATCH 4/5] BUG/API: concat with empty DataFrames --- pandas/core/indexing.py | 14 ++++++++- pandas/core/internals/concat.py | 35 ++--------------------- pandas/tests/frame/methods/test_append.py | 15 +++------- pandas/tests/indexing/test_partial.py | 6 ++-- pandas/tests/reshape/merge/test_merge.py | 8 ++---- 5 files changed, 25 insertions(+), 53 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 26a2996af6b2a..bbb3cb3391dfa 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1921,6 +1921,7 @@ def _setitem_with_indexer_missing(self, indexer, value): # no columns and scalar raise ValueError("cannot set a frame with no defined columns") + has_dtype = hasattr(value, "dtype") if isinstance(value, ABCSeries): # append a Series value = value.reindex(index=self.obj.columns, copy=True) @@ -1938,7 +1939,18 @@ def _setitem_with_indexer_missing(self, indexer, value): value = Series(value, index=self.obj.columns, name=indexer) - self.obj._mgr = self.obj.append(value)._mgr + if not len(self.obj): + # We will ignore the existing dtypes instead of using + # internals.concat logic + df = value.to_frame().T + df.index = [indexer] + if not has_dtype: + # i.e. if we already had a Series or ndarray, keep that + # dtype. 
But if we had a list or dict, then do inference + df = df.infer_objects() + self.obj._mgr = df._mgr + else: + self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) def _ensure_iterable_column_indexer(self, column_indexer): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d0e017b06ffbc..1360f1d1a508a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -32,7 +32,6 @@ is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, - is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.concat import ( @@ -40,18 +39,13 @@ concat_compat, ) from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.missing import ( - is_valid_na_for_dtype, - isna, - isna_all, -) +from pandas.core.dtypes.missing import is_valid_na_for_dtype import pandas.core.algorithms as algos from pandas.core.arrays import ( DatetimeArray, ExtensionArray, ) -from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ( ArrayManager, @@ -422,29 +416,7 @@ def is_na(self) -> bool: blk = self.block if blk.dtype.kind == "V": return True - - if not blk._can_hold_na: - return False - - values = blk.values - if values.size == 0: - return True - if isinstance(values.dtype, SparseDtype): - return False - - if values.ndim == 1: - # TODO(EA2D): no need for special case with 2D EAs - val = values[0] - if not is_scalar(val) or not isna(val): - # ideally isna_all would do this short-circuiting - return False - return isna_all(values) - else: - val = values[0][0] - if not is_scalar(val) or not isna(val): - # ideally isna_all would do this short-circuiting - return False - return all(isna_all(row) for row in values) + return False def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike @@ -590,9 +562,6 @@ def _dtype_to_na_value(dtype: DtypeObj, 
has_none_blocks: bool): # different from missing.na_value_for_dtype return None elif dtype.kind in ["i", "u"]: - if not has_none_blocks: - # different from missing.na_value_for_dtype - return None return np.nan elif dtype.kind == "O": return np.nan diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 80f97ecaee121..fae901b7ba303 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -140,7 +140,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) - def test_append_dtypes(self, using_array_manager): + def test_append_dtypes(self): # GH 5754 # row appends of different dtypes (so need to do by-item) @@ -164,10 +164,7 @@ def test_append_dtypes(self, using_array_manager): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) - if using_array_manager: - # TODO(ArrayManager) decide on exact casting rules in concat - # With ArrayManager, all-NaN float is not ignored - expected = expected.astype(object) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -176,9 +173,7 @@ def test_append_dtypes(self, using_array_manager): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) - if using_array_manager: - # With ArrayManager, all-NaN float is not ignored - expected = expected.astype(object) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) @@ -187,9 +182,7 @@ def test_append_dtypes(self, using_array_manager): expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} ) - if using_array_manager: - # With ArrayManager, all-NaN float is not ignored - expected = expected.astype(object) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = 
DataFrame({"bar": Timestamp("20130101")}, index=range(1)) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 693e67652c912..172301a2fde84 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -168,7 +168,8 @@ def test_partial_setting_mixed_dtype(self): # columns will align df = DataFrame(columns=["A", "B"]) df.loc[0] = Series(1, index=range(4)) - tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) + expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64) + tm.assert_frame_equal(df, expected) # columns will align # TODO: it isn't great that this behavior depends on consolidation @@ -185,11 +186,10 @@ def test_partial_setting_mixed_dtype(self): with pytest.raises(ValueError, match=msg): df.loc[0] = [1, 2, 3] - # TODO: #15657, these are left as object and not coerced df = DataFrame(columns=["A", "B"]) df.loc[3] = [6, 7] - exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object") + exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64) tm.assert_frame_equal(df, exp) def test_series_partial_set(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5c07a9662359e..4e3aa10a21f63 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -695,7 +695,7 @@ def _constructor(self): assert isinstance(result, NotADataFrame) - def test_join_append_timedeltas(self, using_array_manager): + def test_join_append_timedeltas(self): # timedelta64 issues with join/merge # GH 5695 @@ -707,11 +707,9 @@ def test_join_append_timedeltas(self, using_array_manager): { "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500), timedelta(0, 22500)], - } + }, + dtype=object, ) - if using_array_manager: - # TODO(ArrayManager) decide on exact casting rules in concat - expected = expected.astype(object) 
tm.assert_frame_equal(result, expected) def test_join_append_timedeltas2(self): From defa01364c795a99e9515e3e89a2a80d6468103a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 14 Sep 2021 13:26:09 -0700 Subject: [PATCH 5/5] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 39 ++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3bdcab4fbcfea..eb1bc270a7d1b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -150,9 +150,44 @@ The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead t Now, a warning will be raised if a date string cannot be parsed accordance to the given ``dayfirst`` value when the value is a delimited date string (e.g. ``31-12-2012``). -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na: -notable_bug_fix2 +Ignoring dtypes in concat with empty or all-NA columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When using :func:`concat` to concatenate two or more :class:`DataFrame` objects, +if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes* +ignored when finding the concatenated dtype. These are now consistently *not* ignored (:issue:`43507`). + +.. ipython:: python + + df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1)) + df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2)) + res = df1.append(df2) + +Previously, the float-dtype in ``df2`` would be ignored so the result dtype would be ``datetime64[ns]``. As a result, the ``np.nan`` would be cast to ``NaT``. + +*Previous behavior*: + +.. code-block:: ipython + + In [4]: res + Out[4]: + bar + 0 2013-01-01 + 1 NaT + +Now the float-dtype is respected. Since the common dtype for these DataFrames is object, the ``np.nan`` is retained. + +*New behavior*: + +.. ipython:: python + + res + +..
_whatsnew_140.notable_bug_fixes.notable_bug_fix3: + +notable_bug_fix3 ^^^^^^^^^^^^^^^^ .. ---------------------------------------------------------------------------