diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4cd76346a4a8f..e2b42a0ea14f5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -406,6 +406,7 @@ Reshaping - Improved error message when creating a :class:`DataFrame` column from a multi-dimensional :class:`numpy.ndarray` (:issue:`42463`) - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) - Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +- Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`) - Sparse diff --git a/pandas/core/frame.py b/pandas/core/frame.py index db12129a15ef9..1d382e494cb48 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8966,6 +8966,7 @@ def append( 3 3 4 4 """ + combined_columns = None if isinstance(other, (Series, dict)): if isinstance(other, dict): if not ignore_index: @@ -8980,21 +8981,15 @@ def append( index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) combined_columns = self.columns.append(idx_diff) - other = ( - other.reindex(combined_columns, copy=False) - .to_frame() - .T.infer_objects() - .rename_axis(index.names, copy=False) - ) - if not self.columns.equals(combined_columns): - self = self.reindex(columns=combined_columns) + row_df = other.to_frame().T + # infer_objects is needed for + # test_append_empty_frame_to_series_with_dateutil_tz + other = row_df.infer_objects().rename_axis(index.names, copy=False) elif isinstance(other, list): if not other: pass elif not isinstance(other[0], DataFrame): other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat @@ -9002,14 +8997,24 @@ def append( to_concat = [self, *other] else: to_concat = [self, other] - return ( - concat( - to_concat, - ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, - ) - ).__finalize__(self, method="append") + + result = concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + if ( + combined_columns is not None + and not sort + and not combined_columns.equals(result.columns) + ): + # TODO: reindexing here is a kludge bc union_indexes does not + # pass sort to index.union, xref #43375 + # combined_columns.equals check is necessary for preserving dtype + # in test_crosstab_normalize + result = result.reindex(combined_columns, axis=1) + return result.__finalize__(self, method="append") def join( self, diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index d47fcb8ac1ca3..061afb3a7e0f5 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -5,8 +5,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -198,8 +196,12 @@ def test_append_same_columns_type(self, index): ser = Series([7, 8], index=ser_index, name=2) result = df.append(ser) expected = DataFrame( - [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index + [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index ) + # integer dtype is preserved for columns present in ser.index + assert expected.dtypes.iloc[0].kind == "i" + assert expected.dtypes.iloc[1].kind == "i" + tm.assert_frame_equal(result, expected) # ser wider than df @@ -301,14 +303,10 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended["A"].dtype == "f8" assert appended["B"].dtype == "O" - # TODO(ArrayManager) DataFrame.append reindexes a Series itself (giving - # float dtype) -> delay reindexing until concat_array_managers which properly - # takes care of all-null dtype inference - @td.skip_array_manager_not_yet_implemented def test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) - ser = Series({"date": date, "a": 1.0, "b": 2.0}) + ser = Series({"a": 1.0, "b": 2.0, "date": date}) df = DataFrame(columns=["c", "d"]) result_a = df.append(ser, ignore_index=True) expected = DataFrame( @@ -327,8 +325,6 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result_b = result_a.append(ser, ignore_index=True) tm.assert_frame_equal(result_b, expected) - # column order is different - expected = expected[["c", "d", "date", "a", "b"]] result = df.append([ser, ser], ignore_index=True) tm.assert_frame_equal(result, expected)