diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 408a52e0526ee..14146b9e455b4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -380,6 +380,7 @@ Other Enhancements - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`) - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) +- :meth:`DataFrame.append` can now in more cases preserve the type of the calling dataframe's columns (e.g. if both are ``CategoricalIndex``) (:issue:`18359`) - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) - ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`) - ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories` diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9e57579ddfc05..ca20def643c2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6113,8 +6113,11 @@ def append(self, other, ignore_index=False, verify_integrity=False): # index name will be reset index = Index([other.name], name=self.index.name) - combined_columns = self.columns.tolist() + self.columns.union( - other.index).difference(self.columns).tolist() + idx_diff = other.index.difference(self.columns) + try: + combined_columns = self.columns.append(idx_diff) + except TypeError: + combined_columns = self.columns.astype(object).append(idx_diff) other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), index=index, diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ffd37dc4b2f59..640d09f3587fb 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1,5 +1,7 @@ from warnings import catch_warnings +from itertools import combinations, product +import datetime as dt import dateutil import numpy as np from numpy.random import randn @@ -829,12 +831,102 @@ def test_append_preserve_index_name(self): result = df1.append(df2) assert result.index.name == 'A' + indexes_can_append = [ + pd.RangeIndex(3), + pd.Index([4, 5, 6]), + pd.Index([4.5, 5.5, 6.5]), + pd.Index(list('abc')), + pd.CategoricalIndex('A B C'.split()), + pd.CategoricalIndex('D E F'.split(), ordered=True), + pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12)]), + ] + + indexes_cannot_append_with_other = [ + pd.IntervalIndex.from_breaks([0, 1, 2, 3]), + pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), + ] + + all_indexes = indexes_can_append + indexes_cannot_append_with_other + + @pytest.mark.parametrize("index", + all_indexes, + ids=lambda x: x.__class__.__name__) + def test_append_same_columns_type(self, index): + # GH18359 + + # df wider than ser + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) + ser_index = index[:2] + ser = pd.Series([7, 8], index=ser_index, name=2) + result = df.append(ser) + expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]], + index=[0, 1, 2], + columns=index) + assert_frame_equal(result, expected) + + # ser wider than df + ser_index = index + index = index[:2] + df = pd.DataFrame([[1, 2], [4, 5]], columns=index) + ser = pd.Series([7, 8, 9], index=ser_index, name=2) + result = df.append(ser) + expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: x.__class__.__name__) + def test_append_different_columns_types(self, df_columns, series_index): + # GH18359 + # See also test 'test_append_different_columns_types_raises' below + # for errors raised when appending + + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) + ser = pd.Series([7, 8, 9], index=series_index, name=2) + + result = df.append(ser) + idx_diff = ser.index.difference(df_columns) + combined_columns = Index(df_columns.tolist()).append(idx_diff) + expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan], + [4, 5, 6, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 7, 8, 9]], + index=[0, 1, 2], + columns=combined_columns) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "index_can_append, index_cannot_append_with_other", + product(indexes_can_append, indexes_cannot_append_with_other), + ids=lambda x: x.__class__.__name__) + def test_append_different_columns_types_raises( + self, index_can_append, index_cannot_append_with_other): + # GH18359 + # Dataframe.append will raise if IntervalIndex/MultiIndex appends + # or is appended to a different index type + # + # See also test 'test_append_different_columns_types' above for + # appending without raising. + + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) + ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, + name=2) + with pytest.raises(TypeError): + df.append(ser) + + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], + columns=index_cannot_append_with_other) + ser = pd.Series([7, 8, 9], index=index_can_append, name=2) + with pytest.raises(TypeError): + df.append(ser) + def test_append_dtype_coerce(self): # GH 4993 # appending with datetime will incorrectly convert datetime64 - import datetime as dt - from pandas import NaT df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], @@ -845,7 +937,9 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 4, 7, 10)]], columns=['start_time', 'end_time']) - expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), + expected = concat([Series([pd.NaT, + pd.NaT, + dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 4, 7, 10)], name='end_time'), Series([dt.datetime(2013, 1, 1, 0, 0), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 92bedbabdf2f1..1004b40bfb4c1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1540,12 +1540,14 @@ def test_crosstab_normalize(self): index=pd.Index([1, 2, 'All'], name='a', dtype='object'), - columns=pd.Index([3, 4], name='b')) + columns=pd.Index([3, 4], name='b', + dtype='object')) col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]], index=pd.Index([1, 2], name='a', dtype='object'), columns=pd.Index([3, 4, 'All'], - name='b')) + name='b', + dtype='object')) all_normal_margins = pd.DataFrame([[0.2, 0, 0.2], [0.2, 0.6, 0.8], @@ -1554,7 +1556,8 @@ def test_crosstab_normalize(self): name='a', dtype='object'), columns=pd.Index([3, 4, 'All'], - name='b')) + name='b', + dtype='object')) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', margins=True), row_normal_margins) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',