ENH: DataFrame.append preserves columns dtype if possible (#19021)

topper-123 · jreback · commit 3e691a4cba56 · 2018-04-19T20:35:13.000-04:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -409,6 +409,7 @@ Other Enhancements
 - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`)
 - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`)
 - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`)
+- :meth:`DataFrame.append` can now in more cases preserve the type of the calling dataframe's columns (e.g. if both are ``CategoricalIndex``) (:issue:`18359`)
 - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`)
 - ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`)
 - ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories`
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6138,8 +6138,11 @@ def append(self, other, ignore_index=False, verify_integrity=False):
                 # index name will be reset
                 index = Index([other.name], name=self.index.name)
 
-            combined_columns = self.columns.tolist() + self.columns.union(
-                other.index).difference(self.columns).tolist()
+            idx_diff = other.index.difference(self.columns)
+            try:
+                combined_columns = self.columns.append(idx_diff)
+            except TypeError:
+                combined_columns = self.columns.astype(object).append(idx_diff)
             other = other.reindex(combined_columns, copy=False)
             other = DataFrame(other.values.reshape((1, len(other))),
                               index=index,
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -1,5 +1,7 @@
 from warnings import catch_warnings
+from itertools import combinations, product
 
+import datetime as dt
 import dateutil
 import numpy as np
 from numpy.random import randn
@@ -829,12 +831,102 @@ def test_append_preserve_index_name(self):
         result = df1.append(df2)
         assert result.index.name == 'A'
 
+    indexes_can_append = [
+        pd.RangeIndex(3),
+        pd.Index([4, 5, 6]),
+        pd.Index([4.5, 5.5, 6.5]),
+        pd.Index(list('abc')),
+        pd.CategoricalIndex('A B C'.split()),
+        pd.CategoricalIndex('D E F'.split(), ordered=True),
+        pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0),
+                          dt.datetime(2013, 1, 3, 6, 10),
+                          dt.datetime(2013, 1, 3, 7, 12)]),
+    ]
+
+    indexes_cannot_append_with_other = [
+        pd.IntervalIndex.from_breaks([0, 1, 2, 3]),
+        pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]),
+    ]
+
+    all_indexes = indexes_can_append + indexes_cannot_append_with_other
+
+    @pytest.mark.parametrize("index",
+                             all_indexes,
+                             ids=lambda x: x.__class__.__name__)
+    def test_append_same_columns_type(self, index):
+        # GH18359
+
+        # df wider than ser
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
+        ser_index = index[:2]
+        ser = pd.Series([7, 8], index=ser_index, name=2)
+        result = df.append(ser)
+        expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]],
+                                index=[0, 1, 2],
+                                columns=index)
+        assert_frame_equal(result, expected)
+
+        # ser wider than df
+        ser_index = index
+        index = index[:2]
+        df = pd.DataFrame([[1, 2], [4, 5]], columns=index)
+        ser = pd.Series([7, 8, 9], index=ser_index, name=2)
+        result = df.append(ser)
+        expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
+                                index=[0, 1, 2],
+                                columns=ser_index)
+        assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("df_columns, series_index",
+                             combinations(indexes_can_append, r=2),
+                             ids=lambda x: x.__class__.__name__)
+    def test_append_different_columns_types(self, df_columns, series_index):
+        # GH18359
+        # See also test 'test_append_different_columns_types_raises' below
+        # for errors raised when appending
+
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
+        ser = pd.Series([7, 8, 9], index=series_index, name=2)
+
+        result = df.append(ser)
+        idx_diff = ser.index.difference(df_columns)
+        combined_columns = Index(df_columns.tolist()).append(idx_diff)
+        expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan],
+                                 [4, 5, 6, np.nan, np.nan, np.nan],
+                                 [np.nan, np.nan, np.nan, 7, 8, 9]],
+                                index=[0, 1, 2],
+                                columns=combined_columns)
+        assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "index_can_append, index_cannot_append_with_other",
+        product(indexes_can_append, indexes_cannot_append_with_other),
+        ids=lambda x: x.__class__.__name__)
+    def test_append_different_columns_types_raises(
+            self, index_can_append, index_cannot_append_with_other):
+        # GH18359
+        # Dataframe.append will raise if IntervalIndex/MultiIndex appends
+        # or is appended to a different index type
+        #
+        # See also test 'test_append_different_columns_types' above for
+        # appending without raising.
+
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
+        ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other,
+                        name=2)
+        with pytest.raises(TypeError):
+            df.append(ser)
+
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
+                          columns=index_cannot_append_with_other)
+        ser = pd.Series([7, 8, 9], index=index_can_append, name=2)
+        with pytest.raises(TypeError):
+            df.append(ser)
+
     def test_append_dtype_coerce(self):
 
         # GH 4993
         # appending with datetime will incorrectly convert datetime64
-        import datetime as dt
-        from pandas import NaT
 
         df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
                                             dt.datetime(2013, 1, 2, 0, 0)],
@@ -845,7 +937,9 @@ def test_append_dtype_coerce(self):
                                              dt.datetime(2013, 1, 4, 7, 10)]],
                         columns=['start_time', 'end_time'])
 
-        expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10),
+        expected = concat([Series([pd.NaT,
+                                   pd.NaT,
+                                   dt.datetime(2013, 1, 3, 6, 10),
                                    dt.datetime(2013, 1, 4, 7, 10)],
                                   name='end_time'),
                            Series([dt.datetime(2013, 1, 1, 0, 0),
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -1540,12 +1540,14 @@ def test_crosstab_normalize(self):
                                           index=pd.Index([1, 2, 'All'],
                                                          name='a',
                                                          dtype='object'),
-                                          columns=pd.Index([3, 4], name='b'))
+                                          columns=pd.Index([3, 4], name='b',
+                                                           dtype='object'))
         col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
                                           index=pd.Index([1, 2], name='a',
                                                          dtype='object'),
                                           columns=pd.Index([3, 4, 'All'],
-                                                           name='b'))
+                                                           name='b',
+                                                           dtype='object'))
 
         all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
                                            [0.2, 0.6, 0.8],
@@ -1554,7 +1556,8 @@ def test_crosstab_normalize(self):
                                                          name='a',
                                                          dtype='object'),
                                           columns=pd.Index([3, 4, 'All'],
-                                                           name='b'))
+                                                           name='b',
+                                                           dtype='object'))
         tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
                                           margins=True), row_normal_margins)
         tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',