Skip to content

Commit 3e691a4

Browse files
topper-123 authored and jreback committed
ENH: DataFrame.append preserves columns dtype if possible (#19021)
1 parent 78fee04 commit 3e691a4

File tree

4 files changed: +109 -8 lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -409,6 +409,7 @@ Other Enhancements
409409
- :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`)
410410
- :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`)
411411
- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`)
412+
- :meth:`DataFrame.append` can now preserve the type of the calling dataframe's columns in more cases (e.g. if both are ``CategoricalIndex``) (:issue:`18359`)
412413
- :meth:`DataFrame.to_json` and :meth:`Series.to_json` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`)
413414
- ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`)
414415
- ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories`

pandas/core/frame.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -6138,8 +6138,11 @@ def append(self, other, ignore_index=False, verify_integrity=False):
61386138
# index name will be reset
61396139
index = Index([other.name], name=self.index.name)
61406140

6141-
combined_columns = self.columns.tolist() + self.columns.union(
6142-
other.index).difference(self.columns).tolist()
6141+
idx_diff = other.index.difference(self.columns)
6142+
try:
6143+
combined_columns = self.columns.append(idx_diff)
6144+
except TypeError:
6145+
combined_columns = self.columns.astype(object).append(idx_diff)
61436146
other = other.reindex(combined_columns, copy=False)
61446147
other = DataFrame(other.values.reshape((1, len(other))),
61456148
index=index,

pandas/tests/reshape/test_concat.py

+97-3
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
from warnings import catch_warnings
2+
from itertools import combinations, product
23

4+
import datetime as dt
35
import dateutil
46
import numpy as np
57
from numpy.random import randn
@@ -829,12 +831,102 @@ def test_append_preserve_index_name(self):
829831
result = df1.append(df2)
830832
assert result.index.name == 'A'
831833

834+
indexes_can_append = [
835+
pd.RangeIndex(3),
836+
pd.Index([4, 5, 6]),
837+
pd.Index([4.5, 5.5, 6.5]),
838+
pd.Index(list('abc')),
839+
pd.CategoricalIndex('A B C'.split()),
840+
pd.CategoricalIndex('D E F'.split(), ordered=True),
841+
pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0),
842+
dt.datetime(2013, 1, 3, 6, 10),
843+
dt.datetime(2013, 1, 3, 7, 12)]),
844+
]
845+
846+
indexes_cannot_append_with_other = [
847+
pd.IntervalIndex.from_breaks([0, 1, 2, 3]),
848+
pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]),
849+
]
850+
851+
all_indexes = indexes_can_append + indexes_cannot_append_with_other
852+
853+
@pytest.mark.parametrize("index",
854+
all_indexes,
855+
ids=lambda x: x.__class__.__name__)
856+
def test_append_same_columns_type(self, index):
857+
# GH18359
858+
859+
# df wider than ser
860+
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
861+
ser_index = index[:2]
862+
ser = pd.Series([7, 8], index=ser_index, name=2)
863+
result = df.append(ser)
864+
expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]],
865+
index=[0, 1, 2],
866+
columns=index)
867+
assert_frame_equal(result, expected)
868+
869+
# ser wider than df
870+
ser_index = index
871+
index = index[:2]
872+
df = pd.DataFrame([[1, 2], [4, 5]], columns=index)
873+
ser = pd.Series([7, 8, 9], index=ser_index, name=2)
874+
result = df.append(ser)
875+
expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
876+
index=[0, 1, 2],
877+
columns=ser_index)
878+
assert_frame_equal(result, expected)
879+
880+
@pytest.mark.parametrize("df_columns, series_index",
881+
combinations(indexes_can_append, r=2),
882+
ids=lambda x: x.__class__.__name__)
883+
def test_append_different_columns_types(self, df_columns, series_index):
884+
# GH18359
885+
# See also test 'test_append_different_columns_types_raises' below
886+
# for errors raised when appending
887+
888+
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
889+
ser = pd.Series([7, 8, 9], index=series_index, name=2)
890+
891+
result = df.append(ser)
892+
idx_diff = ser.index.difference(df_columns)
893+
combined_columns = Index(df_columns.tolist()).append(idx_diff)
894+
expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan],
895+
[4, 5, 6, np.nan, np.nan, np.nan],
896+
[np.nan, np.nan, np.nan, 7, 8, 9]],
897+
index=[0, 1, 2],
898+
columns=combined_columns)
899+
assert_frame_equal(result, expected)
900+
901+
@pytest.mark.parametrize(
902+
"index_can_append, index_cannot_append_with_other",
903+
product(indexes_can_append, indexes_cannot_append_with_other),
904+
ids=lambda x: x.__class__.__name__)
905+
def test_append_different_columns_types_raises(
906+
self, index_can_append, index_cannot_append_with_other):
907+
# GH18359
908+
# DataFrame.append will raise if IntervalIndex/MultiIndex appends
909+
# or is appended to a different index type
910+
#
911+
# See also test 'test_append_different_columns_types' above for
912+
# appending without raising.
913+
914+
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
915+
ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other,
916+
name=2)
917+
with pytest.raises(TypeError):
918+
df.append(ser)
919+
920+
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
921+
columns=index_cannot_append_with_other)
922+
ser = pd.Series([7, 8, 9], index=index_can_append, name=2)
923+
with pytest.raises(TypeError):
924+
df.append(ser)
925+
832926
def test_append_dtype_coerce(self):
833927

834928
# GH 4993
835929
# appending with datetime will incorrectly convert datetime64
836-
import datetime as dt
837-
from pandas import NaT
838930

839931
df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
840932
dt.datetime(2013, 1, 2, 0, 0)],
@@ -845,7 +937,9 @@ def test_append_dtype_coerce(self):
845937
dt.datetime(2013, 1, 4, 7, 10)]],
846938
columns=['start_time', 'end_time'])
847939

848-
expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10),
940+
expected = concat([Series([pd.NaT,
941+
pd.NaT,
942+
dt.datetime(2013, 1, 3, 6, 10),
849943
dt.datetime(2013, 1, 4, 7, 10)],
850944
name='end_time'),
851945
Series([dt.datetime(2013, 1, 1, 0, 0),

pandas/tests/reshape/test_pivot.py

+6-3
Original file line number | Diff line number | Diff line change
@@ -1540,12 +1540,14 @@ def test_crosstab_normalize(self):
15401540
index=pd.Index([1, 2, 'All'],
15411541
name='a',
15421542
dtype='object'),
1543-
columns=pd.Index([3, 4], name='b'))
1543+
columns=pd.Index([3, 4], name='b',
1544+
dtype='object'))
15441545
col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
15451546
index=pd.Index([1, 2], name='a',
15461547
dtype='object'),
15471548
columns=pd.Index([3, 4, 'All'],
1548-
name='b'))
1549+
name='b',
1550+
dtype='object'))
15491551

15501552
all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
15511553
[0.2, 0.6, 0.8],
@@ -1554,7 +1556,8 @@ def test_crosstab_normalize(self):
15541556
name='a',
15551557
dtype='object'),
15561558
columns=pd.Index([3, 4, 'All'],
1557-
name='b'))
1559+
name='b',
1560+
dtype='object'))
15581561
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
15591562
margins=True), row_normal_margins)
15601563
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',

0 commit comments

Comments
 (0)