-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: DataFrame.append preserves columns dtype if possible #19021
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b3e8b93
093cd2c
4cb8a5b
ed895a4
fd1e7a5
13537ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
from warnings import catch_warnings | ||
from itertools import combinations, product | ||
|
||
import datetime as dt | ||
import dateutil | ||
import numpy as np | ||
from numpy.random import randn | ||
|
@@ -829,12 +831,102 @@ def test_append_preserve_index_name(self): | |
result = df1.append(df2) | ||
assert result.index.name == 'A' | ||
|
||
indexes_can_append = [ | ||
pd.RangeIndex(3), | ||
pd.Index([4, 5, 6]), | ||
pd.Index([4.5, 5.5, 6.5]), | ||
pd.Index(list('abc')), | ||
pd.CategoricalIndex('A B C'.split()), | ||
pd.CategoricalIndex('D E F'.split(), ordered=True), | ||
pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), | ||
dt.datetime(2013, 1, 3, 6, 10), | ||
dt.datetime(2013, 1, 3, 7, 12)]), | ||
] | ||
|
||
indexes_cannot_append_with_other = [ | ||
pd.IntervalIndex.from_breaks([0, 1, 2, 3]), | ||
pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), | ||
] | ||
|
||
all_indexes = indexes_can_append + indexes_cannot_append_with_other | ||
|
||
@pytest.mark.parametrize("index", | ||
all_indexes, | ||
ids=lambda x: x.__class__.__name__) | ||
def test_append_same_columns_type(self, index): | ||
# GH18359 | ||
|
||
# df wider than ser | ||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) | ||
ser_index = index[:2] | ||
ser = pd.Series([7, 8], index=ser_index, name=2) | ||
result = df.append(ser) | ||
expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]], | ||
index=[0, 1, 2], | ||
columns=index) | ||
assert_frame_equal(result, expected) | ||
|
||
# ser wider than df | ||
ser_index = index | ||
index = index[:2] | ||
df = pd.DataFrame([[1, 2], [4, 5]], columns=index) | ||
ser = pd.Series([7, 8, 9], index=ser_index, name=2) | ||
result = df.append(ser) | ||
expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], | ||
index=[0, 1, 2], | ||
columns=ser_index) | ||
assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("df_columns, series_index", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can just use 2 parameterize here, one for df_columns and one for series_index; you don't need combinations, pytest already does this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't get this approach to work, AFAIKS, parametrize doesn't do this for you, and I've googled it without results. Can you give an example? |
||
combinations(indexes_can_append, r=2), | ||
ids=lambda x: x.__class__.__name__) | ||
def test_append_different_columns_types(self, df_columns, series_index): | ||
# GH18359 | ||
# See also test 'test_append_different_columns_types_raises' below | ||
# for errors raised when appending | ||
|
||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) | ||
ser = pd.Series([7, 8, 9], index=series_index, name=2) | ||
|
||
result = df.append(ser) | ||
idx_diff = ser.index.difference(df_columns) | ||
combined_columns = Index(df_columns.tolist()).append(idx_diff) | ||
expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan], | ||
[4, 5, 6, np.nan, np.nan, np.nan], | ||
[np.nan, np.nan, np.nan, 7, 8, 9]], | ||
index=[0, 1, 2], | ||
columns=combined_columns) | ||
assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"index_can_append, index_cannot_append_with_other", | ||
product(indexes_can_append, indexes_cannot_append_with_other), | ||
ids=lambda x: x.__class__.__name__) | ||
def test_append_different_columns_types_raises( | ||
self, index_can_append, index_cannot_append_with_other): | ||
# GH18359 | ||
# Dataframe.append will raise if IntervalIndex/MultiIndex appends | ||
# or is appended to a different index type | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @topper-123 can you help me understand why raising is the correct thing to do here? i have a branch that fixes a Series construction bug:
but the fix for that (in MultiIndex.reindex) causes this test to fail |
||
# | ||
# See also test 'test_append_different_columns_types' above for | ||
# appending without raising. | ||
|
||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) | ||
ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, | ||
name=2) | ||
with pytest.raises(TypeError): | ||
df.append(ser) | ||
|
||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], | ||
columns=index_cannot_append_with_other) | ||
ser = pd.Series([7, 8, 9], index=index_can_append, name=2) | ||
with pytest.raises(TypeError): | ||
df.append(ser) | ||
|
||
def test_append_dtype_coerce(self): | ||
|
||
# GH 4993 | ||
# appending with datetime will incorrectly convert datetime64 | ||
import datetime as dt | ||
from pandas import NaT | ||
|
||
df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), | ||
dt.datetime(2013, 1, 2, 0, 0)], | ||
|
@@ -845,7 +937,9 @@ def test_append_dtype_coerce(self): | |
dt.datetime(2013, 1, 4, 7, 10)]], | ||
columns=['start_time', 'end_time']) | ||
|
||
expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), | ||
expected = concat([Series([pd.NaT, | ||
pd.NaT, | ||
dt.datetime(2013, 1, 3, 6, 10), | ||
dt.datetime(2013, 1, 4, 7, 10)], | ||
name='end_time'), | ||
Series([dt.datetime(2013, 1, 1, 0, 0), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
so this should not be done here, rather if a TypeError happens, then
Index.append
handles.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand you here, could you expand.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
.append`` should never raise
so you are catching something that does not happen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The below fails, though:
This is what is tested in the
if not isinstance(df_columns, (pd.IntervalIndex, pd.MultiIndex)):
part of the tests. In addition, the reason why pd.IntervalIndex and pd.MultiIndex are excluded for testing there is because of reindexing issues:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hmm, the MI case we did on purpose. However you cannot convert to lists and expect things to work. for the TypeError, do
.astype(object)
on each left and right then.append
again. we never should coerce to lists.