Skip to content

[WIP] API/CLN: Refactor DataFrame.append #22915

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
96d8933
Add test cases
Sep 14, 2018
fbd41eb
Rewrite `DataFrame.append`
Sep 14, 2018
5db58d8
Fix test case
Sep 14, 2018
788c685
Rewrite the sorting behaviour
Sep 15, 2018
b3059aa
Let dtypes keep the same if result has one row
Sep 15, 2018
1fe121f
Add check for ignore_index
Sep 15, 2018
945e96b
Fix _append_dict behaviour
Sep 16, 2018
8c16f53
More sort mechanisms
Sep 16, 2018
6fcf546
Make test catch more exceptions
Sep 16, 2018
734f2e3
Performance fix
Sep 16, 2018
1ad6e82
Add test case
Sep 17, 2018
dab0730
missing space
Sep 17, 2018
3314bc5
Write TypeError messages for append
Sep 17, 2018
96153c3
Write TypeError split for mixed lists
Sep 17, 2018
facafb8
Preserve name of index in result
Sep 17, 2018
ecc2467
Preserve columns index name
Sep 19, 2018
583de41
Create/Isolate new tests
Sep 25, 2018
0d4a832
Implement append reindexing code
Sep 26, 2018
c07e84c
Convert result columns to specific dtypes
Sep 27, 2018
cb28274
Fix _normalize_dataframes dtypes problem
Sep 28, 2018
6cf42b4
Add named index to test cases
Sep 28, 2018
adc6a2f
Ignore empty columns dtype on append
Sep 29, 2018
87cc878
Remove sort=None handling
Sep 29, 2018
eee2dae
Modify behavior of append on duplicates
Sep 29, 2018
5dfe32e
Merge pull request #7 from araraonline/duplicate
Sep 29, 2018
b373368
clean lines
Sep 30, 2018
4b32c64
Fix indexes.api._reindex
Sep 30, 2018
2617449
clean test
Sep 30, 2018
c1e8e0f
Fix small error: 'pd' is not defined
Oct 1, 2018
a8f03b7
Ignore empty indexes in _merge_index_list
Oct 1, 2018
4a54942
Implement sort=None behavior (#8)
Oct 1, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 140 additions & 33 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6338,45 +6338,152 @@ def append(self, other, ignore_index=False,
3 3
4 4
"""
if isinstance(other, (Series, dict)):
if isinstance(other, dict):
other = Series(other)
if other.name is None and not ignore_index:
kwargs = {
'ignore_index': ignore_index,
'verify_integrity': verify_integrity,
'sort': sort,
}

obj_type = type(other)
kwargs['_obj_type'] = obj_type
if issubclass(obj_type, dict):
return self._append_dict(other, **kwargs)
elif issubclass(obj_type, Series):
return self._append_series(other, **kwargs)
elif issubclass(obj_type, DataFrame):
return self._append_frame(other, **kwargs)
elif issubclass(obj_type, list):

try:
item_type = type(other[0])
except IndexError: # empty list!
return self._append_list_of_frames(other, **kwargs)
if not all(isinstance(i, item_type) for i in other[1:]):
if issubclass(item_type, (dict, Series, DataFrame)):
raise TypeError("When other is a list, its elements must"
" be all of the same type")
else:
raise TypeError("The value of other must be a"
" DataFrame or Series/dict-like object,"
" or list of these")
kwargs['_item_type'] = item_type

if issubclass(item_type, dict):
return self._append_list_of_dicts(other, **kwargs)
elif issubclass(item_type, Series):
return self._append_list_of_series(other, **kwargs)
elif issubclass(item_type, DataFrame):
return self._append_list_of_frames(other, **kwargs)
else:
raise TypeError("The value of other must be a"
" DataFrame or Series/dict-like object,"
" or list of these")
else:
raise TypeError("The value of other must be a"
" DataFrame or Series/dict-like object,"
" or list of these")

def _append_dict(self, other, *args, **kwargs):
    """
    Append a single dict by treating it as a one-element list of dicts.

    All extra positional and keyword arguments are forwarded unchanged to
    ``_append_list_of_dicts``, whose result is returned.
    """
    as_list = [other]
    return self._append_list_of_dicts(as_list, *args, **kwargs)

def _append_series(self, other, *args, **kwargs):
    """
    Append a single Series by treating it as a one-element list of Series.

    All extra positional and keyword arguments are forwarded unchanged to
    ``_append_list_of_series``, whose result is returned.
    """
    as_list = [other]
    return self._append_list_of_series(as_list, *args, **kwargs)

def _append_frame(self, other, *args, **kwargs):
    """
    Append a single frame by treating it as a one-element list of frames.

    All extra positional and keyword arguments are forwarded unchanged to
    ``_append_list_of_frames``, whose result is returned.
    """
    as_list = [other]
    return self._append_list_of_frames(as_list, *args, **kwargs)

def _append_list_of_dicts(self, other, *args, **kwargs):
    """
    Append a list of dicts by first building a DataFrame from it.

    The DataFrame built from ``other`` is handed to ``_append_frame``
    together with the untouched ``*args`` and ``**kwargs``.

    Raises
    ------
    TypeError
        If ``kwargs['ignore_index']`` is falsy.
    """
    if kwargs['ignore_index']:
        frame = DataFrame(other)
        return self._append_frame(frame, *args, **kwargs)
    raise TypeError('Can only append a dict if ignore_index=True')

# Append a list of Series to this frame: the list is turned into a DataFrame
# (``df``) and handed to ``_append_frame`` together with ``*args``/``**kwargs``.
#
# Raises TypeError when ``kwargs['ignore_index']`` is falsy and any Series in
# ``other`` has ``name`` set to None.
#
# NOTE(review): this text carries no indentation and no +/- diff markers.
# Several statements below use ``other`` as a single Series (``other.name``,
# ``other.index``, ``other.reindex``) while the statements around them index
# and iterate it as a list -- presumably those are pre-refactor lines shown by
# the diff view; confirm against the real file before relying on them.
def _append_list_of_series(self, other, *args, **kwargs):
# Unnamed Series are only accepted when the resulting index is discarded.
if not kwargs['ignore_index']:
if any(series.name is None for series in other):
raise TypeError('Can only append a Series if ignore_index=True'
' or if the Series has a name')

# NOTE(review): ``other.name`` treats ``other`` as one Series, not a list --
# looks like removed code; verify.
if other.name is None:
index = None
else:
# other must have the same index name as self, otherwise
# index name will be reset
index = Index([other.name], name=self.index.name)
# Single Series: build a one-row frame directly from its values, using the
# Series name as the row label and the Series index as the column labels.
if len(other) == 1:
# manually create DF for performance
ser = other[0]
df = DataFrame(ser.values.reshape(1, ser.shape[0]),
index=[ser.name], columns=ser.index)
else:
# Several Series: let the DataFrame constructor assemble them.
df = DataFrame(other)

# NOTE(review): everything from here down to the final ``return`` again uses
# ``other`` as a single Series / rebinds ``other`` and ``self`` and never
# touches ``df`` -- presumably removed lines from the diff; verify.
idx_diff = other.index.difference(self.columns)
try:
combined_columns = self.columns.append(idx_diff)
except TypeError:
# Fall back to object dtype when the two indexes cannot be appended as-is.
combined_columns = self.columns.astype(object).append(idx_diff)
other = other.reindex(combined_columns, copy=False)
other = DataFrame(other.values.reshape((1, len(other))),
index=index,
columns=combined_columns)
other = other._convert(datetime=True, timedelta=True)
if not self.columns.equals(combined_columns):
self = self.reindex(columns=combined_columns)
elif isinstance(other, list) and not isinstance(other[0], DataFrame):
other = DataFrame(other)
# Reorder ``other`` to follow ``self.columns`` when all its columns exist there.
if (self.columns.get_indexer(other.columns) >= 0).all():
other = other.loc[:, self.columns]
# Delegate the actual concatenation to the single-frame path.
return self._append_frame(df, *args, **kwargs)

# Append a list of DataFrames (``other``) to this frame and return the
# concatenated result.
#
# Reads ``ignore_index``, ``verify_integrity``, ``sort`` and ``_obj_type``
# from ``kwargs`` by subscript (a missing key raises KeyError); ``_item_type``
# is optional and is None when absent.
#
# NOTE(review): this text carries no indentation and no +/- diff markers, so
# the nesting of the statements below cannot be read off the text; confirm
# against the real file.
def _append_list_of_frames(self, other, *args, **kwargs):
ignore_index = kwargs['ignore_index']
verify_integrity = kwargs['verify_integrity']
sort = kwargs['sort']
_obj_type = kwargs['_obj_type']
_item_type = kwargs.get('_item_type')

# Imported locally inside the method rather than at module level.
from pandas.core.indexes.api import _normalize_dataframes
from pandas.core.reshape.concat import concat
# NOTE(review): the next seven lines build ``to_concat`` and return straight
# from ``concat``; if they were live code the logic below would not run.
# Presumably they are removed lines from the diff view -- verify.
if isinstance(other, (list, tuple)):
to_concat = [self] + other
else:
to_concat = [self, other]
return concat(to_concat, ignore_index=ignore_index,
verify_integrity=verify_integrity,
sort=sort)

# Sorting behavior when sort=None: pick a concrete ``sort`` value and decide
# whether a FutureWarning should be emitted.
# TODO: remove once the default value of the ``sort`` kwarg changes
if sort is None:
# establish the desired behavior
if _obj_type in (dict, Series):
# a single dict/Series was passed to append: never sort, never warn

sort = False
warn = False
elif _item_type in (dict, Series):
# a list of dicts / list of Series was passed to append

if (self.columns.get_indexer(other[0].columns) >= 0).all():
# every column of other[0] is already present in self.columns
sort = False
warn = False
else:
sort = True
# NOTE(review): ``find_common_type`` is not imported in this block --
# presumably available at module level; verify.
types = [df.columns.dtype for df in [self] + other]
common = find_common_type(types)
# warn only when the common dtype of the column indexes is object
warn = (common == object)
else:
# a DataFrame or a list of DataFrames was passed to append

if all(self.columns.equals(df.columns) for df in other):
# every frame has the same columns as self
sort = False
warn = False
else:
sort = True
types = [df.columns.dtype for df in [self] + other]
common = find_common_type(types)
# warn only when the common dtype of the column indexes is object
warn = (common == object)

# warn if necessary
# NOTE(review): ``warn`` is only assigned in the ``sort is None`` branches
# above; confirm this check is nested inside that branch in the real file.
if warn:
from pandas.core.indexes.api import _sort_msg
warnings.warn(_sort_msg, FutureWarning)

# The behavior of concat is a bit problematic as it is. To get around,
# we prepare the DataFrames before feeding them into concat.
to_concat = [self] + other
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
result = concat(to_concat_norm, ignore_index=ignore_index,
verify_integrity=verify_integrity, sort=sort)

# preserve the names of the base DataFrame's indexes
# XXX: how will this work with MultiIndex (?)
result.columns.name = self.columns.name
if not ignore_index:
result.index.name = self.index.name

# Reindexing the columns created an artificial float64 where it
# was not needed. We can convert the columns back to the expected
# type.
if result.shape[0] == 1:
# Take the dtypes from the first normalized frame that has exactly one row.
base_frame = next(df for df in to_concat_norm if df.shape[0] == 1)
dtypes = base_frame.dtypes.to_dict()
result = result.astype(dtypes) # won't work well with duplicate columns

return result

def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
Expand Down
Loading