Skip to content

Commit c4da79b

Browse files
brycepgTomAugspurger
authored and committed
Stop concat from attempting to sort mismatched columns by default (pandas-dev#20613)
* Stop concat from attempting to sort mismatched columns by default Preserve column order upon concatenation to obey least astonishment principle. Allow old behavior to be enabled by adding a boolean switch to concat and DataFrame.append, mismatch_sort, which is by default disabled. Closes pandas-dev#4588
1 parent 93e7123 commit c4da79b

File tree

19 files changed

+400
-91
lines changed

19 files changed

+400
-91
lines changed

doc/source/merging.rst

+16-8
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,10 @@ Set logic on the other axes
153153
~~~~~~~~~~~~~~~~~~~~~~~~~~~
154154

155155
When gluing together multiple DataFrames, you have a choice of how to handle
156-
the other axes (other than the one being concatenated). This can be done in
156+
the other axes (other than the one being concatenated). This can be done in
157157
the following three ways:
158158

159-
- Take the (sorted) union of them all, ``join='outer'``. This is the default
159+
- Take the union of them all, ``join='outer'``. This is the default
160160
option as it results in zero information loss.
161161
- Take the intersection, ``join='inner'``.
162162
- Use a specific index, as passed to the ``join_axes`` argument.
@@ -167,10 +167,10 @@ behavior:
167167
.. ipython:: python
168168
169169
df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],
170-
'D': ['D2', 'D3', 'D6', 'D7'],
171-
'F': ['F2', 'F3', 'F6', 'F7']},
172-
index=[2, 3, 6, 7])
173-
result = pd.concat([df1, df4], axis=1)
170+
'D': ['D2', 'D3', 'D6', 'D7'],
171+
'F': ['F2', 'F3', 'F6', 'F7']},
172+
index=[2, 3, 6, 7])
173+
result = pd.concat([df1, df4], axis=1, sort=False)
174174
175175
176176
.. ipython:: python
@@ -181,8 +181,16 @@ behavior:
181181
labels=['df1', 'df4'], vertical=False);
182182
plt.close('all');
183183
184-
Note that the row indexes have been unioned and sorted. Here is the same thing
185-
with ``join='inner'``:
184+
.. warning::
185+
186+
.. versionchanged:: 0.23.0
187+
188+
The default behavior with ``join='outer'`` is to sort the other axis
189+
(the row index in this case). In a future version of pandas, the default will
190+
be to not sort. We specified ``sort=False`` to opt in to the new
191+
behavior now.
192+
193+
Here is the same thing with ``join='inner'``:
186194

187195
.. ipython:: python
188196

doc/source/whatsnew/v0.23.0.txt

+30
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,36 @@ Returning a ``Series`` allows one to control the exact return structure and colu
694694

695695
df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1)
696696

697+
.. _whatsnew_0230.api_breaking.concat:
698+
699+
Concatenation will no longer sort
700+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
701+
702+
In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned.
703+
The current behavior is the same as the previous (sorting), but now a warning is issued when ``sort`` is not specified and the non-concatenation axis is not aligned (:issue:`4588`).
704+
705+
.. ipython:: python
706+
:okwarning:
707+
708+
df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
709+
df2 = pd.DataFrame({"a": [4, 5]})
710+
711+
pd.concat([df1, df2])
712+
713+
To keep the previous behavior (sorting) and silence the warning, pass ``sort=True``
714+
715+
.. ipython:: python
716+
717+
pd.concat([df1, df2], sort=True)
718+
719+
To accept the future behavior (no sorting), pass ``sort=False``
720+
721+
.. ipython:: python
722+
723+
pd.concat([df1, df2], sort=False)
724+
725+
Note that this change also applies to :meth:`DataFrame.append`, which has also received a ``sort`` keyword for controlling this behavior.
726+
697727

698728
.. _whatsnew_0230.api_breaking.build_changes:
699729

pandas/_libs/lib.pyx

+6-5
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):
157157

158158
@cython.wraparound(False)
159159
@cython.boundscheck(False)
160-
def fast_unique_multiple_list(list lists):
160+
def fast_unique_multiple_list(list lists, bint sort=True):
161161
cdef:
162162
list buf
163163
Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
174174
if val not in table:
175175
table[val] = stub
176176
uniques.append(val)
177-
try:
178-
uniques.sort()
179-
except Exception:
180-
pass
177+
if sort:
178+
try:
179+
uniques.sort()
180+
except Exception:
181+
pass
181182

182183
return uniques
183184

pandas/core/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ def is_any_frame():
507507
for r in compat.itervalues(result))
508508

509509
if isinstance(result, list):
510-
return concat(result, keys=keys, axis=1), True
510+
return concat(result, keys=keys, axis=1, sort=True), True
511511

512512
elif is_any_frame():
513513
# we have a dict of DataFrames

pandas/core/frame.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -6073,7 +6073,8 @@ def infer(x):
60736073
# ----------------------------------------------------------------------
60746074
# Merging / joining methods
60756075

6076-
def append(self, other, ignore_index=False, verify_integrity=False):
6076+
def append(self, other, ignore_index=False,
6077+
verify_integrity=False, sort=None):
60776078
"""
60786079
Append rows of `other` to the end of this frame, returning a new
60796080
object. Columns not in this frame are added as new columns.
@@ -6086,6 +6087,14 @@ def append(self, other, ignore_index=False, verify_integrity=False):
60866087
If True, do not use the index labels.
60876088
verify_integrity : boolean, default False
60886089
If True, raise ValueError on creating index with duplicates.
6090+
sort : boolean, default None
6091+
Sort columns if the columns of `self` and `other` are not aligned.
6092+
The default sorting is deprecated and will change to not-sorting
6093+
in a future version of pandas. Explicitly pass ``sort=True`` to
6094+
silence the warning and sort. Explicitly pass ``sort=False`` to
6095+
silence the warning and not sort.
6096+
6097+
.. versionadded:: 0.23.0
60896098
60906099
Returns
60916100
-------
@@ -6197,7 +6206,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
61976206
else:
61986207
to_concat = [self, other]
61996208
return concat(to_concat, ignore_index=ignore_index,
6200-
verify_integrity=verify_integrity)
6209+
verify_integrity=verify_integrity,
6210+
sort=sort)
62016211

62026212
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
62036213
sort=False):
@@ -7516,7 +7526,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
75167526
from pandas.core.index import _get_objs_combined_axis
75177527

75187528
if columns is None:
7519-
columns = _get_objs_combined_axis(data)
7529+
columns = _get_objs_combined_axis(data, sort=False)
75207530

75217531
indexer_cache = {}
75227532

pandas/core/groupby/groupby.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1101,7 +1101,8 @@ def reset_identity(values):
11011101
group_names = self.grouper.names
11021102

11031103
result = concat(values, axis=self.axis, keys=group_keys,
1104-
levels=group_levels, names=group_names)
1104+
levels=group_levels, names=group_names,
1105+
sort=False)
11051106
else:
11061107

11071108
# GH5610, returns a MI, with the first level being a

pandas/core/indexes/api.py

+40-13
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import textwrap
2+
import warnings
3+
14
from pandas.core.indexes.base import (Index,
25
_new_Index,
36
_ensure_index,
@@ -17,6 +20,16 @@
1720
from pandas._libs import lib
1821
from pandas._libs.tslib import NaT
1922

23+
_sort_msg = textwrap.dedent("""\
24+
Sorting because non-concatenation axis is not aligned. A future version
25+
of pandas will change to not sort by default.
26+
27+
To accept the future behavior, pass 'sort=False'.
28+
29+
To retain the current behavior and silence the warning, pass 'sort=True'.
30+
""")
31+
32+
2033
# TODO: there are many places that rely on these private methods existing in
2134
# pandas.core.index
2235
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -31,33 +44,40 @@
3144
'_all_indexes_same']
3245

3346

34-
def _get_objs_combined_axis(objs, intersect=False, axis=0):
47+
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
3548
# Extract combined index: return intersection or union (depending on the
3649
# value of "intersect") of indexes on given axis, or None if all objects
3750
# lack indexes (e.g. they are numpy arrays)
3851
obs_idxes = [obj._get_axis(axis) for obj in objs
3952
if hasattr(obj, '_get_axis')]
4053
if obs_idxes:
41-
return _get_combined_index(obs_idxes, intersect=intersect)
54+
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
4255

4356

44-
def _get_combined_index(indexes, intersect=False):
57+
def _get_combined_index(indexes, intersect=False, sort=False):
4558
# TODO: handle index names!
4659
indexes = com._get_distinct_objs(indexes)
4760
if len(indexes) == 0:
48-
return Index([])
49-
if len(indexes) == 1:
50-
return indexes[0]
51-
if intersect:
61+
index = Index([])
62+
elif len(indexes) == 1:
63+
index = indexes[0]
64+
elif intersect:
5265
index = indexes[0]
5366
for other in indexes[1:]:
5467
index = index.intersection(other)
55-
return index
56-
union = _union_indexes(indexes)
57-
return _ensure_index(union)
68+
else:
69+
index = _union_indexes(indexes, sort=sort)
70+
index = _ensure_index(index)
71+
72+
if sort:
73+
try:
74+
index = index.sort_values()
75+
except TypeError:
76+
pass
77+
return index
5878

5979

60-
def _union_indexes(indexes):
80+
def _union_indexes(indexes, sort=True):
6181
if len(indexes) == 0:
6282
raise AssertionError('Must have at least 1 Index to union')
6383
if len(indexes) == 1:
@@ -74,7 +94,8 @@ def conv(i):
7494
i = i.tolist()
7595
return i
7696

77-
return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
97+
return Index(
98+
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
7899

79100
if kind == 'special':
80101
result = indexes[0]
@@ -89,13 +110,19 @@ def conv(i):
89110
index = indexes[0]
90111
for other in indexes[1:]:
91112
if not index.equals(other):
113+
114+
if sort is None:
115+
# TODO: remove once pd.concat sort default changes
116+
warnings.warn(_sort_msg, FutureWarning, stacklevel=8)
117+
sort = True
118+
92119
return _unique_indices(indexes)
93120

94121
name = _get_consensus_names(indexes)[0]
95122
if name != index.name:
96123
index = index._shallow_copy(name=name)
97124
return index
98-
else:
125+
else: # kind='list'
99126
return _unique_indices(indexes)
100127

101128

pandas/core/panel.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1499,8 +1499,11 @@ def _extract_axis(self, data, axis=0, intersect=False):
14991499
raw_lengths.append(v.shape[axis])
15001500

15011501
if have_frames:
1502+
# we want the "old" behavior here, of sorting only if
1503+
# 1. we're doing a union (intersect=False)
1504+
# 2. the indices are not aligned.
15021505
index = _get_objs_combined_axis(data.values(), axis=axis,
1503-
intersect=intersect)
1506+
intersect=intersect, sort=None)
15041507

15051508
if have_raw_arrays:
15061509
lengths = list(set(raw_lengths))

pandas/core/reshape/concat.py

+20-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
2222
keys=None, levels=None, names=None, verify_integrity=False,
23-
copy=True):
23+
sort=None, copy=True):
2424
"""
2525
Concatenate pandas objects along a particular axis with optional set logic
2626
along the other axes.
@@ -60,6 +60,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
6060
verify_integrity : boolean, default False
6161
Check whether the new concatenated axis contains duplicates. This can
6262
be very expensive relative to the actual data concatenation
63+
sort : boolean, default None
64+
Sort non-concatenation axis if it is not already aligned when `join`
65+
is 'outer'. The current default of sorting is deprecated and will
66+
change to not-sorting in a future version of pandas.
67+
68+
Explicitly pass ``sort=True`` to silence the warning and sort.
69+
Explicitly pass ``sort=False`` to silence the warning and not sort.
70+
71+
This has no effect when ``join='inner'``, which already preserves
72+
the order of the non-concatenation axis.
73+
74+
.. versionadded:: 0.23.0
75+
6376
copy : boolean, default True
6477
If False, do not copy data unnecessarily
6578
@@ -209,7 +222,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
209222
ignore_index=ignore_index, join=join,
210223
keys=keys, levels=levels, names=names,
211224
verify_integrity=verify_integrity,
212-
copy=copy)
225+
copy=copy, sort=sort)
213226
return op.get_result()
214227

215228

@@ -220,7 +233,8 @@ class _Concatenator(object):
220233

221234
def __init__(self, objs, axis=0, join='outer', join_axes=None,
222235
keys=None, levels=None, names=None,
223-
ignore_index=False, verify_integrity=False, copy=True):
236+
ignore_index=False, verify_integrity=False, copy=True,
237+
sort=False):
224238
if isinstance(objs, (NDFrame, compat.string_types)):
225239
raise TypeError('first argument must be an iterable of pandas '
226240
'objects, you passed an object of type '
@@ -355,6 +369,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
355369
self.keys = keys
356370
self.names = names or getattr(keys, 'names', None)
357371
self.levels = levels
372+
self.sort = sort
358373

359374
self.ignore_index = ignore_index
360375
self.verify_integrity = verify_integrity
@@ -447,7 +462,8 @@ def _get_comb_axis(self, i):
447462
data_axis = self.objs[0]._get_block_manager_axis(i)
448463
try:
449464
return _get_objs_combined_axis(self.objs, axis=data_axis,
450-
intersect=self.intersect)
465+
intersect=self.intersect,
466+
sort=self.sort)
451467
except IndexError:
452468
types = [type(x).__name__ for x in self.objs]
453469
raise TypeError("Cannot concatenate list of {types}"

pandas/core/reshape/pivot.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,8 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
449449
rownames = _get_names(index, rownames, prefix='row')
450450
colnames = _get_names(columns, colnames, prefix='col')
451451

452-
common_idx = _get_objs_combined_axis(index + columns, intersect=True)
452+
common_idx = _get_objs_combined_axis(index + columns, intersect=True,
453+
sort=False)
453454

454455
data = {}
455456
data.update(zip(rownames, index))

pandas/tests/frame/test_combine_concat.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_append_series_dict(self):
9696

9797
result = df.append(series[::-1][:3], ignore_index=True)
9898
expected = df.append(DataFrame({0: series[::-1][:3]}).T,
99-
ignore_index=True)
99+
ignore_index=True, sort=True)
100100
assert_frame_equal(result, expected.loc[:, result.columns])
101101

102102
# can append when name set
@@ -119,8 +119,8 @@ def test_append_list_of_series_dicts(self):
119119
# different columns
120120
dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
121121
{'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
122-
result = df.append(dicts, ignore_index=True)
123-
expected = df.append(DataFrame(dicts), ignore_index=True)
122+
result = df.append(dicts, ignore_index=True, sort=True)
123+
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
124124
assert_frame_equal(result, expected)
125125

126126
def test_append_empty_dataframe(self):

0 commit comments

Comments
 (0)