From 913723bb59d4912d86854675c5424963628f0b85 Mon Sep 17 00:00:00 2001 From: Bryce Guinta Date: Wed, 4 Apr 2018 20:51:53 -0600 Subject: [PATCH 01/34] Stop concat from attempting to sort mismatched columns by default Preserve column order upon concatenation to obey least astonishment principle. Allow old behavior to be enabled by adding a boolean switch to concat and DataFrame.append, mismatch_sort, which is by default disabled. Close #4588 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/lib.pyx | 11 ++++----- pandas/core/frame.py | 8 +++++-- pandas/core/indexes/api.py | 13 ++++++----- pandas/core/reshape/concat.py | 13 +++++++---- pandas/tests/reshape/test_concat.py | 35 +++++++++++++++++++++++------ 6 files changed, 57 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a808b83119a40..c261891aa8897 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1160,6 +1160,7 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) +- Stop :func:`concat` and ``Dataframe.append`` from sorting columns by default. 
Use ``sort=True`` to retain old behavior (:issue:`4588`) Other ^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..ae9d240afcb93 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list(list lists): +def fast_unique_multiple_list(list lists, bint sort=True): cdef: list buf Py_ssize_t k = len(lists) @@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists): if val not in table: table[val] = stub uniques.append(val) - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f33ef9597f456..f82305ac3913a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5982,7 +5982,8 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False): + def append(self, other, ignore_index=False, + verify_integrity=False, sort=False): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. 
+ sort: boolean, default False + Sort columns if given object doesn't have the same columns Returns ------- @@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): else: to_concat = [self, other] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + sort=sort) def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 2e5ec8b554ce7..75232e3db7e55 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -31,17 +31,17 @@ '_all_indexes_same'] -def _get_objs_combined_axis(objs, intersect=False, axis=0): +def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): # Extract combined index: return intersection or union (depending on the # value of "intersect") of indexes on given axis, or None if all objects # lack indexes (e.g. they are numpy arrays) obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, '_get_axis')] if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect) + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_combined_index(indexes, intersect=False): +def _get_combined_index(indexes, intersect=False, sort=True): # TODO: handle index names! 
indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: @@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False): for other in indexes[1:]: index = index.intersection(other) return index - union = _union_indexes(indexes) + union = _union_indexes(indexes, sort=sort) return _ensure_index(union) -def _union_indexes(indexes): +def _union_indexes(indexes, sort=True): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') if len(indexes) == 1: @@ -74,7 +74,8 @@ def conv(i): i = i.tolist() return i - return Index(lib.fast_unique_multiple_list([conv(i) for i in inds])) + return Index( + lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) if kind == 'special': result = indexes[0] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 20f4384a3d698..3630edbcbf58f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -20,7 +20,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - copy=True): + sort=False, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -60,6 +60,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, verify_integrity : boolean, default False Check whether the new concatenated axis contains duplicates. 
This can be very expensive relative to the actual data concatenation + sort : boolean, default False + Sort columns if all passed object columns are not the same copy : boolean, default True If False, do not copy data unnecessarily @@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, verify_integrity=verify_integrity, - copy=copy) + copy=copy, sort=sort) return op.get_result() @@ -220,7 +222,8 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): + ignore_index=False, verify_integrity=False, copy=True, + sort=False): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.keys = keys self.names = names or getattr(keys, 'names', None) self.levels = levels + self.sort = sort self.ignore_index = ignore_index self.verify_integrity = verify_integrity @@ -447,7 +451,8 @@ def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: return _get_objs_combined_axis(self.objs, axis=data_axis, - intersect=self.intersect) + intersect=self.intersect, + sort=self.sort) except IndexError: types = [type(x).__name__ for x in self.objs] raise TypeError("Cannot concatenate list of {types}" diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 437b4179c580a..48260d90746c3 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -5,7 +5,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems, PY2 +from pandas.compat import StringIO, iteritems import pandas as pd from pandas import (DataFrame, 
concat, read_csv, isna, Series, date_range, @@ -852,8 +852,9 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) + name='start_time')], + axis=1, sort=True) + result = df1.append(df2, ignore_index=True, sort=True) assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self): @@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self): frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) - v1 = concat([frame1, frame2], axis=1, ignore_index=True) + v1 = concat([frame1, frame2], axis=1, + ignore_index=True, sort=True) nan = np.nan expected = DataFrame([[nan, nan, nan, 4.3], @@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self): # must reindex, #2603 s = Series(randn(3), index=['c', 'a', 'b'], name='A') s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1) + result = concat([s, s2], axis=1, sort=True) expected = DataFrame({'A': s, 'B': s2}) assert_frame_equal(result, expected) @@ -2070,8 +2072,6 @@ def test_concat_order(self): for i in range(100)] result = pd.concat(dfs).columns expected = dfs[0].columns - if PY2: - expected = expected.sort_values() tm.assert_index_equal(result, expected) def test_concat_datetime_timezone(self): @@ -2155,3 +2155,24 @@ def test_concat_empty_and_non_empty_series_regression(): expected = s1 result = pd.concat([s1, s2]) tm.assert_series_equal(result, expected) + + +def test_concat_preserve_column_order_differing_columns(): + # GH 4588 regression test + # for new columns in concat + dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) + dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) + result = pd.concat([dfa, dfb]) + assert result.columns.tolist() == ['C', 'A', 'Z'] + + +def test_concat_preserve_column_order_uneven_data(): + # GH 4588 regression test + # add to column, concat with uneven 
data + df = pd.DataFrame() + df['b'] = [1, 2, 3] + df['c'] = [1, 2, 3] + df['a'] = [1, 2, 3] + df2 = pd.DataFrame({'a': [4, 5]}) + df3 = pd.concat([df, df2]) + assert df3.columns.tolist() == ['b', 'c', 'a'] From 02b2db93a5171b63615a450a7819a1c0ddc7f94e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 09:13:23 -0500 Subject: [PATCH 02/34] Updates API: Updated the default to be compatible and warn. DOC: updated the whatsnew and concat docstring. --- doc/source/whatsnew/v0.23.0.txt | 26 ++++++++++++++++- pandas/core/indexes/api.py | 20 ++++++++++++++ pandas/core/reshape/concat.py | 13 +++++++-- pandas/tests/reshape/test_concat.py | 43 ++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 0bbe8102a4bbd..ab5a174b9a3bb 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -639,6 +639,31 @@ Returning a ``Series`` allows one to control the exact return structure and colu df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) +Concatenation will no longer sort +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned. +The current behavior is the same as the previous (sorting), but now a warning is issued. + +.. ipython:: python + :okwarning: + + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [4, 5]}) + + pd.concat([df1, df2]) + +To keep the previous behavior (sorting) and silence the warning, pass ``sort=True`` + +.. ipython:: python + + pd.concat([df1, df2], sort=True) + +To accept the future behavior (no sorting), pass ``sort=False`` + +.. ipython:: python + + pd.concat([df1, df2], sort=False) .. 
_whatsnew_0230.api_breaking.build_changes: @@ -1237,7 +1262,6 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) -- Stop :func:`concat` and ``Dataframe.append`` from sorting columns by default. Use ``sort=True`` to retain old behavior (:issue:`4588`) Other ^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 75232e3db7e55..32cf5c47bbd6b 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,3 +1,6 @@ +import textwrap +import warnings + from pandas.core.indexes.base import (Index, _new_Index, _ensure_index, @@ -17,6 +20,16 @@ from pandas._libs import lib from pandas._libs.tslib import NaT +_sort_msg = textwrap.dedent("""\ +Sorting because non-concatenation axis is not aligned. A future version +of pandas will change to not sort by default. + +To accept the future behavior, pass 'sort=False'. + +To retain the current behavior and silence the warning, pass 'sort=True' +""") + + # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', @@ -90,6 +103,12 @@ def conv(i): index = indexes[0] for other in indexes[1:]: if not index.equals(other): + + if sort is None: + # TODO: remove once pd.concat sort default changes + warnings.warn(_sort_msg, FutureWarning, stacklevel=8) + sort = True + return _unique_indices(indexes) name = _get_consensus_names(indexes)[0] @@ -97,6 +116,7 @@ def conv(i): index = index._shallow_copy(name=name) return index else: + # XXX: here too? 
return _unique_indices(indexes) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 531d1715cdf27..4879e32d8348b 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -20,7 +20,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - sort=False, copy=True): + sort=None, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -60,8 +60,15 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, verify_integrity : boolean, default False Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation - sort : boolean, default False - Sort columns if all passed object columns are not the same + sort : boolean, default None + Sort non-concatenation axis if it is not already aligned. The current + default of sorting is deprecated and will change to not-sorting in a + future version of pandas. Explicitly pass ``sort=True`` to silence + the warning and sort. Explicitly pass ``sort=False`` to silence the + warning and not sort. + + .. 
versionadded:: 0.23.0 + copy : boolean, default True If False, do not copy data unnecessarily diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 8051f39284d6f..c4f7a3454c7f7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2251,13 +2251,43 @@ def test_concat_empty_and_non_empty_series_regression(): tm.assert_series_equal(result, expected) +def test_concat_sort_columns(): + # GH-4588 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [3, 4]}) + + expected = pd.DataFrame({"a": [1, 2, 3, 4], + "b": [1, 2, None, None]}, + columns=['a', 'b']) + with tm.assert_produces_warning(FutureWarning): + result = pd.concat([df1, df2], ignore_index=True) + + tm.assert_frame_equal(result, expected) + + +def test_concat_sorts_index(): + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b']) + df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b']) + + with tm.assert_produces_warning(FutureWarning): + result = pd.concat([df1, df2], axis=1) + + expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]}, + index=['a', 'b', 'c'], + columns=['a', 'b']) + tm.assert_frame_equal(result, expected) + + def test_concat_preserve_column_order_differing_columns(): # GH 4588 regression test # for new columns in concat dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) - result = pd.concat([dfa, dfb]) - assert result.columns.tolist() == ['C', 'A', 'Z'] + result = pd.concat([dfa, dfb], ignore_index=True) + + expected = pd.DataFrame({"A": [2, None], "C": [1, 5], + "Z": [None, 6]}, columns=["A", "C", "Z"]) + tm.assert_frame_equal(result, expected) def test_concat_preserve_column_order_uneven_data(): @@ -2268,5 +2298,10 @@ def test_concat_preserve_column_order_uneven_data(): df['c'] = [1, 2, 3] df['a'] = [1, 2, 3] df2 = pd.DataFrame({'a': [4, 5]}) - df3 = pd.concat([df, df2]) - assert df3.columns.tolist() == ['b', 
'c', 'a'] + result = pd.concat([df, df2]) + expected = pd.DataFrame({ + 'a': [1, 2, 3, 4, 5], + 'b': [1, 2, 3, None, None], + 'c': [1, 2, 3, None, None] + }, index=[0, 1, 2, 0, 1]) + tm.assert_frame_equal(result, expected) From a497763acc6effdda3f4083742e72a90be9be193 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 15:46:06 -0500 Subject: [PATCH 03/34] Test fallout --- pandas/core/base.py | 2 +- pandas/core/groupby/groupby.py | 3 +- pandas/core/indexes/api.py | 3 +- pandas/tests/indexing/test_iloc.py | 3 +- pandas/tests/indexing/test_partial.py | 5 +-- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/reshape/test_concat.py | 11 ++++--- pandas/tests/sparse/test_combine_concat.py | 38 +++++++++++++++------- 8 files changed, 42 insertions(+), 25 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9ca1c8bea4db7..2f25a9ce41369 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -507,7 +507,7 @@ def is_any_frame(): for r in compat.itervalues(result)) if isinstance(result, list): - return concat(result, keys=keys, axis=1), True + return concat(result, keys=keys, axis=1, sort=True), True elif is_any_frame(): # we have a dict of DataFrames diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c20d62117e25..00ea96890dd27 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1098,7 +1098,8 @@ def reset_identity(values): group_names = self.grouper.names result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names) + levels=group_levels, names=group_names, + sort=True) else: # GH5610, returns a MI, with the first level being a diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 32cf5c47bbd6b..b919c8ab9a23f 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -115,8 +115,7 @@ def conv(i): if name != index.name: index = index._shallow_copy(name=name) return index - else: - # 
XXX: here too? + else: # kind='list' return _unique_indices(indexes) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index f1178d44dbfe0..bfc74db73b813 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -629,7 +629,8 @@ def test_iloc_non_unique_indexing(self): new_list.append(s * 3) expected = DataFrame(new_list) - expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])]) + expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], + sort=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df2.loc[idx] tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index f95f493c66043..3c7a7f070805d 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -156,8 +156,9 @@ def f(): df_orig = DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) - expected = pd.concat([df_orig, DataFrame( - {'A': 7}, index=[dates[-1] + 1])]) + expected = pd.concat([df_orig, + DataFrame({'A': 7}, index=[dates[-1] + 1])], + sort=True) df = df_orig.copy() df.loc[dates[-1] + 1, 'A'] = 7 tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dbf7c7f100b0e..4f68514e8fcaf 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1286,7 +1286,7 @@ def test_join_multi_levels(self): index=MultiIndex.from_tuples( [(4, np.nan)], names=['household_id', 'asset_id']))) - ], axis=0).reindex(columns=expected.columns)) + ], axis=0, sort=True).reindex(columns=expected.columns)) assert_frame_equal(result, expected) # invalid cases diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index c4f7a3454c7f7..ef21181452bc2 100644 --- 
a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1318,7 +1318,7 @@ def test_with_mixed_tuples(self): df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) # it works - concat([df1, df2]) + concat([df1, df2], sort=True) def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) @@ -1328,7 +1328,7 @@ def test_handle_empty_objects(self): empty = df[5:5] frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0) + concatted = concat(frames, axis=0, sort=True) expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) expected['foo'] = expected['foo'].astype('O') @@ -2056,7 +2056,7 @@ def test_categorical_concat(self): cat_values = ["one", "one", "two", "one", "two", "two", "one"] df2['h'] = Series(Categorical(cat_values)) - res = pd.concat((df1, df2), axis=0, ignore_index=True) + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=True) exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -2165,6 +2165,7 @@ def test_concat_order(self): dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] result = pd.concat(dfs).columns + expected = dfs[0].columns tm.assert_index_equal(result, expected) @@ -2283,7 +2284,7 @@ def test_concat_preserve_column_order_differing_columns(): # for new columns in concat dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) - result = pd.concat([dfa, dfb], ignore_index=True) + result = pd.concat([dfa, dfb], ignore_index=True, sort=True) expected = pd.DataFrame({"A": [2, None], "C": [1, 5], "Z": [None, 6]}, columns=["A", "C", "Z"]) @@ -2298,7 +2299,7 @@ def test_concat_preserve_column_order_uneven_data(): df['c'] = [1, 2, 3] df['a'] = [1, 2, 3] df2 = pd.DataFrame({'a': [4, 5]}) - result = pd.concat([df, df2]) + result = pd.concat([df, df2], sort=True) expected = 
pd.DataFrame({ 'a': [1, 2, 3, 4, 5], 'b': [1, 2, 3, None, None], diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 70fd1da529d46..9e392457edbc3 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -202,17 +202,29 @@ def test_concat_different_fill_value(self): exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) + def test_concat_different_columns_sort_warns(self): + sparse = self.dense1.to_sparse() + sparse3 = self.dense3.to_sparse() + + with tm.assert_produces_warning(FutureWarning): + res = pd.concat([sparse, sparse3]) + with tm.assert_produces_warning(FutureWarning): + exp = pd.concat([self.dense1, self.dense3]) + + exp = exp.to_sparse() + tm.assert_sp_frame_equal(res, exp) + def test_concat_different_columns(self): # fill_value = np.nan sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse() - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]).to_sparse() + res = pd.concat([sparse, sparse3], sort=True) + exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse() tm.assert_sp_frame_equal(res, exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]).to_sparse() + res = pd.concat([sparse3, sparse], sort=True) + exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse() exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -220,13 +232,15 @@ def test_concat_different_columns(self): sparse = self.dense1.to_sparse(fill_value=0) sparse3 = self.dense3.to_sparse(fill_value=0) - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]).to_sparse(fill_value=0) + res = pd.concat([sparse, sparse3], sort=True) + exp = (pd.concat([self.dense1, self.dense3], sort=True) + .to_sparse(fill_value=0)) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) - res = pd.concat([sparse3, sparse]) - exp = 
pd.concat([self.dense3, self.dense1]).to_sparse(fill_value=0) + res = pd.concat([sparse3, sparse], sort=True) + exp = (pd.concat([self.dense3, self.dense1], sort=True) + .to_sparse(fill_value=0)) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -234,13 +248,13 @@ def test_concat_different_columns(self): sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse(fill_value=0) # each columns keeps its fill_value, thus compare in dense - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]) + res = pd.concat([sparse, sparse3], sort=True) + exp = pd.concat([self.dense1, self.dense3], sort=True) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]) + res = pd.concat([sparse3, sparse], sort=True) + exp = pd.concat([self.dense3, self.dense1], sort=True) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) From 954a1b696703f30537b17b58f80ce38316be9a9c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 16:14:05 -0500 Subject: [PATCH 04/34] Updated append --- doc/source/whatsnew/v0.23.0.txt | 5 ++- pandas/core/frame.py | 10 ++++-- pandas/tests/reshape/test_concat.py | 48 ++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ab5a174b9a3bb..cf7ef24a32ed5 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -643,7 +643,7 @@ Concatenation will no longer sort ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned. -The current behavior is the same as the previous (sorting), but now a warning is issued. +The current behavior is the same as the previous (sorting), but now a warning is issued (:issue:`4588`). .. 
ipython:: python :okwarning: @@ -665,6 +665,9 @@ To accept the future behavior (no sorting), pass ``sort=False`` pd.concat([df1, df2], sort=False) +Note that this change also applies to :meth:`DataFrame.append`, which has also received a `sort` keyword for controlling this behavior. + + .. _whatsnew_0230.api_breaking.build_changes: Build Changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36fca8d77bf38..ee1ca5e832f09 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6039,7 +6039,7 @@ def infer(x): # Merging / joining methods def append(self, other, ignore_index=False, - verify_integrity=False, sort=False): + verify_integrity=False, sort=None): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -6052,8 +6052,12 @@ def append(self, other, ignore_index=False, If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. - sort: boolean, default False - Sort columns if given object doesn't have the same columns + sort : boolean, default None + Sort columns if the columns of `self` and `other` are not aligned. + The default sorting is deprecated and will change to not-sorting + in a future version of pandas. Explicitly pass ``sort=True`` to + silence the warning and sort. Explicitly pass ``sort=False`` to + silence the warning and not sort. 
Returns ------- diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ef21181452bc2..494d340d7f880 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -727,10 +727,10 @@ def test_append(self): tm.assert_almost_equal(appended['A'], self.frame['A']) del end_frame['A'] - partial_appended = begin_frame.append(end_frame) + partial_appended = begin_frame.append(end_frame, sort=True) assert 'A' in partial_appended - partial_appended = end_frame.append(begin_frame) + partial_appended = end_frame.append(begin_frame, sort=True) assert 'A' in partial_appended # mixed type handling @@ -738,8 +738,9 @@ def test_append(self): tm.assert_frame_equal(appended, self.mixed_frame) # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) - mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=True) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:], + sort=True) # all equal except 'foo' column tm.assert_frame_equal( @@ -772,7 +773,7 @@ def test_append(self): def test_append_length0_frame(self): df = DataFrame(columns=['A', 'B', 'C']) df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3) + df5 = df.append(df3, sort=True) expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) assert_frame_equal(df5, expected) @@ -793,6 +794,31 @@ def test_append_records(self): expected = DataFrame(np.concatenate((arr1, arr2))) assert_frame_equal(result, expected) + def test_append_sorts(self): + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3]) + # default, changing in the future + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # from append we have an extra function call. Not worth hacking + # around to get the right stackleve. 
+ result = df1.append(df2) + + expected = pd.DataFrame({"b": [1, 2, None, None], + "a": [1, 2, 1, 2], + "c": [None, None, 3, 4]}, + columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + # sort=True, the previous behavior + result = df1.append(df2, sort=True) + tm.assert_frame_equal(result, expected) + + # sort=False, the future behvior. + result = df1.append(df2, sort=False) + expected = expected[['b', 'a', 'c']] + tm.assert_frame_equal(result, expected) + def test_append_different_columns(self): df = DataFrame({'bools': np.random.randn(10) > 0, 'ints': np.random.randint(0, 10, 10), @@ -802,7 +828,7 @@ def test_append_different_columns(self): a = df[:5].loc[:, ['bools', 'ints', 'floats']] b = df[5:].loc[:, ['strings', 'ints', 'floats']] - appended = a.append(b) + appended = a.append(b, sort=True) assert isna(appended['strings'][0:4]).all() assert isna(appended['bools'][5:]).all() @@ -815,7 +841,7 @@ def test_append_many(self): chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:]) + result = chunks[0].append(chunks[1:], sort=True) tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame) assert (result['foo'][15:] == 'bar').all() assert result['foo'][:15].isna().all() @@ -956,7 +982,7 @@ def test_append_missing_column_proper_upcast(self): df2 = DataFrame({'B': np.array([True, False, True, False], dtype=bool)}) - appended = df1.append(df2, ignore_index=True) + appended = df1.append(df2, ignore_index=True, sort=True) assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' @@ -1052,7 +1078,7 @@ def test_concat_dataframe_keys_bug(self): 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2']) + result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=True) assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] def test_concat_series_partial_columns_names(self): @@ -1505,7 +1531,7 @@ def df(): panel3 = 
panel3.rename_axis(lambda x: '%s_1' % x, axis=2) # it works! - concat([panel1, panel3], axis=1, verify_integrity=True) + concat([panel1, panel3], axis=1, verify_integrity=True, sort=True) def test_concat_series(self): @@ -2164,7 +2190,7 @@ def test_concat_order(self): dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] - result = pd.concat(dfs).columns + result = pd.concat(dfs, sort=True).columns expected = dfs[0].columns tm.assert_index_equal(result, expected) From 2a203774bf9044657dcd76fd989e3fe784d0c028 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 16:15:44 -0500 Subject: [PATCH 05/34] versionadded --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ee1ca5e832f09..9f673733bf7e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6059,6 +6059,8 @@ def append(self, other, ignore_index=False, silence the warning and sort. Explicitly pass ``sort=False`` to silence the warning and not sort. + .. 
versionadded:: 0.23.0 + Returns ------- appended : DataFrame From 35570c4f8276b10d42b8c275b6f44c135bec86e5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 06:49:21 -0500 Subject: [PATCH 06/34] Squash more test warnings --- pandas/tests/frame/test_combine_concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index e82faaeef2986..15ca65395e4fc 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -96,7 +96,7 @@ def test_append_series_dict(self): result = df.append(series[::-1][:3], ignore_index=True) expected = df.append(DataFrame({0: series[::-1][:3]}).T, - ignore_index=True) + ignore_index=True, sort=True) assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set @@ -119,8 +119,8 @@ def test_append_list_of_series_dicts(self): # different columns dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] - result = df.append(dicts, ignore_index=True) - expected = df.append(DataFrame(dicts), ignore_index=True) + result = df.append(dicts, ignore_index=True, sort=True) + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected) def test_append_empty_dataframe(self): From 983d0c1db55ee033214875324523d86177f315be Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 07:22:04 -0500 Subject: [PATCH 07/34] py2 compat --- pandas/tests/reshape/test_concat.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 494d340d7f880..ae7a2c2c5f5fd 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -7,7 +7,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems +from 
pandas.compat import StringIO, iteritems, PY2 import pandas as pd from pandas import (DataFrame, concat, read_csv, isna, Series, date_range, @@ -2190,9 +2190,15 @@ def test_concat_order(self): dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] + result = pd.concat(dfs, sort=True).columns - expected = dfs[0].columns + if PY2: + # Different sort order between incomparable objects between + # python 2 and python3 via Index.union. + expected = dfs[1].columns + else: + expected = dfs[0].columns tm.assert_index_equal(result, expected) def test_concat_datetime_timezone(self): From 4960e3f57a11581e10a5cf7304d11713110b0de9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 10:02:15 -0500 Subject: [PATCH 08/34] Document outer is not affected --- pandas/core/reshape/concat.py | 14 +++++++++----- pandas/tests/reshape/test_concat.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4879e32d8348b..b36e9b8d900fd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -61,11 +61,15 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation sort : boolean, default None - Sort non-concatenation axis if it is not already aligned. The current - default of sorting is deprecated and will change to not-sorting in a - future version of pandas. Explicitly pass ``sort=True`` to silence - the warning and sort. Explicitly pass ``sort=False`` to silence the - warning and not sort. + Sort non-concatenation axis if it is not already aligned when `join` + is 'outer'. The current default of sorting is deprecated and will + change to not-sorting in a future version of pandas. 
+ + Explicitly pass ``sort=True`` to silence the warning and sort. + Explicitly pass ``sort=False`` to silence the warning and not sort. + + This has no effect when ``join='inner'``, which already preserves + the order of the non-concatenation axis. .. versionadded:: 0.23.0 diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ae7a2c2c5f5fd..f74d652ae012c 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2311,6 +2311,23 @@ def test_concat_sorts_index(): tm.assert_frame_equal(result, expected) +def test_concat_inner_sort_unaffected(): + # https://github.com/pandas-dev/pandas/pull/20613 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, + columns=['b', 'a', 'c']) + df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) + with tm.assert_produces_warning(None): + r0 = pd.concat([df1, df2], join='inner', ignore_index=True) + r1 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) + r2 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) + + expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, + columns=['b', 'a']) + tm.assert_frame_equal(r0, expected) + tm.assert_frame_equal(r1, expected) + tm.assert_frame_equal(r2, expected) + + def test_concat_preserve_column_order_differing_columns(): # GH 4588 regression test # for new columns in concat From 8bbbdd52a9d223215eef408c75f2526ce828e19b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 10:08:31 -0500 Subject: [PATCH 09/34] Docs --- doc/source/merging.rst | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 74b21c21252ec..de37dc6dab59e 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -153,10 +153,10 @@ Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ When gluing together multiple DataFrames, you have a choice of how to handle -the other axes (other 
than the one being concatenated). This can be done in +the other axes (other than the one being concatenated). This can be done in the following three ways: -- Take the (sorted) union of them all, ``join='outer'``. This is the default +- Take the union of them all, ``join='outer'``. This is the default option as it results in zero information loss. - Take the intersection, ``join='inner'``. - Use a specific index, as passed to the ``join_axes`` argument. @@ -167,10 +167,10 @@ behavior: .. ipython:: python df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], - 'D': ['D2', 'D3', 'D6', 'D7'], - 'F': ['F2', 'F3', 'F6', 'F7']}, - index=[2, 3, 6, 7]) - result = pd.concat([df1, df4], axis=1) + 'D': ['D2', 'D3', 'D6', 'D7'], + 'F': ['F2', 'F3', 'F6', 'F7']}, + index=[2, 3, 6, 7]) + result = pd.concat([df1, df4], axis=1, sort=False) .. ipython:: python @@ -181,8 +181,14 @@ behavior: labels=['df1', 'df4'], vertical=False); plt.close('all'); -Note that the row indexes have been unioned and sorted. Here is the same thing -with ``join='inner'``: +.. versionchanged:: 0.23.0 + + The default behavior with ``join='outer'`` is to sort the other axis + (columns in this case). In a future version of pandas, the default will + be to not sort. We specified ``sort=False`` to opt in to the new + behavior now. + +Here is the same thing with ``join='inner'``: .. 
ipython:: python From dcfa6d0a84be399bc22b1ec6fe68900f6b3a5588 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 28 Apr 2018 13:32:04 -0500 Subject: [PATCH 10/34] Sort for intersection --- doc/source/merging.rst | 4 +++- doc/source/whatsnew/v0.23.0.txt | 2 ++ pandas/core/indexes/api.py | 7 +++++-- pandas/tests/reshape/test_concat.py | 24 ++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index de37dc6dab59e..1161656731f88 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -181,7 +181,9 @@ behavior: labels=['df1', 'df4'], vertical=False); plt.close('all'); -.. versionchanged:: 0.23.0 +.. warning:: + + .. versionchanged:: 0.23.0 The default behavior with ``join='outer'`` is to sort the other axis (columns in this case). In a future version of pandas, the default will diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cf7ef24a32ed5..bbbf2172efea4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -639,6 +639,8 @@ Returning a ``Series`` allows one to control the exact return structure and colu df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) +.. 
_whatsnew_0230.api_breaking.concat: + Concatenation will no longer sort ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index b919c8ab9a23f..f345c21b2f2f0 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -60,7 +60,10 @@ def _get_combined_index(indexes, intersect=False, sort=True): if len(indexes) == 0: return Index([]) if len(indexes) == 1: - return indexes[0] + index = indexes[0] + if sort: + index = index.sort_values() + return index if intersect: index = indexes[0] for other in indexes[1:]: @@ -115,7 +118,7 @@ def conv(i): if name != index.name: index = index._shallow_copy(name=name) return index - else: # kind='list + else: # kind='list' return _unique_indices(indexes) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index f74d652ae012c..fc91647321a05 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2355,3 +2355,27 @@ def test_concat_preserve_column_order_uneven_data(): 'c': [1, 2, 3, None, None] }, index=[0, 1, 2, 0, 1]) tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort(): + # GH-4588 + df = pd.DataFrame({"b": [1, 2], "a": [3, 4]}, columns=['b', 'a']) + result = pd.concat([df, df], sort=True, ignore_index=True) + expected = pd.DataFrame({'b': [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=['a', 'b']) + tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort_raises(): + # GH-4588 + df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) + + if PY2: + expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=[1, 'a']) + result = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + else: + msg = "'<' not supported between instances" + with tm.assert_raises_regex(TypeError, msg): + pd.concat([df, df], sort=True) From 2eaeb1eb5ee6ea143f3215ecec7cb7ddac1b8a80 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 
28 Apr 2018 14:46:11 -0500 Subject: [PATCH 11/34] More tests --- pandas/core/indexes/api.py | 22 ++-- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/reshape/test_concat.py | 126 +++++++++++++---------- 3 files changed, 88 insertions(+), 62 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f345c21b2f2f0..07ddbcc6fec18 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -58,19 +58,23 @@ def _get_combined_index(indexes, intersect=False, sort=True): # TODO: handle index names! indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: - return Index([]) - if len(indexes) == 1: + index = Index([]) + elif len(indexes) == 1: index = indexes[0] - if sort: - index = index.sort_values() - return index - if intersect: + elif intersect: index = indexes[0] for other in indexes[1:]: index = index.intersection(other) - return index - union = _union_indexes(indexes, sort=sort) - return _ensure_index(union) + else: + index = _union_indexes(indexes, sort=sort) + index = _ensure_index(index) + + if sort and not index.is_monotonic_increasing: + try: + index = index.sort_values() + except TypeError: + pass + return index def _union_indexes(indexes, sort=True): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4f68514e8fcaf..f3827ac251cf0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -826,7 +826,7 @@ def test_validation(self): # Dups on left left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, - index=[3])) + index=[3]), sort=True) merge(left_w_dups, right, left_index=True, right_index=True, validate='many_to_one') diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index fc91647321a05..67471bd2d2c6d 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -21,6 +21,12 @@ import pytest 
+@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param + + class ConcatenateBase(object): def setup_method(self, method): @@ -716,7 +722,7 @@ def test_concat_categorical_empty(self): class TestAppend(ConcatenateBase): - def test_append(self): + def test_append(self, sort): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -727,10 +733,10 @@ def test_append(self): tm.assert_almost_equal(appended['A'], self.frame['A']) del end_frame['A'] - partial_appended = begin_frame.append(end_frame, sort=True) + partial_appended = begin_frame.append(end_frame, sort=sort) assert 'A' in partial_appended - partial_appended = end_frame.append(begin_frame, sort=True) + partial_appended = end_frame.append(begin_frame, sort=sort) assert 'A' in partial_appended # mixed type handling @@ -738,9 +744,9 @@ def test_append(self): tm.assert_frame_equal(appended, self.mixed_frame) # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=True) + mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=sort) mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:], - sort=True) + sort=sort) # all equal except 'foo' column tm.assert_frame_equal( @@ -770,10 +776,10 @@ def test_append(self): result = df.append(row) tm.assert_frame_equal(result, expected) - def test_append_length0_frame(self): + def test_append_length0_frame(self, sort): df = DataFrame(columns=['A', 'B', 'C']) df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3, sort=True) + df5 = df.append(df3, sort=sort) expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) assert_frame_equal(df5, expected) @@ -819,7 +825,7 @@ def test_append_sorts(self): expected = expected[['b', 'a', 'c']] tm.assert_frame_equal(result, expected) - def test_append_different_columns(self): + def test_append_different_columns(self, sort): df = DataFrame({'bools': np.random.randn(10) > 0, 
'ints': np.random.randint(0, 10, 10), 'floats': np.random.randn(10), @@ -828,11 +834,11 @@ def test_append_different_columns(self): a = df[:5].loc[:, ['bools', 'ints', 'floats']] b = df[5:].loc[:, ['strings', 'ints', 'floats']] - appended = a.append(b, sort=True) + appended = a.append(b, sort=sort) assert isna(appended['strings'][0:4]).all() assert isna(appended['bools'][5:]).all() - def test_append_many(self): + def test_append_many(self, sort): chunks = [self.frame[:5], self.frame[5:10], self.frame[10:15], self.frame[15:]] @@ -841,7 +847,7 @@ def test_append_many(self): chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:], sort=True) + result = chunks[0].append(chunks[1:], sort=sort) tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame) assert (result['foo'][15:] == 'bar').all() assert result['foo'][:15].isna().all() @@ -949,7 +955,7 @@ def test_append_different_columns_types_raises( with pytest.raises(TypeError): df.append(ser) - def test_append_dtype_coerce(self): + def test_append_dtype_coerce(self, sort): # GH 4993 # appending with datetime will incorrectly convert datetime64 @@ -973,16 +979,21 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], name='start_time')], - axis=1, sort=True) - result = df1.append(df2, ignore_index=True, sort=True) + axis=1, sort=sort) + result = df1.append(df2, ignore_index=True, sort=sort) + if sort: + expected = expected[['end_time', 'start_time']] + else: + expected = expected[['start_time', 'end_time']] + assert_frame_equal(result, expected) - def test_append_missing_column_proper_upcast(self): + def test_append_missing_column_proper_upcast(self, sort): df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) df2 = DataFrame({'B': np.array([True, False, True, False], dtype=bool)}) - appended = df1.append(df2, ignore_index=True, sort=True) + appended = df1.append(df2, ignore_index=True, sort=sort) assert 
appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' @@ -1070,7 +1081,7 @@ def test_concat_keys_specific_levels(self): Index(level, name='group_key')) assert result.columns.names[0] == 'group_key' - def test_concat_dataframe_keys_bug(self): + def test_concat_dataframe_keys_bug(self, sort): t1 = DataFrame({ 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], name='id'))}) @@ -1078,7 +1089,7 @@ def test_concat_dataframe_keys_bug(self): 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=True) + result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort) assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] def test_concat_series_partial_columns_names(self): @@ -1124,7 +1135,7 @@ def test_concat_dict(self): expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) - def test_concat_ignore_index(self): + def test_concat_ignore_index(self, sort): frame1 = DataFrame({"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}) @@ -1133,7 +1144,7 @@ def test_concat_ignore_index(self): frame2.index = Index(["x", "y", "q"]) v1 = concat([frame1, frame2], axis=1, - ignore_index=True, sort=True) + ignore_index=True, sort=sort) nan = np.nan expected = DataFrame([[nan, nan, nan, 4.3], @@ -1141,6 +1152,8 @@ def test_concat_ignore_index(self): ['b', 2, 3.2, 2.2], ['c', 3, 1.2, nan]], index=Index(["q", "x", "y", "z"])) + if not sort: + expected = expected.loc[['x', 'y', 'z', 'q']] tm.assert_frame_equal(v1, expected) @@ -1337,16 +1350,16 @@ def test_dups_index(self): result = df.append(df) assert_frame_equal(result, expected) - def test_with_mixed_tuples(self): + def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) # it works - concat([df1, df2], 
sort=True) + concat([df1, df2], sort=sort) - def test_handle_empty_objects(self): + def test_handle_empty_objects(self, sort): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) baz = df[:5].copy() @@ -1354,7 +1367,7 @@ def test_handle_empty_objects(self): empty = df[5:5] frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0, sort=True) + concatted = concat(frames, axis=0, sort=sort) expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) expected['foo'] = expected['foo'].astype('O') @@ -1506,7 +1519,7 @@ def test_panel_concat_other_axes(self): expected.loc['ItemC', :, :2] = 'baz' tm.assert_panel_equal(result, expected) - def test_panel_concat_buglet(self): + def test_panel_concat_buglet(self, sort): with catch_warnings(record=True): # #2257 def make_panel(): @@ -1531,7 +1544,7 @@ def df(): panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) # it works! - concat([panel1, panel3], axis=1, verify_integrity=True, sort=True) + concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort) def test_concat_series(self): @@ -1556,7 +1569,7 @@ def test_concat_series(self): expected.index = exp_index tm.assert_series_equal(result, expected) - def test_concat_series_axis1(self): + def test_concat_series_axis1(self, sort=sort): ts = tm.makeTimeSeries() pieces = [ts[:-2], ts[2:], ts[2:-2]] @@ -1585,7 +1598,7 @@ def test_concat_series_axis1(self): # must reindex, #2603 s = Series(randn(3), index=['c', 'a', 'b'], name='A') s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1, sort=True) + result = concat([s, s2], axis=1, sort=sort) expected = DataFrame({'A': s, 'B': s2}) assert_frame_equal(result, expected) @@ -2071,7 +2084,7 @@ def test_categorical_concat_dtypes(self): expected = Series([True, False, False], index=index) tm.assert_series_equal(result, expected) - def test_categorical_concat(self): + def test_categorical_concat(self, sort): # See GH 10177 df1 = DataFrame(np.arange(18, 
dtype='int64').reshape(6, 3), columns=["a", "b", "c"]) @@ -2082,7 +2095,7 @@ def test_categorical_concat(self): cat_values = ["one", "one", "two", "one", "two", "two", "one"] df2['h'] = Series(Categorical(cat_values)) - res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=True) + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -2311,21 +2324,28 @@ def test_concat_sorts_index(): tm.assert_frame_equal(result, expected) -def test_concat_inner_sort_unaffected(): +@pytest.mark.parametrize('sort', [None, False, True]) +def test_concat_inner_sort(sort): # https://github.com/pandas-dev/pandas/pull/20613 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=['b', 'a', 'c']) df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) - with tm.assert_produces_warning(None): - r0 = pd.concat([df1, df2], join='inner', ignore_index=True) - r1 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) - r2 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) + + if sort is None: + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, df2], sort=sort, join='inner', + ignore_index=True) + else: + result = pd.concat([df1, df2], sort=sort, join='inner', + ignore_index=True) expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=['b', 'a']) - tm.assert_frame_equal(r0, expected) - tm.assert_frame_equal(r1, expected) - tm.assert_frame_equal(r2, expected) + if sort: + expected = expected[['a', 'b']] + tm.assert_frame_equal(result, expected) def test_concat_preserve_column_order_differing_columns(): @@ -2359,23 +2379,25 @@ def test_concat_preserve_column_order_uneven_data(): def test_concat_aligned_sort(): # GH-4588 - df = pd.DataFrame({"b": [1, 2], "a": [3, 4]}, 
columns=['b', 'a']) + df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]}, + columns=['c', 'b', 'a']) result = pd.concat([df, df], sort=True, ignore_index=True) - expected = pd.DataFrame({'b': [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, - columns=['a', 'b']) + expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4], + 'c': [1, 2, 1, 2]}, + columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True, + ignore_index=True) + expected = expected[['b', 'c']] tm.assert_frame_equal(result, expected) -def test_concat_aligned_sort_raises(): +def test_concat_aligned_sort_does_not_raise(): # GH-4588 + # We catch TypeErrors from sorting internally and do not re-raise. df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) - - if PY2: - expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, - columns=[1, 'a']) - result = pd.concat([df, df], ignore_index=True) - tm.assert_frame_equal(result, expected) - else: - msg = "'<' not supported between instances" - with tm.assert_raises_regex(TypeError, msg): - pd.concat([df, df], sort=True) + expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=[1, 'a']) + result = pd.concat([df, df], ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) From bc7dd48249dc0d75053c0cc678d5f91c3aa1f4c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 29 Apr 2018 06:29:26 -0500 Subject: [PATCH 12/34] Test fixup. 
Sparse as well --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/sparse/frame/test_frame.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 00ea96890dd27..4132d8e69704a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1099,7 +1099,7 @@ def reset_identity(values): result = concat(values, axis=self.axis, keys=group_keys, levels=group_levels, names=group_names, - sort=True) + sort=False) else: # GH5610, returns a MI, with the first level being a diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 540933cb90be2..8a8ed520d45d2 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -629,10 +629,31 @@ def test_append(self): a = self.frame.iloc[:5, :3] b = self.frame.iloc[5:] - appended = a.append(b) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # Stacklevel is set for pd.concat, not append + appended = a.append(b) tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) + a = a[['B', 'C', 'A']].head(2) + b = b.head(2) + + expected = pd.SparseDataFrame({ + "B": [0., 1, None, 3], + "C": [0., 1, 5, 6], + "A": [None, None, 2, 3], + "D": [None, None, 5, None], + }, index=a.index | b.index) + with tm.assert_produces_warning(None): + appended = a.append(b, sort=False) + + tm.assert_frame_equal(appended, expected) + + with tm.assert_produces_warning(None): + appended = a.append(b, sort=True) + + tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']]) + def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64), From b3f95dd399d6f0acc4baa3b81d922959e2401f3b Mon Sep 17 00:00:00 2001 From: Bryce Guinta Date: Wed, 4 Apr 2018 20:51:53 -0600 Subject: [PATCH 13/34] Stop concat from attempting to sort mismatched columns by 
default Preserve column order upon concatenation to obey least astonishment principle. Allow old behavior to be enabled by adding a boolean switch to concat and DataFrame.append, mismatch_sort, which is by default disabled. Close #4588 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/lib.pyx | 11 ++++----- pandas/core/frame.py | 8 +++++-- pandas/core/indexes/api.py | 13 ++++++----- pandas/core/reshape/concat.py | 13 +++++++---- pandas/tests/reshape/test_concat.py | 35 +++++++++++++++++++++++------ 6 files changed, 57 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c128058858c17..dd557bbfa45a2 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1237,6 +1237,7 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) +- Stop :func:`concat` and ``DataFrame.append`` from sorting columns by default. 
Use ``sort=True`` to retain old behavior (:issue:`4588`) Other ^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..ae9d240afcb93 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list(list lists): +def fast_unique_multiple_list(list lists, bint sort=True): cdef: list buf Py_ssize_t k = len(lists) @@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists): if val not in table: table[val] = stub uniques.append(val) - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 82d5a0286b117..5f884d5426d47 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6038,7 +6038,8 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False): + def append(self, other, ignore_index=False, + verify_integrity=False, sort=False): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -6051,6 +6052,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. 
+ sort: boolean, default False + Sort columns if given object doesn't have the same columns Returns ------- @@ -6162,7 +6165,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): else: to_concat = [self, other] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + sort=sort) def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 2e5ec8b554ce7..75232e3db7e55 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -31,17 +31,17 @@ '_all_indexes_same'] -def _get_objs_combined_axis(objs, intersect=False, axis=0): +def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): # Extract combined index: return intersection or union (depending on the # value of "intersect") of indexes on given axis, or None if all objects # lack indexes (e.g. they are numpy arrays) obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, '_get_axis')] if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect) + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_combined_index(indexes, intersect=False): +def _get_combined_index(indexes, intersect=False, sort=True): # TODO: handle index names! 
indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: @@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False): for other in indexes[1:]: index = index.intersection(other) return index - union = _union_indexes(indexes) + union = _union_indexes(indexes, sort=sort) return _ensure_index(union) -def _union_indexes(indexes): +def _union_indexes(indexes, sort=True): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') if len(indexes) == 1: @@ -74,7 +74,8 @@ def conv(i): i = i.tolist() return i - return Index(lib.fast_unique_multiple_list([conv(i) for i in inds])) + return Index( + lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) if kind == 'special': result = indexes[0] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6e564975f34cd..531d1715cdf27 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -20,7 +20,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - copy=True): + sort=False, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -60,6 +60,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, verify_integrity : boolean, default False Check whether the new concatenated axis contains duplicates. 
This can be very expensive relative to the actual data concatenation + sort : boolean, default False + Sort columns if all passed object columns are not the same copy : boolean, default True If False, do not copy data unnecessarily @@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, verify_integrity=verify_integrity, - copy=copy) + copy=copy, sort=sort) return op.get_result() @@ -220,7 +222,8 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): + ignore_index=False, verify_integrity=False, copy=True, + sort=False): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.keys = keys self.names = names or getattr(keys, 'names', None) self.levels = levels + self.sort = sort self.ignore_index = ignore_index self.verify_integrity = verify_integrity @@ -447,7 +451,8 @@ def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: return _get_objs_combined_axis(self.objs, axis=data_axis, - intersect=self.intersect) + intersect=self.intersect, + sort=self.sort) except IndexError: types = [type(x).__name__ for x in self.objs] raise TypeError("Cannot concatenate list of {types}" diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 640d09f3587fb..8051f39284d6f 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -7,7 +7,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems, PY2 +from pandas.compat import StringIO, iteritems import pandas as pd from pandas import (DataFrame, 
concat, read_csv, isna, Series, date_range, @@ -946,8 +946,9 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) + name='start_time')], + axis=1, sort=True) + result = df1.append(df2, ignore_index=True, sort=True) assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self): @@ -1105,7 +1106,8 @@ def test_concat_ignore_index(self): frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) - v1 = concat([frame1, frame2], axis=1, ignore_index=True) + v1 = concat([frame1, frame2], axis=1, + ignore_index=True, sort=True) nan = np.nan expected = DataFrame([[nan, nan, nan, 4.3], @@ -1557,7 +1559,7 @@ def test_concat_series_axis1(self): # must reindex, #2603 s = Series(randn(3), index=['c', 'a', 'b'], name='A') s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1) + result = concat([s, s2], axis=1, sort=True) expected = DataFrame({'A': s, 'B': s2}) assert_frame_equal(result, expected) @@ -2164,8 +2166,6 @@ def test_concat_order(self): for i in range(100)] result = pd.concat(dfs).columns expected = dfs[0].columns - if PY2: - expected = expected.sort_values() tm.assert_index_equal(result, expected) def test_concat_datetime_timezone(self): @@ -2249,3 +2249,24 @@ def test_concat_empty_and_non_empty_series_regression(): expected = s1 result = pd.concat([s1, s2]) tm.assert_series_equal(result, expected) + + +def test_concat_preserve_column_order_differing_columns(): + # GH 4588 regression test + # for new columns in concat + dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) + dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) + result = pd.concat([dfa, dfb]) + assert result.columns.tolist() == ['C', 'A', 'Z'] + + +def test_concat_preserve_column_order_uneven_data(): + # GH 4588 regression test + # add to column, concat with uneven 
data + df = pd.DataFrame() + df['b'] = [1, 2, 3] + df['c'] = [1, 2, 3] + df['a'] = [1, 2, 3] + df2 = pd.DataFrame({'a': [4, 5]}) + df3 = pd.concat([df, df2]) + assert df3.columns.tolist() == ['b', 'c', 'a'] From f37d7ef9d6b9a6ba44fd7b1f059ad40e75b4ce4f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 09:13:23 -0500 Subject: [PATCH 14/34] Updates API: Updated the default to be compatible and warn. DOC: updated the whatsnew and concat docstring. --- doc/source/whatsnew/v0.23.0.txt | 26 ++++++++++++++++- pandas/core/indexes/api.py | 20 ++++++++++++++ pandas/core/reshape/concat.py | 13 +++++++-- pandas/tests/reshape/test_concat.py | 43 ++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index dd557bbfa45a2..cfb237691c57e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -639,6 +639,31 @@ Returning a ``Series`` allows one to control the exact return structure and colu df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) +Concatenation will no longer sort +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned. +The current behavior is the same as the previous (sorting), but now a warning is issued. + +.. ipython:: python + :okwarning: + + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [4, 5]}) + + pd.concat([df1, df2]) + +To keep the previous behavior (sorting) and silence the warning, pass ``sort=True`` + +.. ipython:: python + + pd.concat([df1, df2], sort=True) + +To accept the future behavior (no sorting), pass ``sort=False`` + +.. ipython:: python + + pd.concat([df1, df2], sort=False) .. 
_whatsnew_0230.api_breaking.build_changes: @@ -1237,7 +1262,6 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) -- Stop :func:`concat` and ``Dataframe.append`` from sorting columns by default. Use ``sort=True`` to retain old behavior (:issue:`4588`) Other ^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 75232e3db7e55..32cf5c47bbd6b 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,3 +1,6 @@ +import textwrap +import warnings + from pandas.core.indexes.base import (Index, _new_Index, _ensure_index, @@ -17,6 +20,16 @@ from pandas._libs import lib from pandas._libs.tslib import NaT +_sort_msg = textwrap.dedent("""\ +Sorting because non-concatenation axis is not aligned. A future version +of pandas will change to not sort by default. + +To accept the future behavior, pass 'sort=False'. + +To retain the current behavior and silence the warning, pass 'sort=True'. +""") + + # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', @@ -90,6 +103,12 @@ def conv(i): index = indexes[0] for other in indexes[1:]: if not index.equals(other): + + if sort is None: + # TODO: remove once pd.concat sort default changes + warnings.warn(_sort_msg, FutureWarning, stacklevel=8) + sort = True + return _unique_indices(indexes) name = _get_consensus_names(indexes)[0] @@ -97,6 +116,7 @@ def conv(i): index = index._shallow_copy(name=name) return index else: + # XXX: here too? 
return _unique_indices(indexes) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 531d1715cdf27..4879e32d8348b 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -20,7 +20,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - sort=False, copy=True): + sort=None, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -60,8 +60,15 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, verify_integrity : boolean, default False Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation - sort : boolean, default False - Sort columns if all passed object columns are not the same + sort : boolean, default None + Sort non-concatenation axis if it is not already aligned. The current + default of sorting is deprecated and will change to not-sorting in a + future version of pandas. Explicitly pass ``sort=True`` to silence + the warning and sort. Explicitly pass ``sort=False`` to silence the + warning and not sort. + + .. 
versionadded:: 0.23.0 + copy : boolean, default True If False, do not copy data unnecessarily diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 8051f39284d6f..c4f7a3454c7f7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2251,13 +2251,43 @@ def test_concat_empty_and_non_empty_series_regression(): tm.assert_series_equal(result, expected) +def test_concat_sort_columns(): + # GH-4588 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [3, 4]}) + + expected = pd.DataFrame({"a": [1, 2, 3, 4], + "b": [1, 2, None, None]}, + columns=['a', 'b']) + with tm.assert_produces_warning(FutureWarning): + result = pd.concat([df1, df2], ignore_index=True) + + tm.assert_frame_equal(result, expected) + + +def test_concat_sorts_index(): + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b']) + df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b']) + + with tm.assert_produces_warning(FutureWarning): + result = pd.concat([df1, df2], axis=1) + + expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]}, + index=['a', 'b', 'c'], + columns=['a', 'b']) + tm.assert_frame_equal(result, expected) + + def test_concat_preserve_column_order_differing_columns(): # GH 4588 regression test # for new columns in concat dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) - result = pd.concat([dfa, dfb]) - assert result.columns.tolist() == ['C', 'A', 'Z'] + result = pd.concat([dfa, dfb], ignore_index=True) + + expected = pd.DataFrame({"A": [2, None], "C": [1, 5], + "Z": [None, 6]}, columns=["A", "C", "Z"]) + tm.assert_frame_equal(result, expected) def test_concat_preserve_column_order_uneven_data(): @@ -2268,5 +2298,10 @@ def test_concat_preserve_column_order_uneven_data(): df['c'] = [1, 2, 3] df['a'] = [1, 2, 3] df2 = pd.DataFrame({'a': [4, 5]}) - df3 = pd.concat([df, df2]) - assert df3.columns.tolist() == ['b', 
'c', 'a'] + result = pd.concat([df, df2]) + expected = pd.DataFrame({ + 'a': [1, 2, 3, 4, 5], + 'b': [1, 2, 3, None, None], + 'c': [1, 2, 3, None, None] + }, index=[0, 1, 2, 0, 1]) + tm.assert_frame_equal(result, expected) From e467f91f97f9b9f5ba7f18d2eeda5b771e58cfe9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 15:46:06 -0500 Subject: [PATCH 15/34] Test fallout --- pandas/core/base.py | 2 +- pandas/core/groupby/groupby.py | 3 +- pandas/core/indexes/api.py | 3 +- pandas/tests/indexing/test_iloc.py | 3 +- pandas/tests/indexing/test_partial.py | 5 +-- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/reshape/test_concat.py | 11 ++++--- pandas/tests/sparse/test_combine_concat.py | 38 +++++++++++++++------- 8 files changed, 42 insertions(+), 25 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9ca1c8bea4db7..2f25a9ce41369 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -507,7 +507,7 @@ def is_any_frame(): for r in compat.itervalues(result)) if isinstance(result, list): - return concat(result, keys=keys, axis=1), True + return concat(result, keys=keys, axis=1, sort=True), True elif is_any_frame(): # we have a dict of DataFrames diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c20d62117e25..00ea96890dd27 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1098,7 +1098,8 @@ def reset_identity(values): group_names = self.grouper.names result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names) + levels=group_levels, names=group_names, + sort=True) else: # GH5610, returns a MI, with the first level being a diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 32cf5c47bbd6b..b919c8ab9a23f 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -115,8 +115,7 @@ def conv(i): if name != index.name: index = index._shallow_copy(name=name) return index - else: - # 
XXX: here too? + else: # kind='list return _unique_indices(indexes) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index f1178d44dbfe0..bfc74db73b813 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -629,7 +629,8 @@ def test_iloc_non_unique_indexing(self): new_list.append(s * 3) expected = DataFrame(new_list) - expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])]) + expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], + sort=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df2.loc[idx] tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index f95f493c66043..3c7a7f070805d 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -156,8 +156,9 @@ def f(): df_orig = DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) - expected = pd.concat([df_orig, DataFrame( - {'A': 7}, index=[dates[-1] + 1])]) + expected = pd.concat([df_orig, + DataFrame({'A': 7}, index=[dates[-1] + 1])], + sort=True) df = df_orig.copy() df.loc[dates[-1] + 1, 'A'] = 7 tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dbf7c7f100b0e..4f68514e8fcaf 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1286,7 +1286,7 @@ def test_join_multi_levels(self): index=MultiIndex.from_tuples( [(4, np.nan)], names=['household_id', 'asset_id']))) - ], axis=0).reindex(columns=expected.columns)) + ], axis=0, sort=True).reindex(columns=expected.columns)) assert_frame_equal(result, expected) # invalid cases diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index c4f7a3454c7f7..ef21181452bc2 100644 --- 
a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1318,7 +1318,7 @@ def test_with_mixed_tuples(self): df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) # it works - concat([df1, df2]) + concat([df1, df2], sort=True) def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) @@ -1328,7 +1328,7 @@ def test_handle_empty_objects(self): empty = df[5:5] frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0) + concatted = concat(frames, axis=0, sort=True) expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) expected['foo'] = expected['foo'].astype('O') @@ -2056,7 +2056,7 @@ def test_categorical_concat(self): cat_values = ["one", "one", "two", "one", "two", "two", "one"] df2['h'] = Series(Categorical(cat_values)) - res = pd.concat((df1, df2), axis=0, ignore_index=True) + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=True) exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -2165,6 +2165,7 @@ def test_concat_order(self): dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] result = pd.concat(dfs).columns + expected = dfs[0].columns tm.assert_index_equal(result, expected) @@ -2283,7 +2284,7 @@ def test_concat_preserve_column_order_differing_columns(): # for new columns in concat dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) - result = pd.concat([dfa, dfb], ignore_index=True) + result = pd.concat([dfa, dfb], ignore_index=True, sort=True) expected = pd.DataFrame({"A": [2, None], "C": [1, 5], "Z": [None, 6]}, columns=["A", "C", "Z"]) @@ -2298,7 +2299,7 @@ def test_concat_preserve_column_order_uneven_data(): df['c'] = [1, 2, 3] df['a'] = [1, 2, 3] df2 = pd.DataFrame({'a': [4, 5]}) - result = pd.concat([df, df2]) + result = pd.concat([df, df2], sort=True) expected = 
pd.DataFrame({ 'a': [1, 2, 3, 4, 5], 'b': [1, 2, 3, None, None], diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 70fd1da529d46..9e392457edbc3 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -202,17 +202,29 @@ def test_concat_different_fill_value(self): exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) + def test_concat_different_columns_sort_warns(self): + sparse = self.dense1.to_sparse() + sparse3 = self.dense3.to_sparse() + + with tm.assert_produces_warning(FutureWarning): + res = pd.concat([sparse, sparse3]) + with tm.assert_produces_warning(FutureWarning): + exp = pd.concat([self.dense1, self.dense3]) + + exp = exp.to_sparse() + tm.assert_sp_frame_equal(res, exp) + def test_concat_different_columns(self): # fill_value = np.nan sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse() - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]).to_sparse() + res = pd.concat([sparse, sparse3], sort=True) + exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse() tm.assert_sp_frame_equal(res, exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]).to_sparse() + res = pd.concat([sparse3, sparse], sort=True) + exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse() exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -220,13 +232,15 @@ def test_concat_different_columns(self): sparse = self.dense1.to_sparse(fill_value=0) sparse3 = self.dense3.to_sparse(fill_value=0) - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]).to_sparse(fill_value=0) + res = pd.concat([sparse, sparse3], sort=True) + exp = (pd.concat([self.dense1, self.dense3], sort=True) + .to_sparse(fill_value=0)) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) - res = pd.concat([sparse3, sparse]) - exp = 
pd.concat([self.dense3, self.dense1]).to_sparse(fill_value=0) + res = pd.concat([sparse3, sparse], sort=True) + exp = (pd.concat([self.dense3, self.dense1], sort=True) + .to_sparse(fill_value=0)) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -234,13 +248,13 @@ def test_concat_different_columns(self): sparse = self.dense1.to_sparse() sparse3 = self.dense3.to_sparse(fill_value=0) # each columns keeps its fill_value, thus compare in dense - res = pd.concat([sparse, sparse3]) - exp = pd.concat([self.dense1, self.dense3]) + res = pd.concat([sparse, sparse3], sort=True) + exp = pd.concat([self.dense1, self.dense3], sort=True) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - res = pd.concat([sparse3, sparse]) - exp = pd.concat([self.dense3, self.dense1]) + res = pd.concat([sparse3, sparse], sort=True) + exp = pd.concat([self.dense3, self.dense1], sort=True) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) From 058fae5d76108f69dd578c74c9778daaf3603078 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 16:14:05 -0500 Subject: [PATCH 16/34] Updated append --- doc/source/whatsnew/v0.23.0.txt | 5 ++- pandas/core/frame.py | 10 ++++-- pandas/tests/reshape/test_concat.py | 48 ++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cfb237691c57e..241fb46df7f25 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -643,7 +643,7 @@ Concatenation will no longer sort ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned. -The current behavior is the same as the previous (sorting), but now a warning is issued. +The current behavior is the same as the previous (sorting), but now a warning is issued (:issue:`4588`). .. 
ipython:: python :okwarning: @@ -665,6 +665,9 @@ To accept the future behavior (no sorting), pass ``sort=False`` pd.concat([df1, df2], sort=False) +Note that this change also applies to :meth:`DataFrame.append`, which has also received a `sort` keyword for controlling this behavior. + + .. _whatsnew_0230.api_breaking.build_changes: Build Changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f884d5426d47..db312dc67c986 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6039,7 +6039,7 @@ def infer(x): # Merging / joining methods def append(self, other, ignore_index=False, - verify_integrity=False, sort=False): + verify_integrity=False, sort=None): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -6052,8 +6052,12 @@ def append(self, other, ignore_index=False, If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. - sort: boolean, default False - Sort columns if given object doesn't have the same columns + sort : boolean, default None + Sort columns if the columns of `self` and `other` are not aligned. + The default sorting is deprecated and will change to not-sorting + in a future version of pandas. Explicitly pass ``sort=True`` to + silence the warning and sort. Explicitly pass ``sort=False`` to + silence the warning and not sort. 
Returns ------- diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ef21181452bc2..494d340d7f880 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -727,10 +727,10 @@ def test_append(self): tm.assert_almost_equal(appended['A'], self.frame['A']) del end_frame['A'] - partial_appended = begin_frame.append(end_frame) + partial_appended = begin_frame.append(end_frame, sort=True) assert 'A' in partial_appended - partial_appended = end_frame.append(begin_frame) + partial_appended = end_frame.append(begin_frame, sort=True) assert 'A' in partial_appended # mixed type handling @@ -738,8 +738,9 @@ def test_append(self): tm.assert_frame_equal(appended, self.mixed_frame) # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) - mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=True) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:], + sort=True) # all equal except 'foo' column tm.assert_frame_equal( @@ -772,7 +773,7 @@ def test_append(self): def test_append_length0_frame(self): df = DataFrame(columns=['A', 'B', 'C']) df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3) + df5 = df.append(df3, sort=True) expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) assert_frame_equal(df5, expected) @@ -793,6 +794,31 @@ def test_append_records(self): expected = DataFrame(np.concatenate((arr1, arr2))) assert_frame_equal(result, expected) + def test_append_sorts(self): + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3]) + # default, changing in the future + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # from append we have an extra function call. Not worth hacking + # around to get the right stacklevel. 
+ result = df1.append(df2) + + expected = pd.DataFrame({"b": [1, 2, None, None], + "a": [1, 2, 1, 2], + "c": [None, None, 3, 4]}, + columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + # sort=True, the previous behavior + result = df1.append(df2, sort=True) + tm.assert_frame_equal(result, expected) + + # sort=False, the future behavior. + result = df1.append(df2, sort=False) + expected = expected[['b', 'a', 'c']] + tm.assert_frame_equal(result, expected) + def test_append_different_columns(self): df = DataFrame({'bools': np.random.randn(10) > 0, 'ints': np.random.randint(0, 10, 10), @@ -802,7 +828,7 @@ def test_append_different_columns(self): a = df[:5].loc[:, ['bools', 'ints', 'floats']] b = df[5:].loc[:, ['strings', 'ints', 'floats']] - appended = a.append(b) + appended = a.append(b, sort=True) assert isna(appended['strings'][0:4]).all() assert isna(appended['bools'][5:]).all() @@ -815,7 +841,7 @@ def test_append_many(self): chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:]) + result = chunks[0].append(chunks[1:], sort=True) tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame) assert (result['foo'][15:] == 'bar').all() assert result['foo'][:15].isna().all() @@ -956,7 +982,7 @@ def test_append_missing_column_proper_upcast(self): df2 = DataFrame({'B': np.array([True, False, True, False], dtype=bool)}) - appended = df1.append(df2, ignore_index=True) + appended = df1.append(df2, ignore_index=True, sort=True) assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' @@ -1052,7 +1078,7 @@ def test_concat_dataframe_keys_bug(self): 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2']) + result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=True) assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] def test_concat_series_partial_columns_names(self): @@ -1505,7 +1531,7 @@ def df(): panel3 = 
panel3.rename_axis(lambda x: '%s_1' % x, axis=2) # it works! - concat([panel1, panel3], axis=1, verify_integrity=True) + concat([panel1, panel3], axis=1, verify_integrity=True, sort=True) def test_concat_series(self): @@ -2164,7 +2190,7 @@ def test_concat_order(self): dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] - result = pd.concat(dfs).columns + result = pd.concat(dfs, sort=True).columns expected = dfs[0].columns tm.assert_index_equal(result, expected) From 04e51518c31d62da9ca7c95ce53388e1edec06e0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 16:15:44 -0500 Subject: [PATCH 17/34] versionadded --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index db312dc67c986..9c7a1e123dbc5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6059,6 +6059,8 @@ def append(self, other, ignore_index=False, silence the warning and sort. Explicitly pass ``sort=False`` to silence the warning and not sort. + .. 
versionadded:: 0.23.0 + Returns ------- appended : DataFrame From c864679d58ebcf02bed3fdf37c8b33e7945148ba Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 06:49:21 -0500 Subject: [PATCH 18/34] Squash more test warnings --- pandas/tests/frame/test_combine_concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index e82faaeef2986..15ca65395e4fc 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -96,7 +96,7 @@ def test_append_series_dict(self): result = df.append(series[::-1][:3], ignore_index=True) expected = df.append(DataFrame({0: series[::-1][:3]}).T, - ignore_index=True) + ignore_index=True, sort=True) assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set @@ -119,8 +119,8 @@ def test_append_list_of_series_dicts(self): # different columns dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] - result = df.append(dicts, ignore_index=True) - expected = df.append(DataFrame(dicts), ignore_index=True) + result = df.append(dicts, ignore_index=True, sort=True) + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected) def test_append_empty_dataframe(self): From 7e975c9649cf8401889f0fdbbba5d447217aa2a5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 07:22:04 -0500 Subject: [PATCH 19/34] py2 compat --- pandas/tests/reshape/test_concat.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 494d340d7f880..ae7a2c2c5f5fd 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -7,7 +7,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems +from 
pandas.compat import StringIO, iteritems, PY2 import pandas as pd from pandas import (DataFrame, concat, read_csv, isna, Series, date_range, @@ -2190,9 +2190,15 @@ def test_concat_order(self): dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) for i in range(100)] + result = pd.concat(dfs, sort=True).columns - expected = dfs[0].columns + if PY2: + # Different sort order between incomparable objects between + # python 2 and python3 via Index.union. + expected = dfs[1].columns + else: + expected = dfs[0].columns tm.assert_index_equal(result, expected) def test_concat_datetime_timezone(self): From a8ba4307bb051ebbd9e6caa8251236dd9af3be85 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 10:02:15 -0500 Subject: [PATCH 20/34] Document outer is not affected --- pandas/core/reshape/concat.py | 14 +++++++++----- pandas/tests/reshape/test_concat.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4879e32d8348b..b36e9b8d900fd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -61,11 +61,15 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation sort : boolean, default None - Sort non-concatenation axis if it is not already aligned. The current - default of sorting is deprecated and will change to not-sorting in a - future version of pandas. Explicitly pass ``sort=True`` to silence - the warning and sort. Explicitly pass ``sort=False`` to silence the - warning and not sort. + Sort non-concatenation axis if it is not already aligned when `join` + is 'outer'. The current default of sorting is deprecated and will + change to not-sorting in a future version of pandas. 
+ + Explicitly pass ``sort=True`` to silence the warning and sort. + Explicitly pass ``sort=False`` to silence the warning and not sort. + + This has no effect when ``join='inner'``, which already preserves + the order of the non-concatenation axis. .. versionadded:: 0.23.0 diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ae7a2c2c5f5fd..f74d652ae012c 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2311,6 +2311,23 @@ def test_concat_sorts_index(): tm.assert_frame_equal(result, expected) +def test_concat_inner_sort_unaffected(): + # https://github.com/pandas-dev/pandas/pull/20613 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, + columns=['b', 'a', 'c']) + df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) + with tm.assert_produces_warning(None): + r0 = pd.concat([df1, df2], join='inner', ignore_index=True) + r1 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) + r2 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) + + expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, + columns=['b', 'a']) + tm.assert_frame_equal(r0, expected) + tm.assert_frame_equal(r1, expected) + tm.assert_frame_equal(r2, expected) + + def test_concat_preserve_column_order_differing_columns(): # GH 4588 regression test # for new columns in concat From 62b1e7bd124a59349dfee80458d4956b78fe2359 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 10:08:31 -0500 Subject: [PATCH 21/34] Docs --- doc/source/merging.rst | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 74b21c21252ec..de37dc6dab59e 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -153,10 +153,10 @@ Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ When gluing together multiple DataFrames, you have a choice of how to handle -the other axes (other 
than the one being concatenated). This can be done in +the other axes (other than the one being concatenated). This can be done in the following three ways: -- Take the (sorted) union of them all, ``join='outer'``. This is the default +- Take the union of them all, ``join='outer'``. This is the default option as it results in zero information loss. - Take the intersection, ``join='inner'``. - Use a specific index, as passed to the ``join_axes`` argument. @@ -167,10 +167,10 @@ behavior: .. ipython:: python df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], - 'D': ['D2', 'D3', 'D6', 'D7'], - 'F': ['F2', 'F3', 'F6', 'F7']}, - index=[2, 3, 6, 7]) - result = pd.concat([df1, df4], axis=1) + 'D': ['D2', 'D3', 'D6', 'D7'], + 'F': ['F2', 'F3', 'F6', 'F7']}, + index=[2, 3, 6, 7]) + result = pd.concat([df1, df4], axis=1, sort=False) .. ipython:: python @@ -181,8 +181,14 @@ behavior: labels=['df1', 'df4'], vertical=False); plt.close('all'); -Note that the row indexes have been unioned and sorted. Here is the same thing -with ``join='inner'``: +.. versionchanged:: 0.23.0 + + The default behavior with ``join='outer'`` is to sort the other axis + (columns in this case). In a future version of pandas, the default will + be to not sort. We specified ``sort=False`` to opt in to the new + behavior now. + +Here is the same thing with ``join='inner'``: .. 
ipython:: python From 0ace673e72da77d4ffceeca6f96eb5a0e0b3d7ec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 28 Apr 2018 13:32:04 -0500 Subject: [PATCH 22/34] Sort for intersection --- doc/source/merging.rst | 4 +++- doc/source/whatsnew/v0.23.0.txt | 2 ++ pandas/core/indexes/api.py | 7 +++++-- pandas/tests/reshape/test_concat.py | 24 ++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index de37dc6dab59e..1161656731f88 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -181,7 +181,9 @@ behavior: labels=['df1', 'df4'], vertical=False); plt.close('all'); -.. versionchanged:: 0.23.0 +.. warning:: + + .. versionchanged:: 0.23.0 The default behavior with ``join='outer'`` is to sort the other axis (columns in this case). In a future version of pandas, the default will diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 241fb46df7f25..6c2b64f402f0a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -639,6 +639,8 @@ Returning a ``Series`` allows one to control the exact return structure and colu df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) +.. 
_whatsnew_0230.api_breaking.concat: + Concatenation will no longer sort ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index b919c8ab9a23f..f345c21b2f2f0 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -60,7 +60,10 @@ def _get_combined_index(indexes, intersect=False, sort=True): if len(indexes) == 0: return Index([]) if len(indexes) == 1: - return indexes[0] + index = indexes[0] + if sort: + index = index.sort_values() + return index if intersect: index = indexes[0] for other in indexes[1:]: @@ -115,7 +118,7 @@ def conv(i): if name != index.name: index = index._shallow_copy(name=name) return index - else: # kind='list + else: # kind='list' return _unique_indices(indexes) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index f74d652ae012c..fc91647321a05 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2355,3 +2355,27 @@ def test_concat_preserve_column_order_uneven_data(): 'c': [1, 2, 3, None, None] }, index=[0, 1, 2, 0, 1]) tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort(): + # GH-4588 + df = pd.DataFrame({"b": [1, 2], "a": [3, 4]}, columns=['b', 'a']) + result = pd.concat([df, df], sort=True, ignore_index=True) + expected = pd.DataFrame({'b': [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=['a', 'b']) + tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort_raises(): + # GH-4588 + df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) + + if PY2: + expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=[1, 'a']) + result = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + else: + msg = "'<' not supported between instances" + with tm.assert_raises_regex(TypeError, msg): + pd.concat([df, df], sort=True) From d5cafdf95cb43fadaf59a2e0fd21494b0a30ae41 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 
28 Apr 2018 14:46:11 -0500 Subject: [PATCH 23/34] More tests --- pandas/core/indexes/api.py | 22 ++-- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/reshape/test_concat.py | 126 +++++++++++++---------- 3 files changed, 88 insertions(+), 62 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f345c21b2f2f0..07ddbcc6fec18 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -58,19 +58,23 @@ def _get_combined_index(indexes, intersect=False, sort=True): # TODO: handle index names! indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: - return Index([]) - if len(indexes) == 1: + index = Index([]) + elif len(indexes) == 1: index = indexes[0] - if sort: - index = index.sort_values() - return index - if intersect: + elif intersect: index = indexes[0] for other in indexes[1:]: index = index.intersection(other) - return index - union = _union_indexes(indexes, sort=sort) - return _ensure_index(union) + else: + index = _union_indexes(indexes, sort=sort) + index = _ensure_index(index) + + if sort and not index.is_monotonic_increasing: + try: + index = index.sort_values() + except TypeError: + pass + return index def _union_indexes(indexes, sort=True): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4f68514e8fcaf..f3827ac251cf0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -826,7 +826,7 @@ def test_validation(self): # Dups on left left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, - index=[3])) + index=[3]), sort=True) merge(left_w_dups, right, left_index=True, right_index=True, validate='many_to_one') diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index fc91647321a05..67471bd2d2c6d 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -21,6 +21,12 @@ import pytest 
+@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param + + class ConcatenateBase(object): def setup_method(self, method): @@ -716,7 +722,7 @@ def test_concat_categorical_empty(self): class TestAppend(ConcatenateBase): - def test_append(self): + def test_append(self, sort): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -727,10 +733,10 @@ def test_append(self): tm.assert_almost_equal(appended['A'], self.frame['A']) del end_frame['A'] - partial_appended = begin_frame.append(end_frame, sort=True) + partial_appended = begin_frame.append(end_frame, sort=sort) assert 'A' in partial_appended - partial_appended = end_frame.append(begin_frame, sort=True) + partial_appended = end_frame.append(begin_frame, sort=sort) assert 'A' in partial_appended # mixed type handling @@ -738,9 +744,9 @@ def test_append(self): tm.assert_frame_equal(appended, self.mixed_frame) # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=True) + mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=sort) mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:], - sort=True) + sort=sort) # all equal except 'foo' column tm.assert_frame_equal( @@ -770,10 +776,10 @@ def test_append(self): result = df.append(row) tm.assert_frame_equal(result, expected) - def test_append_length0_frame(self): + def test_append_length0_frame(self, sort): df = DataFrame(columns=['A', 'B', 'C']) df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3, sort=True) + df5 = df.append(df3, sort=sort) expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) assert_frame_equal(df5, expected) @@ -819,7 +825,7 @@ def test_append_sorts(self): expected = expected[['b', 'a', 'c']] tm.assert_frame_equal(result, expected) - def test_append_different_columns(self): + def test_append_different_columns(self, sort): df = DataFrame({'bools': np.random.randn(10) > 0, 
'ints': np.random.randint(0, 10, 10), 'floats': np.random.randn(10), @@ -828,11 +834,11 @@ def test_append_different_columns(self): a = df[:5].loc[:, ['bools', 'ints', 'floats']] b = df[5:].loc[:, ['strings', 'ints', 'floats']] - appended = a.append(b, sort=True) + appended = a.append(b, sort=sort) assert isna(appended['strings'][0:4]).all() assert isna(appended['bools'][5:]).all() - def test_append_many(self): + def test_append_many(self, sort): chunks = [self.frame[:5], self.frame[5:10], self.frame[10:15], self.frame[15:]] @@ -841,7 +847,7 @@ def test_append_many(self): chunks[-1] = chunks[-1].copy() chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:], sort=True) + result = chunks[0].append(chunks[1:], sort=sort) tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame) assert (result['foo'][15:] == 'bar').all() assert result['foo'][:15].isna().all() @@ -949,7 +955,7 @@ def test_append_different_columns_types_raises( with pytest.raises(TypeError): df.append(ser) - def test_append_dtype_coerce(self): + def test_append_dtype_coerce(self, sort): # GH 4993 # appending with datetime will incorrectly convert datetime64 @@ -973,16 +979,21 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], name='start_time')], - axis=1, sort=True) - result = df1.append(df2, ignore_index=True, sort=True) + axis=1, sort=sort) + result = df1.append(df2, ignore_index=True, sort=sort) + if sort: + expected = expected[['end_time', 'start_time']] + else: + expected = expected[['start_time', 'end_time']] + assert_frame_equal(result, expected) - def test_append_missing_column_proper_upcast(self): + def test_append_missing_column_proper_upcast(self, sort): df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) df2 = DataFrame({'B': np.array([True, False, True, False], dtype=bool)}) - appended = df1.append(df2, ignore_index=True, sort=True) + appended = df1.append(df2, ignore_index=True, sort=sort) assert 
appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' @@ -1070,7 +1081,7 @@ def test_concat_keys_specific_levels(self): Index(level, name='group_key')) assert result.columns.names[0] == 'group_key' - def test_concat_dataframe_keys_bug(self): + def test_concat_dataframe_keys_bug(self, sort): t1 = DataFrame({ 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], name='id'))}) @@ -1078,7 +1089,7 @@ def test_concat_dataframe_keys_bug(self): 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=True) + result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort) assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] def test_concat_series_partial_columns_names(self): @@ -1124,7 +1135,7 @@ def test_concat_dict(self): expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) - def test_concat_ignore_index(self): + def test_concat_ignore_index(self, sort): frame1 = DataFrame({"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}) @@ -1133,7 +1144,7 @@ def test_concat_ignore_index(self): frame2.index = Index(["x", "y", "q"]) v1 = concat([frame1, frame2], axis=1, - ignore_index=True, sort=True) + ignore_index=True, sort=sort) nan = np.nan expected = DataFrame([[nan, nan, nan, 4.3], @@ -1141,6 +1152,8 @@ def test_concat_ignore_index(self): ['b', 2, 3.2, 2.2], ['c', 3, 1.2, nan]], index=Index(["q", "x", "y", "z"])) + if not sort: + expected = expected.loc[['x', 'y', 'z', 'q']] tm.assert_frame_equal(v1, expected) @@ -1337,16 +1350,16 @@ def test_dups_index(self): result = df.append(df) assert_frame_equal(result, expected) - def test_with_mixed_tuples(self): + def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) # it works - concat([df1, df2], 
sort=True) + concat([df1, df2], sort=sort) - def test_handle_empty_objects(self): + def test_handle_empty_objects(self, sort): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) baz = df[:5].copy() @@ -1354,7 +1367,7 @@ def test_handle_empty_objects(self): empty = df[5:5] frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0, sort=True) + concatted = concat(frames, axis=0, sort=sort) expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) expected['foo'] = expected['foo'].astype('O') @@ -1506,7 +1519,7 @@ def test_panel_concat_other_axes(self): expected.loc['ItemC', :, :2] = 'baz' tm.assert_panel_equal(result, expected) - def test_panel_concat_buglet(self): + def test_panel_concat_buglet(self, sort): with catch_warnings(record=True): # #2257 def make_panel(): @@ -1531,7 +1544,7 @@ def df(): panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) # it works! - concat([panel1, panel3], axis=1, verify_integrity=True, sort=True) + concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort) def test_concat_series(self): @@ -1556,7 +1569,7 @@ def test_concat_series(self): expected.index = exp_index tm.assert_series_equal(result, expected) - def test_concat_series_axis1(self): + def test_concat_series_axis1(self, sort=sort): ts = tm.makeTimeSeries() pieces = [ts[:-2], ts[2:], ts[2:-2]] @@ -1585,7 +1598,7 @@ def test_concat_series_axis1(self): # must reindex, #2603 s = Series(randn(3), index=['c', 'a', 'b'], name='A') s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1, sort=True) + result = concat([s, s2], axis=1, sort=sort) expected = DataFrame({'A': s, 'B': s2}) assert_frame_equal(result, expected) @@ -2071,7 +2084,7 @@ def test_categorical_concat_dtypes(self): expected = Series([True, False, False], index=index) tm.assert_series_equal(result, expected) - def test_categorical_concat(self): + def test_categorical_concat(self, sort): # See GH 10177 df1 = DataFrame(np.arange(18, 
dtype='int64').reshape(6, 3), columns=["a", "b", "c"]) @@ -2082,7 +2095,7 @@ def test_categorical_concat(self): cat_values = ["one", "one", "two", "one", "two", "two", "one"] df2['h'] = Series(Categorical(cat_values)) - res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=True) + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -2311,21 +2324,28 @@ def test_concat_sorts_index(): tm.assert_frame_equal(result, expected) -def test_concat_inner_sort_unaffected(): +@pytest.mark.parametrize('sort', [None, False, True]) +def test_concat_inner_sort(sort): # https://github.com/pandas-dev/pandas/pull/20613 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=['b', 'a', 'c']) df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) - with tm.assert_produces_warning(None): - r0 = pd.concat([df1, df2], join='inner', ignore_index=True) - r1 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) - r2 = pd.concat([df1, df2], join='inner', sort=True, ignore_index=True) + + if sort is None: + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, df2], sort=sort, join='inner', + ignore_index=True) + else: + result = pd.concat([df1, df2], sort=sort, join='inner', + ignore_index=True) expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=['b', 'a']) - tm.assert_frame_equal(r0, expected) - tm.assert_frame_equal(r1, expected) - tm.assert_frame_equal(r2, expected) + if sort: + expected = expected[['a', 'b']] + tm.assert_frame_equal(result, expected) def test_concat_preserve_column_order_differing_columns(): @@ -2359,23 +2379,25 @@ def test_concat_preserve_column_order_uneven_data(): def test_concat_aligned_sort(): # GH-4588 - df = pd.DataFrame({"b": [1, 2], "a": [3, 4]}, 
columns=['b', 'a']) + df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]}, + columns=['c', 'b', 'a']) result = pd.concat([df, df], sort=True, ignore_index=True) - expected = pd.DataFrame({'b': [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, - columns=['a', 'b']) + expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4], + 'c': [1, 2, 1, 2]}, + columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True, + ignore_index=True) + expected = expected[['b', 'c']] tm.assert_frame_equal(result, expected) -def test_concat_aligned_sort_raises(): +def test_concat_aligned_sort_does_not_raise(): # GH-4588 + # We catch TypeErrors from sorting internally and do not re-raise. df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) - - if PY2: - expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, - columns=[1, 'a']) - result = pd.concat([df, df], ignore_index=True) - tm.assert_frame_equal(result, expected) - else: - msg = "'<' not supported between instances" - with tm.assert_raises_regex(TypeError, msg): - pd.concat([df, df], sort=True) + expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, + columns=[1, 'a']) + result = pd.concat([df, df], ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) From ce8ff05da9c267f70d47f1911fb661c8c6e9de71 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 29 Apr 2018 06:29:26 -0500 Subject: [PATCH 24/34] Test fixup. 
Sparse as well --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/sparse/frame/test_frame.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 00ea96890dd27..4132d8e69704a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1099,7 +1099,7 @@ def reset_identity(values): result = concat(values, axis=self.axis, keys=group_keys, levels=group_levels, names=group_names, - sort=True) + sort=False) else: # GH5610, returns a MI, with the first level being a diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 540933cb90be2..8a8ed520d45d2 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -629,10 +629,31 @@ def test_append(self): a = self.frame.iloc[:5, :3] b = self.frame.iloc[5:] - appended = a.append(b) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # Stacklevel is set for pd.concat, not append + appended = a.append(b) tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) + a = a[['B', 'C', 'A']].head(2) + b = b.head(2) + + expected = pd.SparseDataFrame({ + "B": [0., 1, None, 3], + "C": [0., 1, 5, 6], + "A": [None, None, 2, 3], + "D": [None, None, 5, None], + }, index=a.index | b.index) + with tm.assert_produces_warning(None): + appended = a.append(b, sort=False) + + tm.assert_frame_equal(appended, expected) + + with tm.assert_produces_warning(None): + appended = a.append(b, sort=True) + + tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']]) + def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64), From ce756d4824909d481eb431e6afa404dfb20b15fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 29 Apr 2018 14:53:37 -0500 Subject: [PATCH 25/34] ugh --- pandas/core/indexes/api.py | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 07ddbcc6fec18..e5ab5e144086f 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -44,7 +44,7 @@ '_all_indexes_same'] -def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): +def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=False): # Extract combined index: return intersection or union (depending on the # value of "intersect") of indexes on given axis, or None if all objects # lack indexes (e.g. they are numpy arrays) @@ -54,7 +54,7 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_combined_index(indexes, intersect=False, sort=True): +def _get_combined_index(indexes, intersect=False, sort=False): # TODO: handle index names! indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: From 362e84d851c7f24368cf5cac469615c627d9b644 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 08:00:51 -0500 Subject: [PATCH 26/34] quoting --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6c2b64f402f0a..2591ab774e6cb 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -667,7 +667,7 @@ To accept the future behavior (no sorting), pass ``sort=False`` pd.concat([df1, df2], sort=False) -Note that this change also applies to :meth:`DataFrame.append`, which has also received a `sort` keyword for controlling this behavior. +Note that this change also applies to :meth:`DataFrame.append`, which has also received a ``sort`` keyword for controlling this behavior. .. 
_whatsnew_0230.api_breaking.build_changes: From 0210d3322ce9a2a2920bdda3c11a38b8e0d412fb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 08:03:08 -0500 Subject: [PATCH 27/34] Clarify --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2591ab774e6cb..94702f5e97264 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -645,7 +645,7 @@ Concatenation will no longer sort ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned. -The current behavior is the same as the previous (sorting), but now a warning is issued (:issue:`4588`). +The current behavior is the same as the previous (sorting), but now a warning is issued when ``sort`` is not specified and the non-concatenation axis is not aligned (:issue:`4588`). .. ipython:: python :okwarning: From 06772b407f4f1215e60ff271d3158db64c2a0685 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 08:15:13 -0500 Subject: [PATCH 28/34] Removed unnecessary check --- pandas/core/indexes/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index e5ab5e144086f..394181f5377d4 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -69,7 +69,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): index = _union_indexes(indexes, sort=sort) index = _ensure_index(index) - if sort and not index.is_monotonic_increasing: + if sort: try: index = index.sort_values() except TypeError: From e47cbb957b8d29b962df65fe1dde2de22be9c6d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 09:15:13 -0500 Subject: [PATCH 29/34] Prune tests --- pandas/tests/reshape/test_concat.py | 130 ++++++++++++++-------------- 1 file changed, 65
insertions(+), 65 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 67471bd2d2c6d..57af67422d65f 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -27,6 +27,16 @@ def sort(request): return request.param +@pytest.fixture(params=[True, False, None]) +def sort_with_none(request): + """Boolean sort keyword for concat and DataFrame.append. + + Includes the default of None + """ + # TODO: Replace with sort once keyword changes. + return request.param + + class ConcatenateBase(object): def setup_method(self, method): @@ -800,29 +810,30 @@ def test_append_records(self): expected = DataFrame(np.concatenate((arr1, arr2))) assert_frame_equal(result, expected) - def test_append_sorts(self): + # rewrite sort fixture, since we also want to test default of None + def test_append_sorts(self, sort_with_none): df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3]) - # default, changing in the future - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # from append we have an extra function call. Not worth hacking - # around to get the right stackleve. - result = df1.append(df2) + if sort_with_none is None: + # only warn if not explicitly specified + # don't check stacklevel since its set for concat, and append + # has an extra stack. + ctx = tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) + else: + ctx = tm.assert_produces_warning(None) + + with ctx: + result = df1.append(df2, sort=sort_with_none) + # for None / True expected = pd.DataFrame({"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # sort=True, the previous behavior - result = df1.append(df2, sort=True) - tm.assert_frame_equal(result, expected) - - # sort=False, the future behvior. 
- result = df1.append(df2, sort=False) - expected = expected[['b', 'a', 'c']] + if sort_with_none is False: + expected = expected[['b', 'a', 'c']] tm.assert_frame_equal(result, expected) def test_append_different_columns(self, sort): @@ -2297,86 +2308,75 @@ def test_concat_empty_and_non_empty_series_regression(): tm.assert_series_equal(result, expected) -def test_concat_sort_columns(): +def test_concat_sorts_columns(sort_with_none): # GH-4588 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) - df2 = pd.DataFrame({"a": [3, 4]}) + df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) + # for sort=True/None expected = pd.DataFrame({"a": [1, 2, 3, 4], - "b": [1, 2, None, None]}, - columns=['a', 'b']) - with tm.assert_produces_warning(FutureWarning): - result = pd.concat([df1, df2], ignore_index=True) + "b": [1, 2, None, None], + "c": [None, None, 5, 6]}, + columns=['a', 'b', 'c']) + + if sort_with_none is False: + expected = expected[['b', 'a', 'c']] + if sort_with_none is None: + # only warn if not explicitly specified + ctx = tm.assert_produces_warning(FutureWarning) + else: + ctx = tm.assert_produces_warning(None) + + # default + with ctx: + result = pd.concat([df1, df2], ignore_index=True, sort=sort_with_none) tm.assert_frame_equal(result, expected) -def test_concat_sorts_index(): +def test_concat_sorts_index(sort_with_none): df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b']) df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b']) - with tm.assert_produces_warning(FutureWarning): - result = pd.concat([df1, df2], axis=1) - + # For True/None expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]}, index=['a', 'b', 'c'], columns=['a', 'b']) + if sort_with_none is False: + expected = expected.loc[['c', 'a', 'b']] + + if sort_with_none is None: + # only warn if not explicitly specified + ctx = tm.assert_produces_warning(FutureWarning) + else: + ctx = tm.assert_produces_warning(None) + + # Warn and sort by default + with ctx: + result = pd.concat([df1, 
df2], axis=1, sort=sort_with_none) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('sort', [None, False, True]) -def test_concat_inner_sort(sort): +def test_concat_inner_sort(sort_with_none): # https://github.com/pandas-dev/pandas/pull/20613 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=['b', 'a', 'c']) df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) - if sort is None: - with tm.assert_produces_warning(None): - # unset sort should *not* warn for inner join - # since that never sorted - result = pd.concat([df1, df2], sort=sort, join='inner', - ignore_index=True) - else: - result = pd.concat([df1, df2], sort=sort, join='inner', + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, df2], sort=sort_with_none, + join='inner', ignore_index=True) expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=['b', 'a']) - if sort: + if sort_with_none is True: expected = expected[['a', 'b']] tm.assert_frame_equal(result, expected) -def test_concat_preserve_column_order_differing_columns(): - # GH 4588 regression test - # for new columns in concat - dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) - dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) - result = pd.concat([dfa, dfb], ignore_index=True, sort=True) - - expected = pd.DataFrame({"A": [2, None], "C": [1, 5], - "Z": [None, 6]}, columns=["A", "C", "Z"]) - tm.assert_frame_equal(result, expected) - - -def test_concat_preserve_column_order_uneven_data(): - # GH 4588 regression test - # add to column, concat with uneven data - df = pd.DataFrame() - df['b'] = [1, 2, 3] - df['c'] = [1, 2, 3] - df['a'] = [1, 2, 3] - df2 = pd.DataFrame({'a': [4, 5]}) - result = pd.concat([df, df2], sort=True) - expected = pd.DataFrame({ - 'a': [1, 2, 3, 4, 5], - 'b': [1, 2, 3, None, None], - 'c': [1, 2, 3, None, None] - }, index=[0, 1, 2, 0, 1]) - tm.assert_frame_equal(result, 
expected) - - def test_concat_aligned_sort(): # GH-4588 df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]}, From 0182c98eb50f1871c83e968e998dc95cf0be981f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 12:42:27 -0500 Subject: [PATCH 30/34] Default sort --- pandas/core/indexes/api.py | 2 +- pandas/core/panel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 394181f5377d4..f9501cd2f9ddf 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -44,7 +44,7 @@ '_all_indexes_same'] -def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=False): +def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): # Extract combined index: return intersection or union (depending on the # value of "intersect") of indexes on given axis, or None if all objects # lack indexes (e.g. they are numpy arrays) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index e08d0a7368ccb..ffda5e095a38a 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1500,7 +1500,7 @@ def _extract_axis(self, data, axis=0, intersect=False): if have_frames: index = _get_objs_combined_axis(data.values(), axis=axis, - intersect=intersect) + intersect=intersect, sort=True) if have_raw_arrays: lengths = list(set(raw_lengths)) From 7e589989bd7c6e8c8bf2beb9ce995fd2c8f4d5f6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 13:14:24 -0500 Subject: [PATCH 31/34] Make both tests happy --- pandas/core/panel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index ffda5e095a38a..16e64192fdb20 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1499,8 +1499,11 @@ def _extract_axis(self, data, axis=0, intersect=False): raw_lengths.append(v.shape[axis]) if have_frames: + # we want the "old" behavior here, of sorting only + # 1. 
we're doing a union (intersect=False) + # 2. the indices are not aligned. index = _get_objs_combined_axis(data.values(), axis=axis, - intersect=intersect, sort=True) + intersect=intersect, sort=None) if have_raw_arrays: lengths = list(set(raw_lengths)) From 5b58e7534deee22cea3f1aee4e33bf565e2281d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 14:22:03 -0500 Subject: [PATCH 32/34] Explicit columns --- pandas/tests/sparse/frame/test_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 8a8ed520d45d2..9cc615e15564f 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -643,7 +643,7 @@ def test_append(self): "C": [0., 1, 5, 6], "A": [None, None, 2, 3], "D": [None, None, 5, None], - }, index=a.index | b.index) + }, index=a.index | b.index, columns=['B', 'C', 'A', 'D']) with tm.assert_produces_warning(None): appended = a.append(b, sort=False) From 074d03c091dfde8022bc8541e69a61d4555a3cb8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 16:28:12 -0500 Subject: [PATCH 33/34] List of series --- pandas/core/frame.py | 2 +- pandas/tests/frame/test_constructors.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a162cf66e3ec..d475d8b944575 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7491,7 +7491,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): from pandas.core.index import _get_objs_combined_axis if columns is None: - columns = _get_objs_combined_axis(data) + columns = _get_objs_combined_axis(data, sort=False) indexer_cache = {} diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 47b7d60e3b6e8..6dd38187f7277 100644 --- a/pandas/tests/frame/test_constructors.py +++ 
b/pandas/tests/frame/test_constructors.py @@ -1071,6 +1071,17 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient='index') tm.assert_frame_equal(result, expected) + def test_constructor_list_of_series_aligned_index(self): + series = [pd.Series(i, index=['b', 'a', 'c'], name=str(i)) + for i in range(3)] + result = pd.DataFrame(series) + expected = pd.DataFrame({'b': [0, 1, 2], + 'a': [0, 1, 2], + 'c': [0, 1, 2]}, + columns=['b', 'a', 'c'], + index=['0', '1', '2']) + tm.assert_frame_equal(result, expected) + def test_constructor_list_of_derived_dicts(self): class CustomDict(dict): pass From 5e1b0241358c10893939d135e91ab16cfa48cadd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Apr 2018 19:19:46 -0500 Subject: [PATCH 34/34] test, fix pivot --- pandas/core/reshape/pivot.py | 3 ++- pandas/tests/reshape/test_pivot.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 74a9b59d3194a..96f8a53b4d253 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -437,7 +437,8 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') - common_idx = _get_objs_combined_axis(index + columns, intersect=True) + common_idx = _get_objs_combined_axis(index + columns, intersect=True, + sort=False) data = {} data.update(zip(rownames, index)) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1004b40bfb4c1..db287a719ae1e 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1724,3 +1724,15 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + + def test_crosstab_unsorted_order(self): + df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]}, + index=['C', 'A', 'B']) + 
result = pd.crosstab(df.index, [df.b, df.a]) + e_idx = pd.Index(['A', 'B', 'C'], name='row_0') + e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], + names=['b', 'a']) + expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]], + index=e_idx, + columns=e_columns) + tm.assert_frame_equal(result, expected)