BUG: fix union_indexes not supporting sort=False for Index subclasses (#35098)

AlexKirko · web-flow · commit c21be0562a33 · 2020-07-09T09:02:32.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -1117,6 +1117,7 @@ Reshaping
 - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`)
 - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`)
 - Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`)
+- Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -214,7 +214,13 @@ def conv(i):
             return result.union_many(indexes[1:])
         else:
             for other in indexes[1:]:
-                result = result.union(other)
+                # GH 35092. Index.union expects sort=None rather than sort=True:
+                # sort=True isn't fully implemented there, and the legacy
+                # implementation sometimes might not sort (see GH 24959).
+                # In this case we currently sort in _get_combined_index.
+                if sort:
+                    sort = None
+                result = result.union(other, sort=sort)
             return result
     elif kind == "array":
         index = indexes[0]
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2542,11 +2542,13 @@ def test_construct_with_two_categoricalindex_series(self):
             index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]),
         )
         result = DataFrame([s1, s2])
+        # GH 35092. Extra s2 columns are now appended to s1 columns
+        # in original order
         expected = DataFrame(
             np.array(
-                [[np.nan, 39.0, np.nan, 6.0, 4.0], [2.0, 152.0, 2.0, 242.0, 150.0]]
+                [[39.0, 6.0, 4.0, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]]
             ),
-            columns=["f", "female", "m", "male", "unknown"],
+            columns=["female", "male", "unknown", "f", "m"],
         )
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
@@ -13,8 +13,9 @@
 from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion
 
 import pandas as pd
-from pandas import CategoricalIndex, MultiIndex, RangeIndex
+from pandas import CategoricalIndex, Index, MultiIndex, RangeIndex
 import pandas._testing as tm
+from pandas.core.indexes.api import union_indexes
 
 
 class TestCommon:
@@ -395,3 +396,18 @@ def test_astype_preserves_name(self, index, dtype, copy):
             assert result.names == index.names
         else:
             assert result.name == index.name
+
+
+@pytest.mark.parametrize("arr", [[0, 1, 4, 3]])
+@pytest.mark.parametrize("dtype", ["int8", "int16", "int32", "int64"])
+def test_union_index_no_sort(arr, sort, dtype):
+    # GH 35092. Check that we don't sort with sort=False
+    ind1 = Index(arr[:2], dtype=dtype)
+    ind2 = Index(arr[2:], dtype=dtype)
+
+    # sort=None means the combined index is sorted. Build a sorted copy:
+    # ``arr`` is shared by every parametrized case and must not be mutated.
+    values = sorted(arr) if sort is None else arr
+    expected = Index(values, dtype=dtype)
+    result = union_indexes([ind1, ind2], sort=sort)
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -2857,3 +2857,17 @@ def test_concat_frame_axis0_extension_dtypes():
     result = pd.concat([df2, df1], ignore_index=True)
     expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("sort", [True, False])
+def test_append_sort(sort):
+    # GH 35092. Check that DataFrame.append respects the sort argument.
+    df1 = pd.DataFrame(data={0: [1, 2], 1: [3, 4]})
+    df2 = pd.DataFrame(data={3: [1, 2], 2: [3, 4]})
+    cols = list(df1.columns) + list(df2.columns)  # df1's columns, then df2's
+    if sort:
+        cols.sort()
+
+    result = df1.append(df2, sort=sort).columns
+    expected = type(result)(cols)  # build with the same Index class as result
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
@@ -691,11 +691,11 @@ def test_unbalanced(self):
         )
         df["id"] = df.index
         exp_data = {
-            "X": ["X1", "X1", "X2", "X2"],
-            "A": [1.0, 3.0, 2.0, 4.0],
-            "B": [5.0, np.nan, 6.0, np.nan],
-            "id": [0, 0, 1, 1],
-            "year": [2010, 2011, 2010, 2011],
+            "X": ["X1", "X2", "X1", "X2"],
+            "A": [1.0, 2.0, 3.0, 4.0],
+            "B": [5.0, 6.0, np.nan, np.nan],
+            "id": [0, 1, 0, 1],
+            "year": [2010, 2010, 2011, 2011],
         }
         expected = pd.DataFrame(exp_data)
         expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
@@ -938,10 +938,10 @@ def test_nonnumeric_suffix(self):
         )
         expected = pd.DataFrame(
             {
-                "A": ["X1", "X1", "X2", "X2"],
-                "colname": ["placebo", "test", "placebo", "test"],
-                "result": [5.0, np.nan, 6.0, np.nan],
-                "treatment": [1.0, 3.0, 2.0, 4.0],
+                "A": ["X1", "X2", "X1", "X2"],
+                "colname": ["placebo", "placebo", "test", "test"],
+                "result": [5.0, 6.0, np.nan, np.nan],
+                "treatment": [1.0, 2.0, 3.0, 4.0],
             }
         )
         expected = expected.set_index(["A", "colname"])
@@ -985,10 +985,10 @@ def test_float_suffix(self):
         )
         expected = pd.DataFrame(
             {
-                "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"],
-                "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
-                "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
-                "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0],
+                "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"],
+                "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1],
+                "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan],
+                "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0],
             }
         )
         expected = expected.set_index(["A", "colname"])
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -636,8 +636,15 @@ def test_str_cat_align_mixed_inputs(self, join):
         # mixed list of indexed/unindexed
         u = np.array(["A", "B", "C", "D"])
         expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
+
         # joint index of rhs [t, u]; u will be forced have index of s
-        rhs_idx = t.index & s.index if join == "inner" else t.index | s.index
+        # GH 35092. If right join, maintain order of t.index
+        if join == "inner":
+            rhs_idx = t.index & s.index
+        elif join == "right":
+            rhs_idx = t.index.union(s.index, sort=False)
+        else:
+            rhs_idx = t.index | s.index
 
         expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
         result = s.str.cat([t, u], join=join, na_rep="-")

Original file line number	Diff line number	Diff line change
`@@ -691,11 +691,11 @@ def test_unbalanced(self):`
`691`	`691`	`)`
`692`	`692`	`df["id"] = df.index`
`693`	`693`	`exp_data = {`
`694`		`- "X": ["X1", "X1", "X2", "X2"],`
`695`		`- "A": [1.0, 3.0, 2.0, 4.0],`
`696`		`- "B": [5.0, np.nan, 6.0, np.nan],`
`697`		`- "id": [0, 0, 1, 1],`
`698`		`- "year": [2010, 2011, 2010, 2011],`
	`694`	`+ "X": ["X1", "X2", "X1", "X2"],`
	`695`	`+ "A": [1.0, 2.0, 3.0, 4.0],`
	`696`	`+ "B": [5.0, 6.0, np.nan, np.nan],`
	`697`	`+ "id": [0, 1, 0, 1],`
	`698`	`+ "year": [2010, 2010, 2011, 2011],`
`699`	`699`	`}`
`700`	`700`	`expected = pd.DataFrame(exp_data)`
`701`	`701`	`expected = expected.set_index(["id", "year"])[["X", "A", "B"]]`
`@@ -938,10 +938,10 @@ def test_nonnumeric_suffix(self):`
`938`	`938`	`)`
`939`	`939`	`expected = pd.DataFrame(`
`940`	`940`	`{`
`941`		`- "A": ["X1", "X1", "X2", "X2"],`
`942`		`- "colname": ["placebo", "test", "placebo", "test"],`
`943`		`- "result": [5.0, np.nan, 6.0, np.nan],`
`944`		`- "treatment": [1.0, 3.0, 2.0, 4.0],`
	`941`	`+ "A": ["X1", "X2", "X1", "X2"],`
	`942`	`+ "colname": ["placebo", "placebo", "test", "test"],`
	`943`	`+ "result": [5.0, 6.0, np.nan, np.nan],`
	`944`	`+ "treatment": [1.0, 2.0, 3.0, 4.0],`
`945`	`945`	`}`
`946`	`946`	`)`
`947`	`947`	`expected = expected.set_index(["A", "colname"])`
`@@ -985,10 +985,10 @@ def test_float_suffix(self):`
`985`	`985`	`)`
`986`	`986`	`expected = pd.DataFrame(`
`987`	`987`	`{`
`988`		`- "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"],`
`989`		`- "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],`
`990`		`- "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],`
`991`		`- "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0],`
	`988`	`+ "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"],`
	`989`	`+ "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1],`
	`990`	`+ "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan],`
	`991`	`+ "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0],`
`992`	`992`	`}`
`993`	`993`	`)`
`994`	`994`	`expected = expected.set_index(["A", "colname"])`