Backport PR #47206 on branch 1.4.x (REGR: concat not sorting columns for mixed column names) (#47251)

meeseeksmachine · phofl · web-flow · commit 75a799cbf108 · 2022-06-06T08:54:45.000+01:00
Backport PR #47206: REGR: concat not sorting columns for mixed column names

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst
@@ -17,6 +17,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`)
 - Fixed regression in :meth:`DataFrame.nsmallest` that led to wrong results when ``np.nan`` was in the sorting column (:issue:`46589`)
 - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
+- Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`)
 - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`)
 - Fixed regression in :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`)
 - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`)
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
 import textwrap
+from typing import cast
+
+import numpy as np
 
 from pandas._libs import (
     NaT,
@@ -10,6 +13,7 @@
 
 from pandas.core.dtypes.common import is_dtype_equal
 
+from pandas.core.algorithms import safe_sort
 from pandas.core.indexes.base import (
     Index,
     _new_Index,
@@ -154,7 +158,12 @@ def _get_combined_index(
 
     if sort:
         try:
-            index = index.sort_values()
+            array_sorted = safe_sort(index)
+            array_sorted = cast(np.ndarray, array_sorted)
+            if isinstance(index, MultiIndex):
+                index = MultiIndex.from_tuples(array_sorted, names=index.names)
+            else:
+                index = Index(array_sorted, name=index.name)
         except TypeError:
             pass
 
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
@@ -205,3 +205,26 @@ def test_concat_copies(self, axis, order, ignore_index):
         for arr in res._iter_column_arrays():
             for arr2 in df._iter_column_arrays():
                 assert not np.shares_memory(arr, arr2)
+
+    def test_outer_sort_columns(self):
+        # GH#47127
+        df1 = DataFrame({"A": [0], "B": [1], 0: 1})
+        df2 = DataFrame({"A": [100]})
+        result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
+        expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_inner_sort_columns(self):
+        # GH#47127
+        df1 = DataFrame({"A": [0], "B": [1], 0: 1})
+        df2 = DataFrame({"A": [100], 0: 2})
+        result = concat([df1, df2], ignore_index=True, join="inner", sort=True)
+        expected = DataFrame({0: [1, 2], "A": [0, 100]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_sort_columns_one_df(self):
+        # GH#47127
+        df1 = DataFrame({"A": [100], 0: 2})
+        result = concat([df1], ignore_index=True, join="inner", sort=True)
+        expected = DataFrame({0: [2], "A": [100]})
+        tm.assert_frame_equal(result, expected)