From 2d055fdf3249d6d2acdc72a868f9c2235f556ffd Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 3 Jun 2022 14:06:32 +0200 Subject: [PATCH 1/5] REGR: concat not sorting columns for mixed column names --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/indexes/api.py | 7 +++++- pandas/tests/reshape/concat/test_dataframe.py | 23 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 54cad82366e43..5898e51ab5f52 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) +- Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) - Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 922c344510375..ea83bd933d37b 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -10,6 +10,7 @@ from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.algorithms import safe_sort from pandas.core.indexes.base import ( Index, _new_Index, @@ -154,7 +155,11 @@ def _get_combined_index( if sort: try: - index = index.sort_values() + index_sorted = safe_sort(index) + if isinstance(index, MultiIndex): + index = MultiIndex.from_tuples(index_sorted, names=index.names) + else: + index = Index(index_sorted, name=index.name) except TypeError: pass diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 01763926c6d89..1018fc2806fee 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -205,3 +205,26 @@ def test_concat_copies(self, axis, order, ignore_index): for arr in res._iter_column_arrays(): for arr2 in df._iter_column_arrays(): assert not np.shares_memory(arr, arr2) + + def test_outer_sort_columns(self): + # GH#47127 + df1 = DataFrame({"A": [0], "B": [1], 0: 1}) + df2 = DataFrame({"A": [100]}) + result = concat([df1, df2], ignore_index=True, join="outer", sort=True) + expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]}) + tm.assert_frame_equal(result, expected) + + def test_inner_sort_columns(self): + # GH#47127 + df1 = DataFrame({"A": [0], "B": [1], 0: 1}) + df2 = DataFrame({"A": [100], 0: 2}) + result = concat([df1, df2], ignore_index=True, join="inner", sort=True) + expected = DataFrame({0: [1, 2], "A": [0, 100]}) + tm.assert_frame_equal(result, expected) + + def test_sort_columns_one_df(self): + # GH#47127 + df1 = DataFrame({"A": [100], 0: 2}) + result = concat([df1], ignore_index=True, join="inner", sort=True) + expected = DataFrame({0: [2], "A": [100]}) + tm.assert_frame_equal(result, expected) From 05757d55a16269bc67edae79d92e7eaa70d1d307 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 3 Jun 2022 14:10:24 +0200 Subject: [PATCH 2/5] Fix none in columns --- pandas/core/algorithms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 888e943488953..cf73fd7c8929e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1771,9 +1771,12 @@ def safe_sort( def _sort_mixed(values) -> np.ndarray: """order ints before strings in 1d arrays, safe in py3""" str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) + none_pos = np.array([x is None for x in values], dtype=bool) + nums = np.sort(values[~str_pos & ~none_pos]) strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) + return np.concatenate( + [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])] + ) def _sort_tuples(values: np.ndarray) -> np.ndarray: From cf20cc990f079601dc9a73e070266d5d0a8d5d1c Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 13 Jun 2022 12:38:14 +0200 Subject: [PATCH 3/5] BUG: concat not sorting column names when None is included --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/tests/reshape/concat/test_concat.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 55bfb044fb31d..325e2e2109888 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -902,6 +902,7 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`) - Bug in concanenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) +- Bug in :func:`concat` not sorting the column names when ``None`` is included (:issue:` - Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) - Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`) - Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` results in sorted index (:issue:`17041`) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index eb44b4889afb8..382717c1b70e8 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -471,10 +471,10 @@ def __iter__(self): def test_concat_order(self): # GH 17344 dfs = [DataFrame(index=range(3), columns=["a", 1, None])] - dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100)] + dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for _ in range(100)] result = concat(dfs, sort=True).columns - expected = dfs[0].columns + expected = Index([1, "a", None]) tm.assert_index_equal(result, expected) def test_concat_different_extension_dtypes_upcasts(self): From a6c61c7a53a376619a5ec68bfed56d027fbb7832 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 14 Jun 2022 08:48:56 +0200 Subject: [PATCH 4/5] Update doc/source/whatsnew/v1.5.0.rst Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 325e2e2109888..48b12338b8d03 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -902,7 +902,7 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`) - Bug in concanenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) -- Bug in :func:`concat` not sorting the column names when ``None`` is included (:issue:` +- Bug in :func:`concat` not sorting the column names when ``None`` is included (:issue:`47331`) - Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) - Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`) - Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` results in sorted index (:issue:`17041`) From e749aa290a1f63e6df0a35e3a6ffe8b6cd39d952 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 08:49:34 +0200 Subject: [PATCH 5/5] Add gh reference --- pandas/tests/reshape/concat/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 382717c1b70e8..17c797fc36159 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -469,7 +469,7 @@ def __iter__(self): tm.assert_frame_equal(concat(CustomIterator2(), ignore_index=True), expected) def test_concat_order(self): - # GH 17344 + # GH 17344, GH#47331 dfs = [DataFrame(index=range(3), columns=["a", 1, None])] dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for _ in range(100)]