From 956e49ab1cb0c6887a313b7fc46d1b9010033f21 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 17 Aug 2023 23:04:37 -0400 Subject: [PATCH 1/6] BUG: merge producing inconsistent sort behavior --- pandas/_libs/join.pyi | 1 + pandas/_libs/join.pyx | 17 +++- pandas/core/indexes/base.py | 14 ++- pandas/core/reshape/merge.py | 5 +- pandas/tests/indexes/numeric/test_join.py | 6 +- pandas/tests/reshape/merge/test_merge.py | 101 +++++++++++++++++++--- pandas/tests/reshape/merge/test_multi.py | 18 ++-- 7 files changed, 127 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index 7ee649a55fd8f..c7761cbf8aba9 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -6,6 +6,7 @@ def inner_join( left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, + sort: bool = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def left_outer_join( left: np.ndarray, # const intp_t[:] diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 5929647468785..385a089a8c59d 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -23,7 +23,7 @@ from pandas._libs.dtypes cimport ( @cython.wraparound(False) @cython.boundscheck(False) def inner_join(const intp_t[:] left, const intp_t[:] right, - Py_ssize_t max_groups): + Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 intp_t[::1] left_sorter, right_sorter @@ -70,7 +70,20 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(left_sorter, left_indexer) _get_result_indexer(right_sorter, right_indexer) - return np.asarray(left_indexer), np.asarray(right_indexer) + if not sort: + # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + rev = np.empty(len(left), dtype=np.intp) + rev.put(np.asarray(left_sorter), np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + + return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev) + else: + return np.asarray(left_indexer), np.asarray(right_indexer) @cython.wraparound(False) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ee36a3515c4b3..6aff3c6ad94a6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4651,7 +4651,7 @@ def join( _validate_join_method(how) if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how) + return self._join_non_unique(other, how=how, sort=sort) elif not self.is_unique or not other.is_unique: if self.is_monotonic_increasing and other.is_monotonic_increasing: if not isinstance(self.dtype, IntervalDtype): @@ -4660,7 +4660,7 @@ def join( # go through object dtype for ea till engine is supported properly return self._join_monotonic(other, how=how) else: - return self._join_non_unique(other, how=how) + return self._join_non_unique(other, how=how, sort=sort) elif ( # GH48504: exclude MultiIndex to avoid going through MultiIndex._values self.is_monotonic_increasing @@ -4693,15 +4693,13 @@ def _join_via_get_indexer( elif how == "right": join_index = other elif how == "inner": - # TODO: sort=False here for backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.intersection(other, sort=False) + join_index = self.intersection(other, sort=sort) elif how == "outer": # TODO: sort=True here for backwards compat. It may # be better to use the sort parameter passed into join join_index = self.union(other) - if sort: + if sort and how in ["left", "right"]: join_index = join_index.sort_values() if join_index is self: @@ -4798,7 +4796,7 @@ def _join_multi(self, other: Index, how: JoinHow): @final def _join_non_unique( - self, other: Index, how: JoinHow = "left" + self, other: Index, how: JoinHow = "left", sort: bool = False ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]: from pandas.core.reshape.merge import get_join_indexers @@ -4806,7 +4804,7 @@ def _join_non_unique( assert self.dtype == other.dtype left_idx, right_idx = get_join_indexers( - [self._values], [other._values], how=how, sort=True + [self._values], [other._values], how=how, sort=sort ) mask = left_idx == -1 diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..21a8278b7ac5b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1678,6 +1678,9 @@ def get_join_indexers( elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) + if not sort and how == "outer": + sort = True + # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) @@ -1696,7 +1699,7 @@ def get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = {} - if how in ("left", "right"): + if how in ("inner", "left", "right"): kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py index 93ff6238b90ff..918d505216735 100644 --- a/pandas/tests/indexes/numeric/test_join.py +++ b/pandas/tests/indexes/numeric/test_join.py @@ -11,13 +11,13 @@ def test_join_non_unique(self): joined, lidx, ridx = left.join(left, return_indexers=True) - exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3]) tm.assert_index_equal(joined, exp_joined) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) + exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp) tm.assert_numpy_array_equal(lidx, exp_lidx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_inner(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 91510fae0a9b1..d8007789b56f7 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1462,13 +1462,14 @@ def test_merge_readonly(self): def _check_merge(x, y): for how in ["inner", "left", "outer"]: - result = x.join(y, how=how) + for sort in [True, False]: + result = x.join(y, how=how, sort=sort) - expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) - expected = expected.set_index("index") + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=sort) + expected = expected.set_index("index") - # TODO check_names on merge? - tm.assert_frame_equal(result, expected, check_names=False) + # TODO check_names on merge? + tm.assert_frame_equal(result, expected, check_names=False) class TestMergeDtypes: @@ -1751,7 +1752,7 @@ def test_merge_string_dtype(self, how, expected_data, any_string_dtype): "how, expected_data", [ ("inner", [[True, 1, 4], [False, 5, 3]]), - ("outer", [[True, 1, 4], [False, 5, 3]]), + ("outer", [[False, 5, 3], [True, 1, 4]]), ("left", [[True, 1, 4], [False, 5, 3]]), ("right", [[False, 5, 3], [True, 1, 4]]), ], @@ -2331,9 +2332,9 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): "outer", DataFrame( { - "A": [100, 200, 1, 300], - "B1": [60, 70, 80, np.nan], - "B2": [600, 700, np.nan, 800], + "A": [1, 100, 200, 300], + "B1": [80, 60, 70, np.nan], + "B2": [np.nan, 600, 700, 800], } ), ), @@ -2752,9 +2753,9 @@ def test_merge_outer_with_NaN(dtype): result = merge(right, left, on="key", how="outer") expected = DataFrame( { - "key": [np.nan, np.nan, 1, 2], - "col2": [3, 4, np.nan, np.nan], - "col1": [np.nan, np.nan, 1, 2], + "key": [1, 2, np.nan, np.nan], + "col2": [np.nan, np.nan, 3, 4], + "col1": [1, 2, np.nan, np.nan], }, dtype=dtype, ) @@ -2847,3 +2848,79 @@ def test_merge_multiindex_single_level(): result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("on_index", [True, False]) +@pytest.mark.parametrize("left_unique", [True, False]) +@pytest.mark.parametrize("left_monotonic", [True, False]) +@pytest.mark.parametrize("right_unique", [True, False]) +@pytest.mark.parametrize("right_monotonic", [True, False]) +def test_merge_combinations( + how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic +): + # GH ##### + left = [2, 3] + if left_unique: + left.append(4 if left_monotonic else 1) + else: + left.append(3 if left_monotonic else 2) + + right = [2, 3] + if right_unique: + right.append(4 if right_monotonic else 1) + else: + right.append(3 if right_monotonic else 2) + + left = DataFrame({"key": left}) + right = DataFrame({"key": right}) + + if on_index: + left = left.set_index("key") + right = right.set_index("key") + on_kwargs = {"left_index": True, "right_index": True} + else: + on_kwargs = {"on": "key"} + + result = merge(left, right, how=how, sort=sort, **on_kwargs) + + if on_index: + left = left.reset_index() + right = right.reset_index() + + if how in ["left", "right", "inner"]: + if how in ["left", "inner"]: + expected, other, other_unique = left, right, right_unique + else: + expected, other, other_unique = right, left, left_unique + if how == "inner": + keep_values = set(left["key"].values).intersection(right["key"].values) + keep_mask = expected["key"].isin(keep_values) + expected = expected[keep_mask] + if sort: + expected = expected.sort_values("key") + if not other_unique: + other_value_counts = other["key"].value_counts() + repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) + expected = expected["key"].repeat(repeats.values) + expected = expected.to_frame() + elif how == "outer": + if on_index and left_unique and left["key"].equals(right["key"]): + expected = DataFrame({"key": left["key"]}) + else: + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1).astype( + "int64" + ) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") + + if on_index: + expected = expected.set_index("key") + else: + expected = expected.reset_index(drop=True) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 088d1e7e3c85e..ab010bdb909f1 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -741,10 +741,8 @@ def test_join_multi_levels2(self): expected = ( DataFrame( { - "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + "household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4], "asset_id": [ - "nl0000301109", - "nl0000301109", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", @@ -754,11 +752,11 @@ def test_join_multi_levels2(self): "lu0197800237", "lu0197800237", "nl0000289965", + "nl0000301109", + "nl0000301109", None, ], "t": [ - None, - None, 233, 234, 235, @@ -769,10 +767,10 @@ def test_join_multi_levels2(self): 181, None, None, + None, + None, ], "share": [ - 1.0, - 0.4, 0.6, 0.6, 0.6, @@ -783,10 +781,10 @@ def test_join_multi_levels2(self): 0.6, 0.25, 1.0, + 0.4, + 1.0, ], "log_return": [ - None, - None, 0.09604978, -0.06524096, 0.03532373, @@ -797,6 +795,8 @@ def test_join_multi_levels2(self): 0.036997, None, None, + None, + None, ], } ) From 05507bee31f41c510083067af658f6a04dfa09a0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 17 Aug 2023 23:23:07 -0400 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/tests/reshape/merge/test_merge.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4ad450c965464..d2b2689aae00b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -189,7 +189,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`merge` not following documented sort behavior in certain cases (:issue:`54611`) - Sparse diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d8007789b56f7..803faaa6ce05e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2860,7 +2860,7 @@ def test_merge_multiindex_single_level(): def test_merge_combinations( how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic ): - # GH ##### + # GH 54611 left = [2, 3] if left_unique: left.append(4 if left_monotonic else 1) From 040bc0d795a7fae6d5df69006119075e9a264ca9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 18 Aug 2023 06:39:11 -0400 Subject: [PATCH 3/6] fix more tests --- pandas/core/frame.py | 16 ++++++++-------- pandas/tests/extension/base/reshaping.py | 6 +++--- pandas/tests/strings/test_cat.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e10e8f11a575..01d3204cd580d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -418,10 +418,10 @@ lkey value_x rkey value_y 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns. @@ -431,10 +431,10 @@ lkey value_left rkey value_right 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2, but raise an exception if the DataFrames have any overlapping columns. diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 5d9c03e1b2569..a9bd12917e73e 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -236,9 +236,9 @@ def test_merge_on_extension_array_duplicates(self, data): result = pd.merge(df1, df2, on="key") expected = pd.DataFrame( { - "key": key.take([0, 0, 0, 0, 1]), - "val_x": [1, 1, 3, 3, 2], - "val_y": [1, 3, 1, 3, 2], + "key": key.take([0, 0, 1, 2, 2]), + "val_x": [1, 1, 2, 3, 3], + "val_y": [1, 3, 2, 1, 3], } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index a6303610b2037..3e620b7664335 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -106,7 +106,7 @@ def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep): # Series/Index with Series having different Index t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "aa", "bb", "bb"]) + expected = Index(["aa", "aa", "bb", "bb", "aa"]) expected = expected if box == Index else Series(expected, index=expected.str[:1]) result = s.str.cat(t, sep=sep) From 94ead41298475537d3d5bd427c9e22675663da9e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 18 Aug 2023 07:50:35 -0400 Subject: [PATCH 4/6] fix test --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 803faaa6ce05e..597cf43e2f46c 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2912,7 +2912,7 @@ def test_merge_combinations( left_counts = left["key"].value_counts() right_counts = right["key"].value_counts() expected_counts = left_counts.mul(right_counts, fill_value=1).astype( - "int64" + np.intp ) expected = expected_counts.index.values.repeat(expected_counts.values) expected = DataFrame({"key": expected}) From a405adec0babab3eae3727cbe704954d656ef0db Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 18 Aug 2023 12:12:28 -0400 Subject: [PATCH 5/6] fix test --- pandas/tests/reshape/merge/test_merge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 597cf43e2f46c..50a534ad36bcc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2903,6 +2903,7 @@ def test_merge_combinations( if not other_unique: other_value_counts = other["key"].value_counts() repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) + repeats = repeats.astype(np.intp) expected = expected["key"].repeat(repeats.values) expected = expected.to_frame() elif how == "outer": @@ -2911,9 +2912,8 @@ def test_merge_combinations( else: left_counts = left["key"].value_counts() right_counts = right["key"].value_counts() - expected_counts = left_counts.mul(right_counts, fill_value=1).astype( - np.intp - ) + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) expected = expected_counts.index.values.repeat(expected_counts.values) expected = DataFrame({"key": expected}) expected = expected.sort_values("key") From 189ca46da9582f92503b9227b7f34612aa56460f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 22 Aug 2023 18:14:37 -0400 Subject: [PATCH 6/6] move whatsnew to notable bug fixes --- doc/source/whatsnew/v2.2.0.rst | 47 +++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 75d9a84abc519..abc3f5f5cb135 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -39,10 +39,49 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix1: +.. _whatsnew_220.notable_bug_fixes.merge_sort_behavior: -notable_bug_fix1 -^^^^^^^^^^^^^^^^ +:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not +always return a result that followed the documented sort behavior. pandas now +follows the documented sort behavior in merge and join operations (:issue:`54611`). + +As documented, ``sort=True`` sorts the join keys lexicographically in the resulting +:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the +join type (``how`` keyword): + +- ``how="left"``: preserve the order of the left keys +- ``how="right"``: preserve the order of the right keys +- ``how="inner"``: preserve the order of the left keys +- ``how="outer"``: sort keys lexicographically + +One example with changing behavior is inner joins with non-unique left join keys +and ``sort=False``: + +.. ipython:: python + + left = pd.DataFrame({"a": [1, 2, 1]}) + right = pd.DataFrame({"a": [1, 2]}) + result = pd.merge(left, right, how="inner", on="a", sort=False) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + a + 0 1 + 1 1 + 2 2 + +*New Behavior* + +.. ipython:: python + + result .. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: @@ -197,7 +236,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- Bug in :func:`merge` not following documented sort behavior in certain cases (:issue:`54611`) +- - Sparse