diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e8e2b8d0ef908..abc3f5f5cb135 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -39,10 +39,49 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix1: +.. _whatsnew_220.notable_bug_fixes.merge_sort_behavior: -notable_bug_fix1 -^^^^^^^^^^^^^^^^ +:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not +always return a result that followed the documented sort behavior. pandas now +follows the documented sort behavior in merge and join operations (:issue:`54611`). + +As documented, ``sort=True`` sorts the join keys lexicographically in the resulting +:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the +join type (``how`` keyword): + +- ``how="left"``: preserve the order of the left keys +- ``how="right"``: preserve the order of the right keys +- ``how="inner"``: preserve the order of the left keys +- ``how="outer"``: sort keys lexicographically + +One example with changing behavior is inner joins with non-unique left join keys +and ``sort=False``: + +.. ipython:: python + + left = pd.DataFrame({"a": [1, 2, 1]}) + right = pd.DataFrame({"a": [1, 2]}) + result = pd.merge(left, right, how="inner", on="a", sort=False) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + a + 0 1 + 1 1 + 2 2 + +*New Behavior* + +.. ipython:: python + + result .. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index 7ee649a55fd8f..c7761cbf8aba9 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -6,6 +6,7 @@ def inner_join( left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, + sort: bool = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def left_outer_join( left: np.ndarray, # const intp_t[:] diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 5929647468785..385a089a8c59d 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -23,7 +23,7 @@ from pandas._libs.dtypes cimport ( @cython.wraparound(False) @cython.boundscheck(False) def inner_join(const intp_t[:] left, const intp_t[:] right, - Py_ssize_t max_groups): + Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 intp_t[::1] left_sorter, right_sorter @@ -70,7 +70,20 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(left_sorter, left_indexer) _get_result_indexer(right_sorter, right_indexer) - return np.asarray(left_indexer), np.asarray(right_indexer) + if not sort: + # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + rev = np.empty(len(left), dtype=np.intp) + rev.put(np.asarray(left_sorter), np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + + return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev) + else: + return np.asarray(left_indexer), np.asarray(right_indexer) @cython.wraparound(False) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 889e138177fae..c52f9cae78c91 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -418,10 +418,10 @@ lkey value_x rkey value_y 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns. @@ -431,10 +431,10 @@ lkey value_left rkey value_right 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2, but raise an exception if the DataFrames have any overlapping columns. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3d2e381fc52ce..a49db84450bb3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4633,7 +4633,7 @@ def join( _validate_join_method(how) if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how) + return self._join_non_unique(other, how=how, sort=sort) elif not self.is_unique or not other.is_unique: if self.is_monotonic_increasing and other.is_monotonic_increasing: # Note: 2023-08-15 we *do* have tests that get here with @@ -4645,7 +4645,7 @@ def join( # go through object dtype for ea till engine is supported properly return self._join_monotonic(other, how=how) else: - return self._join_non_unique(other, how=how) + return self._join_non_unique(other, how=how, sort=sort) elif ( # GH48504: exclude MultiIndex to avoid going through MultiIndex._values self.is_monotonic_increasing @@ -4679,15 +4679,13 @@ def _join_via_get_indexer( elif how == "right": join_index = other elif how == "inner": - # TODO: sort=False here for backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.intersection(other, sort=False) + join_index = self.intersection(other, sort=sort) elif how == "outer": # TODO: sort=True here for backwards compat. It may # be better to use the sort parameter passed into join join_index = self.union(other) - if sort: + if sort and how in ["left", "right"]: join_index = join_index.sort_values() if join_index is self: @@ -4784,7 +4782,7 @@ def _join_multi(self, other: Index, how: JoinHow): @final def _join_non_unique( - self, other: Index, how: JoinHow = "left" + self, other: Index, how: JoinHow = "left", sort: bool = False ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]: from pandas.core.reshape.merge import get_join_indexers @@ -4792,7 +4790,7 @@ def _join_non_unique( assert self.dtype == other.dtype left_idx, right_idx = get_join_indexers( - [self._values], [other._values], how=how, sort=True + [self._values], [other._values], how=how, sort=sort ) mask = left_idx == -1 diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c2cb9d643ca87..140a3024a8684 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1679,6 +1679,9 @@ def get_join_indexers( elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) + if not sort and how == "outer": + sort = True + # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) @@ -1697,7 +1700,7 @@ def get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = {} - if how in ("left", "right"): + if how in ("inner", "left", "right"): kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 5d9c03e1b2569..a9bd12917e73e 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -236,9 +236,9 @@ def test_merge_on_extension_array_duplicates(self, data): result = pd.merge(df1, df2, on="key") expected = pd.DataFrame( { - "key": key.take([0, 0, 0, 0, 1]), - "val_x": [1, 1, 3, 3, 2], - "val_y": [1, 3, 1, 3, 2], + "key": key.take([0, 0, 1, 2, 2]), + "val_x": [1, 1, 2, 3, 3], + "val_y": [1, 3, 2, 1, 3], } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py index 93ff6238b90ff..918d505216735 100644 --- a/pandas/tests/indexes/numeric/test_join.py +++ b/pandas/tests/indexes/numeric/test_join.py @@ -11,13 +11,13 @@ def test_join_non_unique(self): joined, lidx, ridx = left.join(left, return_indexers=True) - exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3]) tm.assert_index_equal(joined, exp_joined) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) + exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp) tm.assert_numpy_array_equal(lidx, exp_lidx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_inner(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 91510fae0a9b1..50a534ad36bcc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1462,13 +1462,14 @@ def test_merge_readonly(self): def _check_merge(x, y): for how in ["inner", "left", "outer"]: - result = x.join(y, how=how) + for sort in [True, False]: + result = x.join(y, how=how, sort=sort) - expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) - expected = expected.set_index("index") + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=sort) + expected = expected.set_index("index") - # TODO check_names on merge? - tm.assert_frame_equal(result, expected, check_names=False) + # TODO check_names on merge? + tm.assert_frame_equal(result, expected, check_names=False) class TestMergeDtypes: @@ -1751,7 +1752,7 @@ def test_merge_string_dtype(self, how, expected_data, any_string_dtype): "how, expected_data", [ ("inner", [[True, 1, 4], [False, 5, 3]]), - ("outer", [[True, 1, 4], [False, 5, 3]]), + ("outer", [[False, 5, 3], [True, 1, 4]]), ("left", [[True, 1, 4], [False, 5, 3]]), ("right", [[False, 5, 3], [True, 1, 4]]), ], @@ -2331,9 +2332,9 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): "outer", DataFrame( { - "A": [100, 200, 1, 300], - "B1": [60, 70, 80, np.nan], - "B2": [600, 700, np.nan, 800], + "A": [1, 100, 200, 300], + "B1": [80, 60, 70, np.nan], + "B2": [np.nan, 600, 700, 800], } ), ), @@ -2752,9 +2753,9 @@ def test_merge_outer_with_NaN(dtype): result = merge(right, left, on="key", how="outer") expected = DataFrame( { - "key": [np.nan, np.nan, 1, 2], - "col2": [3, 4, np.nan, np.nan], - "col1": [np.nan, np.nan, 1, 2], + "key": [1, 2, np.nan, np.nan], + "col2": [np.nan, np.nan, 3, 4], + "col1": [1, 2, np.nan, np.nan], }, dtype=dtype, ) @@ -2847,3 +2848,79 @@ def test_merge_multiindex_single_level(): result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("on_index", [True, False]) +@pytest.mark.parametrize("left_unique", [True, False]) +@pytest.mark.parametrize("left_monotonic", [True, False]) +@pytest.mark.parametrize("right_unique", [True, False]) +@pytest.mark.parametrize("right_monotonic", [True, False]) +def test_merge_combinations( + how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic +): + # GH 54611 + left = [2, 3] + if left_unique: + left.append(4 if left_monotonic else 1) + else: + left.append(3 if left_monotonic else 2) + + right = [2, 3] + if right_unique: + right.append(4 if right_monotonic else 1) + else: + right.append(3 if right_monotonic else 2) + + left = DataFrame({"key": left}) + right = DataFrame({"key": right}) + + if on_index: + left = left.set_index("key") + right = right.set_index("key") + on_kwargs = {"left_index": True, "right_index": True} + else: + on_kwargs = {"on": "key"} + + result = merge(left, right, how=how, sort=sort, **on_kwargs) + + if on_index: + left = left.reset_index() + right = right.reset_index() + + if how in ["left", "right", "inner"]: + if how in ["left", "inner"]: + expected, other, other_unique = left, right, right_unique + else: + expected, other, other_unique = right, left, left_unique + if how == "inner": + keep_values = set(left["key"].values).intersection(right["key"].values) + keep_mask = expected["key"].isin(keep_values) + expected = expected[keep_mask] + if sort: + expected = expected.sort_values("key") + if not other_unique: + other_value_counts = other["key"].value_counts() + repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) + repeats = repeats.astype(np.intp) + expected = expected["key"].repeat(repeats.values) + expected = expected.to_frame() + elif how == "outer": + if on_index and left_unique and left["key"].equals(right["key"]): + expected = DataFrame({"key": left["key"]}) + else: + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") + + if on_index: + expected = expected.set_index("key") + else: + expected = expected.reset_index(drop=True) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 088d1e7e3c85e..ab010bdb909f1 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -741,10 +741,8 @@ def test_join_multi_levels2(self): expected = ( DataFrame( { - "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + "household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4], "asset_id": [ - "nl0000301109", - "nl0000301109", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", @@ -754,11 +752,11 @@ def test_join_multi_levels2(self): "lu0197800237", "lu0197800237", "nl0000289965", + "nl0000301109", + "nl0000301109", None, ], "t": [ - None, - None, 233, 234, 235, @@ -769,10 +767,10 @@ def test_join_multi_levels2(self): 181, None, None, + None, + None, ], "share": [ - 1.0, - 0.4, 0.6, 0.6, 0.6, @@ -783,10 +781,10 @@ def test_join_multi_levels2(self): 0.6, 0.25, 1.0, + 0.4, + 1.0, ], "log_return": [ - None, - None, 0.09604978, -0.06524096, 0.03532373, @@ -797,6 +795,8 @@ def test_join_multi_levels2(self): 0.036997, None, None, + None, + None, ], } ) diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index a6303610b2037..3e620b7664335 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -106,7 +106,7 @@ def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep): # Series/Index with Series having different Index t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "aa", "bb", "bb"]) + expected = Index(["aa", "aa", "bb", "bb", "aa"]) expected = expected if box == Index else Series(expected, index=expected.str[:1]) result = s.str.cat(t, sep=sep)