From c3565c7863e7ce57bed51439b4bdf052a86682d1 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Fri, 2 Aug 2019 14:12:31 -0600 Subject: [PATCH 01/23] add failing test to check row order preservation --- pandas/tests/reshape/merge/test_merge.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e191bf67c51ca..1bbaed3554b9b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2153,3 +2153,29 @@ def test_merge_multiindex_columns(): expected["id"] = "" tm.assert_frame_equal(result, expected) + + +def test_right_merge_preserves_row_order(): + population = [ + ("Jenn", "Jamaica", 3), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + columns = ["name", "country", "population"] + pop = pd.DataFrame.from_records(population, columns=columns) + + people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] + columns = ["name", "country"] + ppl = pd.DataFrame.from_records(people, columns=columns) + + expected_data = [ + ("Abe", "America", np.nan), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + expected_cols = ["name", "country", "population"] + expected = DataFrame.from_records(expected_data, columns=expected_cols) + + result = pop.merge(ppl, on=("name", "country"), how="right") + + assert_frame_equal(result, expected) From 0e481a74d7ef7bce7bb9a9b61bab52bf7dc759ee Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Fri, 2 Aug 2019 14:16:15 -0600 Subject: [PATCH 02/23] correct the imports --- pandas/tests/reshape/merge/test_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 1bbaed3554b9b..21973e5a161dc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2162,11 +2162,11 @@ def test_right_merge_preserves_row_order(): ("Carl", "Canada", 30), ] columns = ["name", "country", "population"] - pop = pd.DataFrame.from_records(population, columns=columns) + pop = DataFrame.from_records(population, columns=columns) people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] columns = ["name", "country"] - ppl = pd.DataFrame.from_records(people, columns=columns) + ppl = DataFrame.from_records(people, columns=columns) expected_data = [ ("Abe", "America", np.nan), From 42eaad168565500de0ce676deffcc33075eb1ca5 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Fri, 2 Aug 2019 15:11:56 -0600 Subject: [PATCH 03/23] change order of args in test assertion --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 21973e5a161dc..00a527e0ea273 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2178,4 +2178,4 @@ def test_right_merge_preserves_row_order(): result = pop.merge(ppl, on=("name", "country"), how="right") - assert_frame_equal(result, expected) + assert_frame_equal(expected, result) From 3bba9416fe728c88b6711af74849b323436d7c04 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Fri, 2 Aug 2019 16:31:10 -0600 Subject: [PATCH 04/23] broken commit with a bunch of print statements and comments --- pandas/core/reshape/merge.py | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7a22a6c846240..6a0d4752176f7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -566,10 +566,17 @@ def __init__( indicator: bool = False, validate=None, ): - _left = _validate_operand(left) - _right = _validate_operand(right) - self.left = self.orig_left = _left - self.right = self.orig_right = _right + + # if how == "right": + # left, right = right, left + # left_index, right_index = right_index, left_index + # left_on, right_on = right_on, left_on + # how = "left" + + left = validate_operand(left) + right = validate_operand(right) + self.left = self.orig_left = left + self.right = self.orig_right = right self.how = how self.axis = axis @@ -1301,6 +1308,9 @@ def _get_join_indexers( right_keys ), "left_key and right_keys must be the same length" + # bind `sort` arg. of _factorize_keys + fkeys = partial(_factorize_keys, sort=sort) + print(left_keys, right_keys) # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) @@ -1310,13 +1320,16 @@ def _get_join_indexers( llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists + print(llab, rlab) lkey, rkey = _get_join_keys(llab, rlab, shape, sort) # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + print(lkey, rkey) + lkey, rkey, count = fkeys(lkey, rkey) + print(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": @@ -1853,8 +1866,22 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) - return left_indexer, right_indexer + new_x = [] + for i in y: + if i in x: + new_x.append(i) + else: + new_x.append(-1) + + return np.array(new_x), np.array([0, 1, 2]) + # right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + # print('right_index: ', y, " - ", right_indexer) + # print('left_index: ', x, " - ", left_indexer) + + # assert np.array_equal(left_indexer, np.array([1, 2, -1])) + # assert np.array_equal(right_indexer, np.array([1, 2, 0])) + # return np.array([-1, 1, 2]), np.array([0,1,2]) + # return left_indexer, right_indexer _join_functions = { From 803243c69112e7c923308a3b6cd1cc7dda53ef15 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 5 Aug 2019 11:11:09 -0600 Subject: [PATCH 05/23] add test for left merge --- pandas/tests/reshape/merge/test_merge.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 00a527e0ea273..8518e696fc70b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2179,3 +2179,29 @@ def test_right_merge_preserves_row_order(): result = pop.merge(ppl, on=("name", "country"), how="right") assert_frame_equal(expected, result) + + +def test_left_merge_preserves_row_order(): + population = [ + ("Jenn", "Jamaica", 3), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + columns = ["name", "country", "population"] + pop = DataFrame.from_records(population, columns=columns) + + people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] + columns = ["name", "country"] + ppl = DataFrame.from_records(people, columns=columns) + + expected_data = [ + ("Abe", "America", np.nan), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + expected_cols = ["name", "country", "population"] + expected = DataFrame.from_records(expected_data, columns=expected_cols) + + result = ppl.merge(pop, on=("name", "country"), how="left") + + assert_frame_equal(expected, result) From cc269f383c36417b786e67509db88d3125668024 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 5 Aug 2019 13:00:39 -0600 Subject: [PATCH 06/23] swap left and right keys when how == "right" --- pandas/core/reshape/merge.py | 41 ++++++++++++------------------------ 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6a0d4752176f7..9b6e1aa02084f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1304,13 +1304,17 @@ def _get_join_indexers( indexers into the left_keys, right_keys """ + _how = how + if how == "right": + left_keys, right_keys = right_keys, left_keys + _how = "left" + assert len(left_keys) == len( right_keys ), "left_key and right_keys must be the same length" # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) - print(left_keys, right_keys) # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) @@ -1320,23 +1324,24 @@ def _get_join_indexers( llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists - print(llab, rlab) lkey, rkey = _get_join_keys(llab, rlab, shape, sort) # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - - print(lkey, rkey) lkey, rkey, count = fkeys(lkey, rkey) - print(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how == "left": + if _how == "left": kwargs["sort"] = sort - join_func = _join_functions[how] + join_func = _join_functions[_how] + + left_indexer, right_indexer = join_func(lkey, rkey, count, **kwargs) - return join_func(lkey, rkey, count, **kwargs) + if how == "right": + left_indexer, right_indexer = right_indexer, left_indexer + + return left_indexer, right_indexer def _restore_dropped_levels_multijoin( @@ -1865,29 +1870,9 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = return left_ax, None, right_indexer -def _right_outer_join(x, y, max_groups): - new_x = [] - for i in y: - if i in x: - new_x.append(i) - else: - new_x.append(-1) - - return np.array(new_x), np.array([0, 1, 2]) - # right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) - # print('right_index: ', y, " - ", right_indexer) - # print('left_index: ', x, " - ", left_indexer) - - # assert np.array_equal(left_indexer, np.array([1, 2, -1])) - # assert np.array_equal(right_indexer, np.array([1, 2, 0])) - # return np.array([-1, 1, 2]), np.array([0,1,2]) - # return left_indexer, right_indexer - - _join_functions = { "inner": libjoin.inner_join, "left": libjoin.left_outer_join, - "right": _right_outer_join, "outer": libjoin.full_outer_join, } From f2249df8a26886e23cf5b0d19d370f11c5843af1 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 5 Aug 2019 13:03:48 -0600 Subject: [PATCH 07/23] correct old test: right-merge row order is now the same as the right df --- pandas/tests/reshape/merge/test_merge.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8518e696fc70b..24d06361b794a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1289,17 +1289,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) - df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) + df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how) expected = pd.DataFrame( [ - [1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5], + [0, 0, 0], + [1, 1, 1], + [2, 2, 2], + [np.nan, 3, 3], + [np.nan, 4, 4], + [np.nan, 5, 5], ], columns=["a", "key", "b"], ) From 5f728a4632f26c2159f9b911923cc306ed54280b Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 5 Aug 2019 13:08:50 -0600 Subject: [PATCH 08/23] clean up spacing and delete temp code --- pandas/core/reshape/merge.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9b6e1aa02084f..858a00b4bfa58 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -566,13 +566,6 @@ def __init__( indicator: bool = False, validate=None, ): - - # if how == "right": - # left, right = right, left - # left_index, right_index = right_index, left_index - # left_on, right_on = right_on, left_on - # how = "left" - left = validate_operand(left) right = validate_operand(right) self.left = self.orig_left = left @@ -1315,6 +1308,7 @@ def _get_join_indexers( # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) + # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) @@ -1330,6 +1324,7 @@ def _get_join_indexers( # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) lkey, rkey, count = fkeys(lkey, rkey) + # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if _how == "left": From 7ccebb2a0437f7620400700b05bdeb230e8054b0 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 10 Aug 2019 16:38:51 -0600 Subject: [PATCH 09/23] add whatsnew --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b052823dfb25..1b37eded68fd5 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -949,6 +949,7 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) +- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) - Sparse From 90984e4638c72067a8c2995c7ee5ce7c03d4fbd2 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 12 Aug 2019 15:04:51 -0600 Subject: [PATCH 10/23] replace .from_records with default constructor --- pandas/tests/reshape/merge/test_merge.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 24d06361b794a..dc9c7846e7f20 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2162,11 +2162,11 @@ def test_right_merge_preserves_row_order(): ("Carl", "Canada", 30), ] columns = ["name", "country", "population"] - pop = DataFrame.from_records(population, columns=columns) + pop = DataFrame(population, columns=columns) people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] columns = ["name", "country"] - ppl = DataFrame.from_records(people, columns=columns) + ppl = DataFrame(people, columns=columns) expected_data = [ ("Abe", "America", np.nan), @@ -2174,7 +2174,7 @@ def test_right_merge_preserves_row_order(): ("Carl", "Canada", 30), ] expected_cols = ["name", "country", "population"] - expected = DataFrame.from_records(expected_data, columns=expected_cols) + expected = DataFrame(expected_data, columns=expected_cols) result = pop.merge(ppl, on=("name", "country"), how="right") @@ -2188,11 +2188,11 @@ def test_left_merge_preserves_row_order(): ("Carl", "Canada", 30), ] columns = ["name", "country", "population"] - pop = DataFrame.from_records(population, columns=columns) + pop = DataFrame(population, columns=columns) people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] columns = ["name", "country"] - ppl = DataFrame.from_records(people, columns=columns) + ppl = DataFrame(people, columns=columns) expected_data = [ ("Abe", "America", np.nan), @@ -2200,7 +2200,7 @@ def test_left_merge_preserves_row_order(): ("Carl", "Canada", 30), ] expected_cols = ["name", "country", "population"] - expected = DataFrame.from_records(expected_data, columns=expected_cols) + expected = DataFrame(expected_data, columns=expected_cols) result = ppl.merge(pop, on=("name", "country"), how="left") From de44eae233e954779148542f3161437fa5692707 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 12 Aug 2019 15:10:31 -0600 Subject: [PATCH 11/23] add GH issue # to tests --- pandas/tests/reshape/merge/test_merge.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dc9c7846e7f20..4b94fdf1f6bf1 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2156,6 +2156,7 @@ def test_merge_multiindex_columns(): def test_right_merge_preserves_row_order(): + # GH 27453 population = [ ("Jenn", "Jamaica", 3), ("Beth", "Bulgaria", 7), @@ -2182,6 +2183,7 @@ def test_right_merge_preserves_row_order(): def test_left_merge_preserves_row_order(): + # GH 27453 population = [ ("Jenn", "Jamaica", 3), ("Beth", "Bulgaria", 7), From a8954fe07c6a1849de3282cc49d1c8c65986ab51 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 12 Aug 2019 21:00:36 -0600 Subject: [PATCH 12/23] revert commit ed54bec7e --- pandas/core/reshape/merge.py | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 858a00b4bfa58..3f1637f51a465 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1297,18 +1297,12 @@ def _get_join_indexers( indexers into the left_keys, right_keys """ - _how = how - if how == "right": - left_keys, right_keys = right_keys, left_keys - _how = "left" - assert len(left_keys) == len( right_keys ), "left_key and right_keys must be the same length" # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) - # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) @@ -1318,25 +1312,22 @@ def _get_join_indexers( llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists + print(llab, rlab) lkey, rkey = _get_join_keys(llab, rlab, shape, sort) # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = fkeys(lkey, rkey) + print(lkey, rkey) + lkey, rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if _how == "left": + if how == "left": kwargs["sort"] = sort - join_func = _join_functions[_how] - - left_indexer, right_indexer = join_func(lkey, rkey, count, **kwargs) - - if how == "right": - left_indexer, right_indexer = right_indexer, left_indexer + join_func = _join_functions[how] - return left_indexer, right_indexer + return join_func(lkey, rkey, count, **kwargs) def _restore_dropped_levels_multijoin( @@ -1865,9 +1856,29 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = return left_ax, None, right_indexer +def _right_outer_join(x, y, max_groups): + new_x = [] + for i in y: + if i in x: + new_x.append(i) + else: + new_x.append(-1) + + return np.array(new_x), np.array([0, 1, 2]) + # right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + # print('right_index: ', y, " - ", right_indexer) + # print('left_index: ', x, " - ", left_indexer) + + # assert np.array_equal(left_indexer, np.array([1, 2, -1])) + # assert np.array_equal(right_indexer, np.array([1, 2, 0])) + # return np.array([-1, 1, 2]), np.array([0,1,2]) + # return left_indexer, right_indexer + + _join_functions = { "inner": libjoin.inner_join, "left": libjoin.left_outer_join, + "right": _right_outer_join, "outer": libjoin.full_outer_join, } From d328470eaa838e9a92da6e58c7d3ad21b571ef56 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 12 Aug 2019 21:18:47 -0600 Subject: [PATCH 13/23] change logic to swap left and right if how==right --- pandas/core/reshape/merge.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3f1637f51a465..6b0689d2470b6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1312,15 +1312,16 @@ def _get_join_indexers( llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists - print(llab, rlab) lkey, rkey = _get_join_keys(llab, rlab, shape, sort) # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - print(lkey, rkey) - lkey, rkey, count = fkeys(lkey, rkey) + if how == "right": + rkey, lkey, count = fkeys(rkey, lkey) + else: + lkey, rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": @@ -1857,22 +1858,8 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = def _right_outer_join(x, y, max_groups): - new_x = [] - for i in y: - if i in x: - new_x.append(i) - else: - new_x.append(-1) - - return np.array(new_x), np.array([0, 1, 2]) - # right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) - # print('right_index: ', y, " - ", right_indexer) - # print('left_index: ', x, " - ", left_indexer) - - # assert np.array_equal(left_indexer, np.array([1, 2, -1])) - # assert np.array_equal(right_indexer, np.array([1, 2, 0])) - # return np.array([-1, 1, 2]), np.array([0,1,2]) - # return left_indexer, right_indexer + right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer _join_functions = { From 3c1a7cb7e4f4bca13a65e3547c499bd7f5bd3a19 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Mon, 12 Aug 2019 21:20:49 -0600 Subject: [PATCH 14/23] clean formatting --- pandas/core/reshape/merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6b0689d2470b6..745ed9dc58f4b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1303,6 +1303,7 @@ def _get_join_indexers( # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) + # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) From 7fc644174583c67d09bd52d100732240a124c028 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 21 Sep 2019 06:59:07 -0600 Subject: [PATCH 15/23] rename vars and add comment for clarity --- pandas/core/reshape/merge.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 745ed9dc58f4b..82b709ca0a158 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1319,17 +1319,19 @@ def _get_join_indexers( # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) + # flip left and right keys if performing a right merge + # to preserve right merge row order (GH 27453) if how == "right": - rkey, lkey, count = fkeys(rkey, lkey) + factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey) else: - lkey, rkey, count = fkeys(lkey, rkey) + factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": kwargs["sort"] = sort join_func = _join_functions[how] - return join_func(lkey, rkey, count, **kwargs) + return join_func(factorized_lkey, factorized_rkey, count, **kwargs) def _restore_dropped_levels_multijoin( From 1e9df2beccfd8323f12823eef80c8d2f90ad3682 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 21 Sep 2019 07:11:16 -0600 Subject: [PATCH 16/23] combine tests into one --- pandas/tests/reshape/merge/test_merge.py | 40 ++++++------------------ 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4b94fdf1f6bf1..3d790a7949728 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2155,7 +2155,8 @@ def test_merge_multiindex_columns(): tm.assert_frame_equal(result, expected) -def test_right_merge_preserves_row_order(): +@pytest.mark.parametrize("how", ["left", "right"]) +def test_merge_preserves_row_order(how): # GH 27453 population = [ ("Jenn", "Jamaica", 3), @@ -2163,11 +2164,11 @@ def test_right_merge_preserves_row_order(): ("Carl", "Canada", 30), ] columns = ["name", "country", "population"] - pop = DataFrame(population, columns=columns) + population_df = DataFrame(population, columns=columns) people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] columns = ["name", "country"] - ppl = DataFrame(people, columns=columns) + people_df = DataFrame(people, columns=columns) expected_data = [ ("Abe", "America", np.nan), @@ -2177,33 +2178,10 @@ def test_right_merge_preserves_row_order(): expected_cols = ["name", "country", "population"] expected = DataFrame(expected_data, columns=expected_cols) - result = pop.merge(ppl, on=("name", "country"), how="right") - - assert_frame_equal(expected, result) - - -def test_left_merge_preserves_row_order(): - # GH 27453 - population = [ - ("Jenn", "Jamaica", 3), - ("Beth", "Bulgaria", 7), - ("Carl", "Canada", 30), - ] - columns = ["name", "country", "population"] - pop = DataFrame(population, columns=columns) - - people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] - columns = ["name", "country"] - ppl = DataFrame(people, columns=columns) - - expected_data = [ - ("Abe", "America", np.nan), - ("Beth", "Bulgaria", 7), - ("Carl", "Canada", 30), - ] - expected_cols = ["name", "country", "population"] - expected = DataFrame(expected_data, columns=expected_cols) - - result = ppl.merge(pop, on=("name", "country"), how="left") + if how == "right": + left_df, right_df = population_df, people_df + elif how == "left": + left_df, right_df = people_df, population_df + result = left_df.merge(right_df, on=("name", "country"), how=how) assert_frame_equal(expected, result) From d017bd526f5ca775a8e55e00989c61801ac256a4 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 21 Sep 2019 07:17:15 -0600 Subject: [PATCH 17/23] update whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 1b37eded68fd5..ee16c6f646eb0 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -274,6 +274,8 @@ New repr for :class:`~pandas.arrays.IntervalArray` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :class:`pandas.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) +- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). +- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge. Older versions do not preserve row order on right merge (:issue:`27453`) *pandas 0.25.x* From 71736b80f31ee89e63b36785fc34072dd4158bed Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 28 Sep 2019 12:16:22 -0600 Subject: [PATCH 18/23] Update doc/source/whatsnew/v1.0.0.rst Co-Authored-By: William Ayd --- doc/source/whatsnew/v1.0.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ee16c6f646eb0..3bc5de2c587b1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -275,7 +275,8 @@ New repr for :class:`~pandas.arrays.IntervalArray` - :class:`pandas.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). -- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge. Older versions do not preserve row order on right merge (:issue:`27453`) +- :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) +- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) *pandas 0.25.x* From 11ec7d09eff133ddf42d02fad7946c8e4dd19b19 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 26 Oct 2019 17:29:26 -0600 Subject: [PATCH 19/23] add before and after examples --- doc/source/whatsnew/v1.0.0.rst | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3bc5de2c587b1..1638851eb51b2 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -276,7 +276,6 @@ New repr for :class:`~pandas.arrays.IntervalArray` - :class:`pandas.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) -- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) *pandas 0.25.x* @@ -295,6 +294,32 @@ New repr for :class:`~pandas.arrays.IntervalArray` pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) +- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) + +.. ipython:: python + + left_df = pd.DataFrame({"colors": ["blue", "red"]}, index = pd.Index([0, 1])) + right_df = pd.DataFrame({"hats": ["small", "big"]}, index = pd.Index([1, 0])) + +*pandas 0.25.x* + +.. ipython:: python + left_df.merge(right_df, left_index=True, right_index=True, how="right") + colors hats + 0 blue big + 1 red small + + +*pandas 1.0.0* + +.. ipython:: python + + left_df.merge(right_df, left_index=True, right_index=True, how="right") + colors hats + 1 red small + 0 blue big + + All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From c238b5040d72ed92280b2ca31039c9ee9f6ecadc Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Sat, 2 Nov 2019 12:49:21 -0600 Subject: [PATCH 20/23] linting --- doc/source/whatsnew/v1.0.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 1638851eb51b2..da2d8b97d518d 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -298,14 +298,14 @@ New repr for :class:`~pandas.arrays.IntervalArray` .. ipython:: python - left_df = pd.DataFrame({"colors": ["blue", "red"]}, index = pd.Index([0, 1])) - right_df = pd.DataFrame({"hats": ["small", "big"]}, index = pd.Index([1, 0])) + left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1])) + right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0])) *pandas 0.25.x* .. ipython:: python left_df.merge(right_df, left_index=True, right_index=True, how="right") - colors hats + colors hats 0 blue big 1 red small From e29ed1cdf04e670fb50580f245444f53d15f3809 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Tue, 31 Dec 2019 20:23:03 -0600 Subject: [PATCH 21/23] cleanup --- doc/source/whatsnew/v1.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index da2d8b97d518d..d6634419fc2a0 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -274,7 +274,6 @@ New repr for :class:`~pandas.arrays.IntervalArray` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :class:`pandas.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) -- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) *pandas 0.25.x* From 2b1b675922854c4b4fc3bebb98b922c9bfd6dd73 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Wed, 22 Jan 2020 20:03:36 -0700 Subject: [PATCH 22/23] changes requested by jreback --- doc/source/whatsnew/v1.0.0.rst | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index d6634419fc2a0..b28b3dc64a3ab 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -274,7 +274,6 @@ New repr for :class:`~pandas.arrays.IntervalArray` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :class:`pandas.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) -- :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) *pandas 0.25.x* @@ -293,20 +292,21 @@ New repr for :class:`~pandas.arrays.IntervalArray` pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) -- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) +:meth:`DataFrame.merge` preserves right frame's row order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) -.. ipython:: python +.. code-block:: python left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1])) right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0])) + left_df + right_df *pandas 0.25.x* -.. ipython:: python +.. code-block:: python left_df.merge(right_df, left_index=True, right_index=True, how="right") - colors hats - 0 blue big - 1 red small *pandas 1.0.0* @@ -976,7 +976,6 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) -- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) - Sparse From ef14ba028106426460d2596a629098818afbf315 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Wed, 22 Jan 2020 20:15:47 -0700 Subject: [PATCH 23/23] update docs --- doc/source/whatsnew/v1.0.0.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b28b3dc64a3ab..0215264d9ad01 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -311,12 +311,9 @@ New repr for :class:`~pandas.arrays.IntervalArray` *pandas 1.0.0* -.. ipython:: python - +.. code-block:: python left_df.merge(right_df, left_index=True, right_index=True, how="right") - colors hats - 1 red small - 0 blue big +