From 08b778de353c34ad53c85a46babc5e338968079d Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 00:20:49 +0200 Subject: [PATCH 1/8] Change as index string Change merge Change test_merge --- pandas/core/reshape/merge.py | 15 +++++++--- pandas/tests/reshape/merge/test_merge.py | 30 ++++++++++++++++--- .../merge/test_merge_index_as_string.py | 5 +++- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..c3e8dfeeabb52 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -938,16 +938,23 @@ def _create_join_index( ------- join_index """ - if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): + if self.how in (how, "outer") and not isinstance(index, MultiIndex): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take # to take the final value in target index. So, we set the last # element to be the desired fill value. We do not use allow_fill # and fill_value because it throws a ValueError on integer indices - mask = indexer == -1 + mask = other_indexer == -1 if np.any(mask): - fill_value = na_value_for_dtype(index.dtype, compat=False) - index = index.append(Index([fill_value])) + fill_value = na_value_for_dtype(other_index.dtype, compat=False) + if isinstance(other_index, MultiIndex): + fill_index = MultiIndex.from_tuples( + [[fill_value] * other_index.nlevels] + ) + else: + fill_index = Index([fill_value]) + other_index = other_index.append(fill_index) + return other_index.take(other_indexer) return index.take(indexer) def _get_merge_keys(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4408aa0bbce4a..13bb2dedab96f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -16,6 +16,7 @@ DataFrame, DatetimeIndex, Float64Index, + Index, Int64Index, IntervalIndex, MultiIndex, @@ -361,7 +362,9 @@ def test_handle_join_key_pass_array(self): key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) merged = merge(left, right, left_index=True, right_on=key, how="outer") - tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) + tm.assert_series_equal( + merged["key_0"], Series(key, name="key_0", index=[0, 1, 1, 2, 2, np.nan]) + ) def test_no_overlap_more_informative_error(self): dt = datetime.now() @@ -472,7 +475,10 @@ def check1(exp, kwarg): def check2(exp, kwarg): result = pd.merge(left, right, how="right", **kwarg) tm.assert_frame_equal(result, exp) + + def check3(exp, kwarg, index): result = pd.merge(left, right, how="outer", **kwarg) + exp.index = index tm.assert_frame_equal(result, exp) for kwarg in [ @@ -482,6 +488,13 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) + check3(exp_out, dict(left_index=True, right_index=True), exp_out.index) + check3( + exp_out.copy(), + dict(left_index=True, right_on="x"), + Index([np.nan, np.nan, np.nan]), + ) + kwarg = dict(left_on="a", right_index=True) check1(exp_in, kwarg) exp_out["a"] = [0, 1, 2] @@ -1300,7 +1313,7 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): ], columns=["a", "key", "b"], ) - expected.set_index(expected_index, inplace=True) + expected.set_index(df2.index, inplace=True) tm.assert_frame_equal(result, expected) def test_merge_right_index_right(self): @@ -1313,7 +1326,7 @@ def test_merge_right_index_right(self): expected = pd.DataFrame( {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], - index=[0, 1, 2, np.nan], + index=[0, 1, 1, 2], ) result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) @@ -1350,7 +1363,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): "key": pd.Categorical(["a", "a", "b", "c"]), "b": [1, 1, 2, 3], }, - index=[0, 1, 2, np.nan], + index=pd.Categorical(["a", "a", "b", "c"]), ) expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) @@ -2227,3 +2240,12 @@ def test_categorical_non_unique_monotonic(n_categories): index=left_index, ) tm.assert_frame_equal(expected, result) + + +def test_right_index_true_right_join_target_index(): + df_left = pd.DataFrame(index=["a", "b"]) + df_right = pd.DataFrame({"x": ["a", "c"]}) + + result = pd.merge(df_left, df_right, left_index=True, right_on="x", how="left") + expected = pd.DataFrame({"x": ["a", "b"]}, index=Index(["a", "b"])) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 08614d04caf4b..17643c716954c 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -185,5 +185,8 @@ def test_join_indexes_and_columns_on(df1, df2, left_index, join_type): result = left_df.join( right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" ) - + if join_type == "right" and left_index == "inner": + result.index = result.index.droplevel("outer") + if join_type == "outer" and left_index == "inner": + result.index = result.index.droplevel(0) tm.assert_frame_equal(result, expected, check_like=True) From 9214611bd9779f76326dad9d3e1c98a8caa84919 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 12:48:52 +0200 Subject: [PATCH 2/8] Add new test and rename old one --- pandas/core/reshape/merge.py | 1 + pandas/tests/reshape/merge/test_merge.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c3e8dfeeabb52..e06641de218b4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -932,6 +932,7 @@ def _create_join_index( index: Index being rearranged other_index: Index used to supply values not found in index indexer: how to rearrange index + other_indexer: how to rearrange the index in case of self.how from how or outer. how: replacement is only necessary if indexer based on other_index Returns diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 13bb2dedab96f..64e683c4628d3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2242,10 +2242,14 @@ def test_categorical_non_unique_monotonic(n_categories): tm.assert_frame_equal(expected, result) -def test_right_index_true_right_join_target_index(): - df_left = pd.DataFrame(index=["a", "b"]) - df_right = pd.DataFrame({"x": ["a", "c"]}) +@pytest.mark.parametrize( + ("index", "how", "values"), + [(Index(["a", "b"]), "left", ["a", "b"]), (Index([0, 1]), "right", ["a", "c"])], +) +def test_left_index_true_left_join_target_index(index, how, values): + left = pd.DataFrame(index=["a", "b"]) + right = pd.DataFrame({"x": ["a", "c"]}) - result = pd.merge(df_left, df_right, left_index=True, right_on="x", how="left") - expected = pd.DataFrame({"x": ["a", "b"]}, index=Index(["a", "b"])) + result = pd.merge(left, right, left_index=True, right_on="x", how=how) + expected = pd.DataFrame({"x": values}, index=index) tm.assert_frame_equal(result, expected) From ccd32d45ae0c763137fb786f18379320ece7d08e Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 12:50:26 +0200 Subject: [PATCH 3/8] Rename test --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 64e683c4628d3..331b50af6d858 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2246,7 +2246,7 @@ def test_categorical_non_unique_monotonic(n_categories): ("index", "how", "values"), [(Index(["a", "b"]), "left", ["a", "b"]), (Index([0, 1]), "right", ["a", "c"])], ) -def test_left_index_true_left_join_target_index(index, how, values): +def test_left_index_true_left_and_righ_join_target_index(index, how, values): left = pd.DataFrame(index=["a", "b"]) right = pd.DataFrame({"x": ["a", "c"]}) From 967bd749971421698c3114a3643fe14197e79235 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 7 Sep 2020 22:20:08 +0200 Subject: [PATCH 4/8] Try different approach --- pandas/core/reshape/merge.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e06641de218b4..ef54f25866208 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -888,11 +888,10 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: join_index = self._create_join_index( - self.left.index, self.right.index, - left_indexer, + self.left.index, right_indexer, - how="right", + how="left", ) else: join_index = self.right.index.take(right_indexer) @@ -900,11 +899,10 @@ def _get_join_info(self): elif self.left_index: if len(self.right) > 0: join_index = self._create_join_index( - self.right.index, self.left.index, - right_indexer, + self.right.index, left_indexer, - how="left", + how="right", ) else: join_index = self.left.index.take(left_indexer) @@ -921,7 +919,6 @@ def _create_join_index( index: Index, other_index: Index, indexer, - other_indexer, how: str = "left", ): """ @@ -932,30 +929,22 @@ def _create_join_index( index: Index being rearranged other_index: Index used to supply values not found in index indexer: how to rearrange index - other_indexer: how to rearrange the index in case of self.how from how or outer. how: replacement is only necessary if indexer based on other_index Returns ------- join_index """ - if self.how in (how, "outer") and not isinstance(index, MultiIndex): + if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take # to take the final value in target index. So, we set the last # element to be the desired fill value. We do not use allow_fill # and fill_value because it throws a ValueError on integer indices - mask = other_indexer == -1 + mask = indexer == -1 if np.any(mask): - fill_value = na_value_for_dtype(other_index.dtype, compat=False) - if isinstance(other_index, MultiIndex): - fill_index = MultiIndex.from_tuples( - [[fill_value] * other_index.nlevels] - ) - else: - fill_index = Index([fill_value]) - other_index = other_index.append(fill_index) - return other_index.take(other_indexer) + fill_value = na_value_for_dtype(index.dtype, compat=False) + index = index.append(Index([fill_value])) return index.take(indexer) def _get_merge_keys(self): From e7ae28cf39a7c9c5bc91b115d021df4963fcdc44 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 26 Nov 2020 22:26:50 +0100 Subject: [PATCH 5/8] Delete pd --- pandas/tests/reshape/merge/test_merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c9e1025c16749..8b1a2c0467b9e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2369,9 +2369,9 @@ def test_merge_join_cols_error_reporting_on_and_index(func, kwargs): [(Index(["a", "b"]), "left", ["a", "b"]), (Index([0, 1]), "right", ["a", "c"])], ) def test_left_index_true_left_and_righ_join_target_index(index, how, values): - left = pd.DataFrame(index=["a", "b"]) - right = pd.DataFrame({"x": ["a", "c"]}) + left = DataFrame(index=["a", "b"]) + right = DataFrame({"x": ["a", "c"]}) - result = pd.merge(left, right, left_index=True, right_on="x", how=how) - expected = pd.DataFrame({"x": values}, index=index) + result = merge(left, right, left_index=True, right_on="x", how=how) + expected = DataFrame({"x": values}, index=index) tm.assert_frame_equal(result, expected) From fc59f9ca4c28a3c7d3679339e69ffc7b9a3ef007 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 26 Nov 2020 22:34:29 +0100 Subject: [PATCH 6/8] Delete pd in different test --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8b1a2c0467b9e..3376f2d92f6fc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -438,7 +438,7 @@ def test_merge_left_empty_right_empty(self, join_type, kwarg): exp_in = DataFrame( columns=["a", "b", "c", "x", "y", "z"], - index=pd.Index([], dtype=object), + index=Index([], dtype=object), dtype=object, ) From 896d04e960e1c037c449ca524536d2f4adbcf67f Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 26 Nov 2020 22:49:26 +0100 Subject: [PATCH 7/8] Categorical fix --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 3376f2d92f6fc..7cb6341a8a770 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1370,7 +1370,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): "key": Categorical(["a", "a", "b", "c"]), "b": [1, 1, 2, 3], }, - index=pd.Categorical(["a", "a", "b", "c"]), + index=Categorical(["a", "a", "b", "c"]), ) expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) From 612eee745822e3cf96ccd2e3c3d4bd724910e4d0 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 Nov 2020 00:10:21 +0100 Subject: [PATCH 8/8] Fix doctest --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c4ac8af735a2d..17beb17219c2a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -438,8 +438,8 @@ def merge_asof( >>> pd.merge_asof(left, right, left_index=True, right_index=True) left_val right_val 1 a 1 - 5 b 3 - 10 c 7 + 3 b 3 + 7 c 7 Here is a real-world times-series example