From 01355d59fc22860de0d7f04b86318944643ba0f6 Mon Sep 17 00:00:00 2001 From: Alexander Haupt <1485187+elrubio@users.noreply.github.com> Date: Fri, 9 Feb 2018 18:39:20 +0100 Subject: [PATCH 1/4] BUG: Fix 'left' join turned into 'outer' join when joining with a sequence of dataframes (#19607) --- pandas/core/frame.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d8dcb8a1ca89..a6417f821a4e6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5328,18 +5328,17 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', raise ValueError('Joining multiple DataFrames only supported' ' for joining on index') - # join indexes only using concat - if how == 'left': - how = 'outer' - join_axes = [self.index] - else: - join_axes = None - frames = [self] + list(other) can_concat = all(df.index.is_unique for df in frames) + # join indexes only using concat if can_concat: + if how == 'left': + how = 'outer' + join_axes = [self.index] + else: + join_axes = None return concat(frames, axis=1, join=how, join_axes=join_axes, verify_integrity=True) From d1a85e4b19c61a58179a17757090aaf807d41587 Mon Sep 17 00:00:00 2001 From: Alexander Haupt <1485187+elrubio@users.noreply.github.com> Date: Fri, 9 Feb 2018 18:43:22 +0100 Subject: [PATCH 2/4] TST: Check for correct index after left-joining a sequence of dataframes (#19607) --- pandas/tests/frame/test_join.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index afecba2026dd7..5a3074716494a 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -31,6 +31,11 @@ def right(): return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) +@pytest.fixture +def right_non_unique(): + return DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) + + @pytest.mark.parametrize( "how, sort, expected", [('inner', False, DataFrame({'a': 
[20, 10], @@ -165,3 +170,11 @@ def test_join_period_index(frame_with_period_index): index=frame_with_period_index.index) tm.assert_frame_equal(joined, expected) + + +def test_join_left_sequence_non_unique_index(left, right, right_non_unique): + # left join sequence of dataframes with non-unique indices (issue #19607) + joined = left.join([right_non_unique], how='left') + tm.assert_index_equal( + joined.index.unique().sort_values(), + left.index.sort_values()) From 6eb27ab4725922d15ef65948ebf0c245f6706d03 Mon Sep 17 00:00:00 2001 From: Alexander Haupt <1485187+elrubio@users.noreply.github.com> Date: Sat, 10 Feb 2018 00:42:56 +0100 Subject: [PATCH 3/4] Test fixture removed, using assert_frame_equal --- pandas/tests/frame/test_join.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 5a3074716494a..ccdba6df2521a 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -31,11 +31,6 @@ def right(): return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) -@pytest.fixture -def right_non_unique(): - return DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) - - @pytest.mark.parametrize( "how, sort, expected", [('inner', False, DataFrame({'a': [20, 10], @@ -172,9 +167,18 @@ def test_join_period_index(frame_with_period_index): tm.assert_frame_equal(joined, expected) -def test_join_left_sequence_non_unique_index(left, right, right_non_unique): - # left join sequence of dataframes with non-unique indices (issue #19607) - joined = left.join([right_non_unique], how='left') - tm.assert_index_equal( - joined.index.unique().sort_values(), - left.index.sort_values()) +def test_join_left_sequence_non_unique_index(): + # https://github.com/pandas-dev/pandas/issues/19607 + df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) 
+ + joined = df1.join([df2, df3], how='left') + + expected = DataFrame({ + 'a': [0, 10, 10, 20], + 'b': [np.nan, 300, 300, 200], + 'c': [np.nan, 400, 500, np.nan] + }, index=[1, 2, 2, 3]) + + tm.assert_frame_equal(joined, expected) From 0a068a09e45f03ab831a125c7e96cfb2e68ee28e Mon Sep 17 00:00:00 2001 From: Alexander Haupt <1485187+elrubio@users.noreply.github.com> Date: Sat, 10 Feb 2018 01:21:33 +0100 Subject: [PATCH 4/4] Mention bug in release notes --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 083242cd69b74..3b626ddced5bf 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -772,7 +772,7 @@ Reshaping - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) -- +- Bug in :func:`DataFrame.join` which performed an *outer* join instead of a *left* join when called with multiple DataFrames, some of which have non-unique indices (:issue:`19607`, :issue:`19624`) Other ^^^^^