diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index de2261a79da47..3f5976903418f 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -603,6 +603,8 @@ Bug Fixes - Bug in line and kde plot cannot accept multiple colors when ``subplots=True`` (:issue:`9894`) - Bug in ``DataFrame.plot`` raises ``ValueError`` when color name is specified by multiple characters (:issue:`10387`) +- Bug in left and right ``align`` of ``Series`` with ``MultiIndex`` may be inverted (:issue:`10665`) +- Bug in left and right ``join`` with ``MultiIndex`` may be inverted (:issue:`10741`) - Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10757`) diff --git a/pandas/core/index.py b/pandas/core/index.py index ce6c60df2fd94..a2ad29221685a 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2156,6 +2156,8 @@ def _join_multi(self, other, how, return_indexers=True): if self_is_mi: self, other = other, self flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) level = other.names.index(jl) result = self._join_level(other, level, how=how, diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 77ef5fecf22c9..cf63d167eeb81 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4749,6 +4749,33 @@ def test_join_str_datetime(self): self.assertEqual(len(tst.columns), 3) + def test_join_multiindex_leftright(self): + # GH 10741 + df1 = pd.DataFrame([['a', 'x', 0.471780], ['a','y', 0.774908], + ['a', 'z', 0.563634], ['b', 'x', -0.353756], + ['b', 'y', 0.368062], ['b', 'z', -1.721840], + ['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]], + columns=['first', 'second', 'value1']).set_index(['first', 'second']) + df2 = pd.DataFrame([['a', 10], ['b', 20]], columns=['first', 'value2']).set_index(['first']) + + exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10], + [-0.353756, 20], 
[0.368062, 20], [-1.721840, 20], + [1.000000, np.nan], [2.000000, np.nan], [3.000000, np.nan]], + index=df1.index, columns=['value1', 'value2']) + + # these must be the same results (but columns are flipped) + tm.assert_frame_equal(df1.join(df2, how='left'), exp) + tm.assert_frame_equal(df2.join(df1, how='right'), exp[['value2', 'value1']]) + + exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']], + names=['first', 'second']) + exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10], + [-0.353756, 20], [0.368062, 20], [-1.721840, 20]], + index=exp_idx, columns=['value1', 'value2']) + + tm.assert_frame_equal(df1.join(df2, how='right'), exp) + tm.assert_frame_equal(df2.join(df1, how='left'), exp[['value2', 'value1']]) + def test_from_records_sequencelike(self): df = DataFrame({'A' : np.array(np.random.randn(6), dtype = np.float64), 'A1': np.array(np.random.randn(6), dtype = np.float64), @@ -9895,6 +9922,39 @@ def test_align_int_fill_bug(self): expected = df2 - df2.mean() assert_frame_equal(result, expected) + def test_align_multiindex(self): + # GH 10665 + # same test cases as test_align_multiindex in test_series.py + + midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], + names=('a', 'b', 'c')) + idx = pd.Index(range(2), name='b') + df1 = pd.DataFrame(np.arange(12), index=midx) + df2 = pd.DataFrame(np.arange(2), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = df1.align(df2, join='left') + res2l, res2r = df2.align(df1, join='right') + + expl = df1 + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_frame_equal(expr, res1r) + tm.assert_frame_equal(expr, res2l) + + res1l, res1r = df1.align(df2, join='right') + res2l, res2r = df2.align(df1, join='left') + + exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], + names=('a', 'b', 'c')) + expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], 
index=exp_idx) + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_frame_equal(expr, res1r) + tm.assert_frame_equal(expr, res2l) + def test_where(self): default_frame = DataFrame(np.random.randn(5, 3),columns=['A','B','C']) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 2699e780f0edb..948d43d8d7fd7 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -4589,6 +4589,37 @@ def test_join_self(self): joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_join_multi(self): + # GH 10665 + midx = pd.MultiIndex.from_product([np.arange(4), np.arange(4)], names=['a', 'b']) + idx = pd.Index([1, 2, 5], name='b') + + # inner + jidx, lidx, ridx = midx.join(idx, how='inner', return_indexers=True) + exp_idx = pd.MultiIndex.from_product([np.arange(4), [1, 2]], names=['a', 'b']) + exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14]) + exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1]) + self.assert_index_equal(jidx, exp_idx) + self.assert_numpy_array_equal(lidx, exp_lidx) + self.assert_numpy_array_equal(ridx, exp_ridx) + # flip + jidx, ridx, lidx = idx.join(midx, how='inner', return_indexers=True) + self.assert_index_equal(jidx, exp_idx) + self.assert_numpy_array_equal(lidx, exp_lidx) + self.assert_numpy_array_equal(ridx, exp_ridx) + + # keep MultiIndex + jidx, lidx, ridx = midx.join(idx, how='left', return_indexers=True) + exp_ridx = np.array([-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1]) + self.assert_index_equal(jidx, midx) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, exp_ridx) + # flip + jidx, ridx, lidx = idx.join(midx, how='right', return_indexers=True) + self.assert_index_equal(jidx, midx) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, exp_ridx) + def test_reindex(self): result, indexer = self.index.reindex(list(self.index[:4])) tm.assertIsInstance(result, MultiIndex) diff --git 
a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 66a38cd858846..fe76ec0f36a97 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -6288,6 +6288,38 @@ def test_align_sameindex(self): # self.assertIsNot(a.index, self.ts.index) # self.assertIsNot(b.index, self.ts.index) + def test_align_multiindex(self): + # GH 10665 + + midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], + names=('a', 'b', 'c')) + idx = pd.Index(range(2), name='b') + s1 = pd.Series(np.arange(12), index=midx) + s2 = pd.Series(np.arange(2), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = s1.align(s2, join='left') + res2l, res2r = s2.align(s1, join='right') + + expl = s1 + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + res1l, res1r = s1.align(s2, join='right') + res2l, res2r = s2.align(s1, join='left') + + exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], + names=('a', 'b', 'c')) + expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + def test_reindex(self): identity = self.series.reindex(self.series.index)