From c12bb3f634ee3451b90571cf90c3aa7b6a29fca9 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sun, 5 Mar 2017 21:49:04 +0100 Subject: [PATCH 01/23] BUG: Fix index order for Index.intersection() --- pandas/indexes/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 54f73a2466286..556227f3c34f2 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2128,15 +2128,15 @@ def intersection(self, other): pass try: - indexer = Index(self._values).get_indexer(other._values) + indexer = Index(other._values).get_indexer(self._values) indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = Index(self._values).get_indexer_non_unique( - other._values)[0].unique() + indexer = Index(other._values).get_indexer_non_unique( + self._values)[0].unique() indexer = indexer[indexer != -1] - taken = self.take(indexer) + taken = other.take(indexer) if self.name != other.name: taken.name = None return taken From c2a8dc3009b0ac5a81ae08365ddc02b5de86097f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Mon, 6 Mar 2017 23:35:40 +0100 Subject: [PATCH 02/23] Implement tests --- pandas/tests/frame/test_misc_api.py | 21 ++++++++------------- pandas/tests/indexes/test_base.py | 16 +++++++++------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 321d46739b24c..8179242c7d62f 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -61,37 +61,32 @@ def test_join_index(self): # left / right f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D']) + f2 = self.frame.reindex(columns=['C', 'D'])[5:][::-1] joined = f.join(f2) self.assert_index_equal(f.index, joined.index) - self.assertEqual(len(joined.columns), 4) + expected_columns = pd.Index(['A', 'B', 'C', 'D']) + self.assert_index_equal(joined.columns, expected_columns) joined = f.join(f2, how='left') self.assert_index_equal(joined.index, f.index) - self.assertEqual(len(joined.columns), 4) + self.assert_index_equal(joined.columns, expected_columns) joined = f.join(f2, how='right') self.assert_index_equal(joined.index, f2.index) - self.assertEqual(len(joined.columns), 4) + self.assert_index_equal(joined.columns, expected_columns) # inner - f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D']) - joined = f.join(f2, how='inner') self.assert_index_equal(joined.index, f.index.intersection(f2.index)) - self.assertEqual(len(joined.columns), 4) + self.assert_index_equal(joined.columns, expected_columns) # outer - f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D']) - joined = f.join(f2, how='outer') - self.assertTrue(tm.equalContents(self.frame.index, joined.index)) - self.assertEqual(len(joined.columns), 4) + self..assert_index_equal(joined.index, self.frame.index.sort_values()) + self.assert_index_equal(joined.columns, expected_columns) assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c4dc10d8174cc..ce01335ea01e7 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -626,14 +626,16 @@ def test_intersection(self): # non monotonic idx1 = Index([5, 3, 2, 4, 1], name='idx') idx2 = Index([4, 7, 6, 5, 3], name='idx') - result2 = idx1.intersection(idx2) - self.assertTrue(tm.equalContents(result2, expected2)) - self.assertEqual(result2.name, expected2.name) + expected = Index([5, 3, 4], name='idx') + result = idx1.intersection(idx2) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) - idx3 = Index([4, 7, 6, 5, 3], name='other') - result3 = idx1.intersection(idx3) - self.assertTrue(tm.equalContents(result3, expected3)) - self.assertEqual(result3.name, expected3.name) + idx2 = Index([4, 7, 6, 5, 3], name='other') + expected = Index([5, 3, 4], name=None) + result = idx1.intersection(idx2) + self.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) # non-monotonic non-unique idx1 = Index(['A', 'B', 'A', 'C']) From a4ead99367c7004834a513fddee4bc68b3cac9d7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Tue, 7 Mar 2017 00:28:12 +0100 Subject: [PATCH 03/23] Fix typo --- pandas/tests/frame/test_misc_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 8179242c7d62f..6b72d31421afd 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -85,7 +85,7 @@ def test_join_index(self): # outer joined = f.join(f2, how='outer') - self..assert_index_equal(joined.index, self.frame.index.sort_values()) + self.assert_index_equal(joined.index, self.frame.index.sort_values()) self.assert_index_equal(joined.columns, expected_columns) assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') From e7bcd2854050a8ef9aadaaa737d4bd8c1081f756 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 8 Mar 2017 02:08:49 +0100 Subject: [PATCH 04/23] Fix typo in documentation --- pandas/tseries/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 11d2d29597fc0..05f415b2178cb 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1528,7 +1528,7 @@ def _get_freq(self): def _set_freq(self, value): self.offset = value freq = property(fget=_get_freq, fset=_set_freq, - doc="get/set the frequncy of the Index") + doc="get/set the frequency of the Index") year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', From d9e29f83fc93085fd3d6bafb481dcc1c11e9b691 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 8 Mar 2017 02:15:15 +0100 Subject: [PATCH 05/23] Create new DatetimeIndex from the Index.intersection result --- pandas/tseries/index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 05f415b2178cb..4826b81b31cc0 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1203,9 +1203,9 @@ def intersection(self, other): not other.offset.isAnchored() or (not self.is_monotonic or not other.is_monotonic)): result = Index.intersection(self, other) - if isinstance(result, DatetimeIndex): - if result.freq is None: - result.offset = to_offset(result.inferred_freq) + result = self._simple_new(result._values, name=result.name, tz=result.tz) + if result.freq is None: + result.offset = to_offset(result.inferred_freq) return result if len(self) == 0: From 1197b99c13f1c16f4d6bd2862f4bff05f7d18fda Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sun, 12 Mar 2017 13:31:58 +0100 Subject: [PATCH 06/23] Fix DataFrame column order when read from HDF file Issue #15582 --- pandas/io/pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b3b253f151541..f75a4761e0948 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4321,7 +4321,7 @@ def _reindex_axis(obj, axis, labels, other=None): labels = _ensure_index(labels.unique()) if other is not None: - labels = labels & _ensure_index(other.unique()) + labels = _ensure_index(other.unique()) & labels if not labels.equals(ax): slicer = [slice(None, None)] * obj.ndim slicer[axis] = labels From 784fe75997c88a2298f2cea634e6f22a9fcc1af6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sun, 12 Mar 2017 23:49:42 +0100 Subject: [PATCH 07/23] Fix error: line too long --- pandas/tseries/index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 4826b81b31cc0..46ce88ef30893 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1203,7 +1203,8 @@ def intersection(self, other): not other.offset.isAnchored() or (not self.is_monotonic or not other.is_monotonic)): result = Index.intersection(self, other) - result = self._simple_new(result._values, name=result.name, tz=result.tz) + result = self._simple_new(result._values, name=result.name, + tz=result.tz) if result.freq is None: result.offset = to_offset(result.inferred_freq) return result From b977278bfda63596d95130e1b94de21a9b0affa2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Tue, 14 Mar 2017 23:51:41 +0100 Subject: [PATCH 08/23] Address requested changes --- pandas/core/frame.py | 5 +- pandas/indexes/base.py | 4 +- pandas/tests/frame/test_join.py | 113 ++++++++++++++++++++++++++++ pandas/tests/frame/test_misc_api.py | 81 -------------------- pandas/tests/indexes/test_base.py | 7 +- pandas/tseries/index.py | 4 +- 6 files changed, 124 insertions(+), 90 deletions(-) create mode 100644 pandas/tests/frame/test_join.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6b5e8e0799421..8626c67118d12 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4463,7 +4463,7 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', * left: use calling frame's index (or column if on is specified) * right: use other frame's index * outer: form union of calling frame's index (or column if on is - specified) with other frame's index + specified) with other frame's index, and sort it * inner: form intersection of calling frame's index (or column if on is specified) with other frame's index lsuffix : string @@ -4471,8 +4471,7 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', rsuffix : string Suffix to use from right frame's overlapping columns sort : boolean, default False - Order result DataFrame lexicographically by the join key. If False, - preserves the index order of the calling (left) DataFrame + Order result DataFrame lexicographically by the join key Notes ----- diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 556227f3c34f2..02f720e70839b 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2089,8 +2089,8 @@ def intersection(self, other): """ Form the intersection of two Index objects. - This returns a new Index with elements common to the index and `other`. - Sortedness of the result is not guaranteed. + This returns a new Index with elements common to the index and `other`, + preserving the calling index order. Parameters ---------- diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py new file mode 100644 index 0000000000000..d0cfd01973afb --- /dev/null +++ b/pandas/tests/frame/test_join.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + +from datetime import datetime + +import numpy as np +from numpy import nan + +import pandas as pd + +from pandas import DataFrame, Index, Series, Timestamp, date_range +from pandas.compat import lrange + +from pandas.tests.frame.common import TestData + +import pandas.util.testing as tm +from pandas.util.testing import (assertRaisesRegexp, + assert_frame_equal, + assert_series_equal) + + +class TestDataFrameJoin(tm.TestCase, TestData): + + def test_join(self): + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + result = df1.join(df2, how='inner') + expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, index=[2, 1]) + self.assert_frame_equal(result, expected) + + def test_join_index(self): + # left / right + + f = self.frame.loc[self.frame.index[:10], ['A', 'B']] + f2 = self.frame.loc[self.frame.index[5:], ['C', 'D']].iloc[::-1] + + joined = f.join(f2) + self.assert_index_equal(f.index, joined.index) + expected_columns = pd.Index(['A', 'B', 'C', 'D']) + self.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how='left') + self.assert_index_equal(joined.index, f.index) + self.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how='right') + self.assert_index_equal(joined.index, f2.index) + self.assert_index_equal(joined.columns, expected_columns) + + # inner + + joined = f.join(f2, how='inner') + self.assert_index_equal(joined.index, f.index[5:10]) + self.assert_index_equal(joined.columns, expected_columns) + + # outer + + joined = f.join(f2, how='outer') + self.assert_index_equal(joined.index, self.frame.index.sort_values()) + self.assert_index_equal(joined.columns, expected_columns) + + assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') + + # corner case - overlapping columns + for how in ('outer', 'left', 'inner'): + with assertRaisesRegexp(ValueError, 'columns overlap but ' + 'no suffix'): + self.frame.join(self.frame, how=how) + + def test_join_index_more(self): + af = self.frame.loc[:, ['A', 'B']] + bf = self.frame.loc[::2, ['C', 'D']] + + expected = af.copy() + expected['C'] = self.frame['C'][::2] + expected['D'] = self.frame['D'][::2] + + result = af.join(bf) + assert_frame_equal(result, expected) + + result = af.join(bf, how='right') + assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how='right') + assert_frame_equal(result, expected.loc[:, result.columns]) + + def test_join_index_series(self): + df = self.frame.copy() + s = df.pop(self.frame.columns[-1]) + joined = df.join(s) + + # TODO should this check_names ? + assert_frame_equal(joined, self.frame, check_names=False) + + s.name = None + assertRaisesRegexp(ValueError, 'must have a name', df.join, s) + + def test_join_overlap(self): + df1 = self.frame.loc[:, ['A', 'B', 'C']] + df2 = self.frame.loc[:, ['B', 'C', 'D']] + + joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') + df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') + df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') + + no_overlap = self.frame.loc[:, ['A', 'D']] + expected = df1_suf.join(df2_suf).join(no_overlap) + + # column order not necessarily sorted + assert_frame_equal(joined, expected.loc[:, joined.columns]) + diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 6b72d31421afd..42427df90401d 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -57,87 +57,6 @@ def test_get_value(self): expected = self.frame[col][idx] tm.assert_almost_equal(result, expected) - def test_join_index(self): - # left / right - - f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D'])[5:][::-1] - - joined = f.join(f2) - self.assert_index_equal(f.index, joined.index) - expected_columns = pd.Index(['A', 'B', 'C', 'D']) - self.assert_index_equal(joined.columns, expected_columns) - - joined = f.join(f2, how='left') - self.assert_index_equal(joined.index, f.index) - self.assert_index_equal(joined.columns, expected_columns) - - joined = f.join(f2, how='right') - self.assert_index_equal(joined.index, f2.index) - self.assert_index_equal(joined.columns, expected_columns) - - # inner - - joined = f.join(f2, how='inner') - self.assert_index_equal(joined.index, f.index.intersection(f2.index)) - self.assert_index_equal(joined.columns, expected_columns) - - # outer - - joined = f.join(f2, how='outer') - self.assert_index_equal(joined.index, self.frame.index.sort_values()) - self.assert_index_equal(joined.columns, expected_columns) - - assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') - - # corner case - overlapping columns - for how in ('outer', 'left', 'inner'): - with assertRaisesRegexp(ValueError, 'columns overlap but ' - 'no suffix'): - self.frame.join(self.frame, how=how) - - def test_join_index_more(self): - af = self.frame.loc[:, ['A', 'B']] - bf = self.frame.loc[::2, ['C', 'D']] - - expected = af.copy() - expected['C'] = self.frame['C'][::2] - expected['D'] = self.frame['D'][::2] - - result = af.join(bf) - assert_frame_equal(result, expected) - - result = af.join(bf, how='right') - assert_frame_equal(result, expected[::2]) - - result = bf.join(af, how='right') - assert_frame_equal(result, expected.loc[:, result.columns]) - - def test_join_index_series(self): - df = self.frame.copy() - s = df.pop(self.frame.columns[-1]) - joined = df.join(s) - - # TODO should this check_names ? - assert_frame_equal(joined, self.frame, check_names=False) - - s.name = None - assertRaisesRegexp(ValueError, 'must have a name', df.join, s) - - def test_join_overlap(self): - df1 = self.frame.loc[:, ['A', 'B', 'C']] - df2 = self.frame.loc[:, ['B', 'C', 'D']] - - joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') - df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') - df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') - - no_overlap = self.frame.loc[:, ['A', 'D']] - expected = df1_suf.join(df2_suf).join(no_overlap) - - # column order not necessarily sorted - assert_frame_equal(joined, expected.loc[:, joined.columns]) - def test_add_prefix_suffix(self): with_prefix = self.frame.add_prefix('foo#') expected = pd.Index(['foo#%s' % c for c in self.frame.columns]) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ce01335ea01e7..a8197b070b032 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -629,13 +629,11 @@ def test_intersection(self): expected = Index([5, 3, 4], name='idx') result = idx1.intersection(idx2) self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) idx2 = Index([4, 7, 6, 5, 3], name='other') expected = Index([5, 3, 4], name=None) result = idx1.intersection(idx2) self.assert_index_equal(result, expected) - self.assertEqual(result.name, expected.name) # non-monotonic non-unique idx1 = Index(['A', 'B', 'A', 'C']) @@ -644,6 +642,11 @@ def test_intersection(self): result = idx1.intersection(idx2) self.assert_index_equal(result, expected) + idx2 = Index(['B', 'D', 'A']) + expected = Index(['A', 'B', 'A'], dtype='object') + result = idx1.intersection(idx2) + self.assert_index_equal(result, expected) + # preserve names first = self.strIndex[5:20] second = self.strIndex[:10] diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 46ce88ef30893..9ec01262b79ba 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1203,8 +1203,8 @@ def intersection(self, other): not other.offset.isAnchored() or (not self.is_monotonic or not other.is_monotonic)): result = Index.intersection(self, other) - result = self._simple_new(result._values, name=result.name, - tz=result.tz) + result = self._shallow_copy(result._values, name=result.name, + tz=result.tz, freq=None) if result.freq is None: result.offset = to_offset(result.inferred_freq) return result From ec836bd48013fc4fa84dbabb1766ded756e160b9 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 15 Mar 2017 07:04:21 +0100 Subject: [PATCH 09/23] Fix Travis errors --- pandas/tests/frame/test_join.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index d0cfd01973afb..59e1c0fa10004 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -2,22 +2,13 @@ from __future__ import print_function -from datetime import datetime - -import numpy as np -from numpy import nan - import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp, date_range -from pandas.compat import lrange - from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import (assertRaisesRegexp, - assert_frame_equal, - assert_series_equal) + assert_frame_equal) class TestDataFrameJoin(tm.TestCase, TestData): @@ -109,5 +100,4 @@ def test_join_overlap(self): expected = df1_suf.join(df2_suf).join(no_overlap) # column order not necessarily sorted - assert_frame_equal(joined, expected.loc[:, joined.columns]) - + assert_frame_equal(joined, expected.loc[:, joined.columns]) \ No newline at end of file From 047b513d4c0aa9eb16c652de76e3623ff980e4b6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 15 Mar 2017 22:11:25 +0100 Subject: [PATCH 10/23] Address requested changes --- pandas/core/frame.py | 6 ++- pandas/tests/frame/test_join.py | 66 +++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8626c67118d12..ca53f863f1f15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4465,13 +4465,15 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', * outer: form union of calling frame's index (or column if on is specified) with other frame's index, and sort it * inner: form intersection of calling frame's index (or column if - on is specified) with other frame's index + on is specified) with other frame's index, preserving the + order of the calling's one lsuffix : string Suffix to use from left frame's overlapping columns rsuffix : string Suffix to use from right frame's overlapping columns sort : boolean, default False - Order result DataFrame lexicographically by the join key + Order result DataFrame lexicographically by the join key. If False, + the order of the join key depends on the join type (how keyword) Notes ----- diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 59e1c0fa10004..3c35cbe52499f 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -7,19 +7,39 @@ from pandas.tests.frame.common import TestData import pandas.util.testing as tm -from pandas.util.testing import (assertRaisesRegexp, - assert_frame_equal) -class TestDataFrameJoin(tm.TestCase, TestData): +class TestDataFrameJoin(TestData): def test_join(self): df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + result = df1.join(df2) + expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, None]}, + index=[2, 1, 0]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='left') + expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, None]}, + index=[2, 1, 0]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='right') + expected = pd.DataFrame({'a': [10, 20, None], 'b': [100, 200, 300]}, + index=[1, 2, 3]) + tm.assert_frame_equal(result, expected) + result = df1.join(df2, how='inner') - expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, index=[2, 1]) - self.assert_frame_equal(result, expected) + expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, + index=[2, 1]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='outer') + expected = pd.DataFrame({'a': [0, 10, 20, None], + 'b': [None, 100, 200, 300]}, + index=[0, 1, 2, 3]) + tm.assert_frame_equal(result, expected) def test_join_index(self): # left / right @@ -28,35 +48,35 @@ def test_join_index(self): f2 = self.frame.loc[self.frame.index[5:], ['C', 'D']].iloc[::-1] joined = f.join(f2) - self.assert_index_equal(f.index, joined.index) + tm.assert_index_equal(f.index, joined.index) expected_columns = pd.Index(['A', 'B', 'C', 'D']) - self.assert_index_equal(joined.columns, expected_columns) + tm.assert_index_equal(joined.columns, expected_columns) joined = f.join(f2, how='left') - self.assert_index_equal(joined.index, f.index) - self.assert_index_equal(joined.columns, expected_columns) + tm.assert_index_equal(joined.index, f.index) + tm.assert_index_equal(joined.columns, expected_columns) joined = f.join(f2, how='right') - self.assert_index_equal(joined.index, f2.index) - self.assert_index_equal(joined.columns, expected_columns) + tm.assert_index_equal(joined.index, f2.index) + tm.assert_index_equal(joined.columns, expected_columns) # inner joined = f.join(f2, how='inner') - self.assert_index_equal(joined.index, f.index[5:10]) - self.assert_index_equal(joined.columns, expected_columns) + tm.assert_index_equal(joined.index, f.index[5:10]) + tm.assert_index_equal(joined.columns, expected_columns) # outer joined = f.join(f2, how='outer') - self.assert_index_equal(joined.index, self.frame.index.sort_values()) - self.assert_index_equal(joined.columns, expected_columns) + tm.assert_index_equal(joined.index, self.frame.index.sort_values()) + tm.assert_index_equal(joined.columns, expected_columns) - assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') + tm.assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') # corner case - overlapping columns for how in ('outer', 'left', 'inner'): - with assertRaisesRegexp(ValueError, 'columns overlap but ' + with tm.assertRaisesRegexp(ValueError, 'columns overlap but ' 'no suffix'): self.frame.join(self.frame, how=how) @@ -69,13 +89,13 @@ def test_join_index_more(self): expected['D'] = self.frame['D'][::2] result = af.join(bf) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = af.join(bf, how='right') - assert_frame_equal(result, expected[::2]) + tm.assert_frame_equal(result, expected[::2]) result = bf.join(af, how='right') - assert_frame_equal(result, expected.loc[:, result.columns]) + tm.assert_frame_equal(result, expected.loc[:, result.columns]) def test_join_index_series(self): df = self.frame.copy() @@ -83,10 +103,10 @@ def test_join_index_series(self): joined = df.join(s) # TODO should this check_names ? - assert_frame_equal(joined, self.frame, check_names=False) + tm.assert_frame_equal(joined, self.frame, check_names=False) s.name = None - assertRaisesRegexp(ValueError, 'must have a name', df.join, s) + tm.assertRaisesRegexp(ValueError, 'must have a name', df.join, s) def test_join_overlap(self): df1 = self.frame.loc[:, ['A', 'B', 'C']] @@ -100,4 +120,4 @@ def test_join_overlap(self): expected = df1_suf.join(df2_suf).join(no_overlap) # column order not necessarily sorted - assert_frame_equal(joined, expected.loc[:, joined.columns]) \ No newline at end of file + tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) From c96306d618c2f325a4a141be1bd0dce3b42ae1ec Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 15 Mar 2017 22:14:12 +0100 Subject: [PATCH 11/23] Add sort argument to Index.join Issue #15582: fix the sort=True bug in DataFrame.join --- pandas/indexes/base.py | 7 ++++++- pandas/indexes/range.py | 10 +++++++--- pandas/tests/frame/test_join.py | 35 +++++++++++++++++++++++++++++++++ pandas/tools/merge.py | 3 ++- pandas/tseries/index.py | 5 +++-- 5 files changed, 53 insertions(+), 7 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 02f720e70839b..657a823837c9e 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2831,7 +2831,8 @@ def _reindex_non_unique(self, target): new_index = self._shallow_copy_with_infer(new_labels, freq=None) return new_index, indexer, new_indexer - def join(self, other, how='left', level=None, return_indexers=False): + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): """ *this is an internal non-public method* @@ -2844,6 +2845,7 @@ def join(self, other, how='left', level=None, return_indexers=False): how : {'left', 'right', 'inner', 'outer'} level : int or level name, default None return_indexers : boolean, default False + sort : boolean, default False Returns ------- @@ -2929,6 +2931,9 @@ def join(self, other, how='left', level=None, return_indexers=False): elif how == 'outer': join_index = self.union(other) + if sort: + join_index = join_index.sort_values() + if return_indexers: if join_index is self: lindexer = None diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 103a3ac2fd5f4..47902b0fc400d 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -431,7 +431,8 @@ def union(self, other): return self._int64index.union(other) - def join(self, other, how='left', level=None, return_indexers=False): + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): """ *this is an internal non-public method* @@ -444,6 +445,7 @@ def join(self, other, how='left', level=None, return_indexers=False): how : {'left', 'right', 'inner', 'outer'} level : int or level name, default None return_indexers : boolean, default False + sort : boolean, default False Returns ------- @@ -451,9 +453,11 @@ def join(self, other, how='left', level=None, return_indexers=False): """ if how == 'outer' and self is not other: # note: could return RangeIndex in more circumstances - return self._int64index.join(other, how, level, return_indexers) + return self._int64index.join(other, how, level, return_indexers, + sort) - return super(RangeIndex, self).join(other, how, level, return_indexers) + return super(RangeIndex, self).join(other, how, level, return_indexers, + sort) def __len__(self): """ diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 3c35cbe52499f..0df5d57cc2976 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -41,6 +41,41 @@ def test_join(self): index=[0, 1, 2, 3]) tm.assert_frame_equal(result, expected) + def test_join_sort(self): + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + result = df1.join(df2, sort=True) + expected = pd.DataFrame({'a': [0, 10, 20], 'b': [None, 100, 200]}, + index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='left', sort=True) + expected = pd.DataFrame({'a': [0, 10, 20], 'b': [None, 100, 200]}, + index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='right', sort=True) + expected = pd.DataFrame({'a': [10, 20, None], 'b': [100, 200, 300]}, + index=[1, 2, 3]) + tm.assert_frame_equal(result, expected) + + result = df2.join(df1, how='right', sort=True) + expected = pd.DataFrame([[None, 0], [100, 10], [200, 20]], + columns=['b', 'a'], index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='inner', sort=True) + expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, + index=[1, 2]) + tm.assert_frame_equal(result, expected) + + result = df1.join(df2, how='outer', sort=True) + expected = pd.DataFrame({'a': [0, 10, 20, None], + 'b': [None, 100, 200, 300]}, + index=[0, 1, 2, 3]) + tm.assert_frame_equal(result, expected) + def test_join_index(self): # left / right diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 60d523a8ea539..7de2549cadfc7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -733,7 +733,8 @@ def _get_join_info(self): if self.left_index and self.right_index and self.how != 'asof': join_index, left_indexer, right_indexer = \ - left_ax.join(right_ax, how=self.how, return_indexers=True) + left_ax.join(right_ax, how=self.how, return_indexers=True, + sort=self.sort) elif self.right_index and self.how == 'left': join_index, left_indexer, right_indexer = \ _left_join_on_index(left_ax, right_ax, self.left_join_keys, diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 9ec01262b79ba..5ca45bb63bd59 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1037,7 +1037,8 @@ def union_many(self, others): this.offset = to_offset(this.inferred_freq) return this - def join(self, other, how='left', level=None, return_indexers=False): + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): """ See Index.join """ @@ -1051,7 +1052,7 @@ def join(self, other, how='left', level=None, return_indexers=False): this, other = self._maybe_utc_convert(other) return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers) + return_indexers=return_indexers, sort=sort) def _maybe_utc_convert(self, other): this = self From f0d9d03e96730e5dd4d41098241bee6f5c03a563 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 15 Mar 2017 23:29:33 +0100 Subject: [PATCH 12/23] Add whatsnew --- doc/source/whatsnew/v0.20.0.txt | 49 +++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 37a70435ed6ff..192d4f95a8cba 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -738,6 +738,55 @@ New Behavior: TypeError: Cannot compare 2014-01-01 00:00:00 of type to string column +.. _whatsnew_0200.api_breaking.index_order: + +Index order after DataFrame inner join or Index intersection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``DataFrame`` inner join and the ``Index`` intersection, now preserve the +order of the calling's Index (left) instead of the other's Index (right) +(:issue:`15582`) + +Previous Behavior: + +.. code-block:: ipython + In [2]: df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + + In [3]: df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + In [4]: df1.join(df2, how='inner') + Out[4]: + a b + 1 10 100 + 2 20 200 + + In [5]: idx1 = pd.Index([5, 3, 2, 4, 1]) + + In [6]: idx2 = pd.Index([4, 7, 6, 5, 3]) + + In [7]: idx1.intersection(idx2) + Out[7]: Int64Index([4, 5, 3], dtype='int64') + +New Behavior: + +.. code-block:: ipython + In [2]: df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + + In [3]: df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + In [4]: df1.join(df2, how='inner') + Out[4]: + a b + 2 20 200 + 1 10 100 + + In [5]: idx1 = pd.Index([5, 3, 2, 4, 1]) + + In [6]: idx2 = pd.Index([4, 7, 6, 5, 3]) + + In [7]: idx1.intersection(idx2) + Out[7]: Int64Index([5, 3, 4], dtype='int64') + .. _whatsnew_0200.api: From ef2581e7792b6357722b450e992e32a9ce534aba Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Thu, 16 Mar 2017 00:07:31 +0100 Subject: [PATCH 13/23] Fix Travis error --- pandas/tests/frame/test_join.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 0df5d57cc2976..ec6f8ceea5a0c 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -112,7 +112,7 @@ def test_join_index(self): # corner case - overlapping columns for how in ('outer', 'left', 'inner'): with tm.assertRaisesRegexp(ValueError, 'columns overlap but ' - 'no suffix'): + 'no suffix'): self.frame.join(self.frame, how=how) def test_join_index_more(self): From 3c200fe734423e0c5c58fc9c1e198d52022b200c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 18 Mar 2017 10:43:31 +0100 Subject: [PATCH 14/23] Add new tests --- pandas/tests/frame/test_merge.py | 86 ++++++++++++++++++++++++++++++++ pandas/tests/tools/test_merge.py | 78 +++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 pandas/tests/frame/test_merge.py diff --git a/pandas/tests/frame/test_merge.py b/pandas/tests/frame/test_merge.py new file mode 100644 index 0000000000000..f29a371d5a970 --- /dev/null +++ b/pandas/tests/frame/test_merge.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestDataFrameMerge(object): + + def test_merge_on_indexes(self): + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + # default how='inner' + result = df1.merge(df2, left_index=True, right_index=True) + expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, + index=[2, 1]) + tm.assert_frame_equal(result, expected) + + # how='left' + result = df1.merge(df2, left_index=True, right_index=True, how='left') + expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, + index=[2, 1, 0]) + tm.assert_frame_equal(result, expected) + + # how='right' + result = df1.merge(df2, left_index=True, right_index=True, how='right') + expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, + index=[1, 2, 3]) + tm.assert_frame_equal(result, expected) + + # how='inner' + result = df1.merge(df2, left_index=True, right_index=True, how='inner') + expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, + index=[2, 1]) + tm.assert_frame_equal(result, expected) + + # how='outer' + result = df1.merge(df2, left_index=True, right_index=True, how='outer') + expected = pd.DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]) + tm.assert_frame_equal(result, expected) + + def test_merge_on_indexes_sort(self): + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + # default how='inner' + result = df1.merge(df2, left_index=True, right_index=True, sort=True) + expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, + index=[1, 2]) + tm.assert_frame_equal(result, expected) + + # how='left' + result = df1.merge(df2, left_index=True, right_index=True, how='left', sort=True) + expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, + index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + # how='right' (already sorted) + result = df1.merge(df2, left_index=True, right_index=True, how='right', sort=True) + expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, + index=[1, 2, 3]) + tm.assert_frame_equal(result, expected) + + # how='right' + result = df2.merge(df1, left_index=True, right_index=True, how='right', sort=True) + expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], + columns=['b', 'a'], index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + # how='inner' + result = df1.merge(df2, left_index=True, right_index=True, how='inner', sort=True) + expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, + index=[1, 2]) + tm.assert_frame_equal(result, expected) + + # how='outer' + result = df1.merge(df2, left_index=True, right_index=True, how='outer', sort=True) + expected = pd.DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]) + tm.assert_frame_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index ff27500355998..51c96d19d3677 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1355,3 +1355,81 @@ def test_dtype_on_merged_different(self, change, how, left, right): np.dtype('int64')], index=['X', 'Y', 'Z']) assert_series_equal(result, expected) + +class TestMergeOnIndexes(object): + + def test_merge_on_indexes(self): + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + # default how='inner' + result = pd.merge(df1, df2, left_index=True, right_index=True) + expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, + index=[2, 1]) + tm.assert_frame_equal(result, expected) + + # how='left' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='left') + expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, + index=[2, 1, 0]) + tm.assert_frame_equal(result, expected) + + # how='right' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='right') + expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, + index=[1, 2, 3]) + tm.assert_frame_equal(result, expected) + + # how='inner' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='inner') + expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, + index=[2, 1]) + tm.assert_frame_equal(result, expected) + + # how='outer' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='outer') + expected = pd.DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]) + tm.assert_frame_equal(result, expected) + + def test_merge_on_indexes_sort(self): + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + # default how='inner' + result = pd.merge(df1, df2, left_index=True, right_index=True, sort=True) + expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, + index=[1, 2]) + tm.assert_frame_equal(result, expected) + + # how='left' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='left', sort=True) + expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, + index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + # how='right' (already sorted) + result = pd.merge(df1, df2, left_index=True, right_index=True, how='right', sort=True) + expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, + index=[1, 2, 3]) + tm.assert_frame_equal(result, expected) + + # how='right' + result = pd.merge(df2, df1, left_index=True, right_index=True, how='right', sort=True) + expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], + columns=['b', 'a'], index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + # how='inner' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='inner', sort=True) + expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, + index=[1, 2]) + tm.assert_frame_equal(result, expected) + + # how='outer' + result = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', sort=True) + expected = pd.DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]) + tm.assert_frame_equal(result, expected) From 33eb740977a90560f310975280cda085a47d444d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 18 Mar 2017 10:44:57 +0100 Subject: [PATCH 15/23] Address requested changes --- doc/source/whatsnew/v0.20.0.txt | 147 +++++++++++++++++++++++++------- pandas/core/frame.py | 7 +- pandas/indexes/base.py | 12 ++- pandas/indexes/range.py | 19 +---- pandas/tests/frame/test_join.py | 35 +++++--- 5 files changed, 154 insertions(+), 66 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 192d4f95a8cba..c75eb483e3b22 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -309,6 +309,7 @@ Other enhancements - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) +- ``Index.intersection()`` accepts parameter ``sort`` (:issue:`15582`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -740,52 +741,135 @@ New Behavior: .. _whatsnew_0200.api_breaking.index_order: -Index order after DataFrame inner join or Index intersection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Index order after inner join due to Index intersection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``DataFrame`` inner join and the ``Index`` intersection, now preserve the -order of the calling's Index (left) instead of the other's Index (right) -(:issue:`15582`) +The ``Index.intersection`` now preserves the order of the calling Index (left) +instead of the other Index (right) (:issue:`15582`). This affects the inner +joins (methods ``Index.join``, ``DataFrame.join``, ``DataFrame.merge`` and +``pd.merge``) and the alignments with inner join (methods ``Series.align`` and +``DataFrame.align``). -Previous Behavior: +- ``Index.intersection`` and ``Index.join`` -.. code-block:: ipython - In [2]: df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + .. ipython:: python - In [3]: df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + idx1 = pd.Index([2, 1, 0]) + idx1 + idx2 = pd.Index([1, 2, 3]) + idx2 - In [4]: df1.join(df2, how='inner') - Out[4]: - a b - 1 10 100 - 2 20 200 + Previous Behavior: - In [5]: idx1 = pd.Index([5, 3, 2, 4, 1]) + .. code-block:: ipython - In [6]: idx2 = pd.Index([4, 7, 6, 5, 3]) + In [4]: idx1.intersection(idx2) + Out[4]: Int64Index([1, 2], dtype='int64') - In [7]: idx1.intersection(idx2) - Out[7]: Int64Index([4, 5, 3], dtype='int64') + In [5]: idx1.join(idx2, how='inner') + Out[5]: Int64Index([1, 2], dtype='int64') -New Behavior: + New Behavior: -.. code-block:: ipython - In [2]: df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + .. ipython:: python + + idx1.intersection(idx2) + + idx1.join(idx2, how='inner') + +- ``Series.align`` + + .. ipython:: python + + s1 = pd.Series([20, 10, 0], index=[2, 1, 0]) + s1 + s2 = pd.Series([100, 200, 300], index=[1, 2, 3]) + s2 + + Previous Behavior: + + .. code-block:: ipython + + In [4]: (res1, res2) = s1.align(s2, join='inner') + + In [5]: res1 + Out[5]: + 1 10 + 2 20 + dtype: int64 + + In [6]: res2 + Out[6]: + 1 100 + 2 200 + dtype: int64 + + New Behavior: + + .. ipython:: python + + (res1, res2) = s1.align(s2, join='inner') + res1 + res2 + +- ``DataFrame.join``, ``DataFrame.merge`` and ``pd.merge`` + + .. ipython:: python + + df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + df1 + df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + df2 + + Previous Behavior: + + .. code-block:: ipython + + In [4]: df1.join(df2, how='inner') + Out[4]: + a b + 1 10 100 + 2 20 200 + + In [5]: df1.merge(df2, how='inner', left_index=True, right_index=True) + Out[5]: + a b + 1 10 100 + 2 20 200 + + In [6]: pd.merge(df1, df2, how='inner', left_index=True, right_index=True) + Out[6]: + a b + 1 10 100 + 2 20 200 + + In [7]: (res1, res2) = df1.align(df2, axis=0, join='inner') + + In [8]: res1 + Out[8]: + a + 1 10 + 2 20 + + In [9]: res2 + Out[9]: + b + 1 100 + 2 200 + + New Behavior: - In [3]: df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + .. ipython:: python - In [4]: df1.join(df2, how='inner') - Out[4]: - a b - 2 20 200 - 1 10 100 + df1.join(df2, how='inner') - In [5]: idx1 = pd.Index([5, 3, 2, 4, 1]) + df1.merge(df2, how='inner', left_index=True, right_index=True) - In [6]: idx2 = pd.Index([4, 7, 6, 5, 3]) + pd.merge(df1, df2, how='inner', left_index=True, right_index=True) - In [7]: idx1.intersection(idx2) - Out[7]: Int64Index([5, 3, 4], dtype='int64') + (res1, res2) = df1.align(df2, axis=0, join='inner') + res1 + res2 .. _whatsnew_0200.api: @@ -1024,3 +1108,4 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) +- Bug with ``sort=True`` in ``DataFrame.join``, ``DataFrame.merge`` and ``pd.merge`` when joining on index (:issue:`15582`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca53f863f1f15..94363b28688e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -127,7 +127,8 @@ * left: use only keys from left frame (SQL: left outer join) * right: use only keys from right frame (SQL: right outer join) * outer: use union of keys from both frames (SQL: full outer join) - * inner: use intersection of keys from both frames (SQL: inner join) + * inner: use intersection of keys from both frames (SQL: inner join), + preserving the order of the left keys on : label or list Field names to join on. Must be found in both DataFrames. If on is None and not merging on indexes, then it merges on the intersection of @@ -147,7 +148,8 @@ Use the index from the right DataFrame as the join key. Same caveats as left_index sort : boolean, default False - Sort the join keys lexicographically in the result DataFrame + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword) suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively @@ -4464,6 +4466,7 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', * right: use other frame's index * outer: form union of calling frame's index (or column if on is specified) with other frame's index, and sort it + lexicographically * inner: form intersection of calling frame's index (or column if on is specified) with other frame's index, preserving the order of the calling's one diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 657a823837c9e..abf835dbcd7f2 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2090,7 +2090,7 @@ def intersection(self, other): Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`, - preserving the calling index order. + preserving the order of the calling index. Parameters ---------- @@ -2831,9 +2831,7 @@ def _reindex_non_unique(self, target): new_index = self._shallow_copy_with_infer(new_labels, freq=None) return new_index, indexer, new_indexer - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - """ + _index_shared_docs['join'] = """ *this is an internal non-public method* Compute join_index and indexers to conform data @@ -2847,10 +2845,16 @@ def join(self, other, how='left', level=None, return_indexers=False, return_indexers : boolean, default False sort : boolean, default False + .. versionadded:: 0.20.0 + Returns ------- join_index, (left_indexer, right_indexer) """ + + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): from .multi import MultiIndex self_is_mi = isinstance(self, MultiIndex) other_is_mi = isinstance(other, MultiIndex) diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 47902b0fc400d..be68c97fb7890 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -431,26 +431,9 @@ def union(self, other): return self._int64index.union(other) + @Appender(_index_shared_docs['join']) def join(self, other, how='left', level=None, return_indexers=False, sort=False): - """ - *this is an internal non-public method* - - Compute join_index and indexers to conform data - structures to the new index. - - Parameters - ---------- - other : Index - how : {'left', 'right', 'inner', 'outer'} - level : int or level name, default None - return_indexers : boolean, default False - sort : boolean, default False - - Returns - ------- - join_index, (left_indexer, right_indexer) - """ if how == 'outer' and self is not other: # note: could return RangeIndex in more circumstances return self._int64index.join(other, how, level, return_indexers, diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index ec6f8ceea5a0c..0e54c2ac46f6a 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -2,6 +2,8 @@ from __future__ import print_function +import numpy as np + import pandas as pd from pandas.tests.frame.common import TestData @@ -15,29 +17,34 @@ def test_join(self): df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + # default how='left' result = df1.join(df2) - expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, None]}, + expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, index=[2, 1, 0]) tm.assert_frame_equal(result, expected) + # how='left' result = df1.join(df2, how='left') - expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, None]}, + expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, index=[2, 1, 0]) tm.assert_frame_equal(result, expected) + # how='right' result = df1.join(df2, how='right') - expected = pd.DataFrame({'a': [10, 20, None], 'b': [100, 200, 300]}, + expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3]) tm.assert_frame_equal(result, expected) + # how='inner' result = df1.join(df2, how='inner') expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, index=[2, 1]) tm.assert_frame_equal(result, expected) + # how='outer' result = df1.join(df2, how='outer') - expected = pd.DataFrame({'a': [0, 10, 20, None], - 'b': [None, 100, 200, 300]}, + expected = pd.DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]) tm.assert_frame_equal(result, expected) @@ -45,34 +52,40 @@ def test_join_sort(self): df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + # default how='left' result = df1.join(df2, sort=True) - expected = pd.DataFrame({'a': [0, 10, 20], 'b': [None, 100, 200]}, + expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) + # how='left' result = df1.join(df2, how='left', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20], 'b': [None, 100, 200]}, + expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) + # how='right' (already sorted) result = df1.join(df2, how='right', sort=True) - expected = pd.DataFrame({'a': [10, 20, None], 'b': [100, 200, 300]}, + expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3]) tm.assert_frame_equal(result, expected) + # how='right' result = df2.join(df1, how='right', sort=True) - expected = pd.DataFrame([[None, 0], [100, 10], [200, 20]], + expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], columns=['b', 'a'], index=[0, 1, 2]) tm.assert_frame_equal(result, expected) + # how='inner' result = df1.join(df2, how='inner', sort=True) expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, index=[1, 2]) tm.assert_frame_equal(result, expected) + # how='outer' result = df1.join(df2, how='outer', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20, None], - 'b': [None, 100, 200, 300]}, + expected = pd.DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]) tm.assert_frame_equal(result, expected) From 654288b8939e3cbe391c1d128f3c33f291bfba24 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sun, 19 Mar 2017 00:43:54 +0100 Subject: [PATCH 16/23] Fix Travis errors --- pandas/tests/frame/test_merge.py | 17 +++++++++++------ pandas/tests/tools/test_merge.py | 30 ++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/pandas/tests/frame/test_merge.py b/pandas/tests/frame/test_merge.py index f29a371d5a970..86d901ba6acd6 100644 --- a/pandas/tests/frame/test_merge.py +++ b/pandas/tests/frame/test_merge.py @@ -55,32 +55,37 @@ def test_merge_on_indexes_sort(self): tm.assert_frame_equal(result, expected) # how='left' - result = df1.merge(df2, left_index=True, right_index=True, how='left', sort=True) + result = df1.merge(df2, left_index=True, right_index=True, how='left', + sort=True) expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) # how='right' (already sorted) - result = df1.merge(df2, left_index=True, right_index=True, how='right', sort=True) + result = df1.merge(df2, left_index=True, right_index=True, + how='right', sort=True) expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3]) tm.assert_frame_equal(result, expected) # how='right' - result = df2.merge(df1, left_index=True, right_index=True, how='right', sort=True) + result = df2.merge(df1, left_index=True, right_index=True, + how='right', sort=True) expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], columns=['b', 'a'], index=[0, 1, 2]) tm.assert_frame_equal(result, expected) # how='inner' - result = df1.merge(df2, left_index=True, right_index=True, how='inner', sort=True) + result = df1.merge(df2, left_index=True, right_index=True, + how='inner', sort=True) expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, index=[1, 2]) tm.assert_frame_equal(result, expected) # how='outer' - result = df1.merge(df2, left_index=True, right_index=True, how='outer', sort=True) + result = df1.merge(df2, left_index=True, right_index=True, + how='outer', sort=True) expected = pd.DataFrame({'a': [0, 10, 20, np.nan], 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) \ No newline at end of file + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index 51c96d19d3677..faaaf009ce239 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1369,25 +1369,29 @@ def test_merge_on_indexes(self): tm.assert_frame_equal(result, expected) # how='left' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='left') + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='left') expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, index=[2, 1, 0]) tm.assert_frame_equal(result, expected) # how='right' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='right') + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='right') expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3]) tm.assert_frame_equal(result, expected) # how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='inner') + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='inner') expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, index=[2, 1]) tm.assert_frame_equal(result, expected) # how='outer' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='outer') + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='outer') expected = pd.DataFrame({'a': [0, 10, 20, np.nan], 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]) @@ -1398,37 +1402,43 @@ def test_merge_on_indexes_sort(self): df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) # default how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True, sort=True) + result = pd.merge(df1, df2, left_index=True, right_index=True, + sort=True) expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, index=[1, 2]) tm.assert_frame_equal(result, expected) # how='left' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='left', sort=True) + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='left', sort=True) expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) # how='right' (already sorted) - result = pd.merge(df1, df2, left_index=True, right_index=True, how='right', sort=True) + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='right', sort=True) expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3]) tm.assert_frame_equal(result, expected) # how='right' - result = pd.merge(df2, df1, left_index=True, right_index=True, how='right', sort=True) + result = pd.merge(df2, df1, left_index=True, right_index=True, + how='right', sort=True) expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], columns=['b', 'a'], index=[0, 1, 2]) tm.assert_frame_equal(result, expected) # how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='inner', sort=True) + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='inner', sort=True) expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, index=[1, 2]) tm.assert_frame_equal(result, expected) # how='outer' - result = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', sort=True) + result = pd.merge(df1, df2, left_index=True, right_index=True, + how='outer', sort=True) expected = pd.DataFrame({'a': [0, 10, 20, np.nan], 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]) From 5bf1508a27dc0b678a3bb422afe0724569981d29 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Mon, 20 Mar 2017 07:53:54 +0100 Subject: [PATCH 17/23] Address requested changes --- doc/source/whatsnew/v0.20.0.txt | 7 +------ pandas/core/frame.py | 10 +++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c75eb483e3b22..c2ce5d36b1bf5 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -309,7 +309,7 @@ Other enhancements - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) -- ``Index.intersection()`` accepts parameter ``sort`` (:issue:`15582`) +- ``Index.join()`` accepts parameter ``sort`` (:issue:`15582`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -774,7 +774,6 @@ joins (methods ``Index.join``, ``DataFrame.join``, ``DataFrame.merge`` and .. ipython:: python idx1.intersection(idx2) - idx1.join(idx2, how='inner') - ``Series.align`` @@ -862,11 +861,7 @@ joins (methods ``Index.join``, ``DataFrame.join``, ``DataFrame.merge`` and .. ipython:: python df1.join(df2, how='inner') - df1.merge(df2, how='inner', left_index=True, right_index=True) - - pd.merge(df1, df2, how='inner', left_index=True, right_index=True) - (res1, res2) = df1.align(df2, axis=0, join='inner') res1 res2 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 94363b28688e8..aa2db9f133dd7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -128,7 +128,7 @@ * right: use only keys from right frame (SQL: right outer join) * outer: use union of keys from both frames (SQL: full outer join) * inner: use intersection of keys from both frames (SQL: inner join), - preserving the order of the left keys + preserving the order of the left keys on : label or list Field names to join on. Must be found in both DataFrames. If on is None and not merging on indexes, then it merges on the intersection of @@ -4465,11 +4465,11 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', * left: use calling frame's index (or column if on is specified) * right: use other frame's index * outer: form union of calling frame's index (or column if on is - specified) with other frame's index, and sort it - lexicographically + specified) with other frame's index, and sort it + lexicographically * inner: form intersection of calling frame's index (or column if - on is specified) with other frame's index, preserving the - order of the calling's one + on is specified) with other frame's index, preserving the order + of the calling's one lsuffix : string Suffix to use from left frame's overlapping columns rsuffix : string From 9e39794808d38f8a425b39c099aa5a4c247d074d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Mon, 20 Mar 2017 21:19:55 +0100 Subject: [PATCH 18/23] Address requested changes --- doc/source/whatsnew/v0.20.0.txt | 77 +++------------------------ pandas/tests/frame/test_merge.py | 91 -------------------------------- 2 files changed, 6 insertions(+), 162 deletions(-) delete mode 100644 pandas/tests/frame/test_merge.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c2ce5d36b1bf5..f3a2ef48ed186 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -309,7 +309,6 @@ Other enhancements - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) -- ``Index.join()`` accepts parameter ``sort`` (:issue:`15582`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -746,11 +745,9 @@ Index order after inner join due to Index intersection The ``Index.intersection`` now preserves the order of the calling Index (left) instead of the other Index (right) (:issue:`15582`). This affects the inner -joins (methods ``Index.join``, ``DataFrame.join``, ``DataFrame.merge`` and -``pd.merge``) and the alignments with inner join (methods ``Series.align`` and -``DataFrame.align``). +joins (methods ``DataFrame.join`` and ``pd.merge``) and the .align methods. -- ``Index.intersection`` and ``Index.join`` +- ``Index.intersection`` .. ipython:: python @@ -766,52 +763,13 @@ joins (methods ``Index.join``, ``DataFrame.join``, ``DataFrame.merge`` and In [4]: idx1.intersection(idx2) Out[4]: Int64Index([1, 2], dtype='int64') - In [5]: idx1.join(idx2, how='inner') - Out[5]: Int64Index([1, 2], dtype='int64') - New Behavior: .. ipython:: python idx1.intersection(idx2) - idx1.join(idx2, how='inner') -- ``Series.align`` - - .. ipython:: python - - s1 = pd.Series([20, 10, 0], index=[2, 1, 0]) - s1 - s2 = pd.Series([100, 200, 300], index=[1, 2, 3]) - s2 - - Previous Behavior: - - .. code-block:: ipython - - In [4]: (res1, res2) = s1.align(s2, join='inner') - - In [5]: res1 - Out[5]: - 1 10 - 2 20 - dtype: int64 - - In [6]: res2 - Out[6]: - 1 100 - 2 200 - dtype: int64 - - New Behavior: - - .. ipython:: python - - (res1, res2) = s1.align(s2, join='inner') - res1 - res2 - -- ``DataFrame.join``, ``DataFrame.merge`` and ``pd.merge`` +- ``DataFrame.join`` and ``pd.merge`` .. ipython:: python @@ -830,41 +788,18 @@ joins (methods ``Index.join``, ``DataFrame.join``, ``DataFrame.merge`` and 1 10 100 2 20 200 - In [5]: df1.merge(df2, how='inner', left_index=True, right_index=True) + In [5]: pd.merge(df1, df2, how='inner', left_index=True, right_index=True) Out[5]: a b 1 10 100 2 20 200 - In [6]: pd.merge(df1, df2, how='inner', left_index=True, right_index=True) - Out[6]: - a b - 1 10 100 - 2 20 200 - - In [7]: (res1, res2) = df1.align(df2, axis=0, join='inner') - - In [8]: res1 - Out[8]: - a - 1 10 - 2 20 - - In [9]: res2 - Out[9]: - b - 1 100 - 2 200 - New Behavior: .. ipython:: python df1.join(df2, how='inner') - df1.merge(df2, how='inner', left_index=True, right_index=True) - (res1, res2) = df1.align(df2, axis=0, join='inner') - res1 - res2 + pd.merge(df1, df2, how='inner', left_index=True, right_index=True) .. _whatsnew_0200.api: @@ -1103,4 +1038,4 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) -- Bug with ``sort=True`` in ``DataFrame.join``, ``DataFrame.merge`` and ``pd.merge`` when joining on index (:issue:`15582`) +- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`) diff --git a/pandas/tests/frame/test_merge.py b/pandas/tests/frame/test_merge.py deleted file mode 100644 index 86d901ba6acd6..0000000000000 --- a/pandas/tests/frame/test_merge.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import print_function - -import numpy as np -import pandas as pd -import pandas.util.testing as tm - - -class TestDataFrameMerge(object): - - def test_merge_on_indexes(self): - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - - # default how='inner' - result = df1.merge(df2, left_index=True, right_index=True) - expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, - index=[2, 1]) - tm.assert_frame_equal(result, expected) - - # how='left' - result = df1.merge(df2, left_index=True, right_index=True, how='left') - expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, - index=[2, 1, 0]) - tm.assert_frame_equal(result, expected) - - # how='right' - result = df1.merge(df2, left_index=True, right_index=True, how='right') - expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, - index=[1, 2, 3]) - tm.assert_frame_equal(result, expected) - - # how='inner' - result = df1.merge(df2, left_index=True, right_index=True, how='inner') - expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, - index=[2, 1]) - tm.assert_frame_equal(result, expected) - - # how='outer' - result = df1.merge(df2, left_index=True, right_index=True, how='outer') - expected = pd.DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) - - def test_merge_on_indexes_sort(self): - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - - # default how='inner' - result = df1.merge(df2, left_index=True, right_index=True, sort=True) - expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, - index=[1, 2]) - tm.assert_frame_equal(result, expected) - - # how='left' - result = df1.merge(df2, left_index=True, right_index=True, how='left', - sort=True) - expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, - index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - # how='right' (already sorted) - result = df1.merge(df2, left_index=True, right_index=True, - how='right', sort=True) - expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, - index=[1, 2, 3]) - tm.assert_frame_equal(result, expected) - - # how='right' - result = df2.merge(df1, left_index=True, right_index=True, - how='right', sort=True) - expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], - columns=['b', 'a'], index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - # how='inner' - result = df1.merge(df2, left_index=True, right_index=True, - how='inner', sort=True) - expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, - index=[1, 2]) - tm.assert_frame_equal(result, expected) - - # how='outer' - result = df1.merge(df2, left_index=True, right_index=True, - how='outer', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) From 968c7f197652411e3c6e92d4703c27ce2652577a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 25 Mar 2017 11:02:57 -0400 Subject: [PATCH 19/23] DOC/TST: change to use parameterization --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/frame/test_join.py | 239 ++++++++++++++----------------- pandas/tests/tools/test_merge.py | 120 ++++++---------- 3 files changed, 145 insertions(+), 216 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f3a2ef48ed186..613985a6ded52 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1023,6 +1023,7 @@ Bug Fixes - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`) +- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`) - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) - Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`) @@ -1038,4 +1039,3 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) -- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 0e54c2ac46f6a..b4e8a8234af31 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -1,171 +1,140 @@ # -*- coding: utf-8 -*- -from __future__ import print_function - +import pytest import numpy as np -import pandas as pd - +from pandas import DataFrame, Index from pandas.tests.frame.common import TestData - import pandas.util.testing as tm -class TestDataFrameJoin(TestData): - - def test_join(self): - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) +@pytest.fixture +def frame(): + return TestData().frame + + +@pytest.fixture +def df1(): + return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + + +@pytest.fixture +def df2(): + return DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + + +@pytest.mark.parametrize( + "how, sort, expected", + [('inner', False, DataFrame({'a': [20, 10], + 'b': [200, 100]}, + index=[2, 1])), + ('inner', True, DataFrame({'a': [10, 20], + 'b': [100, 200]}, + index=[1, 2])), + ('left', False, DataFrame({'a': [20, 10, 0], + 'b': [200, 100, np.nan]}, + index=[2, 1, 0])), + ('left', True, DataFrame({'a': [0, 10, 20], + 'b': [np.nan, 100, 200]}, + index=[0, 1, 2])), + ('right', False, DataFrame({'a': [10, 20, np.nan], + 'b': [100, 200, 300]}, + index=[1, 2, 3])), + ('right', True, DataFrame({'a': [10, 20, np.nan], + 'b': [100, 200, 300]}, + index=[1, 2, 3])), + ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3])), + ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]))]) +def test_join(df1, df2, how, sort, expected): - # default how='left' - result = df1.join(df2) - expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, - index=[2, 1, 0]) - tm.assert_frame_equal(result, expected) + result = df1.join(df2, how=how, sort=sort) + tm.assert_frame_equal(result, expected) - # how='left' - result = df1.join(df2, how='left') - expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, - index=[2, 1, 0]) - tm.assert_frame_equal(result, expected) - # how='right' - result = df1.join(df2, how='right') - expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, - index=[1, 2, 3]) - tm.assert_frame_equal(result, expected) +def test_join_index(frame): + # left / right - # how='inner' - result = df1.join(df2, how='inner') - expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, - index=[2, 1]) - tm.assert_frame_equal(result, expected) + f = frame.loc[frame.index[:10], ['A', 'B']] + f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1] - # how='outer' - result = df1.join(df2, how='outer') - expected = pd.DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) - - def test_join_sort(self): - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - - # default how='left' - result = df1.join(df2, sort=True) - expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, - index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - # how='left' - result = df1.join(df2, how='left', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, - index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - # how='right' (already sorted) - result = df1.join(df2, how='right', sort=True) - expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, - index=[1, 2, 3]) - tm.assert_frame_equal(result, expected) - - # how='right' - result = df2.join(df1, how='right', sort=True) - expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], - columns=['b', 'a'], index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - # how='inner' - result = df1.join(df2, how='inner', sort=True) - expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, - index=[1, 2]) - tm.assert_frame_equal(result, expected) - - # how='outer' - result = df1.join(df2, how='outer', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) + joined = f.join(f2) + tm.assert_index_equal(f.index, joined.index) + expected_columns = Index(['A', 'B', 'C', 'D']) + tm.assert_index_equal(joined.columns, expected_columns) - def test_join_index(self): - # left / right + joined = f.join(f2, how='left') + tm.assert_index_equal(joined.index, f.index) + tm.assert_index_equal(joined.columns, expected_columns) - f = self.frame.loc[self.frame.index[:10], ['A', 'B']] - f2 = self.frame.loc[self.frame.index[5:], ['C', 'D']].iloc[::-1] + joined = f.join(f2, how='right') + tm.assert_index_equal(joined.index, f2.index) + tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2) - tm.assert_index_equal(f.index, joined.index) - expected_columns = pd.Index(['A', 'B', 'C', 'D']) - tm.assert_index_equal(joined.columns, expected_columns) + # inner - joined = f.join(f2, how='left') - tm.assert_index_equal(joined.index, f.index) - tm.assert_index_equal(joined.columns, expected_columns) + joined = f.join(f2, how='inner') + tm.assert_index_equal(joined.index, f.index[5:10]) + tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2, how='right') - tm.assert_index_equal(joined.index, f2.index) - tm.assert_index_equal(joined.columns, expected_columns) + # outer - # inner + joined = f.join(f2, how='outer') + tm.assert_index_equal(joined.index, frame.index.sort_values()) + tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2, how='inner') - tm.assert_index_equal(joined.index, f.index[5:10]) - tm.assert_index_equal(joined.columns, expected_columns) + tm.assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') - # outer + # corner case - overlapping columns + for how in ('outer', 'left', 'inner'): + with tm.assertRaisesRegexp(ValueError, 'columns overlap but ' + 'no suffix'): + frame.join(frame, how=how) - joined = f.join(f2, how='outer') - tm.assert_index_equal(joined.index, self.frame.index.sort_values()) - tm.assert_index_equal(joined.columns, expected_columns) - tm.assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') +def test_join_index_more(frame): + af = frame.loc[:, ['A', 'B']] + bf = frame.loc[::2, ['C', 'D']] - # corner case - overlapping columns - for how in ('outer', 'left', 'inner'): - with tm.assertRaisesRegexp(ValueError, 'columns overlap but ' - 'no suffix'): - self.frame.join(self.frame, how=how) + expected = af.copy() + expected['C'] = frame['C'][::2] + expected['D'] = frame['D'][::2] - def test_join_index_more(self): - af = self.frame.loc[:, ['A', 'B']] - bf = self.frame.loc[::2, ['C', 'D']] + result = af.join(bf) + tm.assert_frame_equal(result, expected) - expected = af.copy() - expected['C'] = self.frame['C'][::2] - expected['D'] = self.frame['D'][::2] + result = af.join(bf, how='right') + tm.assert_frame_equal(result, expected[::2]) - result = af.join(bf) - tm.assert_frame_equal(result, expected) + result = bf.join(af, how='right') + tm.assert_frame_equal(result, expected.loc[:, result.columns]) - result = af.join(bf, how='right') - tm.assert_frame_equal(result, expected[::2]) - result = bf.join(af, how='right') - tm.assert_frame_equal(result, expected.loc[:, result.columns]) +def test_join_index_series(frame): + df = frame.copy() + s = df.pop(frame.columns[-1]) + joined = df.join(s) - def test_join_index_series(self): - df = self.frame.copy() - s = df.pop(self.frame.columns[-1]) - joined = df.join(s) + # TODO should this check_names ? + tm.assert_frame_equal(joined, frame, check_names=False) - # TODO should this check_names ? - tm.assert_frame_equal(joined, self.frame, check_names=False) + s.name = None + tm.assertRaisesRegexp(ValueError, 'must have a name', df.join, s) - s.name = None - tm.assertRaisesRegexp(ValueError, 'must have a name', df.join, s) - def test_join_overlap(self): - df1 = self.frame.loc[:, ['A', 'B', 'C']] - df2 = self.frame.loc[:, ['B', 'C', 'D']] +def test_join_overlap(frame): + df1 = frame.loc[:, ['A', 'B', 'C']] + df2 = frame.loc[:, ['B', 'C', 'D']] - joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') - df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') - df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') + joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') + df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') + df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') - no_overlap = self.frame.loc[:, ['A', 'D']] - expected = df1_suf.join(df2_suf).join(no_overlap) + no_overlap = frame.loc[:, ['A', 'D']] + expected = df1_suf.join(df2_suf).join(no_overlap) - # column order not necessarily sorted - tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) + # column order not necessarily sorted + tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index faaaf009ce239..68d64d2c11e5d 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1356,90 +1356,50 @@ def test_dtype_on_merged_different(self, change, how, left, right): index=['X', 'Y', 'Z']) assert_series_equal(result, expected) -class TestMergeOnIndexes(object): - - def test_merge_on_indexes(self): - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - - # default how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True) - expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, - index=[2, 1]) - tm.assert_frame_equal(result, expected) - - # how='left' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='left') - expected = pd.DataFrame({'a': [20, 10, 0], 'b': [200, 100, np.nan]}, - index=[2, 1, 0]) - tm.assert_frame_equal(result, expected) - - # how='right' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='right') - expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, - index=[1, 2, 3]) - tm.assert_frame_equal(result, expected) - - # how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='inner') - expected = pd.DataFrame({'a': [20, 10], 'b': [200, 100]}, - index=[2, 1]) - tm.assert_frame_equal(result, expected) - - # how='outer' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='outer') - expected = pd.DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) - - def test_merge_on_indexes_sort(self): - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - # default how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True, - sort=True) - expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, - index=[1, 2]) - tm.assert_frame_equal(result, expected) +@pytest.fixture +def df1(): + return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - # how='left' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='left', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, - index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - # how='right' (already sorted) - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='right', sort=True) - expected = pd.DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, - index=[1, 2, 3]) - tm.assert_frame_equal(result, expected) +@pytest.fixture +def df2(): + return DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - # how='right' - result = pd.merge(df2, df1, left_index=True, right_index=True, - how='right', sort=True) - expected = pd.DataFrame([[np.nan, 0], [100, 10], [200, 20]], - columns=['b', 'a'], index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - # how='inner' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='inner', sort=True) - expected = pd.DataFrame({'a': [10, 20], 'b': [100, 200]}, - index=[1, 2]) - tm.assert_frame_equal(result, expected) +class TestMergeOnIndexes(object): - # how='outer' - result = pd.merge(df1, df2, left_index=True, right_index=True, - how='outer', sort=True) - expected = pd.DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]) + @pytest.mark.parametrize( + "how, sort, expected", + [('inner', False, DataFrame({'a': [20, 10], + 'b': [200, 100]}, + index=[2, 1])), + ('inner', True, DataFrame({'a': [10, 20], + 'b': [100, 200]}, + index=[1, 2])), + ('left', False, DataFrame({'a': [20, 10, 0], + 'b': [200, 100, np.nan]}, + index=[2, 1, 0])), + ('left', True, DataFrame({'a': [0, 10, 20], + 'b': [np.nan, 100, 200]}, + index=[0, 1, 2])), + ('right', False, DataFrame({'a': [10, 20, np.nan], + 'b': [100, 200, 300]}, + index=[1, 2, 3])), + ('right', True, DataFrame({'a': [10, 20, np.nan], + 'b': [100, 200, 300]}, + index=[1, 2, 3])), + ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3])), + ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]))]) + def test_merge_on_indexes(self, df1, df2, how, sort, expected): + + result = pd.merge(df1, df2, + left_index=True, + right_index=True, + how=how, + sort=sort) tm.assert_frame_equal(result, expected) From 8d2e9cc9fe73ebe8c479dc61edfcf95e0fe980ca Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Tue, 28 Mar 2017 14:51:12 +0200 Subject: [PATCH 20/23] Address requested changes --- doc/source/whatsnew/v0.20.0.txt | 11 ++--------- pandas/core/frame.py | 9 ++++++--- pandas/indexes/base.py | 2 ++ pandas/tests/frame/test_join.py | 6 +++--- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 613985a6ded52..ee7059e93a807 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -740,8 +740,8 @@ New Behavior: .. _whatsnew_0200.api_breaking.index_order: -Index order after inner join due to Index intersection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Index.intersection and inner join now preserve the order of the left Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``Index.intersection`` now preserves the order of the calling Index (left) instead of the other Index (right) (:issue:`15582`). This affects the inner @@ -788,18 +788,11 @@ joins (methods ``DataFrame.join`` and ``pd.merge``) and the .align methods. 1 10 100 2 20 200 - In [5]: pd.merge(df1, df2, how='inner', left_index=True, right_index=True) - Out[5]: - a b - 1 10 100 - 2 20 200 - New Behavior: .. ipython:: python df1.join(df2, how='inner') - pd.merge(df1, df2, how='inner', left_index=True, right_index=True) .. _whatsnew_0200.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aa2db9f133dd7..325e387fd59d4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -124,9 +124,12 @@ ----------%s right : DataFrame how : {'left', 'right', 'outer', 'inner'}, default 'inner' - * left: use only keys from left frame (SQL: left outer join) - * right: use only keys from right frame (SQL: right outer join) - * outer: use union of keys from both frames (SQL: full outer join) + * left: use only keys from left frame (SQL: left outer join), preserving + their order + * right: use only keys from right frame (SQL: right outer join), preserving + their order + * outer: use union of keys from both frames (SQL: full outer join), and + sort them lexicographically * inner: use intersection of keys from both frames (SQL: inner join), preserving the order of the left keys on : label or list diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index abf835dbcd7f2..7f0de963e5c56 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2844,6 +2844,8 @@ def _reindex_non_unique(self, target): level : int or level name, default None return_indexers : boolean, default False sort : boolean, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword) .. versionadded:: 0.20.0 diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index b4e8a8234af31..1b1f25fddbc85 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -20,7 +20,7 @@ def df1(): @pytest.fixture def df2(): - return DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) @pytest.mark.parametrize( @@ -37,8 +37,8 @@ def df2(): ('left', True, DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, index=[0, 1, 2])), - ('right', False, DataFrame({'a': [10, 20, np.nan], - 'b': [100, 200, 300]}, + ('right', False, DataFrame({'a': [np.nan, 10, 20], + 'b': [300, 100, 200]}, index=[1, 2, 3])), ('right', True, DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, From 73df69ecb1a6c05c9e1254abd4c9f242badf96a3 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Tue, 28 Mar 2017 15:19:48 +0200 Subject: [PATCH 21/23] Address requested changes --- doc/source/whatsnew/v0.20.0.txt | 24 ++++++++++++------------ pandas/core/frame.py | 16 ++++++++-------- pandas/tests/frame/test_join.py | 8 ++++---- pandas/tests/tools/test_merge.py | 14 +++++++------- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ee7059e93a807..a808ad4bd9dc6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -751,38 +751,38 @@ joins (methods ``DataFrame.join`` and ``pd.merge``) and the .align methods. .. ipython:: python - idx1 = pd.Index([2, 1, 0]) - idx1 - idx2 = pd.Index([1, 2, 3]) - idx2 + left = pd.Index([2, 1, 0]) + left + right = pd.Index([1, 2, 3]) + right Previous Behavior: .. code-block:: ipython - In [4]: idx1.intersection(idx2) + In [4]: left.intersection(right) Out[4]: Int64Index([1, 2], dtype='int64') New Behavior: .. ipython:: python - idx1.intersection(idx2) + left.intersection(right) - ``DataFrame.join`` and ``pd.merge`` .. ipython:: python - df1 = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) - df1 - df2 = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) - df2 + left = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + left + right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + right Previous Behavior: .. code-block:: ipython - In [4]: df1.join(df2, how='inner') + In [4]: left.join(right, how='inner') Out[4]: a b 1 10 100 @@ -792,7 +792,7 @@ joins (methods ``DataFrame.join`` and ``pd.merge``) and the .align methods. .. ipython:: python - df1.join(df2, how='inner') + left.join(right, how='inner') .. _whatsnew_0200.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 325e387fd59d4..0f558232786ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -124,14 +124,14 @@ ----------%s right : DataFrame how : {'left', 'right', 'outer', 'inner'}, default 'inner' - * left: use only keys from left frame (SQL: left outer join), preserving - their order - * right: use only keys from right frame (SQL: right outer join), preserving - their order - * outer: use union of keys from both frames (SQL: full outer join), and - sort them lexicographically - * inner: use intersection of keys from both frames (SQL: inner join), - preserving the order of the left keys + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys on : label or list Field names to join on. Must be found in both DataFrames. If on is None and not merging on indexes, then it merges on the intersection of diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 1b1f25fddbc85..fbb14c3eb2d94 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -14,12 +14,12 @@ def frame(): @pytest.fixture -def df1(): +def left(): return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture -def df2(): +def right(): return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) @@ -49,9 +49,9 @@ def df2(): ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]))]) -def test_join(df1, df2, how, sort, expected): +def test_join(left, right, how, sort, expected): - result = df1.join(df2, how=how, sort=sort) + result = left.join(right, how=how, sort=sort) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index 68d64d2c11e5d..59c86d08e3f43 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1358,13 +1358,13 @@ def test_dtype_on_merged_different(self, change, how, left, right): @pytest.fixture -def df1(): +def left(): return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture -def df2(): - return DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) +def right(): + return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) class TestMergeOnIndexes(object): @@ -1383,8 +1383,8 @@ class TestMergeOnIndexes(object): ('left', True, DataFrame({'a': [0, 10, 20], 'b': [np.nan, 100, 200]}, index=[0, 1, 2])), - ('right', False, DataFrame({'a': [10, 20, np.nan], - 'b': [100, 200, 300]}, + ('right', False, DataFrame({'a': [np.nan, 10, 20], + 'b': [300, 100, 200]}, index=[1, 2, 3])), ('right', True, DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, @@ -1395,9 +1395,9 @@ class TestMergeOnIndexes(object): ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]))]) - def test_merge_on_indexes(self, df1, df2, how, sort, expected): + def test_merge_on_indexes(self, left, right, how, sort, expected): - result = pd.merge(df1, df2, + result = pd.merge(left, right, left_index=True, right_index=True, how=how, From 64e86a4b5c097d4cfa55c2e27aad6188d5ea6a16 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Tue, 28 Mar 2017 15:51:04 +0200 Subject: [PATCH 22/23] Fix test on right join --- pandas/tests/frame/test_join.py | 2 +- pandas/tests/tools/test_merge.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index fbb14c3eb2d94..f7a510023ca07 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -39,7 +39,7 @@ def right(): index=[0, 1, 2])), ('right', False, DataFrame({'a': [np.nan, 10, 20], 'b': [300, 100, 200]}, - index=[1, 2, 3])), + index=[3, 1, 2])), ('right', True, DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3])), diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index 59c86d08e3f43..9e3251e1f9698 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1385,7 +1385,7 @@ class TestMergeOnIndexes(object): index=[0, 1, 2])), ('right', False, DataFrame({'a': [np.nan, 10, 20], 'b': [300, 100, 200]}, - index=[1, 2, 3])), + index=[3, 1, 2])), ('right', True, DataFrame({'a': [10, 20, np.nan], 'b': [100, 200, 300]}, index=[1, 2, 3])), From 2d4e143fa18ebb67963784b0d1751e9c35222317 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 29 Mar 2017 00:41:03 +0200 Subject: [PATCH 23/23] Fix pytest fixture name collision --- pandas/tests/tools/test_merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index 9e3251e1f9698..8011bc4a1cfc2 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1358,12 +1358,12 @@ def test_dtype_on_merged_different(self, change, how, left, right): @pytest.fixture -def left(): +def left_df(): return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture -def right(): +def right_df(): return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) @@ -1395,9 +1395,9 @@ class TestMergeOnIndexes(object): ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], 'b': [np.nan, 100, 200, 300]}, index=[0, 1, 2, 3]))]) - def test_merge_on_indexes(self, left, right, how, sort, expected): + def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): - result = pd.merge(left, right, + result = pd.merge(left_df, right_df, left_index=True, right_index=True, how=how,