diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 37a70435ed6ff..a808ad4bd9dc6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -738,6 +738,62 @@ New Behavior: TypeError: Cannot compare 2014-01-01 00:00:00 of type to string column +.. _whatsnew_0200.api_breaking.index_order: + +Index.intersection and inner join now preserve the order of the left Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Index.intersection`` now preserves the order of the calling Index (left) +instead of the other Index (right) (:issue:`15582`). This affects the inner +joins (methods ``DataFrame.join`` and ``pd.merge``) and the .align methods. + +- ``Index.intersection`` + + .. ipython:: python + + left = pd.Index([2, 1, 0]) + left + right = pd.Index([1, 2, 3]) + right + + Previous Behavior: + + .. code-block:: ipython + + In [4]: left.intersection(right) + Out[4]: Int64Index([1, 2], dtype='int64') + + New Behavior: + + .. ipython:: python + + left.intersection(right) + +- ``DataFrame.join`` and ``pd.merge`` + + .. ipython:: python + + left = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + left + right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) + right + + Previous Behavior: + + .. code-block:: ipython + + In [4]: left.join(right, how='inner') + Out[4]: + a b + 1 10 100 + 2 20 200 + + New Behavior: + + .. ipython:: python + + left.join(right, how='inner') + .. _whatsnew_0200.api: @@ -960,6 +1016,7 @@ Bug Fixes - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`) - Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`) +- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`) - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) - Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6b5e8e0799421..0f558232786ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -124,10 +124,14 @@ ----------%s right : DataFrame how : {'left', 'right', 'outer', 'inner'}, default 'inner' - * left: use only keys from left frame (SQL: left outer join) - * right: use only keys from right frame (SQL: right outer join) - * outer: use union of keys from both frames (SQL: full outer join) - * inner: use intersection of keys from both frames (SQL: inner join) + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys on : label or list Field names to join on. Must be found in both DataFrames. If on is None and not merging on indexes, then it merges on the intersection of @@ -147,7 +151,8 @@ Use the index from the right DataFrame as the join key. Same caveats as left_index sort : boolean, default False - Sort the join keys lexicographically in the result DataFrame + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword) suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively @@ -4463,16 +4468,18 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', * left: use calling frame's index (or column if on is specified) * right: use other frame's index * outer: form union of calling frame's index (or column if on is - specified) with other frame's index + specified) with other frame's index, and sort it + lexicographically * inner: form intersection of calling frame's index (or column if - on is specified) with other frame's index + on is specified) with other frame's index, preserving the order + of the calling's one lsuffix : string Suffix to use from left frame's overlapping columns rsuffix : string Suffix to use from right frame's overlapping columns sort : boolean, default False Order result DataFrame lexicographically by the join key. If False, - preserves the index order of the calling (left) DataFrame + the order of the join key depends on the join type (how keyword) Notes ----- diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 54f73a2466286..7f0de963e5c56 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2089,8 +2089,8 @@ def intersection(self, other): """ Form the intersection of two Index objects. - This returns a new Index with elements common to the index and `other`. - Sortedness of the result is not guaranteed. + This returns a new Index with elements common to the index and `other`, + preserving the order of the calling index. Parameters ---------- @@ -2128,15 +2128,15 @@ def intersection(self, other): pass try: - indexer = Index(self._values).get_indexer(other._values) + indexer = Index(other._values).get_indexer(self._values) indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = Index(self._values).get_indexer_non_unique( - other._values)[0].unique() + indexer = Index(other._values).get_indexer_non_unique( + self._values)[0].unique() indexer = indexer[indexer != -1] - taken = self.take(indexer) + taken = other.take(indexer) if self.name != other.name: taken.name = None return taken @@ -2831,8 +2831,7 @@ def _reindex_non_unique(self, target): new_index = self._shallow_copy_with_infer(new_labels, freq=None) return new_index, indexer, new_indexer - def join(self, other, how='left', level=None, return_indexers=False): - """ + _index_shared_docs['join'] = """ *this is an internal non-public method* Compute join_index and indexers to conform data @@ -2844,11 +2843,20 @@ def join(self, other, how='left', level=None, return_indexers=False): how : {'left', 'right', 'inner', 'outer'} level : int or level name, default None return_indexers : boolean, default False + sort : boolean, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword) + + .. versionadded:: 0.20.0 Returns ------- join_index, (left_indexer, right_indexer) """ + + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): from .multi import MultiIndex self_is_mi = isinstance(self, MultiIndex) other_is_mi = isinstance(other, MultiIndex) @@ -2929,6 +2937,9 @@ def join(self, other, how='left', level=None, return_indexers=False): elif how == 'outer': join_index = self.union(other) + if sort: + join_index = join_index.sort_values() + if return_indexers: if join_index is self: lindexer = None diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 103a3ac2fd5f4..be68c97fb7890 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -431,29 +431,16 @@ def union(self, other): return self._int64index.union(other) - def join(self, other, how='left', level=None, return_indexers=False): - """ - *this is an internal non-public method* - - Compute join_index and indexers to conform data - structures to the new index. - - Parameters - ---------- - other : Index - how : {'left', 'right', 'inner', 'outer'} - level : int or level name, default None - return_indexers : boolean, default False - - Returns - ------- - join_index, (left_indexer, right_indexer) - """ + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): if how == 'outer' and self is not other: # note: could return RangeIndex in more circumstances - return self._int64index.join(other, how, level, return_indexers) + return self._int64index.join(other, how, level, return_indexers, + sort) - return super(RangeIndex, self).join(other, how, level, return_indexers) + return super(RangeIndex, self).join(other, how, level, return_indexers, + sort) def __len__(self): """ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b3b253f151541..f75a4761e0948 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4321,7 +4321,7 @@ def _reindex_axis(obj, axis, labels, other=None): labels = _ensure_index(labels.unique()) if other is not None: - labels = labels & _ensure_index(other.unique()) + labels = _ensure_index(other.unique()) & labels if not labels.equals(ax): slicer = [slice(None, None)] * obj.ndim slicer[axis] = labels diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py new file mode 100644 index 0000000000000..f7a510023ca07 --- /dev/null +++ b/pandas/tests/frame/test_join.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +import pytest +import numpy as np + +from pandas import DataFrame, Index +from pandas.tests.frame.common import TestData +import pandas.util.testing as tm + + +@pytest.fixture +def frame(): + return TestData().frame + + +@pytest.fixture +def left(): + return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + + +@pytest.fixture +def right(): + return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) + + +@pytest.mark.parametrize( + "how, sort, expected", + [('inner', False, DataFrame({'a': [20, 10], + 'b': [200, 100]}, + index=[2, 1])), + ('inner', True, DataFrame({'a': [10, 20], + 'b': [100, 200]}, + index=[1, 2])), + ('left', False, DataFrame({'a': [20, 10, 0], + 'b': [200, 100, np.nan]}, + index=[2, 1, 0])), + ('left', True, DataFrame({'a': [0, 10, 20], + 'b': [np.nan, 100, 200]}, + index=[0, 1, 2])), + ('right', False, DataFrame({'a': [np.nan, 10, 20], + 'b': [300, 100, 200]}, + index=[3, 1, 2])), + ('right', True, DataFrame({'a': [10, 20, np.nan], + 'b': [100, 200, 300]}, + index=[1, 2, 3])), + ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3])), + ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]))]) +def test_join(left, right, how, sort, expected): + + result = left.join(right, how=how, sort=sort) + tm.assert_frame_equal(result, expected) + + +def test_join_index(frame): + # left / right + + f = frame.loc[frame.index[:10], ['A', 'B']] + f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1] + + joined = f.join(f2) + tm.assert_index_equal(f.index, joined.index) + expected_columns = Index(['A', 'B', 'C', 'D']) + tm.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how='left') + tm.assert_index_equal(joined.index, f.index) + tm.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how='right') + tm.assert_index_equal(joined.index, f2.index) + tm.assert_index_equal(joined.columns, expected_columns) + + # inner + + joined = f.join(f2, how='inner') + tm.assert_index_equal(joined.index, f.index[5:10]) + tm.assert_index_equal(joined.columns, expected_columns) + + # outer + + joined = f.join(f2, how='outer') + tm.assert_index_equal(joined.index, frame.index.sort_values()) + tm.assert_index_equal(joined.columns, expected_columns) + + tm.assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') + + # corner case - overlapping columns + for how in ('outer', 'left', 'inner'): + with tm.assertRaisesRegexp(ValueError, 'columns overlap but ' + 'no suffix'): + frame.join(frame, how=how) + + +def test_join_index_more(frame): + af = frame.loc[:, ['A', 'B']] + bf = frame.loc[::2, ['C', 'D']] + + expected = af.copy() + expected['C'] = frame['C'][::2] + expected['D'] = frame['D'][::2] + + result = af.join(bf) + tm.assert_frame_equal(result, expected) + + result = af.join(bf, how='right') + tm.assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how='right') + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + +def test_join_index_series(frame): + df = frame.copy() + s = df.pop(frame.columns[-1]) + joined = df.join(s) + + # TODO should this check_names ? + tm.assert_frame_equal(joined, frame, check_names=False) + + s.name = None + tm.assertRaisesRegexp(ValueError, 'must have a name', df.join, s) + + +def test_join_overlap(frame): + df1 = frame.loc[:, ['A', 'B', 'C']] + df2 = frame.loc[:, ['B', 'C', 'D']] + + joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') + df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') + df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') + + no_overlap = frame.loc[:, ['A', 'D']] + expected = df1_suf.join(df2_suf).join(no_overlap) + + # column order not necessarily sorted + tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 321d46739b24c..42427df90401d 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -57,92 +57,6 @@ def test_get_value(self): expected = self.frame[col][idx] tm.assert_almost_equal(result, expected) - def test_join_index(self): - # left / right - - f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D']) - - joined = f.join(f2) - self.assert_index_equal(f.index, joined.index) - self.assertEqual(len(joined.columns), 4) - - joined = f.join(f2, how='left') - self.assert_index_equal(joined.index, f.index) - self.assertEqual(len(joined.columns), 4) - - joined = f.join(f2, how='right') - self.assert_index_equal(joined.index, f2.index) - self.assertEqual(len(joined.columns), 4) - - # inner - - f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D']) - - joined = f.join(f2, how='inner') - self.assert_index_equal(joined.index, f.index.intersection(f2.index)) - self.assertEqual(len(joined.columns), 4) - - # outer - - f = self.frame.reindex(columns=['A', 'B'])[:10] - f2 = self.frame.reindex(columns=['C', 'D']) - - joined = f.join(f2, how='outer') - self.assertTrue(tm.equalContents(self.frame.index, joined.index)) - self.assertEqual(len(joined.columns), 4) - - assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') - - # corner case - overlapping columns - for how in ('outer', 'left', 'inner'): - with assertRaisesRegexp(ValueError, 'columns overlap but ' - 'no suffix'): - self.frame.join(self.frame, how=how) - - def test_join_index_more(self): - af = self.frame.loc[:, ['A', 'B']] - bf = self.frame.loc[::2, ['C', 'D']] - - expected = af.copy() - expected['C'] = self.frame['C'][::2] - expected['D'] = self.frame['D'][::2] - - result = af.join(bf) - assert_frame_equal(result, expected) - - result = af.join(bf, how='right') - assert_frame_equal(result, expected[::2]) - - result = bf.join(af, how='right') - assert_frame_equal(result, expected.loc[:, result.columns]) - - def test_join_index_series(self): - df = self.frame.copy() - s = df.pop(self.frame.columns[-1]) - joined = df.join(s) - - # TODO should this check_names ? - assert_frame_equal(joined, self.frame, check_names=False) - - s.name = None - assertRaisesRegexp(ValueError, 'must have a name', df.join, s) - - def test_join_overlap(self): - df1 = self.frame.loc[:, ['A', 'B', 'C']] - df2 = self.frame.loc[:, ['B', 'C', 'D']] - - joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') - df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') - df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') - - no_overlap = self.frame.loc[:, ['A', 'D']] - expected = df1_suf.join(df2_suf).join(no_overlap) - - # column order not necessarily sorted - assert_frame_equal(joined, expected.loc[:, joined.columns]) - def test_add_prefix_suffix(self): with_prefix = self.frame.add_prefix('foo#') expected = pd.Index(['foo#%s' % c for c in self.frame.columns]) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c4dc10d8174cc..a8197b070b032 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -626,14 +626,14 @@ def test_intersection(self): # non monotonic idx1 = Index([5, 3, 2, 4, 1], name='idx') idx2 = Index([4, 7, 6, 5, 3], name='idx') - result2 = idx1.intersection(idx2) - self.assertTrue(tm.equalContents(result2, expected2)) - self.assertEqual(result2.name, expected2.name) + expected = Index([5, 3, 4], name='idx') + result = idx1.intersection(idx2) + self.assert_index_equal(result, expected) - idx3 = Index([4, 7, 6, 5, 3], name='other') - result3 = idx1.intersection(idx3) - self.assertTrue(tm.equalContents(result3, expected3)) - self.assertEqual(result3.name, expected3.name) + idx2 = Index([4, 7, 6, 5, 3], name='other') + expected = Index([5, 3, 4], name=None) + result = idx1.intersection(idx2) + self.assert_index_equal(result, expected) # non-monotonic non-unique idx1 = Index(['A', 'B', 'A', 'C']) @@ -642,6 +642,11 @@ def test_intersection(self): result = idx1.intersection(idx2) self.assert_index_equal(result, expected) + idx2 = Index(['B', 'D', 'A']) + expected = Index(['A', 'B', 'A'], dtype='object') + result = idx1.intersection(idx2) + self.assert_index_equal(result, expected) + # preserve names first = self.strIndex[5:20] second = self.strIndex[:10] diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py index ff27500355998..8011bc4a1cfc2 100644 --- a/pandas/tests/tools/test_merge.py +++ b/pandas/tests/tools/test_merge.py @@ -1355,3 +1355,51 @@ def test_dtype_on_merged_different(self, change, how, left, right): np.dtype('int64')], index=['X', 'Y', 'Z']) assert_series_equal(result, expected) + + +@pytest.fixture +def left_df(): + return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + + +@pytest.fixture +def right_df(): + return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) + + +class TestMergeOnIndexes(object): + + @pytest.mark.parametrize( + "how, sort, expected", + [('inner', False, DataFrame({'a': [20, 10], + 'b': [200, 100]}, + index=[2, 1])), + ('inner', True, DataFrame({'a': [10, 20], + 'b': [100, 200]}, + index=[1, 2])), + ('left', False, DataFrame({'a': [20, 10, 0], + 'b': [200, 100, np.nan]}, + index=[2, 1, 0])), + ('left', True, DataFrame({'a': [0, 10, 20], + 'b': [np.nan, 100, 200]}, + index=[0, 1, 2])), + ('right', False, DataFrame({'a': [np.nan, 10, 20], + 'b': [300, 100, 200]}, + index=[3, 1, 2])), + ('right', True, DataFrame({'a': [10, 20, np.nan], + 'b': [100, 200, 300]}, + index=[1, 2, 3])), + ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3])), + ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], + 'b': [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3]))]) + def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): + + result = pd.merge(left_df, right_df, + left_index=True, + right_index=True, + how=how, + sort=sort) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 60d523a8ea539..7de2549cadfc7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -733,7 +733,8 @@ def _get_join_info(self): if self.left_index and self.right_index and self.how != 'asof': join_index, left_indexer, right_indexer = \ - left_ax.join(right_ax, how=self.how, return_indexers=True) + left_ax.join(right_ax, how=self.how, return_indexers=True, + sort=self.sort) elif self.right_index and self.how == 'left': join_index, left_indexer, right_indexer = \ _left_join_on_index(left_ax, right_ax, self.left_join_keys, diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 11d2d29597fc0..5ca45bb63bd59 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1037,7 +1037,8 @@ def union_many(self, others): this.offset = to_offset(this.inferred_freq) return this - def join(self, other, how='left', level=None, return_indexers=False): + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): """ See Index.join """ @@ -1051,7 +1052,7 @@ def join(self, other, how='left', level=None, return_indexers=False): this, other = self._maybe_utc_convert(other) return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers) + return_indexers=return_indexers, sort=sort) def _maybe_utc_convert(self, other): this = self @@ -1203,9 +1204,10 @@ def intersection(self, other): not other.offset.isAnchored() or (not self.is_monotonic or not other.is_monotonic)): result = Index.intersection(self, other) - if isinstance(result, DatetimeIndex): - if result.freq is None: - result.offset = to_offset(result.inferred_freq) + result = self._shallow_copy(result._values, name=result.name, + tz=result.tz, freq=None) + if result.freq is None: + result.offset = to_offset(result.inferred_freq) return result if len(self) == 0: @@ -1528,7 +1530,7 @@ def _get_freq(self): def _set_freq(self, value): self.offset = value freq = property(fget=_get_freq, fset=_set_freq, - doc="get/set the frequncy of the Index") + doc="get/set the frequency of the Index") year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M',