-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix index order for Index.intersection() #15583
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c12bb3f
c2a8dc3
a4ead99
e7bcd28
d9e29f8
1197b99
784fe75
b977278
ec836bd
047b513
c96306d
f0d9d03
ef2581e
3c200fe
33eb740
654288b
5bf1508
9e39794
968c7f1
8d2e9cc
73df69e
64e86a4
2d4e143
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2089,8 +2089,8 @@ def intersection(self, other): | |
""" | ||
Form the intersection of two Index objects. | ||
|
||
This returns a new Index with elements common to the index and `other`. | ||
Sortedness of the result is not guaranteed. | ||
This returns a new Index with elements common to the index and `other`, | ||
preserving the order of the calling index. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -2128,15 +2128,15 @@ def intersection(self, other): | |
pass | ||
|
||
try: | ||
indexer = Index(self._values).get_indexer(other._values) | ||
indexer = Index(other._values).get_indexer(self._values) | ||
indexer = indexer.take((indexer != -1).nonzero()[0]) | ||
except: | ||
# duplicates | ||
indexer = Index(self._values).get_indexer_non_unique( | ||
other._values)[0].unique() | ||
indexer = Index(other._values).get_indexer_non_unique( | ||
self._values)[0].unique() | ||
indexer = indexer[indexer != -1] | ||
|
||
taken = self.take(indexer) | ||
taken = other.take(indexer) | ||
if self.name != other.name: | ||
taken.name = None | ||
return taken | ||
|
@@ -2831,8 +2831,7 @@ def _reindex_non_unique(self, target): | |
new_index = self._shallow_copy_with_infer(new_labels, freq=None) | ||
return new_index, indexer, new_indexer | ||
|
||
def join(self, other, how='left', level=None, return_indexers=False): | ||
""" | ||
_index_shared_docs['join'] = """ | ||
*this is an internal non-public method* | ||
|
||
Compute join_index and indexers to conform data | ||
|
@@ -2844,11 +2843,20 @@ def join(self, other, how='left', level=None, return_indexers=False): | |
how : {'left', 'right', 'inner', 'outer'} | ||
level : int or level name, default None | ||
return_indexers : boolean, default False | ||
sort : boolean, default False | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a versionadded tag |
||
Sort the join keys lexicographically in the result Index. If False, | ||
the order of the join keys depends on the join type (how keyword) | ||
|
||
.. versionadded:: 0.20.0 | ||
|
||
Returns | ||
------- | ||
join_index, (left_indexer, right_indexer) | ||
""" | ||
|
||
@Appender(_index_shared_docs['join']) | ||
def join(self, other, how='left', level=None, return_indexers=False, | ||
sort=False): | ||
from .multi import MultiIndex | ||
self_is_mi = isinstance(self, MultiIndex) | ||
other_is_mi = isinstance(other, MultiIndex) | ||
|
@@ -2929,6 +2937,9 @@ def join(self, other, how='left', level=None, return_indexers=False): | |
elif how == 'outer': | ||
join_index = self.union(other) | ||
|
||
if sort: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should only be on an outer join right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I admit there is an ambiguity here. By default ( If we set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback Is this the expected result? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure. can you show a small example with this both turned on and off (for each of the how's). Sorry this is a bit of work, but this is a bit of unexplored API area There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In file There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this doesn't answer my question though, isn't sort ignored for all but inner join? If so then I would set it False (to make the code clear for the others). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, sort=True is only ignored for outer join, because the outer join always sorts the result index. For the other types of join (left, right, inner), the default result is unsorted, as it preserves the left index order (for left and inner) or the right index order (for right), which might not be previously sorted. By setting sort=True, we force the resulting index order to be sorted, even for the cases of left, right and inner joins. What do you mean by I would set it False? The parameter sort is set False by default. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @albertvillanova what I mean is that reading the code it is very confusing and not obvious how What I mean is I would explicity set
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback There is not really an interaction between
very clear. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche Great explanation. |
||
join_index = join_index.sort_values() | ||
|
||
if return_indexers: | ||
if join_index is self: | ||
lindexer = None | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import pytest | ||
import numpy as np | ||
|
||
from pandas import DataFrame, Index | ||
from pandas.tests.frame.common import TestData | ||
import pandas.util.testing as tm | ||
|
||
|
||
@pytest.fixture | ||
def frame(): | ||
return TestData().frame | ||
|
||
|
||
@pytest.fixture | ||
def left(): | ||
return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) | ||
|
||
|
||
@pytest.fixture | ||
def right(): | ||
return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"how, sort, expected", | ||
[('inner', False, DataFrame({'a': [20, 10], | ||
'b': [200, 100]}, | ||
index=[2, 1])), | ||
('inner', True, DataFrame({'a': [10, 20], | ||
'b': [100, 200]}, | ||
index=[1, 2])), | ||
('left', False, DataFrame({'a': [20, 10, 0], | ||
'b': [200, 100, np.nan]}, | ||
index=[2, 1, 0])), | ||
('left', True, DataFrame({'a': [0, 10, 20], | ||
'b': [np.nan, 100, 200]}, | ||
index=[0, 1, 2])), | ||
('right', False, DataFrame({'a': [np.nan, 10, 20], | ||
'b': [300, 100, 200]}, | ||
index=[3, 1, 2])), | ||
('right', True, DataFrame({'a': [10, 20, np.nan], | ||
'b': [100, 200, 300]}, | ||
index=[1, 2, 3])), | ||
('outer', False, DataFrame({'a': [0, 10, 20, np.nan], | ||
'b': [np.nan, 100, 200, 300]}, | ||
index=[0, 1, 2, 3])), | ||
('outer', True, DataFrame({'a': [0, 10, 20, np.nan], | ||
'b': [np.nan, 100, 200, 300]}, | ||
index=[0, 1, 2, 3]))]) | ||
def test_join(left, right, how, sort, expected): | ||
|
||
result = left.join(right, how=how, sort=sort) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_join_index(frame): | ||
# left / right | ||
|
||
f = frame.loc[frame.index[:10], ['A', 'B']] | ||
f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1] | ||
|
||
joined = f.join(f2) | ||
tm.assert_index_equal(f.index, joined.index) | ||
expected_columns = Index(['A', 'B', 'C', 'D']) | ||
tm.assert_index_equal(joined.columns, expected_columns) | ||
|
||
joined = f.join(f2, how='left') | ||
tm.assert_index_equal(joined.index, f.index) | ||
tm.assert_index_equal(joined.columns, expected_columns) | ||
|
||
joined = f.join(f2, how='right') | ||
tm.assert_index_equal(joined.index, f2.index) | ||
tm.assert_index_equal(joined.columns, expected_columns) | ||
|
||
# inner | ||
|
||
joined = f.join(f2, how='inner') | ||
tm.assert_index_equal(joined.index, f.index[5:10]) | ||
tm.assert_index_equal(joined.columns, expected_columns) | ||
|
||
# outer | ||
|
||
joined = f.join(f2, how='outer') | ||
tm.assert_index_equal(joined.index, frame.index.sort_values()) | ||
tm.assert_index_equal(joined.columns, expected_columns) | ||
|
||
tm.assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') | ||
|
||
# corner case - overlapping columns | ||
for how in ('outer', 'left', 'inner'): | ||
with tm.assertRaisesRegexp(ValueError, 'columns overlap but ' | ||
'no suffix'): | ||
frame.join(frame, how=how) | ||
|
||
|
||
def test_join_index_more(frame): | ||
af = frame.loc[:, ['A', 'B']] | ||
bf = frame.loc[::2, ['C', 'D']] | ||
|
||
expected = af.copy() | ||
expected['C'] = frame['C'][::2] | ||
expected['D'] = frame['D'][::2] | ||
|
||
result = af.join(bf) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
result = af.join(bf, how='right') | ||
tm.assert_frame_equal(result, expected[::2]) | ||
|
||
result = bf.join(af, how='right') | ||
tm.assert_frame_equal(result, expected.loc[:, result.columns]) | ||
|
||
|
||
def test_join_index_series(frame): | ||
df = frame.copy() | ||
s = df.pop(frame.columns[-1]) | ||
joined = df.join(s) | ||
|
||
# TODO should this check_names ? | ||
tm.assert_frame_equal(joined, frame, check_names=False) | ||
|
||
s.name = None | ||
tm.assertRaisesRegexp(ValueError, 'must have a name', df.join, s) | ||
|
||
|
||
def test_join_overlap(frame): | ||
df1 = frame.loc[:, ['A', 'B', 'C']] | ||
df2 = frame.loc[:, ['B', 'C', 'D']] | ||
|
||
joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') | ||
df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') | ||
df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') | ||
|
||
no_overlap = frame.loc[:, ['A', 'D']] | ||
expected = df1_suf.join(df2_suf).join(no_overlap) | ||
|
||
# column order not necessarily sorted | ||
tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you name these left and right