Skip to content

ENH: sort=bool keyword argument for index.difference #17878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
62 changes: 34 additions & 28 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2183,7 +2183,7 @@ def _get_consensus_name(self, other):
return self._shallow_copy(name=name)
return self

def union(self, other):
def union(self, other, sort=True):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You also need to implement this in the subclasses of Index that define their own `union`, e.g. DatetimeIndex.

[pandas] find ./pandas -type f | grep .py$ |  xargs grep -n "def union"                                 17:41:36  ☁  fix-drop_duplicates-when-duplicate-column-names ☂ ✭
./pandas/core/common.py:353:def union(*seqs):
./pandas/core/dtypes/concat.py:214:def union_categoricals(to_union, sort_categories=False, ignore_order=False):
./pandas/core/indexes/base.py:2186:    def union(self, other):
./pandas/core/indexes/datetimes.py:983:    def union(self, other):
./pandas/core/indexes/datetimes.py:1036:    def union_many(self, others):
./pandas/core/indexes/multi.py:2584:    def union(self, other):
./pandas/core/indexes/range.py:416:    def union(self, other):
./pandas/core/indexes/timedeltas.py:496:    def union(self, other):

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Giftlin: I don't think you have addressed this comment yet.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah.. will do it 👍

"""
Form the union of two Index objects and sorts if possible.

Expand Down Expand Up @@ -2241,27 +2241,31 @@ def union(self, other):
allow_fill=False)
result = _concat._concat_compat((self._values, other_diff))

try:
self._values[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
stacklevel=3)
else:
types = frozenset((self.inferred_type,
other.inferred_type))
if not types & _unsortable_types:
result.sort()
if sort:
try:
self._values[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e,
RuntimeWarning,
stacklevel=3)
else:
types = frozenset((self.inferred_type,
other.inferred_type))
if not types & _unsortable_types:
result.sort()

else:
result = self._values

try:
result = np.sort(result)
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
stacklevel=3)
if sort:
try:
result = np.sort(result)
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e,
RuntimeWarning,
stacklevel=3)

# for subclasses
return self._wrap_union_result(other, result)
Expand Down Expand Up @@ -2326,7 +2330,7 @@ def intersection(self, other):
taken.name = None
return taken

def difference(self, other):
def difference(self, other, sort=True):
"""
Return a new Index with elements from the index that are not in
`other`.
Expand Down Expand Up @@ -2366,14 +2370,15 @@ def difference(self, other):
label_diff = np.setdiff1d(np.arange(this.size), indexer,
assume_unique=True)
the_diff = this.values.take(label_diff)
try:
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass
if sort:
try:
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass

return this._shallow_copy(the_diff, name=result_name, freq=None)

def symmetric_difference(self, other, result_name=None):
def symmetric_difference(self, other, result_name=None, sort=True):
"""
Compute the symmetric difference of two Index objects.
It's sorted if sorting is possible.
Expand Down Expand Up @@ -2426,10 +2431,11 @@ def symmetric_difference(self, other, result_name=None):
right_diff = other.values.take(right_indexer)

the_diff = _concat._concat_compat([left_diff, right_diff])
try:
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass
if sort:
try:
the_diff = sorting.safe_sort(the_diff)
except TypeError:
pass

attribs = self._get_attributes_dict()
attribs['name'] = result_name
Expand Down
174 changes: 174 additions & 0 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,103 @@ def test_union(self):
tm.assert_contains_all(self.strIndex, secondCat)
tm.assert_contains_all(self.dateIndex, firstCat)

def test_union_sorting_false(self):

# GH 17839
dt_index = pd.DatetimeIndex(["20/12/2012", "15/05/2015",
"1/1/2011", "1/7/2017"])
first = pd.Index([2, 0, 4, 3])
second = pd.Index([1, 4])
everything = pd.Index([2, 0, 4, 3, 1])
union = first.union(second, sort=False)
assert tm.equalContents(union, everything)

cases = [klass(second.values) for klass in [np.array, Series, list]]
for case in cases:
result = first.union(case, sort=False)
assert tm.equalContents(result, everything)

# Corner cases
union = first.union(first, sort=False)
assert union is first

union = first.union([], sort=False)
assert union is first

union = Index([]).union(first, sort=False)
assert union is first

# preserve names
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please don't repeat code like this; use parametrize instead.

first = Index(list('ba'), name='A')
second = Index(list('ba'), name='B')
union = first.union(second, sort=False)
expected = Index(list('ba'), name=None)
tm.assert_index_equal(union, expected)

first = Index(list('ba'), name='A')
second = Index([], name='B')
union = first.union(second, sort=False)
expected = Index(list('ba'), name=None)
tm.assert_index_equal(union, expected)

first = Index([], name='A')
second = Index(list('ba'), name='B')
union = first.union(second, sort=False)
expected = Index(list('ba'), name=None)
tm.assert_index_equal(union, expected)

first = Index(list('ba'))
second = Index(list('ba'), name='B')
union = first.union(second, sort=False)
expected = Index(list('ba'), name='B')
tm.assert_index_equal(union, expected)

first = Index([])
second = Index(list('ba'), name='B')
union = first.union(second, sort=False)
expected = Index(list('ba'), name='B')
tm.assert_index_equal(union, expected)

first = Index(list('ba'))
second = Index([], name='B')
union = first.union(second, sort=False)
expected = Index(list('ba'), name='B')
tm.assert_index_equal(union, expected)

first = Index(list('ba'), name='A')
second = Index(list('ba'))
union = first.union(second, sort=False)
expected = Index(list('ba'), name='A')
tm.assert_index_equal(union, expected)

first = Index(list('ba'), name='A')
second = Index([])
union = first.union(second, sort=False)
expected = Index(list('ba'), name='A')
tm.assert_index_equal(union, expected)

first = Index([], name='A')
second = Index(list('ba'))
union = first.union(second, sort=False)
expected = Index(list('ba'), name='A')
tm.assert_index_equal(union, expected)

first = pd.Index([2, 0, 4, 3])
with tm.assert_produces_warning(RuntimeWarning):
firstCat = first.union(dt_index, sort=False)
secondCat = first.union(first, sort=False)

if dt_index.dtype == np.object_:
appended = np.append(first, dt_index)
else:
appended = np.append(first, dt_index.astype('O'))

assert tm.equalContents(firstCat, appended)
assert tm.equalContents(secondCat, first)
tm.assert_contains_all(first, firstCat)
tm.assert_contains_all(first, secondCat)
tm.assert_contains_all(dt_index, firstCat)

def test_add(self):
idx = self.strIndex
expected = Index(self.strIndex.values * 2)
Expand Down Expand Up @@ -896,6 +993,34 @@ def test_difference(self):
assert len(result) == 0
assert result.name == first.name

def test_difference_sorting_false(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Reference issue number
  • Will need test cases that cause your code to issue the warnings you added

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gfyoung any changes required?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


# GH 17839
first = pd.Index([2, 0, 4, 3])
second = pd.Index([2, 1])
answer = pd.Index([0, 4, 3])
first.name = 'name'
# different names
result = first.difference(second, sort=False)

assert tm.equalContents(result, answer)
assert result.name is None

# same names
second.name = 'name'
result = first.difference(second, sort=False)
assert result.name == 'name'

# with empty
result = first.difference([], sort=False)
assert tm.equalContents(result, first)
assert result.name == first.name

# with everything
result = first.difference(first, sort=False)
assert len(result) == 0
assert result.name == first.name

def test_symmetric_difference(self):
# smoke
idx1 = Index([1, 2, 3, 4], name='idx1')
Expand Down Expand Up @@ -944,6 +1069,55 @@ def test_symmetric_difference(self):
assert tm.equalContents(result, expected)
assert result.name == 'new_name'

def test_symmetric_difference_sorting_false(self):
# Exercises Index.symmetric_difference(..., sort=False) against a plain
# Index, a MultiIndex, indexes containing NaN, and a non-Index `other`.

# GH 17839
# smoke
idx1 = Index([1, 2, 3, 4], name='idx1')
idx2 = Index([2, 3, 4, 5])
result = idx1.symmetric_difference(idx2, sort=False)
expected = Index([1, 5])
assert tm.equalContents(result, expected)
# idx1 is named and idx2 is not; the result is expected to carry no name.
assert result.name is None

# __xor__ syntax
# NOTE(review): `result` is still the sort=False result from above; only
# `expected` is recomputed here, via the ^ operator.
expected = idx1 ^ idx2
assert tm.equalContents(result, expected)
assert result.name is None

# multiIndex
# self.tuples is a fixture defined outside this method.
idx1 = MultiIndex.from_tuples(self.tuples)
idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)])
result = idx1.symmetric_difference(idx2, sort=False)
expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)])
assert tm.equalContents(result, expected)

# nans:
idx1 = Index([1, np.nan, 2, 3])
idx2 = Index([0, 1, np.nan])
idx3 = Index([0, 1])

# These two cases use assert_index_equal instead of equalContents, so the
# exact `expected` index is pinned -- presumably including element order
# (left-only elements first, then right-only); confirm against tm.
result = idx1.symmetric_difference(idx2, sort=False)
expected = Index([2.0, 3.0, 0.0])
tm.assert_index_equal(result, expected)

result = idx1.symmetric_difference(idx3, sort=False)
expected = Index([np.nan, 2.0, 3.0, 0.0])
tm.assert_index_equal(result, expected)

# other not an Index:
# A NumPy array is passed as `other`; the result is expected to keep
# idx1's name.
idx1 = Index([5, 3, 4, 2], name='idx1')
idx2 = np.array([3, 2, 4, 1])
expected = Index([5, 1])
result = idx1.symmetric_difference(idx2, sort=False)
assert tm.equalContents(result, expected)
assert result.name == 'idx1'

# An explicit result_name is expected to override the inherited name.
result = idx1.symmetric_difference(idx2, result_name='new_name',
sort=False)
assert tm.equalContents(result, expected)
assert result.name == 'new_name'

def test_is_numeric(self):
assert not self.dateIndex.is_numeric()
assert not self.strIndex.is_numeric()
Expand Down