From b8fe2e8baf9f321e8f66554077b189366c26d275 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Sun, 15 Oct 2017 01:33:13 +0530 Subject: [PATCH 1/9] ENH: sort=bool keyword argument for index.difference #17839 ENH: sort=bool keyword argument for index.difference #17839 --- pandas/core/indexes/base.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c3343f149005c..35eef34658ae6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2326,7 +2326,7 @@ def intersection(self, other): taken.name = None return taken - def difference(self, other): + def difference(self, other, sort=True): """ Return a new Index with elements from the index that are not in `other`. @@ -2366,14 +2366,15 @@ def difference(self, other): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass return this._shallow_copy(the_diff, name=result_name, freq=None) - def symmetric_difference(self, other, result_name=None): + def symmetric_difference(self, other, result_name=None, sort=True): """ Compute the symmetric difference of two Index objects. It's sorted if sorting is possible. @@ -2426,10 +2427,11 @@ def symmetric_difference(self, other, result_name=None): right_diff = other.values.take(right_indexer) the_diff = _concat._concat_compat([left_diff, right_diff]) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass attribs = self._get_attributes_dict() attribs['name'] = result_name @@ -4220,3 +4222,4 @@ def _trim_front(strings): def _validate_join_method(method): if method not in ['left', 'right', 'inner', 'outer']: raise ValueError('do not recognize join method %s' % method) + From 5e1ee8b3459b3d22e44f7ce48b854aea5d1e4c25 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Sun, 15 Oct 2017 02:36:16 +0530 Subject: [PATCH 2/9] ENH: sort=bool keyword argument for index.difference #17839 union --- pandas/core/indexes/base.py | 61 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c3343f149005c..c9b216cfabe74 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2183,7 +2183,7 @@ def _get_consensus_name(self, other): return self._shallow_copy(name=name) return self - def union(self, other): + def union(self, other, sort=True): """ Form the union of two Index objects and sorts if possible. @@ -2241,27 +2241,29 @@ def union(self, other): allow_fill=False) result = _concat._concat_compat((self._values, other_diff)) - try: - self._values[0] < other_diff[0] - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) - else: - types = frozenset((self.inferred_type, - other.inferred_type)) - if not types & _unsortable_types: - result.sort() + if sort: + try: + self._values[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) + else: + types = frozenset((self.inferred_type, + other.inferred_type)) + if not types & _unsortable_types: + result.sort() else: result = self._values - try: - result = np.sort(result) - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) + if sort: + try: + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) # for subclasses return self._wrap_union_result(other, result) @@ -2326,7 +2328,7 @@ def intersection(self, other): taken.name = None return taken - def difference(self, other): + def difference(self, other, sort=True): """ Return a new Index with elements from the index that are not in `other`. @@ -2366,14 +2368,15 @@ def difference(self, other): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass return this._shallow_copy(the_diff, name=result_name, freq=None) - def symmetric_difference(self, other, result_name=None): + def symmetric_difference(self, other, result_name=None, sort=True): """ Compute the symmetric difference of two Index objects. It's sorted if sorting is possible. @@ -2426,10 +2429,11 @@ def symmetric_difference(self, other, result_name=None): right_diff = other.values.take(right_indexer) the_diff = _concat._concat_compat([left_diff, right_diff]) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass attribs = self._get_attributes_dict() attribs['name'] = result_name @@ -4220,3 +4224,4 @@ def _trim_front(strings): def _validate_join_method(method): if method not in ['left', 'right', 'inner', 'outer']: raise ValueError('do not recognize join method %s' % method) + From f831506cd180cbd2d65c84e461f18491c93a5baa Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Sun, 15 Oct 2017 11:12:37 +0530 Subject: [PATCH 3/9] test difference sorting false --- pandas/tests/indexes/test_base.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 307cda7f2d1cb..300969d0d9a37 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -896,6 +896,33 @@ def test_difference(self): assert len(result) == 0 assert result.name == first.name + def test_difference_sorting_false(self): + + first = pd.Index([2, 0, 4, 3]) + second = pd.Index([2, 1]) + answer = pd.Index([0, 4, 3]) + first.name = 'name' + # different names + result = first.difference(second, sort=False) + + assert tm.equalContents(result, answer) + assert result.name is None + + # same names + second.name = 'name' + result = first.difference(second, sort=False) + assert result.name == 'name' + + # with empty + result = first.difference([], sort=False) + assert tm.equalContents(result, first) + assert result.name == first.name + + # with everything + result = first.difference(first, sort=False) + assert len(result) == 0 + assert result.name == first.name + def test_symmetric_difference(self): # smoke idx1 = Index([1, 2, 3, 4], name='idx1') @@ -2174,3 +2201,4 @@ class TestIndexUtils(object): def test_ensure_index_from_sequences(self, data, names, expected): result = _ensure_index_from_sequences(data, names) tm.assert_index_equal(result, expected) + From f11a40ec27be2ce53917c4459bc140ecc77843d0 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Sun, 15 Oct 2017 14:22:26 +0530 Subject: [PATCH 4/9] Lint --- pandas/tests/indexes/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 300969d0d9a37..f426717b34d15 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2201,4 +2201,3 @@ class TestIndexUtils(object): def test_ensure_index_from_sequences(self, data, names, expected): result = _ensure_index_from_sequences(data, names) tm.assert_index_equal(result, expected) - From a8d6259333a815ea3db95bc436ba5cfb14ac2db1 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Sun, 15 Oct 2017 14:27:16 +0530 Subject: [PATCH 5/9] Lint --- pandas/core/indexes/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ed01776eaa790..805cd213f0203 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2246,7 +2246,8 @@ def union(self, other, sort=True): self._values[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, + "incomparable objects" % e, + RuntimeWarning, stacklevel=3) else: types = frozenset((self.inferred_type, @@ -2262,7 +2263,8 @@ def union(self, other, sort=True): result = np.sort(result) except TypeError as e: warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, + "incomparable objects" % e, + RuntimeWarning, stacklevel=3) # for subclasses @@ -4243,4 +4245,3 @@ def _trim_front(strings): def _validate_join_method(method): if method not in ['left', 'right', 'inner', 'outer']: raise ValueError('do not recognize join method %s' % method) - From d749e8d26463921aed6d3ea8afc7eac83076b5c0 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Wed, 18 Oct 2017 15:29:55 +0530 Subject: [PATCH 6/9] union - testcase --- pandas/tests/indexes/test_base.py | 97 +++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f426717b34d15..d367fe10429a6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -766,6 +766,102 @@ def test_union(self): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) + def test_union_sorting_false(self): + + # GH 17839 + dt_index = pd.DatetimeIndex(["20/12/2012", "15/05/2015", "1/1/2011", "1/7/2017"]) + first = pd.Index([2, 0, 4, 3]) + second = pd.Index([1, 4]) + everything = pd.Index([2, 0, 4, 3, 1]) + union = first.union(second, sort=False) + assert tm.equalContents(union, everything) + + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case, sort=False) + assert tm.equalContents(result, everything) + + # Corner cases + union = first.union(first, sort=False) + assert union is first + + union = first.union([], sort=False) + assert union is first + + union = Index([]).union(first, sort=False) + assert union is first + + # preserve names + first = Index(list('ba'), name='A') + second = Index(list('ba'), name='B') + union = first.union(second, sort=False) + expected = Index(list('ba'), name=None) + tm.assert_index_equal(union, expected) + + first = Index(list('ba'), name='A') + second = Index([], name='B') + union = first.union(second, sort=False) + expected = Index(list('ba'), name=None) + tm.assert_index_equal(union, expected) + + first = Index([], name='A') + second = Index(list('ba'), name='B') + union = first.union(second, sort=False) + expected = Index(list('ba'), name=None) + tm.assert_index_equal(union, expected) + + first = Index(list('ba')) + second = Index(list('ba'), name='B') + union = first.union(second, sort=False) + expected = Index(list('ba'), name='B') + tm.assert_index_equal(union, expected) + + first = Index([]) + second = Index(list('ba'), name='B') + union = first.union(second, sort=False) + expected = Index(list('ba'), name='B') + tm.assert_index_equal(union, expected) + + first = Index(list('ba')) + second = Index([], name='B') + union = first.union(second, sort=False) + expected = Index(list('ba'), name='B') + tm.assert_index_equal(union, expected) + + first = Index(list('ba'), name='A') + second = Index(list('ba')) + union = first.union(second, sort=False) + expected = Index(list('ba'), name='A') + tm.assert_index_equal(union, expected) + + first = Index(list('ba'), name='A') + second = Index([]) + union = first.union(second, sort=False) + expected = Index(list('ba'), name='A') + tm.assert_index_equal(union, expected) + + first = Index([], name='A') + second = Index(list('ba')) + union = first.union(second, sort=False) + expected = Index(list('ba'), name='A') + tm.assert_index_equal(union, expected) + + first = pd.Index([2, 0, 4, 3]) + with tm.assert_produces_warning(RuntimeWarning): + firstCat = first.union(dt_index, sort=False) + secondCat = first.union(first, sort=False) + + if dt_index.dtype == np.object_: + appended = np.append(first, dt_index) + else: + appended = np.append(first, dt_index.astype('O')) + + assert tm.equalContents(firstCat, appended) + assert tm.equalContents(secondCat, first) + tm.assert_contains_all(first, firstCat) + tm.assert_contains_all(first, secondCat) + tm.assert_contains_all(dt_index, firstCat) + def test_add(self): idx = self.strIndex expected = Index(self.strIndex.values * 2) @@ -898,6 +994,7 @@ def test_difference(self): def test_difference_sorting_false(self): + # GH 17839 first = pd.Index([2, 0, 4, 3]) second = pd.Index([2, 1]) answer = pd.Index([0, 4, 3]) From 193b7dd648febf0f1681533684ee0db45bd6a579 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Wed, 18 Oct 2017 15:48:26 +0530 Subject: [PATCH 7/9] symmetric difference sort test case --- pandas/tests/indexes/test_base.py | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d367fe10429a6..6dbe058685c08 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1068,6 +1068,54 @@ def test_symmetric_difference(self): assert tm.equalContents(result, expected) assert result.name == 'new_name' + def test_symmetric_difference_sorting_false(self): + + # GH 17839 + # smoke + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = Index([2, 3, 4, 5]) + result = idx1.symmetric_difference(idx2, sort=False) + expected = Index([1, 5]) + assert tm.equalContents(result, expected) + assert result.name is None + + # __xor__ syntax + expected = idx1 ^ idx2 + assert tm.equalContents(result, expected) + assert result.name is None + + # multiIndex + idx1 = MultiIndex.from_tuples(self.tuples) + idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) + result = idx1.symmetric_difference(idx2, sort=False) + expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) + assert tm.equalContents(result, expected) + + # nans: + idx1 = Index([1, np.nan, 2, 3]) + idx2 = Index([0, 1, np.nan]) + idx3 = Index([0, 1]) + + result = idx1.symmetric_difference(idx2, sort=False) + expected = Index([2.0, 3.0, 0.0]) + tm.assert_index_equal(result, expected) + + result = idx1.symmetric_difference(idx3, sort=False) + expected = Index([np.nan, 2.0, 3.0, 0.0]) + tm.assert_index_equal(result, expected) + + # other not an Index: + idx1 = Index([5, 3, 4, 2], name='idx1') + idx2 = np.array([3, 2, 4, 1]) + expected = Index([5, 1]) + result = idx1.symmetric_difference(idx2, sort=False) + assert tm.equalContents(result, expected) + assert result.name == 'idx1' + + result = idx1.symmetric_difference(idx2, result_name='new_name', sort=False) + assert tm.equalContents(result, expected) + assert result.name == 'new_name' + def test_is_numeric(self): assert not self.dateIndex.is_numeric() assert not self.strIndex.is_numeric() From 17767bfdf28ebe7e541205ac34dd83ab2c7d3717 Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Wed, 18 Oct 2017 15:50:03 +0530 Subject: [PATCH 8/9] lint --- pandas/tests/indexes/test_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 6dbe058685c08..804ba911629e5 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -769,7 +769,8 @@ def test_union(self): def test_union_sorting_false(self): # GH 17839 - dt_index = pd.DatetimeIndex(["20/12/2012", "15/05/2015", "1/1/2011", "1/7/2017"]) + dt_index = pd.DatetimeIndex(["20/12/2012", "15/05/2015", + "1/1/2011", "1/7/2017"]) first = pd.Index([2, 0, 4, 3]) second = pd.Index([1, 4]) everything = pd.Index([2, 0, 4, 3, 1]) From 5039133798455294e48184f25410f6d17f30743d Mon Sep 17 00:00:00 2001 From: Giftlin Rajaiah Date: Wed, 18 Oct 2017 15:53:16 +0530 Subject: [PATCH 9/9] lint --- pandas/tests/indexes/test_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 804ba911629e5..5835aee9a68b7 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1113,7 +1113,8 @@ def test_symmetric_difference_sorting_false(self): assert tm.equalContents(result, expected) assert result.name == 'idx1' - result = idx1.symmetric_difference(idx2, result_name='new_name', sort=False) + result = idx1.symmetric_difference(idx2, result_name='new_name', + sort=False) assert tm.equalContents(result, expected) assert result.name == 'new_name'