Skip to content

BUG: Index.difference of itself doesn't preserve type #20062

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,7 @@ Indexing
- Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`)
- Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`)
- Bug in ``Index`` subclasses constructors that ignore unexpected keyword arguments (:issue:`19348`)
- Bug in :meth:`Index.difference` when taking difference of an ``Index`` with itself (:issue:`20040`)


MultiIndex
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
Must be careful not to recurse.
"""
if not hasattr(values, 'dtype'):
if values is None and dtype is not None:
if (values is None or not len(values)) and dtype is not None:
values = np.empty(0, dtype=dtype)
else:
values = np.array(values, copy=False)
Expand Down Expand Up @@ -491,6 +491,8 @@ def _shallow_copy(self, values=None, **kwargs):
values = self.values
attributes = self._get_attributes_dict()
attributes.update(kwargs)
if not len(values) and 'dtype' not in kwargs:
attributes['dtype'] = self.dtype
return self._simple_new(values, **attributes)

def _shallow_copy_with_infer(self, values=None, **kwargs):
Expand All @@ -511,6 +513,8 @@ def _shallow_copy_with_infer(self, values=None, **kwargs):
attributes = self._get_attributes_dict()
attributes.update(kwargs)
attributes['copy'] = False
if not len(values) and 'dtype' not in kwargs:
attributes['dtype'] = self.dtype
if self._infer_as_myclass:
try:
return self._constructor(values, **attributes)
Expand Down Expand Up @@ -2815,7 +2819,7 @@ def difference(self, other):
self._assert_can_do_setop(other)

if self.equals(other):
return Index([], name=self.name)
return self._shallow_copy([])

other, result_name = self._convert_can_do_setop(other)

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2755,7 +2755,7 @@ def intersection(self, other):
other_tuples = other._ndarray_values
uniq_tuples = sorted(set(self_tuples) & set(other_tuples))
if len(uniq_tuples) == 0:
return MultiIndex(levels=[[]] * self.nlevels,
return MultiIndex(levels=self.levels,
labels=[[]] * self.nlevels,
names=result_names, verify_integrity=False)
else:
Expand All @@ -2777,7 +2777,7 @@ def difference(self, other):
return self

if self.equals(other):
return MultiIndex(levels=[[]] * self.nlevels,
return MultiIndex(levels=self.levels,
labels=[[]] * self.nlevels,
names=result_names, verify_integrity=False)

Expand Down
55 changes: 53 additions & 2 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pandas import (period_range, date_range, Series,
DataFrame, Float64Index, Int64Index, UInt64Index,
CategoricalIndex, DatetimeIndex, TimedeltaIndex,
PeriodIndex, isna)
PeriodIndex, RangeIndex, isna)
from pandas.core.index import _get_combined_index, _ensure_index_from_sequences
from pandas.util.testing import assert_almost_equal
from pandas.compat.numpy import np_datetime64_compat
Expand All @@ -44,7 +44,7 @@ def setup_method(self, method):
tdIndex=tm.makeTimedeltaIndex(100),
intIndex=tm.makeIntIndex(100),
uintIndex=tm.makeUIntIndex(100),
rangeIndex=tm.makeIntIndex(100),
rangeIndex=tm.makeRangeIndex(100),
floatIndex=tm.makeFloatIndex(100),
boolIndex=Index([True, False]),
catIndex=tm.makeCategoricalIndex(100),
Expand All @@ -57,6 +57,15 @@ def setup_method(self, method):
def create_index(self):
return Index(list('abcde'))

def generate_index_types(self, skip_index_keys=[]):
"""
Return a generator of the various index types, leaving
out the ones with a key in skip_index_keys
"""
for key, idx in self.indices.items():
if key not in skip_index_keys:
yield key, idx

def test_new_axis(self):
new_index = self.dateIndex[None, :]
assert new_index.ndim == 2
Expand Down Expand Up @@ -406,6 +415,27 @@ def test_constructor_dtypes_timedelta(self):
pd.TimedeltaIndex(list(values), dtype=dtype)]:
tm.assert_index_equal(res, idx)

def test_constructor_empty(self):
skip_index_keys = ["repeats", "periodIndex", "rangeIndex",
"tuples"]
for key, idx in self.generate_index_types(skip_index_keys):
empty = idx.__class__([])
assert isinstance(empty, idx.__class__)
assert not len(empty)

empty = PeriodIndex([], freq='B')
assert isinstance(empty, PeriodIndex)
assert not len(empty)

empty = RangeIndex(step=1)
assert isinstance(empty, pd.RangeIndex)
assert not len(empty)

empty = MultiIndex(levels=[[1, 2], ['blue', 'red']],
labels=[[], []])
assert isinstance(empty, MultiIndex)
assert not len(empty)

def test_view_with_args(self):

restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex',
Expand Down Expand Up @@ -1034,6 +1064,27 @@ def test_symmetric_difference(self):
assert tm.equalContents(result, expected)
assert result.name == 'new_name'

def test_difference_type(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test which tries to construct all indices as empty

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback for a test to construct all indices as empty, is this a test of the public API's (which have to differ based on the specific index class), or a test of the private Index._shallow_copy([]) ?

For example, pd.RangeIndex([]) currently fails with TypeError: RangeIndex(...) must be called with integers, list was passed for start, which is correct.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right you would skip that one. I think there is already a test of construct empty (pretty sure), but make just point it out and make it IS testing everything.

# GH 20040
# If taking difference of a set and itself, it
# needs to preserve the type of the index
skip_index_keys = ['repeats']
for key, idx in self.generate_index_types(skip_index_keys):
result = idx.difference(idx)
expected = idx.drop(idx)
tm.assert_index_equal(result, expected)

def test_intersection_difference(self):
# GH 20040
# Test that the intersection of an index with an
# empty index produces the same index as the difference
# of an index with itself. Test for all types
skip_index_keys = ['repeats']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for followup PR (or can change here if want to update where needed). a function to return yeld the key, idx while filtering on certain types would be great. (like what you are doing but in a module level function)

for key, idx in self.generate_index_types(skip_index_keys):
inter = idx.intersection(idx.drop(idx))
diff = idx.difference(idx)
tm.assert_index_equal(inter, diff)

def test_is_numeric(self):
assert not self.dateIndex.is_numeric()
assert not self.strIndex.is_numeric()
Expand Down