BUG: drop_duplicates drops name(s). #10367

Merged: 1 commit, Jun 23, 2015
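
For context, a minimal standalone sketch (not part of the PR, written against the current pandas API) of the behavior being fixed: a named Index lost its name after drop_duplicates.

import pandas as pd

# A named Index containing duplicates.
idx = pd.Index(['a', 'a', 'b', 'b', 'c'], name='letters')

result = idx.drop_duplicates()
print(result)
# Before this fix: Index(['a', 'b', 'c'], dtype='object')                  (name lost)
# After this fix:  Index(['a', 'b', 'c'], dtype='object', name='letters')  (name kept)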
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.17.0.txt
@@ -63,7 +63,7 @@ Bug Fixes
~~~~~~~~~
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)


- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
- Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
- Bug in ``DataFrame.reset_index`` when index contains `NaT`. (:issue:`10388`)
- Bug in ``ExcelReader`` when worksheet is empty (:issue:`6403`)
4 changes: 1 addition & 3 deletions pandas/core/index.py
@@ -2573,14 +2573,12 @@ def drop(self, labels, errors='raise'):

@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
def drop_duplicates(self, take_last=False):
result = super(Index, self).drop_duplicates(take_last=take_last)
return self._constructor(result)
return super(Index, self).drop_duplicates(take_last=take_last)

@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, take_last=False):
return super(Index, self).duplicated(take_last=take_last)


def _evaluate_with_timedelta_like(self, other, op, opstr):
raise TypeError("can only perform ops with timedelta like values")

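Why the one-line change is enough: the shared base implementation (in pandas/core/base.py at the time) builds a boolean mask of duplicates and slices the original object with it, so name, and freq for datetime-like indexes, ride along; the removed wrapper rebuilt the result through the constructor, which is what discarded them. A standalone sketch of that difference, not part of the PR and written against current pandas:

import pandas as pd

idx = pd.Index(['a', 'a', 'b'], name='letters')

# Roughly what the inherited drop_duplicates does: mask out the later
# duplicates and slice the original object, which keeps its metadata.
deduped = idx[~idx.duplicated()]
print(deduped.name)                    # 'letters'

# Rebuilding the result from bare values, as the removed wrapper
# effectively did, yields an Index with no name to inherit.
rewrapped = pd.Index(deduped.to_numpy())
print(rewrapped.name)                  # None
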
27 changes: 26 additions & 1 deletion pandas/tests/test_index.py
@@ -207,10 +207,19 @@ def test_duplicates(self):

if not len(ind):
continue
if isinstance(ind, MultiIndex):
continue
idx = self._holder([ind[0]]*5)
self.assertFalse(idx.is_unique)
self.assertTrue(idx.has_duplicates)

# GH 10115
# preserve names
idx.name = 'foo'
result = idx.drop_duplicates()
self.assertEqual(result.name, 'foo')
self.assert_index_equal(result, Index([ind[0]],name='foo'))

def test_sort(self):
for ind in self.indices.values():
self.assertRaises(TypeError, ind.sort)
@@ -1830,10 +1839,13 @@ def test_reindexing(self):

def test_duplicates(self):

idx = CategoricalIndex([0, 0, 0])
idx = CategoricalIndex([0, 0, 0], name='foo')
self.assertFalse(idx.is_unique)
self.assertTrue(idx.has_duplicates)

expected = CategoricalIndex([0], name='foo')
self.assert_index_equal(idx.drop_duplicates(), expected)

def test_get_indexer(self):

idx1 = CategoricalIndex(list('aabcde'),categories=list('edabc'))
@@ -4603,6 +4615,19 @@ def check(nlevels, with_nulls):
self.assert_array_equal(mi.duplicated(),
np.zeros(len(mi), dtype='bool'))

def test_duplicate_meta_data(self):
# GH 10115
index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
labels=[[0, 0, 0, 0, 1, 1, 1],
[0, 1, 2, 0, 0, 1, 2]])
for idx in [index,
index.set_names([None, None]),
index.set_names([None, 'Num']),
index.set_names(['Upper','Num']),
]:
self.assertTrue(idx.has_duplicates)
self.assertEqual(idx.drop_duplicates().names, idx.names)

def test_tolist(self):
result = self.index.tolist()
exp = list(self.index.values)
40 changes: 40 additions & 0 deletions pandas/tseries/tests/test_base.py
@@ -330,6 +330,20 @@ def test_getitem(self):
self.assert_index_equal(result, expected)
self.assertEqual(result.freq, expected.freq)

def test_drop_duplicates_metadata(self):
#GH 10115
idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
result = idx.drop_duplicates()
self.assert_index_equal(idx, result)
self.assertEqual(idx.freq, result.freq)

idx_dup = idx.append(idx)
self.assertIsNone(idx_dup.freq) # freq is reset
result = idx_dup.drop_duplicates()
self.assert_index_equal(idx, result)
self.assertIsNone(result.freq)


class TestTimedeltaIndexOps(Ops):

def setUp(self):
Expand Down Expand Up @@ -802,6 +816,20 @@ def test_getitem(self):
self.assert_index_equal(result, expected)
self.assertEqual(result.freq, expected.freq)

def test_drop_duplicates_metadata(self):
#GH 10115
idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
result = idx.drop_duplicates()
self.assert_index_equal(idx, result)
self.assertEqual(idx.freq, result.freq)

idx_dup = idx.append(idx)
self.assertIsNone(idx_dup.freq) # freq is reset
result = idx_dup.drop_duplicates()
self.assert_index_equal(idx, result)
self.assertIsNone(result.freq)
Contributor:

hmm, I think these tests belong in test_index.py (there is, of course, some duplication, pardon the pun, in where the tests for these sub-classes live).

Member Author:

I intended these tests to cover freq rather than name, since name is already tested in test_index. Maybe assert_index_equal should check all metadata to simplify the test (I tried that once and found that some tests / logic would have to be fixed).

Contributor:

oh, that's fine then.
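
A condensed, standalone restatement of what the DatetimeIndex test above asserts (not part of the PR; assumes current pandas):

import pandas as pd

idx = pd.date_range('2011-01-01', periods=5, freq='D', name='idx')
assert idx.drop_duplicates().freq == idx.freq    # already unique: freq survives

dup = idx.append(idx)
assert dup.freq is None                          # appending duplicates resets freq
assert dup.drop_duplicates().freq is None        # de-duplicating does not restore it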



class TestPeriodIndexOps(Ops):

def setUp(self):
@@ -1228,6 +1256,18 @@ def test_value_counts_unique(self):

tm.assert_index_equal(idx.unique(), exp_idx)

def test_drop_duplicates_metadata(self):
#GH 10115
idx = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx')
result = idx.drop_duplicates()
self.assert_index_equal(idx, result)
self.assertEqual(idx.freq, result.freq)

idx_dup = idx.append(idx) # freq will not be reset
result = idx_dup.drop_duplicates()
self.assert_index_equal(idx, result)
self.assertEqual(idx.freq, result.freq)


if __name__ == '__main__':
import nose