From 60a6e9f34018c8093553485d3611efb6af28a248 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Tue, 16 Jun 2015 23:11:02 +0900
Subject: [PATCH] BUG: drop_duplicates drops name(s).

---
 doc/source/whatsnew/v0.17.0.txt   |  2 +-
 pandas/core/index.py              |  4 +---
 pandas/tests/test_index.py        | 27 ++++++++++++++++++++-
 pandas/tseries/tests/test_base.py | 40 +++++++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index fc2e6b1cb936f..d4c49b82ed2e4 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -63,7 +63,7 @@ Bug Fixes
 ~~~~~~~~~
 
 - Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
-
+- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
 - Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
 - Bug in ``DataFrame.reset_index`` when index contains `NaT`. (:issue:`10388`)
 - Bug in ``ExcelReader`` when worksheet is empty (:issue:`6403`)
diff --git a/pandas/core/index.py b/pandas/core/index.py
index fad71c94cc417..dc2da1177330e 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -2573,14 +2573,12 @@ def drop(self, labels, errors='raise'):
 
     @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
     def drop_duplicates(self, take_last=False):
-        result = super(Index, self).drop_duplicates(take_last=take_last)
-        return self._constructor(result)
+        return super(Index, self).drop_duplicates(take_last=take_last)
 
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, take_last=False):
         return super(Index, self).duplicated(take_last=take_last)
 
-
     def _evaluate_with_timedelta_like(self, other, op, opstr):
         raise TypeError("can only perform ops with timedelta like values")
 
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index f422c3b49b691..7dab82ea2b63d 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -207,10 +207,19 @@ def test_duplicates(self):
 
             if not len(ind):
                 continue
+            if isinstance(ind, MultiIndex):
+                continue
             idx = self._holder([ind[0]]*5)
             self.assertFalse(idx.is_unique)
             self.assertTrue(idx.has_duplicates)
 
+            # GH 10115
+            # preserve names
+            idx.name = 'foo'
+            result = idx.drop_duplicates()
+            self.assertEqual(result.name, 'foo')
+            self.assert_index_equal(result, Index([ind[0]],name='foo'))
+
     def test_sort(self):
         for ind in self.indices.values():
             self.assertRaises(TypeError, ind.sort)
@@ -1830,10 +1839,13 @@ def test_reindexing(self):
 
     def test_duplicates(self):
 
-        idx = CategoricalIndex([0, 0, 0])
+        idx = CategoricalIndex([0, 0, 0], name='foo')
         self.assertFalse(idx.is_unique)
         self.assertTrue(idx.has_duplicates)
 
+        expected = CategoricalIndex([0], name='foo')
+        self.assert_index_equal(idx.drop_duplicates(), expected)
+
     def test_get_indexer(self):
 
         idx1 = CategoricalIndex(list('aabcde'),categories=list('edabc'))
@@ -4603,6 +4615,19 @@ def check(nlevels, with_nulls):
         self.assert_array_equal(mi.duplicated(),
                                 np.zeros(len(mi), dtype='bool'))
 
+    def test_duplicate_meta_data(self):
+        # GH 10115
+        index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
+                           labels=[[0, 0, 0, 0, 1, 1, 1],
+                                   [0, 1, 2, 0, 0, 1, 2]])
+        for idx in [index,
+                    index.set_names([None, None]),
+                    index.set_names([None, 'Num']),
+                    index.set_names(['Upper','Num']),
+                    ]:
+            self.assertTrue(idx.has_duplicates)
+            self.assertEqual(idx.drop_duplicates().names, idx.names)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)
diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py
index fc432d5236f62..dc0bc14ce1ea6 100644
--- a/pandas/tseries/tests/test_base.py
+++ b/pandas/tseries/tests/test_base.py
@@ -330,6 +330,20 @@ def test_getitem(self):
 
             self.assert_index_equal(result, expected)
             self.assertEqual(result.freq, expected.freq)
 
+    def test_drop_duplicates_metadata(self):
+        #GH 10115
+        idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+        result = idx.drop_duplicates()
+        self.assert_index_equal(idx, result)
+        self.assertEqual(idx.freq, result.freq)
+
+        idx_dup = idx.append(idx)
+        self.assertIsNone(idx_dup.freq)  # freq is reset
+        result = idx_dup.drop_duplicates()
+        self.assert_index_equal(idx, result)
+        self.assertIsNone(result.freq)
+
+
 class TestTimedeltaIndexOps(Ops):
     def setUp(self):
@@ -802,6 +816,20 @@ def test_getitem(self):
 
             self.assert_index_equal(result, expected)
             self.assertEqual(result.freq, expected.freq)
 
+    def test_drop_duplicates_metadata(self):
+        #GH 10115
+        idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
+        result = idx.drop_duplicates()
+        self.assert_index_equal(idx, result)
+        self.assertEqual(idx.freq, result.freq)
+
+        idx_dup = idx.append(idx)
+        self.assertIsNone(idx_dup.freq)  # freq is reset
+        result = idx_dup.drop_duplicates()
+        self.assert_index_equal(idx, result)
+        self.assertIsNone(result.freq)
+
+
 class TestPeriodIndexOps(Ops):
     def setUp(self):
@@ -1228,6 +1256,18 @@ def test_value_counts_unique(self):
 
         tm.assert_index_equal(idx.unique(), exp_idx)
 
+    def test_drop_duplicates_metadata(self):
+        #GH 10115
+        idx = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+        result = idx.drop_duplicates()
+        self.assert_index_equal(idx, result)
+        self.assertEqual(idx.freq, result.freq)
+
+        idx_dup = idx.append(idx)  # freq will not be reset
+        result = idx_dup.drop_duplicates()
+        self.assert_index_equal(idx, result)
+        self.assertEqual(idx.freq, result.freq)
+
 
 if __name__ == '__main__':
     import nose
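
Note (not part of the patch): a minimal usage sketch of the behaviour the new
tests assert, assuming a pandas build with this fix applied -- the index
``name`` (and, for ``PeriodIndex``, the ``freq``) should survive
``drop_duplicates``:

    import pandas as pd

    # Regular Index: the name is expected to be preserved.
    idx = pd.Index(['a', 'a', 'b'], name='foo')
    print(idx.drop_duplicates().name)   # 'foo'

    # DatetimeIndex: append() resets freq to None; drop_duplicates keeps
    # the name and leaves freq as None on the result.
    dti = pd.date_range('2011-01-01', periods=3, freq='D', name='idx')
    dup = dti.append(dti)
    print(dup.drop_duplicates().name)   # 'idx'
    print(dup.drop_duplicates().freq)   # None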