Commit 537e6a6

ENH: add duplicated/drop_duplicates functions to Series. close #1923
1 parent 96545d0 commit 537e6a6

4 files changed (+67, -0 lines)

RELEASE.rst (+3)

@@ -34,6 +34,7 @@ pandas 0.10.0
   - Grouped histogram via `by` keyword in Series/DataFrame.hist (#2186)
   - Support optional ``min_periods`` keyword in ``corr`` and ``cov``
    for both Series and DataFrame (#2002)
+  - Add ``duplicated`` and ``drop_duplicates`` functions to Series (#1923)
 
 **API Changes**
 
@@ -74,6 +75,8 @@ pandas 0.10.0
   - pop(...) and del works with DataFrame with duplicate columns (#2349)
   - Treat empty strings as NA in date parsing (rather than let dateutil do
    something weird) (#2263)
+  - Prevent uint64 -> int64 overflows (#2355)
+  - Enable joins between MultiIndex and regular Index (#2024)
 
 pandas 0.9.1
 ============

pandas/core/series.py (+33)

@@ -1248,6 +1248,39 @@ def nunique(self):
         """
         return len(self.value_counts())
 
+    def drop_duplicates(self, take_last=False):
+        """
+        Return Series with duplicate values removed
+
+        Parameters
+        ----------
+        take_last : boolean, default False
+            Take the last observed index in a group. Default first
+
+        Returns
+        -------
+        deduplicated : Series
+        """
+        duplicated = self.duplicated(take_last=take_last)
+        return self[-duplicated]
+
+    def duplicated(self, take_last=False):
+        """
+        Return boolean Series denoting duplicate values
+
+        Parameters
+        ----------
+        take_last : boolean, default False
+            Take the last observed index in a group. Default first
+
+        Returns
+        -------
+        duplicated : Series
+        """
+        keys = com._ensure_object(self.values)
+        duplicated = lib.duplicated(keys, take_last=take_last)
+        return Series(duplicated, index=self.index, name=self.name)
+
     sum = _make_stat_func(nanops.nansum, 'sum', 'sum')
     mean = _make_stat_func(nanops.nanmean, 'mean', 'mean')
     median = _make_stat_func(nanops.nanmedian, 'median', 'median', extras='')
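For orientation, here is a small usage sketch of the two new methods as defined in this commit. The input values mirror the test case below, and the commented results are exactly what those tests assert; the ``take_last`` keyword is the API as of this commit.

import pandas as pd

s = pd.Series([1, 2, 3, 3])

# duplicated() flags values that repeat an earlier observation; with
# take_last=True the last occurrence is the one left unflagged instead.
s.duplicated()                 # [False, False, False, True]
s.duplicated(take_last=True)   # [False, False, True, False]

# drop_duplicates() masks out the flagged entries and keeps the original
# index labels of the surviving values.
s.drop_duplicates()                 # values [1, 2, 3] at index [0, 1, 2]
s.drop_duplicates(take_last=True)   # values [1, 2, 3] at index [0, 1, 3]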

pandas/tests/test_series.py (+19)

@@ -2036,6 +2036,25 @@ def test_unique(self):
         expected = np.array([1, 2, 3, None], dtype=object)
         self.assert_(np.array_equal(result, expected))
 
+    def test_drop_duplicates(self):
+        s = Series([1, 2, 3, 3])
+
+        result = s.duplicated()
+        expected = Series([False, False, False, True])
+        assert_series_equal(result, expected)
+
+        result = s.duplicated(take_last=True)
+        expected = Series([False, False, True, False])
+        assert_series_equal(result, expected)
+
+        result = s.drop_duplicates()
+        expected = s[[True, True, True, False]]
+        assert_series_equal(result, expected)
+
+        result = s.drop_duplicates(take_last=True)
+        expected = s[[True, True, False, True]]
+        assert_series_equal(result, expected)
+
     def test_sort(self):
         ts = self.ts.copy()
         ts.sort()

vb_suite/reindex.py (+12)

@@ -164,6 +164,18 @@ def backfill():
                                       name='frame_drop_dup_na_inplace',
                                       start_date=datetime(2012, 5, 16))
 
+setup = common_setup + """
+s = Series(np.random.randint(0, 1000, size=10000))
+s2 = Series(np.tile([rands(10) for i in xrange(1000)], 10))
+"""
+
+series_drop_duplicates_int = Benchmark('s.drop_duplicates()', setup,
+                                       start_date=datetime(2012, 11, 27))
+
+series_drop_duplicates_string = \
+    Benchmark('s2.drop_duplicates()', setup,
+              start_date=datetime(2012, 11, 27))
+
 #----------------------------------------------------------------------
 # fillna, many columns
