From 5c857f1857aef3f8eca12cbfcb9b17bee3ab1603 Mon Sep 17 00:00:00 2001 From: Chang She Date: Fri, 11 May 2012 20:08:04 -0400 Subject: [PATCH 1/5] BUG: DataFrame.drop_duplicates with NA values --- pandas/src/groupby.pyx | 13 ++++++++++ pandas/tests/test_frame.py | 53 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index a05e619636dd4..5b6afb86e172b 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -1306,6 +1306,7 @@ def duplicated(list values, take_last=False): cdef: Py_ssize_t i, n dict seen = {} + int has_nan = 0 object row n = len(values) @@ -1316,6 +1317,12 @@ def duplicated(list values, take_last=False): row = values[i] if row in seen: result[i] = 1 + elif row != row: + if has_nan == 1: + result[i] = 1 + else: + has_nan = 1 + result[i] = 0 else: seen[row] = None result[i] = 0 @@ -1324,6 +1331,12 @@ def duplicated(list values, take_last=False): row = values[i] if row in seen: result[i] = 1 + elif row != row: + if has_nan == 1: + result[i] = 1 + else: + has_nan = 1 + result[i] = 0 else: seen[row] = None result[i] = 0 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b21bd09957bd7..592b354ab240d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3214,6 +3214,59 @@ def test_drop_duplicates(self): expected = df2.drop_duplicates(['A', 'B'], take_last=True) assert_frame_equal(result, expected) + def test_drop_duplicates_NA(self): + # none + df = DataFrame({'A' : [None, None, 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D' : range(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df.ix[[0, 2, 3]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', take_last=True) + expected = df.ix[[1, 6, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['A', 'B']) + expected = df.ix[[0, 2, 3, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], take_last=True) + expected = df.ix[[1, 5, 6, 7]] + assert_frame_equal(result, expected) + + # nan + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D' : range(8)}) + + # single column + result = df.drop_duplicates('C') + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', take_last=True) + expected = df.ix[[3, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['C', 'B']) + expected = df.ix[[0, 1, 2, 4]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], take_last=True) + expected = df.ix[[1, 3, 6, 7]] + assert_frame_equal(result, expected) + def test_drop_col_still_multiindex(self): arrays = [[ 'a', 'b', 'c', 'top'], [ '', '', '', 'OD' ], From 699d548ab2fd41b4a338104c48347d94858815f2 Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 15 May 2012 16:33:40 -0400 Subject: [PATCH 2/5] use fast zip with a placeholder value just for np.nan --- pandas/core/frame.py | 27 +++++++++++++------- pandas/src/groupby.pyx | 34 ++++++++++++++++++++++--- pandas/src/tseries.pyx | 51 ++++++++++++++++++++++++++++++++++++++ pandas/tests/test_frame.py | 9 ++++--- 4 files changed, 104 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9afccce266522..1d3bda572eaa2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2337,7 +2337,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None): new_labels = labels[mask] return self.reindex(**{axis_name: new_labels}) - def drop_duplicates(self, cols=None, take_last=False): + def drop_duplicates(self, cols=None, take_last=False, skipna=True): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns @@ -2349,15 +2349,17 @@ def drop_duplicates(self, cols=None, take_last=False): default use all of the columns take_last : boolean, default False Take the last observed row in a row. Defaults to the first row + skipna : boolean, default True + If True then keep NaN Returns ------- deduplicated : DataFrame """ - duplicated = self.duplicated(cols, take_last=take_last) + duplicated = self.duplicated(cols, take_last=take_last, skipna=skipna) return self[-duplicated] - def duplicated(self, cols=None, take_last=False): + def duplicated(self, cols=None, take_last=False, skipna=True): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns @@ -2369,20 +2371,29 @@ def duplicated(self, cols=None, take_last=False): default use all of the columns take_last : boolean, default False Take the last observed row in a row. Defaults to the first row + skipna : boolean, default True + If True then NaN are not marked as duplicates Returns ------- duplicated : Series """ + zip_func = lib.fast_zip if skipna else lib.fast_zip_fillna + if cols is not None: if isinstance(cols, list): - keys = zip(*[self[x] for x in cols]) + values = [self[x].values for x in cols] + keys = zip_func(values) + dup_func = lib.duplicated_skipna else: - keys = list(self[cols]) + keys = self[cols] + dup_func = lib.duplicated_skipna if skipna else lib.duplicated else: - keys = zip(*self.values.T) + values = list(self.values.T) + keys = zip_func(values) + dup_func = lib.duplicated_skipna - duplicated = lib.duplicated(keys, take_last=take_last) + duplicated = dup_func(list(keys), take_last=take_last) return Series(duplicated, index=self.index) #---------------------------------------------------------------------- @@ -4374,7 +4385,6 @@ def _homogenize(data, index, columns, dtype=None): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) - def _is_sequence(x): try: iter(x) @@ -4383,7 +4393,6 @@ def _is_sequence(x): except Exception: return False - def install_ipython_completers(): # pragma: no cover """Register the DataFrame type with IPython's tab completion machinery, so that it knows about accessing column names as attributes.""" diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index 5b6afb86e172b..359412813f681 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -1301,12 +1301,39 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts +def duplicated_skipna(list values, take_last=False): + cdef: + Py_ssize_t i, n + dict seen = {} + object row + + n = len(values) + cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) + + if take_last: + for i from n > i >= 0: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = None + result[i] = 0 + else: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = None + result[i] = 0 + + return result.view(np.bool_) def duplicated(list values, take_last=False): cdef: Py_ssize_t i, n dict seen = {} - int has_nan = 0 + bint has_nan = 0 object row n = len(values) @@ -1318,7 +1345,7 @@ def duplicated(list values, take_last=False): if row in seen: result[i] = 1 elif row != row: - if has_nan == 1: + if has_nan: result[i] = 1 else: has_nan = 1 @@ -1332,7 +1359,7 @@ def duplicated(list values, take_last=False): if row in seen: result[i] = 1 elif row != row: - if has_nan == 1: + if has_nan: result[i] = 1 else: has_nan = 1 @@ -1343,7 +1370,6 @@ def duplicated(list values, take_last=False): return result.view(np.bool_) - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, lab, start diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index b8685a051eba3..20d4bea0d586a 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -379,6 +379,57 @@ def fast_zip(list ndarrays): return result +cdef class _PandasNull: + pass + +pandas_null = _PandasNull() + +def fast_zip_fillna(list ndarrays, fill_value=pandas_null): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' + cdef: + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup + + k = len(ndarrays) + n = len(ndarrays[0]) + + result = np.empty(n, dtype=object) + + # initialize tuples on first pass + arr = ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + if val != val: + val = fill_value + + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + if val != val: + val = fill_value + + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): cdef: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 592b354ab240d..16d4352feeb77 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1,3 +1,4 @@ + # pylint: disable-msg=W0612,E1101 from copy import deepcopy from datetime import datetime, timedelta @@ -3250,20 +3251,20 @@ def test_drop_duplicates_NA(self): 'D' : range(8)}) # single column - result = df.drop_duplicates('C') + result = df.drop_duplicates('C', skipna=False) expected = df[:2] assert_frame_equal(result, expected) - result = df.drop_duplicates('C', take_last=True) + result = df.drop_duplicates('C', take_last=True, skipna=False) expected = df.ix[[3, 7]] assert_frame_equal(result, expected) # multi column - result = df.drop_duplicates(['C', 'B']) + result = df.drop_duplicates(['C', 'B'], skipna=False) expected = df.ix[[0, 1, 2, 4]] assert_frame_equal(result, expected) - result = df.drop_duplicates(['C', 'B'], take_last=True) + result = df.drop_duplicates(['C', 'B'], take_last=True, skipna=False) expected = df.ix[[1, 3, 6, 7]] assert_frame_equal(result, expected) From 0bd2bb7bc6b78dedb3fa06cecf45758d3a5c1762 Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 15 May 2012 16:41:28 -0400 Subject: [PATCH 3/5] TST: vbench for drop_duplicate with skipna set to False --- vb_suite/reindex.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py index 9c307131ae5ac..e20784b1cf8df 100644 --- a/vb_suite/reindex.py +++ b/vb_suite/reindex.py @@ -135,6 +135,11 @@ def backfill(): name='frame_drop_duplicates', start_date=datetime(2011, 11, 15)) +statement2 = "df.drop_duplicates(['key1', 'key2'], skipna=False)" +frame_drop_duplicates_na = Benchmark(statement, setup, + name='frame_drop_duplicates', + start_date=datetime(2012, 5, 15)) + #---------------------------------------------------------------------- # fillna, many columns From 125c7fd16814c7d57bf6e8760093e025b356e82f Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 15 May 2012 18:31:45 -0400 Subject: [PATCH 4/5] optimized a little bit for speed --- pandas/core/frame.py | 21 +++----- pandas/src/groupby.pyx | 96 ++++++++++++++++++++++-------------- pandas/src/tseries.pyx | 52 ------------------- pandas/tests/test_frame.py | 8 +-- pandas/tests/test_tseries.py | 6 ++- vb_suite/reindex.py | 18 +++++-- 6 files changed, 89 insertions(+), 112 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1d3bda572eaa2..3b008d6790288 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2337,7 +2337,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None): new_labels = labels[mask] return self.reindex(**{axis_name: new_labels}) - def drop_duplicates(self, cols=None, take_last=False, skipna=True): + def drop_duplicates(self, cols=None, take_last=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns @@ -2356,10 +2356,10 @@ def drop_duplicates(self, cols=None, take_last=False, skipna=True): ------- deduplicated : DataFrame """ - duplicated = self.duplicated(cols, take_last=take_last, skipna=skipna) + duplicated = self.duplicated(cols, take_last=take_last) return self[-duplicated] - def duplicated(self, cols=None, take_last=False, skipna=True): + def duplicated(self, cols=None, take_last=False): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns @@ -2371,29 +2371,22 @@ def duplicated(self, cols=None, take_last=False, skipna=True): default use all of the columns take_last : boolean, default False Take the last observed row in a row. Defaults to the first row - skipna : boolean, default True - If True then NaN are not marked as duplicates Returns ------- duplicated : Series """ - zip_func = lib.fast_zip if skipna else lib.fast_zip_fillna - if cols is not None: if isinstance(cols, list): values = [self[x].values for x in cols] - keys = zip_func(values) - dup_func = lib.duplicated_skipna + keys = lib.fast_zip_fillna(values) else: - keys = self[cols] - dup_func = lib.duplicated_skipna if skipna else lib.duplicated + keys = lib.fast_zip_fillna([self[cols]]) else: values = list(self.values.T) - keys = zip_func(values) - dup_func = lib.duplicated_skipna + keys = lib.fast_zip_fillna(values) - duplicated = dup_func(list(keys), take_last=take_last) + duplicated = lib.duplicated(keys, take_last=take_last) return Series(duplicated, index=self.index) #---------------------------------------------------------------------- diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index 359412813f681..78c3b0ff3f11a 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -1301,39 +1301,72 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts -def duplicated_skipna(list values, take_last=False): +cdef class _PandasNull: + + def __richcmp__(_PandasNull self, object other, int op): + if op == 2: # == + return isinstance(other, _PandasNull) + elif op == 3: # != + return not isinstance(other, _PandasNull) + else: + return False + + def __hash__(self): + return 0 + +pandas_null = _PandasNull() + +def fast_zip_fillna(list ndarrays, fill_value=pandas_null): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' cdef: - Py_ssize_t i, n - dict seen = {} - object row + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup - n = len(values) - cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) + k = len(ndarrays) + n = len(ndarrays[0]) - if take_last: - for i from n > i >= 0: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = None - result[i] = 0 - else: - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = None - result[i] = 0 + result = np.empty(n, dtype=object) - return result.view(np.bool_) + # initialize tuples on first pass + arr = ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + if val != val: + val = fill_value -def duplicated(list values, take_last=False): + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + if val != val: + val = fill_value + + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result + +def duplicated(ndarray[object] values, take_last=False): cdef: Py_ssize_t i, n dict seen = {} - bint has_nan = 0 object row n = len(values) @@ -1342,14 +1375,9 @@ def duplicated(list values, take_last=False): if take_last: for i from n > i >= 0: row = values[i] + if row in seen: result[i] = 1 - elif row != row: - if has_nan: - result[i] = 1 - else: - has_nan = 1 - result[i] = 0 else: seen[row] = None result[i] = 0 @@ -1358,12 +1386,6 @@ def duplicated(list values, take_last=False): row = values[i] if row in seen: result[i] = 1 - elif row != row: - if has_nan: - result[i] = 1 - else: - has_nan = 1 - result[i] = 0 else: seen[row] = None result[i] = 0 diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index 20d4bea0d586a..c0ef954795612 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -379,58 +379,6 @@ def fast_zip(list ndarrays): return result -cdef class _PandasNull: - pass - -pandas_null = _PandasNull() - -def fast_zip_fillna(list ndarrays, fill_value=pandas_null): - ''' - For zipping multiple ndarrays into an ndarray of tuples - ''' - cdef: - Py_ssize_t i, j, k, n - ndarray[object] result - flatiter it - object val, tup - - k = len(ndarrays) - n = len(ndarrays[0]) - - result = np.empty(n, dtype=object) - - # initialize tuples on first pass - arr = ndarrays[0] - it = PyArray_IterNew(arr) - for i in range(n): - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) - tup = PyTuple_New(k) - - if val != val: - val = fill_value - - PyTuple_SET_ITEM(tup, 0, val) - Py_INCREF(val) - result[i] = tup - PyArray_ITER_NEXT(it) - - for j in range(1, k): - arr = ndarrays[j] - it = PyArray_IterNew(arr) - if len(arr) != n: - raise ValueError('all arrays must be same length') - - for i in range(n): - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) - if val != val: - val = fill_value - - PyTuple_SET_ITEM(result[i], j, val) - Py_INCREF(val) - PyArray_ITER_NEXT(it) - - return result - def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): cdef: Py_ssize_t i, n = len(indexer) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 16d4352feeb77..a12c3fd58be20 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3251,20 +3251,20 @@ def test_drop_duplicates_NA(self): 'D' : range(8)}) # single column - result = df.drop_duplicates('C', skipna=False) + result = df.drop_duplicates('C') expected = df[:2] assert_frame_equal(result, expected) - result = df.drop_duplicates('C', take_last=True, skipna=False) + result = df.drop_duplicates('C', take_last=True) expected = df.ix[[3, 7]] assert_frame_equal(result, expected) # multi column - result = df.drop_duplicates(['C', 'B'], skipna=False) + result = df.drop_duplicates(['C', 'B']) expected = df.ix[[0, 1, 2, 4]] assert_frame_equal(result, expected) - result = df.drop_duplicates(['C', 'B'], take_last=True, skipna=False) + result = df.drop_duplicates(['C', 'B'], take_last=True) expected = df.ix[[1, 3, 6, 7]] assert_frame_equal(result, expected) diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index d9ddf63fea29c..2f052a68b04d2 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -165,7 +165,7 @@ def test_groupsort_indexer(): def test_duplicated_with_nas(): - keys = [0, 1, nan, 0, 2, nan] + keys = np.array([0, 1, nan, 0, 2, nan], dtype=object) result = lib.duplicated(keys) expected = [False, False, False, True, False, True] @@ -175,7 +175,9 @@ def test_duplicated_with_nas(): expected = [True, False, True, False, False, False] assert(np.array_equal(result, expected)) - keys = [(0, 0), (0, nan), (nan, 0), (nan, nan)] * 2 + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, nan, nan]*2, [0, nan, 0, nan]*2)): + keys[i] = t result = lib.duplicated(keys) falses = [False] * 4 diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py index e20784b1cf8df..24109e0559b4a 100644 --- a/vb_suite/reindex.py +++ b/vb_suite/reindex.py @@ -114,6 +114,7 @@ def backfill(): # pathological, but realistic setup = common_setup + """ +import pandas._tseries as lib N = 10000 K = 10 @@ -135,11 +136,22 @@ def backfill(): name='frame_drop_duplicates', start_date=datetime(2011, 11, 15)) -statement2 = "df.drop_duplicates(['key1', 'key2'], skipna=False)" -frame_drop_duplicates_na = Benchmark(statement, setup, - name='frame_drop_duplicates', +lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup, + name='lib_fast_zip', + start_date=datetime(2012, 1, 1)) + +setup = setup + """ +df.ix[:10000, :] = np.nan +""" +statement2 = "df.drop_duplicates(['key1', 'key2'])" +frame_drop_duplicates_na = Benchmark(statement2, setup, + name='frame_drop_duplicates_na', start_date=datetime(2012, 5, 15)) +lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(df.values.T)', setup, + name='lib_fast_zip_fillna', + start_date=datetime(2012, 5, 15)) + #---------------------------------------------------------------------- # fillna, many columns From 45bf000748684fb8574012a35270f44f45bf2474 Mon Sep 17 00:00:00 2001 From: Chang She Date: Wed, 16 May 2012 16:52:05 -0400 Subject: [PATCH 5/5] ENH: inplace option to DataFrame.drop_duplicates #805 with vbench --- pandas/core/frame.py | 14 +++++++++-- pandas/tests/test_frame.py | 50 ++++++++++++++++++++++++++++++++++++++ vb_suite/reindex.py | 10 ++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3b008d6790288..c7a1edeb6115e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2337,7 +2337,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None): new_labels = labels[mask] return self.reindex(**{axis_name: new_labels}) - def drop_duplicates(self, cols=None, take_last=False): + def drop_duplicates(self, cols=None, take_last=False, inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns @@ -2351,13 +2351,23 @@ def drop_duplicates(self, cols=None, take_last=False): Take the last observed row in a row. Defaults to the first row skipna : boolean, default True If True then keep NaN + inplace : boolean, default False + Whether to drop duplicates in place or to return a copy Returns ------- deduplicated : DataFrame """ + duplicated = self.duplicated(cols, take_last=take_last) - return self[-duplicated] + + if inplace: + inds, = (-duplicated).nonzero() + self._data = self._data.take(inds) + self._clear_item_cache() + return self + else: + return self[-duplicated] def duplicated(self, cols=None, take_last=False): """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a12c3fd58be20..ad5436420b085 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3268,6 +3268,56 @@ def test_drop_duplicates_NA(self): expected = df.ix[[1, 3, 6, 7]] assert_frame_equal(result, expected) + def test_drop_duplicates_inplace(self): + orig = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1, 1, 2, 2, 2, 2, 1, 2], + 'D' : range(8)}) + + # single column + df = orig.copy() + df.drop_duplicates('A', inplace=True) + expected = orig[:2] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', take_last=True, inplace=True) + expected = orig.ix[[6, 7]] + result = df + assert_frame_equal(result, expected) + + # multi column + df = orig.copy() + df.drop_duplicates(['A', 'B'], inplace=True) + expected = orig.ix[[0, 1, 2, 3]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) + expected = orig.ix[[0, 5, 6, 7]] + result = df + assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.ix[:, ['A', 'B', 'C']].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(['A', 'B']) + result = df2 + assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(take_last=True, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], take_last=True) + result = df2 + assert_frame_equal(result, expected) + def test_drop_col_still_multiindex(self): arrays = [[ 'a', 'b', 'c', 'top'], [ '', '', '', 'OD' ], diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py index 24109e0559b4a..62b26724eff46 100644 --- a/vb_suite/reindex.py +++ b/vb_suite/reindex.py @@ -136,6 +136,11 @@ def backfill(): name='frame_drop_duplicates', start_date=datetime(2011, 11, 15)) +statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)" +frame_drop_dup_inplace = Benchmark(statement, setup, + name='frame_drop_dup_inplace', + start_date=datetime(2012, 5, 16)) + lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup, name='lib_fast_zip', start_date=datetime(2012, 1, 1)) @@ -152,6 +157,11 @@ def backfill(): name='lib_fast_zip_fillna', start_date=datetime(2012, 5, 15)) +statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)" +frame_drop_dup_na_inplace = Benchmark(statement2, setup, + name='frame_drop_dup_na_inplace', + start_date=datetime(2012, 5, 16)) + #---------------------------------------------------------------------- # fillna, many columns