Skip to content

Commit 56637d0

Browse files
Chang She, wesm
Chang She authored and wesm committed
use fast zip with a placeholder value just for np.nan
1 parent 6d324db commit 56637d0

File tree

4 files changed

+104
-16
lines changed

4 files changed

+104
-16
lines changed

pandas/core/frame.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -2344,7 +2344,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None):
23442344
new_labels = labels[mask]
23452345
return self.reindex(**{axis_name: new_labels})
23462346

2347-
def drop_duplicates(self, cols=None, take_last=False):
2347+
def drop_duplicates(self, cols=None, take_last=False, skipna=True):
23482348
"""
23492349
Return DataFrame with duplicate rows removed, optionally only
23502350
considering certain columns
@@ -2356,15 +2356,17 @@ def drop_duplicates(self, cols=None, take_last=False):
23562356
default use all of the columns
23572357
take_last : boolean, default False
23582358
Take the last observed row in a row. Defaults to the first row
2359+
skipna : boolean, default True
2360+
If True then keep NaN
23592361
23602362
Returns
23612363
-------
23622364
deduplicated : DataFrame
23632365
"""
2364-
duplicated = self.duplicated(cols, take_last=take_last)
2366+
duplicated = self.duplicated(cols, take_last=take_last, skipna=skipna)
23652367
return self[-duplicated]
23662368

2367-
def duplicated(self, cols=None, take_last=False):
2369+
def duplicated(self, cols=None, take_last=False, skipna=True):
23682370
"""
23692371
Return boolean Series denoting duplicate rows, optionally only
23702372
considering certain columns
@@ -2376,20 +2378,29 @@ def duplicated(self, cols=None, take_last=False):
23762378
default use all of the columns
23772379
take_last : boolean, default False
23782380
Take the last observed row in a row. Defaults to the first row
2381+
skipna : boolean, default True
2382+
If True then NaN are not marked as duplicates
23792383
23802384
Returns
23812385
-------
23822386
duplicated : Series
23832387
"""
2388+
zip_func = lib.fast_zip if skipna else lib.fast_zip_fillna
2389+
23842390
if cols is not None:
23852391
if isinstance(cols, list):
2386-
keys = zip(*[self[x] for x in cols])
2392+
values = [self[x].values for x in cols]
2393+
keys = zip_func(values)
2394+
dup_func = lib.duplicated_skipna
23872395
else:
2388-
keys = list(self[cols])
2396+
keys = self[cols]
2397+
dup_func = lib.duplicated_skipna if skipna else lib.duplicated
23892398
else:
2390-
keys = zip(*self.values.T)
2399+
values = list(self.values.T)
2400+
keys = zip_func(values)
2401+
dup_func = lib.duplicated_skipna
23912402

2392-
duplicated = lib.duplicated(keys, take_last=take_last)
2403+
duplicated = dup_func(list(keys), take_last=take_last)
23932404
return Series(duplicated, index=self.index)
23942405

23952406
#----------------------------------------------------------------------
@@ -4527,7 +4538,6 @@ def _homogenize(data, index, columns, dtype=None):
45274538
def _put_str(s, space):
45284539
return ('%s' % s)[:space].ljust(space)
45294540

4530-
45314541
def _is_sequence(x):
45324542
try:
45334543
iter(x)

pandas/src/groupby.pyx

+30-4
Original file line numberDiff line numberDiff line change
@@ -1301,12 +1301,39 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
13011301

13021302
return counts
13031303

1304+
def duplicated_skipna(list values, take_last=False):
1305+
cdef:
1306+
Py_ssize_t i, n
1307+
dict seen = {}
1308+
object row
1309+
1310+
n = len(values)
1311+
cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
1312+
1313+
if take_last:
1314+
for i from n > i >= 0:
1315+
row = values[i]
1316+
if row in seen:
1317+
result[i] = 1
1318+
else:
1319+
seen[row] = None
1320+
result[i] = 0
1321+
else:
1322+
for i from 0 <= i < n:
1323+
row = values[i]
1324+
if row in seen:
1325+
result[i] = 1
1326+
else:
1327+
seen[row] = None
1328+
result[i] = 0
1329+
1330+
return result.view(np.bool_)
13041331

13051332
def duplicated(list values, take_last=False):
13061333
cdef:
13071334
Py_ssize_t i, n
13081335
dict seen = {}
1309-
int has_nan = 0
1336+
bint has_nan = 0
13101337
object row
13111338

13121339
n = len(values)
@@ -1318,7 +1345,7 @@ def duplicated(list values, take_last=False):
13181345
if row in seen:
13191346
result[i] = 1
13201347
elif row != row:
1321-
if has_nan == 1:
1348+
if has_nan:
13221349
result[i] = 1
13231350
else:
13241351
has_nan = 1
@@ -1332,7 +1359,7 @@ def duplicated(list values, take_last=False):
13321359
if row in seen:
13331360
result[i] = 1
13341361
elif row != row:
1335-
if has_nan == 1:
1362+
if has_nan:
13361363
result[i] = 1
13371364
else:
13381365
has_nan = 1
@@ -1343,7 +1370,6 @@ def duplicated(list values, take_last=False):
13431370

13441371
return result.view(np.bool_)
13451372

1346-
13471373
def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
13481374
cdef:
13491375
Py_ssize_t i, group_size, n, lab, start

pandas/src/tseries.pyx

+51
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,57 @@ def fast_zip(list ndarrays):
404404

405405
return result
406406

407+
cdef class _PandasNull:
408+
pass
409+
410+
pandas_null = _PandasNull()
411+
412+
def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
413+
'''
414+
For zipping multiple ndarrays into an ndarray of tuples
415+
'''
416+
cdef:
417+
Py_ssize_t i, j, k, n
418+
ndarray[object] result
419+
flatiter it
420+
object val, tup
421+
422+
k = len(ndarrays)
423+
n = len(ndarrays[0])
424+
425+
result = np.empty(n, dtype=object)
426+
427+
# initialize tuples on first pass
428+
arr = ndarrays[0]
429+
it = <flatiter> PyArray_IterNew(arr)
430+
for i in range(n):
431+
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
432+
tup = PyTuple_New(k)
433+
434+
if val != val:
435+
val = fill_value
436+
437+
PyTuple_SET_ITEM(tup, 0, val)
438+
Py_INCREF(val)
439+
result[i] = tup
440+
PyArray_ITER_NEXT(it)
441+
442+
for j in range(1, k):
443+
arr = ndarrays[j]
444+
it = <flatiter> PyArray_IterNew(arr)
445+
if len(arr) != n:
446+
raise ValueError('all arrays must be same length')
447+
448+
for i in range(n):
449+
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
450+
if val != val:
451+
val = fill_value
452+
453+
PyTuple_SET_ITEM(result[i], j, val)
454+
Py_INCREF(val)
455+
PyArray_ITER_NEXT(it)
456+
457+
return result
407458

408459
def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length):
409460
cdef:

pandas/tests/test_frame.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
# pylint: disable-msg=W0612,E1101
23
from copy import deepcopy
34
from datetime import datetime, timedelta
@@ -3322,20 +3323,20 @@ def test_drop_duplicates_NA(self):
33223323
'D' : range(8)})
33233324

33243325
# single column
3325-
result = df.drop_duplicates('C')
3326+
result = df.drop_duplicates('C', skipna=False)
33263327
expected = df[:2]
33273328
assert_frame_equal(result, expected)
33283329

3329-
result = df.drop_duplicates('C', take_last=True)
3330+
result = df.drop_duplicates('C', take_last=True, skipna=False)
33303331
expected = df.ix[[3, 7]]
33313332
assert_frame_equal(result, expected)
33323333

33333334
# multi column
3334-
result = df.drop_duplicates(['C', 'B'])
3335+
result = df.drop_duplicates(['C', 'B'], skipna=False)
33353336
expected = df.ix[[0, 1, 2, 4]]
33363337
assert_frame_equal(result, expected)
33373338

3338-
result = df.drop_duplicates(['C', 'B'], take_last=True)
3339+
result = df.drop_duplicates(['C', 'B'], take_last=True, skipna=False)
33393340
expected = df.ix[[1, 3, 6, 7]]
33403341
assert_frame_equal(result, expected)
33413342

0 commit comments

Comments
 (0)