Skip to content

Commit 030ab4d

Browse files
Chang She, wesm
Chang She
authored and committed
optimized a little bit for speed
1 parent c3e4828 commit 030ab4d

File tree

6 files changed

+89
-112
lines changed

6 files changed

+89
-112
lines changed

pandas/core/frame.py

+7-14
Original file line number | Diff line number | Diff line change
@@ -2344,7 +2344,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None):
23442344
new_labels = labels[mask]
23452345
return self.reindex(**{axis_name: new_labels})
23462346

2347-
def drop_duplicates(self, cols=None, take_last=False, skipna=True):
2347+
def drop_duplicates(self, cols=None, take_last=False):
23482348
"""
23492349
Return DataFrame with duplicate rows removed, optionally only
23502350
considering certain columns
@@ -2363,10 +2363,10 @@ def drop_duplicates(self, cols=None, take_last=False, skipna=True):
23632363
-------
23642364
deduplicated : DataFrame
23652365
"""
2366-
duplicated = self.duplicated(cols, take_last=take_last, skipna=skipna)
2366+
duplicated = self.duplicated(cols, take_last=take_last)
23672367
return self[-duplicated]
23682368

2369-
def duplicated(self, cols=None, take_last=False, skipna=True):
2369+
def duplicated(self, cols=None, take_last=False):
23702370
"""
23712371
Return boolean Series denoting duplicate rows, optionally only
23722372
considering certain columns
@@ -2378,29 +2378,22 @@ def duplicated(self, cols=None, take_last=False, skipna=True):
23782378
default use all of the columns
23792379
take_last : boolean, default False
23802380
Take the last observed row in a row. Defaults to the first row
2381-
skipna : boolean, default True
2382-
If True then NaN are not marked as duplicates
23832381
23842382
Returns
23852383
-------
23862384
duplicated : Series
23872385
"""
2388-
zip_func = lib.fast_zip if skipna else lib.fast_zip_fillna
2389-
23902386
if cols is not None:
23912387
if isinstance(cols, list):
23922388
values = [self[x].values for x in cols]
2393-
keys = zip_func(values)
2394-
dup_func = lib.duplicated_skipna
2389+
keys = lib.fast_zip_fillna(values)
23952390
else:
2396-
keys = self[cols]
2397-
dup_func = lib.duplicated_skipna if skipna else lib.duplicated
2391+
keys = lib.fast_zip_fillna([self[cols]])
23982392
else:
23992393
values = list(self.values.T)
2400-
keys = zip_func(values)
2401-
dup_func = lib.duplicated_skipna
2394+
keys = lib.fast_zip_fillna(values)
24022395

2403-
duplicated = dup_func(list(keys), take_last=take_last)
2396+
duplicated = lib.duplicated(keys, take_last=take_last)
24042397
return Series(duplicated, index=self.index)
24052398

24062399
#----------------------------------------------------------------------

pandas/src/groupby.pyx

+59-37
Original file line number | Diff line number | Diff line change
@@ -1301,39 +1301,72 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
13011301

13021302
return counts
13031303

1304-
def duplicated_skipna(list values, take_last=False):
1304+
cdef class _PandasNull:
1305+
1306+
def __richcmp__(_PandasNull self, object other, int op):
1307+
if op == 2: # ==
1308+
return isinstance(other, _PandasNull)
1309+
elif op == 3: # !=
1310+
return not isinstance(other, _PandasNull)
1311+
else:
1312+
return False
1313+
1314+
def __hash__(self):
1315+
return 0
1316+
1317+
pandas_null = _PandasNull()
1318+
1319+
def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
1320+
'''
1321+
For zipping multiple ndarrays into an ndarray of tuples
1322+
'''
13051323
cdef:
1306-
Py_ssize_t i, n
1307-
dict seen = {}
1308-
object row
1324+
Py_ssize_t i, j, k, n
1325+
ndarray[object] result
1326+
flatiter it
1327+
object val, tup
13091328

1310-
n = len(values)
1311-
cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
1329+
k = len(ndarrays)
1330+
n = len(ndarrays[0])
13121331

1313-
if take_last:
1314-
for i from n > i >= 0:
1315-
row = values[i]
1316-
if row in seen:
1317-
result[i] = 1
1318-
else:
1319-
seen[row] = None
1320-
result[i] = 0
1321-
else:
1322-
for i from 0 <= i < n:
1323-
row = values[i]
1324-
if row in seen:
1325-
result[i] = 1
1326-
else:
1327-
seen[row] = None
1328-
result[i] = 0
1332+
result = np.empty(n, dtype=object)
13291333

1330-
return result.view(np.bool_)
1334+
# initialize tuples on first pass
1335+
arr = ndarrays[0]
1336+
it = <flatiter> PyArray_IterNew(arr)
1337+
for i in range(n):
1338+
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
1339+
tup = PyTuple_New(k)
1340+
1341+
if val != val:
1342+
val = fill_value
13311343

1332-
def duplicated(list values, take_last=False):
1344+
PyTuple_SET_ITEM(tup, 0, val)
1345+
Py_INCREF(val)
1346+
result[i] = tup
1347+
PyArray_ITER_NEXT(it)
1348+
1349+
for j in range(1, k):
1350+
arr = ndarrays[j]
1351+
it = <flatiter> PyArray_IterNew(arr)
1352+
if len(arr) != n:
1353+
raise ValueError('all arrays must be same length')
1354+
1355+
for i in range(n):
1356+
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
1357+
if val != val:
1358+
val = fill_value
1359+
1360+
PyTuple_SET_ITEM(result[i], j, val)
1361+
Py_INCREF(val)
1362+
PyArray_ITER_NEXT(it)
1363+
1364+
return result
1365+
1366+
def duplicated(ndarray[object] values, take_last=False):
13331367
cdef:
13341368
Py_ssize_t i, n
13351369
dict seen = {}
1336-
bint has_nan = 0
13371370
object row
13381371

13391372
n = len(values)
@@ -1342,14 +1375,9 @@ def duplicated(list values, take_last=False):
13421375
if take_last:
13431376
for i from n > i >= 0:
13441377
row = values[i]
1378+
13451379
if row in seen:
13461380
result[i] = 1
1347-
elif row != row:
1348-
if has_nan:
1349-
result[i] = 1
1350-
else:
1351-
has_nan = 1
1352-
result[i] = 0
13531381
else:
13541382
seen[row] = None
13551383
result[i] = 0
@@ -1358,12 +1386,6 @@ def duplicated(list values, take_last=False):
13581386
row = values[i]
13591387
if row in seen:
13601388
result[i] = 1
1361-
elif row != row:
1362-
if has_nan:
1363-
result[i] = 1
1364-
else:
1365-
has_nan = 1
1366-
result[i] = 0
13671389
else:
13681390
seen[row] = None
13691391
result[i] = 0

pandas/src/tseries.pyx

-52
Original file line number | Diff line number | Diff line change
@@ -404,58 +404,6 @@ def fast_zip(list ndarrays):
404404

405405
return result
406406

407-
cdef class _PandasNull:
408-
pass
409-
410-
pandas_null = _PandasNull()
411-
412-
def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
413-
'''
414-
For zipping multiple ndarrays into an ndarray of tuples
415-
'''
416-
cdef:
417-
Py_ssize_t i, j, k, n
418-
ndarray[object] result
419-
flatiter it
420-
object val, tup
421-
422-
k = len(ndarrays)
423-
n = len(ndarrays[0])
424-
425-
result = np.empty(n, dtype=object)
426-
427-
# initialize tuples on first pass
428-
arr = ndarrays[0]
429-
it = <flatiter> PyArray_IterNew(arr)
430-
for i in range(n):
431-
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
432-
tup = PyTuple_New(k)
433-
434-
if val != val:
435-
val = fill_value
436-
437-
PyTuple_SET_ITEM(tup, 0, val)
438-
Py_INCREF(val)
439-
result[i] = tup
440-
PyArray_ITER_NEXT(it)
441-
442-
for j in range(1, k):
443-
arr = ndarrays[j]
444-
it = <flatiter> PyArray_IterNew(arr)
445-
if len(arr) != n:
446-
raise ValueError('all arrays must be same length')
447-
448-
for i in range(n):
449-
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
450-
if val != val:
451-
val = fill_value
452-
453-
PyTuple_SET_ITEM(result[i], j, val)
454-
Py_INCREF(val)
455-
PyArray_ITER_NEXT(it)
456-
457-
return result
458-
459407
def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length):
460408
cdef:
461409
Py_ssize_t i, n = len(indexer)

pandas/tests/test_frame.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -3323,20 +3323,20 @@ def test_drop_duplicates_NA(self):
33233323
'D' : range(8)})
33243324

33253325
# single column
3326-
result = df.drop_duplicates('C', skipna=False)
3326+
result = df.drop_duplicates('C')
33273327
expected = df[:2]
33283328
assert_frame_equal(result, expected)
33293329

3330-
result = df.drop_duplicates('C', take_last=True, skipna=False)
3330+
result = df.drop_duplicates('C', take_last=True)
33313331
expected = df.ix[[3, 7]]
33323332
assert_frame_equal(result, expected)
33333333

33343334
# multi column
3335-
result = df.drop_duplicates(['C', 'B'], skipna=False)
3335+
result = df.drop_duplicates(['C', 'B'])
33363336
expected = df.ix[[0, 1, 2, 4]]
33373337
assert_frame_equal(result, expected)
33383338

3339-
result = df.drop_duplicates(['C', 'B'], take_last=True, skipna=False)
3339+
result = df.drop_duplicates(['C', 'B'], take_last=True)
33403340
expected = df.ix[[1, 3, 6, 7]]
33413341
assert_frame_equal(result, expected)
33423342

pandas/tests/test_tseries.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ def test_ensure_platform_int():
170170
assert(result is arr)
171171

172172
def test_duplicated_with_nas():
173-
keys = [0, 1, nan, 0, 2, nan]
173+
keys = np.array([0, 1, nan, 0, 2, nan], dtype=object)
174174

175175
result = lib.duplicated(keys)
176176
expected = [False, False, False, True, False, True]
@@ -180,7 +180,9 @@ def test_duplicated_with_nas():
180180
expected = [True, False, True, False, False, False]
181181
assert(np.array_equal(result, expected))
182182

183-
keys = [(0, 0), (0, nan), (nan, 0), (nan, nan)] * 2
183+
keys = np.empty(8, dtype=object)
184+
for i, t in enumerate(zip([0, 0, nan, nan]*2, [0, nan, 0, nan]*2)):
185+
keys[i] = t
184186

185187
result = lib.duplicated(keys)
186188
falses = [False] * 4

vb_suite/reindex.py

+15-3
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ def backfill():
114114

115115
# pathological, but realistic
116116
setup = common_setup + """
117+
import pandas._tseries as lib
117118
N = 10000
118119
K = 10
119120
@@ -135,11 +136,22 @@ def backfill():
135136
name='frame_drop_duplicates',
136137
start_date=datetime(2011, 11, 15))
137138

138-
statement2 = "df.drop_duplicates(['key1', 'key2'], skipna=False)"
139-
frame_drop_duplicates_na = Benchmark(statement, setup,
140-
name='frame_drop_duplicates',
139+
lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup,
140+
name='lib_fast_zip',
141+
start_date=datetime(2012, 1, 1))
142+
143+
setup = setup + """
144+
df.ix[:10000, :] = np.nan
145+
"""
146+
statement2 = "df.drop_duplicates(['key1', 'key2'])"
147+
frame_drop_duplicates_na = Benchmark(statement2, setup,
148+
name='frame_drop_duplicates_na',
141149
start_date=datetime(2012, 5, 15))
142150

151+
lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(df.values.T)', setup,
152+
name='lib_fast_zip_fillna',
153+
start_date=datetime(2012, 5, 15))
154+
143155
#----------------------------------------------------------------------
144156
# fillna, many columns
145157

0 commit comments

Comments
 (0)