Skip to content

Commit 0d6fb1b

Browse files
committed
ENH: specialized cython diff functions
1 parent 06f74e5 commit 0d6fb1b

File tree

6 files changed

+123
-18
lines changed

6 files changed

+123
-18
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ pandas 0.9.1
5151
- DataFrame.dot can accept ndarrays (#2042)
5252
- Support negative periods in Panel.shift (#2164)
5353
- Make .drop(...) work with non-unique indexes (#2101)
54+
- Improve performance of Series/DataFrame.diff (re: #2087)
5455

5556
**Bug fixes**
5657

pandas/core/common.py

+31-9
Original file line numberDiff line numberDiff line change
@@ -373,16 +373,38 @@ def mask_out_axis(arr, mask, axis, fill_value=np.nan):
373373

374374
arr[tuple(indexer)] = fill_value
375375

376-
def diff(arr, n, indexer, axis=0):
377-
out_arr = arr - arr.take(indexer, axis=axis)
378-
out_arr = _maybe_upcast(out_arr)
376+
# Cython kernels used by ``diff`` for 2-D input, keyed by the dtype name of
# the array being differenced.
_diff_special = dict(
    float64=lib.diff_2d_float64,
    int64=lib.diff_2d_int64,
    int32=lib.diff_2d_int32,
)
379381

380-
indexer = [slice(None)] * arr.ndim
381-
if n > 0:
382-
indexer[axis] = slice(None, n)
383-
elif n < 0:
384-
indexer[axis] = slice(None, n)
385-
out_arr[tuple(indexer)] = np.nan
382+
def diff(arr, n, axis=0):
    """
    Compute the n-period discrete difference of ``arr`` along ``axis``.

    Every output element is ``arr[i] - arr[i - n]`` taken along ``axis``.
    Positions that have no element ``n`` steps away are set to NaN.

    Parameters
    ----------
    arr : ndarray
    n : int
        Lag. Positive values subtract the element ``n`` positions earlier
        (the first ``n`` positions become NaN); negative values subtract the
        element ``abs(n)`` positions later (the last ``abs(n)`` positions
        become NaN); zero yields an all-zero result.
    axis : int, default 0
        Axis along which the difference is taken.

    Returns
    -------
    out_arr : ndarray
        Same shape as ``arr``. Integer input is returned as float64 and
        boolean input as object so that NaN can be stored; any other dtype
        is kept.
    """
    dtype = arr.dtype
    if issubclass(dtype.type, np.integer):
        dtype = np.float64
    elif issubclass(dtype.type, np.bool_):
        dtype = np.object_

    out_arr = np.empty(arr.shape, dtype=dtype)

    # Work out, along ``axis``, which positions have no partner (NaN), which
    # positions receive a result, and where the subtracted operand lives.
    if n > 0:
        na_slice = slice(None, n)
        res_slice = slice(n, None)
        lag_slice = slice(None, -n)
    elif n < 0:
        na_slice = slice(n, None)
        res_slice = slice(None, n)
        lag_slice = slice(-n, None)
    else:
        # slice(None, -0) would be empty, so spell the zero lag out
        na_slice = slice(0, 0)
        res_slice = slice(None)
        lag_slice = slice(None)

    na_indexer = [slice(None)] * arr.ndim
    na_indexer[axis] = na_slice
    out_arr[tuple(na_indexer)] = np.nan

    # The Cython kernels run with bounds checking disabled, so only
    # non-negative lags are dispatched to them; negative lags take the
    # NumPy slicing path below.
    if arr.ndim == 2 and n >= 0 and arr.dtype.name in _diff_special:
        f = _diff_special[arr.dtype.name]
        f(arr, out_arr, n, axis)
    else:
        res_indexer = [slice(None)] * arr.ndim
        res_indexer[axis] = res_slice
        res_indexer = tuple(res_indexer)

        lag_indexer = [slice(None)] * arr.ndim
        lag_indexer[axis] = lag_slice
        lag_indexer = tuple(lag_indexer)

        out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    return out_arr
388410

pandas/core/frame.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -3680,8 +3680,7 @@ def diff(self, periods=1):
36803680
-------
36813681
diffed : DataFrame
36823682
"""
3683-
indexer = com._shift_indexer(len(self), periods)
3684-
new_blocks = [b.diff(periods, indexer) for b in self._data.blocks]
3683+
new_blocks = [b.diff(periods) for b in self._data.blocks]
36853684
new_data = BlockManager(new_blocks, [self.columns, self.index])
36863685
return self._constructor(new_data)
36873686

pandas/core/internals.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -294,10 +294,8 @@ def take(self, indexer, axis=1, fill_value=np.nan):
294294
def get_values(self, dtype):
295295
return self.values
296296

297-
def diff(self, n, indexer=None):
298-
if indexer is None:
299-
indexer = com._shift_indexer(self.shape[1], n)
300-
new_values = com.diff(self.values, n, indexer, axis=1)
297+
def diff(self, n):
    """
    Return a new block containing the n-period difference of this block's
    values, taken along axis 1 of ``self.values``.
    """
    return make_block(com.diff(self.values, n, axis=1),
                      self.items, self.ref_items)
302300

303301

pandas/core/series.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1511,9 +1511,8 @@ def diff(self, periods=1):
15111511
-------
15121512
diffed : Series
15131513
"""
1514-
indexer = com._shift_indexer(len(self), periods)
1515-
val = com.diff(self.values, periods, indexer)
1516-
return Series(val, self.index, name=self.name)
1514+
result = com.diff(self.values[:, np.newaxis], periods)
1515+
return Series(result.squeeze(), self.index, name=self.name)
15171516

15181517
def autocorr(self):
15191518
"""

pandas/src/stats.pyx

+86
Original file line numberDiff line numberDiff line change
@@ -565,3 +565,89 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
565565
# for j in range(K):
566566
# result[i, j] = values[i, indexer[i, j]]
567567
# return result
568+
569+
@cython.wraparound(False)
@cython.boundscheck(False)
def diff_2d_float64(ndarray[float64_t, ndim=2] arr,
                    ndarray[float64_t, ndim=2] out,
                    Py_ssize_t periods, int axis):
    """
    Write the ``periods``-lagged difference of a 2-D float64 array into
    ``out``.

    For every position that has a partner ``periods`` steps away along
    ``axis``, ``out[i, j]`` is set to ``arr[i, j]`` minus that partner.
    All other positions of ``out`` are left untouched, so the caller is
    expected to have pre-filled them.

    Parameters
    ----------
    arr : 2-D float64 ndarray
    out : 2-D float64 ndarray, same shape as ``arr``
    periods : lag; may be negative (difference against a later element)
    axis : 0 to difference down the first dimension, anything else for the
        second

    Raises
    ------
    ValueError
        If ``out`` does not have the same shape as ``arr``.
    """
    cdef:
        Py_ssize_t i, j, sx, sy, start, stop

    sx, sy = (<object> arr).shape
    # Indexing is unchecked below, so refuse a mismatched output buffer.
    if (<object> out).shape != (sx, sy):
        raise ValueError('out must have the same shape as arr')

    # Range of indices on the differenced axis for which index - periods is
    # still in bounds; empty when abs(periods) exceeds the axis length.
    if periods >= 0:
        start = periods
        stop = sx if axis == 0 else sy
    else:
        start = 0
        stop = (sx if axis == 0 else sy) + periods

    # Loop order follows the memory layout so the inner loop walks
    # contiguous elements.
    if arr.flags.f_contiguous:
        if axis == 0:
            for j in range(sy):
                for i in range(start, stop):
                    out[i, j] = arr[i, j] - arr[i - periods, j]
        else:
            for j in range(start, stop):
                for i in range(sx):
                    out[i, j] = arr[i, j] - arr[i, j - periods]
    else:
        if axis == 0:
            for i in range(start, stop):
                for j in range(sy):
                    out[i, j] = arr[i, j] - arr[i - periods, j]
        else:
            for i in range(sx):
                for j in range(start, stop):
                    out[i, j] = arr[i, j] - arr[i, j - periods]
597+
@cython.wraparound(False)
@cython.boundscheck(False)
def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
                  ndarray[float64_t, ndim=2] out,
                  Py_ssize_t periods, int axis):
    """
    Write the ``periods``-lagged difference of a 2-D int64 array into the
    float64 array ``out``.

    For every position that has a partner ``periods`` steps away along
    ``axis``, ``out[i, j]`` is set to ``arr[i, j]`` minus that partner.
    All other positions of ``out`` are left untouched, so the caller is
    expected to have pre-filled them.

    Parameters
    ----------
    arr : 2-D int64 ndarray
    out : 2-D float64 ndarray, same shape as ``arr``
    periods : lag; may be negative (difference against a later element)
    axis : 0 to difference down the first dimension, anything else for the
        second

    Raises
    ------
    ValueError
        If ``out`` does not have the same shape as ``arr``.
    """
    cdef:
        Py_ssize_t i, j, sx, sy, start, stop

    sx, sy = (<object> arr).shape
    # Indexing is unchecked below, so refuse a mismatched output buffer.
    if (<object> out).shape != (sx, sy):
        raise ValueError('out must have the same shape as arr')

    # Range of indices on the differenced axis for which index - periods is
    # still in bounds; empty when abs(periods) exceeds the axis length.
    if periods >= 0:
        start = periods
        stop = sx if axis == 0 else sy
    else:
        start = 0
        stop = (sx if axis == 0 else sy) + periods

    # Loop order follows the memory layout so the inner loop walks
    # contiguous elements.
    if arr.flags.f_contiguous:
        if axis == 0:
            for j in range(sy):
                for i in range(start, stop):
                    out[i, j] = arr[i, j] - arr[i - periods, j]
        else:
            for j in range(start, stop):
                for i in range(sx):
                    out[i, j] = arr[i, j] - arr[i, j - periods]
    else:
        if axis == 0:
            for i in range(start, stop):
                for j in range(sy):
                    out[i, j] = arr[i, j] - arr[i - periods, j]
        else:
            for i in range(sx):
                for j in range(start, stop):
                    out[i, j] = arr[i, j] - arr[i, j - periods]
624+
625+
626+
@cython.wraparound(False)
@cython.boundscheck(False)
def diff_2d_int32(ndarray[int32_t, ndim=2] arr,
                  ndarray[float64_t, ndim=2] out,
                  Py_ssize_t periods, int axis):
    """
    Write the ``periods``-lagged difference of a 2-D int32 array into the
    float64 array ``out``.

    For every position that has a partner ``periods`` steps away along
    ``axis``, ``out[i, j]`` is set to ``arr[i, j]`` minus that partner.
    All other positions of ``out`` are left untouched, so the caller is
    expected to have pre-filled them.

    Parameters
    ----------
    arr : 2-D int32 ndarray
    out : 2-D float64 ndarray, same shape as ``arr``
    periods : lag; may be negative (difference against a later element)
    axis : 0 to difference down the first dimension, anything else for the
        second

    Raises
    ------
    ValueError
        If ``out`` does not have the same shape as ``arr``.
    """
    cdef:
        Py_ssize_t i, j, sx, sy, start, stop

    sx, sy = (<object> arr).shape
    # Indexing is unchecked below, so refuse a mismatched output buffer.
    if (<object> out).shape != (sx, sy):
        raise ValueError('out must have the same shape as arr')

    # Range of indices on the differenced axis for which index - periods is
    # still in bounds; empty when abs(periods) exceeds the axis length.
    if periods >= 0:
        start = periods
        stop = sx if axis == 0 else sy
    else:
        start = 0
        stop = (sx if axis == 0 else sy) + periods

    # Loop order follows the memory layout so the inner loop walks
    # contiguous elements.
    if arr.flags.f_contiguous:
        if axis == 0:
            for j in range(sy):
                for i in range(start, stop):
                    out[i, j] = arr[i, j] - arr[i - periods, j]
        else:
            for j in range(start, stop):
                for i in range(sx):
                    out[i, j] = arr[i, j] - arr[i, j - periods]
    else:
        if axis == 0:
            for i in range(start, stop):
                for j in range(sy):
                    out[i, j] = arr[i, j] - arr[i - periods, j]
        else:
            for i in range(sx):
                for j in range(start, stop):
                    out[i, j] = arr[i, j] - arr[i, j - periods]
653+

0 commit comments

Comments
 (0)