diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
new file mode 100644
index 0000000000000..db28dfde926bf
--- /dev/null
+++ b/doc/source/enhancingperf.rst
@@ -0,0 +1,273 @@
+.. _enhancingperf:
+
+.. currentmodule:: pandas
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   import csv
+   from pandas import DataFrame
+   import pandas as pd
+
+   import numpy as np
+   np.random.seed(123456)
+   randn = np.random.randn
+   randint = np.random.randint
+   np.set_printoptions(precision=4, suppress=True)
+
+
+*********************
+Enhancing Performance
+*********************
+
+.. _enhancingperf.cython:
+
+Cython (Writing C extensions for pandas)
+----------------------------------------
+
+For many use cases writing pandas in pure python and numpy is sufficient. In some
+computationally heavy applications, however, it is possible to achieve sizeable
+speed-ups by offloading work to `cython <http://cython.org/>`_.
+
+This tutorial assumes you have refactored as much as possible in python, for example
+by removing for loops and making use of numpy vectorization; it's always worth
+optimising in python first.
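+
+As a minimal sketch of the kind of refactoring we mean (``values`` here stands in
+for any 1-d numpy array of floats and is not used in the rest of this tutorial):
+
+.. code-block:: python
+
+   # pure python loop: one python-level iteration (and function call) per element
+   total = 0.0
+   for v in values:
+       total += v * (v - 1)
+
+   # vectorized numpy equivalent: the looping happens in compiled code
+   total = (values * (values - 1)).sum()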
+
+This tutorial walks through a "typical" process of cythonizing a slow computation.
+We use an example from the cython documentation, but in the context of pandas. Our
+final cythonized solution is around 100 times faster than the pure python version.
+
+.. _enhancingperf.pure:
+
+Pure python
+~~~~~~~~~~~
+
+We have a DataFrame to which we want to apply a function row-wise.
+
+.. ipython:: python
+
+   df = DataFrame({'a': randn(1000), 'b': randn(1000), 'N': randint(100, 1000, (1000)), 'x': 'x'})
+   df
+
+Here's the function in pure python:
+
+.. ipython:: python
+
+   def f(x):
+       return x * (x - 1)
+
+   def integrate_f(a, b, N):
+       s = 0
+       dx = (b - a) / N
+       for i in range(N):
+           s += f(a + i * dx)
+       return s * dx
+
+We achieve our result by using ``apply`` (row-wise):
+
+.. ipython:: python
+
+   %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)
+
+But clearly this isn't fast enough for us. Let's take a look and see where the
+time is spent during this operation (limited to the four most time consuming
+calls) using the ``%prun`` ipython magic function:
+
+.. ipython:: python
+
+   %prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)
+
+By far the majority of time is spent inside either ``integrate_f`` or ``f``,
+hence we'll concentrate our efforts on cythonizing these two functions.
+
+.. note::
+
+   In python 2 replacing ``range`` with its lazy counterpart ``xrange`` would
+   mean the ``range`` line would vanish from this output. In python 3 ``range``
+   is already lazy.
+
+.. _enhancingperf.plain:
+
+Plain cython
+~~~~~~~~~~~~
+
+First we're going to need to load the cython magic extension into ipython:
+
+.. ipython:: python
+
+   %load_ext cythonmagic
+
+
+Now, let's simply copy our functions over to cython as is (the ``_plain`` suffix
+is here to distinguish between function versions):
+
+.. ipython::
+
+   In [2]: %%cython
+      ...: def f_plain(x):
+      ...:     return x * (x - 1)
+      ...: def integrate_f_plain(a, b, N):
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_plain(a + i * dx)
+      ...:     return s * dx
+      ...:
+
+.. note::
+
+   If you're having trouble pasting the above into your ipython, you may need
+   a bleeding edge version of ipython for paste to play well with cell magics.
+
+
+.. ipython:: python
+
+   %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1)
+
+Already this has shaved a third off, not too bad for a simple copy and paste.
+
+.. _enhancingperf.type:
+
+Adding type
+~~~~~~~~~~~
+
+We get another huge improvement simply by providing type information:
+
+.. ipython::
+
+   In [3]: %%cython
+      ...: cdef double f_typed(double x) except? -2:
+      ...:     return x * (x - 1)
+      ...: cpdef double integrate_f_typed(double a, double b, int N):
+      ...:     cdef int i
+      ...:     cdef double s, dx
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_typed(a + i * dx)
+      ...:     return s * dx
+      ...:
+
+.. ipython:: python
+
+   %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)
+
+Now we're talking! It's now over ten times faster than the original python
+implementation, and we haven't *really* modified the code. Let's have another
+look at what's eating up time:
+
+.. ipython:: python
+
+   %prun -l 4 df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)
+
+.. _enhancingperf.ndarray:
+
+Using ndarray
+~~~~~~~~~~~~~
+
+It's calling Series... a lot! It is creating a Series from each row, and calling
+``get`` on both the index and the series (three times for each row). Function
+calls are expensive in python, so maybe we could minimise these by cythonizing
+the apply part.
+
+.. note::
+
+   We are now passing ndarrays into the cython function; fortunately cython plays
+   very nicely with numpy.
+
+.. ipython::
+
+   In [4]: %%cython
+      ...: cimport numpy as np
+      ...: import numpy as np
+      ...: cdef double f_typed(double x) except? -2:
+      ...:     return x * (x - 1)
+      ...: cpdef double integrate_f_typed(double a, double b, int N):
+      ...:     cdef int i
+      ...:     cdef double s, dx
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_typed(a + i * dx)
+      ...:     return s * dx
+      ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N):
+      ...:     assert (col_a.dtype == np.float and col_b.dtype == np.float and col_N.dtype == np.int)
+      ...:     cdef Py_ssize_t i, n = len(col_N)
+      ...:     assert (len(col_a) == len(col_b) == n)
+      ...:     cdef np.ndarray[double] res = np.empty(n)
+      ...:     for i in range(len(col_a)):
+      ...:         res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i])
+      ...:     return res
+      ...:
+
+
+The implementation is simple: it creates an empty result array and loops over
+the rows, applying our ``integrate_f_typed`` and storing each result in the array.
+
+
+.. note::
+
+   Loops like this would be *extremely* slow in python, but in cython looping
+   over numpy arrays is *fast*.
+
+.. ipython:: python
+
+   %timeit apply_integrate_f(df['a'], df['b'], df['N'])
+
+We've gone another three times faster! Let's check again where the time is spent:
+
+.. ipython:: python
+
+   %prun -l 4 apply_integrate_f(df['a'], df['b'], df['N'])
+
+As one might expect, the majority of the time is now spent in ``apply_integrate_f``,
+so if we want to squeeze out any more speed we must continue to concentrate our
+efforts here.
+
+.. _enhancingperf.boundswrap:
+
+More advanced techniques
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+There is still scope for improvement; here's an example of using some more
+advanced cython techniques:
+
+.. ipython::
+
+   In [5]: %%cython
+      ...: cimport cython
+      ...: cimport numpy as np
+      ...: import numpy as np
+      ...: cdef double f_typed(double x) except? -2:
+      ...:     return x * (x - 1)
+      ...: cpdef double integrate_f_typed(double a, double b, int N):
+      ...:     cdef int i
+      ...:     cdef double s, dx
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_typed(a + i * dx)
+      ...:     return s * dx
+      ...: @cython.boundscheck(False)
+      ...: @cython.wraparound(False)
+      ...: cpdef np.ndarray[double] apply_integrate_f_wrap(np.ndarray[double] col_a, np.ndarray[double] col_b, np.ndarray[Py_ssize_t] col_N):
+      ...:     cdef Py_ssize_t i, n = len(col_N)
+      ...:     assert len(col_a) == len(col_b) == n
+      ...:     cdef np.ndarray[double] res = np.empty(n)
+      ...:     for i in range(n):
+      ...:         res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i])
+      ...:     return res
+      ...:
+
+.. ipython:: python
+
+   %timeit apply_integrate_f_wrap(df['a'], df['b'], df['N'])
+
+This shaves another third off!
+
+Further topics
+~~~~~~~~~~~~~~
+
+- Loading C modules into cython.
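+
+As a minimal sketch of loading an external C declaration into cython (wrapping
+``sqrt`` from the C standard library's ``math.h``; the wrapper name ``c_sqrt``
+is just for illustration):
+
+.. code-block:: python
+
+   %%cython
+   # declare the external C function so cython can call it directly
+   cdef extern from "math.h":
+       double sqrt(double x)
+
+   def c_sqrt(double x):
+       # thin python-level wrapper around the C call
+       return sqrt(x)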
+
+Read more in the `cython docs <http://docs.cython.org/>`_.
\ No newline at end of file
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 21a79ffdb85fd..67f1a3c1e6312 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -126,6 +126,7 @@ See the package overview for more detail about what's in the library.
     visualization
     rplot
     io
+    enhancingperf
     sparse
     gotchas
     r_interface
diff --git a/doc/sphinxext/ipython_directive.py b/doc/sphinxext/ipython_directive.py
index bc3c46dd5cc93..b237341e81125 100644
--- a/doc/sphinxext/ipython_directive.py
+++ b/doc/sphinxext/ipython_directive.py
@@ -296,11 +296,14 @@ def process_input(self, data, input_prompt, lineno):
         is_savefig = decorator is not None and \
             decorator.startswith('@savefig')
 
-        input_lines = input.split('\n')
+        def _remove_first_space_if_any(line):
+            return line[1:] if line.startswith(' ') else line
+
+        input_lines = map(_remove_first_space_if_any, input.split('\n'))
 
         self.datacontent = data
 
-        continuation = '   %s:'%''.join(['.']*(len(str(lineno))+2))
+        continuation = '   %s: '%''.join(['.']*(len(str(lineno))+2))
 
         if is_savefig:
             image_file, image_directive = self.process_image(decorator)