From 2d64fb555f707c86aa448350ebc3858d5260b8c4 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Mon, 17 Jun 2013 12:31:29 +0100 Subject: [PATCH 1/3] ENH add cython tutorial --- doc/source/cython.rst | 219 +++++++++++++++++++++++++++++ doc/sphinxext/ipython_directive.py | 7 +- 2 files changed, 224 insertions(+), 2 deletions(-) create mode 100644 doc/source/cython.rst diff --git a/doc/source/cython.rst b/doc/source/cython.rst new file mode 100644 index 0000000000000..726843ec04151 --- /dev/null +++ b/doc/source/cython.rst @@ -0,0 +1,219 @@ +.. _cython: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import os + import csv + from pandas import DataFrame + import pandas as pd + + import numpy as np + np.random.seed(123456) + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + + +**************************************** +Cython (Writing C extensions for pandas) +**************************************** + +For many use cases writing pandas in pure python and numpy is sufficient. In some computationally heavy applications however, it can be possible to achieve sizeable speed-ups by offloading work to `cython `_. + +- Say something about this being tutorial for "advanced" users? + +.. note:: + + The first thing to do here is to see if we can refactor in python, removing for loops (TODO add some waffle, and maybe trivial example, maybe even just using a for loop rather than apply in this example) a way which could make use of numpy... + + +This tutorial walksthrough a "typical" process of cythonizing a slow computation, we use an `example from the cython documentation `_ in the context of pandas: + +We have a function, ``integrate_f``, which we want to apply row-wise across a DataFrame, ``df``: + +.. ipython:: python + + df = DataFrame({'x': 'x', 'a': randn(1000), 'b': randn(1000),'N': randint(100, 1000, (1000))}) + df + +.. 
ipython:: python + + def f(x): + return x * (x - 1) + + def integrate_f(a, b, N): + s = 0 + dx = (b - a) / N + for i in range(N): + s += f(a + i * dx) + return s * dx + +In pure pandas we might achieve this using a row-wise ``apply``: + +.. ipython:: python + + %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + +Clearly this isn't fast enough for us, so let's take a look and see where the time is spent performing this operation (limited to the most time consuming four calls) using the `prun ipython magic function `_: + +.. ipython:: python + + %prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + +By far the majority of time is spend inside either ``integrate_f`` or ``f``, hence we concentrate our efforts cythonizing these two functions. + +.. note:: + + In python 2 replacing the ``range`` with its generator counterpart (``xrange``) would mean the ``range`` line would vanish. In python 3 range is already a generator. + +First, let's simply just copy our function over to cython as is (here the ``_plain`` suffix stands for "plain cython", allowing us to distinguish between our cython functions): + +.. ipython:: python + + %load_ext cythonmagic + +.. ipython:: + + In [2]: %%cython + ...: def f_plain(x): + ...: return x * (x - 1) + ...: def integrate_f_plain(a, b, N): + ...: s = 0 + ...: dx = (b - a) / N + ...: for i in range(N): + ...: s += f_plain(a + i * dx) + ...: return s * dx + ...: + +.. ipython:: python + + %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) + + +We're already shaved a third off, not too bad for a simple copy and paste. We'll get another huge improvement simply by providing type information: + +.. ipython:: + + In [3]: %%cython + ...: cdef double f_typed(double x) except? 
-2: + ...: return x * (x - 1) + ...: cpdef double integrate_f_typed(double a, double b, int N): + ...: cdef int i + ...: cdef double s, dx + ...: s = 0 + ...: dx = (b - a) / N + ...: for i in range(N): + ...: s += f_typed(a + i * dx) + ...: return s * dx + ...: + +.. ipython:: python + + %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + +Now, we're talking! Already we're over ten times faster than the original python version, and we haven't *really* modified the code. Let's go back and have another look at what's eating up time now: + +.. ipython:: python + + %prun -l 4 df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + +It's calling series and frames... a lot, in fact they're getting called for every row in the DataFrame. Function calls are expensive in python, so maybe we should cythonize the apply part and see if we can minimise these. + +We are now passing ndarrays into the cython function, fortunately cython plays very nicely with numpy. TODO mention the ``Py_ssize_t``. + +.. ipython:: + + In [4]: %%cython + ...: cimport numpy as np + ...: import numpy as np + ...: cdef double f_typed(double x) except? -2: + ...: return x**2-x + ...: cpdef double integrate_f_typed(double a, double b, int N): + ...: cdef int i + ...: cdef double s, dx + ...: s = 0 + ...: dx = (b-a)/N + ...: for i in range(N): + ...: s += f_typed(a+i*dx) + ...: return s * dx + ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N): + ...: assert (col_a.dtype == np.float and col_b.dtype == np.float and col_N.dtype == np.int) + ...: cdef Py_ssize_t i, n = len(col_N) + ...: assert (len(col_a) == len(col_b) == n) + ...: cdef np.ndarray[double] res = np.empty(n) + ...: for i in range(len(col_a)): + ...: res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i]) + ...: return res + ...: + + +We create an array of zeros and loop over the rows, applying our ``integrate_f_typed`` function to fill it up. 
It's worth mentioning here that although a loop like this would be extremely slow in python (TODO: "as we saw" considerably slower than the apply?) while looping over a numpy array in cython is *fast*. + +.. ipython:: python + + %timeit apply_integrate_f(df['a'], df['b'], df['N']) + +We've gone another three times faster! Let's check again where the time is spent: + +.. ipython:: python + + %prun -l 4 apply_integrate_f(df['a'], df['b'], df['N']) + +As on might expect, the majority of the time is now spent in ``apply_integrate_f``, so if we wanted to make anymore efficiencies we must continue to concentrate our efforts here... + +TODO explain decorators, and why they make it so fast! + +.. ipython:: + + In [5]: %%cython + ...: cimport cython + ...: cimport numpy as np + ...: import numpy as np + ...: cdef double f_typed(double x) except? -2: + ...: return x**2-x + ...: cpdef double integrate_f_typed(double a, double b, int N): + ...: cdef int i + ...: cdef double s, dx + ...: s = 0 + ...: dx = (b-a)/N + ...: for i in range(N): + ...: s += f_typed(a+i*dx) + ...: return s * dx + ...: @cython.boundscheck(False) + ...: @cython.wraparound(False) + ...: cpdef np.ndarray[double] apply_integrate_f_wrap(np.ndarray[double] col_a, np.ndarray[double] col_b, np.ndarray[Py_ssize_t] col_N): + ...: cdef Py_ssize_t i, n = len(col_N) + ...: assert len(col_a) == len(col_b) == n + ...: cdef np.ndarray[double] res = np.empty(n) + ...: for i in range(n): + ...: res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i]) + ...: return res + ...: + +.. ipython:: python + + %timeit apply_integrate_f_wrap(df['a'], df['b'], df['N']) + +Again we've shaved another third off, so let's have a look at where the time is spent: + +.. ipython:: python + + %prun -l 4 apply_integrate_f_wrap(df['a'], df['b'], df['N']) + +We can see that now all the time appears to be spent in ``apply_integrate_f_wrap`` and not much anywhere else. It would make sense to continue looking here for efficiencies... 
+ +TODO more? Have a 2D ndarray example? + +Using cython has made our calculation around 100 times faster than the original python only version, and yet we're left with something which doesn't look too dissimilar. + +TODO some warning that you don't need to cythonize every function (!) + +Further topics: + +- One can also load in functions from other C modules you've already written. +- More?? + +Read more in the `cython docs `_. \ No newline at end of file diff --git a/doc/sphinxext/ipython_directive.py b/doc/sphinxext/ipython_directive.py index bc3c46dd5cc93..b237341e81125 100644 --- a/doc/sphinxext/ipython_directive.py +++ b/doc/sphinxext/ipython_directive.py @@ -296,11 +296,14 @@ def process_input(self, data, input_prompt, lineno): is_savefig = decorator is not None and \ decorator.startswith('@savefig') - input_lines = input.split('\n') + def _remove_first_space_if_any(line): + return line[1:] if line.startswith(' ') else line + + input_lines = map(_remove_first_space_if_any, input.split('\n')) self.datacontent = data - continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2)) + continuation = ' %s: '%''.join(['.']*(len(str(lineno))+2)) if is_savefig: image_file, image_directive = self.process_image(decorator) From 69330fad485638438b1fc653fe836314c06a6219 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Thu, 20 Jun 2013 01:15:21 +0100 Subject: [PATCH 2/3] FIX add cython to toctree --- doc/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/index.rst b/doc/source/index.rst index 21a79ffdb85fd..ef414d41a4051 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -131,5 +131,6 @@ See the package overview for more detail about what's in the library. 
r_interface related comparison_with_r + cython api From 700d8fb30a65069b8e50ea9b459cb6342984ec8b Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 21 Jun 2013 00:45:57 +0100 Subject: [PATCH 3/3] FIX remove todos change name to Enhancing performance add in some sections --- doc/source/{cython.rst => enhancingperf.rst} | 144 +++++++++++++------ doc/source/index.rst | 2 +- 2 files changed, 100 insertions(+), 46 deletions(-) rename doc/source/{cython.rst => enhancingperf.rst} (51%) diff --git a/doc/source/cython.rst b/doc/source/enhancingperf.rst similarity index 51% rename from doc/source/cython.rst rename to doc/source/enhancingperf.rst index 726843ec04151..db28dfde926bf 100644 --- a/doc/source/cython.rst +++ b/doc/source/enhancingperf.rst @@ -1,4 +1,4 @@ -.. _cython: +.. _enhancingperf: .. currentmodule:: pandas @@ -17,28 +17,42 @@ np.set_printoptions(precision=4, suppress=True) -**************************************** -Cython (Writing C extensions for pandas) -**************************************** +********************* +Enhancing Performance +********************* -For many use cases writing pandas in pure python and numpy is sufficient. In some computationally heavy applications however, it can be possible to achieve sizeable speed-ups by offloading work to `cython `_. +.. _enhancingperf.cython: -- Say something about this being tutorial for "advanced" users? +Cython (Writing C extensions for pandas) +---------------------------------------- -.. note:: +For many use cases writing pandas in pure python and numpy is sufficient. In some +computationally heavy applications however, it can be possible to achieve sizeable +speed-ups by offloading work to `cython `_. - The first thing to do here is to see if we can refactor in python, removing for loops (TODO add some waffle, and maybe trivial example, maybe even just using a for loop rather than apply in this example) a way which could make use of numpy... 
+This tutorial assumes you have refactored as much as possible in python, for example +trying to remove for loops and making use of numpy vectorization, it's always worth +optimising in python first. +This tutorial walks through a "typical" process of cythonizing a slow computation. +We use an `example from the cython documentation `_ +but in the context of pandas. Our final cythonized solution is around 100 times +faster than the pure python. -This tutorial walksthrough a "typical" process of cythonizing a slow computation, we use an `example from the cython documentation `_ in the context of pandas: +.. _enhancingperf.pure: -We have a function, ``integrate_f``, which we want to apply row-wise across a DataFrame, ``df``: +Pure python +~~~~~~~~~~~ + +We have a DataFrame to which we want to apply a function row-wise. .. ipython:: python - df = DataFrame({'x': 'x', 'a': randn(1000), 'b': randn(1000),'N': randint(100, 1000, (1000))}) + df = DataFrame({'a': randn(1000), 'b': randn(1000),'N': randint(100, 1000, (1000)), 'x': 'x'}) df +Here's the function in pure python: + .. ipython:: python def f(x): @@ -51,30 +65,43 @@ We have a function, ``integrate_f``, which we want to apply row-wise across a Da s += f(a + i * dx) return s * dx -In pure pandas we might achieve this using a row-wise ``apply``: +We achieve our result by using ``apply`` (row-wise): .. ipython:: python %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) -Clearly this isn't fast enough for us, so let's take a look and see where the time is spent performing this operation (limited to the most time consuming four calls) using the `prun ipython magic function `_: +But clearly this isn't fast enough for us. Let's take a look and see where the +time is spent during this operation (limited to the most time consuming +four calls) using the `prun ipython magic function `_: .. 
ipython:: python %prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) -By far the majority of time is spend inside either ``integrate_f`` or ``f``, hence we concentrate our efforts cythonizing these two functions. +By far the majority of time is spent inside either ``integrate_f`` or ``f``, +hence we'll concentrate our efforts cythonizing these two functions. .. note:: - In python 2 replacing the ``range`` with its generator counterpart (``xrange``) would mean the ``range`` line would vanish. In python 3 range is already a generator. + In python 2 replacing the ``range`` with its generator counterpart (``xrange``) + would mean the ``range`` line would vanish. In python 3 range is already a generator. -First, let's simply just copy our function over to cython as is (here the ``_plain`` suffix stands for "plain cython", allowing us to distinguish between our cython functions): +.. _enhancingperf.plain: + +Plain cython +~~~~~~~~~~~~ + +First we're going to need to import the cython magic function to ipython: .. ipython:: python %load_ext cythonmagic + +Now, let's simply copy our functions over to cython as is (the suffix +is here to distinguish between function versions): + .. ipython:: In [2]: %%cython ...: def f_plain(x): ...: return x * (x - 1) ...: def integrate_f_plain(a, b, N): ...: s = 0 ...: dx = (b - a) / N ...: for i in range(N): ...: s += f_plain(a + i * dx) ...: return s * dx ...: +.. note:: + + If you're having trouble pasting the above into your ipython, you may need + to be using bleeding edge ipython for paste to play well with cell magics. + + .. ipython:: python %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) +Already this has shaved a third off, not too bad for a simple copy and paste. + +.. _enhancingperf.type: + +Adding type +~~~~~~~~~~~ -We're already shaved a third off, not too bad for a simple copy and paste. We'll get another huge improvement simply by providing type information: +We get another huge improvement simply by providing type information: .. 
ipython:: @@ -114,15 +153,27 @@ We're already shaved a third off, not too bad for a simple copy and paste. We'll %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) -Now, we're talking! Already we're over ten times faster than the original python version, and we haven't *really* modified the code. Let's go back and have another look at what's eating up time now: +Now, we're talking! It's now over ten times faster than the original python +implementation, and we haven't *really* modified the code. Let's have another +look at what's eating up time: .. ipython:: python %prun -l 4 df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) -It's calling series and frames... a lot, in fact they're getting called for every row in the DataFrame. Function calls are expensive in python, so maybe we should cythonize the apply part and see if we can minimise these. +.. _enhancingperf.ndarray: + +Using ndarray +~~~~~~~~~~~~~ + +It's calling series... a lot! It's creating a Series from each row, and get-ting from both +the index and the series (three times for each row). Function calls are expensive +in python, so maybe we could minimise these by cythonizing the apply part. + +.. note:: -We are now passing ndarrays into the cython function, fortunately cython plays very nicely with numpy. TODO mention the ``Py_ssize_t``. + We are now passing ndarrays into the cython function, fortunately cython plays + very nicely with numpy. .. ipython:: @@ -130,14 +181,14 @@ We are now passing ndarrays into the cython function, fortunately cython plays v ...: cimport numpy as np ...: import numpy as np ...: cdef double f_typed(double x) except? 
-2: - ...: return x**2-x + ...: return x * (x - 1) ...: cpdef double integrate_f_typed(double a, double b, int N): ...: cdef int i ...: cdef double s, dx ...: s = 0 - ...: dx = (b-a)/N + ...: dx = (b - a) / N ...: for i in range(N): - ...: s += f_typed(a+i*dx) + ...: s += f_typed(a + i * dx) ...: return s * dx ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N): ...: assert (col_a.dtype == np.float and col_b.dtype == np.float and col_N.dtype == np.int) @@ -150,7 +201,14 @@ We are now passing ndarrays into the cython function, fortunately cython plays v ...: -We create an array of zeros and loop over the rows, applying our ``integrate_f_typed`` function to fill it up. It's worth mentioning here that although a loop like this would be extremely slow in python (TODO: "as we saw" considerably slower than the apply?) while looping over a numpy array in cython is *fast*. +The implementation is simple, it creates an array of zeros and loops over +the rows, applying our ``integrate_f_typed``, and putting this in the zeros array. + + +.. note:: + + A loop like this would be *extremely* slow in python, but in cython looping over + numpy arrays is *fast*. .. ipython:: python @@ -162,9 +220,17 @@ We've gone another three times faster! Let's check again where the time is spent %prun -l 4 apply_integrate_f(df['a'], df['b'], df['N']) -As on might expect, the majority of the time is now spent in ``apply_integrate_f``, so if we wanted to make anymore efficiencies we must continue to concentrate our efforts here... +As one might expect, the majority of the time is now spent in ``apply_integrate_f``, +so if we wanted to make any more efficiencies we must continue to concentrate our +efforts here. + +.. _enhancingperf.boundswrap: -TODO explain decorators, and why they make it so fast! 
+More advanced techniques +~~~~~~~~~~~~~~~~~~~~~~~~ + +There is still scope for improvement, here's an example of using some more +advanced cython techniques: .. ipython:: @@ -173,14 +239,14 @@ TODO explain decorators, and why they make it so fast! ...: cimport numpy as np ...: import numpy as np ...: cdef double f_typed(double x) except? -2: - ...: return x**2-x + ...: return x * (x - 1) ...: cpdef double integrate_f_typed(double a, double b, int N): ...: cdef int i ...: cdef double s, dx ...: s = 0 - ...: dx = (b-a)/N + ...: dx = (b - a) / N ...: for i in range(N): - ...: s += f_typed(a+i*dx) + ...: s += f_typed(a + i * dx) ...: return s * dx ...: @cython.boundscheck(False) ...: @cython.wraparound(False) @@ -197,23 +263,11 @@ TODO explain decorators, and why they make it so fast! %timeit apply_integrate_f_wrap(df['a'], df['b'], df['N']) -Again we've shaved another third off, so let's have a look at where the time is spent: - -.. ipython:: python - - %prun -l 4 apply_integrate_f_wrap(df['a'], df['b'], df['N']) - -We can see that now all the time appears to be spent in ``apply_integrate_f_wrap`` and not much anywhere else. It would make sense to continue looking here for efficiencies... - -TODO more? Have a 2D ndarray example? - -Using cython has made our calculation around 100 times faster than the original python only version, and yet we're left with something which doesn't look too dissimilar. - -TODO some warning that you don't need to cythonize every function (!) +This shaves another third off! -Further topics: +Further topics +~~~~~~~~~~~~~~ -- One can also load in functions from other C modules you've already written. -- More?? +- Loading C modules into cython. Read more in the `cython docs `_. 
\ No newline at end of file diff --git a/doc/source/index.rst b/doc/source/index.rst index ef414d41a4051..67f1a3c1e6312 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -126,11 +126,11 @@ See the package overview for more detail about what's in the library. visualization rplot io + performance sparse gotchas r_interface related comparison_with_r - cython api