diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
new file mode 100644
index 0000000000000..db28dfde926bf
--- /dev/null
+++ b/doc/source/enhancingperf.rst
@@ -0,0 +1,273 @@
+.. _enhancingperf:
+
+.. currentmodule:: pandas
+
+.. ipython:: python
+ :suppress:
+
+ import os
+ import csv
+ from pandas import DataFrame
+ import pandas as pd
+
+ import numpy as np
+ np.random.seed(123456)
+ randn = np.random.randn
+ randint = np.random.randint
+ np.set_printoptions(precision=4, suppress=True)
+
+
+*********************
+Enhancing Performance
+*********************
+
+.. _enhancingperf.cython:
+
+Cython (Writing C extensions for pandas)
+----------------------------------------
+
+For many use cases writing pandas in pure python and numpy is sufficient. In some
+computationally heavy applications, however, it can be possible to achieve sizeable
+speed-ups by offloading work to `cython <http://cython.org/>`_.
+
+This tutorial assumes you have refactored as much as possible in python, for example
+by trying to remove for loops and making use of numpy vectorization. It's always worth
+optimising in python first.
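+
+For instance, a computation written as an explicit python loop can often be expressed
+as a single vectorized numpy expression. A small illustrative sketch (the names here
+are made up for the example):
+
+.. code-block:: python
+
+   import numpy as np
+
+   x = np.random.randn(1000)
+
+   # pure python loop: the arithmetic runs element by element in the interpreter
+   result_loop = sum(v * (v - 1) for v in x)
+
+   # vectorized: the same reduction is carried out in C by numpy
+   result_vec = (x * (x - 1)).sum()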
+
+This tutorial walks through a "typical" process of cythonizing a slow computation.
+We use an `example from the cython documentation <http://docs.cython.org/src/quickstart/cythonize.html>`_
+but in the context of pandas. Our final cythonized solution is around 100 times
+faster than the pure python version.
+
+.. _enhancingperf.pure:
+
+Pure python
+~~~~~~~~~~~
+
+We have a DataFrame to which we want to apply a function row-wise.
+
+.. ipython:: python
+
+ df = DataFrame({'a': randn(1000), 'b': randn(1000), 'N': randint(100, 1000, (1000)), 'x': 'x'})
+ df
+
+Here's the function in pure python:
+
+.. ipython:: python
+
+   def f(x):
+       return x * (x - 1)
+
+   def integrate_f(a, b, N):
+       s = 0
+       dx = (b - a) / N
+       for i in range(N):
+           s += f(a + i * dx)
+       return s * dx
+
+We achieve our result by using ``apply`` (row-wise):
+
+.. ipython:: python
+
+ %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)
+
+But clearly this isn't fast enough for us. Let's take a look and see where the
+time is spent during this operation (limited to the four most time-consuming
+calls) using the ``%prun`` ipython magic function:
+
+.. ipython:: python
+
+ %prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)
+
+By far the majority of time is spent inside either ``integrate_f`` or ``f``,
+hence we'll concentrate our efforts on cythonizing these two functions.
+
+.. note::
+
+ In python 2, replacing ``range`` with its lazy counterpart ``xrange`` would mean
+ the ``range`` line would vanish from the profile. In python 3, ``range`` is already lazy.
+
+.. _enhancingperf.plain:
+
+Plain cython
+~~~~~~~~~~~~
+
+First we're going to need to import the cython magic function into ipython:
+
+.. ipython:: python
+
+ %load_ext cythonmagic
+
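+If you are working outside of ipython, a similar compile-on-import workflow is available
+through cython's ``pyximport`` module. A minimal sketch, assuming the functions are saved
+in a hypothetical ``integrate_cy.pyx`` file:
+
+.. code-block:: python
+
+   import pyximport
+   pyximport.install()   # compile .pyx modules automatically when they are imported
+
+   import integrate_cy   # hypothetical module built from integrate_cy.pyx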
+
+Now, let's simply copy our functions over to cython as is (the ``_plain`` suffix
+is here to distinguish between function versions):
+
+.. ipython::
+
+   In [2]: %%cython
+      ...: def f_plain(x):
+      ...:     return x * (x - 1)
+      ...: def integrate_f_plain(a, b, N):
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_plain(a + i * dx)
+      ...:     return s * dx
+      ...:
+
+.. note::
+
+ If you're having trouble pasting the above into your ipython session, you may need
+ a bleeding edge version of ipython for paste to play well with cell magics.
+
+
+.. ipython:: python
+
+ %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1)
+
+Already this has shaved a third off, not too bad for a simple copy and paste.
+
+.. _enhancingperf.type:
+
+Adding type
+~~~~~~~~~~~
+
+We get another huge improvement simply by providing type information:
+
+.. ipython::
+
+   In [3]: %%cython
+      ...: cdef double f_typed(double x) except? -2:
+      ...:     return x * (x - 1)
+      ...: cpdef double integrate_f_typed(double a, double b, int N):
+      ...:     cdef int i
+      ...:     cdef double s, dx
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_typed(a + i * dx)
+      ...:     return s * dx
+      ...:
+
+.. ipython:: python
+
+ %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)
+
+Now, we're talking! It's now over ten times faster than the original python
+implementation, and we haven't *really* modified the code. Let's have another
+look at what's eating up time:
+
+.. ipython:: python
+
+ %prun -l 4 df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)
+
+.. _enhancingperf.ndarray:
+
+Using ndarray
+~~~~~~~~~~~~~
+
+It's calling series... a lot! It's creating a Series from each row, and calling ``get``
+on both the index and the series (three times for each row). Function calls are expensive
+in python, so maybe we can minimise these by cythonizing the apply part.
+
+.. note::
+
+ We are now passing ndarrays into the cython function; fortunately cython plays
+ very nicely with numpy.
+
+.. ipython::
+
+   In [4]: %%cython
+      ...: cimport numpy as np
+      ...: import numpy as np
+      ...: cdef double f_typed(double x) except? -2:
+      ...:     return x * (x - 1)
+      ...: cpdef double integrate_f_typed(double a, double b, int N):
+      ...:     cdef int i
+      ...:     cdef double s, dx
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_typed(a + i * dx)
+      ...:     return s * dx
+      ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N):
+      ...:     assert (col_a.dtype == np.float and col_b.dtype == np.float and col_N.dtype == np.int)
+      ...:     cdef Py_ssize_t i, n = len(col_N)
+      ...:     assert (len(col_a) == len(col_b) == n)
+      ...:     cdef np.ndarray[double] res = np.empty(n)
+      ...:     for i in range(len(col_a)):
+      ...:         res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i])
+      ...:     return res
+      ...:
+
+
+The implementation is simple: it creates an empty result array and loops over
+the rows, applying our ``integrate_f_typed`` and putting the result in the array.
+
+
+.. note::
+
+ A loop like this would be *extremely* slow in python, but in cython looping over
+ numpy arrays is *fast*.
+
+.. ipython:: python
+
+ %timeit apply_integrate_f(df['a'], df['b'], df['N'])
+
+We've gone another three times faster! Let's check again where the time is spent:
+
+.. ipython:: python
+
+ %prun -l 4 apply_integrate_f(df['a'], df['b'], df['N'])
+
+As one might expect, the majority of the time is now spent in ``apply_integrate_f``,
+so if we wanted to make any further gains we would have to continue to concentrate
+our efforts here.
+
+.. _enhancingperf.boundswrap:
+
+More advanced techniques
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+There is still scope for improvement. Here's an example of using some more
+advanced cython techniques:
+
+.. ipython::
+
+   In [5]: %%cython
+      ...: cimport cython
+      ...: cimport numpy as np
+      ...: import numpy as np
+      ...: cdef double f_typed(double x) except? -2:
+      ...:     return x * (x - 1)
+      ...: cpdef double integrate_f_typed(double a, double b, int N):
+      ...:     cdef int i
+      ...:     cdef double s, dx
+      ...:     s = 0
+      ...:     dx = (b - a) / N
+      ...:     for i in range(N):
+      ...:         s += f_typed(a + i * dx)
+      ...:     return s * dx
+      ...: @cython.boundscheck(False)
+      ...: @cython.wraparound(False)
+      ...: cpdef np.ndarray[double] apply_integrate_f_wrap(np.ndarray[double] col_a, np.ndarray[double] col_b, np.ndarray[Py_ssize_t] col_N):
+      ...:     cdef Py_ssize_t i, n = len(col_N)
+      ...:     assert len(col_a) == len(col_b) == n
+      ...:     cdef np.ndarray[double] res = np.empty(n)
+      ...:     for i in range(n):
+      ...:         res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i])
+      ...:     return res
+      ...:
+
+.. ipython:: python
+
+ %timeit apply_integrate_f_wrap(df['a'], df['b'], df['N'])
+
+This shaves another third off!
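+
+Note that ``apply_integrate_f_wrap`` declares its arguments as typed ndarrays, so the
+columns need to carry the matching dtypes. If you are on a pandas version where a
+``Series`` is not accepted directly by the typed buffer arguments, a reasonable
+adaptation (a sketch, not part of the original timings) is to pass the underlying
+numpy arrays explicitly:
+
+.. code-block:: python
+
+   # illustrative only -- pass the underlying numpy arrays rather than Series objects
+   apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values)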
+
+Further topics
+~~~~~~~~~~~~~~
+
+- Loading C modules into cython (a minimal sketch follows below).
+
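+Calling plain C from cython typically goes through a ``cdef extern`` block. Here is a
+minimal, illustrative sketch (not part of the tutorial's timings) that wraps ``sin``
+from the C math library:
+
+.. code-block:: cython
+
+   # declare the C function we want to call
+   cdef extern from "math.h":
+       double sin(double x)
+
+   cpdef double sin_of_square(double x):
+       # calls the C library's sin directly, with no python-level overhead
+       return sin(x * x)
+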
+Read more in the `cython docs <http://docs.cython.org/>`_.
\ No newline at end of file
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 21a79ffdb85fd..67f1a3c1e6312 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -126,6 +126,7 @@ See the package overview for more detail about what's in the library.
visualization
rplot
io
+ enhancingperf
sparse
gotchas
r_interface
diff --git a/doc/sphinxext/ipython_directive.py b/doc/sphinxext/ipython_directive.py
index bc3c46dd5cc93..b237341e81125 100644
--- a/doc/sphinxext/ipython_directive.py
+++ b/doc/sphinxext/ipython_directive.py
@@ -296,11 +296,14 @@ def process_input(self, data, input_prompt, lineno):
is_savefig = decorator is not None and \
decorator.startswith('@savefig')
- input_lines = input.split('\n')
+ def _remove_first_space_if_any(line):
+ return line[1:] if line.startswith(' ') else line
+
+ input_lines = map(_remove_first_space_if_any, input.split('\n'))
self.datacontent = data
- continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2))
+ continuation = ' %s: '%''.join(['.']*(len(str(lineno))+2))
if is_savefig:
image_file, image_directive = self.process_image(decorator)