From 7e7d786763f5b137ae6697759d2448ede7abe055 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 16 Sep 2019 07:48:05 -0500
Subject: [PATCH 01/16] DOC: Add scaling to large datasets section

Closes https://github.com/pandas-dev/pandas/issues/28315
---
 doc/.gitignore                  |   4 +
 doc/source/index.rst.template   |   1 +
 doc/source/user_guide/scale.rst | 343 ++++++++++++++++++++++++++++++++
 environment.yml                 |   2 +
 pandas/util/testing.py          |  81 ++++++++
 5 files changed, 431 insertions(+)
 create mode 100644 doc/.gitignore
 create mode 100644 doc/source/user_guide/scale.rst

diff --git a/doc/.gitignore b/doc/.gitignore
new file mode 100644
index 0000000000000..66f657d848211
--- /dev/null
+++ b/doc/.gitignore
@@ -0,0 +1,4 @@
+data/
+timeseries.csv
+timeseries.parquet
+timeseries_wide.parquet
\ No newline at end of file
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index f5669626aa2b3..6ff42eee9dad2 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -83,6 +83,7 @@ See the :ref:`overview` for more detail about what's in the library.
   * :doc:`user_guide/style`
   * :doc:`user_guide/options`
   * :doc:`user_guide/enhancingperf`
+  * :doc:`user_guide/scale`
   * :doc:`user_guide/sparse`
   * :doc:`user_guide/gotchas`
   * :doc:`user_guide/cookbook`
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
new file mode 100644
index 0000000000000..e2ca4b79106cd
--- /dev/null
+++ b/doc/source/user_guide/scale.rst
@@ -0,0 +1,343 @@
+.. _scale:
+
+*************************
+Scaling to large datasets
+*************************
+
+Pandas provide data structures for in-memory analytics. This makes using pandas
+to analyze larger than memory datasets somewhat tricky.
+
+This document provides a few recommendations for scaling to larger datasets.
+It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis
+for datasets that fit in memory.
+
+But first, it's worth considering *not using pandas*. Pandas isn't the right
+tool for all situations. If you're working with very large datasets and a tool
+like PostgreSQL fits your needs, then you should probably be using that.
+Assuming you want or need the expressivity and power of pandas, let's carry on.
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+   from pandas.util.testing import make_timeseries
+
+
+Use more efficient file formats
+-------------------------------
+
+Depending on your workload, data loading may be a bottleneck. In these case you
+might consider switching from a slow format like CSV to a faster format like
+Parquet. Loading from a file format like Parquet will also require less memory
+usage, letting you load larger datasets into pandas before running out of
+memory.
+
+.. ipython:: python
+
+   # Make a random in-memory dataset
+   ts = make_timeseries(freq="30S", seed=0)
+   ts
+
+
+We'll now write and read the file using CSV and parquet.
+
+
+.. ipython:: python
+
+   %time ts.to_csv("timeseries.csv")
+
+.. ipython:: python
+
+   %time ts2 = pd.read_csv("timeseries.csv", index_col="timestamp", parse_dates=["timestamp"])
+
+.. ipython:: python
+
+   %time ts.to_parquet("timeseries.parquet")
+
+.. ipython:: python
+
+   %time _ = pd.read_parquet("timeseries.parquet")
+
+Notice that parquet gives much higher performance for reading and writing, both
+in terms of speed and lower peak memory usage. See :ref:`io` for more.
+
+Load less data
+--------------
+
+Suppose our raw dataset on disk has many columns, but we need just a subset
+for our analysis. To get those columns, we can either
+
+1. Load the entire dataset then select those columns.
+2. Just load the columns we need.
+
+Loading just the columns you need can be much faster and requires less memory.
+
+.. ipython:: python
+
+   # make a similar dataset with many columns
+   timeseries = [
+       make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}")
+       for i in range(10)
+   ]
+   ts_wide = pd.concat(timeseries, axis=1)
+   ts_wide.head()
+   ts_wide.to_parquet("timeseries_wide.parquet")
+
+
+Option 1 loads in all the data and then filters to what we need.
+
+.. ipython:: python
+
+   columns = ['id_0', 'name_0', 'x_0', 'y_0']
+   
+   %time _ = pd.read_parquet("timeseries_wide.parquet")[columns]
+
+Option 2 only loads the columns we request. This is faster and has a lower peak
+memory usage, since the entire dataset isn't in memory at once.
+
+.. ipython:: python
+
+   %time _ = pd.read_parquet("timeseries_wide.parquet", columns=columns)
+
+
+With :func:`pandas.read_csv`, you can specify ``usecols`` to limit the columns
+read into memory.
+
+
+Use efficient datatypes
+-----------------------
+
+The default pandas data types are not the most memory efficient. This is
+especially true for high-cardinality text data (columns with relatively few
+unique values). By using more efficient data types you can store larger datasets
+in memory.
+
+.. ipython:: python
+
+   ts.dtypes
+
+.. ipython:: python
+
+   ts.memory_usage(deep=True)  # memory usage in bytes
+
+
+The ``name`` column is taking up much more memory than any other. It has just a
+few unique values, so it's a good candidate for converting to a
+:class:`Categorical`. With a Categorical, we store each unique name once and use
+space-efficient integers to know which specific name is used in each row.
+
+
+.. ipython:: python
+
+   ts2 = ts.copy()
+   ts2['name'] = ts2['name'].astype('category')
+   ts2.memory_usage(deep=True)
+
+We can go a bit further and downcast the numeric columns to their smallest types
+using :func:`pandas.to_numeric`.
+
+.. ipython:: python
+
+   ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned')
+   ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float')
+   ts2.dtypes
+
+.. ipython:: python
+
+   ts2.memory_usage(deep=True)
+
+.. ipython:: python
+
+   reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum()
+   print(f"{reduction:0.2f}")
+
+In all, we've reduced the in-memory footprint of this dataset to 1/5 of its
+original size.
+
+See :ref:`categorical` for more on ``Categorical`` and :ref:`basics.dtypes`
+for an overview of all of pandas' dtypes.
+
+Use Other libraries
+-------------------
+
+Pandas is just one library offering a DataFrame API. Because of its popularity,
+pandas' API has become something of a standard that other libraries implement.
+
+For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a
+pandas-like API for working with larger than memory datasets in parallel. Dask
+can use multiple threads or processes on a single machine, or a cluster of
+machines to process data in parallel.
+
+Let's make a larger dataset on disk (as parquet files) that's split into chunks,
+one per year.
+
+.. ipython:: python
+             
+   import pathlib
+   
+   N = 12
+   starts = [f'20{i:>02d}-01-01' for i in range(N)]
+   ends = [f'20{i:>02d}-12-13' for i in range(N)]
+   
+   pathlib.Path("data/timeseries").mkdir(exist_ok=True)
+   
+   for i, (start, end) in enumerate(zip(starts, ends)):
+       ts = make_timeseries(start=start, end=end, freq='1T', seed=i)
+       ts.to_parquet(f"data/timeseries/ts-{i}.parquet")
+   
+We'll import ``dask.dataframe`` and notice that the API feels similar to pandas.
+We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in.
+
+.. ipython:: python
+
+   import dask.dataframe as dd
+
+   ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow")
+   ddf
+
+Inspecting the ``ddf`` object, we see a few things
+
+* There are familiar attributes like ``.columns`` and ``.dtypes``
+* There are familiar methods like ``.groupby``, ``.sum``, etc.
+* There are new attributes like ``.npartitions`` and ``.divisions``
+
+The partitions and divisions are how Dask parallizes computation. A **Dask**
+DataFrame is made up of many **Pandas** DataFrames. A single method call on a
+Dask DataFrame ends up making many pandas method calls, and Dask knows how to
+coordinate everything to get the result.
+
+.. ipython:: python
+
+   ddf.columns
+   ddf.dtypes
+   ddf.npartitions
+
+One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the
+repr above, you'll notice that the values aren't actually printed out; just the
+column names and dtypes. That's because Dask hasn't actually read the data yet.
+Rather than executing immediately, doing operations build up a **task graph**.
+
+.. ipython:: python
+
+   ddf
+   ddf['name']
+   ddf['name'].value_counts()
+
+Each of these calls is instant because the result isn't being computed yet.
+We're just building up a list of computation to do when someone needs the
+result. Dask knows that the return type of a ``pandas.Series.value_counts``
+is a pandas Series with a certain dtype and a certain name. So the Dask version
+returns a Dask Series with the same dtype and the same name.
+
+To get the actual result you can call ``.compute()``.
+             
+.. ipython:: python
+             
+   %time ddf['name'].value_counts().compute()
+
+At that point, the full task graph (reading in data, selecting the columns,
+doing the ``value_counts``) is executed *in parallel*. You get back the same
+thing you'd get back from pandas, in this case a concrete pandas Series with the
+count of each ``name``.
+
+By default, ``dask.dataframe`` operations use a threadpool to do operations in
+parallel. We can also connect to a cluster to distribute the work on many
+machines. In this case we'll connect to a local "cluster" made up of several
+processes on this single machine.
+
+.. ipython:: python
+
+   from dask.distributed import Client, LocalCluster
+   
+   cluster = LocalCluster()
+   client = Client(cluster)
+   client
+
+Once this ``client`` is created, all of Dask's computation will take place on
+the cluster (which is just processes in this case).
+
+Dask implements the most used parts of the pandas API. For example, we can do
+a familiar groupby aggregation.
+
+.. ipython:: python
+
+   %time ddf.groupby('name')[['x', 'y']].mean().compute().head()
+
+The grouping and aggregation is done out-of-core and in parallel.
+
+When Dask knows the ``divisions`` of a dataset, certain optimizations are
+possible. When reading parquet datasets written by dask, the divisions will be
+known automatically. In this case, since we created the parquet files manually,
+we need to supply the divisions manually.
+
+.. ipython:: python
+
+   divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),)
+   ddf.divisions = divisions
+   ddf
+
+Now we can do things like fast random access with ``.loc``.
+
+.. ipython:: python
+
+   ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute()
+
+Dask knows to just look in the 3rd partition for selecting values in `2002`. It
+doesn't need to look at any other data.
+
+Many workflows involve a large amount of data and processing it in a way that
+reduces the size to something that fits in memory. In this case, we'll resample
+to daily frequency and take the mean. Once we've taken the mean, we know the
+results will fit in memory, so we can safely call ``compute`` without running
+out of memory. At that point it's just a regular pandas object.
+
+.. ipython:: python
+
+   @savefig dask_resample.png
+   ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot()
+
+These Dask examples have all be done using multiple processes on a single
+machine. Dask can be `deployed on a cluster
+<https://docs.dask.org/en/latest/setup.html>`_ to scale up to even larger
+datasets.
+
+You see more dask examples at https://examples.dask.org.
+
+Use chunking
+------------
+
+If using another library like Dask is not an option, you can achieve similar
+results with a bit of work.
+
+For example, we can recreate the out-of-core ``value_counts`` we did earlier
+with Dask. The peak memory usage of this will be the size of the single largest
+DataFrame.
+
+.. ipython:: python
+
+   files = list(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
+   files
+
+.. ipython:: python
+
+   %%time
+   counts = pd.Series(dtype=int)
+   for path in files:
+       df = pd.read_parquet(path)
+       counts = counts.add(df['name'].value_counts(), fill_value=0)
+   counts.astype(int)
+
+This matches the counts we saw above with Dask.
+
+Some readers, like :meth:`pandas.read_csv` offer parameters to control the
+``chunksize``. Manually chunking is an OK option for workflows that don't
+require too sophisticated of operations. Some operations, like ``groupby``, are
+much harder to do chunkwise. In these cases, you may be better switching to a
+library like Dask, which implements these chunked algorithms for you.
+
+.. ipython:: python
+
+   del client, cluster
+
+.. _Dask: https://dask.org
+.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html
diff --git a/environment.yml b/environment.yml
index 7629fa52e7829..e43def5e29dc7 100644
--- a/environment.yml
+++ b/environment.yml
@@ -35,6 +35,8 @@ dependencies:
   - nbconvert>=5.4.1
   - nbsphinx
   - pandoc
+  - dask
+  - distributed
 
   # web (jinja2 is also needed, but it's also an optional pandas dependency)
   - markdown
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index af726caa52e88..7240a81d2b7ae 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1652,6 +1652,87 @@ def makeMultiIndex(k=10, names=None, **kwargs):
     return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs)
 
 
+_names = [
+    "Alice",
+    "Bob",
+    "Charlie",
+    "Dan",
+    "Edith",
+    "Frank",
+    "George",
+    "Hannah",
+    "Ingrid",
+    "Jerry",
+    "Kevin",
+    "Laura",
+    "Michael",
+    "Norbert",
+    "Oliver",
+    "Patricia",
+    "Quinn",
+    "Ray",
+    "Sarah",
+    "Tim",
+    "Ursula",
+    "Victor",
+    "Wendy",
+    "Xavier",
+    "Yvonne",
+    "Zelda",
+]
+
+
+def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
+    """
+    Make a DataFrame with a DatetimeIndex
+
+    Parameters
+    ----------
+    start : str or Timestamp, default "2000-01-01"
+        The start of the index. Passed to date_range with `freq`.
+    end : str or Timestamp, default "2000-12-31"
+        The end of the index. Passed to date_range with `freq`.
+    freq : str or Freq
+        The frequency to use for the DatetimeIndex
+    seed : int, optional
+        The random state seed.
+
+        * name : object dtype with string names
+        * id : int dtype with
+        * x, y : float dtype
+
+    Examples
+    --------
+    >>> make_timeseries()
+                  id    name         x         y
+    timestamp
+    2000-01-01   982   Frank  0.031261  0.986727
+    2000-01-02  1025   Edith -0.086358 -0.032920
+    2000-01-03   982   Edith  0.473177  0.298654
+    2000-01-04  1009   Sarah  0.534344 -0.750377
+    2000-01-05   963   Zelda -0.271573  0.054424
+    ...          ...     ...       ...       ...
+    2000-12-27   980  Ingrid -0.132333 -0.422195
+    2000-12-28   972   Frank -0.376007 -0.298687
+    2000-12-29  1009  Ursula -0.865047 -0.503133
+    2000-12-30  1000  Hannah -0.063757 -0.507336
+    2000-12-31   972     Tim -0.869120  0.531685
+    """
+    index = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
+    n = len(index)
+    state = np.random.RandomState(seed)
+    columns = {
+        "name": state.choice(_names, size=n),
+        "id": state.poisson(1000, size=n),
+        "x": state.rand(n) * 2 - 1,
+        "y": state.rand(n) * 2 - 1,
+    }
+    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
+    if df.index[-1] == end:
+        df = df.iloc[:-1]
+    return df
+
+
 def all_index_generator(k=10):
     """Generator which can be iterated over to get instances of all the various
     index classes.

From 506edd1f6685d50aa5eafc4c5a694013cca2ad4f Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 07:42:17 -0500
Subject: [PATCH 02/16] emphasize memory

---
 doc/source/user_guide/scale.rst | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index e2ca4b79106cd..89c178dd1a07b 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -235,10 +235,14 @@ To get the actual result you can call ``.compute()``.
              
    %time ddf['name'].value_counts().compute()
 
-At that point, the full task graph (reading in data, selecting the columns,
-doing the ``value_counts``) is executed *in parallel*. You get back the same
-thing you'd get back from pandas, in this case a concrete pandas Series with the
-count of each ``name``.
+At that point, you get back the same thing you'd get with pandas, in this case
+a concrete pandas Series with the count of each ``name``.
+
+Calling ``.compute`` causes the full task graph to be executed. This includes
+reading the data, selecting the columns, and doing the ``value_counts``. The
+execution is done *in parallel* where possible, and Dask tries to keep the
+overall memory footprint small. You can work with datasets that are much larger
+than memory, as long as each partition (a regular pandas DataFrame) fits in memory.
 
 By default, ``dask.dataframe`` operations use a threadpool to do operations in
 parallel. We can also connect to a cluster to distribute the work on many

From 35a4ddec6e76cbf2bdf1327f771ff83f5fdcf079 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 10:04:28 -0500
Subject: [PATCH 03/16] fixups

---
 doc/source/user_guide/scale.rst | 117 ++++++++++++++++++--------------
 requirements-dev.txt            |   2 +
 2 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 89c178dd1a07b..fcf62bb9d8e3e 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -48,7 +48,8 @@ We'll now write and read the file using CSV and parquet.
 
 .. ipython:: python
 
-   %time ts2 = pd.read_csv("timeseries.csv", index_col="timestamp", parse_dates=["timestamp"])
+   col="timestamp"
+   %time pd.read_csv("timeseries.csv", index_col=col, parse_dates=[col])
 
 .. ipython:: python
 
@@ -89,7 +90,7 @@ Option 1 loads in all the data and then filters to what we need.
 .. ipython:: python
 
    columns = ['id_0', 'name_0', 'x_0', 'y_0']
-   
+
    %time _ = pd.read_parquet("timeseries_wide.parquet")[columns]
 
 Option 2 only loads the columns we request. This is faster and has a lower peak
@@ -148,7 +149,8 @@ using :func:`pandas.to_numeric`.
 
 .. ipython:: python
 
-   reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum()
+   reduction = (ts2.memory_usage(deep=True).sum() /
+                ts.memory_usage(deep=True).sum())
    print(f"{reduction:0.2f}")
 
 In all, we've reduced the in-memory footprint of this dataset to 1/5 of its
@@ -157,34 +159,78 @@ original size.
 See :ref:`categorical` for more on ``Categorical`` and :ref:`basics.dtypes`
 for an overview of all of pandas' dtypes.
 
-Use Other libraries
--------------------
+Use chunking
+------------
 
-Pandas is just one library offering a DataFrame API. Because of its popularity,
-pandas' API has become something of a standard that other libraries implement.
+Some workloads can be achieved with chunking: splitting a large problem like "convert this
+directory of CSVs to parquet" into a bunch of small problems ("convert this individual parquet
+file into a CSV. Now repeat that for each file in this directory."). As long as each chunk
+fits in memory, you can work with datasets that are much larger than memory.
 
-For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a
-pandas-like API for working with larger than memory datasets in parallel. Dask
-can use multiple threads or processes on a single machine, or a cluster of
-machines to process data in parallel.
+.. note::
+
+   Chunking works well when the operation you're performing requires zero or minimal
+   coordination between chunks. For more complicated workflows, you're better off
+   :ref:`using another library <scale.other_libraries>`.
 
 Let's make a larger dataset on disk (as parquet files) that's split into chunks,
 one per year.
 
 .. ipython:: python
-             
+
    import pathlib
-   
+
    N = 12
    starts = [f'20{i:>02d}-01-01' for i in range(N)]
    ends = [f'20{i:>02d}-12-13' for i in range(N)]
-   
+
    pathlib.Path("data/timeseries").mkdir(exist_ok=True)
-   
+
    for i, (start, end) in enumerate(zip(starts, ends)):
        ts = make_timeseries(start=start, end=end, freq='1T', seed=i)
        ts.to_parquet(f"data/timeseries/ts-{i}.parquet")
-   
+
+   files = list(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
+   files
+
+Now we'll implement an out-of-core ``value_counts``. The peak memory usage of this
+workflow is the single largest chunk, plus a small series storing the unique value
+counts up to this point.
+
+
+.. ipython:: python
+
+   %%time
+   counts = pd.Series(dtype=int)
+   for path in files:
+       # Only one dataframe is in memory at a time...
+       df = pd.read_parquet(path)
+       # ... plus a small Series `counts`, which is updated.
+       counts = counts.add(df['name'].value_counts(), fill_value=0)
+   counts.astype(int)
+
+Some readers, like :meth:`pandas.read_csv` offer parameters to control the
+``chunksize``. Manually chunking is an OK option for workflows that don't
+require too sophisticated of operations. Some operations, like ``groupby``, are
+much harder to do chunkwise. In these cases, you may be better switching to a
+different library that implements these out-of-core algorithms for you.
+
+.. _scale.other_libraries:
+
+Use Other libraries
+-------------------
+
+Pandas is just one library offering a DataFrame API. Because of its popularity,
+pandas' API has become something of a standard that other libraries implement.
+The pandas documentation maintains a list of libraries implemetning a DataFrame API
+in :ref:`our ecosystem page <ecosystem.out-of-core>`.
+
+For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a
+pandas-like API for working with larger than memory datasets in parallel. Dask
+can use multiple threads or processes on a single machine, or a cluster of
+machines to process data in parallel.
+
+
 We'll import ``dask.dataframe`` and notice that the API feels similar to pandas.
 We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in.
 
@@ -230,9 +276,9 @@ is a pandas Series with a certain dtype and a certain name. So the Dask version
 returns a Dask Series with the same dtype and the same name.
 
 To get the actual result you can call ``.compute()``.
-             
+
 .. ipython:: python
-             
+
    %time ddf['name'].value_counts().compute()
 
 At that point, you get back the same thing you'd get with pandas, in this case
@@ -252,7 +298,7 @@ processes on this single machine.
 .. ipython:: python
 
    from dask.distributed import Client, LocalCluster
-   
+
    cluster = LocalCluster()
    client = Client(cluster)
    client
@@ -307,39 +353,8 @@ datasets.
 
 You see more dask examples at https://examples.dask.org.
 
-Use chunking
-------------
-
-If using another library like Dask is not an option, you can achieve similar
-results with a bit of work.
-
-For example, we can recreate the out-of-core ``value_counts`` we did earlier
-with Dask. The peak memory usage of this will be the size of the single largest
-DataFrame.
-
-.. ipython:: python
-
-   files = list(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
-   files
-
-.. ipython:: python
-
-   %%time
-   counts = pd.Series(dtype=int)
-   for path in files:
-       df = pd.read_parquet(path)
-       counts = counts.add(df['name'].value_counts(), fill_value=0)
-   counts.astype(int)
-
-This matches the counts we saw above with Dask.
-
-Some readers, like :meth:`pandas.read_csv` offer parameters to control the
-``chunksize``. Manually chunking is an OK option for workflows that don't
-require too sophisticated of operations. Some operations, like ``groupby``, are
-much harder to do chunkwise. In these cases, you may be better switching to a
-library like Dask, which implements these chunked algorithms for you.
-
 .. ipython:: python
+   :suppress:
 
    del client, cluster
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index fd8e6378240b4..edd9931e35025 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -17,6 +17,8 @@ numpydoc>=0.9.0
 nbconvert>=5.4.1
 nbsphinx
 pandoc
+dask
+distributed
 markdown
 feedparser
 pyyaml

From efb3260ab4485b8338c8ac00df2aca077fec72a9 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 10:34:50 -0500
Subject: [PATCH 04/16] code checks

---
 doc/source/user_guide/scale.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index fcf62bb9d8e3e..84cad194f67df 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -48,7 +48,7 @@ We'll now write and read the file using CSV and parquet.
 
 .. ipython:: python
 
-   col="timestamp"
+   col = "timestamp"
    %time pd.read_csv("timeseries.csv", index_col=col, parse_dates=[col])
 
 .. ipython:: python
@@ -149,8 +149,8 @@ using :func:`pandas.to_numeric`.
 
 .. ipython:: python
 
-   reduction = (ts2.memory_usage(deep=True).sum() /
-                ts.memory_usage(deep=True).sum())
+   reduction = (ts2.memory_usage(deep=True).sum()
+                / ts.memory_usage(deep=True).sum())
    print(f"{reduction:0.2f}")
 
 In all, we've reduced the in-memory footprint of this dataset to 1/5 of its

From 3201f4204799cc18bc0dbb680e952e1bbd7db459 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 11:14:55 -0500
Subject: [PATCH 05/16] bump pyarrow

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index e43def5e29dc7..99cbc154d9864 100644
--- a/environment.yml
+++ b/environment.yml
@@ -78,7 +78,7 @@ dependencies:
   - html5lib  # pandas.read_html
   - lxml  # pandas.read_html
   - openpyxl  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
-  - pyarrow>=0.9.0  # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
+  - pyarrow>=0.13.1  # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
   - pyqt>=5.9.2  # pandas.read_clipboard
   - pytables>=3.4.2  # pandas.read_hdf, DataFrame.to_hdf
   - python-snappy  # required by pyarrow

From eae9593b1fe770ad016ff19679eb5ab2dc0a7380 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 11:44:07 -0500
Subject: [PATCH 06/16] update deps

---
 requirements-dev.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index edd9931e35025..33158cfc31012 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -50,7 +50,7 @@ fastparquet>=0.2.1
 html5lib
 lxml
 openpyxl
-pyarrow>=0.9.0
+pyarrow>=0.13.1
 pyqt5>=5.9.2
 tables>=3.4.2
 python-snappy

From 68ff6eecd4ca77a58efba7f8d864af967d9bc9ed Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 11:44:57 -0500
Subject: [PATCH 07/16] include in user_guide index

---
 doc/source/user_guide/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
index 05df83decbd7e..b86961a71433b 100644
--- a/doc/source/user_guide/index.rst
+++ b/doc/source/user_guide/index.rst
@@ -38,6 +38,7 @@ Further information on any specific method can be obtained in the
     style
     options
     enhancingperf
+    scale
     sparse
     gotchas
     cookbook

From a4baa41ec41a3c279bf88b7a677f6fb339a8d1f6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 15:35:43 -0500
Subject: [PATCH 08/16] update

---
 doc/source/user_guide/scale.rst | 128 +++++++++++++++++++++-----------
 pandas/util/testing.py          |   4 +-
 2 files changed, 87 insertions(+), 45 deletions(-)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 84cad194f67df..565d4639e2544 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -4,10 +4,10 @@
 Scaling to large datasets
 *************************
 
-Pandas provide data structures for in-memory analytics. This makes using pandas
+Pandas provide data structures for in-memory analytics, which makes using pandas
 to analyze larger than memory datasets somewhat tricky.
 
-This document provides a few recommendations for scaling to larger datasets.
+This document provides a few recommendations for scaling your analysis to larger datasets.
 It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis
 for datasets that fit in memory.
 
@@ -20,7 +20,11 @@ Assuming you want or need the expressivity and power of pandas, let's carry on.
 
    import pandas as pd
    import numpy as np
-   from pandas.util.testing import make_timeseries
+
+.. ipython:: python
+   :suppress:
+
+   from pandas.util.testing import _make_timeseries
 
 
 Use more efficient file formats
@@ -33,58 +37,78 @@ usage, letting you load larger datasets into pandas before running out of
 memory.
 
 .. ipython:: python
+   :suppress:
 
    # Make a random in-memory dataset
-   ts = make_timeseries(freq="30S", seed=0)
-   ts
-
-
-We'll now write and read the file using CSV and parquet.
-
-
-.. ipython:: python
-
-   %time ts.to_csv("timeseries.csv")
+   ts = _make_timeseries(freq="30S", seed=0)
+   ts.to_csv("timeseries.csv")
+   ts.to_parquet("timeseries.parquet")
+
+For example, suppose we have a dataset like the following::
+
+                          id      name         x         y
+   timestamp
+   2000-01-01 00:00:00  1029   Michael  0.278837  0.247932
+   2000-01-01 00:00:30  1010  Patricia  0.077144  0.490260
+   2000-01-01 00:01:00  1001    Victor  0.214525  0.258635
+   2000-01-01 00:01:30  1018     Alice -0.646866  0.822104
+   2000-01-01 00:02:00   991       Dan  0.902389  0.466665
+   ...                   ...       ...       ...       ...
+   2000-12-30 23:58:00   992     Sarah  0.721155  0.944118
+   2000-12-30 23:58:30  1007    Ursula  0.409277  0.133227
+   2000-12-30 23:59:00  1009    Hannah -0.452802  0.184318
+   2000-12-30 23:59:30   978     Kevin -0.904728 -0.179146
+   2000-12-31 00:00:00   973    Ingrid -0.370763 -0.794667
+
+That dataset has been stored on disk as CSV and Parquet. We want to
+compare the performance of reading those two formats.
 
 .. ipython:: python
 
    col = "timestamp"
    %time pd.read_csv("timeseries.csv", index_col=col, parse_dates=[col])
 
-.. ipython:: python
-
-   %time ts.to_parquet("timeseries.parquet")
-
 .. ipython:: python
 
    %time _ = pd.read_parquet("timeseries.parquet")
 
-Notice that parquet gives much higher performance for reading and writing, both
+Notice that parquet gives higher performance for reading (and writing), both
 in terms of speed and lower peak memory usage. See :ref:`io` for more.
 
 Load less data
 --------------
 
-Suppose our raw dataset on disk has many columns, but we need just a subset
-for our analysis. To get those columns, we can either
-
-1. Load the entire dataset then select those columns.
-2. Just load the columns we need.
-
-Loading just the columns you need can be much faster and requires less memory.
-
 .. ipython:: python
+   :suppress:
 
    # make a similar dataset with many columns
    timeseries = [
-       make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}")
+       _make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}")
        for i in range(10)
    ]
    ts_wide = pd.concat(timeseries, axis=1)
-   ts_wide.head()
    ts_wide.to_parquet("timeseries_wide.parquet")
 
+Suppose our raw dataset on disk has many columns::
+
+                        id_0    name_0       x_0       y_0  id_1   name_1       x_1  ...  name_8       x_8       y_8  id_9   name_9       x_9       y_9
+   timestamp                                                                         ...
+   2000-01-01 00:00:00  1015   Michael -0.399453  0.095427   994    Frank -0.176842  ...     Dan -0.315310  0.713892  1025   Victor -0.135779  0.346801
+   2000-01-01 00:01:00   969  Patricia  0.650773 -0.874275  1003    Laura  0.459153  ...  Ursula  0.913244 -0.630308  1047    Wendy -0.886285  0.035852
+   2000-01-01 00:02:00  1016    Victor -0.721465 -0.584710  1046  Michael  0.524994  ...     Ray -0.656593  0.692568  1064   Yvonne  0.070426  0.432047
+   2000-01-01 00:03:00   939     Alice -0.746004 -0.908008   996   Ingrid -0.414523  ...   Jerry -0.958994  0.608210   978    Wendy  0.855949 -0.648988
+   2000-01-01 00:04:00  1017       Dan  0.919451 -0.803504  1048    Jerry -0.569235  ...   Frank -0.577022 -0.409088   994      Bob -0.270132  0.335176
+   ...                   ...       ...       ...       ...   ...      ...       ...  ...     ...       ...       ...   ...      ...       ...       ...
+   2000-12-30 23:56:00   999       Tim  0.162578  0.512817   973    Kevin -0.403352  ...     Tim -0.380415  0.008097  1041  Charlie  0.191477 -0.599519
+   2000-12-30 23:57:00   970     Laura -0.433586 -0.600289   958   Oliver -0.966577  ...   Zelda  0.971274  0.402032  1038   Ursula  0.574016 -0.930992
+   2000-12-30 23:58:00  1065     Edith  0.232211 -0.454540   971      Tim  0.158484  ...   Alice -0.222079 -0.919274  1022      Dan  0.031345 -0.657755
+   2000-12-30 23:59:00  1019    Ingrid  0.322208 -0.615974   981   Hannah  0.607517  ...   Sarah -0.424440 -0.117274   990   George -0.375530  0.563312
+   2000-12-31 00:00:00   937    Ursula -0.906523  0.943178  1018    Alice -0.564513  ...   Jerry  0.236837  0.807650   985   Oliver  0.777642  0.783392
 
+   [525601 rows x 40 columns]
+
+
+To load the columns we want, we have two options.
 Option 1 loads in all the data and then filters to what we need.
 
 .. ipython:: python
@@ -94,16 +118,15 @@ Option 1 loads in all the data and then filters to what we need.
    %time _ = pd.read_parquet("timeseries_wide.parquet")[columns]
 
 Option 2 only loads the columns we request. This is faster and has a lower peak
-memory usage, since the entire dataset isn't in memory at once.
+memory usage since the entire dataset isn't in memory at once.
 
 .. ipython:: python
 
    %time _ = pd.read_parquet("timeseries_wide.parquet", columns=columns)
 
-
 With :func:`pandas.read_csv`, you can specify ``usecols`` to limit the columns
-read into memory.
-
+read into memory. Not all file formats that can be read by pandas provide an option
+to read a subset of columns.
 
 Use efficient datatypes
 -----------------------
@@ -173,10 +196,11 @@ fits in memory, you can work with datasets that are much larger than memory.
    coordination between chunks. For more complicated workflows, you're better off
    :ref:`using another library <scale.other_libraries>`.
 
-Let's make a larger dataset on disk (as parquet files) that's split into chunks,
-one per year.
+Suppose we have an even larger "logical dataset" on disk that's a directory of parquet
+files. Each file in the directory represents a different year of the entire dataset.
 
 .. ipython:: python
+   :suppress:
 
    import pathlib
 
@@ -187,20 +211,36 @@ one per year.
    pathlib.Path("data/timeseries").mkdir(exist_ok=True)
 
    for i, (start, end) in enumerate(zip(starts, ends)):
-       ts = make_timeseries(start=start, end=end, freq='1T', seed=i)
-       ts.to_parquet(f"data/timeseries/ts-{i}.parquet")
-
-   files = list(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
-   files
+       ts = _make_timeseries(start=start, end=end, freq='1T', seed=i)
+       ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet")
+
+
+::
+
+   data
+   └── timeseries
+       ├── ts-00.parquet
+       ├── ts-01.parquet
+       ├── ts-02.parquet
+       ├── ts-03.parquet
+       ├── ts-04.parquet
+       ├── ts-05.parquet
+       ├── ts-06.parquet
+       ├── ts-07.parquet
+       ├── ts-08.parquet
+       ├── ts-09.parquet
+       ├── ts-10.parquet
+       └── ts-11.parquet
 
 Now we'll implement an out-of-core ``value_counts``. The peak memory usage of this
 workflow is the single largest chunk, plus a small series storing the unique value
-counts up to this point.
-
+counts up to this point. As long as each individual file fits in memory, this will
+work for arbitrary-sized datasets.
 
 .. ipython:: python
 
    %%time
+   files = list(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
    counts = pd.Series(dtype=int)
    for path in files:
        # Only one dataframe is in memory at a time...
@@ -210,14 +250,16 @@ counts up to this point.
    counts.astype(int)
 
 Some readers, like :meth:`pandas.read_csv` offer parameters to control the
-``chunksize``. Manually chunking is an OK option for workflows that don't
+``chunksize`` when reading a single file.
+
+Manually chunking is an OK option for workflows that don't
 require too sophisticated of operations. Some operations, like ``groupby``, are
 much harder to do chunkwise. In these cases, you may be better switching to a
 different library that implements these out-of-core algorithms for you.
 
 .. _scale.other_libraries:
 
-Use Other libraries
+Use other libraries
 -------------------
 
 Pandas is just one library offering a DataFrame API. Because of its popularity,
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 7240a81d2b7ae..96c05f3495b42 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1682,7 +1682,7 @@ def makeMultiIndex(k=10, names=None, **kwargs):
 ]
 
 
-def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
+def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
     """
     Make a DataFrame with a DatetimeIndex
 
@@ -1703,7 +1703,7 @@ def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
 
     Examples
     --------
-    >>> make_timeseries()
+    >>> _make_timeseries()
                   id    name         x         y
     timestamp
     2000-01-01   982   Frank  0.031261  0.986727

From 78d22e6719c851b786d19598072f35f51d809e51 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 15:43:51 -0500
Subject: [PATCH 09/16] fixups

---
 doc/.gitignore                  | 2 +-
 doc/source/user_guide/scale.rst | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/.gitignore b/doc/.gitignore
index 66f657d848211..e23892d6100e8 100644
--- a/doc/.gitignore
+++ b/doc/.gitignore
@@ -1,4 +1,4 @@
 data/
 timeseries.csv
 timeseries.parquet
-timeseries_wide.parquet
\ No newline at end of file
+timeseries_wide.parquet
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 565d4639e2544..f3bdc71bace69 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -4,7 +4,7 @@
 Scaling to large datasets
 *************************
 
-Pandas provide data structures for in-memory analytics, which makes using pandas
+Pandas provides data structures for in-memory analytics, which makes using pandas
 to analyze larger than memory datasets somewhat tricky.
 
 This document provides a few recommendations for scaling your analysis to larger datasets.
@@ -30,7 +30,7 @@ Assuming you want or need the expressivity and power of pandas, let's carry on.
 Use more efficient file formats
 -------------------------------
 
-Depending on your workload, data loading may be a bottleneck. In these case you
+Depending on your workload, data loading may be a bottleneck. In these cases you
 might consider switching from a slow format like CSV to a faster format like
 Parquet. Loading from a file format like Parquet will also require less memory
 usage, letting you load larger datasets into pandas before running out of
@@ -66,7 +66,7 @@ compare the performance of reading those two formats.
 .. ipython:: python
 
    col = "timestamp"
-   %time pd.read_csv("timeseries.csv", index_col=col, parse_dates=[col])
+   %time _ = pd.read_csv("timeseries.csv", index_col=col, parse_dates=[col])
 
 .. ipython:: python
 
@@ -249,7 +249,7 @@ work for arbitrary-sized datasets.
        counts = counts.add(df['name'].value_counts(), fill_value=0)
    counts.astype(int)
 
-Some readers, like :meth:`pandas.read_csv` offer parameters to control the
+Some readers, like :meth:`pandas.read_csv`, offer parameters to control the
 ``chunksize`` when reading a single file.
 
 Manually chunking is an OK option for workflows that don't

From f7bc6dc5aaf089b75624b115b6a7fc58718b2dd2 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 13:30:21 -0500
Subject: [PATCH 10/16] updates

---
 doc/source/user_guide/scale.rst         | 96 +++++++++----------------
 environment.yml                         |  8 ++-
 requirements-dev.txt                    |  8 ++-
 scripts/generate_pip_deps_from_conda.py |  2 +-
 4 files changed, 46 insertions(+), 68 deletions(-)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index f3bdc71bace69..4d83a60fed525 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -5,7 +5,9 @@ Scaling to large datasets
 *************************
 
 Pandas provides data structures for in-memory analytics, which makes using pandas
-to analyze larger than memory datasets somewhat tricky.
+to analyze datasets that are larger than memory datasets somewhat tricky. Even datasets
+that are a sizable fraction of memory become unwieldy, as some pandas operations need
+to make intermediate copies.
 
 This document provides a few recommendations for scaling your analysis to larger datasets.
 It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis
@@ -14,66 +16,24 @@ for datasets that fit in memory.
 But first, it's worth considering *not using pandas*. Pandas isn't the right
 tool for all situations. If you're working with very large datasets and a tool
 like PostgreSQL fits your needs, then you should probably be using that.
-Assuming you want or need the expressivity and power of pandas, let's carry on.
+Assuming you want or need the expressiveness and power of pandas, let's carry on.
 
 .. ipython:: python
 
    import pandas as pd
    import numpy as np
+   %load_ext memory_profiler
 
 .. ipython:: python
    :suppress:
 
    from pandas.util.testing import _make_timeseries
 
-
-Use more efficient file formats
--------------------------------
-
-Depending on your workload, data loading may be a bottleneck. In these cases you
-might consider switching from a slow format like CSV to a faster format like
-Parquet. Loading from a file format like Parquet will also require less memory
-usage, letting you load larger datasets into pandas before running out of
-memory.
-
-.. ipython:: python
-   :suppress:
-
    # Make a random in-memory dataset
    ts = _make_timeseries(freq="30S", seed=0)
    ts.to_csv("timeseries.csv")
    ts.to_parquet("timeseries.parquet")
 
-For example, suppose we have a dataset like the following::
-
-                          id      name         x         y
-   timestamp
-   2000-01-01 00:00:00  1029   Michael  0.278837  0.247932
-   2000-01-01 00:00:30  1010  Patricia  0.077144  0.490260
-   2000-01-01 00:01:00  1001    Victor  0.214525  0.258635
-   2000-01-01 00:01:30  1018     Alice -0.646866  0.822104
-   2000-01-01 00:02:00   991       Dan  0.902389  0.466665
-   ...                   ...       ...       ...       ...
-   2000-12-30 23:58:00   992     Sarah  0.721155  0.944118
-   2000-12-30 23:58:30  1007    Ursula  0.409277  0.133227
-   2000-12-30 23:59:00  1009    Hannah -0.452802  0.184318
-   2000-12-30 23:59:30   978     Kevin -0.904728 -0.179146
-   2000-12-31 00:00:00   973    Ingrid -0.370763 -0.794667
-
-That dataset has been stored on disk as CSV and Parquet. We want to
-compare the performance of reading those two formats.
-
-.. ipython:: python
-
-   col = "timestamp"
-   %time _ = pd.read_csv("timeseries.csv", index_col=col, parse_dates=[col])
-
-.. ipython:: python
-
-   %time _ = pd.read_parquet("timeseries.parquet")
-
-Notice that parquet gives higher performance for reading (and writing), both
-in terms of speed and lower peak memory usage. See :ref:`io` for more.
 
 Load less data
 --------------
@@ -115,14 +75,16 @@ Option 1 loads in all the data and then filters to what we need.
 
    columns = ['id_0', 'name_0', 'x_0', 'y_0']
 
-   %time _ = pd.read_parquet("timeseries_wide.parquet")[columns]
+   %memit pd.read_parquet("timeseries_wide.parquet")[columns]
 
-Option 2 only loads the columns we request. This is faster and has a lower peak
-memory usage since the entire dataset isn't in memory at once.
+Option 2 only loads the columns we request.
 
 .. ipython:: python
 
-   %time _ = pd.read_parquet("timeseries_wide.parquet", columns=columns)
+   %memit pd.read_parquet("timeseries_wide.parquet", columns=columns)
+
+In particular, notice that the ``increment`` in memory reported by ``memory-profiler``
+is much smaller when ``columns`` is specified.
 
 With :func:`pandas.read_csv`, you can specify ``usecols`` to limit the columns
 read into memory. Not all file formats that can be read by pandas provide an option
@@ -136,6 +98,14 @@ especially true for high-cardinality text data (columns with relatively few
 unique values). By using more efficient data types you can store larger datasets
 in memory.
 
+.. ipython:: python
+
+   ts = dd.read_parquet("timeseries.parquet")
+   ts
+
+Now, let's inspect the data types and memory usage to see where we should focus our
+attention.
+
 .. ipython:: python
 
    ts.dtypes
@@ -186,8 +156,8 @@ Use chunking
 ------------
 
 Some workloads can be achieved with chunking: splitting a large problem like "convert this
-directory of CSVs to parquet" into a bunch of small problems ("convert this individual parquet
-file into a CSV. Now repeat that for each file in this directory."). As long as each chunk
+directory of CSVs to parquet" into a bunch of small problems ("convert this individual CSV
+file into a Parquet file. Now repeat that for each file in this directory."). As long as each chunk
 fits in memory, you can work with datasets that are much larger than memory.
 
 .. note::
@@ -240,7 +210,7 @@ work for arbitrary-sized datasets.
 .. ipython:: python
 
    %%time
-   files = list(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
+   files = pathlib.Path("data/timeseries/").glob("ts*.parquet")
    counts = pd.Series(dtype=int)
    for path in files:
        # Only one dataframe is in memory at a time...
@@ -264,7 +234,7 @@ Use other libraries
 
 Pandas is just one library offering a DataFrame API. Because of its popularity,
 pandas' API has become something of a standard that other libraries implement.
-The pandas documentation maintains a list of libraries implemetning a DataFrame API
+The pandas documentation maintains a list of libraries implementing a DataFrame API
 in :ref:`our ecosystem page <ecosystem.out-of-core>`.
 
 For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a
@@ -337,13 +307,14 @@ parallel. We can also connect to a cluster to distribute the work on many
 machines. In this case we'll connect to a local "cluster" made up of several
 processes on this single machine.
 
-.. ipython:: python
+.. code-block:: python
 
-   from dask.distributed import Client, LocalCluster
+   >>> from dask.distributed import Client, LocalCluster
 
-   cluster = LocalCluster()
-   client = Client(cluster)
-   client
+   >>> cluster = LocalCluster()
+   >>> client = Client(cluster)
+   >>> client
+   <Client: 'tcp://127.0.0.1:53349' processes=4 threads=8, memory=17.18 GB>
 
 Once this ``client`` is created, all of Dask's computation will take place on
 the cluster (which is just processes in this case).
@@ -364,6 +335,10 @@ we need to supply the divisions manually.
 
 .. ipython:: python
 
+   N = 12
+   starts = [f'20{i:>02d}-01-01' for i in range(N)]
+   ends = [f'20{i:>02d}-12-13' for i in range(N)]
+
    divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),)
    ddf.divisions = divisions
    ddf
@@ -395,10 +370,5 @@ datasets.
 
 You see more dask examples at https://examples.dask.org.
 
-.. ipython:: python
-   :suppress:
-
-   del client, cluster
-
 .. _Dask: https://dask.org
 .. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html
diff --git a/environment.yml b/environment.yml
index 99cbc154d9864..fafeb7504f678 100644
--- a/environment.yml
+++ b/environment.yml
@@ -35,8 +35,12 @@ dependencies:
   - nbconvert>=5.4.1
   - nbsphinx
   - pandoc
-  - dask
-  - distributed
+  - dask-core
+  - toolz>=0.7.3
+  - fsspec>=0.5.1
+  - partd>=0.3.10
+  - cloudpickle>=0.2.1
+  - memory_profiler
 
   # web (jinja2 is also needed, but it's also an optional pandas dependency)
   - markdown
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 33158cfc31012..edd08cdc25315 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -17,8 +17,12 @@ numpydoc>=0.9.0
 nbconvert>=5.4.1
 nbsphinx
 pandoc
-dask
-distributed
+dask-core
+toolz>=0.7.3
+fsspec>=0.5.1
+partd>=0.3.10
+cloudpickle>=0.2.1
+memory_profiler
 markdown
 feedparser
 pyyaml
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
index 29fe8bf84c12b..44fe50b99560a 100755
--- a/scripts/generate_pip_deps_from_conda.py
+++ b/scripts/generate_pip_deps_from_conda.py
@@ -20,7 +20,7 @@
 import yaml
 
 EXCLUDE = {"python=3"}
-RENAME = {"pytables": "tables", "pyqt": "pyqt5"}
+RENAME = {"pytables": "tables", "pyqt": "pyqt5", "dask-core": "dask"}
 
 
 def conda_package_to_pip(package):

From 5294cdbd9ee0f5241d9d23327901c7aed7e6140f Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 14:27:53 -0500
Subject: [PATCH 11/16] fixup

---
 doc/source/user_guide/scale.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 4d83a60fed525..f3085ab044509 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -100,7 +100,7 @@ in memory.
 
 .. ipython:: python
 
-   ts = dd.read_parquet("timeseries.parquet")
+   ts = pd.read_parquet("timeseries.parquet")
    ts
 
 Now, let's inspect the data types and memory usage to see where we should focus our

From c57f33a2a8307b285709bb2bb55ce78b1b869486 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 14:36:08 -0500
Subject: [PATCH 12/16] fixup

---
 doc/source/user_guide/scale.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index f3085ab044509..c9fbcab3568ae 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -22,6 +22,7 @@ Assuming you want or need the expressiveness and power of pandas, let's carry on
 
    import pandas as pd
    import numpy as np
+   # noqa: F821
    %load_ext memory_profiler
 
 .. ipython:: python

From 55be2bb6a7a123188a4b7a3f93a80cc491832977 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 15:27:50 -0500
Subject: [PATCH 13/16] try updating check

---
 ci/code_checks.sh               | 3 ++-
 doc/source/user_guide/scale.rst | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index b03c4f2238445..3b41aebde2c0d 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -81,7 +81,8 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
     flake8-rst --version
 
     MSG='Linting code-blocks in .rst documentation' ; echo $MSG
-    flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT"
+    # exclude scale.rst because flake8-rst doesn't handle magics
+    flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT" --exclude="scale.rst"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     # Check that cython casting is of the form `<type>obj` as opposed to `<type> obj`;
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index c9fbcab3568ae..f3085ab044509 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -22,7 +22,6 @@ Assuming you want or need the expressiveness and power of pandas, let's carry on
 
    import pandas as pd
    import numpy as np
-   # noqa: F821
    %load_ext memory_profiler
 
 .. ipython:: python

From a76453f46b85cb2c82e528214c78ddb4ea5cd5d0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 16:29:53 -0500
Subject: [PATCH 14/16] remove memory profiling

---
 ci/code_checks.sh               | 3 +--
 doc/source/user_guide/scale.rst | 9 ++++-----
 environment.yml                 | 1 -
 requirements-dev.txt            | 1 -
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 3b41aebde2c0d..b03c4f2238445 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -81,8 +81,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
     flake8-rst --version
 
     MSG='Linting code-blocks in .rst documentation' ; echo $MSG
-    # exclude scale.rst because flake8-rst doesn't handle magics
-    flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT" --exclude="scale.rst"
+    flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     # Check that cython casting is of the form `<type>obj` as opposed to `<type> obj`;
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index f3085ab044509..7b590a3a1fcc8 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -22,7 +22,6 @@ Assuming you want or need the expressiveness and power of pandas, let's carry on
 
    import pandas as pd
    import numpy as np
-   %load_ext memory_profiler
 
 .. ipython:: python
    :suppress:
@@ -75,16 +74,16 @@ Option 1 loads in all the data and then filters to what we need.
 
    columns = ['id_0', 'name_0', 'x_0', 'y_0']
 
-   %memit pd.read_parquet("timeseries_wide.parquet")[columns]
+   pd.read_parquet("timeseries_wide.parquet")[columns]
 
 Option 2 only loads the columns we request.
 
 .. ipython:: python
 
-   %memit pd.read_parquet("timeseries_wide.parquet", columns=columns)
+   pd.read_parquet("timeseries_wide.parquet", columns=columns)
 
-In particular, notice that the ``increment`` in memory reported by ``memory-profiler``
-is much smaller when ``columns`` is specified.
+If we were to measure the memory usage of the two calls, we'd see that specifying
+``columns`` uses about 1/10th the memory in this case.
 
 With :func:`pandas.read_csv`, you can specify ``usecols`` to limit the columns
 read into memory. Not all file formats that can be read by pandas provide an option
diff --git a/environment.yml b/environment.yml
index fafeb7504f678..6b12bb31b2469 100644
--- a/environment.yml
+++ b/environment.yml
@@ -40,7 +40,6 @@ dependencies:
   - fsspec>=0.5.1
   - partd>=0.3.10
   - cloudpickle>=0.2.1
-  - memory_profiler
 
   # web (jinja2 is also needed, but it's also an optional pandas dependency)
   - markdown
diff --git a/requirements-dev.txt b/requirements-dev.txt
index edd08cdc25315..698e4f3aea094 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -22,7 +22,6 @@ toolz>=0.7.3
 fsspec>=0.5.1
 partd>=0.3.10
 cloudpickle>=0.2.1
-memory_profiler
 markdown
 feedparser
 pyyaml

From 98c06fa2bd2786fb05df6b77a10238cb2a4cb8f7 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 16:30:36 -0500
Subject: [PATCH 15/16] fixup

---
 pandas/tests/groupby/test_categorical.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index e09af3fd48ee6..fb549e18924c1 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -782,7 +782,8 @@ def test_categorical_no_compress():
 
 def test_sort():
 
-    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
+    # http://stackoverflow.com/questions/23814368/
+    #   sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
     # This should result in a properly sorted Series so that the plot
     # has a sorted x axis
     # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

From 78eb2f11c9e5c74d50e5f7f3657e0bf453745bd2 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 26 Sep 2019 07:44:20 -0500
Subject: [PATCH 16/16] whatsnew, fixups

---
 doc/source/whatsnew/v1.0.0.rst | 7 +++++++
 environment.yml                | 1 +
 2 files changed, 8 insertions(+)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index a78bc07ac2715..a6abe39f24ac3 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -111,6 +111,13 @@ Other API changes
 - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`)
 -
 
+.. _whatsnew_1000.api.documentation:
+
+Documentation Improvements
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Added new section on :ref:`scale` (:issue:`28315`).
+
 .. _whatsnew_1000.deprecations:
 
 Deprecations
diff --git a/environment.yml b/environment.yml
index 6b12bb31b2469..7c3ec9064cba3 100644
--- a/environment.yml
+++ b/environment.yml
@@ -35,6 +35,7 @@ dependencies:
   - nbconvert>=5.4.1
   - nbsphinx
   - pandoc
+  # Dask and its dependencies
   - dask-core
   - toolz>=0.7.3
   - fsspec>=0.5.1