From 5579176aef43d5decc6a928b2810ee3965ab85a7 Mon Sep 17 00:00:00 2001
From: rvernica <rvernica@gmail.com>
Date: Mon, 21 Aug 2017 18:14:31 -0700
Subject: [PATCH 1/2] Update Performance Considerations section

* Re-run all benchmarks with pandas 0.20.3
* Add benchmarks for feather and pickle (uncompressed and xz-compressed)
---
 doc/source/io.rst | 85 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 22 deletions(-)
diff --git a/doc/source/io.rst b/doc/source/io.rst
index e338407361705..bfb7f4f588244 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -5208,7 +5208,7 @@ easy conversion to and from pandas.
 Performance Considerations
 --------------------------
 
-This is an informal comparison of various IO methods, using pandas 0.13.1.
+This is an informal comparison of various IO methods, using pandas 0.20.3.
 
 .. code-block:: ipython
 
@@ -5216,74 +5216,97 @@ This is an informal comparison of various IO methods, using pandas 0.13.1.
 
    In [2]: df.info()
    <class 'pandas.core.frame.DataFrame'>
-   Int64Index: 1000000 entries, 0 to 999999
+   RangeIndex: 1000000 entries, 0 to 999999
    Data columns (total 2 columns):
    A    1000000 non-null float64
    B    1000000 non-null float64
    dtypes: float64(2)
-   memory usage: 22.9 MB
+   memory usage: 15.3 MB
 
 Writing
 
 .. code-block:: ipython
 
    In [14]: %timeit test_sql_write(df)
-   1 loops, best of 3: 6.24 s per loop
+   2.23 s ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [15]: %timeit test_hdf_fixed_write(df)
-   1 loops, best of 3: 237 ms per loop
+   239 ms ± 112 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [26]: %timeit test_hdf_fixed_write_compress(df)
-   1 loops, best of 3: 245 ms per loop
+   355 ms ± 116 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [16]: %timeit test_hdf_table_write(df)
-   1 loops, best of 3: 901 ms per loop
+   614 ms ± 50.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [27]: %timeit test_hdf_table_write_compress(df)
-   1 loops, best of 3: 952 ms per loop
+   679 ms ± 37.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [17]: %timeit test_csv_write(df)
-   1 loops, best of 3: 3.44 s per loop
+   4.18 s ± 50.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [30]: %timeit test_feather_write(df)
+   112 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [31]: %timeit test_pickle_write(df)
+   144 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [32]: %timeit test_pickle_write_compress(df)
+   6.45 s ± 81.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
 Reading
 
 .. code-block:: ipython
 
    In [18]: %timeit test_sql_read()
-   1 loops, best of 3: 766 ms per loop
+   1.33 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [19]: %timeit test_hdf_fixed_read()
-   10 loops, best of 3: 19.1 ms per loop
+   11.1 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
 
    In [28]: %timeit test_hdf_fixed_read_compress()
-   10 loops, best of 3: 36.3 ms per loop
+   25.1 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [20]: %timeit test_hdf_table_read()
-   10 loops, best of 3: 39 ms per loop
+   20.9 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [29]: %timeit test_hdf_table_read_compress()
-   10 loops, best of 3: 60.6 ms per loop
+   28.2 ms ± 453 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [22]: %timeit test_csv_read()
-   1 loops, best of 3: 620 ms per loop
+   684 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+   In [33]: %timeit test_feather_read()
+   3.51 ms ± 57.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+
+   In [34]: %timeit test_pickle_read()
+   5.75 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+
+   In [35]: %timeit test_pickle_read_compress()
+   1.11 s ± 869 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
 Space on disk (in bytes)
 
 .. code-block:: none
 
-    25843712 Apr  8 14:11 test.sql
-    24007368 Apr  8 14:11 test_fixed.hdf
-    15580682 Apr  8 14:11 test_fixed_compress.hdf
-    24458444 Apr  8 14:11 test_table.hdf
-    16797283 Apr  8 14:11 test_table_compress.hdf
-    46152810 Apr  8 14:11 test.csv
+    42975232 Aug 21 18:00 test.sql
+    24007192 Aug 21 18:00 test_fixed.hdf
+    15580621 Aug 21 18:00 test_fixed_compress.hdf
+    24458524 Aug 21 18:00 test_table.hdf
+    16797892 Aug 21 18:00 test_table_compress.hdf
+    46149803 Aug 21 18:00 test.csv
+    16000248 Aug 21 18:00 test.feather
+    16000694 Aug 21 18:00 test.pkl
+    15047240 Aug 21 18:00 test.pkl.compress
 
 And here's the code
 
 .. code-block:: python
 
-   import sqlite3
    import os
+   import pandas as pd
+   import sqlite3
+   from numpy.random import randn
    from pandas.io import sql
 
    df = pd.DataFrame(randn(1000000,2),columns=list('AB'))
@@ -5329,3 +5352,21 @@ And here's the code
 
    def test_csv_read():
        pd.read_csv('test.csv',index_col=0)
+
+   def test_feather_write(df):
+       df.to_feather('test.feather')
+
+   def test_feather_read():
+       pd.read_feather('test.feather')
+
+   def test_pickle_write(df):
+       df.to_pickle('test.pkl')
+
+   def test_pickle_read():
+       pd.read_pickle('test.pkl')
+
+   def test_pickle_write_compress(df):
+       df.to_pickle('test.pkl.compress', compression='xz')
+
+   def test_pickle_read_compress():
+       pd.read_pickle('test.pkl.compress', compression='xz')

From 1ecdf3f101fb57f446010b880b75a1c495b50241 Mon Sep 17 00:00:00 2001
From: Rares Vernica <rvernica@gmail.com>
Date: Mon, 25 Sep 2017 20:27:39 -0700
Subject: [PATCH 2/2] Use a column of 1s in the DataFrame to favor compression

* Update all timings
* Clarify wording
---
 doc/source/io.rst | 68 +++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index bfb7f4f588244..b027c7658f0e9 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -5208,19 +5208,22 @@ easy conversion to and from pandas.
 Performance Considerations
 --------------------------
 
-This is an informal comparison of various IO methods, using pandas 0.20.3.
+This is an informal comparison of various IO methods, using pandas
+0.20.3. Timings are machine-dependent, and small differences should be
+ignored.
 
 .. code-block:: ipython
 
-   In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB'))
+   In [1]: sz = 1000000
+   In [2]: df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
 
-   In [2]: df.info()
+   In [3]: df.info()
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1000000 entries, 0 to 999999
    Data columns (total 2 columns):
    A    1000000 non-null float64
-   B    1000000 non-null float64
-   dtypes: float64(2)
+   B    1000000 non-null int64
+   dtypes: float64(1), int64(1)
    memory usage: 15.3 MB
 
 Writing
@@ -5228,76 +5231,76 @@ Writing
 .. code-block:: ipython
 
    In [14]: %timeit test_sql_write(df)
-   2.23 s ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [15]: %timeit test_hdf_fixed_write(df)
-   239 ms ± 112 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [26]: %timeit test_hdf_fixed_write_compress(df)
-   355 ms ± 116 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [16]: %timeit test_hdf_table_write(df)
-   614 ms ± 50.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [27]: %timeit test_hdf_table_write_compress(df)
-   679 ms ± 37.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [17]: %timeit test_csv_write(df)
-   4.18 s ± 50.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [30]: %timeit test_feather_write(df)
-   112 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [31]: %timeit test_pickle_write(df)
-   144 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [32]: %timeit test_pickle_write_compress(df)
-   6.45 s ± 81.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
 Reading
 
 .. code-block:: ipython
 
    In [18]: %timeit test_sql_read()
-   1.33 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [19]: %timeit test_hdf_fixed_read()
-   11.1 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+   14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
 
    In [28]: %timeit test_hdf_fixed_read_compress()
-   25.1 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [20]: %timeit test_hdf_table_read()
-   20.9 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [29]: %timeit test_hdf_table_read_compress()
-   28.2 ms ± 453 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
+   42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
    In [22]: %timeit test_csv_read()
-   684 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
    In [33]: %timeit test_feather_read()
-   3.51 ms ± 57.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+   4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
 
    In [34]: %timeit test_pickle_read()
-   5.75 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+   6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
 
    In [35]: %timeit test_pickle_read_compress()
-   1.11 s ± 869 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
+   588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
 Space on disk (in bytes)
 
 .. code-block:: none
 
-    42975232 Aug 21 18:00 test.sql
-    24007192 Aug 21 18:00 test_fixed.hdf
-    15580621 Aug 21 18:00 test_fixed_compress.hdf
-    24458524 Aug 21 18:00 test_table.hdf
-    16797892 Aug 21 18:00 test_table_compress.hdf
-    46149803 Aug 21 18:00 test.csv
+    34816000 Aug 21 18:00 test.sql
+    24009240 Aug 21 18:00 test_fixed.hdf
+     7919610 Aug 21 18:00 test_fixed_compress.hdf
+    24458892 Aug 21 18:00 test_table.hdf
+     8657116 Aug 21 18:00 test_table_compress.hdf
+    28520770 Aug 21 18:00 test.csv
     16000248 Aug 21 18:00 test.feather
-    16000694 Aug 21 18:00 test.pkl
-    15047240 Aug 21 18:00 test.pkl.compress
+    16000848 Aug 21 18:00 test.pkl
+     7554108 Aug 21 18:00 test.pkl.compress
 
 And here's the code
 
@@ -5309,7 +5312,8 @@ And here's the code
    from numpy.random import randn
    from pandas.io import sql
 
-   df = pd.DataFrame(randn(1000000,2),columns=list('AB'))
+   sz = 1000000
+   df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
 
    def test_sql_write(df):
        if os.path.exists('test.sql'):