From 5579176aef43d5decc6a928b2810ee3965ab85a7 Mon Sep 17 00:00:00 2001 From: rvernica Date: Mon, 21 Aug 2017 18:14:31 -0700 Subject: [PATCH 1/2] Update Performance Considerations section * re-run all tests * add tests for feather and pickle --- doc/source/io.rst | 85 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index e338407361705..bfb7f4f588244 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5208,7 +5208,7 @@ easy conversion to and from pandas. Performance Considerations -------------------------- -This is an informal comparison of various IO methods, using pandas 0.13.1. +This is an informal comparison of various IO methods, using pandas 0.20.3. .. code-block:: ipython @@ -5216,74 +5216,97 @@ This is an informal comparison of various IO methods, using pandas 0.13.1. In [2]: df.info() - Int64Index: 1000000 entries, 0 to 999999 + RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 2 columns): A 1000000 non-null float64 B 1000000 non-null float64 dtypes: float64(2) - memory usage: 22.9 MB + memory usage: 15.3 MB Writing .. code-block:: ipython In [14]: %timeit test_sql_write(df) - 1 loops, best of 3: 6.24 s per loop + 2.23 s ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [15]: %timeit test_hdf_fixed_write(df) - 1 loops, best of 3: 237 ms per loop + 239 ms ± 112 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [26]: %timeit test_hdf_fixed_write_compress(df) - 1 loops, best of 3: 245 ms per loop + 355 ms ± 116 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [16]: %timeit test_hdf_table_write(df) - 1 loops, best of 3: 901 ms per loop + 614 ms ± 50.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [27]: %timeit test_hdf_table_write_compress(df) - 1 loops, best of 3: 952 ms per loop + 679 ms ± 37.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [17]: %timeit test_csv_write(df) - 1 loops, best of 3: 3.44 s per loop + 4.18 s ± 50.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [30]: %timeit test_feather_write(df) + 112 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [31]: %timeit test_pickle_write(df) + 144 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [32]: %timeit test_pickle_write_compress(df) + 6.45 s ± 81.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) Reading .. code-block:: ipython In [18]: %timeit test_sql_read() - 1 loops, best of 3: 766 ms per loop + 1.33 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [19]: %timeit test_hdf_fixed_read() - 10 loops, best of 3: 19.1 ms per loop + 11.1 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [28]: %timeit test_hdf_fixed_read_compress() - 10 loops, best of 3: 36.3 ms per loop + 25.1 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [20]: %timeit test_hdf_table_read() - 10 loops, best of 3: 39 ms per loop + 20.9 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [29]: %timeit test_hdf_table_read_compress() - 10 loops, best of 3: 60.6 ms per loop + 28.2 ms ± 453 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [22]: %timeit test_csv_read() - 1 loops, best of 3: 620 ms per loop + 684 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [33]: %timeit test_feather_read() + 3.51 ms ± 57.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [34]: %timeit test_pickle_read() + 5.75 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [35]: %timeit test_pickle_read_compress() + 1.11 s ± 869 µs per loop (mean ± std. dev. of 7 runs, 1 loop each) Space on disk (in bytes) .. code-block:: none - 25843712 Apr 8 14:11 test.sql - 24007368 Apr 8 14:11 test_fixed.hdf - 15580682 Apr 8 14:11 test_fixed_compress.hdf - 24458444 Apr 8 14:11 test_table.hdf - 16797283 Apr 8 14:11 test_table_compress.hdf - 46152810 Apr 8 14:11 test.csv + 42975232 Aug 21 18:00 test.sql + 24007192 Aug 21 18:00 test_fixed.hdf + 15580621 Aug 21 18:00 test_fixed_compress.hdf + 24458524 Aug 21 18:00 test_table.hdf + 16797892 Aug 21 18:00 test_table_compress.hdf + 46149803 Aug 21 18:00 test.csv + 16000248 Aug 21 18:00 test.feather + 16000694 Aug 21 18:00 test.pkl + 15047240 Aug 21 18:00 test.pkl.compress And here's the code .. code-block:: python - import sqlite3 import os + import pandas as pd + import sqlite3 + from numpy.random import randn from pandas.io import sql df = pd.DataFrame(randn(1000000,2),columns=list('AB')) @@ -5329,3 +5352,21 @@ And here's the code def test_csv_read(): pd.read_csv('test.csv',index_col=0) + + def test_feather_write(df): + df.to_feather('test.feather') + + def test_feather_read(): + pd.read_feather('test.feather') + + def test_pickle_write(df): + df.to_pickle('test.pkl') + + def test_pickle_read(): + pd.read_pickle('test.pkl') + + def test_pickle_write_compress(df): + df.to_pickle('test.pkl.compress', compression='xz') + + def test_pickle_read_compress(): + pd.read_pickle('test.pkl.compress', compression='xz') From 1ecdf3f101fb57f446010b880b75a1c495b50241 Mon Sep 17 00:00:00 2001 From: Rares Vernica Date: Mon, 25 Sep 2017 20:27:39 -0700 Subject: [PATCH 2/2] Column of 1s in DataFrame to favor compression * Update all timings * Clarify wording --- doc/source/io.rst | 68 +++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index bfb7f4f588244..b027c7658f0e9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5208,19 +5208,22 @@ easy conversion to and from pandas. Performance Considerations -------------------------- -This is an informal comparison of various IO methods, using pandas 0.20.3. +This is an informal comparison of various IO methods, using pandas +0.20.3. Timings are machine dependent and small differences should be +ignored. .. code-block:: ipython - In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB')) + In [1]: sz = 1000000 + In [2]: df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) - In [2]: df.info() + In [3]: df.info() RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 2 columns): A 1000000 non-null float64 - B 1000000 non-null float64 - dtypes: float64(2) + B 1000000 non-null int64 + dtypes: float64(1), int64(1) memory usage: 15.3 MB Writing @@ -5228,76 +5231,76 @@ Writing .. code-block:: ipython In [14]: %timeit test_sql_write(df) - 2.23 s ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [15]: %timeit test_hdf_fixed_write(df) - 239 ms ± 112 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [26]: %timeit test_hdf_fixed_write_compress(df) - 355 ms ± 116 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [16]: %timeit test_hdf_table_write(df) - 614 ms ± 50.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [27]: %timeit test_hdf_table_write_compress(df) - 679 ms ± 37.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [17]: %timeit test_csv_write(df) - 4.18 s ± 50.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [30]: %timeit test_feather_write(df) - 112 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [31]: %timeit test_pickle_write(df) - 144 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [32]: %timeit test_pickle_write_compress(df) - 6.45 s ± 81.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) Reading .. code-block:: ipython In [18]: %timeit test_sql_read() - 1.33 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [19]: %timeit test_hdf_fixed_read() - 11.1 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [28]: %timeit test_hdf_fixed_read_compress() - 25.1 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [20]: %timeit test_hdf_table_read() - 20.9 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [29]: %timeit test_hdf_table_read_compress() - 28.2 ms ± 453 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [22]: %timeit test_csv_read() - 684 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [33]: %timeit test_feather_read() - 3.51 ms ± 57.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [34]: %timeit test_pickle_read() - 5.75 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [35]: %timeit test_pickle_read_compress() - 1.11 s ± 869 µs per loop (mean ± std. dev. of 7 runs, 1 loop each) + 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) Space on disk (in bytes) .. code-block:: none - 42975232 Aug 21 18:00 test.sql - 24007192 Aug 21 18:00 test_fixed.hdf - 15580621 Aug 21 18:00 test_fixed_compress.hdf - 24458524 Aug 21 18:00 test_table.hdf - 16797892 Aug 21 18:00 test_table_compress.hdf - 46149803 Aug 21 18:00 test.csv + 34816000 Aug 21 18:00 test.sql + 24009240 Aug 21 18:00 test_fixed.hdf + 7919610 Aug 21 18:00 test_fixed_compress.hdf + 24458892 Aug 21 18:00 test_table.hdf + 8657116 Aug 21 18:00 test_table_compress.hdf + 28520770 Aug 21 18:00 test.csv 16000248 Aug 21 18:00 test.feather - 16000694 Aug 21 18:00 test.pkl - 15047240 Aug 21 18:00 test.pkl.compress + 16000848 Aug 21 18:00 test.pkl + 7554108 Aug 21 18:00 test.pkl.compress And here's the code @@ -5309,7 +5312,8 @@ And here's the code from numpy.random import randn from pandas.io import sql - df = pd.DataFrame(randn(1000000,2),columns=list('AB')) + sz = 1000000 + df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) def test_sql_write(df): if os.path.exists('test.sql'):