diff --git a/doc/source/io.rst b/doc/source/io.rst index e338407361705..b027c7658f0e9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5208,85 +5208,112 @@ easy conversion to and from pandas. Performance Considerations -------------------------- -This is an informal comparison of various IO methods, using pandas 0.13.1. +This is an informal comparison of various IO methods, using pandas +0.20.3. Timings are machine dependent and small differences should be +ignored. .. code-block:: ipython - In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB')) + In [1]: sz = 1000000 + In [2]: df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) - In [2]: df.info() + In [3]: df.info() <class 'pandas.core.frame.DataFrame'> - Int64Index: 1000000 entries, 0 to 999999 + RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 2 columns): A 1000000 non-null float64 - B 1000000 non-null float64 - dtypes: float64(2) - memory usage: 22.9 MB + B 1000000 non-null int64 + dtypes: float64(1), int64(1) + memory usage: 15.3 MB Writing .. code-block:: ipython In [14]: %timeit test_sql_write(df) - 1 loops, best of 3: 6.24 s per loop + 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [15]: %timeit test_hdf_fixed_write(df) - 1 loops, best of 3: 237 ms per loop + 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [26]: %timeit test_hdf_fixed_write_compress(df) - 1 loops, best of 3: 245 ms per loop + 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [16]: %timeit test_hdf_table_write(df) - 1 loops, best of 3: 901 ms per loop + 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [27]: %timeit test_hdf_table_write_compress(df) - 1 loops, best of 3: 952 ms per loop + 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [17]: %timeit test_csv_write(df) - 1 loops, best of 3: 3.44 s per loop + 3.13 s ± 49.9 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each) + + In [30]: %timeit test_feather_write(df) + 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [31]: %timeit test_pickle_write(df) + 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [32]: %timeit test_pickle_write_compress(df) + 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) Reading .. code-block:: ipython In [18]: %timeit test_sql_read() - 1 loops, best of 3: 766 ms per loop + 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [19]: %timeit test_hdf_fixed_read() - 10 loops, best of 3: 19.1 ms per loop + 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [28]: %timeit test_hdf_fixed_read_compress() - 10 loops, best of 3: 36.3 ms per loop + 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [20]: %timeit test_hdf_table_read() - 10 loops, best of 3: 39 ms per loop + 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [29]: %timeit test_hdf_table_read_compress() - 10 loops, best of 3: 60.6 ms per loop + 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) In [22]: %timeit test_csv_read() - 1 loops, best of 3: 620 ms per loop + 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [33]: %timeit test_feather_read() + 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [34]: %timeit test_pickle_read() + 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [35]: %timeit test_pickle_read_compress() + 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) Space on disk (in bytes) .. 
code-block:: none - 25843712 Apr 8 14:11 test.sql - 24007368 Apr 8 14:11 test_fixed.hdf - 15580682 Apr 8 14:11 test_fixed_compress.hdf - 24458444 Apr 8 14:11 test_table.hdf - 16797283 Apr 8 14:11 test_table_compress.hdf - 46152810 Apr 8 14:11 test.csv + 34816000 Aug 21 18:00 test.sql + 24009240 Aug 21 18:00 test_fixed.hdf + 7919610 Aug 21 18:00 test_fixed_compress.hdf + 24458892 Aug 21 18:00 test_table.hdf + 8657116 Aug 21 18:00 test_table_compress.hdf + 28520770 Aug 21 18:00 test.csv + 16000248 Aug 21 18:00 test.feather + 16000848 Aug 21 18:00 test.pkl + 7554108 Aug 21 18:00 test.pkl.compress And here's the code .. code-block:: python - import sqlite3 import os + import pandas as pd + import sqlite3 + from numpy.random import randn from pandas.io import sql - df = pd.DataFrame(randn(1000000,2),columns=list('AB')) + sz = 1000000 + df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) def test_sql_write(df): if os.path.exists('test.sql'): @@ -5329,3 +5356,21 @@ And here's the code def test_csv_read(): pd.read_csv('test.csv',index_col=0) + + def test_feather_write(df): + df.to_feather('test.feather') + + def test_feather_read(): + pd.read_feather('test.feather') + + def test_pickle_write(df): + df.to_pickle('test.pkl') + + def test_pickle_read(): + pd.read_pickle('test.pkl') + + def test_pickle_write_compress(df): + df.to_pickle('test.pkl.compress', compression='xz') + + def test_pickle_read_compress(): + pd.read_pickle('test.pkl.compress', compression='xz')