@@ -5576,7 +5576,7 @@ Performance considerations
5576
5576
--------------------------
5577
5577
5578
5578
This is an informal comparison of various IO methods, using pandas
5579
- 0.20.3 . Timings are machine dependent and small differences should be
5579
+ 0.24.2. Timings are machine-dependent, and small differences should be
5580
5580
ignored.
5581
5581
5582
5582
.. code-block:: ipython
@@ -5597,11 +5597,18 @@ Given the next test set:
5597
5597
5598
5598
.. code-block:: python
5599
5599
5600
+
5601
+
5602
+ import numpy as np
5603
+
5600
5604
import os
5601
5605
5602
5606
sz = 1000000
5603
5607
df = pd.DataFrame({' A' : np.random.randn(sz), ' B' : [1 ] * sz})
5604
5608
5609
+ sz = 1000000
5610
+ np.random.seed(42 )
5611
+ df = pd.DataFrame({' A' : np.random.randn(sz), ' B' : [1 ] * sz})
5605
5612
5606
5613
def test_sql_write (df ):
5607
5614
if os.path.exists(' test.sql' ):
@@ -5610,151 +5617,152 @@ Given the next test set:
5610
5617
df.to_sql(name = ' test_table' , con = sql_db)
5611
5618
sql_db.close()
5612
5619
5613
-
5614
5620
def test_sql_read ():
5615
5621
sql_db = sqlite3.connect(' test.sql' )
5616
5622
pd.read_sql_query(" select * from test_table" , sql_db)
5617
5623
sql_db.close()
5618
5624
5619
-
5620
5625
def test_hdf_fixed_write (df ):
5621
5626
df.to_hdf(' test_fixed.hdf' , ' test' , mode = ' w' )
5622
5627
5623
-
5624
5628
def test_hdf_fixed_read ():
5625
5629
pd.read_hdf(' test_fixed.hdf' , ' test' )
5626
5630
5627
-
5628
5631
def test_hdf_fixed_write_compress (df ):
5629
5632
df.to_hdf(' test_fixed_compress.hdf' , ' test' , mode = ' w' , complib = ' blosc' )
5630
5633
5631
-
5632
5634
def test_hdf_fixed_read_compress ():
5633
5635
pd.read_hdf(' test_fixed_compress.hdf' , ' test' )
5634
5636
5635
-
5636
5637
def test_hdf_table_write (df ):
5637
5638
df.to_hdf(' test_table.hdf' , ' test' , mode = ' w' , format = ' table' )
5638
5639
5639
-
5640
5640
def test_hdf_table_read ():
5641
5641
pd.read_hdf(' test_table.hdf' , ' test' )
5642
5642
5643
-
5644
5643
def test_hdf_table_write_compress (df ):
5645
5644
df.to_hdf(' test_table_compress.hdf' , ' test' , mode = ' w' ,
5646
5645
complib = ' blosc' , format = ' table' )
5647
5646
5648
-
5649
5647
def test_hdf_table_read_compress ():
5650
5648
pd.read_hdf(' test_table_compress.hdf' , ' test' )
5651
5649
5652
-
5653
5650
def test_csv_write (df ):
5654
5651
df.to_csv(' test.csv' , mode = ' w' )
5655
5652
5656
-
5657
5653
def test_csv_read ():
5658
5654
pd.read_csv(' test.csv' , index_col = 0 )
5659
5655
5660
-
5661
5656
def test_feather_write (df ):
5662
5657
df.to_feather(' test.feather' )
5663
5658
5664
-
5665
5659
def test_feather_read ():
5666
5660
pd.read_feather(' test.feather' )
5667
5661
5668
-
5669
5662
def test_pickle_write (df ):
5670
5663
df.to_pickle(' test.pkl' )
5671
5664
5672
-
5673
5665
def test_pickle_read ():
5674
5666
pd.read_pickle(' test.pkl' )
5675
5667
5676
-
5677
5668
def test_pickle_write_compress (df ):
5678
5669
df.to_pickle(' test.pkl.compress' , compression = ' xz' )
5679
5670
5680
-
5681
5671
def test_pickle_read_compress ():
5682
5672
pd.read_pickle(' test.pkl.compress' , compression = ' xz' )
5683
5673
5684
- When writing, the top-three functions in terms of speed are are
5685
- ``test_pickle_write ``, ``test_feather_write `` and ``test_hdf_fixed_write_compress ``.
5674
+ def test_parquet_write (df ):
5675
+ df.to_parquet(' test.parquet' )
5676
+
5677
+ def test_parquet_read ():
5678
+ pd.read_parquet(' test.parquet' )
5679
+
5680
+ When writing, the top three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``.
5686
5681
5687
5682
.. code-block:: ipython
5688
5683
5689
- In [14 ]: %timeit test_sql_write(df)
5690
- 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5684
+ In [4 ]: %timeit test_sql_write(df)
5685
+ 3.29 s ± 43.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5691
5686
5692
- In [15 ]: %timeit test_hdf_fixed_write(df)
5693
- 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5687
+ In [5 ]: %timeit test_hdf_fixed_write(df)
5688
+ 19.4 ms ± 560 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
5694
5689
5695
- In [26 ]: %timeit test_hdf_fixed_write_compress(df)
5696
- 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5690
+ In [6 ]: %timeit test_hdf_fixed_write_compress(df)
5691
+ 19.6 ms ± 308 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5697
5692
5698
- In [16 ]: %timeit test_hdf_table_write(df)
5699
- 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5693
+ In [7 ]: %timeit test_hdf_table_write(df)
5694
+ 449 ms ± 5.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5700
5695
5701
- In [27 ]: %timeit test_hdf_table_write_compress(df)
5702
- 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5696
+ In [8 ]: %timeit test_hdf_table_write_compress(df)
5697
+ 448 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5703
5698
5704
- In [17 ]: %timeit test_csv_write(df)
5705
- 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5699
+ In [9 ]: %timeit test_csv_write(df)
5700
+ 3.66 s ± 26.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5706
5701
5707
- In [30 ]: %timeit test_feather_write(df)
5708
- 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5702
+ In [10 ]: %timeit test_feather_write(df)
5703
+ 9.75 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5709
5704
5710
- In [31 ]: %timeit test_pickle_write(df)
5711
- 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5705
+ In [11 ]: %timeit test_pickle_write(df)
5706
+ 30.1 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5712
5707
5713
- In [32]: %timeit test_pickle_write_compress(df)
5714
- 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5708
+ In [12]: %timeit test_pickle_write_compress(df)
5709
+ 4.29 s ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5710
+
5711
+ In [13]: %timeit test_parquet_write(df)
5712
+ 67.6 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5715
5713
5716
5714
When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and
5717
5715
``test_hdf_fixed_read``.
5718
5716
5717
+
5719
5718
.. code-block:: ipython
5720
5719
5721
- In [18]: %timeit test_sql_read()
5722
- 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5720
+ In [14]: %timeit test_sql_read()
5721
+ 1.77 s ± 17.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5722
+
5723
+ In [15]: %timeit test_hdf_fixed_read()
5724
+ 19.4 ms ± 436 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5725
+
5726
+ In [16]: %timeit test_hdf_fixed_read_compress()
5727
+ 19.5 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5723
5728
5724
- In [19 ]: %timeit test_hdf_fixed_read ()
5725
- 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5729
+ In [17 ]: %timeit test_hdf_table_read ()
5730
+ 38.6 ms ± 857 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5726
5731
5727
- In [28 ]: %timeit test_hdf_fixed_read_compress ()
5728
- 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5732
+ In [18 ]: %timeit test_hdf_table_read_compress ()
5733
+ 38.8 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5729
5734
5730
- In [20 ]: %timeit test_hdf_table_read ()
5731
- 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5735
+ In [19 ]: %timeit test_csv_read ()
5736
+ 452 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5732
5737
5733
- In [29 ]: %timeit test_hdf_table_read_compress ()
5734
- 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5738
+ In [20 ]: %timeit test_feather_read ()
5739
+ 12.4 ms ± 99.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5735
5740
5736
- In [22 ]: %timeit test_csv_read ()
5737
- 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5741
+ In [21 ]: %timeit test_pickle_read ()
5742
+ 18.4 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5738
5743
5739
- In [33 ]: %timeit test_feather_read ()
5740
- 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5744
+ In [22 ]: %timeit test_pickle_read_compress ()
5745
+ 915 ms ± 7.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5741
5746
5742
- In [34 ]: %timeit test_pickle_read ()
5743
- 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5747
+ In [23 ]: %timeit test_parquet_read ()
5748
+ 24.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5744
5749
5745
- In [35]: %timeit test_pickle_read_compress()
5746
- 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5747
5750
5751
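+ For this test case, ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk. Note that the ``*_compress.hdf`` files are the same size as their uncompressed counterparts, which suggests that passing ``complib`` without a ``complevel`` did not actually enable compression.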
5748
5752
Space on disk (in bytes)
5749
5753
5750
5754
.. code-block:: none
5751
5755
5752
- 34816000 Aug 21 18:00 test.sql
5753
- 24009240 Aug 21 18:00 test_fixed.hdf
5754
- 7919610 Aug 21 18:00 test_fixed_compress.hdf
5755
- 24458892 Aug 21 18:00 test_table.hdf
5756
- 8657116 Aug 21 18:00 test_table_compress.hdf
5757
- 28520770 Aug 21 18:00 test.csv
5758
- 16000248 Aug 21 18:00 test.feather
5759
- 16000848 Aug 21 18:00 test.pkl
5760
- 7554108 Aug 21 18:00 test.pkl.compress
5756
+ 29519500 Oct 10 06:45 test.csv
5757
+ 16000248 Oct 10 06:45 test.feather
5758
+ 8281983 Oct 10 06:49 test.parquet
5759
+ 16000857 Oct 10 06:47 test.pkl
5760
+ 7552144 Oct 10 06:48 test.pkl.compress
5761
+ 34816000 Oct 10 06:42 test.sql
5762
+ 24009288 Oct 10 06:43 test_fixed.hdf
5763
+ 24009288 Oct 10 06:43 test_fixed_compress.hdf
5764
+ 24458940 Oct 10 06:44 test_table.hdf
5765
+ 24458940 Oct 10 06:44 test_table_compress.hdf
5766
+
5767
+
5768
+
0 commit comments