@@ -5159,85 +5159,112 @@ easy conversion to and from pandas.
5159
5159
Performance Considerations
5160
5160
--------------------------
5161
5161
5162
- This is an informal comparison of various IO methods, using pandas 0.13.1.
5162
+ This is an informal comparison of various IO methods, using pandas
5163
+ 0.20.3. Timings are machine dependent and small differences should be
5164
+ ignored.
5163
5165
5164
5166
.. code-block:: ipython
5165
5167
5166
- In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB'))
5168
+ In [1]: sz = 1000000
5169
+ In [2]: df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
5167
5170
5168
- In [2]: df.info()
5171
+ In [3]: df.info()
5169
5172
<class 'pandas.core.frame.DataFrame'>
5170
- Int64Index: 1000000 entries, 0 to 999999
5173
+ RangeIndex: 1000000 entries, 0 to 999999
5171
5174
Data columns (total 2 columns):
5172
5175
A 1000000 non-null float64
5173
- B 1000000 non-null float64
5174
- dtypes: float64(2)
5175
- memory usage: 22.9 MB
5176
+ B 1000000 non-null int64
5177
+ dtypes: float64(1), int64(1)
5178
+ memory usage: 15.3 MB
5176
5179
5177
5180
Writing
5178
5181
5179
5182
.. code-block:: ipython
5180
5183
5181
5184
In [14]: %timeit test_sql_write(df)
5182
- 1 loops, best of 3: 6.24 s per loop
5185
+ 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5183
5186
5184
5187
In [15]: %timeit test_hdf_fixed_write(df)
5185
- 1 loops, best of 3: 237 ms per loop
5188
+ 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5186
5189
5187
5190
In [26]: %timeit test_hdf_fixed_write_compress(df)
5188
- 1 loops, best of 3: 245 ms per loop
5191
+ 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5189
5192
5190
5193
In [16]: %timeit test_hdf_table_write(df)
5191
- 1 loops, best of 3: 901 ms per loop
5194
+ 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5192
5195
5193
5196
In [27]: %timeit test_hdf_table_write_compress(df)
5194
- 1 loops, best of 3: 952 ms per loop
5197
+ 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5195
5198
5196
5199
In [17]: %timeit test_csv_write(df)
5197
- 1 loops, best of 3: 3.44 s per loop
5200
+ 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5201
+
5202
+ In [30]: %timeit test_feather_write(df)
5203
+ 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5204
+
5205
+ In [31]: %timeit test_pickle_write(df)
5206
+ 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5207
+
5208
+ In [32]: %timeit test_pickle_write_compress(df)
5209
+ 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5198
5210
5199
5211
Reading
5200
5212
5201
5213
.. code-block:: ipython
5202
5214
5203
5215
In [18]: %timeit test_sql_read()
5204
- 1 loops, best of 3: 766 ms per loop
5216
+ 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5205
5217
5206
5218
In [19]: %timeit test_hdf_fixed_read()
5207
- 10 loops, best of 3: 19.1 ms per loop
5219
+ 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5208
5220
5209
5221
In [28]: %timeit test_hdf_fixed_read_compress()
5210
- 10 loops, best of 3: 36.3 ms per loop
5222
+ 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5211
5223
5212
5224
In [20]: %timeit test_hdf_table_read()
5213
- 10 loops, best of 3: 39 ms per loop
5225
+ 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5214
5226
5215
5227
In [29]: %timeit test_hdf_table_read_compress()
5216
- 10 loops, best of 3: 60.6 ms per loop
5228
+ 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5217
5229
5218
5230
In [22]: %timeit test_csv_read()
5219
- 1 loops, best of 3: 620 ms per loop
5231
+ 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5232
+
5233
+ In [33]: %timeit test_feather_read()
5234
+ 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5235
+
5236
+ In [34]: %timeit test_pickle_read()
5237
+ 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5238
+
5239
+ In [35]: %timeit test_pickle_read_compress()
5240
+ 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5220
5241
5221
5242
Space on disk (in bytes)
5222
5243
5223
5244
.. code-block:: none
5224
5245
5225
- 25843712 Apr 8 14:11 test.sql
5226
- 24007368 Apr 8 14:11 test_fixed.hdf
5227
- 15580682 Apr 8 14:11 test_fixed_compress.hdf
5228
- 24458444 Apr 8 14:11 test_table.hdf
5229
- 16797283 Apr 8 14:11 test_table_compress.hdf
5230
- 46152810 Apr 8 14:11 test.csv
5246
+ 34816000 Aug 21 18:00 test.sql
5247
+ 24009240 Aug 21 18:00 test_fixed.hdf
5248
+ 7919610 Aug 21 18:00 test_fixed_compress.hdf
5249
+ 24458892 Aug 21 18:00 test_table.hdf
5250
+ 8657116 Aug 21 18:00 test_table_compress.hdf
5251
+ 28520770 Aug 21 18:00 test.csv
5252
+ 16000248 Aug 21 18:00 test.feather
5253
+ 16000848 Aug 21 18:00 test.pkl
5254
+ 7554108 Aug 21 18:00 test.pkl.compress
5231
5255
5232
5256
And here's the code
5233
5257
5234
5258
.. code-block:: python
5235
5259
5236
- import sqlite3
5237
5260
import os
5261
+ import pandas as pd
5262
+ import sqlite3
5263
+ from numpy.random import randn
5238
5264
from pandas.io import sql
5239
5265
5240
- df = pd.DataFrame(randn(1000000,2),columns=list('AB'))
5266
+ sz = 1000000
5267
+ df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
5241
5268
5242
5269
def test_sql_write(df):
5243
5270
if os.path.exists('test.sql'):
@@ -5280,3 +5307,21 @@ And here's the code
5280
5307
5281
5308
def test_csv_read():
5282
5309
pd.read_csv('test.csv',index_col=0)
5310
+
5311
+ def test_feather_write(df):
5312
+ df.to_feather('test.feather')
5313
+
5314
+ def test_feather_read():
5315
+ pd.read_feather('test.feather')
5316
+
5317
+ def test_pickle_write(df):
5318
+ df.to_pickle('test.pkl')
5319
+
5320
+ def test_pickle_read():
5321
+ pd.read_pickle('test.pkl')
5322
+
5323
+ def test_pickle_write_compress(df):
5324
+ df.to_pickle('test.pkl.compress', compression='xz')
5325
+
5326
+ def test_pickle_read_compress():
5327
+ pd.read_pickle('test.pkl.compress', compression='xz')
0 commit comments