
Commit 107057d

Merge pull request #6983 from jreback/pickle_perf
PERF: improved performance of compatible pickles (GH6899)
2 parents: 951f055 + 2cafb82

4 files changed: +38 additions, -14 deletions


doc/source/release.rst (+1)

@@ -292,6 +292,7 @@ Improvements to existing features
   specified (:issue:`6607`)
 - ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`)
 - ``pivot_table`` can now accept ``Grouper`` by ``index`` and ``columns`` keywords (:issue:`6913`)
+- Improved performance of compatible pickles (:issue:`6899`)

 .. _release.bug_fixes-0.14.0:

doc/source/v0.14.0.txt (+2)

@@ -522,6 +522,8 @@ Performance
   (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`)
 - Improve performance of ``CustomBusinessDay`` (:issue:`6584`)
 - improve performance of slice indexing on Series with string keys (:issue:`6341`, :issue:`6372`)
+- Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`)
+- Improved performance of compatible pickles (:issue:`6899`)

 Experimental
 ~~~~~~~~~~~~

pandas/io/pickle.py (+15, -3)

@@ -34,16 +34,28 @@ def read_pickle(path):
     """

     def try_read(path, encoding=None):
+        # try with cPickle
         # try with current pickle, if we have a Type Error then
         # try with the compat pickle to handle subclass changes
         # pass encoding only if its not None as py2 doesn't handle
         # the param
+
+        # cpickle
+        # GH 6899
         try:
             with open(path, 'rb') as fh:
-                return pc.load(fh, encoding=encoding, compat=False)
+                return pkl.load(fh)
         except:
-            with open(path, 'rb') as fh:
-                return pc.load(fh, encoding=encoding, compat=True)
+
+            # reg/patched pickle
+            try:
+                with open(path, 'rb') as fh:
+                    return pc.load(fh, encoding=encoding, compat=False)
+
+            # compat pickle
+            except:
+                with open(path, 'rb') as fh:
+                    return pc.load(fh, encoding=encoding, compat=True)

     try:
         return try_read(path)
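
The pandas/io/pickle.py change is a fast-path/fallback pattern: try the stock pickle module first, and only fall back to pandas' compatibility loader (pc.load) when that raises. A minimal standalone sketch of the same idea, where load_fast_then_compat and the compat_load callable are hypothetical stand-ins for the internal helpers, not the pandas API:

# Minimal sketch of the fast-path/fallback idea (not the pandas API itself).
# `compat_load` is a hypothetical stand-in for pandas' internal compat
# unpickler (pc.load in the diff above).
import pickle as pkl

def load_fast_then_compat(path, compat_load, encoding=None):
    # Fast path: a pickle written by a compatible pandas version loads
    # with the stock unpickler, skipping the compat machinery entirely.
    try:
        with open(path, 'rb') as fh:
            return pkl.load(fh)
    except Exception:
        pass
    # Slow path: re-open the file and let the compat loader remap classes
    # that were renamed or moved in older pandas versions.
    with open(path, 'rb') as fh:
        return compat_load(fh, encoding=encoding)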

vb_suite/packers.py (+20, -11)

@@ -7,6 +7,7 @@
 import os
 import pandas as pd
 from pandas.core import common as com
+from random import randrange

 f = '__test__.msg'
 def remove(f):
@@ -15,40 +16,48 @@ def remove(f):
     except:
         pass

-index = date_range('20000101',periods=50000,freq='H')
-df = DataFrame({'float1' : randn(50000),
-                'float2' : randn(50000)},
+N=100000
+C=5
+index = date_range('20000101',periods=N,freq='H')
+df = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
               index=index)
+
+N=100000
+C=5
+index = date_range('20000101',periods=N,freq='H')
+df2 = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
+               index=index)
+df2['object'] = ['%08x'%randrange(16**8) for _ in range(N)]
 remove(f)
 """

 #----------------------------------------------------------------------
 # msgpack

 setup = common_setup + """
-df.to_msgpack(f)
+df2.to_msgpack(f)
 """

 packers_read_pack = Benchmark("pd.read_msgpack(f)", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_pack = Benchmark("df.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_pack = Benchmark("df2.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # pickle

 setup = common_setup + """
-df.to_pickle(f)
+df2.to_pickle(f)
 """

 packers_read_pickle = Benchmark("pd.read_pickle(f)", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_pickle = Benchmark("df.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_pickle = Benchmark("df2.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # csv
@@ -68,29 +77,29 @@ def remove(f):
 # hdf store

 setup = common_setup + """
-df.to_hdf(f,'df')
+df2.to_hdf(f,'df')
 """

 packers_read_hdf_store = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_hdf_store = Benchmark("df.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_hdf_store = Benchmark("df2.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # hdf table

 setup = common_setup + """
-df.to_hdf(f,'df',table=True)
+df2.to_hdf(f,'df',table=True)
 """

 packers_read_hdf_table = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_hdf_table = Benchmark("df.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_hdf_table = Benchmark("df2.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # json
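
The benchmark frame gains an object column (df2) so the pickle, msgpack, and HDF timings cover mixed dtypes rather than floats only. A rough standalone timing sketch along the same lines, not part of the vbench suite; the file name and repeat count are arbitrary, and it assumes a recent Python 3 with pandas and NumPy installed:

# Build a frame shaped like df2 above and time pandas.read_pickle on it.
import timeit
from random import randrange

import numpy as np
import pandas as pd

N, C = 100000, 5
index = pd.date_range('20000101', periods=N, freq='H')
df2 = pd.DataFrame({"float{0}".format(i): np.random.randn(N) for i in range(C)},
                   index=index)
df2['object'] = ['%08x' % randrange(16 ** 8) for _ in range(N)]

df2.to_pickle('__timing__.pkl')
# 10 repeats of a full read; prints total seconds for the loop.
print(timeit.timeit("pd.read_pickle('__timing__.pkl')",
                    globals=globals(), number=10))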
