
Commit 107057d

Merge pull request #6983 from jreback/pickle_perf
PERF: improved performance of compatible pickles (GH6899)
2 parents: 951f055 + 2cafb82

4 files changed: +38 additions, -14 deletions


doc/source/release.rst (+1)

@@ -292,6 +292,7 @@ Improvements to existing features
   specified (:issue:`6607`)
 - ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`)
 - ``pivot_table`` can now accept ``Grouper`` by ``index`` and ``columns`` keywords (:issue:`6913`)
+- Improved performance of compatible pickles (:issue:`6899`)

 .. _release.bug_fixes-0.14.0:

doc/source/v0.14.0.txt (+2)

@@ -522,6 +522,8 @@ Performance
   (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`)
 - Improve performance of ``CustomBusinessDay`` (:issue:`6584`)
 - improve performance of slice indexing on Series with string keys (:issue:`6341`, :issue:`6372`)
+- Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`)
+- Improved performance of compatible pickles (:issue:`6899`)

 Experimental
 ~~~~~~~~~~~~

pandas/io/pickle.py (+15, -3)

@@ -34,16 +34,28 @@ def read_pickle(path):
     """

     def try_read(path, encoding=None):
+        # try with cPickle
         # try with current pickle, if we have a Type Error then
         # try with the compat pickle to handle subclass changes
         # pass encoding only if its not None as py2 doesn't handle
         # the param
+
+        # cpickle
+        # GH 6899
         try:
             with open(path, 'rb') as fh:
-                return pc.load(fh, encoding=encoding, compat=False)
+                return pkl.load(fh)
         except:
-            with open(path, 'rb') as fh:
-                return pc.load(fh, encoding=encoding, compat=True)
+
+            # reg/patched pickle
+            try:
+                with open(path, 'rb') as fh:
+                    return pc.load(fh, encoding=encoding, compat=False)
+
+            # compat pickle
+            except:
+                with open(path, 'rb') as fh:
+                    return pc.load(fh, encoding=encoding, compat=True)

     try:
         return try_read(path)
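
The pandas/io/pickle.py change is a fast-path/fallback pattern: try the stock pickle module first, and only fall back to pandas' compatibility loader (pc.load) when that raises. A minimal standalone sketch of the same idea, where load_fast_then_compat and the compat_load callable are hypothetical stand-ins for the internal helpers, not the pandas API:

# Minimal sketch of the fast-path/fallback idea (not the pandas API itself).
# `compat_load` is a hypothetical stand-in for pandas' internal compat
# unpickler (pc.load in the diff above).
import pickle as pkl

def load_fast_then_compat(path, compat_load, encoding=None):
    # Fast path: a pickle written by a compatible pandas version loads
    # with the stock unpickler, skipping the compat machinery entirely.
    try:
        with open(path, 'rb') as fh:
            return pkl.load(fh)
    except Exception:
        pass
    # Slow path: re-open the file and let the compat loader remap classes
    # that were renamed or moved in older pandas versions.
    with open(path, 'rb') as fh:
        return compat_load(fh, encoding=encoding)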

vb_suite/packers.py (+20, -11)

@@ -7,6 +7,7 @@
 import os
 import pandas as pd
 from pandas.core import common as com
+from random import randrange

 f = '__test__.msg'
 def remove(f):
@@ -15,40 +16,48 @@ def remove(f):
     except:
         pass

-index = date_range('20000101',periods=50000,freq='H')
-df = DataFrame({'float1' : randn(50000),
-                'float2' : randn(50000)},
+N=100000
+C=5
+index = date_range('20000101',periods=N,freq='H')
+df = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
               index=index)
+
+N=100000
+C=5
+index = date_range('20000101',periods=N,freq='H')
+df2 = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
+               index=index)
+df2['object'] = ['%08x'%randrange(16**8) for _ in range(N)]
 remove(f)
 """

 #----------------------------------------------------------------------
 # msgpack

 setup = common_setup + """
-df.to_msgpack(f)
+df2.to_msgpack(f)
 """

 packers_read_pack = Benchmark("pd.read_msgpack(f)", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_pack = Benchmark("df.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_pack = Benchmark("df2.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # pickle

 setup = common_setup + """
-df.to_pickle(f)
+df2.to_pickle(f)
 """

 packers_read_pickle = Benchmark("pd.read_pickle(f)", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_pickle = Benchmark("df.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_pickle = Benchmark("df2.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # csv
@@ -68,29 +77,29 @@ def remove(f):
 # hdf store

 setup = common_setup + """
-df.to_hdf(f,'df')
+df2.to_hdf(f,'df')
 """

 packers_read_hdf_store = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_hdf_store = Benchmark("df.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_hdf_store = Benchmark("df2.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # hdf table

 setup = common_setup + """
-df.to_hdf(f,'df',table=True)
+df2.to_hdf(f,'df',table=True)
 """

 packers_read_hdf_table = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date)

 setup = common_setup + """
 """

-packers_write_hdf_table = Benchmark("df.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_hdf_table = Benchmark("df2.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date)

 #----------------------------------------------------------------------
 # json
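
The benchmark frame gains an object column (df2) so the pickle, msgpack, and HDF timings cover mixed dtypes rather than floats only. A rough standalone timing sketch along the same lines, not part of the vbench suite; the file name and repeat count are arbitrary, and it assumes a recent Python 3 with pandas and NumPy installed:

# Build a frame shaped like df2 above and time pandas.read_pickle on it.
import timeit
from random import randrange

import numpy as np
import pandas as pd

N, C = 100000, 5
index = pd.date_range('20000101', periods=N, freq='H')
df2 = pd.DataFrame({"float{0}".format(i): np.random.randn(N) for i in range(C)},
                   index=index)
df2['object'] = ['%08x' % randrange(16 ** 8) for _ in range(N)]

df2.to_pickle('__timing__.pkl')
# 10 repeats of a full read; prints total seconds for the loop.
print(timeit.timeit("pd.read_pickle('__timing__.pkl')",
                    globals=globals(), number=10))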
