CLN: Exception in pickle loading (pandas-dev#28645)

jbrockmendel · proost · commit cf18b5e45fc5 · 2019-12-20T01:10:56.000+09:00
diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst
@@ -156,16 +156,15 @@ Other new features
 New plotting methods
 ~~~~~~~~~~~~~~~~~~~~
 
-.. ipython:: python
-   :suppress:
+.. code-block:: python
 
    import pandas as pd
    fx = pd.read_pickle('data/fx_prices')
    import matplotlib.pyplot as plt
 
 ``Series.plot`` now supports a ``secondary_y`` option:
 
-.. ipython:: python
+.. code-block:: python
 
    plt.figure()
 
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
@@ -4,7 +4,6 @@
 
 import copy
 import pickle as pkl
-import sys
 from typing import TYPE_CHECKING
 import warnings
 
@@ -25,37 +24,21 @@ def load_reduce(self):
     try:
         stack[-1] = func(*args)
         return
-    except Exception as e:
+    except TypeError as err:
 
         # If we have a deprecated function,
         # try to replace and try again.
 
         msg = "_reconstruct: First argument must be a sub-type of ndarray"
 
-        if msg in str(e):
+        if msg in str(err):
             try:
                 cls = args[0]
                 stack[-1] = object.__new__(cls)
                 return
             except TypeError:
                 pass
 
-        # try to re-encode the arguments
-        if getattr(self, "encoding", None) is not None:
-            args = tuple(
-                arg.encode(self.encoding) if isinstance(arg, str) else arg
-                for arg in args
-            )
-            try:
-                stack[-1] = func(*args)
-                return
-            except TypeError:
-                pass
-
-        # unknown exception, re-raise
-        if getattr(self, "is_verbose", None):
-            print(sys.exc_info())
-            print(func, args)
         raise
 
 
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -5,7 +5,7 @@
 
 from numpy.lib.format import read_array
 
-from pandas.compat import pickle_compat as pc
+from pandas.compat import PY36, pickle_compat as pc
 
 from pandas.io.common import _get_handle, _stringify_path
 
@@ -142,18 +142,24 @@ def read_pickle(path, compression="infer"):
 
     # 1) try standard library Pickle
     # 2) try pickle_compat (older pandas version) to handle subclass changes
-    # 3) try pickle_compat with latin1 encoding
+
+    excs_to_catch = (AttributeError, ImportError)
+    if PY36:
+        excs_to_catch += (ModuleNotFoundError,)
 
     try:
         with warnings.catch_warnings(record=True):
             # We want to silence any warnings about, e.g. moved modules.
             warnings.simplefilter("ignore", Warning)
             return pickle.load(f)
-    except Exception:
-        try:
-            return pc.load(f, encoding=None)
-        except Exception:
-            return pc.load(f, encoding="latin1")
+    except excs_to_catch:
+        # e.g.
+        #  "No module named 'pandas.core.sparse.series'"
+        #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
+        return pc.load(f, encoding=None)
+    except UnicodeDecodeError:
+        # e.g. can occur for files written in py27; see GH#28645
+        return pc.load(f, encoding="latin-1")
     finally:
         f.close()
         for _f in fh:
diff --git a/pandas/tests/io/data/test_py27.pkl b/pandas/tests/io/data/test_py27.pkl
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
@@ -377,3 +377,14 @@ def test_read(self, protocol, get_random_path):
             df.to_pickle(path, protocol=protocol)
             df2 = pd.read_pickle(path)
             tm.assert_frame_equal(df, df2)
+
+
+def test_unicode_decode_error():
+    # Regression test for GH#28645: a pickle file written with py27 should be
+    #  readable by read_pickle without raising UnicodeDecodeError.
+    path = os.path.join(os.path.dirname(__file__), "data", "test_py27.pkl")
+    df = pd.read_pickle(path)
+
+    # Only the column labels are checked, since the stored values are random.
+    excols = pd.Index(["a", "b", "c"])
+    tm.assert_index_equal(df.columns, excols)