Skip to content

Commit cf18b5e

Browse files
jbrockmendel authored and proost committed
CLN: Exception in pickle loading (pandas-dev#28645)
1 parent e450d77 commit cf18b5e

File tree

5 files changed

+28
-29
lines changed

5 files changed

+28
-29
lines changed

doc/source/whatsnew/v0.8.0.rst

+2 -3
Original file line number | Diff line number | Diff line change
@@ -156,16 +156,15 @@ Other new features
156156
New plotting methods
157157
~~~~~~~~~~~~~~~~~~~~
158158

159-
.. ipython:: python
160-
:suppress:
159+
.. code-block:: python
161160
162161
import pandas as pd
163162
fx = pd.read_pickle('data/fx_prices')
164163
import matplotlib.pyplot as plt
165164
166165
``Series.plot`` now supports a ``secondary_y`` option:
167166

168-
.. ipython:: python
167+
.. code-block:: python
169168
170169
plt.figure()
171170

pandas/compat/pickle_compat.py

+2 -19
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44

55
import copy
66
import pickle as pkl
7-
import sys
87
from typing import TYPE_CHECKING
98
import warnings
109

@@ -25,37 +24,21 @@ def load_reduce(self):
2524
try:
2625
stack[-1] = func(*args)
2726
return
28-
except Exception as e:
27+
except TypeError as err:
2928

3029
# If we have a deprecated function,
3130
# try to replace and try again.
3231

3332
msg = "_reconstruct: First argument must be a sub-type of ndarray"
3433

35-
if msg in str(e):
34+
if msg in str(err):
3635
try:
3736
cls = args[0]
3837
stack[-1] = object.__new__(cls)
3938
return
4039
except TypeError:
4140
pass
4241

43-
# try to re-encode the arguments
44-
if getattr(self, "encoding", None) is not None:
45-
args = tuple(
46-
arg.encode(self.encoding) if isinstance(arg, str) else arg
47-
for arg in args
48-
)
49-
try:
50-
stack[-1] = func(*args)
51-
return
52-
except TypeError:
53-
pass
54-
55-
# unknown exception, re-raise
56-
if getattr(self, "is_verbose", None):
57-
print(sys.exc_info())
58-
print(func, args)
5942
raise
6043

6144

pandas/io/pickle.py

+13 -7
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
from numpy.lib.format import read_array
77

8-
from pandas.compat import pickle_compat as pc
8+
from pandas.compat import PY36, pickle_compat as pc
99

1010
from pandas.io.common import _get_handle, _stringify_path
1111

@@ -142,18 +142,24 @@ def read_pickle(path, compression="infer"):
142142

143143
# 1) try standard library Pickle
144144
# 2) try pickle_compat (older pandas version) to handle subclass changes
145-
# 3) try pickle_compat with latin1 encoding
145+
146+
excs_to_catch = (AttributeError, ImportError)
147+
if PY36:
148+
excs_to_catch += (ModuleNotFoundError,)
146149

147150
try:
148151
with warnings.catch_warnings(record=True):
149152
# We want to silence any warnings about, e.g. moved modules.
150153
warnings.simplefilter("ignore", Warning)
151154
return pickle.load(f)
152-
except Exception:
153-
try:
154-
return pc.load(f, encoding=None)
155-
except Exception:
156-
return pc.load(f, encoding="latin1")
155+
except excs_to_catch:
156+
# e.g.
157+
# "No module named 'pandas.core.sparse.series'"
158+
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
159+
return pc.load(f, encoding=None)
160+
except UnicodeDecodeError:
161+
# e.g. can occur for files written in py27; see GH#28645
162+
return pc.load(f, encoding="latin-1")
157163
finally:
158164
f.close()
159165
for _f in fh:

pandas/tests/io/data/test_py27.pkl

943 Bytes
Binary file not shown.

pandas/tests/io/test_pickle.py

+11
Original file line number | Diff line number | Diff line change
@@ -377,3 +377,14 @@ def test_read(self, protocol, get_random_path):
377377
df.to_pickle(path, protocol=protocol)
378378
df2 = pd.read_pickle(path)
379379
tm.assert_frame_equal(df, df2)
380+
381+
382+
def test_unicode_decode_error():
383+
# pickle file written with py27, should be readable without raising
384+
# UnicodeDecodeError, see GH#28645
385+
path = os.path.join(os.path.dirname(__file__), "data", "test_py27.pkl")
386+
df = pd.read_pickle(path)
387+
388+
# just test the columns are correct since the values are random
389+
excols = pd.Index(["a", "b", "c"])
390+
tm.assert_index_equal(df.columns, excols)

0 commit comments

Comments
 (0)