Skip to content

Commit 7d37ab8

Browse files
authored
REGR: read_pickle fallback to encoding=latin_1 upon a UnicodeDecodeError (pandas-dev#32055)
When reading a pickle with MultiIndex columns generated in py27, `pickle_compat.load()` with `encoding=None` would throw a UnicodeDecodeError. Now, `read_pickle` catches that exception and falls back to using `latin-1` explicitly.
1 parent c05ef6f commit 7d37ab8

File tree

4 files changed

+27
-16
lines changed

4 files changed

+27
-16
lines changed

doc/source/whatsnew/v1.0.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
2020
- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
2121
- Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.Rolling.corr>` when using a time offset (:issue:`31789`)
22+
- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` columns (:issue:`31988`).
2223
- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
2324
-
2425

pandas/io/pickle.py

+13-12
Original file line numberDiff line numberDiff line change
@@ -171,21 +171,22 @@ def read_pickle(
171171

172172
# 1) try standard library Pickle
173173
# 2) try pickle_compat (older pandas version) to handle subclass changes
174-
175-
excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError)
174+
# 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError
176175

177176
try:
178-
with warnings.catch_warnings(record=True):
179-
# We want to silence any warnings about, e.g. moved modules.
180-
warnings.simplefilter("ignore", Warning)
181-
return pickle.load(f)
182-
except excs_to_catch:
183-
# e.g.
184-
# "No module named 'pandas.core.sparse.series'"
185-
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
186-
return pc.load(f, encoding=None)
177+
excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError)
178+
try:
179+
with warnings.catch_warnings(record=True):
180+
# We want to silence any warnings about, e.g. moved modules.
181+
warnings.simplefilter("ignore", Warning)
182+
return pickle.load(f)
183+
except excs_to_catch:
184+
# e.g.
185+
# "No module named 'pandas.core.sparse.series'"
186+
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
187+
return pc.load(f, encoding=None)
187188
except UnicodeDecodeError:
188-
# e.g. can occur for files written in py27; see GH#28645
189+
# e.g. can occur for files written in py27; see GH#28645 and GH#31988
189190
return pc.load(f, encoding="latin-1")
190191
finally:
191192
f.close()
1.36 KB
Binary file not shown.

pandas/tests/io/test_pickle.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -382,14 +382,23 @@ def test_read(self, protocol, get_random_path):
382382
tm.assert_frame_equal(df, df2)
383383

384384

385-
def test_unicode_decode_error(datapath):
385+
@pytest.mark.parametrize(
386+
["pickle_file", "excols"],
387+
[
388+
("test_py27.pkl", pd.Index(["a", "b", "c"])),
389+
(
390+
"test_mi_py27.pkl",
391+
pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]),
392+
),
393+
],
394+
)
395+
def test_unicode_decode_error(datapath, pickle_file, excols):
386396
# pickle file written with py27, should be readable without raising
387-
# UnicodeDecodeError, see GH#28645
388-
path = datapath("io", "data", "pickle", "test_py27.pkl")
397+
# UnicodeDecodeError, see GH#28645 and GH#31988
398+
path = datapath("io", "data", "pickle", pickle_file)
389399
df = pd.read_pickle(path)
390400

391401
# just test the columns are correct since the values are random
392-
excols = pd.Index(["a", "b", "c"])
393402
tm.assert_index_equal(df.columns, excols)
394403

395404

0 commit comments

Comments
 (0)