From 74049cf5b58ca196e7e726339deb7d7454de7094 Mon Sep 17 00:00:00 2001 From: Pedro Reys Date: Fri, 21 Feb 2020 11:52:23 -0300 Subject: [PATCH] Backport PR #32055: REGR: read_pickle fallback to encoding=latin_1 upon a UnicodeDecodeError --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/io/pickle.py | 25 ++++++++++--------- pandas/tests/io/data/pickle/test_mi_py27.pkl | Bin 0 -> 1395 bytes pandas/tests/io/test_pickle.py | 17 ++++++++++--- 4 files changed, 27 insertions(+), 16 deletions(-) create mode 100644 pandas/tests/io/data/pickle/test_mi_py27.pkl diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index c9031ac1ae9fe..57ed6adf667c8 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) - diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index e51f24b551f31..4e731b8ecca11 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -171,21 +171,22 @@ def read_pickle( # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes - - excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) + # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError try: - with warnings.catch_warnings(record=True): - # We want to silence any warnings about, e.g. moved modules. - warnings.simplefilter("ignore", Warning) - return pickle.load(f) - except excs_to_catch: - # e.g. - # "No module named 'pandas.core.sparse.series'" - # "Can't get attribute '__nat_unpickle' on PwD7TRsEv2h?Ktee@rxtj}jd-wJa zQspX|v;t~k1mZV8_@U9n1S9GfW459J8?Eu1HYO^XXdpEYX6 ziP3#UVrvyxC$Z{9@oMLn|V&Pu?$ zDXz0}rkyhr)C9?M*7-jiO=6^|I9^*H@MqGxslCR_HOb-z6JL`Q1PB3c=Yd~EeyMwgCEb}A=6M=wq{M$qYbwD z7k-*N?>3ybM|Pk1#a(;!LgKA6pSj!S-yR$2o^T%{Soc!b<JE34+1007t#D! zQ*UtYXSapDa_Z&QbMDUB*V)O>$rEL==} z>FkD;ofl}YP-_GLagA#d_WaAY6JINZ-D Ni#f!xY|YW4{{SjH`hoxe literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3d427dde573af..7605fae945962 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -382,14 +382,23 @@ def test_read(self, protocol, get_random_path): tm.assert_frame_equal(df, df2) -def test_unicode_decode_error(datapath): +@pytest.mark.parametrize( + ["pickle_file", "excols"], + [ + ("test_py27.pkl", pd.Index(["a", "b", "c"])), + ( + "test_mi_py27.pkl", + pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), + ), + ], +) +def test_unicode_decode_error(datapath, pickle_file, excols): # pickle file written with py27, should be readable without raising - # UnicodeDecodeError, see GH#28645 - path = datapath("io", "data", "pickle", "test_py27.pkl") + # UnicodeDecodeError, see GH#28645 and GH#31988 + path = datapath("io", "data", "pickle", pickle_file) df = pd.read_pickle(path) # just test the columns are correct since the values are random - excols = pd.Index(["a", "b", "c"]) tm.assert_index_equal(df.columns, excols)