Skip to content

Commit 560ec0c

Browse files
authored
CLN: Python 2 pickle/hdf support (#57387)
1 parent 2b3037a commit 560ec0c

16 files changed

+41
-191
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ Other API changes
8888
^^^^^^^^^^^^^^^^^
8989
- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
9090
- :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now return a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`)
91+
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
9192
-
9293

9394
.. ---------------------------------------------------------------------------

pandas/io/pickle.py

+10-15
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def read_pickle(
184184
3 3 8
185185
4 4 9
186186
"""
187+
# TypeError for Cython complaints about object.__new__ vs Tick.__new__
187188
excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
188189
with get_handle(
189190
filepath_or_buffer,
@@ -194,20 +195,14 @@ def read_pickle(
194195
) as handles:
195196
# 1) try standard library Pickle
196197
# 2) try pickle_compat (older pandas version) to handle subclass changes
197-
# 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError
198198

199199
try:
200-
# TypeError for Cython complaints about object.__new__ vs Tick.__new__
201-
try:
202-
with warnings.catch_warnings(record=True):
203-
# We want to silence any warnings about, e.g. moved modules.
204-
warnings.simplefilter("ignore", Warning)
205-
return pickle.load(handles.handle)
206-
except excs_to_catch:
207-
# e.g.
208-
# "No module named 'pandas.core.sparse.series'"
209-
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
210-
return pc.load(handles.handle, encoding=None)
211-
except UnicodeDecodeError:
212-
# e.g. can occur for files written in py27; see GH#28645 and GH#31988
213-
return pc.load(handles.handle, encoding="latin-1")
200+
with warnings.catch_warnings(record=True):
201+
# We want to silence any warnings about, e.g. moved modules.
202+
warnings.simplefilter("ignore", Warning)
203+
return pickle.load(handles.handle)
204+
except excs_to_catch:
205+
# e.g.
206+
# "No module named 'pandas.core.sparse.series'"
207+
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
208+
return pc.load(handles.handle, encoding=None)

pandas/io/pytables.py

+29-46
Original file line numberDiff line numberDiff line change
@@ -132,13 +132,6 @@
132132
_default_encoding = "UTF-8"
133133

134134

135-
def _ensure_decoded(s):
136-
"""if we have bytes, decode them to unicode"""
137-
if isinstance(s, np.bytes_):
138-
s = s.decode("UTF-8")
139-
return s
140-
141-
142135
def _ensure_encoding(encoding: str | None) -> str:
143136
# set the encoding if we need
144137
if encoding is None:
@@ -1730,8 +1723,8 @@ def _create_storer(
17301723
if value is not None and not isinstance(value, (Series, DataFrame)):
17311724
raise TypeError("value must be None, Series, or DataFrame")
17321725

1733-
pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1734-
tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1726+
pt = getattr(group._v_attrs, "pandas_type", None)
1727+
tt = getattr(group._v_attrs, "table_type", None)
17351728

17361729
# infer the pt from the passed value
17371730
if pt is None:
@@ -1798,7 +1791,7 @@ def _create_storer(
17981791
"worm": WORMTable,
17991792
}
18001793
try:
1801-
cls = _TABLE_MAP[tt]
1794+
cls = _TABLE_MAP[tt] # type: ignore[index]
18021795
except KeyError as err:
18031796
raise TypeError(
18041797
f"cannot properly create the storer for: [_TABLE_MAP] [group->"
@@ -2145,13 +2138,13 @@ def convert(
21452138
# preventing the original recarray from being freed
21462139
values = values[self.cname].copy()
21472140

2148-
val_kind = _ensure_decoded(self.kind)
2141+
val_kind = self.kind
21492142
values = _maybe_convert(values, val_kind, encoding, errors)
21502143
kwargs = {}
2151-
kwargs["name"] = _ensure_decoded(self.index_name)
2144+
kwargs["name"] = self.index_name
21522145

21532146
if self.freq is not None:
2154-
kwargs["freq"] = _ensure_decoded(self.freq)
2147+
kwargs["freq"] = self.freq
21552148

21562149
factory: type[Index | DatetimeIndex] = Index
21572150
if lib.is_np_dtype(values.dtype, "M") or isinstance(
@@ -2210,7 +2203,7 @@ def maybe_set_size(self, min_itemsize=None) -> None:
22102203
min_itemsize can be an integer or a dict with this column's name
22112204
with an integer size
22122205
"""
2213-
if _ensure_decoded(self.kind) == "string":
2206+
if self.kind == "string":
22142207
if isinstance(min_itemsize, dict):
22152208
min_itemsize = min_itemsize.get(self.name)
22162209

@@ -2231,7 +2224,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
22312224
def validate_col(self, itemsize=None):
22322225
"""validate this column: return the compared against itemsize"""
22332226
# validate this column for string truncation (or reset to the max size)
2234-
if _ensure_decoded(self.kind) == "string":
2227+
if self.kind == "string":
22352228
c = self.col
22362229
if c is not None:
22372230
if itemsize is None:
@@ -2561,14 +2554,14 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
25612554
assert isinstance(converted, np.ndarray) # for mypy
25622555

25632556
# use the meta if needed
2564-
meta = _ensure_decoded(self.meta)
2557+
meta = self.meta
25652558
metadata = self.metadata
25662559
ordered = self.ordered
25672560
tz = self.tz
25682561

25692562
assert dtype_name is not None
25702563
# convert to the correct dtype
2571-
dtype = _ensure_decoded(dtype_name)
2564+
dtype = dtype_name
25722565

25732566
# reverse converts
25742567
if dtype.startswith("datetime64"):
@@ -2618,7 +2611,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
26182611
converted = converted.astype("O", copy=False)
26192612

26202613
# convert nans / decode
2621-
if _ensure_decoded(kind) == "string":
2614+
if kind == "string":
26222615
converted = _unconvert_string_array(
26232616
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
26242617
)
@@ -2706,18 +2699,19 @@ def is_old_version(self) -> bool:
27062699
@property
27072700
def version(self) -> tuple[int, int, int]:
27082701
"""compute and set our version"""
2709-
version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2710-
try:
2711-
version = tuple(int(x) for x in version.split("."))
2712-
if len(version) == 2:
2713-
version = version + (0,)
2714-
except AttributeError:
2715-
version = (0, 0, 0)
2716-
return version
2702+
version = getattr(self.group._v_attrs, "pandas_version", None)
2703+
if isinstance(version, str):
2704+
version_tup = tuple(int(x) for x in version.split("."))
2705+
if len(version_tup) == 2:
2706+
version_tup = version_tup + (0,)
2707+
assert len(version_tup) == 3 # needed for mypy
2708+
return version_tup
2709+
else:
2710+
return (0, 0, 0)
27172711

27182712
@property
27192713
def pandas_type(self):
2720-
return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2714+
return getattr(self.group._v_attrs, "pandas_type", None)
27212715

27222716
def __repr__(self) -> str:
27232717
"""return a pretty representation of myself"""
@@ -2854,9 +2848,7 @@ def _alias_to_class(self, alias):
28542848
return self._reverse_index_map.get(alias, Index)
28552849

28562850
def _get_index_factory(self, attrs):
2857-
index_class = self._alias_to_class(
2858-
_ensure_decoded(getattr(attrs, "index_class", ""))
2859-
)
2851+
index_class = self._alias_to_class(getattr(attrs, "index_class", ""))
28602852

28612853
factory: Callable
28622854

@@ -2892,12 +2884,7 @@ def f(values, freq=None, tz=None):
28922884
factory = TimedeltaIndex
28932885

28942886
if "tz" in attrs:
2895-
if isinstance(attrs["tz"], bytes):
2896-
# created by python2
2897-
kwargs["tz"] = attrs["tz"].decode("utf-8")
2898-
else:
2899-
# created by python3
2900-
kwargs["tz"] = attrs["tz"]
2887+
kwargs["tz"] = attrs["tz"]
29012888
assert index_class is DatetimeIndex # just checking
29022889

29032890
return factory, kwargs
@@ -2929,9 +2916,9 @@ def set_attrs(self) -> None:
29292916
def get_attrs(self) -> None:
29302917
"""retrieve our attributes"""
29312918
self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2932-
self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2919+
self.errors = getattr(self.attrs, "errors", "strict")
29332920
for n in self.attributes:
2934-
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2921+
setattr(self, n, getattr(self.attrs, n, None))
29352922

29362923
def write(self, obj, **kwargs) -> None:
29372924
self.set_attrs()
@@ -2948,7 +2935,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
29482935
if isinstance(node, tables.VLArray):
29492936
ret = node[0][start:stop]
29502937
else:
2951-
dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2938+
dtype = getattr(attrs, "value_type", None)
29522939
shape = getattr(attrs, "shape", None)
29532940

29542941
if shape is not None:
@@ -2973,7 +2960,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
29732960
def read_index(
29742961
self, key: str, start: int | None = None, stop: int | None = None
29752962
) -> Index:
2976-
variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2963+
variety = getattr(self.attrs, f"{key}_variety")
29772964

29782965
if variety == "multi":
29792966
return self.read_multi_index(key, start=start, stop=stop)
@@ -3063,12 +3050,11 @@ def read_index_node(
30633050
# have written a sentinel. Here we replace it with the original.
30643051
if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
30653052
data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
3066-
kind = _ensure_decoded(node._v_attrs.kind)
3053+
kind = node._v_attrs.kind
30673054
name = None
30683055

30693056
if "name" in node._v_attrs:
30703057
name = _ensure_str(node._v_attrs.name)
3071-
name = _ensure_decoded(name)
30723058

30733059
attrs = node._v_attrs
30743060
factory, kwargs = self._get_index_factory(attrs)
@@ -3584,7 +3570,7 @@ def get_attrs(self) -> None:
35843570
self.info = getattr(self.attrs, "info", None) or {}
35853571
self.nan_rep = getattr(self.attrs, "nan_rep", None)
35863572
self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3587-
self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3573+
self.errors = getattr(self.attrs, "errors", "strict")
35883574
self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
35893575
self.index_axes = [a for a in self.indexables if a.is_an_indexable]
35903576
self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
@@ -4926,7 +4912,6 @@ def _set_tz(
49264912
name = None
49274913
values = values.ravel()
49284914

4929-
tz = _ensure_decoded(tz)
49304915
values = DatetimeIndex(values, name=name)
49314916
values = values.tz_localize("UTC").tz_convert(tz)
49324917
elif coerce:
@@ -5228,8 +5213,6 @@ def _dtype_to_kind(dtype_str: str) -> str:
52285213
"""
52295214
Find the "kind" string describing the given dtype name.
52305215
"""
5231-
dtype_str = _ensure_decoded(dtype_str)
5232-
52335216
if dtype_str.startswith(("string", "bytes")):
52345217
kind = "string"
52355218
elif dtype_str.startswith("float"):
Binary file not shown.
-7 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
-1.36 KB
Binary file not shown.
-943 Bytes
Binary file not shown.

pandas/tests/io/pytables/test_read.py

-73
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._libs.tslibs import Timestamp
98
from pandas.compat import is_platform_windows
109

1110
import pandas as pd
@@ -171,50 +170,6 @@ def test_pytables_native2_read(datapath):
171170
assert isinstance(d1, DataFrame)
172171

173172

174-
def test_legacy_table_fixed_format_read_py2(datapath):
175-
# GH 24510
176-
# legacy table with fixed format written in Python 2
177-
with ensure_clean_store(
178-
datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
179-
) as store:
180-
result = store.select("df")
181-
expected = DataFrame(
182-
[[1, 2, 3, "D"]],
183-
columns=["A", "B", "C", "D"],
184-
index=Index(["ABC"], name="INDEX_NAME"),
185-
)
186-
tm.assert_frame_equal(expected, result)
187-
188-
189-
def test_legacy_table_fixed_format_read_datetime_py2(datapath):
190-
# GH 31750
191-
# legacy table with fixed format and datetime64 column written in Python 2
192-
expected = DataFrame(
193-
[[Timestamp("2020-02-06T18:00")]],
194-
columns=["A"],
195-
index=Index(["date"]),
196-
dtype="M8[ns]",
197-
)
198-
with ensure_clean_store(
199-
datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
200-
mode="r",
201-
) as store:
202-
result = store.select("df")
203-
tm.assert_frame_equal(expected, result)
204-
205-
206-
def test_legacy_table_read_py2(datapath):
207-
# issue: 24925
208-
# legacy table written in Python 2
209-
with ensure_clean_store(
210-
datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
211-
) as store:
212-
result = store.select("table")
213-
214-
expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
215-
tm.assert_frame_equal(expected, result)
216-
217-
218173
def test_read_hdf_open_store(tmp_path, setup_path):
219174
# GH10330
220175
# No check for non-string path_or_buf, and no test of open store
@@ -348,34 +303,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
348303
tm.assert_series_equal(result, series)
349304

350305

351-
@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
352-
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
353-
def test_read_py2_hdf_file_in_py3(datapath):
354-
# GH 16781
355-
356-
# tests reading a PeriodIndex DataFrame written in Python2 in Python3
357-
358-
# the file was generated in Python 2.7 like so:
359-
#
360-
# df = DataFrame([1.,2,3], index=pd.PeriodIndex(
361-
# ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
362-
# df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
363-
364-
expected = DataFrame(
365-
[1.0, 2, 3],
366-
index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
367-
)
368-
369-
with ensure_clean_store(
370-
datapath(
371-
"io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
372-
),
373-
mode="r",
374-
) as store:
375-
result = store["p"]
376-
tm.assert_frame_equal(result, expected)
377-
378-
379306
def test_read_infer_string(tmp_path, setup_path):
380307
# GH#54431
381308
pytest.importorskip("pyarrow")

0 commit comments

Comments
 (0)