Skip to content

Commit 1691a51

Browse files
Backport PR #55894 on 2.1.x: Parquet/Feather IO: disable PyExtensionType autoload (#55900)
Parquet/Feather IO: disable PyExtensionType autoload (#55894)

* Parquet/Feather IO: disable PyExtensionType autoload
* don't install hotfix for pyarrow >= 14.0.1
* move patching to extension type definitions
* expand error message
* fix compat for pyarrow not installed
* add whatsnew

(cherry picked from commit 851fea0)
1 parent 4158666 commit 1691a51

File tree

5 files changed

+69
-1
lines changed

5 files changed

+69
-1
lines changed

doc/source/whatsnew/v2.1.3.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ Fixed regressions
2222
Bug fixes
2323
~~~~~~~~~
2424
- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`)
25-
-
25+
- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`)
26+
- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 <https://www.cve.org/CVERecord?id=CVE-2023-47248>`__ (:issue:`55894`)
2627

2728
.. ---------------------------------------------------------------------------
2829
.. _whatsnew_213.other:

pandas/compat/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
pa_version_under11p0,
3232
pa_version_under13p0,
3333
pa_version_under14p0,
34+
pa_version_under14p1,
3435
)
3536

3637
if TYPE_CHECKING:
@@ -188,6 +189,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
188189
"pa_version_under11p0",
189190
"pa_version_under13p0",
190191
"pa_version_under14p0",
192+
"pa_version_under14p1",
191193
"IS64",
192194
"ISMUSL",
193195
"PY310",

pandas/compat/pyarrow.py

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
pa_version_under12p0 = _palv < Version("12.0.0")
1717
pa_version_under13p0 = _palv < Version("13.0.0")
1818
pa_version_under14p0 = _palv < Version("14.0.0")
19+
pa_version_under14p1 = _palv < Version("14.0.1")
1920
except ImportError:
2021
pa_version_under7p0 = True
2122
pa_version_under8p0 = True
@@ -25,3 +26,4 @@
2526
pa_version_under12p0 = True
2627
pa_version_under13p0 = True
2728
pa_version_under14p0 = True
29+
pa_version_under14p1 = True

pandas/core/arrays/arrow/extension_types.py

+60
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
import pyarrow
77

8+
from pandas.compat import pa_version_under14p1
9+
810
from pandas.core.dtypes.dtypes import (
911
IntervalDtype,
1012
PeriodDtype,
@@ -112,3 +114,61 @@ def to_pandas_dtype(self):
112114
# register the type with a dummy instance
113115
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
114116
pyarrow.register_extension_type(_interval_type)
117+
118+
119+
_ERROR_MSG = """\
120+
Disallowed deserialization of 'arrow.py_extension_type':
121+
storage_type = {storage_type}
122+
serialized = {serialized}
123+
pickle disassembly:\n{pickle_disassembly}
124+
125+
Reading of untrusted Parquet or Feather files with a PyExtensionType column
126+
allows arbitrary code execution.
127+
If you trust this file, you can enable reading the extension type by one of:
128+
129+
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
130+
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
131+
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
132+
133+
We strongly recommend updating your Parquet/Feather files to use extension types
134+
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
135+
"""
136+
137+
138+
def patch_pyarrow() -> None:
    """
    Replace pyarrow's ``arrow.py_extension_type`` with a type that refuses to load.

    Does nothing when ``pa_version_under14p1`` is False, or when
    ``pyarrow._hotfix_installed`` is already truthy. Otherwise the extension
    type registered under the name ``"arrow.py_extension_type"`` is
    unregistered and a ``ForbiddenExtensionType`` instance is registered in
    its place, after which ``pyarrow._hotfix_installed`` is set to True so
    the patch is applied at most once.

    The replacement type's ``__arrow_ext_deserialize__`` always raises
    ``RuntimeError`` with ``_ERROR_MSG`` filled in from the storage type, the
    raw serialized bytes and a ``pickletools`` disassembly of those bytes.
    """
    # starting from pyarrow 14.0.1, it has its own mechanism
    if not pa_version_under14p1:
        return

    # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        def __arrow_ext_serialize__(self):
            return b""

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            import io
            import pickletools

            out = io.StringIO()
            try:
                # Only disassembles the payload for the error message; the
                # bytes are never unpickled here.
                pickletools.dis(serialized, out)
            except Exception as err:
                # The payload comes from the file being read and need not be
                # a well-formed pickle. A failure while disassembling must not
                # replace the explanatory RuntimeError below, so keep whatever
                # was disassembled so far and note why it stopped.
                out.write(
                    f"<pickle disassembly failed: {type(err).__name__}: {err}>\n"
                )
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    # Same flag name that the getattr check above looks for, so a second call
    # (or a later pyarrow-hotfix check of this attribute) sees the patch.
    pyarrow._hotfix_installed = True
172+
173+
174+
patch_pyarrow()

pandas/io/feather_format.py

+3
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,9 @@ def read_feather(
117117
import_optional_dependency("pyarrow")
118118
from pyarrow import feather
119119

120+
# import utils to register the pyarrow extension types
121+
import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501
122+
120123
check_dtype_backend(dtype_backend)
121124

122125
with get_handle(

0 commit comments

Comments
 (0)