Skip to content

Commit 7d02f3b

Browse files
author
Joe Jevnik
committed
ENH: reimplement _move_into_mutable_buffer in C.
By writing our move function in C, we can hide the original bytes object from the user while still ensuring that its lifetime is managed correctly. This implementation is designed to make it impossible to access the invalid bytes object from pure Python.
1 parent b1726c5 commit 7d02f3b

File tree

3 files changed

+286
-73
lines changed

3 files changed

+286
-73
lines changed

pandas/io/_move.c

+274
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
#include <Python.h>
2+
3+
/* non-zero when this file is compiled against the Python 2 C API */
#define COMPILING_IN_PY2 (PY_VERSION_HEX <= 0x03000000)

#if !COMPILING_IN_PY2
/* Python 3 does not provide the PyString_* spellings, so map the names
   used in this file onto their PyBytes_* counterparts */
#define PyString_CheckExact PyBytes_CheckExact
#define PyString_AS_STRING PyBytes_AS_STRING
#define PyString_GET_SIZE PyBytes_GET_SIZE
#endif /* !COMPILING_IN_PY2 */

/* When a buffer-related type flag is not defined by the Python headers,
   define it as 0 so it can be or'ed into tp_flags unconditionally. */
#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
#define Py_TPFLAGS_HAVE_NEWBUFFER 0
#endif

#ifndef Py_TPFLAGS_HAVE_GETCHARBUFFER
#define Py_TPFLAGS_HAVE_GETCHARBUFFER 0
#endif
19+
20+
PyObject *badmove; /* bad move exception class */
21+
22+
typedef struct {
23+
PyObject_HEAD
24+
/* the bytes that own the buffer we are mutating */
25+
PyObject *invalid_bytes;
26+
} stolenbufobject;
27+
28+
PyTypeObject stolenbuf_type; /* forward declare type */
29+
30+
static void
31+
stolenbuf_dealloc(stolenbufobject *self)
32+
{
33+
Py_DECREF(self->invalid_bytes);
34+
PyObject_Del(self);
35+
}
36+
37+
static int
38+
stolenbuf_getbuffer(stolenbufobject *self, Py_buffer *view, int flags)
39+
{
40+
return PyBuffer_FillInfo(view,
41+
(PyObject*) self,
42+
(void*) PyString_AS_STRING(self->invalid_bytes),
43+
PyString_GET_SIZE(self->invalid_bytes),
44+
0, /* not readonly */
45+
flags);
46+
}
47+
48+
#if COMPILING_IN_PY2
49+
50+
static Py_ssize_t
51+
stolenbuf_getreadwritebuf(stolenbufobject *self, Py_ssize_t segment, void **out)
52+
{
53+
if (segment != 0) {
54+
PyErr_SetString(PyExc_SystemError,
55+
"accessing non-existent string segment");
56+
return -1;
57+
}
58+
*out = PyString_AS_STRING(self->invalid_bytes);
59+
return PyString_GET_SIZE(self->invalid_bytes);
60+
}
61+
62+
static Py_ssize_t
63+
stolenbuf_getsegcount(stolenbufobject *self, Py_ssize_t *len)
64+
{
65+
if (len) {
66+
*len = PyString_GET_SIZE(self->invalid_bytes);
67+
}
68+
return 1;
69+
}
70+
71+
/* Python 2 buffer protocol table: the same accessor serves the read,
   write and char buffer slots, and the new-style getbuffer slot is
   provided as well. */
PyBufferProcs stolenbuf_as_buffer = {
    (readbufferproc) stolenbuf_getreadwritebuf,   /* bf_getreadbuffer */
    (writebufferproc) stolenbuf_getreadwritebuf,  /* bf_getwritebuffer */
    (segcountproc) stolenbuf_getsegcount,         /* bf_getsegcount */
    (charbufferproc) stolenbuf_getreadwritebuf,   /* bf_getcharbuffer */
    (getbufferproc) stolenbuf_getbuffer,          /* bf_getbuffer */
    NULL,                                         /* bf_releasebuffer */
};
78+
79+
#else /* Python 3 */
80+
81+
/* Python 3 buffer protocol table: only getbuffer is implemented; no
   release hook is installed. */
PyBufferProcs stolenbuf_as_buffer = {
    (getbufferproc) stolenbuf_getbuffer,  /* bf_getbuffer */
    NULL,                                 /* bf_releasebuffer */
};
85+
86+
#endif /* COMPILING_IN_PY2 */
87+
88+
/* tp_new: build a stolenbuf around a bytes object that has exactly one
   reference.

   Accepts exactly one positional argument and no keyword arguments.
   Raises TypeError when the call signature is wrong or the argument is
   not exactly a bytes object, and raises BadMove (carrying the offending
   object) when the argument's reference count is not 1. Returns a new
   reference to the stolenbuf, or NULL with an exception set. */
static PyObject *
stolenbuf_new(PyObject *self, PyObject *args, PyObject *kwargs)
{
    PyObject *candidate;
    stolenbufobject *result;

    if (kwargs != NULL && PyDict_Size(kwargs) != 0) {
        PyErr_SetString(PyExc_TypeError,
                        "stolenbuf does not accept keyword arguments");
        return NULL;
    }

    if (PyTuple_GET_SIZE(args) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "stolenbuf requires exactly 1 positional argument");
        return NULL;
    }

    /* the one and only positional argument (borrowed from the tuple) */
    candidate = PyTuple_GET_ITEM(args, 0);

    if (!PyString_CheckExact(candidate)) {
        PyErr_SetString(PyExc_TypeError,
                        "stolenbuf can only steal from bytes objects");
        return NULL;
    }

    if (Py_REFCNT(candidate) != 1) {
        /* someone other than the caller's stack still refers to it */
        PyErr_SetObject(badmove, candidate);
        return NULL;
    }

    result = PyObject_New(stolenbufobject, &stolenbuf_type);
    if (result == NULL) {
        return NULL;
    }

    /* keep the bytes object alive in a field that Python code cannot
       reach */
    Py_INCREF(candidate);
    result->invalid_bytes = candidate;

    return (PyObject*) result;
}
132+
133+
PyDoc_STRVAR(
134+
stolenbuf_doc,
135+
"Moves a bytes object that is about to be destroyed into a mutable buffer\n"
136+
"without copying the data.\n"
137+
"\n"
138+
"Parameters\n"
139+
"----------\n"
140+
"bytes_rvalue : bytes with 1 refcount.\n"
141+
" The bytes object that you want to move into a mutable buffer. This\n"
142+
" cannot be a named object. It must only have a single reference.\n"
143+
"\n"
144+
"Returns\n"
145+
"-------\n"
146+
"buf : stolenbuf\n"
147+
" An object that supports the buffer protocol which can give a mutable\n"
148+
" view of the data that was previously owned by ``bytes_rvalue``.\n"
149+
"\n"
150+
"Raises\n"
151+
"------\n"
152+
"BadMove\n"
153+
" Raised when a move is attempted on an object with more than one\n"
154+
" reference.\n"
155+
"\n"
156+
"Notes\n"
157+
"-----\n"
158+
"If you want to use this function you are probably wrong.\n");
159+
160+
PyTypeObject stolenbuf_type = {
161+
PyVarObject_HEAD_INIT(&PyType_Type, 0)
162+
"pandas.io._move.stolenbuf", /* tp_name */
163+
sizeof(stolenbufobject), /* tp_basicsize */
164+
0, /* tp_itemsize */
165+
(destructor) stolenbuf_dealloc, /* tp_dealloc */
166+
0, /* tp_print */
167+
0, /* tp_getattr */
168+
0, /* tp_setattr */
169+
0, /* tp_reserved */
170+
0, /* tp_repr */
171+
0, /* tp_as_number */
172+
0, /* tp_as_sequence */
173+
0, /* tp_as_mapping */
174+
0, /* tp_hash */
175+
0, /* tp_call */
176+
0, /* tp_str */
177+
0, /* tp_getattro */
178+
0, /* tp_setattro */
179+
&stolenbuf_as_buffer, /* tp_as_buffer */
180+
Py_TPFLAGS_DEFAULT |
181+
Py_TPFLAGS_HAVE_NEWBUFFER |
182+
Py_TPFLAGS_HAVE_GETCHARBUFFER, /* tp_flags */
183+
stolenbuf_doc, /* tp_doc */
184+
0, /* tp_traverse */
185+
0, /* tp_clear */
186+
0, /* tp_richcompare */
187+
0, /* tp_weaklistoffset */
188+
0, /* tp_iter */
189+
0, /* tp_iternext */
190+
0, /* tp_methods */
191+
0, /* tp_members */
192+
0, /* tp_getset */
193+
0, /* tp_base */
194+
0, /* tp_dict */
195+
0, /* tp_descr_get */
196+
0, /* tp_descr_set */
197+
0, /* tp_dictoffset */
198+
0, /* tp_init */
199+
0, /* tp_alloc */
200+
(newfunc) stolenbuf_new, /* tp_new */
201+
};
202+
203+
#define MODULE_NAME "pandas.io._move"
204+
205+
#if !COMPILING_IN_PY2
206+
PyModuleDef _move_module = {
207+
PyModuleDef_HEAD_INIT,
208+
MODULE_NAME,
209+
NULL,
210+
-1,
211+
};
212+
#endif /* !COMPILING_IN_PY2 */
213+
214+
PyDoc_STRVAR(
215+
badmove_doc,
216+
"Exception used to indicate that a move was attempted on a value with\n"
217+
"more than a single reference.\n"
218+
"\n"
219+
"Parameters\n"
220+
"----------\n"
221+
"data : any\n"
222+
" The data which was passed to ``_move_into_mutable_buffer``.\n"
223+
"\n"
224+
"See Also\n"
225+
"--------\n"
226+
"pandas.io._move.stolenbuf\n");
227+
228+
PyMODINIT_FUNC
229+
#if !COMPILING_IN_PY2
230+
#define ERROR_RETURN NULL
231+
PyInit__move(void)
232+
#else
233+
#define ERROR_RETURN
234+
init_move(void)
235+
#endif /* !COMPILING_IN_PY2 */
236+
{
237+
PyObject *m;
238+
239+
if (!(badmove = PyErr_NewExceptionWithDoc("pandas.io._move.BadMove",
240+
badmove_doc,
241+
NULL,
242+
NULL))) {
243+
return ERROR_RETURN;
244+
}
245+
246+
if (PyType_Ready(&stolenbuf_type)) {
247+
return ERROR_RETURN;
248+
}
249+
250+
#if !COMPILING_IN_PY2
251+
if (!(m = PyModule_Create(&_move_module)))
252+
#else
253+
if (!(m = Py_InitModule(MODULE_NAME, NULL)))
254+
#endif /* !COMPILING_IN_PY2 */
255+
{
256+
return ERROR_RETURN;
257+
}
258+
259+
if (PyModule_AddObject(m,
260+
"move_into_mutable_buffer",
261+
(PyObject*) &stolenbuf_type)) {
262+
Py_DECREF(m);
263+
return ERROR_RETURN;
264+
}
265+
266+
if (PyModule_AddObject(m, "BadMove", badmove)) {
267+
Py_DECREF(m);
268+
return ERROR_RETURN;
269+
}
270+
271+
#if !COMPILING_IN_PY2
272+
return m;
273+
#endif /* !COMPILING_IN_PY2 */
274+
}

pandas/io/packers.py

+5-73
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
from datetime import datetime, date, timedelta
4242
from dateutil.parser import parse
4343
import os
44-
import sys
4544
import warnings
4645

4746
import numpy as np
@@ -64,6 +63,10 @@
6463
import pandas.core.internals as internals
6564

6665
from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType
66+
from ._move import (
67+
BadMove as _BadMove,
68+
move_into_mutable_buffer as _move_into_mutable_buffer,
69+
)
6770

6871
# until we can pass this into our conversion functions,
6972
# this is pretty hacky
@@ -246,77 +249,6 @@ def convert(values):
246249
return ExtType(0, v.tostring())
247250

248251

249-
class _BadMove(ValueError):
250-
"""Exception used to indicate that a move was attempted on a value with
251-
more than a single reference.
252-
253-
Parameters
254-
----------
255-
data : any
256-
The data which was passed to ``_move_into_mutable_buffer``.
257-
258-
See Also
259-
--------
260-
_move_into_mutable_buffer
261-
"""
262-
def __init__(self, data):
263-
self.data = data
264-
265-
def __str__(self):
266-
return 'cannot move data from a named object'
267-
268-
269-
def _move_into_mutable_buffer(bytes_rvalue):
270-
"""Moves a bytes object that is about to be destroyed into a mutable buffer
271-
without copying the data.
272-
273-
Parameters
274-
----------
275-
bytes_rvalue : bytes with 1 refcount.
276-
The bytes object that you want to move into a mutable buffer. This
277-
cannot be a named object. It must only have a single reference.
278-
279-
Returns
280-
-------
281-
buf : memoryview
282-
A mutable buffer that was previously used as that data for
283-
``bytes_rvalue``.
284-
285-
Raises
286-
------
287-
_BadMove
288-
Raised when a move is attempted on an object with more than one
289-
reference.
290-
291-
Notes
292-
-----
293-
If you want to use this function you are probably wrong.
294-
"""
295-
if sys.getrefcount(bytes_rvalue) != 3:
296-
# The three references are:
297-
# 1. The callers stack (this is the only external reference)
298-
# 2. The locals for this function
299-
# 3. This function's stack (to pass to `sys.getrefcount`)
300-
raise _BadMove(bytes_rvalue)
301-
302-
# create a numpy array from the memory of `bytes_rvalue`
303-
arr = np.frombuffer(bytes_rvalue, dtype=np.int8)
304-
try:
305-
# mark this array as mutable
306-
arr.flags.writeable = True
307-
# At this point any mutations to `arr` will invalidate `bytes_rvalue`.
308-
# This would be fine but `np.frombuffer` is going to store this object
309-
# on `arr.base`. In order to preserve user's sanity we are going to
310-
# destroy `arr` to drop the final reference to `bytes_rvalue` and just
311-
# return a `memoryview` of the now mutable data. This dance is very
312-
# fast and makes it impossible for users to shoot themselves in the
313-
# foot.
314-
return memoryview(arr)
315-
finally:
316-
# assure that our mutable view is destroyed even if we raise
317-
del arr
318-
319-
320252
def unconvert(values, dtype, compress=None):
321253

322254
as_is_ext = isinstance(values, ExtType) and values.code == 0
@@ -350,7 +282,7 @@ def unconvert(values, dtype, compress=None):
350282
# We don't just store this in the locals because we want to
351283
# minimize the risk of giving users access to a `bytes` object
352284
# whose data is also given to a mutable buffer.
353-
values = e.data
285+
values = e.args[0]
354286
if len(values) > 1:
355287
# The empty string and single characters are memoized in many
356288
# string creating functions in the capi. This case should not

setup.py

+7
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,13 @@ def pxd(name):
510510

511511
extensions.append(ujson_ext)
512512

513+
# extension for pseudo-safely moving bytes into mutable buffers
514+
_move_ext = Extension('pandas.io._move',
515+
depends=[],
516+
sources=['pandas/io/_move.c'])
517+
extensions.append(_move_ext)
518+
519+
513520

514521
if _have_setuptools:
515522
setuptools_kwargs["test_suite"] = "nose.collector"

0 commit comments

Comments
 (0)