Skip to content

Commit 84d7275

Browse files
author
Joe Jevnik
committed
ENH: reimplement _move_into_mutable_buffer in C.
By writing our move function in C we can hide the original bytes object from the user while still ensuring that the lifetime is managed correctly. This implementation is designed to make it impossible to get access to the invalid bytes object from pure python.
1 parent 330dd76 commit 84d7275

File tree

4 files changed

+367
-77
lines changed

4 files changed

+367
-77
lines changed

pandas/io/packers.py

+53-77
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from datetime import datetime, date, timedelta
4242
from dateutil.parser import parse
4343
import os
44-
import sys
44+
from textwrap import dedent
4545
import warnings
4646

4747
import numpy as np
@@ -64,6 +64,51 @@
6464
import pandas.core.internals as internals
6565

6666
from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType
67+
from pandas.util._move import (
68+
BadMove as _BadMove,
69+
move_into_mutable_buffer as _move_into_mutable_buffer,
70+
)
71+
72+
# check whcih compression libs we have installed
73+
try:
74+
import zlib
75+
76+
def _check_zlib():
77+
pass
78+
except ImportError:
79+
def _check_zlib():
80+
raise ValueError('zlib is not installed')
81+
82+
_check_zlib.__doc__ = dedent(
83+
"""\
84+
Check if zlib is installed.
85+
86+
Raises
87+
------
88+
ValueError
89+
Raised when zlib is not installed.
90+
""",
91+
)
92+
93+
try:
94+
import blosc
95+
96+
def _check_blosc():
97+
pass
98+
except ImportError:
99+
def _check_blosc():
100+
raise ValueError('blosc is not installed')
101+
102+
_check_blosc.__doc__ = dedent(
103+
"""\
104+
Check if blosc is installed.
105+
106+
Raises
107+
------
108+
ValueError
109+
Raised when blosc is not installed.
110+
""",
111+
)
67112

68113
# until we can pass this into our conversion functions,
69114
# this is pretty hacky
@@ -221,102 +266,31 @@ def convert(values):
221266
return v.tolist()
222267

223268
if compressor == 'zlib':
269+
_check_zlib()
224270

225271
# return string arrays like they are
226272
if dtype == np.object_:
227273
return v.tolist()
228274

229275
# convert to a bytes array
230276
v = v.tostring()
231-
import zlib
232277
return ExtType(0, zlib.compress(v))
233278

234279
elif compressor == 'blosc':
280+
_check_blosc()
235281

236282
# return string arrays like they are
237283
if dtype == np.object_:
238284
return v.tolist()
239285

240286
# convert to a bytes array
241287
v = v.tostring()
242-
import blosc
243288
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
244289

245290
# ndarray (on original dtype)
246291
return ExtType(0, v.tostring())
247292

248293

249-
class _BadMove(ValueError):
250-
"""Exception used to indicate that a move was attempted on a value with
251-
more than a single reference.
252-
253-
Parameters
254-
----------
255-
data : any
256-
The data which was passed to ``_move_into_mutable_buffer``.
257-
258-
See Also
259-
--------
260-
_move_into_mutable_buffer
261-
"""
262-
def __init__(self, data):
263-
self.data = data
264-
265-
def __str__(self):
266-
return 'cannot move data from a named object'
267-
268-
269-
def _move_into_mutable_buffer(bytes_rvalue):
270-
"""Moves a bytes object that is about to be destroyed into a mutable buffer
271-
without copying the data.
272-
273-
Parameters
274-
----------
275-
bytes_rvalue : bytes with 1 refcount.
276-
The bytes object that you want to move into a mutable buffer. This
277-
cannot be a named object. It must only have a single reference.
278-
279-
Returns
280-
-------
281-
buf : memoryview
282-
A mutable buffer that was previously used as that data for
283-
``bytes_rvalue``.
284-
285-
Raises
286-
------
287-
_BadMove
288-
Raised when a move is attempted on an object with more than one
289-
reference.
290-
291-
Notes
292-
-----
293-
If you want to use this function you are probably wrong.
294-
"""
295-
if sys.getrefcount(bytes_rvalue) != 3:
296-
# The three references are:
297-
# 1. The callers stack (this is the only external reference)
298-
# 2. The locals for this function
299-
# 3. This function's stack (to pass to `sys.getrefcount`)
300-
raise _BadMove(bytes_rvalue)
301-
302-
# create a numpy array from the memory of `bytes_rvalue`
303-
arr = np.frombuffer(bytes_rvalue, dtype=np.int8)
304-
try:
305-
# mark this array as mutable
306-
arr.flags.writeable = True
307-
# At this point any mutations to `arr` will invalidate `bytes_rvalue`.
308-
# This would be fine but `np.frombuffer` is going to store this object
309-
# on `arr.base`. In order to preserve user's sanity we are going to
310-
# destroy `arr` to drop the final reference to `bytes_rvalue` and just
311-
# return a `memoryview` of the now mutable data. This dance is very
312-
# fast and makes it impossible for users to shoot themselves in the
313-
# foot.
314-
return memoryview(arr)
315-
finally:
316-
# assure that our mutable view is destroyed even if we raise
317-
del arr
318-
319-
320294
def unconvert(values, dtype, compress=None):
321295

322296
as_is_ext = isinstance(values, ExtType) and values.code == 0
@@ -334,9 +308,11 @@ def unconvert(values, dtype, compress=None):
334308

335309
if compress:
336310
if compress == u'zlib':
337-
from zlib import decompress
311+
_check_zlib()
312+
decompress = zlib.decompress
338313
elif compress == u'blosc':
339-
from blosc import decompress
314+
_check_blosc()
315+
decompress = blosc.decompress
340316
else:
341317
raise ValueError("compress must be one of 'zlib' or 'blosc'")
342318

@@ -350,7 +326,7 @@ def unconvert(values, dtype, compress=None):
350326
# We don't just store this in the locals because we want to
351327
# minimize the risk of giving users access to a `bytes` object
352328
# whose data is also given to a mutable buffer.
353-
values = e.data
329+
values = e.args[0]
354330
if len(values) > 1:
355331
# The empty string and single characters are memoized in many
356332
# string creating functions in the capi. This case should not

pandas/tests/test_util.py

+33
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import nose
33

4+
from pandas.util._move import move_into_mutable_buffer, BadMove
45
from pandas.util.decorators import deprecate_kwarg
56
from pandas.util.validators import validate_args, validate_kwargs
67

@@ -150,6 +151,38 @@ def test_validation(self):
150151
kwargs = {'f': 'foo', 'b': 'bar'}
151152
validate_kwargs('func', kwargs, *compat_args)
152153

154+
155+
class TestMove(tm.TestCase):
156+
def test_more_than_one_ref(self):
157+
"""Test case for when we try to use ``move_into_mutable_buffer`` when
158+
the object being moved has other references.
159+
"""
160+
b = b'testing'
161+
162+
with tm.assertRaises(BadMove) as e:
163+
def handle_success(type_, value, tb):
164+
self.assertIs(value.args[0], b)
165+
return type(e).handle_success(e, type_, value, tb) # super
166+
167+
e.handle_success = handle_success
168+
move_into_mutable_buffer(b)
169+
170+
def test_exactly_one_ref(self):
171+
"""Test case for when the object being moved has exactly one reference.
172+
"""
173+
b = b'testing'
174+
175+
# We need to pass an expression on the stack to ensure that there are
176+
# not extra references hanging around. We cannot rewrite this test as
177+
# buf = b[:-3]
178+
# as_stolen_buf = move_into_mutable_buffer(buf)
179+
# because then we would have more than one reference to buf.
180+
as_stolen_buf = move_into_mutable_buffer(b[:-3])
181+
182+
# materialize as bytearray to show that it is mutable
183+
self.assertEqual(bytearray(as_stolen_buf), b'test')
184+
185+
153186
if __name__ == '__main__':
154187
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
155188
exit=False)

0 commit comments

Comments
 (0)