Skip to content

Commit 330dd76

Browse files
author
Joe Jevnik
committed
ENH: updates unconvert no-copy code to be safer
Pulls the logic that does the buffer management into a new private function named `_move_into_mutable_buffer`. The function acts like a move constructor from 'bytes' to `memoryview`. We now only call `np.frombuffer` on the resulting `memoryview` object. This accomplishes the original task of getting a mutable array from a `bytes` object without copying with the added safety of making the `base` field of the array that is returned to the user a mutable `memoryview`. This eliminates the risk of users obtaining the original `bytes` object which is now pointing to a mutable buffer and violating the immutability contract of `bytes` objects.
1 parent 830abba commit 330dd76

File tree

1 file changed

+92
-23
lines changed

1 file changed

+92
-23
lines changed

pandas/io/packers.py

+92-23
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,77 @@ def convert(values):
246246
return ExtType(0, v.tostring())
247247

248248

249+
class _BadMove(ValueError):
250+
"""Exception used to indicate that a move was attempted on a value with
251+
more than a single reference.
252+
253+
Parameters
254+
----------
255+
data : any
256+
The data which was passed to ``_move_into_mutable_buffer``.
257+
258+
See Also
259+
--------
260+
_move_into_mutable_buffer
261+
"""
262+
def __init__(self, data):
263+
self.data = data
264+
265+
def __str__(self):
266+
return 'cannot move data from a named object'
267+
268+
269+
def _move_into_mutable_buffer(bytes_rvalue):
270+
"""Moves a bytes object that is about to be destroyed into a mutable buffer
271+
without copying the data.
272+
273+
Parameters
274+
----------
275+
bytes_rvalue : bytes with 1 refcount.
276+
The bytes object that you want to move into a mutable buffer. This
277+
cannot be a named object. It must only have a single reference.
278+
279+
Returns
280+
-------
281+
buf : memoryview
282+
A mutable buffer that was previously used as that data for
283+
``bytes_rvalue``.
284+
285+
Raises
286+
------
287+
_BadMove
288+
Raised when a move is attempted on an object with more than one
289+
reference.
290+
291+
Notes
292+
-----
293+
If you want to use this function you are probably wrong.
294+
"""
295+
if sys.getrefcount(bytes_rvalue) != 3:
296+
# The three references are:
297+
# 1. The callers stack (this is the only external reference)
298+
# 2. The locals for this function
299+
# 3. This function's stack (to pass to `sys.getrefcount`)
300+
raise _BadMove(bytes_rvalue)
301+
302+
# create a numpy array from the memory of `bytes_rvalue`
303+
arr = np.frombuffer(bytes_rvalue, dtype=np.int8)
304+
try:
305+
# mark this array as mutable
306+
arr.flags.writeable = True
307+
# At this point any mutations to `arr` will invalidate `bytes_rvalue`.
308+
# This would be fine but `np.frombuffer` is going to store this object
309+
# on `arr.base`. In order to preserve user's sanity we are going to
310+
# destroy `arr` to drop the final reference to `bytes_rvalue` and just
311+
# return a `memoryview` of the now mutable data. This dance is very
312+
# fast and makes it impossible for users to shoot themselves in the
313+
# foot.
314+
return memoryview(arr)
315+
finally:
316+
# assure that our mutable view is destroyed even if we raise
317+
del arr
318+
319+
249320
def unconvert(values, dtype, compress=None):
250321

251322
as_is_ext = isinstance(values, ExtType) and values.code == 0
@@ -269,30 +340,28 @@ def unconvert(values, dtype, compress=None):
269340
else:
270341
raise ValueError("compress must be one of 'zlib' or 'blosc'")
271342

272-
values = decompress(values)
273-
if sys.getrefcount(values) == 2:
274-
arr = np.frombuffer(values, dtype=dtype)
275-
# We are setting the memory owned by a bytes object as mutable.
276-
# We can do this because we know that no one has a reference to
277-
# this object since it was just created in the call to
278-
# decompress and we have checked that we have the only
279-
# reference. the refcnt reports as 2 instead of 1 because we
280-
# incref the values object when we push it on the stack to call
281-
# getrefcnt. The 2 references are then the local variable
282-
# `values` and TOS.
283-
arr.flags.writeable = True
284-
return arr
285-
elif len(values) > 1:
286-
# The empty string and single characters are memoized in many
287-
# string creating functions in the capi. This case should not warn
288-
# even though we need to make a copy because we are only copying at
289-
# most 1 byte.
290-
warnings.warn(
291-
'copying data after decompressing; this may mean that'
292-
' decompress is caching its result',
293-
PerformanceWarning,
343+
try:
344+
return np.frombuffer(
345+
_move_into_mutable_buffer(decompress(values)),
346+
dtype=dtype,
294347
)
295-
# fall through to copying `np.fromstring`
348+
except _BadMove as e:
349+
# Pull the decompressed data off of the `_BadMove` exception.
350+
# We don't just store this in the locals because we want to
351+
# minimize the risk of giving users access to a `bytes` object
352+
# whose data is also given to a mutable buffer.
353+
values = e.data
354+
if len(values) > 1:
355+
# The empty string and single characters are memoized in many
356+
# string creating functions in the capi. This case should not
357+
# warn even though we need to make a copy because we are only
358+
# copying at most 1 byte.
359+
warnings.warn(
360+
'copying data after decompressing; this may mean that'
361+
' decompress is caching its result',
362+
PerformanceWarning,
363+
)
364+
# fall through to copying `np.fromstring`
296365

297366
# Copy the string into a numpy array.
298367
return np.fromstring(values, dtype=dtype)

0 commit comments

Comments
 (0)