Skip to content

Commit 830abba

Browse files
author
Joe Jevnik
committed
TST: adds test for memoized empty string and chars
In many capi functions for python the empty string and single characters are memoized to return preallocated bytes objects. This could be a potential problem with the implementation of pandas.io.packers.unconvert so we are adding a test for this explicitly. Currently neither zlib nor blosc hit this case because they use PyBytes_FromStringAndSize(NULL, n) which does not pull from the shared pool of bytes objects; however, this test will guard us against changes to this implementation detail in the future.
1 parent 9cd9d80 commit 830abba

File tree

2 files changed

+49
-8
lines changed

2 files changed

+49
-8
lines changed

pandas/io/packers.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -274,15 +274,19 @@ def unconvert(values, dtype, compress=None):
274274
arr = np.frombuffer(values, dtype=dtype)
275275
# We are setting the memory owned by a bytes object as mutable.
276276
# We can do this because we know that no one has a reference to
277-
# this object since it was just created in the call to decompress
278-
# and we have checked that we have the only reference.
279-
# the refcnt reports as 2 instead of 1 because we incref the
280-
# values object when we push it on the stack to call getrefcnt.
281-
# The 2 references are then the local variable `values` and
282-
# TOS.
277+
# this object since it was just created in the call to
278+
# decompress and we have checked that we have the only
279+
# reference. the refcnt reports as 2 instead of 1 because we
280+
# incref the values object when we push it on the stack to call
281+
# getrefcnt. The 2 references are then the local variable
282+
# `values` and TOS.
283283
arr.flags.writeable = True
284284
return arr
285-
else:
285+
elif len(values) > 1:
286+
# The empty string and single characters are memoized in many
287+
# string creating functions in the capi. This case should not warn
288+
# even though we need to make a copy because we are only copying at
289+
# most 1 byte.
286290
warnings.warn(
287291
'copying data after decompressing; this may mean that'
288292
' decompress is caching its result',

pandas/io/tests/test_packers.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -635,10 +635,47 @@ def test_compression_warns_when_decompress_caches_zlib(self):
635635
self._test_compression_warns_when_decompress_caches('zlib')
636636

637637
def test_compression_warns_when_decompress_caches_blosc(self):
638-
if not _ZLIB_INSTALLED:
638+
if not _BLOSC_INSTALLED:
639639
raise nose.SkipTest('no blosc')
640640
self._test_compression_warns_when_decompress_caches('blosc')
641641

642+
def _test_small_strings_no_warn(self, compress):
643+
empty = np.array([], dtype='uint8')
644+
with tm.assert_produces_warning(None):
645+
empty_unpacked = self.encode_decode(empty, compress=compress)
646+
647+
np.testing.assert_array_equal(empty_unpacked, empty)
648+
self.assertTrue(empty_unpacked.flags.writeable)
649+
650+
char = np.array([ord(b'a')], dtype='uint8')
651+
with tm.assert_produces_warning(None):
652+
char_unpacked = self.encode_decode(char, compress=compress)
653+
654+
np.testing.assert_array_equal(char_unpacked, char)
655+
self.assertTrue(char_unpacked.flags.writeable)
656+
# if this test fails I am sorry because the interpreter is now in a
657+
# bad state where b'a' points to 98 == ord(b'b').
658+
char_unpacked[0] = ord(b'b')
659+
660+
# we compare the ord of bytes b'a' with unicode u'a' because the should
661+
# always be the same (unless we were able to mutate the shared
662+
# character singleton in which case ord(b'a') == ord(b'b').
663+
self.assertEqual(ord(b'a'), ord(u'a'))
664+
np.testing.assert_array_equal(
665+
char_unpacked,
666+
np.array([ord(b'b')], dtype='uint8'),
667+
)
668+
669+
def test_small_strings_no_warn_zlib(self):
670+
if not _ZLIB_INSTALLED:
671+
raise nose.SkipTest('no zlib')
672+
self._test_small_strings_no_warn('zlib')
673+
674+
def test_small_strings_no_warn_blosc(self):
675+
if not _BLOSC_INSTALLED:
676+
raise nose.SkipTest('no blosc')
677+
self._test_small_strings_no_warn('blosc')
678+
642679
def test_readonly_axis_blosc(self):
643680
# GH11880
644681
if not _BLOSC_INSTALLED:

0 commit comments

Comments
 (0)