Skip to content

Commit 5314a5f

Browse files
committed
Merge pull request #10686 from kawochen/BUG-FIX-10581
BUG: GH10581 where read_msgpack does not respect encoding
2 parents a0242ba + 6c3da7f commit 5314a5f

28 files changed

+2207
-1624
lines changed

doc/source/whatsnew/v0.17.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ Other enhancements
172172

173173
- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)
174174

175+
- ``msgpack`` submodule has been updated to 0.4.6 with backward compatibility (:issue:`10581`)
176+
175177
.. ipython :: python
176178

177179
s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
@@ -670,4 +672,5 @@ Bug Fixes
670672
- Bug in ``Series([np.nan]).astype('M8[ms]')``, which now returns ``Series([pd.NaT])`` (:issue:`10747`)
671673
- Bug in ``PeriodIndex.order`` reset freq (:issue:`10295`)
672674
- Bug in ``iloc`` allowing memory outside bounds of a Series to be accessed with negative integers (:issue:`10779`)
675+
- Bug in ``read_msgpack`` where the ``encoding`` argument was not respected (:issue:`10581`)
673676
- Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)

pandas/io/packers.py

+31-14
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
from pandas.core.internals import BlockManager, make_block
6161
import pandas.core.internals as internals
6262

63-
from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer
63+
from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType
6464

6565
# until we can pass this into our conversion functions,
6666
# this is pretty hacky
@@ -131,7 +131,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
131131
return Iterator(path_or_buf)
132132

133133
def read(fh):
134-
l = list(unpack(fh))
134+
l = list(unpack(fh, **kwargs))
135135
if len(l) == 1:
136136
return l[0]
137137
return l
@@ -222,7 +222,7 @@ def convert(values):
222222
# convert to a bytes array
223223
v = v.tostring()
224224
import zlib
225-
return zlib.compress(v)
225+
return ExtType(0, zlib.compress(v))
226226

227227
elif compressor == 'blosc':
228228

@@ -233,18 +233,24 @@ def convert(values):
233233
# convert to a bytes array
234234
v = v.tostring()
235235
import blosc
236-
return blosc.compress(v, typesize=dtype.itemsize)
236+
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
237237

238238
# ndarray (on original dtype)
239-
return v.tostring()
239+
return ExtType(0, v.tostring())
240240

241241

242242
def unconvert(values, dtype, compress=None):
243243

244+
as_is_ext = isinstance(values, ExtType) and values.code == 0
245+
246+
if as_is_ext:
247+
values = values.data
248+
244249
if dtype == np.object_:
245250
return np.array(values, dtype=object)
246251

247-
values = values.encode('latin1')
252+
if not as_is_ext:
253+
values = values.encode('latin1')
248254

249255
if compress == 'zlib':
250256
import zlib
@@ -558,19 +564,23 @@ def create_block(b):
558564

559565

560566
def pack(o, default=encode,
561-
encoding='latin1', unicode_errors='strict', use_single_float=False):
567+
encoding='latin1', unicode_errors='strict', use_single_float=False,
568+
autoreset=1, use_bin_type=1):
562569
"""
563570
Pack an object and return the packed bytes.
564571
"""
565572

566573
return Packer(default=default, encoding=encoding,
567574
unicode_errors=unicode_errors,
568-
use_single_float=use_single_float).pack(o)
575+
use_single_float=use_single_float,
576+
autoreset=autoreset,
577+
use_bin_type=use_bin_type).pack(o)
569578

570579

571580
def unpack(packed, object_hook=decode,
572581
list_hook=None, use_list=False, encoding='latin1',
573-
unicode_errors='strict', object_pairs_hook=None):
582+
unicode_errors='strict', object_pairs_hook=None,
583+
max_buffer_size=0, ext_hook=ExtType):
574584
"""
575585
Unpack a packed object, return an iterator
576586
Note: packed lists will be returned as tuples
@@ -580,27 +590,33 @@ def unpack(packed, object_hook=decode,
580590
list_hook=list_hook,
581591
use_list=use_list, encoding=encoding,
582592
unicode_errors=unicode_errors,
583-
object_pairs_hook=object_pairs_hook)
593+
object_pairs_hook=object_pairs_hook,
594+
max_buffer_size=max_buffer_size,
595+
ext_hook=ext_hook)
584596

585597

586598
class Packer(_Packer):
587599

588600
def __init__(self, default=encode,
589601
encoding='latin1',
590602
unicode_errors='strict',
591-
use_single_float=False):
603+
use_single_float=False,
604+
autoreset=1,
605+
use_bin_type=1):
592606
super(Packer, self).__init__(default=default,
593607
encoding=encoding,
594608
unicode_errors=unicode_errors,
595-
use_single_float=use_single_float)
609+
use_single_float=use_single_float,
610+
autoreset=autoreset,
611+
use_bin_type=use_bin_type)
596612

597613

598614
class Unpacker(_Unpacker):
599615

600616
def __init__(self, file_like=None, read_size=0, use_list=False,
601617
object_hook=decode,
602618
object_pairs_hook=None, list_hook=None, encoding='latin1',
603-
unicode_errors='strict', max_buffer_size=0):
619+
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
604620
super(Unpacker, self).__init__(file_like=file_like,
605621
read_size=read_size,
606622
use_list=use_list,
@@ -609,7 +625,8 @@ def __init__(self, file_like=None, read_size=0, use_list=False,
609625
list_hook=list_hook,
610626
encoding=encoding,
611627
unicode_errors=unicode_errors,
612-
max_buffer_size=max_buffer_size)
628+
max_buffer_size=max_buffer_size,
629+
ext_hook=ext_hook)
613630

614631

615632
class Iterator(object):

pandas/io/tests/test_packers.py

+29-3
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ def setUp(self):
5454
def tearDown(self):
5555
pass
5656

57-
def encode_decode(self, x, **kwargs):
57+
def encode_decode(self, x, compress=None, **kwargs):
5858
with ensure_clean(self.path) as p:
59-
to_msgpack(p, x, **kwargs)
59+
to_msgpack(p, x, compress=compress, **kwargs)
6060
return read_msgpack(p, **kwargs)
6161

6262
class TestAPI(TestPackers):
@@ -517,12 +517,38 @@ def test_compression_blosc(self):
517517
assert_frame_equal(self.frame[k], i_rec[k])
518518

519519

520+
class TestEncoding(TestPackers):
521+
def setUp(self):
522+
super(TestEncoding, self).setUp()
523+
data = {
524+
'A': [compat.u('\u2019')] * 1000,
525+
'B': np.arange(1000, dtype=np.int32),
526+
'C': list(100 * 'abcdefghij'),
527+
'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
528+
'E': [datetime.timedelta(days=x) for x in range(1000)],
529+
'G': [400] * 1000
530+
}
531+
self.frame = {
532+
'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])),
533+
'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])),
534+
'mixed': DataFrame(data),
535+
}
536+
self.utf_encodings = ['utf8', 'utf16', 'utf32']
537+
538+
def test_utf(self):
539+
# GH10581
540+
for encoding in self.utf_encodings:
541+
for frame in compat.itervalues(self.frame):
542+
result = self.encode_decode(frame, encoding=encoding)
543+
assert_frame_equal(result, frame)
544+
545+
520546
class TestMsgpack():
521547
"""
522548
How to add msgpack tests:
523549
524550
1. Install pandas version intended to output the msgpack.
525-
551+
TestPackers
526552
2. Execute "generate_legacy_storage_files.py" to create the msgpack.
527553
$ python generate_legacy_storage_files.py <output_dir> msgpack
528554

0 commit comments

Comments
 (0)