Skip to content

Commit f44c490

Browse files
committed
Merge pull request #10889 from jreback/cottrell-categ_hdf
BUG: encoding of categoricals in hdf serialization
2 parents c3a4de3 + 8609f6c commit f44c490

File tree

3 files changed

+85
-14
lines changed

3 files changed

+85
-14
lines changed

doc/source/whatsnew/v0.17.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ Bug Fixes
818818

819819

820820
- Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if file contains only a header line (:issue:`9535`)
821-
821+
- Bug in serialization of ``category`` types in HDF5 in the presence of alternate encodings. (:issue:`10366`)
822822
- Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
823823
- Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead of the original dtype (:issue:`9431`)
824824
- Bug in ``DatetimeIndex.take`` and ``TimedeltaIndex.take`` where ``IndexError`` may not be raised for an invalid index (:issue:`10295`)

pandas/io/pytables.py

+39-13
Original file line numberDiff line numberDiff line change
@@ -3039,7 +3039,8 @@ def write_metadata(self, key, values):
30393039
30403040
"""
30413041
values = Series(values)
3042-
self.parent.put(self._get_metadata_path(key), values, format='table')
3042+
self.parent.put(self._get_metadata_path(key), values, format='table',
3043+
encoding=self.encoding, nan_rep=self.nan_rep)
30433044

30443045
def read_metadata(self, key):
30453046
""" return the meta data array for this key """
@@ -4389,11 +4390,23 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
43894390

43904391

43914392
def _convert_string_array(data, encoding, itemsize=None):
4393+
"""
4394+
we take a string-like that is object dtype and coerce to a fixed size string type
4395+
4396+
Parameters
4397+
----------
4398+
data : a numpy array of object dtype
4399+
encoding : None or string-encoding
4400+
itemsize : integer, optional, defaults to the max length of the strings
4401+
4402+
Returns
4403+
-------
4404+
data in a fixed-length string dtype, encoded to bytes if needed
4405+
"""
43924406

43934407
# encode if needed
43944408
if encoding is not None and len(data):
4395-
f = np.vectorize(lambda x: x.encode(encoding), otypes=[np.object])
4396-
data = f(data)
4409+
data = Series(data.ravel()).str.encode(encoding).values.reshape(data.shape)
43974410

43984411
# create the sized dtype
43994412
if itemsize is None:
@@ -4403,7 +4416,20 @@ def _convert_string_array(data, encoding, itemsize=None):
44034416
return data
44044417

44054418
def _unconvert_string_array(data, nan_rep=None, encoding=None):
4406-
""" deserialize a string array, possibly decoding """
4419+
"""
4420+
inverse of _convert_string_array
4421+
4422+
Parameters
4423+
----------
4424+
data : fixed length string dtyped array
4425+
nan_rep : the storage repr of NaN, optional
4426+
encoding : the encoding of the data, optional
4427+
4428+
Returns
4429+
-------
4430+
an object array of the decoded data
4431+
4432+
"""
44074433
shape = data.shape
44084434
data = np.asarray(data.ravel(), dtype=object)
44094435

@@ -4412,16 +4438,16 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
44124438
encoding = _ensure_encoding(encoding)
44134439
if encoding is not None and len(data):
44144440

4415-
try:
4416-
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
4417-
if compat.PY3:
4418-
dtype = "U{0}".format(itemsize)
4419-
else:
4420-
dtype = "S{0}".format(itemsize)
4441+
itemsize = lib.max_len_string_array(com._ensure_object(data))
4442+
if compat.PY3:
4443+
dtype = "U{0}".format(itemsize)
4444+
else:
4445+
dtype = "S{0}".format(itemsize)
4446+
4447+
if isinstance(data[0], compat.binary_type):
4448+
data = Series(data).str.decode(encoding).values
4449+
else:
44214450
data = data.astype(dtype, copy=False).astype(object, copy=False)
4422-
except (Exception) as e:
4423-
f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
4424-
data = f(data)
44254451

44264452
if nan_rep is None:
44274453
nan_rep = 'nan'

pandas/io/tests/test_pytables.py

+45
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,51 @@ def test_encoding(self):
930930
result = store.select('df',Term('columns=A',encoding='ascii'))
931931
tm.assert_frame_equal(result,expected)
932932

933+
def test_latin_encoding(self):
934+
935+
if compat.PY2:
936+
self.assertRaisesRegexp(TypeError, '\[unicode\] is not implemented as a table column')
937+
return
938+
939+
values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
940+
[b'E\xc9, 17', b'a', b'b', b'c'],
941+
[b'EE, 17', b'', b'a', b'b', b'c'],
942+
[b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
943+
[b'', b'a', b'b', b'c'],
944+
[b'\xf8\xfc', b'a', b'b', b'c'],
945+
[b'A\xf8\xfc', b'', b'a', b'b', b'c'],
946+
[np.nan, b'', b'b', b'c'],
947+
[b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
948+
949+
def _try_decode(x, encoding='latin-1'):
950+
try:
951+
return x.decode(encoding)
952+
except AttributeError:
953+
return x
954+
# not sure how to remove latin-1 from code in python 2 and 3
955+
values = [[_try_decode(x) for x in y] for y in values]
956+
957+
examples = []
958+
for dtype in ['category', object]:
959+
for val in values:
960+
examples.append(pandas.Series(val, dtype=dtype))
961+
962+
def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
963+
with ensure_clean_path(self.path) as store:
964+
s.to_hdf(store, key, format='table', encoding=encoding,
965+
nan_rep=nan_rep)
966+
retr = read_hdf(store, key)
967+
s_nan = s.replace(nan_rep, np.nan)
968+
assert_series_equal(s_nan, retr)
969+
970+
for s in examples:
971+
roundtrip(s)
972+
973+
# fails:
974+
# for s in examples:
975+
# roundtrip(s, nan_rep=b'\xf8\xfc')
976+
977+
933978
def test_append_some_nans(self):
934979

935980
with ensure_clean_store(self.path) as store:

0 commit comments

Comments
 (0)