Skip to content

Commit 8463c63

Browse files
author
David Cottrell
committed
Add tests and fix issue pandas-dev#10366 encoding and categoricals hdf serialization.
1 parent e581e1e commit 8463c63

File tree

3 files changed

+51
-1
lines changed

3 files changed

+51
-1
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -765,3 +765,4 @@ Bug Fixes
765765
- Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`)
766766
- Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
767767
- Bug in ``TimedeltaIndex`` formatter causing error while trying to save ``DataFrame`` with ``TimedeltaIndex`` using ``to_csv`` (:issue:`10833`)
768+
- Bug in ``Categorical`` hdf serialization in the presence of alternate encodings. (:issue:`10366`)

pandas/io/pytables.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -3049,7 +3049,8 @@ def write_metadata(self, key, values):
30493049
30503050
"""
30513051
values = Series(values)
3052-
self.parent.put(self._get_metadata_path(key), values, format='table')
3052+
self.parent.put(self._get_metadata_path(key), values, format='table',
3053+
encoding=self.encoding, nan_rep=self.nan_rep)
30533054

30543055
def read_metadata(self, key):
30553056
""" return the meta data array for this key """
@@ -4428,6 +4429,9 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
44284429
dtype = "U{0}".format(itemsize)
44294430
else:
44304431
dtype = "S{0}".format(itemsize)
4432+
# tentative fix for issue #10366: convert the data with the requested encoding before the dtype cast below
4433+
data = _convert_string_array(data, _ensure_encoding(encoding),
4434+
itemsize=itemsize)
44314435
data = data.astype(dtype, copy=False).astype(object, copy=False)
44324436
except (Exception) as e:
44334437
f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])

pandas/io/tests/test_pytables.py

+45
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,51 @@ def test_encoding(self):
930930
result = store.select('df',Term('columns=A',encoding='ascii'))
931931
tm.assert_frame_equal(result,expected)
932932

933+
def test_latin_encoding(self):
934+
935+
if compat.PY2:
936+
self.assertRaisesRegexp(TypeError, '\[unicode\] is not implemented as a table column')
937+
return
938+
939+
values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
940+
[b'E\xc9, 17', b'a', b'b', b'c'],
941+
[b'EE, 17', b'', b'a', b'b', b'c'],
942+
[b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
943+
[b'', b'a', b'b', b'c'],
944+
[b'\xf8\xfc', b'a', b'b', b'c'],
945+
[b'A\xf8\xfc', b'', b'a', b'b', b'c'],
946+
[np.nan, b'', b'b', b'c'],
947+
[b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
948+
949+
def _try_decode(x, encoding='latin-1'):
950+
try:
951+
return x.decode(encoding)
952+
except AttributeError:
953+
return x
954+
# not sure how to remove latin-1 from code in python 2 and 3
955+
values = [[_try_decode(x) for x in y] for y in values]
956+
957+
examples = []
958+
for dtype in ['category', object]:
959+
for val in values:
960+
examples.append(pandas.Series(val, dtype=dtype))
961+
962+
def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
963+
with ensure_clean_path(self.path) as store:
964+
s.to_hdf(store, key, format='table', encoding=encoding,
965+
nan_rep=nan_rep)
966+
retr = read_hdf(store, key)
967+
s_nan = s.replace(nan_rep, np.nan)
968+
assert_series_equal(s_nan, retr)
969+
970+
for s in examples:
971+
roundtrip(s)
972+
973+
# fails:
974+
# for s in examples:
975+
# roundtrip(s, nan_rep=b'\xf8\xfc')
976+
977+
933978
def test_append_some_nans(self):
934979

935980
with ensure_clean_store(self.path) as store:

0 commit comments

Comments
 (0)