diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 6b4bde588469e..f128cec97c1fe 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -765,3 +765,4 @@ Bug Fixes - Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`) - Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`) - Bug in ``TimedeltaIndex`` formatter causing error while trying to save ``DataFrame`` with ``TimedeltaIndex`` using ``to_csv`` (:issue:`10833`) +- Bug in ``Categorical`` hdf serialization in presence of alternate encodings. (:issue:`10366`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8ef6363f836ae..db5856ac3fa60 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3049,7 +3049,8 @@ def write_metadata(self, key, values): """ values = Series(values) - self.parent.put(self._get_metadata_path(key), values, format='table') + self.parent.put(self._get_metadata_path(key), values, format='table', + encoding=self.encoding, nan_rep=self.nan_rep) def read_metadata(self, key): """ return the meta data array for this key """ @@ -4428,6 +4429,9 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): dtype = "U{0}".format(itemsize) else: dtype = "S{0}".format(itemsize) + # fix? 
issue #10366 + data = _convert_string_array(data, _ensure_encoding(encoding), + itemsize=itemsize) data = data.astype(dtype, copy=False).astype(object, copy=False) except (Exception) as e: f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 210852d83094f..022546192f297 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -930,6 +930,51 @@ def test_encoding(self): result = store.select('df',Term('columns=A',encoding='ascii')) tm.assert_frame_equal(result,expected) + def test_latin_encoding(self): + + if compat.PY2: + self.assertRaisesRegexp(TypeError, '\[unicode\] is not implemented as a table column') + return + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + # not sure how to remove latin-1 from code in python 2 and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(pandas.Series(val, dtype=dtype)) + + def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): + with ensure_clean_path(self.path) as store: + s.to_hdf(store, key, format='table', encoding=encoding, + nan_rep=nan_rep) + retr = read_hdf(store, key) + s_nan = s.replace(nan_rep, np.nan) + assert_series_equal(s_nan, retr) + + for s in examples: + roundtrip(s) + + # fails: + # for x in examples: + # roundtrip(s, nan_rep=b'\xf8\xfc') + + def test_append_some_nans(self): with ensure_clean_store(self.path) as store: