From b268bb0d0d1982606f8a77413d526721b53c0ff2 Mon Sep 17 00:00:00 2001 From: David Cottrell Date: Wed, 24 Jun 2015 21:01:12 +0100 Subject: [PATCH 1/2] Add tests and fix issue #10366 encoding and categoricals hdf serialization. --- doc/source/whatsnew/v0.17.0.txt | 2 +- pandas/io/pytables.py | 6 ++++- pandas/io/tests/test_pytables.py | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9049d8de550d0..c18bedd0cf6eb 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -818,7 +818,7 @@ Bug Fixes - Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if file contains only a header line (:issue:`9535`) - +- Bug in serialization of ``category`` types in HDF5 in presence of alternate encodings. (:issue:`10366`) - Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`) - Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead the original dtype (:issue:`9431`) - Bug in ``DatetimeIndex.take`` and ``TimedeltaIndex.take`` may not raise ``IndexError`` against invalid index (:issue:`10295`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dd02157e201d5..cc79ba59c9e61 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3039,7 +3039,8 @@ def write_metadata(self, key, values): """ values = Series(values) - self.parent.put(self._get_metadata_path(key), values, format='table') + self.parent.put(self._get_metadata_path(key), values, format='table', + encoding=self.encoding, nan_rep=self.nan_rep) def read_metadata(self, key): """ return the meta data array for this key """ @@ -4418,6 +4419,9 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): dtype = "U{0}".format(itemsize) else: dtype = "S{0}".format(itemsize) + # fix? issue #10366 + data = _convert_string_array(data, _ensure_encoding(encoding), + itemsize=itemsize) data = data.astype(dtype, copy=False).astype(object, copy=False) except (Exception) as e: f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3a128fa3f247d..b4f1e6a429198 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -930,6 +930,51 @@ def test_encoding(self): result = store.select('df',Term('columns=A',encoding='ascii')) tm.assert_frame_equal(result,expected) + def test_latin_encoding(self): + + if compat.PY2: + self.assertRaisesRegexp(TypeError, '\[unicode\] is not implemented as a table column') + return + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + # not sure how to remove latin-1 from code in python 2 and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(pandas.Series(val, dtype=dtype)) + + def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): + with ensure_clean_path(self.path) as store: + s.to_hdf(store, key, format='table', encoding=encoding, + nan_rep=nan_rep) + retr = read_hdf(store, key) + s_nan = s.replace(nan_rep, np.nan) + assert_series_equal(s_nan, retr) + + for s in examples: + roundtrip(s) + + # fails: + # for x in examples: + # roundtrip(s, nan_rep=b'\xf8\xfc') + + def test_append_some_nans(self): with ensure_clean_store(self.path) as store: From 8609f6caab1847d4767fc465b555572ba5f8c0d1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 22 Aug 2015 15:56:59 -0400 Subject: [PATCH 2/2] clean up string conversions --- pandas/io/pytables.py | 52 ++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index cc79ba59c9e61..ea0a59ce2ab31 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4390,11 +4390,23 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): def _convert_string_array(data, encoding, itemsize=None): + """ + we take a string-like that is object dtype and coerce to a fixed size string type + + Parameters + ---------- + data : a numpy array of object dtype + encoding : None or string-encoding + itemsize : integer, optional, defaults to the max length of the strings + + Returns + ------- + data in a fixed-length string dtype, encoded to bytes if needed + """ # encode if needed if encoding is not None and len(data): - f = np.vectorize(lambda x: x.encode(encoding), otypes=[np.object]) - data = f(data) + data = Series(data.ravel()).str.encode(encoding).values.reshape(data.shape) # create the sized dtype if itemsize is None: @@ -4404,7 +4416,20 @@ def _convert_string_array(data, encoding, itemsize=None): return data def _unconvert_string_array(data, nan_rep=None, encoding=None): - """ deserialize a string array, possibly decoding """ + """ + inverse of _convert_string_array + + Parameters + ---------- + data : fixed length string dtyped array + nan_rep : the storage repr of NaN, optional + encoding : the encoding of the data, optional + + Returns + ------- + an object array of the decoded data + + """ shape = data.shape data = np.asarray(data.ravel(), dtype=object) @@ -4413,19 +4438,16 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - try: - itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) - if compat.PY3: - dtype = "U{0}".format(itemsize) - else: - dtype = "S{0}".format(itemsize) - # fix? issue #10366 - data = _convert_string_array(data, _ensure_encoding(encoding), - itemsize=itemsize) + itemsize = lib.max_len_string_array(com._ensure_object(data)) + if compat.PY3: + dtype = "U{0}".format(itemsize) + else: + dtype = "S{0}".format(itemsize) + + if isinstance(data[0], compat.binary_type): + data = Series(data).str.decode(encoding).values + else: data = data.astype(dtype, copy=False).astype(object, copy=False) - except (Exception) as e: - f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) - data = f(data) if nan_rep is None: nan_rep = 'nan'