Merge pull request #10889 from jreback/cottrell-categ_hdf

jreback · jreback · commit f44c4902b2c8 · 2015-08-27T22:29:03.000-04:00
BUG: encoding of categoricals in hdf serialization
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -818,7 +818,7 @@ Bug Fixes
 
 
 - Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if file contains only a header line (:issue:`9535`)
-
+- Bug in serialization of ``category`` types in HDF5 in the presence of alternate encodings (:issue:`10366`)
 - Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
 - Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead the original dtype (:issue:`9431`)
 - Bug in ``DatetimeIndex.take`` and ``TimedeltaIndex.take`` may not raise ``IndexError`` against invalid index (:issue:`10295`)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -3039,7 +3039,8 @@ def write_metadata(self, key, values):
 
         """
         values = Series(values)
-        self.parent.put(self._get_metadata_path(key), values, format='table')
+        self.parent.put(self._get_metadata_path(key), values, format='table',
+                encoding=self.encoding, nan_rep=self.nan_rep)
 
     def read_metadata(self, key):
         """ return the meta data array for this key """
@@ -4389,11 +4390,23 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
 
 
 def _convert_string_array(data, encoding, itemsize=None):
+    """
+    Take a string-like array of object dtype and coerce it to a fixed-size string type.
+
+    Parameters
+    ----------
+    data : a numpy array of object dtype
+    encoding : None or string-encoding
+    itemsize : integer, optional, defaults to the max length of the strings
+
+    Returns
+    -------
+    data in a fixed-length string dtype, encoded to bytes if needed
+    """
 
     # encode if needed
     if encoding is not None and len(data):
-        f = np.vectorize(lambda x: x.encode(encoding), otypes=[np.object])
-        data = f(data)
+        data = Series(data.ravel()).str.encode(encoding).values.reshape(data.shape)
 
     # create the sized dtype
     if itemsize is None:
@@ -4403,7 +4416,20 @@ def _convert_string_array(data, encoding, itemsize=None):
     return data
 
 def _unconvert_string_array(data, nan_rep=None, encoding=None):
-    """ deserialize a string array, possibly decoding """
+    """
+    inverse of _convert_string_array
+
+    Parameters
+    ----------
+    data : fixed length string dtyped array
+    nan_rep : the storage repr of NaN, optional
+    encoding : the encoding of the data, optional
+
+    Returns
+    -------
+    an object array of the decoded data
+
+    """
     shape = data.shape
     data = np.asarray(data.ravel(), dtype=object)
 
@@ -4412,16 +4438,16 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
     encoding = _ensure_encoding(encoding)
     if encoding is not None and len(data):
 
-        try:
-            itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
-            if compat.PY3:
-                dtype = "U{0}".format(itemsize)
-            else:
-                dtype = "S{0}".format(itemsize)
+        itemsize = lib.max_len_string_array(com._ensure_object(data))
+        if compat.PY3:
+            dtype = "U{0}".format(itemsize)
+        else:
+            dtype = "S{0}".format(itemsize)
+
+        if isinstance(data[0], compat.binary_type):
+            data = Series(data).str.decode(encoding).values
+        else:
             data = data.astype(dtype, copy=False).astype(object, copy=False)
-        except (Exception) as e:
-            f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
-            data = f(data)
 
     if nan_rep is None:
         nan_rep = 'nan'
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -930,6 +930,51 @@ def test_encoding(self):
             result = store.select('df',Term('columns=A',encoding='ascii'))
             tm.assert_frame_equal(result,expected)
 
+    def test_latin_encoding(self):
+
+        if compat.PY2:
+            self.assertRaisesRegexp(TypeError, '\[unicode\] is not implemented as a table column')
+            return
+
+        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
+                  [b'E\xc9, 17', b'a', b'b', b'c'],
+                  [b'EE, 17', b'', b'a', b'b', b'c'],
+                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
+                  [b'', b'a', b'b', b'c'],
+                  [b'\xf8\xfc', b'a', b'b', b'c'],
+                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
+                  [np.nan, b'', b'b', b'c'],
+                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
+
+        def _try_decode(x, encoding='latin-1'):
+            try:
+                return x.decode(encoding)
+            except AttributeError:
+                return x
+        # not sure how to remove latin-1 from code in python 2 and 3
+        values = [[_try_decode(x) for x in y] for y in values]
+
+        examples = []
+        for dtype in ['category', object]:
+            for val in values:
+                examples.append(pandas.Series(val, dtype=dtype))
+
+        def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
+            with ensure_clean_path(self.path) as store:
+                s.to_hdf(store, key, format='table', encoding=encoding,
+                        nan_rep=nan_rep)
+                retr = read_hdf(store, key)
+                s_nan = s.replace(nan_rep, np.nan)
+                assert_series_equal(s_nan, retr)
+
+        for s in examples:
+            roundtrip(s)
+
+        # fails:
+        # for x in examples:
+        #     roundtrip(x, nan_rep=b'\xf8\xfc')
+
+
     def test_append_some_nans(self):
 
         with ensure_clean_store(self.path) as store: