BUG: Fix a bug occuring when using DataFrame.to_records with unicode

AlexisMignon · jreback · commit 25dcff597162 · 2017-02-27T14:43:58.000-05:00
column names in python 2. closes #11879 closes #13462
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -615,7 +615,8 @@ Bug Fixes
 - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`)
 - Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`)
 
-- Bug in ``pd.read_msgpack`` when deserializing a ``CategoricalIndex`` (:issue:`15487`)
+- Bug in ``pd.read_msgpack()`` when deserializing a ``CategoricalIndex`` (:issue:`15487`)
+- Bug in ``pd.DataFrame.to_records()`` which failed with unicode characters in column names (:issue:`11879`)
 
 
 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1105,13 +1105,17 @@ def to_records(self, index=True, convert_datetime64=True):
                         count += 1
             elif index_names[0] is None:
                 index_names = ['index']
-            names = lmap(str, index_names) + lmap(str, self.columns)
+            names = (lmap(compat.text_type, index_names) +
+                     lmap(compat.text_type, self.columns))
         else:
             arrays = [self[c].get_values() for c in self.columns]
-            names = lmap(str, self.columns)
+            names = lmap(compat.text_type, self.columns)
 
-        dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
-        return np.rec.fromarrays(arrays, dtype=dtype, names=names)
+        formats = [v.dtype for v in arrays]
+        return np.rec.fromarrays(
+            arrays,
+            dtype={'names': names, 'formats': formats}
+        )
 
     @classmethod
     def from_items(cls, items, columns=None, orient='columns'):
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -177,3 +177,18 @@ def test_to_records_with_unicode_index(self):
             .to_records()
         expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
         tm.assert_almost_equal(result, expected)
+
+    def test_to_records_with_unicode_column_names(self):
+        # xref issue: https://github.com/numpy/numpy/issues/2407
+        # Issue #11879. to_records used to raise an exception when used
+        # with column names containing non ascii caracters in Python 2
+        result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
+
+        # Note that numpy allows for unicode field names but dtypes need
+        # to be specified using dictionnary intsead of list of tuples.
+        expected = np.rec.array(
+            [(0, 1.0)],
+            dtype={"names": ["index", u"accented_name_é"],
+                   "formats": ['<i8', '<f8']}
+        )
+        tm.assert_almost_equal(result, expected)