BUG: avoid "b" prefix for bytes in to_csv() on Python 3 (pandas-dev#9712)

jzwinck · jzwinck · commit d165a4bda829 · 2016-08-03T17:21:48.000-05:00
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -871,3 +871,5 @@ Bug Fixes
 - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
 - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)
 - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
+
+- Bug in ``DataFrame.to_csv()`` in Python 3 where ``bytes`` values and labels were written with a ``b''`` wrapper instead of being decoded to text (:issue:`9712`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -2020,6 +2020,14 @@ def re_replacer(s):
 
         return block
 
+    def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
+                        bytes_encoding=None, **kwargs):
+        result = Block.to_native_types(self, slicer, na_rep, quoting, **kwargs)
+        if bytes_encoding is not None:  # None: leave bytes values untouched
+            for arr in result:  # each item is decoded in place
+                lib.object_array_decode_bytes(arr, bytes_encoding)
+        return result
+
 
 class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
     __slots__ = ()
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
@@ -1378,6 +1378,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                                not self.tupleize_cols)
 
+        # in Python 3, decode bytes to str so strings print without b''
+        if compat.PY3:
+            self.bytes_encoding = (encoding or get_option("display.encoding"))
+        else:
+            self.bytes_encoding = None
+
         # validate mi options
         if self.has_mi_columns:
             if cols is not None:
@@ -1387,6 +1393,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         if cols is not None:
             if isinstance(cols, Index):
                 cols = cols.to_native_types(na_rep=na_rep,
+                                            bytes_encoding=self.bytes_encoding,
                                             float_format=float_format,
                                             date_format=date_format,
                                             quoting=self.quoting)
@@ -1399,6 +1406,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         cols = self.obj.columns
         if isinstance(cols, Index):
             cols = cols.to_native_types(na_rep=na_rep,
+                                        bytes_encoding=self.bytes_encoding,
                                         float_format=float_format,
                                         date_format=date_format,
                                         quoting=self.quoting)
@@ -1506,6 +1514,8 @@ def _save_header(self):
             else:
                 encoded_labels = []
 
+        self._bytes_to_str(encoded_labels)
+
         if not has_mi_columns:
             encoded_labels += list(write_cols)
 
@@ -1565,6 +1575,7 @@ def _save_chunk(self, start_i, end_i):
         for i in range(len(self.blocks)):
             b = self.blocks[i]
             d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                  bytes_encoding=self.bytes_encoding,
                                   float_format=self.float_format,
                                   decimal=self.decimal,
                                   date_format=self.date_format,
@@ -1575,13 +1586,22 @@ def _save_chunk(self, start_i, end_i):
                 self.data[col_loc] = col
 
         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                        bytes_encoding=self.bytes_encoding,
                                         float_format=self.float_format,
                                         decimal=self.decimal,
                                         date_format=self.date_format,
                                         quoting=self.quoting)
 
         lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
 
+    def _bytes_to_str(self, values):
+        """Decode bytes items of the list ``values`` to str, in place."""
+        if self.bytes_encoding:  # unset (None) on Python 2: nothing to do
+            for ii, value in enumerate(values):
+                if isinstance(value, bytes):
+                    values[ii] = value.decode(self.bytes_encoding)
+
+
 # from collections import namedtuple
 # ExcelCell = namedtuple("ExcelCell",
 #                        'row, col, val, style, mergestart, mergeend')
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -1579,12 +1579,15 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
             result = _trim_front(format_array(values, None, justify='left'))
         return header + result
 
-    def to_native_types(self, slicer=None, **kwargs):
+    def to_native_types(self, slicer=None, bytes_encoding=None, **kwargs):
         """ slice and dice then format """
         values = self
         if slicer is not None:
             values = values[slicer]
-        return values._format_native_types(**kwargs)
+        result = values._format_native_types(**kwargs)
+        if bytes_encoding is not None and result.dtype == object:  # decoder needs object dtype
+            lib.object_array_decode_bytes(result, bytes_encoding)  # mutates result in place
+        return result
 
     def _format_native_types(self, na_rep='', quoting=None, **kwargs):
         """ actually format my specific types """
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -1053,6 +1053,25 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re
 
     return arr
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def object_array_decode_bytes(ndarray[object, ndim=1] arr, object encoding):
+    """Decode bytes in arr (also bytes inside tuples) to str in place; return arr."""
+    if bytes == str:  # in Python 2 bytes is str, so there is nothing to decode
+        return arr  # same return value as the normal path below
+
+    cdef int length = arr.shape[0], i = 0
+    for i from 0 <= i < length:
+        if isinstance(arr[i], bytes):
+            arr[i] = arr[i].decode(encoding)
+        elif isinstance(arr[i], tuple):  # tuple entries: decode their bytes members
+            mask = [isinstance(it, bytes) for it in arr[i]]
+            if any(mask):  # rebuild the tuple only when something changes
+                val = [it.decode(encoding) if mask[j] else it for j, it in enumerate(arr[i])]
+                arr[i] = tuple(val)
+
+    return arr
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -790,6 +790,28 @@ def test_to_csv_unicode_index_col(self):
         df2 = read_csv(buf, index_col=0, encoding='UTF-8')
         assert_frame_equal(df, df2)
 
+    def test_to_csv_bytes(self):
+        # GH 9712: bytes labels and values must be written without b''
+        times = pd.date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
+        df = DataFrame.from_items([
+            (b'hello', ['a', b'b']),  # bytes label, mixed str/bytes values
+            (b'times', times),  # bytes label on a datetime column
+        ])
+        df.loc[2] = np.nan  # extra all-NaN row, expected as empty fields
+        df.index.name = 'idx'
+
+        with ensure_clean() as path:
+            df.to_csv(path)
+            with open(path) as csvfile:
+                lines = csvfile.readlines()
+
+        expected = [
+            "idx,hello,times\n",
+            "0,a,2013-10-27 23:00:00\n",
+            "1,b,2013-10-28 00:00:00\n", "2,,\n",
+        ]
+        assert(lines == expected)
+
+
     def test_to_csv_stringio(self):
         buf = StringIO()
         self.frame.to_csv(buf)