BUG: Avoids b' prefix for bytes in to_csv() (pandas-dev#9712)

sidhant007 · sidhant007 · commit 385991b8a431 · 2020-06-29T23:41:42.000+08:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -1030,6 +1030,7 @@ I/O
 - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
 - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`)
 - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`)
+- Bug in :meth:`DataFrame.to_csv` where ``bytes`` values and labels were written with a ``b''`` prefix instead of being decoded (:issue:`9712`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1558,6 +1558,17 @@ cdef class Validator:
         else:
             return False
 
+    cdef bint any(self, ndarray values) except -1:
+        """Return True if at least one element of ``values`` passes is_valid."""
+        cdef:
+            Py_ssize_t i
+            Py_ssize_t n = self.n
+
+        for i in range(n):
+            if self.is_valid(values[i]):
+                return True
+        return False
+
     @cython.wraparound(False)
     @cython.boundscheck(False)
     cdef bint _validate(self, ndarray values) except -1:
@@ -1709,13 +1720,24 @@ cdef class BytesValidator(Validator):
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.bytes_)
 
-
-cdef bint is_bytes_array(ndarray values, bint skipna=False):
+cpdef bint is_bytes_array(ndarray values, bint skipna=False,
+                          bint mixing_allowed=True) except -1:
+    """
+    Check whether every element of ``values`` is bytes.
+
+    ``skipna`` is passed through to BytesValidator. With
+    ``mixing_allowed=False``, a ValueError is raised instead of returning
+    False when some, but not all, elements are bytes.
+    """
     cdef:
         BytesValidator validator = BytesValidator(len(values), values.dtype,
                                                   skipna=skipna)
-    return validator.validate(values)
-
+    if validator.validate(values):
+        return True
+    # Not every element validated; a partial match is an error if disallowed.
+    if not mixing_allowed and validator.any(values):
+        raise ValueError("Cannot mix types")
+    return False
 
 cdef class TemporalValidator(Validator):
     cdef:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -77,7 +77,7 @@
 from pandas.core.ops import get_op_result_name
 from pandas.core.ops.invalid import make_invalid_op
 from pandas.core.sorting import ensure_key_mapped
-from pandas.core.strings import StringMethods
+from pandas.core.strings import StringMethods, str_decode
 
 from pandas.io.formats.printing import (
     PrettyDict,
@@ -954,6 +954,8 @@ def to_native_types(self, slicer=None, **kwargs):
                 Whether or not there are quoted values in `self`
             3) date_format : str
                 The format used to represent date-like values.
+            4) bytes_encoding : str
+                The encoding scheme to use to decode the bytes.
 
         Returns
         -------
@@ -965,7 +967,9 @@ def to_native_types(self, slicer=None, **kwargs):
             values = values[slicer]
         return values._format_native_types(**kwargs)
 
-    def _format_native_types(self, na_rep="", quoting=None, **kwargs):
+    def _format_native_types(
+        self, na_rep="", quoting=None, bytes_encoding=None, **kwargs
+    ):
         """
         Actually format specific types of the index.
         """
@@ -976,6 +980,8 @@ def _format_native_types(self, na_rep="", quoting=None, **kwargs):
             values = np.array(self, dtype=object, copy=True)
 
         values[mask] = na_rep
+        if lib.is_bytes_array(values, skipna=True, mixing_allowed=False):
+            values = str_decode(values, bytes_encoding)
         return values
 
     def _summary(self, name=None) -> str_t:
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -83,6 +83,7 @@
 )
 import pandas.core.missing as missing
 from pandas.core.nanops import nanpercentile
+from pandas.core.strings import str_decode
 
 if TYPE_CHECKING:
     from pandas import Index
@@ -653,13 +654,20 @@ def should_store(self, value: ArrayLike) -> bool:
         """
         return is_dtype_equal(value.dtype, self.dtype)
 
-    def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
+    def to_native_types(
+        self, na_rep="nan", bytes_encoding=None, quoting=None, **kwargs
+    ):
         """ convert to our native types format """
         values = self.values
 
         mask = isna(values)
         itemsize = writers.word_len(na_rep)
 
+        length = values.shape[0]
+        for i in range(length):
+            if lib.is_bytes_array(values[i], skipna=True, mixing_allowed=False):
+                values[i] = str_decode(values[i], bytes_encoding)
+
         if not self.is_object and not quoting and itemsize:
             values = values.astype(str)
             if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -11,7 +11,7 @@
 
 import numpy as np
 
-from pandas._libs import writers as libwriters
+from pandas._libs import lib, writers as libwriters
 from pandas._typing import FilePathOrBuffer
 
 from pandas.core.dtypes.generic import (
@@ -108,6 +108,7 @@ def __init__(
             if isinstance(cols, ABCIndexClass):
                 cols = cols.to_native_types(
                     na_rep=na_rep,
+                    bytes_encoding=self.encoding,
                     float_format=float_format,
                     date_format=date_format,
                     quoting=self.quoting,
@@ -122,6 +123,7 @@ def __init__(
         if isinstance(cols, ABCIndexClass):
             cols = cols.to_native_types(
                 na_rep=na_rep,
+                bytes_encoding=self.encoding,
                 float_format=float_format,
                 date_format=date_format,
                 quoting=self.quoting,
@@ -278,6 +280,8 @@ def _save_header(self):
             else:
                 encoded_labels = []
 
+        self._bytes_to_str(encoded_labels)
+
         if not has_mi_columns or has_aliases:
             encoded_labels += list(write_cols)
             writer.writerow(encoded_labels)
@@ -300,6 +304,7 @@ def _save_header(self):
                         col_line.extend([""] * (len(index_label) - 1))
 
                 col_line.extend(columns._get_level_values(i))
+                self._bytes_to_str(col_line)
 
                 writer.writerow(col_line)
 
@@ -340,6 +345,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
             b = blocks[i]
             d = b.to_native_types(
                 na_rep=self.na_rep,
+                bytes_encoding=self.encoding,
                 float_format=self.float_format,
                 decimal=self.decimal,
                 date_format=self.date_format,
@@ -353,10 +359,19 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         ix = data_index.to_native_types(
             slicer=slicer,
             na_rep=self.na_rep,
+            bytes_encoding=self.encoding,
             float_format=self.float_format,
             decimal=self.decimal,
             date_format=self.date_format,
             quoting=self.quoting,
         )
 
         libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
+
+    def _bytes_to_str(self, values):
+        """Decode the bytes entries of the list ``values`` to str, in place, when
+        the bytes check passes; entries that are not bytes are left untouched."""
+        np_values = np.array(values, dtype=object)
+        if lib.is_bytes_array(np_values, skipna=True, mixing_allowed=False):
+            enc = self.encoding  # skipna=True may let non-bytes (null) entries through
+            values[:] = [v.decode(enc) if isinstance(v, bytes) else v for v in values]
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -740,6 +740,89 @@ def test_to_csv_withcommas(self):
             df2 = self.read_csv(path)
             tm.assert_frame_equal(df2, df)
 
+    def test_to_csv_bytes(self):
+        # GH 9712
+        # all-bytes labels and values are written decoded, without the b'' prefix
+        times = date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
+        df = DataFrame({b"foo": [b"bar", b"baz"], b"times": times}, index=[b"A", b"B"])
+        df.loc[b"C"] = np.nan
+        df.index.name = b"idx"
+
+        df_expected = DataFrame(
+            {"foo": ["bar", "baz"], "times": times}, index=["A", "B"]
+        )
+        df_expected.loc["C"] = np.nan
+        df_expected.index.name = "idx"
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path, header=True)
+            df_output = self.read_csv(path)
+            df_output.times = to_datetime(df_output.times)
+            tm.assert_frame_equal(df_output, df_expected)
+
+        # bytes are decoded with the encoding passed to to_csv
+        non_unicode_byte = b"\xbc\xa6"
+        non_unicode_decoded = non_unicode_byte.decode("gb18030")
+        df = DataFrame({non_unicode_byte: [non_unicode_byte, b"foo"]})
+        df.index.name = "idx"
+
+        df_expected = DataFrame({non_unicode_decoded: [non_unicode_decoded, "foo"]})
+        df_expected.index.name = "idx"
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path, encoding="gb18030", header=True)
+            df_output = self.read_csv(path, encoding="gb18030")
+            tm.assert_frame_equal(df_output, df_expected)
+
+        # decoding error, when transcoding fails
+        with pytest.raises(UnicodeDecodeError):
+            df.to_csv(encoding="utf-8")
+
+        # mixing of bytes and non-bytes
+        df = DataFrame({"foo": [b"bar", "baz"]})
+        with pytest.raises(ValueError):
+            df.to_csv()
+        df = DataFrame({b"foo": ["a", "b"], "bar": ["c", "d"]})
+        with pytest.raises(ValueError):
+            df.to_csv()
+        df = DataFrame({"foo": ["a", "b"], "bar": ["c", "d"]}, index=["A", b"B"])
+        with pytest.raises(ValueError):
+            df.to_csv()
+
+        # multi-indexes
+        # a bytes level next to a str level is not treated as mixing
+        iterables = [[b"A", b"B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables, names=[b"f", b"s"])
+        data = np.array([[0, 0], [0, 0], [0, 0], [0, 0]])
+        df = pd.DataFrame(data, index=index)
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path)
+            with open(path) as csvfile:
+                output = csvfile.readlines()
+
+        expected = [
+            "f,s,0,1\n",
+            "A,C,0,0\n",
+            "A,D,0,0\n",
+            "B,C,0,0\n",
+            "B,D,0,0\n",
+        ]
+        assert output == expected
+
+        # mixing of bytes and non-bytes in multi-indexes
+        iterables = [[b"A", "B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables)
+        df = pd.DataFrame(data, index=index)
+        with pytest.raises(ValueError):
+            df.to_csv()
+
+        iterables = [["A", "B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables, names=[b"f", "s"])
+        df = pd.DataFrame(data, index=index)
+        with pytest.raises(ValueError):
+            df.to_csv()
+
     def test_to_csv_mixed(self):
         def create_cols(name):
             return [f"{name}{i:03d}" for i in range(5)]