BUG: Fix b' prefix for bytes in to_csv() (pandas-dev#9712)

sidhant007 · sidhant007 · commit 2d3d035de3aa · 2020-06-26T12:21:18.000+08:00
Add a new optional parameter named bytes_encoding to allow a specific
encoding scheme to be used to decode the bytes.
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -998,6 +998,7 @@ I/O
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
 - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`)
 - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`)
+- Bug in :meth:`DataFrame.to_csv` which wrote bytes values with a ``b''`` prefix. It now accepts an optional ``bytes_encoding`` parameter specifying the encoding used to decode bytes before they are written (:issue:`9712`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1558,6 +1558,17 @@ cdef class Validator:
         else:
             return False
 
+    cdef bint any(self, ndarray values) except -1:
+        # Return True as soon as one of the first ``self.n`` elements of
+        # ``values`` satisfies ``self.is_valid``; return False when none
+        # does or when ``self.n`` is 0. The ``skipna`` option given to the
+        # validator is not consulted here.
+        cdef:
+            Py_ssize_t idx
+            Py_ssize_t count = self.n
+
+        if count == 0:
+            return False
+
+        for idx in range(count):
+            if self.is_valid(values[idx]):
+                return True
+        return False
+
     @cython.wraparound(False)
     @cython.boundscheck(False)
     cdef bint _validate(self, ndarray values) except -1:
@@ -1710,12 +1721,17 @@ cdef class BytesValidator(Validator):
         return issubclass(self.dtype.type, np.bytes_)
 
 
-cdef bint is_bytes_array(ndarray values, bint skipna=False):
+cpdef bint is_bytes_array(ndarray values, bint skipna=False):
+    # cpdef (rather than cdef) so the check is also callable from Python.
+    # Builds a BytesValidator sized to ``values`` and returns the result of
+    # its ``validate`` method.
     cdef:
         BytesValidator validator = BytesValidator(len(values), values.dtype,
                                                   skipna=skipna)
     return validator.validate(values)
 
+
+cpdef bint is_any_bytes_in_array(ndarray values, bint skipna=False):
+    """
+    Check whether at least one element of ``values`` is accepted by a
+    ``BytesValidator``.
+
+    ``skipna`` is forwarded to the validator for signature parity with
+    ``is_bytes_array``, but ``Validator.any`` does not consult it.
+    """
+    cdef:
+        BytesValidator validator = BytesValidator(len(values), values.dtype,
+                                                  skipna=skipna)
+    return validator.any(values)
+
 
 cdef class TemporalValidator(Validator):
     cdef:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3031,6 +3031,7 @@ def to_csv(
         index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None,
         mode: str = "w",
         encoding: Optional[str] = None,
+        bytes_encoding: Optional[str] = None,
         compression: Optional[Union[str, Mapping[str, str]]] = "infer",
         quoting: Optional[int] = None,
         quotechar: str = '"',
@@ -3088,6 +3089,10 @@ def to_csv(
         encoding : str, optional
             A string representing the encoding to use in the output file,
             defaults to 'utf-8'.
+        bytes_encoding : str, optional
+            A string representing the encoding used to decode bytes values
+            (data, index and column labels) before they are written. Defaults
+            to the encoding used for the output: the encoding of the file
+            object when it has one, otherwise `encoding`.
         compression : str or dict, default 'infer'
             If str, represents compression mode. If dict, value at 'method' is
             the compression mode. Compression mode may be any of the following
@@ -3178,6 +3183,7 @@ def to_csv(
             line_terminator=line_terminator,
             sep=sep,
             encoding=encoding,
+            bytes_encoding=bytes_encoding,
             errors=errors,
             compression=compression,
             quoting=quoting,
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -77,7 +77,7 @@
 from pandas.core.ops import get_op_result_name
 from pandas.core.ops.invalid import make_invalid_op
 from pandas.core.sorting import ensure_key_mapped
-from pandas.core.strings import StringMethods
+from pandas.core.strings import StringMethods, str_decode
 
 from pandas.io.formats.printing import (
     PrettyDict,
@@ -954,6 +954,8 @@ def to_native_types(self, slicer=None, **kwargs):
                 Whether or not there are quoted values in `self`
             3) date_format : str
                 The format used to represent date-like values.
+            4) bytes_encoding : str
+                The encoding scheme to use to decode the bytes.
 
         Returns
         -------
@@ -965,7 +967,9 @@ def to_native_types(self, slicer=None, **kwargs):
             values = values[slicer]
         return values._format_native_types(**kwargs)
 
-    def _format_native_types(self, na_rep="", quoting=None, **kwargs):
+    def _format_native_types(
+        self, na_rep="", quoting=None, bytes_encoding=None, **kwargs
+    ):
         """
         Actually format specific types of the index.
         """
@@ -976,6 +980,12 @@ def _format_native_types(self, na_rep="", quoting=None, **kwargs):
             values = np.array(self, dtype=object, copy=True)
 
         values[mask] = na_rep
+        is_all_bytes = lib.is_bytes_array(values, skipna=True)
+        is_any_bytes = lib.is_any_bytes_in_array(values, skipna=True)
+        if is_any_bytes and not is_all_bytes:
+            raise ValueError("Cannot mix types")
+        if bytes_encoding is not None and is_all_bytes:
+            values = str_decode(values, bytes_encoding)
         return values
 
     def _summary(self, name=None) -> str_t:
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -82,6 +82,7 @@
 )
 import pandas.core.missing as missing
 from pandas.core.nanops import nanpercentile
+from pandas.core.strings import str_decode
 
 if TYPE_CHECKING:
     from pandas import Index
@@ -642,13 +643,24 @@ def should_store(self, value: ArrayLike) -> bool:
         """
         return is_dtype_equal(value.dtype, self.dtype)
 
-    def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
+    def to_native_types(
+        self, na_rep="nan", bytes_encoding=None, quoting=None, **kwargs
+    ):
         """ convert to our native types format """
         values = self.values
 
         mask = isna(values)
         itemsize = writers.word_len(na_rep)
 
+        length = values.shape[0]
+        for i in range(length):
+            is_all_bytes = lib.is_bytes_array(values[i], skipna=True)
+            is_any_bytes = lib.is_any_bytes_in_array(values[i])
+            if is_any_bytes and not is_all_bytes:
+                raise ValueError("Cannot mix types")
+            if bytes_encoding is not None and is_all_bytes:
+                values[i] = str_decode(values[i], bytes_encoding)
+
         if not self.is_object and not quoting and itemsize:
             values = values.astype(str)
             if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -11,7 +11,7 @@
 
 import numpy as np
 
-from pandas._libs import writers as libwriters
+from pandas._libs import writers as libwriters, lib
 from pandas._typing import FilePathOrBuffer
 
 from pandas.core.dtypes.generic import (
@@ -30,6 +30,23 @@
 )
 
 
+class EncodingConflictWarning(Warning):
+    """
+    Warning emitted when the ``encoding`` passed to ``to_csv`` differs from
+    the encoding of the already-open file object being written to.
+    """
+
+    pass
+
+
+# Message template for EncodingConflictWarning. Placeholders, in order: the
+# file object's encoding, the encoding passed to ``to_csv``, and the encoding
+# that will actually be used (the file object's).
+encoding_conflict_doc = """
+the encoding scheme: [%s] with which the existing file object is opened \
+conflicted with the encoding scheme: [%s] mentioned in the .to_csv method. \
+Will be using encoding scheme mentioned by the file object that is [%s].
+"""
+
+
+def _mismatch_encoding(encoding, path_or_buf_encoding):
+    """
+    Return True when both encodings are given and name different codecs.
+
+    Parameters
+    ----------
+    encoding : str or None
+        Encoding passed to ``to_csv``.
+    path_or_buf_encoding : str or None
+        Encoding reported by the file object.
+
+    Returns
+    -------
+    bool
+        False if either argument is None or both resolve to the same codec.
+    """
+    import codecs
+
+    if encoding is None or path_or_buf_encoding is None:
+        return False
+
+    def _canonical(name):
+        # Aliases such as "utf8", "UTF-8" and "utf_8" all denote one codec;
+        # comparing the raw strings would report a conflict that does not
+        # exist. Unknown names fall back to a case-insensitive comparison.
+        try:
+            return codecs.lookup(name).name
+        except (LookupError, TypeError):
+            return str(name).lower()
+
+    return _canonical(encoding) != _canonical(path_or_buf_encoding)
+
+
 class CSVFormatter:
     def __init__(
         self,
@@ -44,6 +61,7 @@ def __init__(
         index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None,
         mode: str = "w",
         encoding: Optional[str] = None,
+        bytes_encoding: Optional[str] = None,
         errors: str = "strict",
         compression: Union[str, Mapping[str, str], None] = "infer",
         quoting: Optional[int] = None,
@@ -75,12 +93,32 @@ def __init__(
         self.index = index
         self.index_label = index_label
         self.mode = mode
-        if encoding is None:
-            encoding = "utf-8"
+
+        if hasattr(self.path_or_buf, "encoding"):
+            if _mismatch_encoding(encoding, self.path_or_buf.encoding):
+                ws = encoding_conflict_doc % (
+                    self.path_or_buf.encoding,
+                    encoding,
+                    self.path_or_buf.encoding,
+                )
+                warnings.warn(ws, EncodingConflictWarning, stacklevel=2)
+            if self.path_or_buf.encoding is None:
+                encoding = "utf-8"
+            else:
+                encoding = self.path_or_buf.encoding
+        else:
+            if encoding is None:
+                encoding = "utf-8"
+
         self.encoding = encoding
         self.errors = errors
         self.compression = infer_compression(self.path_or_buf, compression)
 
+        if bytes_encoding is None:
+            bytes_encoding = self.encoding
+
+        self.bytes_encoding = bytes_encoding
+
         if quoting is None:
             quoting = csvlib.QUOTE_MINIMAL
         self.quoting = quoting
@@ -108,6 +146,7 @@ def __init__(
             if isinstance(cols, ABCIndexClass):
                 cols = cols.to_native_types(
                     na_rep=na_rep,
+                    bytes_encoding=bytes_encoding,
                     float_format=float_format,
                     date_format=date_format,
                     quoting=self.quoting,
@@ -122,6 +161,7 @@ def __init__(
         if isinstance(cols, ABCIndexClass):
             cols = cols.to_native_types(
                 na_rep=na_rep,
+                bytes_encoding=bytes_encoding,
                 float_format=float_format,
                 date_format=date_format,
                 quoting=self.quoting,
@@ -278,6 +318,8 @@ def _save_header(self):
             else:
                 encoded_labels = []
 
+        self._bytes_to_str(encoded_labels)
+
         if not has_mi_columns or has_aliases:
             encoded_labels += list(write_cols)
             writer.writerow(encoded_labels)
@@ -300,6 +342,7 @@ def _save_header(self):
                         col_line.extend([""] * (len(index_label) - 1))
 
                 col_line.extend(columns._get_level_values(i))
+                self._bytes_to_str(col_line)
 
                 writer.writerow(col_line)
 
@@ -340,6 +383,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
             b = blocks[i]
             d = b.to_native_types(
                 na_rep=self.na_rep,
+                bytes_encoding=self.bytes_encoding,
                 float_format=self.float_format,
                 decimal=self.decimal,
                 date_format=self.date_format,
@@ -353,10 +397,23 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         ix = data_index.to_native_types(
             slicer=slicer,
             na_rep=self.na_rep,
+            bytes_encoding=self.bytes_encoding,
             float_format=self.float_format,
             decimal=self.decimal,
             date_format=self.date_format,
             quoting=self.quoting,
         )
 
         libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
+
+    def _bytes_to_str(self, values):
+        """
+        Decode the bytes elements of the list ``values`` to str, in place.
+
+        Decoding only happens when ``lib.is_bytes_array`` accepts the whole
+        list and ``self.bytes_encoding`` is set; otherwise ``values`` is left
+        untouched.
+
+        Raises
+        ------
+        ValueError
+            If ``lib.is_any_bytes_in_array`` finds bytes while
+            ``lib.is_bytes_array`` rejects the list, i.e. bytes are mixed
+            with other types.
+        """
+        as_objects = np.array(values, dtype=object)
+        all_bytes = lib.is_bytes_array(as_objects)
+        if lib.is_any_bytes_in_array(as_objects) and not all_bytes:
+            raise ValueError("Cannot mix types")
+        if not all_bytes or self.bytes_encoding is None:
+            return
+        for pos, raw in enumerate(values):
+            values[pos] = raw.decode(self.bytes_encoding)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -740,6 +740,91 @@ def test_to_csv_withcommas(self):
             df2 = self.read_csv(path)
             tm.assert_frame_equal(df2, df)
 
+    def test_to_csv_bytes(self):
+        # GH 9712
+        # bytes data, column labels, index labels and index name are decoded
+        # (utf-8 by default) and round-trip as str
+        times = date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
+        df = DataFrame(
+            {b"hello": [b"abcd", b"world"], b"times": times}, index=[b"A", b"B"]
+        )
+        df.loc[b"C"] = np.nan
+        df.index.name = b"idx"
+
+        df_expected = DataFrame(
+            {"hello": ["abcd", "world"], "times": times}, index=["A", "B"]
+        )
+        df_expected.loc["C"] = np.nan
+        df_expected.index.name = "idx"
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path, header=True)
+            df_output = self.read_csv(path)
+            df_output.times = to_datetime(df_output.times)
+            tm.assert_frame_equal(df_output, df_expected)
+
+        # explicit bytes_encoding for bytes that are not valid utf-8
+        non_unicode_byte = b"\xbc\xa6"
+        non_unicode_decoded = non_unicode_byte.decode("gb18030")
+        df = DataFrame({non_unicode_byte: [non_unicode_byte, b"world"]})
+        df.index.name = "idx"
+
+        df_expected = DataFrame({non_unicode_decoded: [non_unicode_decoded, "world"]})
+        df_expected.index.name = "idx"
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path, bytes_encoding="gb18030", header=True)
+            df_output = self.read_csv(path)
+            tm.assert_frame_equal(df_output, df_expected)
+
+        # decoding error, when transcoding fails
+        with pytest.raises(UnicodeDecodeError):
+            df.to_csv(bytes_encoding="utf-8")
+
+        # mixing of bytes and non-bytes
+        df = DataFrame({"hello": [b"abcd", "world"]})
+        with pytest.raises(ValueError):
+            df.to_csv()
+        df = DataFrame({b"hello": ["a", "b"], "world": ["c", "d"]})
+        with pytest.raises(ValueError):
+            df.to_csv()
+        df = DataFrame({"hello": ["a", "b"], "world": ["c", "d"]}, index=["A", b"B"])
+        with pytest.raises(ValueError):
+            df.to_csv()
+
+        # multi-indexes
+        iterables = [[b"A", b"B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables, names=[b"f", b"s"])
+        data = np.array([[0, 0], [0, 0], [0, 0], [0, 0]])
+        df = pd.DataFrame(data, index=index)
+
+        with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
+            df.to_csv(path)
+            with open(path) as csvfile:
+                output = csvfile.readlines()
+
+        expected = [
+            "f,s,0,1\n",
+            "A,C,0,0\n",
+            "A,D,0,0\n",
+            "B,C,0,0\n",
+            "B,D,0,0\n",
+        ]
+        assert output == expected
+
+        # mixing of bytes and non-bytes in multi-indexes
+        iterables = [[b"A", "B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables)
+        df = pd.DataFrame(data, index=index)
+        with pytest.raises(ValueError):
+            df.to_csv()
+
+        # mixing of bytes and non-bytes in multi-index level names
+        iterables = [["A", "B"], ["C", "D"]]
+        index = pd.MultiIndex.from_product(iterables, names=[b"f", "s"])
+        df = pd.DataFrame(data, index=index)
+        with pytest.raises(ValueError):
+            df.to_csv()
+
     def test_to_csv_mixed(self):
         def create_cols(name):
             return [f"{name}{i:03d}" for i in range(5)]