Skip to content

Commit 385991b

Browse files
committed
BUG: Avoids b' prefix for bytes in to_csv() (pandas-dev#9712)
1 parent 998e2ab commit 385991b

File tree

6 files changed

+143
-8
lines changed

6 files changed

+143
-8
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,7 @@ I/O
10301030
- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
10311031
- Bug in :meth:`ujson.encode` was raising an ``OverflowError`` with numbers larger than ``sys.maxsize`` (:issue:`34395`)
10321032
- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`)
1033+
- Bug in :meth:`DataFrame.to_csv` which wrote bytes labels and values with a ``b''`` prefix instead of decoding them (:issue:`9712`)
10331034

10341035
Plotting
10351036
^^^^^^^^

pandas/_libs/lib.pyx

+26-4
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,17 @@ cdef class Validator:
15581558
else:
15591559
return False
15601560

1561+
cdef bint any(self, ndarray values) except -1:
    """
    Return True as soon as one of the first ``self.n`` entries of
    ``values`` passes ``self.is_valid``; return False otherwise.
    """
    cdef:
        Py_ssize_t idx
        Py_ssize_t count = self.n

    # Nothing to inspect, so nothing can be valid.
    if not count:
        return False

    for idx in range(count):
        if self.is_valid(values[idx]):
            return True
    return False
1571+
15611572
@cython.wraparound(False)
15621573
@cython.boundscheck(False)
15631574
cdef bint _validate(self, ndarray values) except -1:
@@ -1709,13 +1720,24 @@ cdef class BytesValidator(Validator):
17091720
cdef inline bint is_array_typed(self) except -1:
17101721
return issubclass(self.dtype.type, np.bytes_)
17111722

1712-
1713-
cdef bint is_bytes_array(ndarray values, bint skipna=False):
1723+
cpdef bint is_bytes_array(ndarray values, bint skipna=False,
                          bint mixing_allowed=True) except -1:
    """
    Check whether ``values`` validates as an array of bytes.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default False
        Forwarded to ``BytesValidator``.
    mixing_allowed : bool, default True
        When False, an array that fails validation but still contains at
        least one entry accepted by the validator is treated as an error
        instead of simply yielding False.

    Returns
    -------
    bool
        True if ``BytesValidator.validate`` accepts ``values``.

    Raises
    ------
    ValueError
        If ``mixing_allowed`` is False and only some entries are bytes.
    """
    cdef:
        BytesValidator validator = BytesValidator(len(values), values.dtype,
                                                  skipna=skipna)

    # Fully valid input gives the same answer whatever ``mixing_allowed`` is.
    if validator.validate(values):
        return True

    # Not all bytes: only strict mode cares whether *some* entries are bytes.
    if not mixing_allowed and validator.any(values):
        raise ValueError("Cannot mix types")
    return False
17191741

17201742
cdef class TemporalValidator(Validator):
17211743
cdef:

pandas/core/indexes/base.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@
7777
from pandas.core.ops import get_op_result_name
7878
from pandas.core.ops.invalid import make_invalid_op
7979
from pandas.core.sorting import ensure_key_mapped
80-
from pandas.core.strings import StringMethods
80+
from pandas.core.strings import StringMethods, str_decode
8181

8282
from pandas.io.formats.printing import (
8383
PrettyDict,
@@ -954,6 +954,8 @@ def to_native_types(self, slicer=None, **kwargs):
954954
Whether or not there are quoted values in `self`
955955
3) date_format : str
956956
The format used to represent date-like values.
957+
4) bytes_encoding : str
958+
The encoding scheme to use to decode the bytes.
957959
958960
Returns
959961
-------
@@ -965,7 +967,9 @@ def to_native_types(self, slicer=None, **kwargs):
965967
values = values[slicer]
966968
return values._format_native_types(**kwargs)
967969

968-
def _format_native_types(self, na_rep="", quoting=None, **kwargs):
970+
def _format_native_types(
971+
self, na_rep="", quoting=None, bytes_encoding=None, **kwargs
972+
):
969973
"""
970974
Actually format specific types of the index.
971975
"""
@@ -976,6 +980,8 @@ def _format_native_types(self, na_rep="", quoting=None, **kwargs):
976980
values = np.array(self, dtype=object, copy=True)
977981

978982
values[mask] = na_rep
983+
if lib.is_bytes_array(values, skipna=True, mixing_allowed=False):
984+
values = str_decode(values, bytes_encoding)
979985
return values
980986

981987
def _summary(self, name=None) -> str_t:

pandas/core/internals/blocks.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
)
8484
import pandas.core.missing as missing
8585
from pandas.core.nanops import nanpercentile
86+
from pandas.core.strings import str_decode
8687

8788
if TYPE_CHECKING:
8889
from pandas import Index
@@ -653,13 +654,20 @@ def should_store(self, value: ArrayLike) -> bool:
653654
"""
654655
return is_dtype_equal(value.dtype, self.dtype)
655656

656-
def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
657+
def to_native_types(
658+
self, na_rep="nan", bytes_encoding=None, quoting=None, **kwargs
659+
):
657660
""" convert to our native types format """
658661
values = self.values
659662

660663
mask = isna(values)
661664
itemsize = writers.word_len(na_rep)
662665

666+
length = values.shape[0]
667+
for i in range(length):
668+
if lib.is_bytes_array(values[i], skipna=True, mixing_allowed=False):
669+
values[i] = str_decode(values[i], bytes_encoding)
670+
663671
if not self.is_object and not quoting and itemsize:
664672
values = values.astype(str)
665673
if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:

pandas/io/formats/csvs.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import numpy as np
1313

14-
from pandas._libs import writers as libwriters
14+
from pandas._libs import lib, writers as libwriters
1515
from pandas._typing import FilePathOrBuffer
1616

1717
from pandas.core.dtypes.generic import (
@@ -108,6 +108,7 @@ def __init__(
108108
if isinstance(cols, ABCIndexClass):
109109
cols = cols.to_native_types(
110110
na_rep=na_rep,
111+
bytes_encoding=self.encoding,
111112
float_format=float_format,
112113
date_format=date_format,
113114
quoting=self.quoting,
@@ -122,6 +123,7 @@ def __init__(
122123
if isinstance(cols, ABCIndexClass):
123124
cols = cols.to_native_types(
124125
na_rep=na_rep,
126+
bytes_encoding=self.encoding,
125127
float_format=float_format,
126128
date_format=date_format,
127129
quoting=self.quoting,
@@ -278,6 +280,8 @@ def _save_header(self):
278280
else:
279281
encoded_labels = []
280282

283+
self._bytes_to_str(encoded_labels)
284+
281285
if not has_mi_columns or has_aliases:
282286
encoded_labels += list(write_cols)
283287
writer.writerow(encoded_labels)
@@ -300,6 +304,7 @@ def _save_header(self):
300304
col_line.extend([""] * (len(index_label) - 1))
301305

302306
col_line.extend(columns._get_level_values(i))
307+
self._bytes_to_str(col_line)
303308

304309
writer.writerow(col_line)
305310

@@ -340,6 +345,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
340345
b = blocks[i]
341346
d = b.to_native_types(
342347
na_rep=self.na_rep,
348+
bytes_encoding=self.encoding,
343349
float_format=self.float_format,
344350
decimal=self.decimal,
345351
date_format=self.date_format,
@@ -353,10 +359,19 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
353359
ix = data_index.to_native_types(
354360
slicer=slicer,
355361
na_rep=self.na_rep,
362+
bytes_encoding=self.encoding,
356363
float_format=self.float_format,
357364
decimal=self.decimal,
358365
date_format=self.date_format,
359366
quoting=self.quoting,
360367
)
361368

362369
libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
370+
371+
def _bytes_to_str(self, values):
    """
    Decode the bytes entries of ``values`` to str, in place.

    ``values`` is only modified when ``lib.is_bytes_array`` reports it as a
    bytes array; otherwise it is left untouched.

    Parameters
    ----------
    values : list
        Labels to convert. Modified in place; nothing is returned.

    Raises
    ------
    ValueError
        Raised by ``lib.is_bytes_array`` (called with
        ``mixing_allowed=False``) when bytes and non-bytes are mixed.
    UnicodeDecodeError
        If an entry cannot be decoded with ``self.encoding``.
    """
    np_values = np.array(values, dtype=object)
    if not lib.is_bytes_array(np_values, skipna=True, mixing_allowed=False):
        return

    for i, value in enumerate(values):
        # The check above runs with skipna=True, so null entries may have
        # been let through (NOTE(review): confirm against BytesValidator).
        # They have no ``.decode``; only real bytes objects are converted.
        if isinstance(value, bytes):
            values[i] = value.decode(self.encoding)

pandas/tests/frame/test_to_csv.py

+83
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,89 @@ def test_to_csv_withcommas(self):
740740
df2 = self.read_csv(path)
741741
tm.assert_frame_equal(df2, df)
742742

743+
def test_to_csv_bytes(self):
    """
    Check that ``to_csv`` decodes bytes labels and values (GH 9712).

    Covers plain bytes columns/index, a non-default encoding, a decoding
    failure, bytes mixed with str (expected to raise ``ValueError``), and
    MultiIndex levels/names.
    """
    # GH 9712
    times = date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
    df = DataFrame({b"foo": [b"bar", b"baz"], b"times": times}, index=[b"A", b"B"])
    df.loc[b"C"] = np.nan
    df.index.name = b"idx"

    df_expected = DataFrame(
        {"foo": ["bar", "baz"], "times": times}, index=["A", "B"]
    )
    df_expected.loc["C"] = np.nan
    df_expected.index.name = "idx"

    with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
        df.to_csv(path, header=True)
        df_output = self.read_csv(path)
        df_output.times = to_datetime(df_output.times)
        tm.assert_frame_equal(df_output, df_expected)

    # bytes that decode under gb18030; the utf-8 attempt further down is
    # expected to fail on them
    non_unicode_byte = b"\xbc\xa6"
    non_unicode_decoded = non_unicode_byte.decode("gb18030")
    df = DataFrame({non_unicode_byte: [non_unicode_byte, b"foo"]})
    df.index.name = "idx"

    df_expected = DataFrame({non_unicode_decoded: [non_unicode_decoded, "foo"]})
    df_expected.index.name = "idx"

    with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
        df.to_csv(path, encoding="gb18030", header=True)
        df_output = self.read_csv(path, encoding="gb18030")
        tm.assert_frame_equal(df_output, df_expected)

    # decoding error, when transcoding fails
    with pytest.raises(UnicodeDecodeError):
        df.to_csv(encoding="utf-8")

    # mixing of bytes and non-bytes
    df = DataFrame({"foo": [b"bar", "baz"]})
    with pytest.raises(ValueError):
        df.to_csv()
    df = DataFrame({b"foo": ["a", "b"], "bar": ["c", "d"]})
    with pytest.raises(ValueError):
        df.to_csv()
    df = DataFrame({"foo": ["a", "b"], "bar": ["c", "d"]}, index=["A", b"B"])
    with pytest.raises(ValueError):
        df.to_csv()

    # multi-indexes
    iterables = [[b"A", b"B"], ["C", "D"]]
    index = pd.MultiIndex.from_product(iterables, names=[b"f", b"s"])
    data = np.array([[0, 0], [0, 0], [0, 0], [0, 0]])
    df = pd.DataFrame(data, index=index)

    with tm.ensure_clean("__tmp_to_csv_bytes__.csv") as path:
        df.to_csv(path)
        # read the raw lines back so the exact text written is compared
        with open(path) as csvfile:
            output = csvfile.readlines()

    expected = [
        "f,s,0,1\n",
        "A,C,0,0\n",
        "A,D,0,0\n",
        "B,C,0,0\n",
        "B,D,0,0\n",
    ]
    assert output == expected

    # mixing of bytes and non-bytes in multi-indexes
    iterables = [[b"A", "B"], ["C", "D"]]
    index = pd.MultiIndex.from_product(iterables)
    df = pd.DataFrame(data, index=index)
    with pytest.raises(ValueError):
        df.to_csv()

    iterables = [["A", "B"], ["C", "D"]]
    index = pd.MultiIndex.from_product(iterables, names=[b"f", "s"])
    df = pd.DataFrame(data, index=index)
    with pytest.raises(ValueError):
        df.to_csv()
825+
743826
def test_to_csv_mixed(self):
744827
def create_cols(name):
745828
return [f"{name}{i:03d}" for i in range(5)]

0 commit comments

Comments
 (0)