Skip to content

Commit d165a4b

Browse files
committed
BUG: avoid "b" prefix for bytes in to_csv() on Python 3 (pandas-dev#9712)
1 parent e2cb799 commit d165a4b

File tree

6 files changed

+76
-2
lines changed

6 files changed

+76
-2
lines changed

doc/source/whatsnew/v0.19.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -871,3 +871,5 @@ Bug Fixes
871871
- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
872872
- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)
873873
- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
874+
875+
- Bug in ``to_csv()`` in Python 3 where ``bytes`` objects were written with a ``b''`` wrapper (e.g. ``b'hello'``) instead of being decoded to text (:issue:`9712`)

pandas/core/internals.py

+8
Original file line numberDiff line numberDiff line change
@@ -2020,6 +2020,14 @@ def re_replacer(s):
20202020

20212021
return block
20222022

2023+
def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
                    bytes_encoding=None, **kwargs):
    """ convert to our native types format, decoding bytes if requested

    Delegates the conversion to ``Block.to_native_types``.  When
    ``bytes_encoding`` is not None, every element obtained by iterating
    over that result is handed to ``lib.object_array_decode_bytes``
    together with the encoding.  The converted result is returned.
    """
    native = Block.to_native_types(self, slicer, na_rep, quoting, **kwargs)
    if bytes_encoding is None:
        # nothing to decode; hand back the parent's result untouched
        return native

    for row in native:
        lib.object_array_decode_bytes(row, bytes_encoding)
    return native
2030+
20232031

20242032
class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
20252033
__slots__ = ()

pandas/formats/format.py

+20
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
13781378
self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
13791379
not self.tupleize_cols)
13801380

1381+
# in Python 3, decode bytes to str so strings print without b''
1382+
if compat.PY3:
1383+
self.bytes_encoding = (encoding or get_option("display.encoding"))
1384+
else:
1385+
self.bytes_encoding = None
1386+
13811387
# validate mi options
13821388
if self.has_mi_columns:
13831389
if cols is not None:
@@ -1387,6 +1393,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
13871393
if cols is not None:
13881394
if isinstance(cols, Index):
13891395
cols = cols.to_native_types(na_rep=na_rep,
1396+
bytes_encoding=self.bytes_encoding,
13901397
float_format=float_format,
13911398
date_format=date_format,
13921399
quoting=self.quoting)
@@ -1399,6 +1406,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
13991406
cols = self.obj.columns
14001407
if isinstance(cols, Index):
14011408
cols = cols.to_native_types(na_rep=na_rep,
1409+
bytes_encoding=self.bytes_encoding,
14021410
float_format=float_format,
14031411
date_format=date_format,
14041412
quoting=self.quoting)
@@ -1506,6 +1514,8 @@ def _save_header(self):
15061514
else:
15071515
encoded_labels = []
15081516

1517+
self._bytes_to_str(encoded_labels)
1518+
15091519
if not has_mi_columns:
15101520
encoded_labels += list(write_cols)
15111521

@@ -1565,6 +1575,7 @@ def _save_chunk(self, start_i, end_i):
15651575
for i in range(len(self.blocks)):
15661576
b = self.blocks[i]
15671577
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
1578+
bytes_encoding=self.bytes_encoding,
15681579
float_format=self.float_format,
15691580
decimal=self.decimal,
15701581
date_format=self.date_format,
@@ -1575,13 +1586,22 @@ def _save_chunk(self, start_i, end_i):
15751586
self.data[col_loc] = col
15761587

15771588
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
1589+
bytes_encoding=self.bytes_encoding,
15781590
float_format=self.float_format,
15791591
decimal=self.decimal,
15801592
date_format=self.date_format,
15811593
quoting=self.quoting)
15821594

15831595
lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
15841596

1597+
def _bytes_to_str(self, values):
1598+
"""Modify values list by decoding bytes to str."""
1599+
if self.bytes_encoding:
1600+
for ii, value in enumerate(values):
1601+
if isinstance(value, bytes):
1602+
values[ii] = value.decode(self.bytes_encoding)
1603+
1604+
15851605
# from collections import namedtuple
15861606
# ExcelCell = namedtuple("ExcelCell",
15871607
# 'row, col, val, style, mergestart, mergeend')

pandas/indexes/base.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1579,12 +1579,15 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
15791579
result = _trim_front(format_array(values, None, justify='left'))
15801580
return header + result
15811581

1582-
def to_native_types(self, slicer=None, bytes_encoding=None, **kwargs):
    """ slice and dice then format, optionally decoding bytes

    ``slicer``, when given, is applied to ``self`` by indexing before
    formatting.  The remaining keyword arguments are forwarded to
    ``_format_native_types``.  If ``bytes_encoding`` is not None and the
    formatted result has object dtype, it is passed to
    ``lib.object_array_decode_bytes`` with that encoding.
    """
    sliced = self if slicer is None else self[slicer]
    formatted = sliced._format_native_types(**kwargs)

    needs_decode = bytes_encoding is not None and formatted.dtype == object
    if needs_decode:
        lib.object_array_decode_bytes(formatted, bytes_encoding)
    return formatted
15881591

15891592
def _format_native_types(self, na_rep='', quoting=None, **kwargs):
15901593
""" actually format my specific types """

pandas/lib.pyx

+19
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,25 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re
10531053

10541054
return arr
10551055

1056+
@cython.boundscheck(False)
@cython.wraparound(False)
def object_array_decode_bytes(ndarray[object, ndim=1] arr, object encoding):
    """
    Decode any instances of bytes to str in arr using the given encoding.

    The array is modified in place.  An element that is ``bytes`` is
    replaced by its decoded value.  An element that is a tuple containing
    at least one ``bytes`` item is replaced by a new tuple in which those
    items are decoded and the others are kept as they are.  All other
    elements are left untouched.

    Parameters
    ----------
    arr : ndarray[object, ndim=1]
    encoding : object
        passed to ``bytes.decode``

    Returns
    -------
    arr : the same array object that was passed in
    """
    cdef:
        # Py_ssize_t rather than int so long arrays cannot overflow the index
        Py_ssize_t i, n = arr.shape[0]
        object val

    if bytes is str:
        # in Python 2 these are the same and nothing needs to be done;
        # still return arr so both code paths have the same return value
        return arr

    for i in range(n):
        val = arr[i]  # fetch once instead of re-indexing the buffer
        if isinstance(val, bytes):
            arr[i] = val.decode(encoding)
        elif isinstance(val, tuple):
            # only build a new tuple when something actually needs decoding
            if any([isinstance(it, bytes) for it in val]):
                arr[i] = tuple([it.decode(encoding)
                                if isinstance(it, bytes) else it
                                for it in val])

    return arr
1074+
10561075
@cython.boundscheck(False)
10571076
@cython.wraparound(False)
10581077
def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):

pandas/tests/frame/test_to_csv.py

+22
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,28 @@ def test_to_csv_unicode_index_col(self):
790790
df2 = read_csv(buf, index_col=0, encoding='UTF-8')
791791
assert_frame_equal(df, df2)
792792

793+
def test_to_csv_bytes(self):
    # GH 9712
    # bytes used as column labels and as cell values should come out of
    # to_csv() as plain text; the appended all-NaN row is expected to be
    # written as empty fields
    stamps = pd.date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
    columns = [
        (b'hello', ['a', b'b']),
        (b'times', stamps),
    ]
    df = DataFrame.from_items(columns)
    df.loc[2] = np.nan
    df.index.name = 'idx'

    with ensure_clean() as path:
        df.to_csv(path)
        with open(path) as fh:
            lines = fh.readlines()

    expected = [
        "idx,hello,times\n",
        "0,a,2013-10-27 23:00:00\n",
        "1,b,2013-10-28 00:00:00\n",
        "2,,\n",
    ]
    assert lines == expected
814+
793815
def test_to_csv_stringio(self):
794816
buf = StringIO()
795817
self.frame.to_csv(buf)

0 commit comments

Comments
 (0)