Commit 34bc8e5

BUG: Ignore the BOM in BOM UTF-8 CSV files
Closes pandas-dev gh-4793.
1 parent 2beab41 commit 34bc8e5

File tree (4 files changed: +136 -1 lines changed)

- doc/source/whatsnew/v0.19.0.txt
- pandas/io/parsers.py
- pandas/io/tests/parser/common.py
- pandas/src/parser/tokenizer.c

doc/source/whatsnew/v0.19.0.txt (+1)
@@ -797,6 +797,7 @@ Bug Fixes

 - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
 - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
+- Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`)
 - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
 - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
 - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
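
For context, a minimal check of the fixed behavior (my illustration, not part of the commit; assumes this patched build). Prior to the fix, the BOM was folded into the first column name, so the header parsed as '\ufeffa' rather than 'a':

from io import BytesIO

import pandas as pd

# UTF-8 CSV payload with a leading byte order mark
data = (u'\ufeff' + u'a\n1').encode('utf-8')

df = pd.read_csv(BytesIO(data), encoding='utf-8')
print(df.columns.tolist())  # ['a'], not ['\ufeffa']
print(df['a'].tolist())     # [1]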

pandas/io/parsers.py (+75 -1)
@@ -11,7 +11,8 @@
 import numpy as np

 from pandas import compat
-from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
+from pandas.compat import (range, lrange, StringIO, lzip,
+                           zip, string_types, map, u)
 from pandas.types.common import (is_integer, _ensure_object,
                                  is_list_like, is_integer_dtype,
                                  is_float,
@@ -40,6 +41,12 @@
     'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
 ])

+# BOM character (byte order mark)
+# This exists at the beginning of a file to indicate endianness
+# of a file (stream). Unfortunately, this marker screws up parsing,
+# so we need to remove it if we see it.
+_BOM = u('\ufeff')
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
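
As background for the new _BOM constant (standard Unicode facts, not from the patch): the single code point U+FEFF encodes differently under each scheme, which is how it can signal byte order; in UTF-8 there is no order to signal, so it is just a fixed three-byte signature:

import codecs

assert codecs.BOM_UTF8 == b'\xef\xbb\xbf'   # UTF-8: signature only
assert codecs.BOM_UTF16_LE == b'\xff\xfe'   # UTF-16 little-endian
assert codecs.BOM_UTF16_BE == b'\xfe\xff'   # UTF-16 big-endian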
@@ -2161,6 +2168,67 @@ def _buffered_line(self):
         else:
             return self._next_line()

+    def _check_for_bom(self, first_row):
+        """
+        Checks whether the file begins with the BOM character.
+        If it does, remove it. In addition, if there is quoting
+        in the field subsequent to the BOM, remove it as well
+        because it technically takes place at the beginning of
+        the name, not the middle of it.
+        """
+        # first_row will be a list, so we need to check
+        # that that list is not empty before proceeding.
+        if not first_row:
+            return first_row
+
+        # The first element of this row is the one that could have the
+        # BOM that we want to remove. Check that the first element is a
+        # string before proceeding.
+        if not isinstance(first_row[0], compat.string_types):
+            return first_row
+
+        # Check that the string is not empty, as that would
+        # obviously not have a BOM at the start of it.
+        if not first_row[0]:
+            return first_row
+
+        # Since the string is non-empty, check that it does
+        # in fact begin with a BOM.
+        first_elt = first_row[0][0]
+
+        # This is to avoid warnings we get in Python 2.x if
+        # we find ourselves comparing with non-Unicode
+        if compat.PY2 and not isinstance(first_elt, unicode):  # noqa
+            try:
+                first_elt = u(first_elt)
+            except UnicodeDecodeError:
+                return first_row
+
+        if first_elt != _BOM:
+            return first_row
+
+        first_row = first_row[0]
+
+        if len(first_row) > 1 and first_row[1] == self.quotechar:
+            start = 2
+            quote = first_row[1]
+            end = first_row[2:].index(quote) + 2
+
+            # Extract the data between the quotation marks
+            new_row = first_row[start:end]
+
+            # Extract any remaining data after the second
+            # quotation mark.
+            if len(first_row) > end + 1:
+                new_row += first_row[end + 1:]
+            return [new_row]
+        elif len(first_row) > 1:
+            return [first_row[1:]]
+        else:
+            # First row is just the BOM, so we
+            # return an empty string.
+            return [""]
+
     def _empty(self, line):
         return not line or all(not x for x in line)

@@ -2212,6 +2280,12 @@ def _next_line(self):
                         line = ret[0]
                         break

+        # This was the first line of the file,
+        # which could contain the BOM at the
+        # beginning of it.
+        if self.pos == 1:
+            line = self._check_for_bom(line)
+
         self.line_pos += 1
         self.buf.append(line)
         return line
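
To make the new _check_for_bom logic concrete, here is a hypothetical standalone sketch (my illustration, not code from the commit) of what it does to the first field: strip the BOM, and if the field right after it was quoted, re-trim the quotes that the BOM displaced from the start of the name:

def strip_bom(first_field, quotechar='"'):
    # Mirror of the method's logic on a single header field.
    BOM = u'\ufeff'
    if not first_field or first_field[0] != BOM:
        return first_field

    rest = first_field[1:]
    if len(rest) > 1 and rest[0] == quotechar:
        end = rest[1:].index(quotechar) + 1
        # data between the quotes, plus anything after the closing quote
        return rest[1:end] + rest[end + 1:]
    return rest

assert strip_bom(u'\ufeffa') == u'a'
assert strip_bom(u'\ufeff"a"') == u'a'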

pandas/io/tests/parser/common.py (+51)
@@ -1517,3 +1517,54 @@ def test_null_byte_char(self):
         msg = "NULL byte detected"
         with tm.assertRaisesRegexp(csv.Error, msg):
             self.read_csv(StringIO(data), names=cols)
+
+    def test_utf8_bom(self):
+        # see gh-4793
+        bom = u('\ufeff')
+        utf8 = 'utf-8'
+
+        def _encode_data_with_bom(_data):
+            bom_data = (bom + _data).encode(utf8)
+            return BytesIO(bom_data)
+
+        # basic test
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8)
+        tm.assert_frame_equal(out, expected)
+
+        # test with "regular" quoting
+        data = '"a"\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, quotechar='"')
+        tm.assert_frame_equal(out, expected)
+
+        # test in a data row instead of header
+        data = 'b\n1'
+        expected = DataFrame({'a': ['b', '1']})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'])
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row with skipping
+        data = '\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=True)
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row without skipping
+        data = '\n1'
+        expected = DataFrame({'a': [np.nan, 1.0]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=False)
+        tm.assert_frame_equal(out, expected)
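
A side note on the test helper (my observation, not from the commit): _encode_data_with_bom prepends the decoded BOM and then encodes; an equivalent formulation works directly in bytes via the standard library's codecs constants:

import codecs
from io import BytesIO

def _encode_data_with_bom_bytes(data):
    # codecs.BOM_UTF8 == b'\xef\xbb\xbf', the UTF-8 encoding of u'\ufeff'
    return BytesIO(codecs.BOM_UTF8 + data.encode('utf-8'))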

pandas/src/parser/tokenizer.c (+9)
@@ -704,6 +704,11 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     self->datapos = i; \
     TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen));

+#define CHECK_FOR_BOM() \
+    if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
+        buf += 3; \
+        self->datapos += 3; \
+    }

 int skip_this_line(parser_t *self, int64_t rownum) {
     if (self->skipset != NULL) {

@@ -736,6 +741,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

     TRACE(("%s\n", buf));

+    if (self->file_lines == 0) {
+        CHECK_FOR_BOM();
+    }
+
     for (i = self->datapos; i < self->datalen; ++i)
     {
         // next character in file
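
So the C engine skips the raw UTF-8 byte signature on the first chunk only (file_lines == 0), while the Python engine compares against the decoded code point. A quick cross-check plus a usage sketch (my illustration, assuming this patched build; engine='c' is the read_csv default):

from io import BytesIO

import pandas as pd

# The three bytes CHECK_FOR_BOM() matches are exactly the UTF-8
# encoding of the U+FEFF code point behind the Python engine's _BOM.
assert u'\ufeff'.encode('utf-8') == b'\xef\xbb\xbf'

raw = b'\xef\xbb\xbfa\n1'   # no encoding given: the bytes hit the C tokenizer
df = pd.read_csv(BytesIO(raw))
print(df.columns.tolist())  # ['a']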
