diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 59a106291dad8..04df27519983a 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -797,6 +797,7 @@ Bug Fixes
 - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
 - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
+- Bug in ``pd.read_csv()``, which caused files with a BOM to be parsed incorrectly because the BOM was not ignored (:issue:`4793`)
 - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
 - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
 - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8c615741679b5..7846ccd1a6660 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -11,7 +11,8 @@
 import numpy as np
 
 from pandas import compat
-from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
+from pandas.compat import (range, lrange, StringIO, lzip,
+                           zip, string_types, map, u)
 from pandas.types.common import (is_integer, _ensure_object,
                                  is_list_like, is_integer_dtype,
                                  is_float,
@@ -40,6 +41,12 @@
     'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
 ])
 
+# BOM character (byte order mark)
+# This exists at the beginning of a file to indicate the byte order
+# of the file (stream). Unfortunately, this marker interferes with
+# parsing, so we need to remove it if we see it.
+_BOM = u('\ufeff')
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
 
@@ -2161,6 +2168,67 @@ def _buffered_line(self):
         else:
             return self._next_line()
 
+    def _check_for_bom(self, first_row):
+        """
+        Checks whether the file begins with the BOM character.
+        If it does, remove it. In addition, if there is quoting
+        in the field subsequent to the BOM, remove it as well
+        because it technically occurs at the beginning of the
+        name rather than in the middle of it.
+        """
+        # first_row will be a list, so we need to check
+        # that the list is not empty before proceeding.
+        if not first_row:
+            return first_row
+
+        # The first element of this row is the one that could have the
+        # BOM that we want to remove. Check that the first element is a
+        # string before proceeding.
+        if not isinstance(first_row[0], compat.string_types):
+            return first_row
+
+        # Check that the string is not empty, as that would
+        # obviously not have a BOM at the start of it.
+        if not first_row[0]:
+            return first_row
+
+        # Since the string is non-empty, check that it does
+        # in fact begin with a BOM.
+        first_elt = first_row[0][0]
+
+        # This is to avoid the warnings we get in Python 2.x if
+        # we find ourselves comparing with non-Unicode strings.
+        if compat.PY2 and not isinstance(first_elt, unicode):  # noqa
+            try:
+                first_elt = u(first_elt)
+            except UnicodeDecodeError:
+                return first_row
+
+        if first_elt != _BOM:
+            return first_row
+
+        first_row = first_row[0]
+
+        if len(first_row) > 1 and first_row[1] == self.quotechar:
+            start = 2
+            quote = first_row[1]
+            end = first_row[2:].index(quote) + 2
+
+            # Extract the data between the quotation marks
+            new_row = first_row[start:end]
+
+            # Extract any remaining data after the second
+            # quotation mark.
+            if len(first_row) > end + 1:
+                new_row += first_row[end + 1:]
+            return [new_row]
+        elif len(first_row) > 1:
+            return [first_row[1:]]
+        else:
+            # First row is just the BOM, so we return
+            # a row containing just an empty string.
+            return [""]
+
     def _empty(self, line):
         return not line or all(not x for x in line)
 
@@ -2212,6 +2280,12 @@ def _next_line(self):
                     line = ret[0]
                     break
 
+        # This was the first line of the file, which
+        # could contain the BOM at its beginning, so
+        # check for it and remove it if found.
+        if self.pos == 1:
+            line = self._check_for_bom(line)
+
         self.line_pos += 1
         self.buf.append(line)
         return line
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 129e925e38d5b..7558e4bb63226 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1517,3 +1517,54 @@ def test_null_byte_char(self):
         msg = "NULL byte detected"
         with tm.assertRaisesRegexp(csv.Error, msg):
             self.read_csv(StringIO(data), names=cols)
+
+    def test_utf8_bom(self):
+        # see gh-4793
+        bom = u('\ufeff')
+        utf8 = 'utf-8'
+
+        def _encode_data_with_bom(_data):
+            bom_data = (bom + _data).encode(utf8)
+            return BytesIO(bom_data)
+
+        # basic test
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8)
+        tm.assert_frame_equal(out, expected)
+
+        # test with "regular" quoting
+        data = '"a"\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, quotechar='"')
+        tm.assert_frame_equal(out, expected)
+
+        # test in a data row instead of the header
+        data = 'b\n1'
+        expected = DataFrame({'a': ['b', '1']})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'])
+        tm.assert_frame_equal(out, expected)
+
+        # test in an empty data row with blank-line skipping
+        data = '\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=True)
+        tm.assert_frame_equal(out, expected)
+
+        # test in an empty data row without blank-line skipping
+        data = '\n1'
+        expected = DataFrame({'a': [np.nan, 1.0]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=False)
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index cc89fc51792dd..3c09933b3ec87 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -704,6 +704,12 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     self->datapos = i; \
     TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen));
 
+// Advance past the UTF-8 BOM (0xEF 0xBB 0xBF) if it starts the stream.
+#define CHECK_FOR_BOM() \
+    if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
+        buf += 3; \
+        self->datapos += 3; \
+    }
 
 int skip_this_line(parser_t *self, int64_t rownum) {
     if (self->skipset != NULL) {
@@ -736,6 +742,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
     TRACE(("%s\n", buf));
 
+    if (self->file_lines == 0) {
+        CHECK_FOR_BOM();
+    }
+
     for (i = self->datapos; i < self->datalen; ++i) {
         // next character in file
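
For context, here is a minimal sketch of the behavior this patch fixes, using only the public ``pd.read_csv()`` API (the sample bytes and expected column names below are illustrative assumptions drawn from :issue:`4793`, not part of the change itself):

    import pandas as pd
    from io import BytesIO

    # A UTF-8 encoded CSV whose first three bytes are the BOM (EF BB BF).
    data = b'\xef\xbb\xbfa,b\n1,2\n'

    # Before this fix, the BOM leaked into the first column name
    # (typically surfacing as u'\ufeffa' instead of 'a'); with the
    # fix, the header parses cleanly. The same applies to a quoted
    # header such as b'\xef\xbb\xbf"a"\n1\n'.
    df = pd.read_csv(BytesIO(data), encoding='utf-8')
    assert list(df.columns) == ['a', 'b']

Both engines are covered by the patch: the Python parser strips the BOM from the first tokenized row in ``_check_for_bom()``, while the C tokenizer skips the three raw BOM bytes via ``CHECK_FOR_BOM()`` before tokenizing the first buffer.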