Commit 34bc8e5

BUG: Ignore the BOM in BOM UTF-8 CSV files
Closes pandas-dev gh-4793.
1 parent 2beab41 commit 34bc8e5

File tree (4 files changed: +136 -1 lines changed)

- doc/source/whatsnew/v0.19.0.txt
- pandas/io/parsers.py
- pandas/io/tests/parser/common.py
- pandas/src/parser/tokenizer.c

doc/source/whatsnew/v0.19.0.txt (+1)
@@ -797,6 +797,7 @@ Bug Fixes

 - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
 - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
+- Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`)
 - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
 - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
 - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
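
For context, a minimal check of the fixed behavior (my illustration, not part of the commit; assumes this patched build). Prior to the fix, the BOM was folded into the first column name, so the header parsed as '\ufeffa' rather than 'a':

from io import BytesIO

import pandas as pd

# UTF-8 CSV payload with a leading byte order mark
data = (u'\ufeff' + u'a\n1').encode('utf-8')

df = pd.read_csv(BytesIO(data), encoding='utf-8')
print(df.columns.tolist())  # ['a'], not ['\ufeffa']
print(df['a'].tolist())     # [1]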

pandas/io/parsers.py (+75 -1)
@@ -11,7 +11,8 @@
 import numpy as np

 from pandas import compat
-from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
+from pandas.compat import (range, lrange, StringIO, lzip,
+                           zip, string_types, map, u)
 from pandas.types.common import (is_integer, _ensure_object,
                                  is_list_like, is_integer_dtype,
                                  is_float,
@@ -40,6 +41,12 @@
     'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
 ])

+# BOM character (byte order mark)
+# This exists at the beginning of a file to indicate endianness
+# of a file (stream). Unfortunately, this marker screws up parsing,
+# so we need to remove it if we see it.
+_BOM = u('\ufeff')
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
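
As background for the new _BOM constant (standard Unicode facts, not from the patch): the single code point U+FEFF encodes differently under each scheme, which is how it can signal byte order; in UTF-8 there is no order to signal, so it is just a fixed three-byte signature:

import codecs

assert codecs.BOM_UTF8 == b'\xef\xbb\xbf'   # UTF-8: signature only
assert codecs.BOM_UTF16_LE == b'\xff\xfe'   # UTF-16 little-endian
assert codecs.BOM_UTF16_BE == b'\xfe\xff'   # UTF-16 big-endian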
@@ -2161,6 +2168,67 @@ def _buffered_line(self):
         else:
             return self._next_line()

+    def _check_for_bom(self, first_row):
+        """
+        Checks whether the file begins with the BOM character.
+        If it does, remove it. In addition, if there is quoting
+        in the field subsequent to the BOM, remove it as well
+        because it technically takes place at the beginning of
+        the name, not the middle of it.
+        """
+        # first_row will be a list, so we need to check
+        # that that list is not empty before proceeding.
+        if not first_row:
+            return first_row
+
+        # The first element of this row is the one that could have the
+        # BOM that we want to remove. Check that the first element is a
+        # string before proceeding.
+        if not isinstance(first_row[0], compat.string_types):
+            return first_row
+
+        # Check that the string is not empty, as that would
+        # obviously not have a BOM at the start of it.
+        if not first_row[0]:
+            return first_row
+
+        # Since the string is non-empty, check that it does
+        # in fact begin with a BOM.
+        first_elt = first_row[0][0]
+
+        # This is to avoid warnings we get in Python 2.x if
+        # we find ourselves comparing with non-Unicode
+        if compat.PY2 and not isinstance(first_elt, unicode):  # noqa
+            try:
+                first_elt = u(first_elt)
+            except UnicodeDecodeError:
+                return first_row
+
+        if first_elt != _BOM:
+            return first_row
+
+        first_row = first_row[0]
+
+        if len(first_row) > 1 and first_row[1] == self.quotechar:
+            start = 2
+            quote = first_row[1]
+            end = first_row[2:].index(quote) + 2
+
+            # Extract the data between the quotation marks
+            new_row = first_row[start:end]
+
+            # Extract any remaining data after the second
+            # quotation mark.
+            if len(first_row) > end + 1:
+                new_row += first_row[end + 1:]
+            return [new_row]
+        elif len(first_row) > 1:
+            return [first_row[1:]]
+        else:
+            # First row is just the BOM, so we
+            # return an empty string.
+            return [""]
+
     def _empty(self, line):
         return not line or all(not x for x in line)

@@ -2212,6 +2280,12 @@ def _next_line(self):
                         line = ret[0]
                         break

+        # This was the first line of the file,
+        # which could contain the BOM at the
+        # beginning of it.
+        if self.pos == 1:
+            line = self._check_for_bom(line)
+
         self.line_pos += 1
         self.buf.append(line)
         return line
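
To make the new _check_for_bom logic concrete, here is a hypothetical standalone sketch (my illustration, not code from the commit) of what it does to the first field: strip the BOM, and if the field right after it was quoted, re-trim the quotes that the BOM displaced from the start of the name:

def strip_bom(first_field, quotechar='"'):
    # Mirror of the method's logic on a single header field.
    BOM = u'\ufeff'
    if not first_field or first_field[0] != BOM:
        return first_field

    rest = first_field[1:]
    if len(rest) > 1 and rest[0] == quotechar:
        end = rest[1:].index(quotechar) + 1
        # data between the quotes, plus anything after the closing quote
        return rest[1:end] + rest[end + 1:]
    return rest

assert strip_bom(u'\ufeffa') == u'a'
assert strip_bom(u'\ufeff"a"') == u'a'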

pandas/io/tests/parser/common.py (+51)
@@ -1517,3 +1517,54 @@ def test_null_byte_char(self):
         msg = "NULL byte detected"
         with tm.assertRaisesRegexp(csv.Error, msg):
             self.read_csv(StringIO(data), names=cols)
+
+    def test_utf8_bom(self):
+        # see gh-4793
+        bom = u('\ufeff')
+        utf8 = 'utf-8'
+
+        def _encode_data_with_bom(_data):
+            bom_data = (bom + _data).encode(utf8)
+            return BytesIO(bom_data)
+
+        # basic test
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8)
+        tm.assert_frame_equal(out, expected)
+
+        # test with "regular" quoting
+        data = '"a"\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, quotechar='"')
+        tm.assert_frame_equal(out, expected)
+
+        # test in a data row instead of header
+        data = 'b\n1'
+        expected = DataFrame({'a': ['b', '1']})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'])
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row with skipping
+        data = '\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=True)
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row without skipping
+        data = '\n1'
+        expected = DataFrame({'a': [np.nan, 1.0]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=False)
+        tm.assert_frame_equal(out, expected)
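
A side note on the test helper (my observation, not from the commit): _encode_data_with_bom prepends the decoded BOM and then encodes; an equivalent formulation works directly in bytes via the standard library's codecs constants:

import codecs
from io import BytesIO

def _encode_data_with_bom_bytes(data):
    # codecs.BOM_UTF8 == b'\xef\xbb\xbf', the UTF-8 encoding of u'\ufeff'
    return BytesIO(codecs.BOM_UTF8 + data.encode('utf-8'))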

pandas/src/parser/tokenizer.c (+9)
@@ -704,6 +704,11 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     self->datapos = i; \
     TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen));

+#define CHECK_FOR_BOM() \
+    if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
+        buf += 3; \
+        self->datapos += 3; \
+    }

 int skip_this_line(parser_t *self, int64_t rownum) {
     if (self->skipset != NULL) {

@@ -736,6 +741,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

     TRACE(("%s\n", buf));

+    if (self->file_lines == 0) {
+        CHECK_FOR_BOM();
+    }
+
     for (i = self->datapos; i < self->datalen; ++i)
     {
         // next character in file
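
So the C engine skips the raw UTF-8 byte signature on the first chunk only (file_lines == 0), while the Python engine compares against the decoded code point. A quick cross-check plus a usage sketch (my illustration, assuming this patched build; engine='c' is the read_csv default):

from io import BytesIO

import pandas as pd

# The three bytes CHECK_FOR_BOM() matches are exactly the UTF-8
# encoding of the U+FEFF code point behind the Python engine's _BOM.
assert u'\ufeff'.encode('utf-8') == b'\xef\xbb\xbf'

raw = b'\xef\xbb\xbfa\n1'   # no encoding given: the bytes hit the C tokenizer
df = pd.read_csv(BytesIO(raw))
print(df.columns.tolist())  # ['a']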
