|
11 | 11 | import numpy as np
|
12 | 12 |
|
13 | 13 | from pandas import compat
|
14 |
| -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map |
| 14 | +from pandas.compat import (range, lrange, StringIO, lzip, |
| 15 | + zip, string_types, map, u) |
15 | 16 | from pandas.types.common import (is_integer, _ensure_object,
|
16 | 17 | is_list_like, is_integer_dtype,
|
17 | 18 | is_float,
|
|
40 | 41 | 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
|
41 | 42 | ])
|
42 | 43 |
|
| 44 | +# BOM character (byte order mark) |
| 45 | +# This may appear at the beginning of a file to indicate the |
| 46 | +# endianness of the file (stream). Left in place, the marker would be |
| 47 | +# parsed as part of the first field, so we remove it if we see it. |
| 48 | +_BOM = u('\ufeff') |
| 49 | + |
43 | 50 | _parser_params = """Also supports optionally iterating or breaking of the file
|
44 | 51 | into chunks.
|
45 | 52 |
|
@@ -2161,6 +2168,67 @@ def _buffered_line(self):
|
2161 | 2168 | else:
|
2162 | 2169 | return self._next_line()
|
2163 | 2170 |
|
def _check_for_bom(self, first_row):
    """
    Strip a leading BOM (byte order mark) from the first row of a file.

    If the first field begins with the BOM character, the BOM is
    removed. If the BOM is immediately followed by ``self.quotechar``,
    that quote and its matching closing quote are removed as well,
    because the quoting technically takes place at the beginning of
    the field, not in the middle of it.

    Parameters
    ----------
    first_row : list
        The fields of the first row read from the file.

    Returns
    -------
    list
        ``first_row`` itself when there is no BOM to remove. Otherwise
        a new list whose first element has the BOM (and any enclosing
        quotes) stripped, followed by the remaining fields unchanged.
    """
    # An empty row has no first field to inspect.
    if not first_row:
        return first_row

    first_field = first_row[0]

    # Only a non-empty string can begin with a BOM.
    if not isinstance(first_field, compat.string_types) or not first_field:
        return first_row

    first_char = first_field[0]

    # This is to avoid warnings we get in Python 2.x if
    # we find ourselves comparing with non-Unicode. A character
    # that cannot be decoded cannot be the BOM either.
    if compat.PY2 and not isinstance(first_char, unicode):  # noqa
        try:
            first_char = u(first_char)
        except UnicodeDecodeError:
            return first_row

    if first_char != _BOM:
        return first_row

    if len(first_field) > 1 and first_field[1] == self.quotechar:
        quote = first_field[1]

        # Position of the closing quote, searching past the opening one.
        end = first_field.find(quote, 2)

        if end == -1:
            # No closing quote: there is no quoted span to unwrap,
            # so only the BOM is dropped.
            new_field = first_field[1:]
        else:
            # Keep the data between the quotation marks plus any
            # remaining data after the second quotation mark.
            new_field = first_field[2:end] + first_field[end + 1:]
    else:
        # No quoting, so just drop the BOM. When the field is only
        # the BOM this leaves an empty string.
        new_field = first_field[1:]

    # Only the first field is rewritten; the other fields of the
    # row must be carried over untouched.
    return [new_field] + list(first_row[1:])
| 2231 | + |
2164 | 2232 | def _empty(self, line):
|
2165 | 2233 | return not line or all(not x for x in line)
|
2166 | 2234 |
|
@@ -2212,6 +2280,12 @@ def _next_line(self):
|
2212 | 2280 | line = ret[0]
|
2213 | 2281 | break
|
2214 | 2282 |
|
| 2283 | + # This was the first line of the file, |
| 2284 | + # which could contain the BOM at the |
| 2285 | + # beginning of it. |
| 2286 | + if self.pos == 1: |
| 2287 | + line = self._check_for_bom(line) |
| 2288 | + |
2215 | 2289 | self.line_pos += 1
|
2216 | 2290 | self.buf.append(line)
|
2217 | 2291 | return line
|
|
0 commit comments