diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f61c8bfbd782e..7d95c0122aae0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -667,6 +667,7 @@ I/O - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) - :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`) +- Fixed bug in :func:`pandas.read_csv` where a BOM before a quoted first field caused the remaining fields of the first row to be dropped when using ``engine='python'`` (:issue:`26545`) - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bcbdd80865360..1058476495985 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2755,23 +2755,24 @@ def _check_for_bom(self, first_row): if first_elt != _BOM: return first_row - first_row = first_row[0] + first_row_bom = first_row[0] - if len(first_row) > 1 and first_row[1] == self.quotechar: + if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: start = 2 - quote = first_row[1] - end = first_row[2:].index(quote) + 2 + quote = first_row_bom[1] + end = first_row_bom[2:].index(quote) + 2 # Extract the data between the quotation marks - new_row = first_row[start:end] + new_row = first_row_bom[start:end] # Extract any remaining data after the second # quotation mark. 
- if len(first_row) > end + 1: - new_row += first_row[end + 1:] - return [new_row] - elif len(first_row) > 1: - return [first_row[1:]] + if len(first_row_bom) > end + 1: + new_row += first_row_bom[end + 1:] + return [new_row] + first_row[1:] + + elif len(first_row_bom) > 1: + return [first_row_bom[1:]] else: # First row is just the BOM, so we # return an empty string. diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index bb5f7e683d98b..28ea90f005f3f 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1927,3 +1927,13 @@ def test_read_table_deprecated(all_parsers): check_stacklevel=False): result = parser.read_table(StringIO(data)) tm.assert_frame_equal(result, expected) + + +def test_first_row_bom(all_parsers): + # see gh-26545 + parser = all_parsers + data = '''\ufeff"Head1" "Head2" "Head3"''' + + result = parser.read_csv(StringIO(data), delimiter='\t') + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected)