Skip to content

Commit a137a9c

Browse files
luckydenis authored and jreback committed
#26545 Fix: same .tsv file, get different data-frame structure using engine 'python' and 'c' (#26634)
1 parent e6d27ec commit a137a9c

File tree

3 files changed

+22
-10
lines changed

3 files changed

+22
-10
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,7 @@ I/O
667667
- Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`)
668668
- Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows caching of unique dates when they are parsed (:issue:`25990`)
669669
- :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`)
670+
- Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`)
670671
- :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`)
671672
- Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`).
672673

pandas/io/parsers.py

+11-10
Original file line numberDiff line numberDiff line change
@@ -2755,23 +2755,24 @@ def _check_for_bom(self, first_row):
27552755
if first_elt != _BOM:
27562756
return first_row
27572757

2758-
first_row = first_row[0]
2758+
first_row_bom = first_row[0]
27592759

2760-
if len(first_row) > 1 and first_row[1] == self.quotechar:
2760+
if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
27612761
start = 2
2762-
quote = first_row[1]
2763-
end = first_row[2:].index(quote) + 2
2762+
quote = first_row_bom[1]
2763+
end = first_row_bom[2:].index(quote) + 2
27642764

27652765
# Extract the data between the quotation marks
2766-
new_row = first_row[start:end]
2766+
new_row = first_row_bom[start:end]
27672767

27682768
# Extract any remaining data after the second
27692769
# quotation mark.
2770-
if len(first_row) > end + 1:
2771-
new_row += first_row[end + 1:]
2772-
return [new_row]
2773-
elif len(first_row) > 1:
2774-
return [first_row[1:]]
2770+
if len(first_row_bom) > end + 1:
2771+
new_row += first_row_bom[end + 1:]
2772+
return [new_row] + first_row[1:]
2773+
2774+
elif len(first_row_bom) > 1:
2775+
return [first_row_bom[1:]]
27752776
else:
27762777
# First row is just the BOM, so we
27772778
# return an empty string.

pandas/tests/io/parser/test_common.py

+10
Original file line numberDiff line numberDiff line change
@@ -1927,3 +1927,13 @@ def test_read_table_deprecated(all_parsers):
19271927
check_stacklevel=False):
19281928
result = parser.read_table(StringIO(data))
19291929
tm.assert_frame_equal(result, expected)
1930+
1931+
1932+
def test_first_row_bom(all_parsers):
1933+
# see gh-26545
1934+
parser = all_parsers
1935+
data = '''\ufeff"Head1" "Head2" "Head3"'''
1936+
1937+
result = parser.read_csv(StringIO(data), delimiter='\t')
1938+
expected = DataFrame(columns=["Head1", "Head2", "Head3"])
1939+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)