Skip to content

Fix #26545: the same .tsv file yields different DataFrame structures with the 'python' and 'c' engines #26634

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 12, 2019
21 changes: 11 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2755,23 +2755,24 @@ def _check_for_bom(self, first_row):
if first_elt != _BOM:
return first_row

first_row = first_row[0]
first_row_bom = first_row[0]

if len(first_row) > 1 and first_row[1] == self.quotechar:
if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
start = 2
quote = first_row[1]
end = first_row[2:].index(quote) + 2
quote = first_row_bom[1]
end = first_row_bom[2:].index(quote) + 2

# Extract the data between the quotation marks
new_row = first_row[start:end]
new_row = first_row_bom[start:end]

# Extract any remaining data after the second
# quotation mark.
if len(first_row) > end + 1:
new_row += first_row[end + 1:]
return [new_row]
elif len(first_row) > 1:
return [first_row[1:]]
if len(first_row_bom) > end + 1:
new_row += first_row_bom[end + 1:]
return [new_row] + first_row[1:]

elif len(first_row_bom) > 1:
return [first_row_bom[1:]]
else:
# First row is just the BOM, so we
# return an empty string.
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1927,3 +1927,17 @@ def test_read_table_deprecated(all_parsers):
check_stacklevel=False):
result = parser.read_table(StringIO(data))
tm.assert_frame_equal(result, expected)


def test_first_row_bom_python(all_parsers):
# Regression test for gh-26545: a header row that begins with a BOM
# (\ufeff) followed by quoted fields is expected to produce three columns
# and zero data rows, both when engine='python' is requested and when no
# engine is given.
# NOTE(review): `all_parsers` is a fixture defined outside this view; it may
# select the engine itself -- confirm the explicit engine argument below is
# actually honoured.
parser = all_parsers
data = """
\ufeff"Head1" "Head2" "Head3"
"""

# Explicitly request the Python engine.
assert parser.read_csv(StringIO(data),
delimiter='\t',
engine='python').shape == (0, 3)

# Same input with no engine argument; the resulting shape must match.
assert parser.read_csv(StringIO(data),
delimiter='\t').shape == (0, 3)