Skip to content

#26545 Fix: same .tsv file, get different data-frame structure using engine 'python' and 'c' #26634

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 12, 2019
21 changes: 11 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2755,23 +2755,24 @@ def _check_for_bom(self, first_row):
if first_elt != _BOM:
return first_row

first_row = first_row[0]
first_row_bom = first_row[0]

if len(first_row) > 1 and first_row[1] == self.quotechar:
if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
start = 2
quote = first_row[1]
end = first_row[2:].index(quote) + 2
quote = first_row_bom[1]
end = first_row_bom[2:].index(quote) + 2

# Extract the data between the quotation marks
new_row = first_row[start:end]
new_row = first_row_bom[start:end]

# Extract any remaining data after the second
# quotation mark.
if len(first_row) > end + 1:
new_row += first_row[end + 1:]
return [new_row]
elif len(first_row) > 1:
return [first_row[1:]]
if len(first_row_bom) > end + 1:
new_row += first_row_bom[end + 1:]
return [new_row] + first_row[1:]

elif len(first_row_bom) > 1:
return [first_row_bom[1:]]
else:
# First row is just the BOM, so we
# return an empty string.
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/io/parser/data/bom_first_line.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"Project ID" "Project Name" "Product Name" "Content Type" "Creation Date" "Translator" "Reviewer" "Lanugage" "File Name" "Segment Sequence" "Word Count" "TM Score" "Match Type" "AT Type" "MT Engine" "MT Configuration Name" "PE Distance" "Length After Automatic Translation" "Change in %" "Source" "After Automatic Translation" "Target"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000017" "1" "100.0" "Exact" "TM" "" "" "0" "2" "0.00" "Other" "其他" "其他"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000027" "2" "50.0" "No Match" "TM" "" "" "0" "4" "0.00" "Login expiration" "登录到期" "登录到期"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000037" "6" "0.0" "No Match" "TM" "" "" "0" "11" "0.00" "Shows notification for login expiration events" "显示登录到期事件的通知" "显示登录到期事件的通知"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000047" "3" "0.0" "No Match" "TM" "" "" "0" "6" "0.00" "Corporate account removal" "公司帐户移除" "公司帐户移除"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000057" "5" "0.0" "No Match" "TM" "" "" "0" "8" "0.00" "Shows notification for account removal" "显示帐户移除通知" "显示帐户移除通知"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000067" "3" "66.0" "No Match" "TM" "" "" "0" "6" "0.00" "Enterprise App Wipe" "企业应用擦除" "企业应用擦除"
"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000077" "5" "23.0" "No Match" "TM" "" "" "0" "10" "0.00" "Application data is being cleared" "正在清除应用程序数据" "正在清除应用程序数据"
ion for account removal" "显示帐户移除通知" "显示帐户移除通知"
11 changes: 11 additions & 0 deletions pandas/tests/io/parser/test_check_for_bom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pytest

from pandas import read_csv


def test_first_row_bom():
    """Check that both parser configurations give the same frame shape.

    Reads the tab-separated fixture ``data/bom_first_line.txt`` (quoted
    fields; judging by its name, the first line starts with a BOM) once
    with ``engine='python'`` and once with ``read_csv``'s default engine,
    and asserts that each result has 8 rows and 22 columns.
    """
    # Function-scope import so the block brings its own dependency with it.
    import os

    # Resolve the fixture relative to this test module rather than the
    # current working directory; a bare relative path only works when
    # pytest is invoked from the directory that contains ``data/``.
    here = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(here, 'data', 'bom_first_line.txt')

    # First the explicit python engine, then no ``engine`` argument at all
    # so that read_csv picks its default engine.
    for engine_kwargs in ({'engine': 'python'}, {}):
        result = read_csv(path, delimiter='\t', **engine_kwargs)
        assert result.shape == (8, 22)