From 0c8a223dbec43f3e8e437e6362902a1729e3ccde Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Mon, 3 Jun 2019 16:12:06 +0300 Subject: [PATCH 1/9] fix: same .tsv file, get different data-frame structure using engine 'python' and 'c' --- pandas/io/parsers.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c65c11e840c27..a4e99dd758653 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2755,23 +2755,24 @@ def _check_for_bom(self, first_row): if first_elt != _BOM: return first_row - first_row = first_row[0] + first_row_bom = first_row[0] - if len(first_row) > 1 and first_row[1] == self.quotechar: + if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: start = 2 - quote = first_row[1] - end = first_row[2:].index(quote) + 2 + quote = first_row_bom[1] + end = first_row_bom[2:].index(quote) + 2 # Extract the data between the quotation marks - new_row = first_row[start:end] + new_row = first_row_bom[start:end] # Extract any remaining data after the second # quotation mark. - if len(first_row) > end + 1: - new_row += first_row[end + 1:] - return [new_row] - elif len(first_row) > 1: - return [first_row[1:]] + if len(first_row_bom) > end + 1: + new_row += first_row_bom[end + 1:] + return [new_row] + first_row[1:] + + elif len(first_row_bom) > 1: + return [first_row_bom[1:]] else: # First row is just the BOM, so we # return an empty string. 
From 0df0744e2914efd403f6f3e4bd668b5ccc9841b3 Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Mon, 3 Jun 2019 17:43:00 +0300 Subject: [PATCH 2/9] #26545 fix: same .tsv file, get different data-frame structure using engine 'python' and 'c' --- pandas/tests/io/parser/data/bom_first_line.txt | 9 +++++++++ pandas/tests/io/parser/test_check_for_bom.py | 8 ++++++++ 2 files changed, 17 insertions(+) create mode 100644 pandas/tests/io/parser/data/bom_first_line.txt create mode 100644 pandas/tests/io/parser/test_check_for_bom.py diff --git a/pandas/tests/io/parser/data/bom_first_line.txt b/pandas/tests/io/parser/data/bom_first_line.txt new file mode 100644 index 0000000000000..745ac2e2bc03a --- /dev/null +++ b/pandas/tests/io/parser/data/bom_first_line.txt @@ -0,0 +1,9 @@ +"Project ID" "Project Name" "Product Name" "Content Type" "Creation Date" "Translator" "Reviewer" "Lanugage" "File Name" "Segment Sequence" "Word Count" "TM Score" "Match Type" "AT Type" "MT Engine" "MT Configuration Name" "PE Distance" "Length After Automatic Translation" "Change in %" "Source" "After Automatic Translation" "Target" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000017" "1" "100.0" "Exact" "TM" "" "" "0" "2" "0.00" "Other" "其他" "其他" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000027" "2" "50.0" "No Match" "TM" "" "" "0" "4" "0.00" "Login expiration" "登录到期" "登录到期" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000037" "6" "0.0" "No Match" "TM" "" "" "0" "11" "0.00" "Shows notification for login expiration events" "显示登录到期事件的通知" "显示登录到期事件的通知" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000047" "3" "0.0" "No Match" "TM" "" "" "0" "6" "0.00" "Corporate account removal" 
"公司帐户移除" "公司帐户移除" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000057" "5" "0.0" "No Match" "TM" "" "" "0" "8" "0.00" "Shows notification for account removal" "显示帐户移除通知" "显示帐户移除通知" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000067" "3" "66.0" "No Match" "TM" "" "" "0" "6" "0.00" "Enterprise App Wipe" "企业应用擦除" "企业应用擦除" +"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000077" "5" "23.0" "No Match" "TM" "" "" "0" "10" "0.00" "Application data is being cleared" "正在清除应用程序数据" "正在清除应用程序数据" +ion for account removal" "显示帐户移除通知" "显示帐户移除通知" diff --git a/pandas/tests/io/parser/test_check_for_bom.py b/pandas/tests/io/parser/test_check_for_bom.py new file mode 100644 index 0000000000000..d9cd41a360d5c --- /dev/null +++ b/pandas/tests/io/parser/test_check_for_bom.py @@ -0,0 +1,8 @@ +import pytest + +from pandas import read_csv + + +def test_first_row_bom(): + assert read_csv('data/bom_first_line.txt', delimiter='\t', engine='python').shape == (8, 22) + assert read_csv('data/bom_first_line.txt', delimiter='\t').shape == (8, 22) From 11214de93740aa0ce1b2a1847788fac48a203f31 Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Mon, 3 Jun 2019 17:50:28 +0300 Subject: [PATCH 3/9] #26545 fix: same .tsv file, get different data-frame structure using engine 'python' and 'c' --- pandas/tests/io/parser/test_check_for_bom.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_check_for_bom.py b/pandas/tests/io/parser/test_check_for_bom.py index d9cd41a360d5c..d742e55ad6713 100644 --- a/pandas/tests/io/parser/test_check_for_bom.py +++ b/pandas/tests/io/parser/test_check_for_bom.py @@ -4,5 +4,8 @@ def test_first_row_bom(): - assert read_csv('data/bom_first_line.txt', delimiter='\t', engine='python').shape == 
(8, 22) - assert read_csv('data/bom_first_line.txt', delimiter='\t').shape == (8, 22) + assert read_csv('data/bom_first_line.txt', + delimiter='\t', + engine='python').shape == (8, 22) + assert read_csv('data/bom_first_line.txt', + delimiter='\t').shape == (8, 22) From 0816572d5ca6a4fe4116fdc6d1cce74d7f5e5207 Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Mon, 3 Jun 2019 18:47:51 +0300 Subject: [PATCH 4/9] test-fix --- pandas/tests/io/parser/test_check_for_bom.py | 23 ++++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/test_check_for_bom.py b/pandas/tests/io/parser/test_check_for_bom.py index d742e55ad6713..1c68ec2bbb99e 100644 --- a/pandas/tests/io/parser/test_check_for_bom.py +++ b/pandas/tests/io/parser/test_check_for_bom.py @@ -1,11 +1,20 @@ +from pandas import read_csv +import os import pytest -from pandas import read_csv +class TestFirstRowBom: + + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') + self.file_path = os.path.join(self.dirpath, 'bom_first_line.txt') + + def test_first_row_bom_python(self): + assert read_csv(self.file_path, + delimiter='\t', + engine='python').shape == (8, 22) -def test_first_row_bom(): - assert read_csv('data/bom_first_line.txt', - delimiter='\t', - engine='python').shape == (8, 22) - assert read_csv('data/bom_first_line.txt', - delimiter='\t').shape == (8, 22) + def test_first_row_bom_c(self): + assert read_csv(self.file_path, + delimiter='\t').shape == (8, 22) From 45cacb0fc04f82cb4860174ba2b4a58dee0beb32 Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Mon, 3 Jun 2019 19:57:50 +0300 Subject: [PATCH 5/9] Correction of comments in the review --- .../tests/io/parser/data/bom_first_line.txt | 9 --------- pandas/tests/io/parser/test_check_for_bom.py | 20 ------------------- pandas/tests/io/parser/test_common.py | 14 +++++++++++++ 3 files changed, 14 insertions(+), 29 deletions(-) delete mode 100644 
pandas/tests/io/parser/data/bom_first_line.txt delete mode 100644 pandas/tests/io/parser/test_check_for_bom.py diff --git a/pandas/tests/io/parser/data/bom_first_line.txt b/pandas/tests/io/parser/data/bom_first_line.txt deleted file mode 100644 index 745ac2e2bc03a..0000000000000 --- a/pandas/tests/io/parser/data/bom_first_line.txt +++ /dev/null @@ -1,9 +0,0 @@ -"Project ID" "Project Name" "Product Name" "Content Type" "Creation Date" "Translator" "Reviewer" "Lanugage" "File Name" "Segment Sequence" "Word Count" "TM Score" "Match Type" "AT Type" "MT Engine" "MT Configuration Name" "PE Distance" "Length After Automatic Translation" "Change in %" "Source" "After Automatic Translation" "Target" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000017" "1" "100.0" "Exact" "TM" "" "" "0" "2" "0.00" "Other" "其他" "其他" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000027" "2" "50.0" "No Match" "TM" "" "" "0" "4" "0.00" "Login expiration" "登录到期" "登录到期" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000037" "6" "0.0" "No Match" "TM" "" "" "0" "11" "0.00" "Shows notification for login expiration events" "显示登录到期事件的通知" "显示登录到期事件的通知" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000047" "3" "0.0" "No Match" "TM" "" "" "0" "6" "0.00" "Corporate account removal" "公司帐户移除" "公司帐户移除" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000057" "5" "0.0" "No Match" "TM" "" "" "0" "8" "0.00" "Shows notification for account removal" "显示帐户移除通知" "显示帐户移除通知" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000067" "3" "66.0" "No Match" 
"TM" "" "" "0" "6" "0.00" "Enterprise App Wipe" "企业应用擦除" "企业应用擦除" -"74872" "SDK Android - 18.10.2 - UI" "WS1 SDK for Android" "UI" "2018-12-10" "SDL" "Pactera" "zh_CN" "Drop2_en_0_1.vgr" "0000000077" "5" "23.0" "No Match" "TM" "" "" "0" "10" "0.00" "Application data is being cleared" "正在清除应用程序数据" "正在清除应用程序数据" -ion for account removal" "显示帐户移除通知" "显示帐户移除通知" diff --git a/pandas/tests/io/parser/test_check_for_bom.py b/pandas/tests/io/parser/test_check_for_bom.py deleted file mode 100644 index 1c68ec2bbb99e..0000000000000 --- a/pandas/tests/io/parser/test_check_for_bom.py +++ /dev/null @@ -1,20 +0,0 @@ -from pandas import read_csv -import os -import pytest - - -class TestFirstRowBom: - - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.dirpath = datapath('io', 'parser', 'data') - self.file_path = os.path.join(self.dirpath, 'bom_first_line.txt') - - def test_first_row_bom_python(self): - assert read_csv(self.file_path, - delimiter='\t', - engine='python').shape == (8, 22) - - def test_first_row_bom_c(self): - assert read_csv(self.file_path, - delimiter='\t').shape == (8, 22) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index bb5f7e683d98b..da774e60b13e8 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1927,3 +1927,17 @@ def test_read_table_deprecated(all_parsers): check_stacklevel=False): result = parser.read_table(StringIO(data)) tm.assert_frame_equal(result, expected) + + +def test_first_row_bom_python(all_parsers): + parser = all_parsers + data = """ + \ufeff"Head1" "Head2" "Head3" + """ + + assert parser.read_csv(StringIO(data), + delimiter='\t', + engine='python').shape == (0, 3) + + assert parser.read_csv(StringIO(data), + delimiter='\t').shape == (0, 3) From d5c593cd2d5d3d1e235ccb59e2540854d5a4030c Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Tue, 4 Jun 2019 13:02:25 +0300 Subject: [PATCH 6/9] Corrected the tests, according to a comment in 
the review --- pandas/tests/io/parser/test_common.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index da774e60b13e8..28ea90f005f3f 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1929,15 +1929,11 @@ def test_read_table_deprecated(all_parsers): tm.assert_frame_equal(result, expected) -def test_first_row_bom_python(all_parsers): +def test_first_row_bom(all_parsers): + # see gh-26545 parser = all_parsers - data = """ - \ufeff"Head1" "Head2" "Head3" - """ - - assert parser.read_csv(StringIO(data), - delimiter='\t', - engine='python').shape == (0, 3) + data = '''\ufeff"Head1" "Head2" "Head3"''' - assert parser.read_csv(StringIO(data), - delimiter='\t').shape == (0, 3) + result = parser.read_csv(StringIO(data), delimiter='\t') + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) From 1fd50eb97322da6cd32aee61fde9ba26b26f0964 Mon Sep 17 00:00:00 2001 From: Belavin Denis Date: Mon, 10 Jun 2019 16:42:55 +0300 Subject: [PATCH 7/9] add whatnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2c66d3e4db321..6f3446f7e5379 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -492,7 +492,7 @@ I/O - Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`) - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) - +- Fixed bug in :func:`pandas.read_csv` when using engine='python', columns were handled incorrectly if the first header had in the bom. 
(:issue:`26545`) Plotting ^^^^^^^^ From 946f9f02ec868d965884b1104aa27f1ff1be6e19 Mon Sep 17 00:00:00 2001 From: Denis Belavin <41421345+LuckyDenis@users.noreply.github.com> Date: Mon, 10 Jun 2019 19:33:47 +0300 Subject: [PATCH 8/9] Update doc/source/whatsnew/v0.25.0.rst Co-Authored-By: William Ayd --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b4d347cd943da..3c756c03f6b37 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -664,7 +664,7 @@ I/O - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) - :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`) -- Fixed bug in :func:`pandas.read_csv` - when using engine='python', columns were handled incorrectly if the first header had in the bom (:issue:`26545`) +- Fixed bug in :func:`pandas.read_csv` - where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) Plotting ^^^^^^^^ From 4cf74f945082093773df28f94f45497cad39c7c1 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 10 Jun 2019 13:10:46 -0400 Subject: [PATCH 9/9] Update v0.25.0.rst --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3c756c03f6b37..2778ca6e49618 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -664,7 +664,7 @@ I/O - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed 
(:issue:`25990`) - :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`) -- Fixed bug in :func:`pandas.read_csv` - where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) +- Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using ``engine='python'`` (:issue:`26545`) Plotting ^^^^^^^^