From 77b9cdc16e11da6479402fcc473e243036f62121 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Mon, 14 Sep 2020 13:38:08 -0400 Subject: [PATCH 1/5] add failing test --- pandas/tests/io/parser/test_common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1d8d5a29686a4..55dd3b5d806a9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2127,6 +2127,11 @@ def test_first_row_bom(all_parsers): expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) + # see gh-36343 + data = '''\ufeffHead1 Head2 Head3''' + + result = parser.read_csv(StringIO(data), delimiter="\t") + tm.assert_frame_equal(result, expected) def test_integer_precision(all_parsers): # Gh 7072 From 3ab5760b9d992c0d64f3c0c8981d0d52246cc7d0 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Mon, 14 Sep 2020 13:41:57 -0400 Subject: [PATCH 2/5] bugfix + black --- pandas/io/parsers.py | 2 +- pandas/tests/io/parser/test_common.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2780b1a7f86c9..7e76360db98ef 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2876,7 +2876,7 @@ def _check_for_bom(self, first_row): return [new_row] + first_row[1:] elif len(first_row_bom) > 1: - return [first_row_bom[1:]] + return [first_row_bom[1:]] + first_row[1:] else: # First row is just the BOM, so we # return an empty string. diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 55dd3b5d806a9..67c6566d1c6b9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2128,11 +2128,12 @@ def test_first_row_bom(all_parsers): tm.assert_frame_equal(result, expected) # see gh-36343 - data = '''\ufeffHead1 Head2 Head3''' + data = """\ufeffHead1 Head2 Head3""" result = parser.read_csv(StringIO(data), delimiter="\t") tm.assert_frame_equal(result, expected) + def test_integer_precision(all_parsers): # Gh 7072 s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 From 66bccddaaea45b4f326eb2d4bfcfc1dadca0c786 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Fri, 18 Sep 2020 09:26:37 -0400 Subject: [PATCH 3/5] slight refactor --- pandas/io/parsers.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 90d37d416d501..bc622ab8c1f18 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2886,14 +2886,12 @@ def _check_for_bom(self, first_row): # quotation mark. if len(first_row_bom) > end + 1: new_row += first_row_bom[end + 1 :] - return [new_row] + first_row[1:] - elif len(first_row_bom) > 1: - return [first_row_bom[1:]] + first_row[1:] else: - # First row is just the BOM, so we - # return an empty string. - return [""] + + # No quotation so just remove BOM from first element + new_row = first_row_bom[1:] + return [new_row] + first_row[1:] def _is_line_empty(self, line): """ From 72c9b2c8e88718ed2b6d2c9b24758783c0791509 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Fri, 18 Sep 2020 09:27:00 -0400 Subject: [PATCH 4/5] add whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6923b42d3340b..6534172a41384 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -318,6 +318,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- Bug in :meth:`read_csv` with `engine='python'` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) Plotting ^^^^^^^^ From 891e8d415b17bc2624f14b75c94ff8bfb90eb5b5 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 19 Sep 2020 03:54:58 -0400 Subject: [PATCH 5/5] make separate test case --- pandas/tests/io/parser/test_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 67c6566d1c6b9..7f53b14ab77a4 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2127,10 +2127,14 @@ def test_first_row_bom(all_parsers): expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) + +def test_first_row_bom_unquoted(all_parsers): # see gh-36343 + parser = all_parsers data = """\ufeffHead1 Head2 Head3""" result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected)