From f49d007d9d093926a25d4e27aa3e29bb2c020bda Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Dec 2020 01:44:32 +0100 Subject: [PATCH 01/10] BUG: read_csv not recognizing numbers appropriately when decimal is set --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/io/parsers.py | 11 +++- .../io/parser/test_python_parser_only.py | 55 +++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..1db88fec0a326 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -146,7 +146,7 @@ MultiIndex I/O ^^^ -- +- Bug in :func:`read_csv` not recognizing scientific notation if decimal is set (:issue:`31920`) - Period diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5b623c360c3ef..4f49bb34c2aa0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2346,9 +2346,14 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): raise ValueError("Only length-1 decimal markers supported") if self.thousands is None: - self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+") + regex = fr"^\-?[0-9]+({self.decimal}[0-9]*)?((E|e)\-?[0-9]*)?$" + self.nonnum = re.compile(regex) else: - self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+") + thousands = self.thousands + if "." == thousands: + thousands = fr"\{thousands}" + regex = fr"^\-?[0-9]+({thousands}|[0-9])*({self.decimal}[0-9]*)?((E|e)\-?[0-9]*)?$" + self.nonnum = re.compile(regex) def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -3044,7 +3049,7 @@ def _search_replace_num_columns(self, lines, search, replace): not isinstance(x, str) or search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) - or self.nonnum.search(x.strip()) + or not self.nonnum.search(x.strip()) ): rl.append(x) else: diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 016fae4f4a6f5..29fbf0ece7d26 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -314,3 +314,58 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +@pytest.mark.parametrize("thousands", [None, "."]) +@pytest.mark.parametrize( + "value, result_value", + [ + ("1,2", 1.2), + ("1,2e-1", 0.12), + ("1,2E-1", 0.12), + ("1,2e-10", 0.0000000012), + ("1,2e1", 12.0), + ("1,2E1", 12.0), + ("-1,2e-1", -0.12), + ("0,2", 0.2), + ], +) +def test_decimal_and_exponential(python_parser_only, thousands, value, result_value): + # GH#31920 + data = StringIO( + f"""a b + 1,1 {value} + """ + ) + result = python_parser_only.read_csv( + data, "\t", decimal=",", engine="python", thousands=thousands + ) + expected = DataFrame({"a": [1.1], "b": [result_value]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("thousands", [None, "."]) +@pytest.mark.parametrize( + "value", + [ + "e11,2", + "1e11,2", + "1,2,2", + "1,2.1", + "1,2e-10e1", + "--1,2", + "1a.2,1", + ], +) +def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value): + # GH#31920 + data = StringIO( + f"""a b + 1,1 {value} + """ + ) + result = python_parser_only.read_csv( + data, "\t", decimal=",", engine="python", thousands=thousands + ) + expected = DataFrame({"a": [1.1], "b": [value]}) + tm.assert_frame_equal(result, expected) From cc7dd1b909f7b7026a5a1a411c2995b3d26059fe Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Dec 2020 01:50:35 +0100 Subject: [PATCH 02/10] Add corner case --- pandas/io/parsers.py | 4 ++-- pandas/tests/io/parser/test_python_parser_only.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4f49bb34c2aa0..f8d6cd31b97c9 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2346,13 +2346,13 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): raise ValueError("Only length-1 decimal markers supported") if self.thousands is None: - regex = fr"^\-?[0-9]+({self.decimal}[0-9]*)?((E|e)\-?[0-9]*)?$" + regex = fr"^\-?[0-9]*({self.decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" self.nonnum = re.compile(regex) else: thousands = self.thousands if "." == thousands: thousands = fr"\{thousands}" - regex = fr"^\-?[0-9]+({thousands}|[0-9])*({self.decimal}[0-9]*)?((E|e)\-?[0-9]*)?$" + regex = fr"^\-?([0-9]|{thousands})*({self.decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" self.nonnum = re.compile(regex) def _set_no_thousands_columns(self): diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 29fbf0ece7d26..c6aed0f34ed96 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -328,6 +328,7 @@ def test_malformed_skipfooter(python_parser_only): ("1,2E1", 12.0), ("-1,2e-1", -0.12), ("0,2", 0.2), + (",2", 0.2), ], ) def test_decimal_and_exponential(python_parser_only, thousands, value, result_value): From 189ca80fc0e146ac17efef27b4c40e0f51a6074e Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Dec 2020 01:53:17 +0100 Subject: [PATCH 03/10] Add another corner case --- pandas/io/parsers.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8d6cd31b97c9..18ab98a41f708 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2352,7 +2352,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): thousands = self.thousands if "." == thousands: thousands = fr"\{thousands}" - regex = fr"^\-?([0-9]|{thousands})*({self.decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" + regex = fr"^\-?([0-9]+{thousands}|[0-9])*({self.decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" self.nonnum = re.compile(regex) def _set_no_thousands_columns(self): diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index c6aed0f34ed96..ce3f64f1ba8b0 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -356,6 +356,7 @@ def test_decimal_and_exponential(python_parser_only, thousands, value, result_va "1,2e-10e1", "--1,2", "1a.2,1", + "1..2,3" ], ) def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value): From 567423f7f539191caeb65a035ce274a1d149d66f Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Dec 2020 01:54:40 +0100 Subject: [PATCH 04/10] Run black --- pandas/tests/io/parser/test_python_parser_only.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index ce3f64f1ba8b0..a15a0c6292be3 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -348,16 +348,7 @@ def test_decimal_and_exponential(python_parser_only, thousands, value, result_va @pytest.mark.parametrize("thousands", [None, "."]) @pytest.mark.parametrize( "value", - [ - "e11,2", - "1e11,2", - "1,2,2", - "1,2.1", - "1,2e-10e1", - "--1,2", - "1a.2,1", - "1..2,3" - ], + ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"], ) def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value): # GH#31920 From ba163cfd0247663a7c6df0e7e71a04ae219f49fb Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Dec 2020 01:55:21 +0100 Subject: [PATCH 05/10] Fix pep8 issue --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 18ab98a41f708..993f30403155f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2352,7 +2352,8 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): thousands = self.thousands if "." == thousands: thousands = fr"\{thousands}" - regex = fr"^\-?([0-9]+{thousands}|[0-9])*({self.decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" + regex = fr"^\-?([0-9]+{thousands}|[0-9])*({self.decimal}[0-9]*)?"\ + fr"([0-9](E|e)\-?[0-9]*)?$" self.nonnum = re.compile(regex) def _set_no_thousands_columns(self): From a5e568be5cd07ca1b46c0e38986c9957e6924bbe Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Dec 2020 02:08:15 +0100 Subject: [PATCH 06/10] Fix pep8 --- pandas/io/parsers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 993f30403155f..84516398d586b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2352,8 +2352,10 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): thousands = self.thousands if "." == thousands: thousands = fr"\{thousands}" - regex = fr"^\-?([0-9]+{thousands}|[0-9])*({self.decimal}[0-9]*)?"\ - fr"([0-9](E|e)\-?[0-9]*)?$" + regex = ( + fr"^\-?([0-9]+{thousands}|[0-9])*({self.decimal}[0-9]*)?" + fr"([0-9](E|e)\-?[0-9]*)?$" + ) self.nonnum = re.compile(regex) def _set_no_thousands_columns(self): From 514f45ae4e9ef3c4faac0664f9257507977bf494 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 19 Dec 2020 19:40:45 +0100 Subject: [PATCH 07/10] Refactor code --- pandas/io/parsers.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 84516398d586b..51ae84454eb20 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2345,18 +2345,16 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): if len(self.decimal) != 1: raise ValueError("Only length-1 decimal markers supported") + decimal = re.escape(self.decimal) if self.thousands is None: - regex = fr"^\-?[0-9]*({self.decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" - self.nonnum = re.compile(regex) + regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" else: - thousands = self.thousands - if "." == thousands: - thousands = fr"\{thousands}" + thousands = re.escape(self.thousands) regex = ( - fr"^\-?([0-9]+{thousands}|[0-9])*({self.decimal}[0-9]*)?" + fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" fr"([0-9](E|e)\-?[0-9]*)?$" ) - self.nonnum = re.compile(regex) + self.num = re.compile(regex) def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -3052,7 +3050,7 @@ def _search_replace_num_columns(self, lines, search, replace): not isinstance(x, str) or search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) - or not self.nonnum.search(x.strip()) + or not self.num.search(x.strip()) ): rl.append(x) else: From a0eced563c644ba3f302448056ae570826997056 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Tue, 29 Dec 2020 21:58:21 +0100 Subject: [PATCH 08/10] Update pandas/tests/io/parser/test_python_parser_only.py Co-authored-by: gfyoung --- pandas/tests/io/parser/test_python_parser_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a15a0c6292be3..4ca681d253d34 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -358,7 +358,7 @@ def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value) """ ) result = python_parser_only.read_csv( - data, "\t", decimal=",", engine="python", thousands=thousands + data, "\t", decimal=",", thousands=thousands ) expected = DataFrame({"a": [1.1], "b": [value]}) tm.assert_frame_equal(result, expected) From a611dad916f82af289232610697f6711e0e84a5d Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 29 Dec 2020 22:07:19 +0100 Subject: [PATCH 09/10] Fix black bug from autocommit --- pandas/tests/io/parser/test_python_parser_only.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index bab3b5996f132..04d5413abfafc 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -348,8 +348,6 @@ def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value) 1,1 {value} """ ) - result = python_parser_only.read_csv( - data, "\t", decimal=",", thousands=thousands - ) + result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands) expected = DataFrame({"a": [1.1], "b": [value]}) tm.assert_frame_equal(result, expected) From 9ec9954d41ffc65e06db3572d656857e9d4e89f7 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 1 Jan 2021 23:36:24 +0100 Subject: [PATCH 10/10] Modify whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a80b18f977378..c8d917a67f37b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -255,7 +255,7 @@ I/O ^^^ - Bug in :meth:`Index.__repr__` when ``display.max_seq_items=1`` (:issue:`38415`) -- Bug in :func:`read_csv` not recognizing scientific notation if decimal is set (:issue:`31920`) +- Bug in :func:`read_csv` not recognizing scientific notation if decimal is set for ``engine="python"`` (:issue:`31920`) - Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`) - Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`)