diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85da250648c28..0a0e94221befe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -259,6 +259,7 @@ I/O ^^^ - Bug in :meth:`Index.__repr__` when ``display.max_seq_items=1`` (:issue:`38415`) +- Bug in :func:`read_csv` not recognizing scientific notation if the ``decimal`` parameter is set for ``engine="python"`` (:issue:`31920`) - Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`) - Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 68c0bbf0787e6..57ec42f2f48ba 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2344,10 +2344,16 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): if len(self.decimal) != 1: raise ValueError("Only length-1 decimal markers supported") + decimal = re.escape(self.decimal) if self.thousands is None: - self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+") + regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" else: - self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+") + thousands = re.escape(self.thousands) + regex = ( + fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" 
+ fr"([0-9](E|e)\-?[0-9]*)?$" + ) + self.num = re.compile(regex) def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -3039,7 +3045,7 @@ def _search_replace_num_columns(self, lines, search, replace): not isinstance(x, str) or search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) - or self.nonnum.search(x.strip()) + or not self.num.search(x.strip()) ): rl.append(x) else: diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index d55a6361fc8d2..04d5413abfafc 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -305,3 +305,49 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +@pytest.mark.parametrize("thousands", [None, "."]) +@pytest.mark.parametrize( + "value, result_value", + [ + ("1,2", 1.2), + ("1,2e-1", 0.12), + ("1,2E-1", 0.12), + ("1,2e-10", 0.0000000012), + ("1,2e1", 12.0), + ("1,2E1", 12.0), + ("-1,2e-1", -0.12), + ("0,2", 0.2), + (",2", 0.2), + ], +) +def test_decimal_and_exponential(python_parser_only, thousands, value, result_value): + # GH#31920 + data = StringIO( + f"""a b + 1,1 {value} + """ + ) + result = python_parser_only.read_csv( + data, "\t", decimal=",", engine="python", thousands=thousands + ) + expected = DataFrame({"a": [1.1], "b": [result_value]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("thousands", [None, "."]) +@pytest.mark.parametrize( + "value", + ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"], +) +def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value): + # GH#31920 + data = StringIO( + f"""a b + 1,1 {value} + """ + ) + result = python_parser_only.read_csv(data, "\t", 
decimal=",", thousands=thousands) + expected = DataFrame({"a": [1.1], "b": [value]}) + tm.assert_frame_equal(result, expected)