BUG: read_csv not recognizing numbers appropriately when decimal is set (pandas-dev#38420)

phofl · luckyvs1 · commit 12ee1dbeb29d · 2021-01-19T23:18:36.000-08:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -259,6 +259,7 @@ I/O
 ^^^
 
 - Bug in :meth:`Index.__repr__` when ``display.max_seq_items=1`` (:issue:`38415`)
+- Bug in :func:`read_csv` not recognizing scientific notation when ``decimal`` is set and ``engine="python"`` is used (:issue:`31920`)
 - Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`)
 - Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`)
 - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2344,10 +2344,16 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
         if len(self.decimal) != 1:
             raise ValueError("Only length-1 decimal markers supported")
 
+        decimal = re.escape(self.decimal)
         if self.thousands is None:
-            self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
+            regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
         else:
-            self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")
+            thousands = re.escape(self.thousands)
+            regex = (
+                fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
+                fr"([0-9](E|e)\-?[0-9]*)?$"
+            )
+        self.num = re.compile(regex)
 
     def _set_no_thousands_columns(self):
         # Create a set of column ids that are not to be stripped of thousands
@@ -3039,7 +3045,7 @@ def _search_replace_num_columns(self, lines, search, replace):
                     not isinstance(x, str)
                     or search not in x
                     or (self._no_thousands_columns and i in self._no_thousands_columns)
-                    or self.nonnum.search(x.strip())
+                    or not self.num.search(x.strip())
                 ):
                     rl.append(x)
                 else:
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -305,3 +305,49 @@ def test_malformed_skipfooter(python_parser_only):
     msg = "Expected 3 fields in line 4, saw 5"
     with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
+
+
+@pytest.mark.parametrize("thousands", [None, "."])
+@pytest.mark.parametrize(
+    "value, result_value",
+    [
+        ("1,2", 1.2),
+        ("1,2e-1", 0.12),
+        ("1,2E-1", 0.12),
+        ("1,2e-10", 0.0000000012),
+        ("1,2e1", 12.0),
+        ("1,2E1", 12.0),
+        ("-1,2e-1", -0.12),
+        ("0,2", 0.2),
+        (",2", 0.2),
+    ],
+)
+def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
+    # GH#31920
+    data = StringIO(
+        f"""a	b
+    1,1	{value}
+    """
+    )
+    result = python_parser_only.read_csv(
+        data, "\t", decimal=",", engine="python", thousands=thousands
+    )
+    expected = DataFrame({"a": [1.1], "b": [result_value]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("thousands", [None, "."])
+@pytest.mark.parametrize(
+    "value",
+    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
+)
+def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
+    # GH#31920
+    data = StringIO(
+        f"""a	b
+    1,1	{value}
+    """
+    )
+    result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands)
+    expected = DataFrame({"a": [1.1], "b": [value]})
+    tm.assert_frame_equal(result, expected)