CLN: Unify number recognition tests for all parsers (pandas-dev#38954)

phofl · luckyvs1 · commit bd42bc65227d · 2021-01-19T23:18:37.000-08:00
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2349,12 +2349,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
 
         decimal = re.escape(self.decimal)
         if self.thousands is None:
-            regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
+            regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
         else:
             thousands = re.escape(self.thousands)
             regex = (
-                fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
-                fr"([0-9](E|e)\-?[0-9]*)?$"
+                fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
+                fr"([0-9]?(E|e)\-?[0-9]+)?$"
             )
         self.num = re.compile(regex)
 
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
@@ -148,3 +148,58 @@ def encoding_fmt(request):
     Fixture for all possible string formats of a UTF encoding.
     """
     return request.param
+
+
+@pytest.fixture(
+    params=[
+        ("-1,0", -1.0),
+        ("-1,2e0", -1.2),
+        ("-1e0", -1.0),
+        ("+1e0", 1.0),
+        ("+1e+0", 1.0),
+        ("+1e-1", 0.1),
+        ("+,1e1", 1.0),
+        ("+1,e0", 1.0),
+        ("-,1e1", -1.0),
+        ("-1,e0", -1.0),
+        ("0,1", 0.1),
+        ("1,", 1.0),
+        (",1", 0.1),
+        ("-,1", -0.1),
+        ("1_,", 1.0),
+        ("1_234,56", 1234.56),
+        ("1_234,56e0", 1234.56),
+        # negative cases; must not parse as float
+        ("_", "_"),
+        ("-_", "-_"),
+        ("-_1", "-_1"),
+        ("-_1e0", "-_1e0"),
+        ("_1", "_1"),
+        ("_1,", "_1,"),
+        ("_1,_", "_1,_"),
+        ("_1e0", "_1e0"),
+        ("1,2e_1", "1,2e_1"),
+        ("1,2e1_0", "1,2e1_0"),
+        ("1,_2", "1,_2"),
+        (",1__2", ",1__2"),
+        (",1e", ",1e"),
+        ("-,1e", "-,1e"),
+        ("1_000,000_000", "1_000,000_000"),
+        ("1,e1_2", "1,e1_2"),
+        ("e11,2", "e11,2"),
+        ("1e11,2", "1e11,2"),
+        ("1,2,2", "1,2,2"),
+        ("1,2_1", "1,2_1"),
+        ("1,2e-10e1", "1,2e-10e1"),
+        ("--1,2", "--1,2"),
+        ("1a_2,1", "1a_2,1"),
+        ("1,2E-1", 0.12),  # positive again (this and next): upper-case exponent
+        ("1,2E1", 12.0),
+    ]
+)
+def numeric_decimal(request):
+    """
+    Fixture yielding (value to read, expected result) pairs. A float result means
+    the value must parse as a number; a str result means it must stay unparsed.
+    """
+    return request.param
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -181,3 +181,35 @@ def test_delimiter_with_usecols_and_parse_dates(all_parsers):
         {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("thousands", ["_", None])
+def test_decimal_and_exponential(python_parser_only, numeric_decimal, thousands):
+    """GH#31920: run the shared number check with float_precision left as None."""
+    decimal_number_check(python_parser_only, numeric_decimal, thousands, None)
+
+
+@pytest.mark.parametrize("thousands", ["_", None])
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
+def test_1000_sep_decimal_float_precision(
+    c_parser_only, numeric_decimal, float_precision, thousands
+):
+    """Check decimal and thousands sep handling across 'float_precision' parsers."""
+    parser = c_parser_only
+    decimal_number_check(parser, numeric_decimal, thousands, float_precision)
+
+
+def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
+    """GH#31920: read the fixture value and compare it with the expected result."""
+    value, expected = numeric_decimal
+    if thousands is None and "_" in value:
+        pytest.skip("Skip test if no thousands sep is defined and sep is in value")
+    df = parser.read_csv(
+        StringIO(value),
+        sep="|",
+        thousands=thousands,
+        decimal=",",
+        header=None,
+        float_precision=float_precision,  # forward it; it was silently ignored
+    )
+    assert df.iloc[0, 0] == expected
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -653,64 +653,6 @@ def test_1000_sep_with_decimal(
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
-@pytest.mark.parametrize(
-    "value,expected",
-    [
-        ("-1,0", -1.0),
-        ("-1,2e0", -1.2),
-        ("-1e0", -1.0),
-        ("+1e0", 1.0),
-        ("+1e+0", 1.0),
-        ("+1e-1", 0.1),
-        ("+,1e1", 1.0),
-        ("+1,e0", 1.0),
-        ("-,1e1", -1.0),
-        ("-1,e0", -1.0),
-        ("0,1", 0.1),
-        ("1,", 1.0),
-        (",1", 0.1),
-        ("-,1", -0.1),
-        ("1_,", 1.0),
-        ("1_234,56", 1234.56),
-        ("1_234,56e0", 1234.56),
-        # negative cases; must not parse as float
-        ("_", "_"),
-        ("-_", "-_"),
-        ("-_1", "-_1"),
-        ("-_1e0", "-_1e0"),
-        ("_1", "_1"),
-        ("_1,", "_1,"),
-        ("_1,_", "_1,_"),
-        ("_1e0", "_1e0"),
-        ("1,2e_1", "1,2e_1"),
-        ("1,2e1_0", "1,2e1_0"),
-        ("1,_2", "1,_2"),
-        (",1__2", ",1__2"),
-        (",1e", ",1e"),
-        ("-,1e", "-,1e"),
-        ("1_000,000_000", "1_000,000_000"),
-        ("1,e1_2", "1,e1_2"),
-    ],
-)
-def test_1000_sep_decimal_float_precision(
-    c_parser_only, value, expected, float_precision
-):
-    # test decimal and thousand sep handling in across 'float_precision'
-    # parsers
-    parser = c_parser_only
-    df = parser.read_csv(
-        StringIO(value),
-        sep="|",
-        thousands="_",
-        decimal=",",
-        header=None,
-        float_precision=float_precision,
-    )
-    val = df.iloc[0, 0]
-    assert val == expected
-
-
 def test_float_precision_options(c_parser_only):
     # GH 17154, 36228
     parser = c_parser_only
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -305,49 +305,3 @@ def test_malformed_skipfooter(python_parser_only):
     msg = "Expected 3 fields in line 4, saw 5"
     with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
-
-
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value, result_value",
-    [
-        ("1,2", 1.2),
-        ("1,2e-1", 0.12),
-        ("1,2E-1", 0.12),
-        ("1,2e-10", 0.0000000012),
-        ("1,2e1", 12.0),
-        ("1,2E1", 12.0),
-        ("-1,2e-1", -0.12),
-        ("0,2", 0.2),
-        (",2", 0.2),
-    ],
-)
-def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
-    # GH#31920
-    data = StringIO(
-        f"""a	b
-    1,1	{value}
-    """
-    )
-    result = python_parser_only.read_csv(
-        data, "\t", decimal=",", engine="python", thousands=thousands
-    )
-    expected = DataFrame({"a": [1.1], "b": [result_value]})
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value",
-    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
-)
-def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
-    # GH#31920
-    data = StringIO(
-        f"""a	b
-    1,1	{value}
-    """
-    )
-    result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands)
-    expected = DataFrame({"a": [1.1], "b": [value]})
-    tm.assert_frame_equal(result, expected)