From 55962055e41bfd217a61cf90ddfdec3a801a5067 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 02:59:21 +0100
Subject: [PATCH 1/4] Unify tests for thousands recognition

---
 pandas/io/parsers.py                          |  6 +-
 pandas/tests/io/parser/common/test_decimal.py | 18 ++++++
 pandas/tests/io/parser/conftest.py            | 45 +++++++++++++++
 pandas/tests/io/parser/test_c_parser_only.py  | 45 +--------------
 .../io/parser/test_python_parser_only.py      | 56 +++++--------------
 5 files changed, 83 insertions(+), 87 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 3058d1eed22b9..ffe0c779b9c7f 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -2349,12 +2349,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
 
         decimal = re.escape(self.decimal)
         if self.thousands is None:
-            regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
+            regex = fr"^[\-|\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
         else:
             thousands = re.escape(self.thousands)
             regex = (
-                fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
-                fr"([0-9](E|e)\-?[0-9]*)?$"
+                fr"^[\-|\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
+                fr"([0-9]?(E|e)\-?[0-9]+)?$"
             )
         self.num = re.compile(regex)
 
diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py
index 7ca9f253bd501..621d1a2f193c9 100644
--- a/pandas/tests/io/parser/common/test_decimal.py
+++ b/pandas/tests/io/parser/common/test_decimal.py
@@ -58,3 +58,21 @@ def test_euro_decimal_format(all_parsers):
         columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("thousands", [None, "."])
+@pytest.mark.parametrize(
+    "value",
+    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
+)
+def test_decimal_and_exponential_erroneous(all_parsers, thousands, value):
+    # GH#31920
+    data = StringIO(
+        f"""a	b
+    1,1	{value}
+    """
+    )
+    parser = all_parsers
+    result = parser.read_csv(data, "\t", decimal=",", thousands=thousands)
+    expected = DataFrame({"a": [1.1], "b": [value]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index ec098353960d7..72726b166cc9a 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -148,3 +148,48 @@ def encoding_fmt(request):
     Fixture for all possible string formats of a UTF encoding.
     """
     return request.param
+
+
+@pytest.fixture(params=[
+        ("-1,0", -1.0),
+        ("-1,2e0", -1.2),
+        ("-1e0", -1.0),
+        ("+1e0", 1.0),
+        ("+1e+0", 1.0),
+        ("+1e-1", 0.1),
+        ("+,1e1", 1.0),
+        ("+1,e0", 1.0),
+        ("-,1e1", -1.0),
+        ("-1,e0", -1.0),
+        ("0,1", 0.1),
+        ("1,", 1.0),
+        (",1", 0.1),
+        ("-,1", -0.1),
+        ("1_,", 1.0),
+        ("1_234,56", 1234.56),
+        ("1_234,56e0", 1234.56),
+        # negative cases; must not parse as float
+        ("_", "_"),
+        ("-_", "-_"),
+        ("-_1", "-_1"),
+        ("-_1e0", "-_1e0"),
+        ("_1", "_1"),
+        ("_1,", "_1,"),
+        ("_1,_", "_1,_"),
+        ("_1e0", "_1e0"),
+        ("1,2e_1", "1,2e_1"),
+        ("1,2e1_0", "1,2e1_0"),
+        ("1,_2", "1,_2"),
+        (",1__2", ",1__2"),
+        (",1e", ",1e"),
+        ("-,1e", "-,1e"),
+        ("1_000,000_000", "1_000,000_000"),
+        ("1,e1_2", "1,e1_2"),
+    ])
+def numeric_decimal_thousands(request):
+    """
+    Fixture for all numeric formats which should get recognized
+    """
+    return request.param
+
+
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 15e7569ea9014..a4533240422fb 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -654,53 +654,14 @@ def test_1000_sep_with_decimal(
 
 
 @pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
-@pytest.mark.parametrize(
-    "value,expected",
-    [
-        ("-1,0", -1.0),
-        ("-1,2e0", -1.2),
-        ("-1e0", -1.0),
-        ("+1e0", 1.0),
-        ("+1e+0", 1.0),
-        ("+1e-1", 0.1),
-        ("+,1e1", 1.0),
-        ("+1,e0", 1.0),
-        ("-,1e1", -1.0),
-        ("-1,e0", -1.0),
-        ("0,1", 0.1),
-        ("1,", 1.0),
-        (",1", 0.1),
-        ("-,1", -0.1),
-        ("1_,", 1.0),
-        ("1_234,56", 1234.56),
-        ("1_234,56e0", 1234.56),
-        # negative cases; must not parse as float
-        ("_", "_"),
-        ("-_", "-_"),
-        ("-_1", "-_1"),
-        ("-_1e0", "-_1e0"),
-        ("_1", "_1"),
-        ("_1,", "_1,"),
-        ("_1,_", "_1,_"),
-        ("_1e0", "_1e0"),
-        ("1,2e_1", "1,2e_1"),
-        ("1,2e1_0", "1,2e1_0"),
-        ("1,_2", "1,_2"),
-        (",1__2", ",1__2"),
-        (",1e", ",1e"),
-        ("-,1e", "-,1e"),
-        ("1_000,000_000", "1_000,000_000"),
-        ("1,e1_2", "1,e1_2"),
-    ],
-)
 def test_1000_sep_decimal_float_precision(
-    c_parser_only, value, expected, float_precision
+    c_parser_only, numeric_decimal_thousands, float_precision
 ):
     # test decimal and thousand sep handling in across 'float_precision'
     # parsers
     parser = c_parser_only
     df = parser.read_csv(
-        StringIO(value),
+        StringIO(numeric_decimal_thousands[0]),
         sep="|",
         thousands="_",
         decimal=",",
@@ -708,7 +669,7 @@ def test_1000_sep_decimal_float_precision(
         float_precision=float_precision,
     )
     val = df.iloc[0, 0]
-    assert val == expected
+    assert val == numeric_decimal_thousands[1]
 
 
 def test_float_precision_options(c_parser_only):
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 04d5413abfafc..f65ca583dfdfb 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -307,47 +307,19 @@ def test_malformed_skipfooter(python_parser_only):
         parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
 
 
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value, result_value",
-    [
-        ("1,2", 1.2),
-        ("1,2e-1", 0.12),
-        ("1,2E-1", 0.12),
-        ("1,2e-10", 0.0000000012),
-        ("1,2e1", 12.0),
-        ("1,2E1", 12.0),
-        ("-1,2e-1", -0.12),
-        ("0,2", 0.2),
-        (",2", 0.2),
-    ],
-)
-def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
-    # GH#31920
-    data = StringIO(
-        f"""a	b
-    1,1	{value}
-    """
-    )
-    result = python_parser_only.read_csv(
-        data, "\t", decimal=",", engine="python", thousands=thousands
-    )
-    expected = DataFrame({"a": [1.1], "b": [result_value]})
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value",
-    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
-)
-def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
+@pytest.mark.parametrize("thousands", ["_", None])
+def test_decimal_and_exponential(python_parser_only, numeric_decimal_thousands, thousands):
     # GH#31920
-    data = StringIO(
-        f"""a	b
-    1,1	{value}
-    """
+    parser = python_parser_only
+    value = numeric_decimal_thousands[0]
+    if thousands is None and "_" in value:
+        pytest.skip("Skip test if no thousands sep is defined and sep is in value")
+    df = parser.read_csv(
+        StringIO(value),
+        sep="|",
+        thousands=thousands,
+        decimal=",",
+        header=None,
     )
-    result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands)
-    expected = DataFrame({"a": [1.1], "b": [value]})
-    tm.assert_frame_equal(result, expected)
+    val = df.iloc[0, 0]
+    assert val == numeric_decimal_thousands[1]

From 10900c8fadaa66eb93f7f6b25e7954263bd02d95 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 03:07:33 +0100
Subject: [PATCH 2/4] Add testcases

---
 pandas/tests/io/parser/common/test_decimal.py  | 18 ------------------
 pandas/tests/io/parser/conftest.py             | 17 +++++++++++++----
 .../tests/io/parser/test_python_parser_only.py |  4 +++-
 3 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py
index 621d1a2f193c9..7ca9f253bd501 100644
--- a/pandas/tests/io/parser/common/test_decimal.py
+++ b/pandas/tests/io/parser/common/test_decimal.py
@@ -58,21 +58,3 @@ def test_euro_decimal_format(all_parsers):
         columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
     )
     tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value",
-    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
-)
-def test_decimal_and_exponential_erroneous(all_parsers, thousands, value):
-    # GH#31920
-    data = StringIO(
-        f"""a	b
-    1,1	{value}
-    """
-    )
-    parser = all_parsers
-    result = parser.read_csv(data, "\t", decimal=",", thousands=thousands)
-    expected = DataFrame({"a": [1.1], "b": [value]})
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index 72726b166cc9a..1ad092dfb1a5c 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -150,7 +150,8 @@ def encoding_fmt(request):
     return request.param
 
 
-@pytest.fixture(params=[
+@pytest.fixture(
+    params=[
         ("-1,0", -1.0),
         ("-1,2e0", -1.2),
         ("-1e0", -1.0),
@@ -185,11 +186,19 @@ def encoding_fmt(request):
         ("-,1e", "-,1e"),
         ("1_000,000_000", "1_000,000_000"),
         ("1,e1_2", "1,e1_2"),
-    ])
+        ("e11,2", "e11,2"),
+        ("1e11,2", "1e11,2"),
+        ("1,2,2", "1,2,2"),
+        ("1,2_1", "1,2_1"),
+        ("1,2e-10e1", "1,2e-10e1"),
+        ("--1,2", "--1,2"),
+        ("1a_2,1", "1a_2,1"),
+        ("1,2E-1", 0.12),
+        ("1,2E1", 12.0),
+    ]
+)
 def numeric_decimal_thousands(request):
     """
     Fixture for all numeric formats which should get recognized
     """
     return request.param
-
-
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index f65ca583dfdfb..484c0b2408c2e 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -308,7 +308,9 @@ def test_malformed_skipfooter(python_parser_only):
 
 
 @pytest.mark.parametrize("thousands", ["_", None])
-def test_decimal_and_exponential(python_parser_only, numeric_decimal_thousands, thousands):
+def test_decimal_and_exponential(
+    python_parser_only, numeric_decimal_thousands, thousands
+):
     # GH#31920
     parser = python_parser_only
     value = numeric_decimal_thousands[0]

From 9593cd5a3ea28456c1c1f902883f307ccb358576 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 17:58:36 +0100
Subject: [PATCH 3/4] Refactor test organization

---
 pandas/tests/io/parser/conftest.py            |  2 +-
 .../io/parser/dtypes/test_dtypes_basic.py     | 36 +++++++++++++++++++
 pandas/tests/io/parser/test_c_parser_only.py  | 19 -----------
 .../io/parser/test_python_parser_only.py      | 20 ------------
 4 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index 1ad092dfb1a5c..dc94d7fc9f975 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -197,7 +197,7 @@ def encoding_fmt(request):
         ("1,2E1", 12.0),
     ]
 )
-def numeric_decimal_thousands(request):
+def numeric_decimal(request):
     """
     Fixture for all numeric formats which should get recognized
     """
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index fc34d65fdad52..ec1ccf009b8de 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -181,3 +181,39 @@ def test_delimiter_with_usecols_and_parse_dates(all_parsers):
         {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("thousands", ["_", None])
+def test_decimal_and_exponential(python_parser_only, numeric_decimal, thousands):
+    # GH#31920
+    decimal_number_check(python_parser_only, numeric_decimal, thousands, None)
+
+
+@pytest.mark.parametrize("thousands", ["_", None])
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
+def test_1000_sep_decimal_float_precision(
+    c_parser_only, numeric_decimal, float_precision, thousands
+):
+    # test decimal and thousands separator handling across the
+    # 'float_precision' parsers
+    decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision)
+
+
+def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
+    # GH#31920
+    # Read numeric_decimal[0] as a single-cell CSV and compare the parsed cell
+    # with numeric_decimal[1].
+    value = numeric_decimal[0]
+    if thousands is None and "_" in value:
+        pytest.skip("Skip test if no thousands sep is defined and sep is in value")
+    df = parser.read_csv(
+        StringIO(value),
+        # pass float_precision through so every converter is really exercised
+        float_precision=float_precision,
+        sep="|",
+        thousands=thousands,
+        decimal=",",
+        header=None,
+    )
+    val = df.iloc[0, 0]
+    assert val == numeric_decimal[1]
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index a4533240422fb..da778093237b0 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -653,25 +653,6 @@ def test_1000_sep_with_decimal(
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
-def test_1000_sep_decimal_float_precision(
-    c_parser_only, numeric_decimal_thousands, float_precision
-):
-    # test decimal and thousand sep handling in across 'float_precision'
-    # parsers
-    parser = c_parser_only
-    df = parser.read_csv(
-        StringIO(numeric_decimal_thousands[0]),
-        sep="|",
-        thousands="_",
-        decimal=",",
-        header=None,
-        float_precision=float_precision,
-    )
-    val = df.iloc[0, 0]
-    assert val == numeric_decimal_thousands[1]
-
-
 def test_float_precision_options(c_parser_only):
     # GH 17154, 36228
     parser = c_parser_only
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 484c0b2408c2e..d55a6361fc8d2 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -305,23 +305,3 @@ def test_malformed_skipfooter(python_parser_only):
     msg = "Expected 3 fields in line 4, saw 5"
     with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
-
-
-@pytest.mark.parametrize("thousands", ["_", None])
-def test_decimal_and_exponential(
-    python_parser_only, numeric_decimal_thousands, thousands
-):
-    # GH#31920
-    parser = python_parser_only
-    value = numeric_decimal_thousands[0]
-    if thousands is None and "_" in value:
-        pytest.skip("Skip test if no thousands sep is defined and sep is in value")
-    df = parser.read_csv(
-        StringIO(value),
-        sep="|",
-        thousands=thousands,
-        decimal=",",
-        header=None,
-    )
-    val = df.iloc[0, 0]
-    assert val == numeric_decimal_thousands[1]

From 92dff4d1b63f7b3a3217e2d712f9be51ccc70769 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 22:01:25 +0100
Subject: [PATCH 4/4] Add comment and remove pipe

---
 pandas/io/parsers.py               | 4 ++--
 pandas/tests/io/parser/conftest.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index ffe0c779b9c7f..6e9cc18358153 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -2349,11 +2349,11 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
 
         decimal = re.escape(self.decimal)
         if self.thousands is None:
-            regex = fr"^[\-|\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
+            regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
         else:
             thousands = re.escape(self.thousands)
             regex = (
-                fr"^[\-|\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
+                fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
                 fr"([0-9]?(E|e)\-?[0-9]+)?$"
             )
         self.num = re.compile(regex)
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index dc94d7fc9f975..321678c36943a 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -199,6 +199,7 @@ def encoding_fmt(request):
 )
 def numeric_decimal(request):
     """
-    Fixture for all numeric formats which should get recognized
+    Fixture pairing raw text with its expected parse result: the first entry is
+    the value to read, the second a float or, if invalid, the unchanged string.
     """
     return request.param