diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 3058d1eed22b9..6e9cc18358153 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -2349,12 +2349,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
 
         decimal = re.escape(self.decimal)
         if self.thousands is None:
-            regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
+            regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
         else:
             thousands = re.escape(self.thousands)
             regex = (
-                fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
-                fr"([0-9](E|e)\-?[0-9]*)?$"
+                fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
+                fr"([0-9]?(E|e)\-?[0-9]+)?$"
             )
         self.num = re.compile(regex)
 
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index ec098353960d7..321678c36943a 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -148,3 +148,58 @@ def encoding_fmt(request):
     Fixture for all possible string formats of a UTF encoding.
     """
     return request.param
+
+
+@pytest.fixture(
+    params=[
+        ("-1,0", -1.0),
+        ("-1,2e0", -1.2),
+        ("-1e0", -1.0),
+        ("+1e0", 1.0),
+        ("+1e+0", 1.0),
+        ("+1e-1", 0.1),
+        ("+,1e1", 1.0),
+        ("+1,e0", 1.0),
+        ("-,1e1", -1.0),
+        ("-1,e0", -1.0),
+        ("0,1", 0.1),
+        ("1,", 1.0),
+        (",1", 0.1),
+        ("-,1", -0.1),
+        ("1_,", 1.0),
+        ("1_234,56", 1234.56),
+        ("1_234,56e0", 1234.56),
+        # negative cases; must not parse as float
+        ("_", "_"),
+        ("-_", "-_"),
+        ("-_1", "-_1"),
+        ("-_1e0", "-_1e0"),
+        ("_1", "_1"),
+        ("_1,", "_1,"),
+        ("_1,_", "_1,_"),
+        ("_1e0", "_1e0"),
+        ("1,2e_1", "1,2e_1"),
+        ("1,2e1_0", "1,2e1_0"),
+        ("1,_2", "1,_2"),
+        (",1__2", ",1__2"),
+        (",1e", ",1e"),
+        ("-,1e", "-,1e"),
+        ("1_000,000_000", "1_000,000_000"),
+        ("1,e1_2", "1,e1_2"),
+        ("e11,2", "e11,2"),
+        ("1e11,2", "1e11,2"),
+        ("1,2,2", "1,2,2"),
+        ("1,2_1", "1,2_1"),
+        ("1,2e-10e1", "1,2e-10e1"),
+        ("--1,2", "--1,2"),
+        ("1a_2,1", "1a_2,1"),
+        ("1,2E-1", 0.12),
+        ("1,2E1", 12.0),
+    ]
+)
+def numeric_decimal(request):
+    """
+    Fixture for all numeric formats which should get recognized. The first entry
+    represents the value to read while the second represents the expected result.
+    """
+    return request.param
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index fc34d65fdad52..ec1ccf009b8de 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -181,3 +181,39 @@ def test_delimiter_with_usecols_and_parse_dates(all_parsers):
         {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("thousands", ["_", None])
+def test_decimal_and_exponential(python_parser_only, numeric_decimal, thousands):
+    # GH#31920
+    decimal_number_check(python_parser_only, numeric_decimal, thousands, None)
+
+
+@pytest.mark.parametrize("thousands", ["_", None])
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
+def test_1000_sep_decimal_float_precision(
+    c_parser_only, numeric_decimal, float_precision, thousands
+):
+    # test decimal and thousand sep handling across 'float_precision'
+    # parsers
+    decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision)
+
+
+def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
+    # GH#31920
+    # Shared check for the parametrized tests above: parse the raw text from the
+    # ``numeric_decimal`` fixture and compare it with the expected value.
+    value = numeric_decimal[0]
+    if thousands is None and "_" in value:
+        pytest.skip("Skip test if no thousands sep is defined and sep is in value")
+    df = parser.read_csv(
+        StringIO(value),
+        # forward the converter chosen by the calling test (None for the python one)
+        float_precision=float_precision,
+        sep="|",
+        thousands=thousands,
+        decimal=",",
+        header=None,
+    )
+    val = df.iloc[0, 0]
+    assert val == numeric_decimal[1]
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 15e7569ea9014..da778093237b0 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -653,64 +653,6 @@ def test_1000_sep_with_decimal(
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
-@pytest.mark.parametrize(
-    "value,expected",
-    [
-        ("-1,0", -1.0),
-        ("-1,2e0", -1.2),
-        ("-1e0", -1.0),
-        ("+1e0", 1.0),
-        ("+1e+0", 1.0),
-        ("+1e-1", 0.1),
-        ("+,1e1", 1.0),
-        ("+1,e0", 1.0),
-        ("-,1e1", -1.0),
-        ("-1,e0", -1.0),
-        ("0,1", 0.1),
-        ("1,", 1.0),
-        (",1", 0.1),
-        ("-,1", -0.1),
-        ("1_,", 1.0),
-        ("1_234,56", 1234.56),
-        ("1_234,56e0", 1234.56),
-        # negative cases; must not parse as float
-        ("_", "_"),
-        ("-_", "-_"),
-        ("-_1", "-_1"),
-        ("-_1e0", "-_1e0"),
-        ("_1", "_1"),
-        ("_1,", "_1,"),
-        ("_1,_", "_1,_"),
-        ("_1e0", "_1e0"),
-        ("1,2e_1", "1,2e_1"),
-        ("1,2e1_0", "1,2e1_0"),
-        ("1,_2", "1,_2"),
-        (",1__2", ",1__2"),
-        (",1e", ",1e"),
-        ("-,1e", "-,1e"),
-        ("1_000,000_000", "1_000,000_000"),
-        ("1,e1_2", "1,e1_2"),
-    ],
-)
-def test_1000_sep_decimal_float_precision(
-    c_parser_only, value, expected, float_precision
-):
-    # test decimal and thousand sep handling in across 'float_precision'
-    # parsers
-    parser = c_parser_only
-    df = parser.read_csv(
-        StringIO(value),
-        sep="|",
-        thousands="_",
-        decimal=",",
-        header=None,
-        float_precision=float_precision,
-    )
-    val = df.iloc[0, 0]
-    assert val == expected
-
-
 def test_float_precision_options(c_parser_only):
     # GH 17154, 36228
     parser = c_parser_only
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 04d5413abfafc..d55a6361fc8d2 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -305,49 +305,3 @@ def test_malformed_skipfooter(python_parser_only):
     msg = "Expected 3 fields in line 4, saw 5"
     with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
-
-
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value, result_value",
-    [
-        ("1,2", 1.2),
-        ("1,2e-1", 0.12),
-        ("1,2E-1", 0.12),
-        ("1,2e-10", 0.0000000012),
-        ("1,2e1", 12.0),
-        ("1,2E1", 12.0),
-        ("-1,2e-1", -0.12),
-        ("0,2", 0.2),
-        (",2", 0.2),
-    ],
-)
-def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
-    # GH#31920
-    data = StringIO(
-        f"""a	b
-        1,1	{value}
-        """
-    )
-    result = python_parser_only.read_csv(
-        data, "\t", decimal=",", engine="python", thousands=thousands
-    )
-    expected = DataFrame({"a": [1.1], "b": [result_value]})
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("thousands", [None, "."])
-@pytest.mark.parametrize(
-    "value",
-    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
-)
-def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
-    # GH#31920
-    data = StringIO(
-        f"""a	b
-        1,1	{value}
-        """
-    )
-    result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands)
-    expected = DataFrame({"a": [1.1], "b": [value]})
-    tm.assert_frame_equal(result, expected)