From 55962055e41bfd217a61cf90ddfdec3a801a5067 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 02:59:21 +0100 Subject: [PATCH 1/4] Unify tests for thousands recognition --- pandas/io/parsers.py | 6 +- pandas/tests/io/parser/common/test_decimal.py | 18 ++++++ pandas/tests/io/parser/conftest.py | 45 +++++++++++++++ pandas/tests/io/parser/test_c_parser_only.py | 45 +-------------- .../io/parser/test_python_parser_only.py | 56 +++++-------------- 5 files changed, 83 insertions(+), 87 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3058d1eed22b9..ffe0c779b9c7f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2349,12 +2349,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): decimal = re.escape(self.decimal) if self.thousands is None: - regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$" + regex = fr"^[\-|\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" else: thousands = re.escape(self.thousands) regex = ( - fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" - fr"([0-9](E|e)\-?[0-9]*)?$" + fr"^[\-|\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" + fr"([0-9]?(E|e)\-?[0-9]+)?$" ) self.num = re.compile(regex) diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index 7ca9f253bd501..621d1a2f193c9 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -58,3 +58,21 @@ def test_euro_decimal_format(all_parsers): columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("thousands", [None, "."]) +@pytest.mark.parametrize( + "value", + ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"], +) +def test_decimal_and_exponential_erroneous(all_parsers, thousands, value): + # GH#31920 + data = StringIO( + f"""a b + 1,1 {value} + """ + ) + parser = all_parsers + result = parser.read_csv(data, "\t", decimal=",", thousands=thousands) + expected = DataFrame({"a": [1.1], "b": [value]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index ec098353960d7..72726b166cc9a 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -148,3 +148,48 @@ def encoding_fmt(request): Fixture for all possible string formats of a UTF encoding. """ return request.param + + +@pytest.fixture(params=[ + ("-1,0", -1.0), + ("-1,2e0", -1.2), + ("-1e0", -1.0), + ("+1e0", 1.0), + ("+1e+0", 1.0), + ("+1e-1", 0.1), + ("+,1e1", 1.0), + ("+1,e0", 1.0), + ("-,1e1", -1.0), + ("-1,e0", -1.0), + ("0,1", 0.1), + ("1,", 1.0), + (",1", 0.1), + ("-,1", -0.1), + ("1_,", 1.0), + ("1_234,56", 1234.56), + ("1_234,56e0", 1234.56), + # negative cases; must not parse as float + ("_", "_"), + ("-_", "-_"), + ("-_1", "-_1"), + ("-_1e0", "-_1e0"), + ("_1", "_1"), + ("_1,", "_1,"), + ("_1,_", "_1,_"), + ("_1e0", "_1e0"), + ("1,2e_1", "1,2e_1"), + ("1,2e1_0", "1,2e1_0"), + ("1,_2", "1,_2"), + (",1__2", ",1__2"), + (",1e", ",1e"), + ("-,1e", "-,1e"), + ("1_000,000_000", "1_000,000_000"), + ("1,e1_2", "1,e1_2"), + ]) +def numeric_decimal_thousands(request): + """ + Fixture for all numeric formats which should get recognized + """ + return request.param + + diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 15e7569ea9014..a4533240422fb 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -654,53 +654,14 @@ def test_1000_sep_with_decimal( @pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) -@pytest.mark.parametrize( - "value,expected", - [ - ("-1,0", -1.0), - ("-1,2e0", -1.2), - ("-1e0", -1.0), - ("+1e0", 1.0), - ("+1e+0", 1.0), - ("+1e-1", 0.1), - ("+,1e1", 1.0), - ("+1,e0", 1.0), - ("-,1e1", -1.0), - ("-1,e0", -1.0), - ("0,1", 0.1), - ("1,", 1.0), - (",1", 0.1), - ("-,1", -0.1), - ("1_,", 1.0), - ("1_234,56", 1234.56), - ("1_234,56e0", 1234.56), - # negative cases; must not parse as float - ("_", "_"), - ("-_", "-_"), - ("-_1", "-_1"), - ("-_1e0", "-_1e0"), - ("_1", "_1"), - ("_1,", "_1,"), - ("_1,_", "_1,_"), - ("_1e0", "_1e0"), - ("1,2e_1", "1,2e_1"), - ("1,2e1_0", "1,2e1_0"), - ("1,_2", "1,_2"), - (",1__2", ",1__2"), - (",1e", ",1e"), - ("-,1e", "-,1e"), - ("1_000,000_000", "1_000,000_000"), - ("1,e1_2", "1,e1_2"), - ], -) def test_1000_sep_decimal_float_precision( - c_parser_only, value, expected, float_precision + c_parser_only, numeric_decimal_thousands, float_precision ): # test decimal and thousand sep handling in across 'float_precision' # parsers parser = c_parser_only df = parser.read_csv( - StringIO(value), + StringIO(numeric_decimal_thousands[0]), sep="|", thousands="_", decimal=",", @@ -708,7 +669,7 @@ def test_1000_sep_decimal_float_precision( float_precision=float_precision, ) val = df.iloc[0, 0] - assert val == expected + assert val == numeric_decimal_thousands[1] def test_float_precision_options(c_parser_only): diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 04d5413abfafc..f65ca583dfdfb 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -307,47 +307,19 @@ def test_malformed_skipfooter(python_parser_only): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) -@pytest.mark.parametrize("thousands", [None, "."]) -@pytest.mark.parametrize( - "value, result_value", - [ - ("1,2", 1.2), - ("1,2e-1", 0.12), - ("1,2E-1", 0.12), - ("1,2e-10", 0.0000000012), - ("1,2e1", 12.0), - ("1,2E1", 12.0), - ("-1,2e-1", -0.12), - ("0,2", 0.2), - (",2", 0.2), - ], -) -def test_decimal_and_exponential(python_parser_only, thousands, value, result_value): - # GH#31920 - data = StringIO( - f"""a b - 1,1 {value} - """ - ) - result = python_parser_only.read_csv( - data, "\t", decimal=",", engine="python", thousands=thousands - ) - expected = DataFrame({"a": [1.1], "b": [result_value]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("thousands", [None, "."]) -@pytest.mark.parametrize( - "value", - ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"], -) -def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value): +@pytest.mark.parametrize("thousands", ["_", None]) +def test_decimal_and_exponential(python_parser_only, numeric_decimal_thousands, thousands): # GH#31920 - data = StringIO( - f"""a b - 1,1 {value} - """ + parser = python_parser_only + value = numeric_decimal_thousands[0] + if thousands is None and "_" in value: + pytest.skip("Skip test if no thousands sep is defined and sep is in value") + df = parser.read_csv( + StringIO(value), + sep="|", + thousands=thousands, + decimal=",", + header=None, ) - result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands) - expected = DataFrame({"a": [1.1], "b": [value]}) - tm.assert_frame_equal(result, expected) + val = df.iloc[0, 0] + assert val == numeric_decimal_thousands[1] From 10900c8fadaa66eb93f7f6b25e7954263bd02d95 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 03:07:33 +0100 Subject: [PATCH 2/4] Add testcases --- pandas/tests/io/parser/common/test_decimal.py | 18 ------------------ pandas/tests/io/parser/conftest.py | 17 +++++++++++++---- .../tests/io/parser/test_python_parser_only.py | 4 +++- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index 621d1a2f193c9..7ca9f253bd501 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -58,21 +58,3 @@ def test_euro_decimal_format(all_parsers): columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], ) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("thousands", [None, "."]) -@pytest.mark.parametrize( - "value", - ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"], -) -def test_decimal_and_exponential_erroneous(all_parsers, thousands, value): - # GH#31920 - data = StringIO( - f"""a b - 1,1 {value} - """ - ) - parser = all_parsers - result = parser.read_csv(data, "\t", decimal=",", thousands=thousands) - expected = DataFrame({"a": [1.1], "b": [value]}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 72726b166cc9a..1ad092dfb1a5c 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -150,7 +150,8 @@ def encoding_fmt(request): return request.param -@pytest.fixture(params=[ +@pytest.fixture( + params=[ ("-1,0", -1.0), ("-1,2e0", -1.2), ("-1e0", -1.0), @@ -185,11 +186,19 @@ def encoding_fmt(request): ("-,1e", "-,1e"), ("1_000,000_000", "1_000,000_000"), ("1,e1_2", "1,e1_2"), - ]) + ("e11,2", "e11,2"), + ("1e11,2", "1e11,2"), + ("1,2,2", "1,2,2"), + ("1,2_1", "1,2_1"), + ("1,2e-10e1", "1,2e-10e1"), + ("--1,2", "--1,2"), + ("1a_2,1", "1a_2,1"), + ("1,2E-1", 0.12), + ("1,2E1", 12.0), + ] +) def numeric_decimal_thousands(request): """ Fixture for all numeric formats which should get recognized """ return request.param - - diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index f65ca583dfdfb..484c0b2408c2e 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -308,7 +308,9 @@ def test_malformed_skipfooter(python_parser_only): @pytest.mark.parametrize("thousands", ["_", None]) -def test_decimal_and_exponential(python_parser_only, numeric_decimal_thousands, thousands): +def test_decimal_and_exponential( + python_parser_only, numeric_decimal_thousands, thousands +): # GH#31920 parser = python_parser_only value = numeric_decimal_thousands[0] From 9593cd5a3ea28456c1c1f902883f307ccb358576 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 17:58:36 +0100 Subject: [PATCH 3/4] Refactor test organization --- pandas/tests/io/parser/conftest.py | 2 +- .../io/parser/dtypes/test_dtypes_basic.py | 32 +++++++++++++++++++ pandas/tests/io/parser/test_c_parser_only.py | 19 ----------- .../io/parser/test_python_parser_only.py | 20 ------------ 4 files changed, 33 insertions(+), 40 deletions(-) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 1ad092dfb1a5c..dc94d7fc9f975 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -197,7 +197,7 @@ def encoding_fmt(request): ("1,2E1", 12.0), ] ) -def numeric_decimal_thousands(request): +def numeric_decimal(request): """ Fixture for all numeric formats which should get recognized """ diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index fc34d65fdad52..ec1ccf009b8de 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -181,3 +181,35 @@ def test_delimiter_with_usecols_and_parse_dates(all_parsers): {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("thousands", ["_", None]) +def test_decimal_and_exponential(python_parser_only, numeric_decimal, thousands): + # GH#31920 + decimal_number_check(python_parser_only, numeric_decimal, thousands, None) + + +@pytest.mark.parametrize("thousands", ["_", None]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) +def test_1000_sep_decimal_float_precision( + c_parser_only, numeric_decimal, float_precision, thousands +): + # test decimal and thousand sep handling in across 'float_precision' + # parsers + decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision) + + +def decimal_number_check(parser, numeric_decimal, thousands, float_precision): + # GH#31920 + value = numeric_decimal[0] + if thousands is None and "_" in value: + pytest.skip("Skip test if no thousands sep is defined and sep is in value") + df = parser.read_csv( + StringIO(value), + sep="|", + thousands=thousands, + decimal=",", + header=None, + ) + val = df.iloc[0, 0] + assert val == numeric_decimal[1] diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index a4533240422fb..da778093237b0 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -653,25 +653,6 @@ def test_1000_sep_with_decimal( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) -def test_1000_sep_decimal_float_precision( - c_parser_only, numeric_decimal_thousands, float_precision -): - # test decimal and thousand sep handling in across 'float_precision' - # parsers - parser = c_parser_only - df = parser.read_csv( - StringIO(numeric_decimal_thousands[0]), - sep="|", - thousands="_", - decimal=",", - header=None, - float_precision=float_precision, - ) - val = df.iloc[0, 0] - assert val == numeric_decimal_thousands[1] - - def test_float_precision_options(c_parser_only): # GH 17154, 36228 parser = c_parser_only diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 484c0b2408c2e..d55a6361fc8d2 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -305,23 +305,3 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) - - -@pytest.mark.parametrize("thousands", ["_", None]) -def test_decimal_and_exponential( - python_parser_only, numeric_decimal_thousands, thousands -): - # GH#31920 - parser = python_parser_only - value = numeric_decimal_thousands[0] - if thousands is None and "_" in value: - pytest.skip("Skip test if no thousands sep is defined and sep is in value") - df = parser.read_csv( - StringIO(value), - sep="|", - thousands=thousands, - decimal=",", - header=None, - ) - val = df.iloc[0, 0] - assert val == numeric_decimal_thousands[1] From 92dff4d1b63f7b3a3217e2d712f9be51ccc70769 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 22:01:25 +0100 Subject: [PATCH 4/4] Add comment and remove pipe --- pandas/io/parsers.py | 4 ++-- pandas/tests/io/parser/conftest.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ffe0c779b9c7f..6e9cc18358153 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2349,11 +2349,11 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): decimal = re.escape(self.decimal) if self.thousands is None: - regex = fr"^[\-|\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" + regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" else: thousands = re.escape(self.thousands) regex = ( - fr"^[\-|\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" + fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" fr"([0-9]?(E|e)\-?[0-9]+)?$" ) self.num = re.compile(regex) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index dc94d7fc9f975..321678c36943a 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -199,6 +199,7 @@ def encoding_fmt(request): ) def numeric_decimal(request): """ - Fixture for all numeric formats which should get recognized + Fixture for all numeric formats which should get recognized. The first entry + represents the value to read while the second represents the expected result. """ return request.param