Skip to content

CLN: Unify number recognition tests for all parsers #38954

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 4, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2349,12 +2349,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):

decimal = re.escape(self.decimal)
if self.thousands is None:
regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
regex = fr"^[\-|\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the `|` necessary inside the `[]` character class?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, thanks.

else:
thousands = re.escape(self.thousands)
regex = (
fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
fr"([0-9](E|e)\-?[0-9]*)?$"
fr"^[\-|\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
fr"([0-9]?(E|e)\-?[0-9]+)?$"
)
self.num = re.compile(regex)

Expand Down
54 changes: 54 additions & 0 deletions pandas/tests/io/parser/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,57 @@ def encoding_fmt(request):
Fixture for all possible string formats of a UTF encoding.
"""
return request.param


@pytest.fixture(
params=[
("-1,0", -1.0),
("-1,2e0", -1.2),
("-1e0", -1.0),
("+1e0", 1.0),
("+1e+0", 1.0),
("+1e-1", 0.1),
("+,1e1", 1.0),
("+1,e0", 1.0),
("-,1e1", -1.0),
("-1,e0", -1.0),
("0,1", 0.1),
("1,", 1.0),
(",1", 0.1),
("-,1", -0.1),
("1_,", 1.0),
("1_234,56", 1234.56),
("1_234,56e0", 1234.56),
# negative cases; must not parse as float
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add something to the docstring about this, i.e. how to interpret each tuple?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, done

("_", "_"),
("-_", "-_"),
("-_1", "-_1"),
("-_1e0", "-_1e0"),
("_1", "_1"),
("_1,", "_1,"),
("_1,_", "_1,_"),
("_1e0", "_1e0"),
("1,2e_1", "1,2e_1"),
("1,2e1_0", "1,2e1_0"),
("1,_2", "1,_2"),
(",1__2", ",1__2"),
(",1e", ",1e"),
("-,1e", "-,1e"),
("1_000,000_000", "1_000,000_000"),
("1,e1_2", "1,e1_2"),
("e11,2", "e11,2"),
("1e11,2", "1e11,2"),
("1,2,2", "1,2,2"),
("1,2_1", "1,2_1"),
("1,2e-10e1", "1,2e-10e1"),
("--1,2", "--1,2"),
("1a_2,1", "1a_2,1"),
("1,2E-1", 0.12),
("1,2E1", 12.0),
]
)
def numeric_decimal(request):
    """
    Fixture for the number-recognition cases used by the parser tests.

    Each param is a ``(text, expected)`` tuple: ``text`` is the raw field
    handed to the parser and ``expected`` is the value the parsed cell is
    compared against. When ``expected`` is a float the input should be
    recognized as a number; for the negative cases, which must not parse as
    float, ``expected`` is the unchanged input string.
    """
    return request.param
32 changes: 32 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,35 @@ def test_delimiter_with_usecols_and_parse_dates(all_parsers):
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(python_parser_only, numeric_decimal, thousands):
    """Run the shared decimal/exponent cases through ``python_parser_only``."""
    # GH#31920
    # No float_precision is requested here, so None is handed to the helper.
    decimal_number_check(
        parser=python_parser_only,
        numeric_decimal=numeric_decimal,
        thousands=thousands,
        float_precision=None,
    )


@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
    c_parser_only, numeric_decimal, float_precision, thousands
):
    """
    Run the shared decimal/thousands cases through ``c_parser_only`` for every
    parametrized ``float_precision`` value.
    """
    # Check decimal and thousands separator handling across the
    # 'float_precision' parsers.
    decimal_number_check(
        parser=c_parser_only,
        numeric_decimal=numeric_decimal,
        thousands=thousands,
        float_precision=float_precision,
    )


def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
    """
    Parse a single ``numeric_decimal`` case and check the resulting cell.

    Parameters
    ----------
    parser
        Object exposing ``read_csv``; the text is read through it.
    numeric_decimal : tuple
        ``(text, expected)`` pair: the raw field to parse and the value the
        parsed cell must equal.
    thousands : str or None
        Thousands separator forwarded to ``read_csv``.
    float_precision : str or None
        Float converter selection forwarded to ``read_csv``.

    The case is skipped when no thousands separator is configured but the
    text contains ``"_"``.
    """
    # GH#31920
    value, expected = numeric_decimal
    if thousands is None and "_" in value:
        pytest.skip("Skip test if no thousands sep is defined and sep is in value")
    df = parser.read_csv(
        StringIO(value),
        # Forward the requested precision; without this every
        # float_precision parametrization exercises the same code path.
        float_precision=float_precision,
        sep="|",
        thousands=thousands,
        decimal=",",
        header=None,
    )
    val = df.iloc[0, 0]
    assert val == expected
58 changes: 0 additions & 58 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,64 +653,6 @@ def test_1000_sep_with_decimal(
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
    "value,expected",
    [
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # negative cases; must not parse as float
        ("_", "_"),
        ("-_", "-_"),
        ("-_1", "-_1"),
        ("-_1e0", "-_1e0"),
        ("_1", "_1"),
        ("_1,", "_1,"),
        ("_1,_", "_1,_"),
        ("_1e0", "_1e0"),
        ("1,2e_1", "1,2e_1"),
        ("1,2e1_0", "1,2e1_0"),
        ("1,_2", "1,_2"),
        (",1__2", ",1__2"),
        (",1e", ",1e"),
        ("-,1e", "-,1e"),
        ("1_000,000_000", "1_000,000_000"),
        ("1,e1_2", "1,e1_2"),
    ],
)
def test_1000_sep_decimal_float_precision(
    c_parser_only, value, expected, float_precision
):
    """
    Read ``value`` as a one-cell frame with ``,`` as decimal and ``_`` as
    thousands separator, and compare the cell with ``expected`` for each
    ``float_precision`` option.
    """
    # Decimal and thousands separator handling must agree across the
    # 'float_precision' parsers.
    read_options = {
        "sep": "|",
        "thousands": "_",
        "decimal": ",",
        "header": None,
        "float_precision": float_precision,
    }
    frame = c_parser_only.read_csv(StringIO(value), **read_options)
    assert frame.iloc[0, 0] == expected


def test_float_precision_options(c_parser_only):
# GH 17154, 36228
parser = c_parser_only
Expand Down
46 changes: 0 additions & 46 deletions pandas/tests/io/parser/test_python_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,49 +305,3 @@ def test_malformed_skipfooter(python_parser_only):
msg = "Expected 3 fields in line 4, saw 5"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)


@pytest.mark.parametrize("thousands", [None, "."])
@pytest.mark.parametrize(
    "value, result_value",
    [
        ("1,2", 1.2),
        ("1,2e-1", 0.12),
        ("1,2E-1", 0.12),
        ("1,2e-10", 0.0000000012),
        ("1,2e1", 12.0),
        ("1,2E1", 12.0),
        ("-1,2e-1", -0.12),
        ("0,2", 0.2),
        (",2", 0.2),
    ],
)
def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
    """
    Check that ``value`` in a tab-separated two-column file is read as the
    float ``result_value`` when ``,`` is the decimal separator.
    """
    # GH#31920
    # The tabs are spelled as escapes so the delimiter cannot be silently
    # turned into spaces by an editor or by copy/paste.
    data = StringIO(f"a\tb\n1,1\t{value}\n")
    # sep is passed by keyword rather than positionally.
    result = python_parser_only.read_csv(
        data, sep="\t", decimal=",", engine="python", thousands=thousands
    )
    expected = DataFrame({"a": [1.1], "b": [result_value]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("thousands", [None, "."])
@pytest.mark.parametrize(
    "value",
    ["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
)
def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
    """
    Check that a malformed numeric ``value`` in a tab-separated two-column
    file is kept as the original string instead of being converted.
    """
    # GH#31920
    # The tabs are spelled as escapes so the delimiter cannot be silently
    # turned into spaces by an editor or by copy/paste.
    data = StringIO(f"a\tb\n1,1\t{value}\n")
    # sep is passed by keyword rather than positionally.
    result = python_parser_only.read_csv(
        data, sep="\t", decimal=",", thousands=thousands
    )
    expected = DataFrame({"a": [1.1], "b": [value]})
    tm.assert_frame_equal(result, expected)