Skip to content

Commit bd42bc6

Browse files
phoflluckyvs1
authored and committed
CLN: Unify number recognition tests for all parsers (pandas-dev#38954)
1 parent 69ab8c6 commit bd42bc6

File tree

5 files changed

+90
-107
lines changed

5 files changed

+90
-107
lines changed

pandas/io/parsers.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -2349,12 +2349,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
23492349

23502350
decimal = re.escape(self.decimal)
23512351
if self.thousands is None:
2352-
regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
2352+
regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
23532353
else:
23542354
thousands = re.escape(self.thousands)
23552355
regex = (
2356-
fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
2357-
fr"([0-9](E|e)\-?[0-9]*)?$"
2356+
fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
2357+
fr"([0-9]?(E|e)\-?[0-9]+)?$"
23582358
)
23592359
self.num = re.compile(regex)
23602360

pandas/tests/io/parser/conftest.py

+55
Original file line number | Diff line number | Diff line change
@@ -148,3 +148,58 @@ def encoding_fmt(request):
148148
Fixture for all possible string formats of a UTF encoding.
149149
"""
150150
return request.param
151+
152+
153+
@pytest.fixture(
    params=[
        # positive cases; must parse as float
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # negative cases; must not parse as float
        ("_", "_"),
        ("-_", "-_"),
        ("-_1", "-_1"),
        ("-_1e0", "-_1e0"),
        ("_1", "_1"),
        ("_1,", "_1,"),
        ("_1,_", "_1,_"),
        ("_1e0", "_1e0"),
        ("1,2e_1", "1,2e_1"),
        ("1,2e1_0", "1,2e1_0"),
        ("1,_2", "1,_2"),
        (",1__2", ",1__2"),
        (",1e", ",1e"),
        ("-,1e", "-,1e"),
        ("1_000,000_000", "1_000,000_000"),
        ("1,e1_2", "1,e1_2"),
        ("e11,2", "e11,2"),
        ("1e11,2", "1e11,2"),
        ("1,2,2", "1,2,2"),
        ("1,2_1", "1,2_1"),
        ("1,2e-10e1", "1,2e-10e1"),
        ("--1,2", "--1,2"),
        ("1a_2,1", "1a_2,1"),
        # positive cases again (upper-case exponent marker); must parse as float.
        # Kept at the end so the index-based ids of the entries above are stable.
        ("1,2E-1", 0.12),
        ("1,2E1", 12.0),
    ]
)
def numeric_decimal(request):
    """
    Fixture of ``(text, expected)`` pairs for number recognition.

    The first entry is the raw text to read. The second entry is the expected
    result: a float when the text must be recognized as a number, or the
    unchanged text when it must not be parsed as a float.
    """
    return request.param

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+32
Original file line number | Diff line number | Diff line change
@@ -181,3 +181,35 @@ def test_delimiter_with_usecols_and_parse_dates(all_parsers):
181181
{"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
182182
)
183183
tm.assert_frame_equal(result, expected)
184+
185+
186+
@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(python_parser_only, numeric_decimal, thousands):
    """
    Run the shared number-recognition check with the Python parser (GH#31920).

    No float precision is selected here, so ``None`` is handed to the check.
    """
    decimal_number_check(
        parser=python_parser_only,
        numeric_decimal=numeric_decimal,
        thousands=thousands,
        float_precision=None,
    )
190+
191+
192+
@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
    c_parser_only, numeric_decimal, float_precision, thousands
):
    """
    Test decimal and thousands separator handling across the
    ``float_precision`` options, using the C parser.

    Each combination of ``thousands`` and ``float_precision`` is handed to the
    shared number-recognition check.
    """
    decimal_number_check(
        parser=c_parser_only,
        numeric_decimal=numeric_decimal,
        thousands=thousands,
        float_precision=float_precision,
    )
200+
201+
202+
def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
    """
    Read a single value with ``parser`` and compare it to the expected result.

    GH#31920

    Parameters
    ----------
    parser : object with a ``read_csv`` method
        The parser under test.
    numeric_decimal : sequence
        ``numeric_decimal[0]`` is the text to read, ``numeric_decimal[1]`` is
        the value the parsed cell must equal.
    thousands : str or None
        Thousands separator forwarded to ``read_csv``.
    float_precision : str or None
        Float converter selection forwarded to ``read_csv``. ``None`` leaves
        the parser on its default.

    Raises
    ------
    AssertionError
        If the parsed cell differs from the expected value.
    """
    text = numeric_decimal[0]
    expected = numeric_decimal[1]

    # Without a thousands separator, values containing "_" are not meaningful
    # for this check, so those combinations are skipped.
    if thousands is None and "_" in text:
        pytest.skip("Skip test if no thousands sep is defined and sep is in value")

    df = parser.read_csv(
        StringIO(text),
        # presumably "|" is chosen so that "," stays part of the value rather
        # than acting as the field delimiter
        sep="|",
        thousands=thousands,
        decimal=",",
        header=None,
        # Previously accepted but never forwarded, so every float_precision
        # parametrization silently exercised the default converter.
        float_precision=float_precision,
    )
    assert df.iloc[0, 0] == expected

pandas/tests/io/parser/test_c_parser_only.py

-58
Original file line number | Diff line number | Diff line change
@@ -653,64 +653,6 @@ def test_1000_sep_with_decimal(
653653
tm.assert_frame_equal(result, expected)
654654

655655

656-
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
657-
@pytest.mark.parametrize(
658-
"value,expected",
659-
[
660-
("-1,0", -1.0),
661-
("-1,2e0", -1.2),
662-
("-1e0", -1.0),
663-
("+1e0", 1.0),
664-
("+1e+0", 1.0),
665-
("+1e-1", 0.1),
666-
("+,1e1", 1.0),
667-
("+1,e0", 1.0),
668-
("-,1e1", -1.0),
669-
("-1,e0", -1.0),
670-
("0,1", 0.1),
671-
("1,", 1.0),
672-
(",1", 0.1),
673-
("-,1", -0.1),
674-
("1_,", 1.0),
675-
("1_234,56", 1234.56),
676-
("1_234,56e0", 1234.56),
677-
# negative cases; must not parse as float
678-
("_", "_"),
679-
("-_", "-_"),
680-
("-_1", "-_1"),
681-
("-_1e0", "-_1e0"),
682-
("_1", "_1"),
683-
("_1,", "_1,"),
684-
("_1,_", "_1,_"),
685-
("_1e0", "_1e0"),
686-
("1,2e_1", "1,2e_1"),
687-
("1,2e1_0", "1,2e1_0"),
688-
("1,_2", "1,_2"),
689-
(",1__2", ",1__2"),
690-
(",1e", ",1e"),
691-
("-,1e", "-,1e"),
692-
("1_000,000_000", "1_000,000_000"),
693-
("1,e1_2", "1,e1_2"),
694-
],
695-
)
696-
def test_1000_sep_decimal_float_precision(
697-
c_parser_only, value, expected, float_precision
698-
):
699-
# test decimal and thousand sep handling in across 'float_precision'
700-
# parsers
701-
parser = c_parser_only
702-
df = parser.read_csv(
703-
StringIO(value),
704-
sep="|",
705-
thousands="_",
706-
decimal=",",
707-
header=None,
708-
float_precision=float_precision,
709-
)
710-
val = df.iloc[0, 0]
711-
assert val == expected
712-
713-
714656
def test_float_precision_options(c_parser_only):
715657
# GH 17154, 36228
716658
parser = c_parser_only

pandas/tests/io/parser/test_python_parser_only.py

-46
Original file line number | Diff line number | Diff line change
@@ -305,49 +305,3 @@ def test_malformed_skipfooter(python_parser_only):
305305
msg = "Expected 3 fields in line 4, saw 5"
306306
with pytest.raises(ParserError, match=msg):
307307
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
308-
309-
310-
@pytest.mark.parametrize("thousands", [None, "."])
311-
@pytest.mark.parametrize(
312-
"value, result_value",
313-
[
314-
("1,2", 1.2),
315-
("1,2e-1", 0.12),
316-
("1,2E-1", 0.12),
317-
("1,2e-10", 0.0000000012),
318-
("1,2e1", 12.0),
319-
("1,2E1", 12.0),
320-
("-1,2e-1", -0.12),
321-
("0,2", 0.2),
322-
(",2", 0.2),
323-
],
324-
)
325-
def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
326-
# GH#31920
327-
data = StringIO(
328-
f"""a b
329-
1,1 {value}
330-
"""
331-
)
332-
result = python_parser_only.read_csv(
333-
data, "\t", decimal=",", engine="python", thousands=thousands
334-
)
335-
expected = DataFrame({"a": [1.1], "b": [result_value]})
336-
tm.assert_frame_equal(result, expected)
337-
338-
339-
@pytest.mark.parametrize("thousands", [None, "."])
340-
@pytest.mark.parametrize(
341-
"value",
342-
["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
343-
)
344-
def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
345-
# GH#31920
346-
data = StringIO(
347-
f"""a b
348-
1,1 {value}
349-
"""
350-
)
351-
result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands)
352-
expected = DataFrame({"a": [1.1], "b": [value]})
353-
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)