Skip to content

Commit 12ee1db

Browse files
phofl authored and luckyvs1 committed
BUG: read_csv not recognizing numbers appropriately when decimal is set (pandas-dev#38420)
1 parent a648fbf commit 12ee1db

File tree

3 files changed: +56 -3 lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -259,6 +259,7 @@ I/O
259259
^^^
260260

261261
- Bug in :meth:`Index.__repr__` when ``display.max_seq_items=1`` (:issue:`38415`)
262+
- Bug in :func:`read_csv` not recognizing scientific notation if decimal is set for ``engine="python"`` (:issue:`31920`)
262263
- Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`)
263264
- Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`)
264265
- Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`)

pandas/io/parsers.py

+9 -3
Original file line number | Diff line number | Diff line change
@@ -2344,10 +2344,16 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
23442344
if len(self.decimal) != 1:
23452345
raise ValueError("Only length-1 decimal markers supported")
23462346

2347+
decimal = re.escape(self.decimal)
23472348
if self.thousands is None:
2348-
self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
2349+
regex = fr"^\-?[0-9]*({decimal}[0-9]*)?([0-9](E|e)\-?[0-9]*)?$"
23492350
else:
2350-
self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")
2351+
thousands = re.escape(self.thousands)
2352+
regex = (
2353+
fr"^\-?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
2354+
fr"([0-9](E|e)\-?[0-9]*)?$"
2355+
)
2356+
self.num = re.compile(regex)
23512357

23522358
def _set_no_thousands_columns(self):
23532359
# Create a set of column ids that are not to be stripped of thousands
@@ -3039,7 +3045,7 @@ def _search_replace_num_columns(self, lines, search, replace):
30393045
not isinstance(x, str)
30403046
or search not in x
30413047
or (self._no_thousands_columns and i in self._no_thousands_columns)
3042-
or self.nonnum.search(x.strip())
3048+
or not self.num.search(x.strip())
30433049
):
30443050
rl.append(x)
30453051
else:

pandas/tests/io/parser/test_python_parser_only.py

+46
Original file line number | Diff line number | Diff line change
@@ -305,3 +305,49 @@ def test_malformed_skipfooter(python_parser_only):
305305
msg = "Expected 3 fields in line 4, saw 5"
306306
with pytest.raises(ParserError, match=msg):
307307
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
308+
309+
310+
@pytest.mark.parametrize("thousands", [None, "."])
311+
@pytest.mark.parametrize(
312+
"value, result_value",
313+
[
314+
("1,2", 1.2),
315+
("1,2e-1", 0.12),
316+
("1,2E-1", 0.12),
317+
("1,2e-10", 0.0000000012),
318+
("1,2e1", 12.0),
319+
("1,2E1", 12.0),
320+
("-1,2e-1", -0.12),
321+
("0,2", 0.2),
322+
(",2", 0.2),
323+
],
324+
)
325+
def test_decimal_and_exponential(python_parser_only, thousands, value, result_value):
326+
# GH#31920
327+
data = StringIO(
328+
f"""a b
329+
1,1 {value}
330+
"""
331+
)
332+
result = python_parser_only.read_csv(
333+
data, "\t", decimal=",", engine="python", thousands=thousands
334+
)
335+
expected = DataFrame({"a": [1.1], "b": [result_value]})
336+
tm.assert_frame_equal(result, expected)
337+
338+
339+
@pytest.mark.parametrize("thousands", [None, "."])
340+
@pytest.mark.parametrize(
341+
"value",
342+
["e11,2", "1e11,2", "1,2,2", "1,2.1", "1,2e-10e1", "--1,2", "1a.2,1", "1..2,3"],
343+
)
344+
def test_decimal_and_exponential_erroneous(python_parser_only, thousands, value):
345+
# GH#31920
346+
data = StringIO(
347+
f"""a b
348+
1,1 {value}
349+
"""
350+
)
351+
result = python_parser_only.read_csv(data, "\t", decimal=",", thousands=thousands)
352+
expected = DataFrame({"a": [1.1], "b": [value]})
353+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)