Skip to content

Commit c87e40c

Browse files
authored
[FIX] Handle decimal and thousand separator in 'round_trip' converter (#35377)
1 parent 0639e7f commit c87e40c

File tree

3 files changed

+154
-2
lines changed

3 files changed

+154
-2
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ I/O
155155
^^^
156156

157157
- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
158+
- In :meth:`read_csv`, ``float_precision='round_trip'`` now handles the ``decimal`` and ``thousands`` parameters (:issue:`35365`)
158159
-
159160

160161
Plotting

pandas/_libs/src/parser/tokenizer.c

+55-2
Original file line numberDiff line numberDiff line change
@@ -1778,20 +1778,73 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781+
/* copy a decimal number string with `decimal`, `tsep` as decimal point
1782+
and thousands separator to an equivalent c-locale decimal string (striping
1783+
`tsep`, replacing `decimal` with '.'). The returned memory should be free-d
1784+
with a call to `free`.
1785+
*/
1786+
1787+
char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
1788+
char tsep) {
1789+
const char *p = s;
1790+
size_t length = strlen(s);
1791+
char *s_copy = malloc(length + 1);
1792+
char *dst = s_copy;
1793+
// Copy Leading sign
1794+
if (*p == '+' || *p == '-') {
1795+
*dst++ = *p++;
1796+
}
1797+
// Copy integer part dropping `tsep`
1798+
while (isdigit_ascii(*p)) {
1799+
*dst++ = *p++;
1800+
p += (tsep != '\0' && *p == tsep);
1801+
}
1802+
// Replace `decimal` with '.'
1803+
if (*p == decimal) {
1804+
*dst++ = '.';
1805+
p++;
1806+
}
1807+
// Copy the remainder of the string as is.
1808+
strncpy(dst, p, length + 1 - (p - s));
1809+
if (endpos != NULL)
1810+
*endpos = (char *)(s + length);
1811+
return s_copy;
1812+
}
1813+
1814+
17811815
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17821816
int skip_trailing, int *error, int *maybe_int) {
1817+
// 'normalize' representation to C-locale; replace decimal with '.' and
1818+
// remove t(housand)sep.
1819+
char *endptr;
1820+
char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
17831821
// This is called from a nogil block in parsers.pyx
17841822
// so need to explicitly get GIL before Python calls
17851823
PyGILState_STATE gstate;
17861824
gstate = PyGILState_Ensure();
1787-
1788-
double r = PyOS_string_to_double(p, q, 0);
1825+
char *endpc;
1826+
double r = PyOS_string_to_double(pc, &endpc, 0);
1827+
// PyOS_string_to_double needs to consume the whole string
1828+
if (endpc == pc + strlen(pc)) {
1829+
if (q != NULL) {
1830+
// report endptr from source string (p)
1831+
*q = (char *) endptr;
1832+
}
1833+
} else {
1834+
*error = -1;
1835+
if (q != NULL) {
1836+
// p and pc are different len due to tsep removal. Can't report
1837+
// how much it has consumed of p. Just rewind to beginning.
1838+
*q = (char *)p;
1839+
}
1840+
}
17891841
if (maybe_int != NULL) *maybe_int = 0;
17901842
if (PyErr_Occurred() != NULL) *error = -1;
17911843
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17921844
PyErr_Clear();
17931845

17941846
PyGILState_Release(gstate);
1847+
free(pc);
17951848
return r;
17961849
}
17971850

pandas/tests/io/parser/test_c_parser_only.py

+98
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,101 @@ def test_unix_style_breaks(c_parser_only):
606606
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
607607
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
608608
tm.assert_frame_equal(result, expected)
609+
610+
611+
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    # The same table is written with two separator conventions; every
    # float_precision setting must parse both to identical values.
    read_options = {
        "sep": "|",
        "thousands": thousands,
        "decimal": decimal,
        "float_precision": float_precision,
    }
    wanted = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    actual = c_parser_only.read_csv(StringIO(data), **read_options)

    tm.assert_frame_equal(actual, wanted)
647+
648+
649+
@pytest.mark.parametrize(
    "float_precision", [None, "high", "round_trip"],
)
@pytest.mark.parametrize(
    "value,expected",
    [
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # negative cases; must not parse as float
        ("_", "_"),
        ("-_", "-_"),
        ("-_1", "-_1"),
        ("-_1e0", "-_1e0"),
        ("_1", "_1"),
        ("_1,", "_1,"),
        ("_1,_", "_1,_"),
        ("_1e0", "_1e0"),
        ("1,2e_1", "1,2e_1"),
        ("1,2e1_0", "1,2e1_0"),
        ("1,_2", "1,_2"),
        (",1__2", ",1__2"),
        (",1e", ",1e"),
        ("-,1e", "-,1e"),
        ("1_000,000_000", "1_000,000_000"),
        ("1,e1_2", "1,e1_2"),
    ],
)
def test_1000_sep_decimal_float_precision(
    c_parser_only, value, expected, float_precision
):
    # Check decimal and thousands separator handling across all
    # 'float_precision' parsers: each input is read as a one-cell frame and
    # that cell is compared with the expected float (or unparsed string).
    frame = c_parser_only.read_csv(
        StringIO(value),
        sep="|",
        thousands="_",
        decimal=",",
        header=None,
        float_precision=float_precision,
    )
    assert frame.iloc[0, 0] == expected

0 commit comments

Comments
 (0)