Skip to content

Commit e4bca08

Browse files
committed
Handle decimal and tsep in round_trip converter (#35365)
When `decimal` or `tsep` differ from the C locale, copy and fix up the source string before passing it to PyOS_string_to_double.
1 parent e673b69 commit e4bca08

File tree

2 files changed

+230
-2
lines changed

2 files changed

+230
-2
lines changed

pandas/_libs/src/parser/tokenizer.c

+143-2
Original file line numberDiff line numberDiff line change
@@ -1778,20 +1778,161 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781+
/* Copy the decimal number at the start of `s` into a newly allocated,
   NUL-terminated string in C-locale form: `tsep` (thousands separator) is
   stripped from the integer part and `decimal` is replaced with '.'.
   `sci` is the exponent character; it is matched case-insensitively (ASCII)
   and copied to the output unchanged.

   Accepted input: [+-] digits [decimal digits] [sci [+-] digits], where one
   `tsep` may follow each digit of the integer part (including the last one).
   At least one digit is required in the integer or fractional part. An
   exponent marker that is not followed by at least one digit is not consumed.

   If `endpos` is not NULL, `*endpos` is set to the first character of `s`
   that was not consumed, or to `s` itself when NULL is returned.

   Returns NULL if no number could be parsed or if the allocation fails.
   The caller owns the returned buffer and must release it with free().
*/
char *str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
                             char tsep, char sci) {
// A '\0' separator means "not configured"; it must never match the string
// terminator, otherwise the scan would run past the end of `s`.
#define IS_TSEP(c) (tsep != '\0' && (c) == tsep)
#define IS_DECIMAL(c) (decimal != '\0' && (c) == decimal)
#define IS_SCI(c) ((c) != '\0' && toupper_ascii(c) == toupper_ascii(sci))
    size_t num_digits = 0;
    int has_exponent = 0;
    const char *p = s;

    // Pass 1: find out how many characters can be consumed.
    // Leading sign
    if (*p == '+' || *p == '-') p++;
    // Integer part
    while (isdigit_ascii(*p)) {
        p++;
        p += IS_TSEP(*p);
        num_digits++;
    }
    // Fractional part
    if (IS_DECIMAL(*p)) {
        p++;
        while (isdigit_ascii(*p)) {
            p++;
            num_digits++;
        }
    }
    if (num_digits == 0) {
        if (endpos != NULL) {
            *endpos = (char *)s;
        }
        return NULL;
    }
    // Exponent part
    if (IS_SCI(*p)) {
        const char *p_at_e = p;
        size_t exp_digits = 0;
        p++;
        // Exponent sign
        if (*p == '+' || *p == '-') p++;
        // Exponent digits
        while (isdigit_ascii(*p)) {
            p++;
            exp_digits++;
        }
        if (exp_digits == 0) {
            // No digits after the exponent marker: un-consume the marker
            // and the optional sign.
            p = p_at_e;
        } else {
            has_exponent = 1;
        }
    }

    // The normalized copy is never longer than the consumed input, because
    // characters are only dropped (tsep) or replaced one-for-one (decimal).
    const size_t size = (size_t)(p - s);
    char *pc = malloc(size + 1);
    if (pc == NULL) {
        if (endpos != NULL) {
            *endpos = (char *)s;
        }
        return NULL;
    }

    // Pass 2: re-scan from the start, writing the normalized characters.
    char *dst = pc;
    p = s;
    // Copy leading sign
    if (*p == '+' || *p == '-') {
        *dst++ = *p++;
    }
    // Copy integer part, dropping thousands separators
    while (isdigit_ascii(*p)) {
        *dst++ = *p++;
        p += IS_TSEP(*p);
    }
    // Copy fractional part, replacing `decimal` with '.'
    if (IS_DECIMAL(*p)) {
        *dst++ = '.';
        p++;
        while (isdigit_ascii(*p)) {
            *dst++ = *p++;
        }
    }
    // Copy exponent; pass 1 already verified that digits follow the marker.
    if (has_exponent) {
        *dst++ = *p++;
        // Copy exponent sign
        if (*p == '+' || *p == '-') {
            *dst++ = *p++;
        }
        // Copy exponent digits
        while (isdigit_ascii(*p)) {
            *dst++ = *p++;
        }
    }
    assert((size_t)(dst - pc) <= size);
    *dst = '\0';
    if (endpos != NULL) {
        *endpos = (char *)p;
    }
    return pc;
#undef IS_SCI
#undef IS_DECIMAL
#undef IS_TSEP
}
1887+
17811888
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17821889
int skip_trailing, int *error, int *maybe_int) {
1890+
char *pc = NULL;
1891+
// 'normalize' representation to C-locale; replace decimal with '.' and
1892+
// remove t(housand)sep.
1893+
char *endptr = NULL;
1894+
if (decimal != '.' || tsep != '\0') {
1895+
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
1896+
if (pc == NULL) {
1897+
if (q != NULL) {
1898+
*q = (char *)p;
1899+
}
1900+
*error = -1;
1901+
return 0.0;
1902+
}
1903+
}
17831904
// This is called from a nogil block in parsers.pyx
17841905
// so need to explicitly get GIL before Python calls
17851906
PyGILState_STATE gstate;
17861907
gstate = PyGILState_Ensure();
1787-
1788-
double r = PyOS_string_to_double(p, q, 0);
1908+
double r;
1909+
if (pc != NULL) {
1910+
char *endpc = NULL;
1911+
r = PyOS_string_to_double(pc, &endpc, 0);
1912+
// PyOS_string_to_double needs to consume the whole string
1913+
if (endpc == pc + strlen(pc)) {
1914+
if (q != NULL) {
1915+
// report endptr from source string (p)
1916+
*q = (char *) endptr;
1917+
}
1918+
} else {
1919+
*error = -1;
1920+
if (q != NULL) {
1921+
// p and pc are different len due to tsep removal. Can't report
1922+
// how much it has consumed of p. Just rewind to beginning.
1923+
*q = (char *)p;
1924+
}
1925+
}
1926+
} else {
1927+
r = PyOS_string_to_double(p, q, 0);
1928+
}
17891929
if (maybe_int != NULL) *maybe_int = 0;
17901930
if (PyErr_Occurred() != NULL) *error = -1;
17911931
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17921932
PyErr_Clear();
17931933

17941934
PyGILState_Release(gstate);
1935+
free(pc);
17951936
return r;
17961937
}
17971938

pandas/tests/io/parser/test_c_parser_only.py

+87
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,90 @@ def test_unix_style_breaks(c_parser_only):
606606
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
607607
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
608608
tm.assert_frame_equal(result, expected)
609+
610+
611+
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        # "," as thousands separator, "." as decimal point
        ("A|B|C\n1|2,334.01|5\n10|13|10.\n", ",", "."),
        # "." as thousands separator, "," as decimal point
        ("A|B|C\n1|2.334,01|5\n10|13|10,\n", ".", ","),
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    """Thousands and decimal separators parse the same for every float_precision."""
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    read_kwargs = {
        "sep": "|",
        "thousands": thousands,
        "decimal": decimal,
        "float_precision": float_precision,
    }
    result = c_parser_only.read_csv(StringIO(data), **read_kwargs)

    tm.assert_frame_equal(result, expected)
649+
650+
651+
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize(
    "value,expected",
    [
        ("-1,0", -1.0),
        ("-1e0", -1.0),
        ("-1,0e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        # Thousands separator with no digits after it. Accepting this might
        # be wrong, but it is what the parsers currently do.
        ("1.", 1.0),
        # Negative cases; these must not parse as float and stay strings.
        ("-.1", "-.1"),
        (".1", ".1"),
        ("1,2e.1", "1,2e.1"),
        ("1,2e1.0", "1,2e1.0"),
        ("1,.2", "1,.2"),
        (",1..2", ",1..2"),
        (",1e", ",1e"),
        ("1.000,000.000", "1.000,000.000"),
    ],
)
def test_float_precision_round_trip_decimal_thousands(
    c_parser_only, value, expected, float_precision
):
    """Check decimal="," and thousands="." handling for each float_precision.

    Each `value` is read as a one-cell CSV; the cell must equal `expected`,
    which is a float for valid numbers and the unchanged string otherwise.
    """
    parser = c_parser_only
    df = parser.read_csv(
        StringIO(value),
        sep="|",
        thousands=".",
        decimal=",",
        header=None,
        float_precision=float_precision,
    )
    val = df.iloc[0, 0]
    assert val == expected

0 commit comments

Comments
 (0)