Skip to content

Commit 7b2c77c

Browse files
committed
Handle decimal and tsep in round_trip converter (pandas-dev#35365)
When `decimal` or `tsep` differs from the C locale, copy and fix up the source string before passing it to PyOS_string_to_double.
1 parent e673b69 commit 7b2c77c

File tree

2 files changed

+157
-2
lines changed

2 files changed

+157
-2
lines changed

pandas/_libs/src/parser/tokenizer.c

+114-2
Original file line numberDiff line numberDiff line change
@@ -1778,20 +1778,132 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781+
/* Walk the numeric prefix of `s`, optionally writing its C-locale form.
 *
 * The prefix has the shape
 *     [sign] digits [decimal digits] [sci [sign] digits]
 * where a `tsep` character (if tsep != '\0') that directly follows a digit is
 * skipped. When `dst` is not NULL the normalized text is written to it:
 * `decimal` becomes '.', `sci` (compared case-insensitively) becomes 'e',
 * skipped `tsep` characters are dropped, and a terminating NUL is appended.
 * At most one byte is written per consumed source byte, plus the NUL.
 *
 * Returns a pointer to the first character of `s` that was not consumed.
 */
static const char *scan_decimal_str_c(const char *s, char *dst, char decimal,
                                      char tsep, char sci) {
#define PUT(ch)                \
    do {                       \
        if (dst != NULL) {     \
            *dst++ = (ch);     \
        }                      \
    } while (0)
#define IS_TSEP(c) (tsep != '\0' && (c) == tsep)
    const char *p = s;

    // Leading sign
    if (*p == '+' || *p == '-') {
        PUT(*p);
        p++;
    }
    // Integer part
    while (isdigit_ascii(*p)) {
        PUT(*p);
        p++;
        p += IS_TSEP(*p);
    }
    // Fractional part, replacing `decimal` with '.'. The explicit NUL test
    // keeps a '\0' `decimal` from matching the string terminator.
    if (*p != '\0' && *p == decimal) {
        PUT('.');
        p++;
        while (isdigit_ascii(*p)) {
            PUT(*p);
            p++;
            p += IS_TSEP(*p);
        }
    }
    // Exponent part, replacing `sci` with 'e'. Same NUL guard as above.
    if (*p != '\0' && toupper_ascii(*p) == toupper_ascii(sci)) {
        PUT('e');
        p++;
        // Exponent sign
        if (*p == '+' || *p == '-') {
            PUT(*p);
            p++;
        }
        // Exponent digits
        while (isdigit_ascii(*p)) {
            PUT(*p);
            p++;
            p += IS_TSEP(*p);
        }
    }
    PUT('\0');
    return p;
#undef IS_TSEP
#undef PUT
}

/* Copy a decimal number string that uses `decimal`, `tsep` and `sci` as
 * decimal point, thousands separator and exponent character into an
 * equivalent C-locale decimal string (stripping `tsep`, replacing `decimal`
 * with '.' and `sci` with 'e').
 *
 * Parameters
 *   s        NUL-terminated source string.
 *   endpos   if not NULL, receives a pointer to the first character of `s`
 *            that is not part of the number (`s` itself when the allocation
 *            fails).
 *   decimal  decimal-point character used in `s`.
 *   tsep     thousands separator used in `s`, or '\0' for none.
 *   sci      exponent character used in `s` (matched case-insensitively).
 *
 * Returns a malloc'ed, NUL-terminated string that the caller must free(),
 * or NULL if the allocation fails.
 */
char *str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
                             char tsep, char sci) {
    // First pass: only measure. The normalized text is never longer than the
    // consumed span, so that span bounds the allocation.
    const char *end = scan_decimal_str_c(s, NULL, decimal, tsep, sci);
    size_t size = (size_t)(end - s);

    char *pc = malloc(size + 1);
    if (pc == NULL) {
        if (endpos != NULL) {
            *endpos = (char *)s;
        }
        return NULL;
    }

    // Second pass: same walk, this time writing into the buffer.
    end = scan_decimal_str_c(s, pc, decimal, tsep, sci);
    if (endpos != NULL) {
        *endpos = (char *)end;
    }
    return pc;
}
1865+
17811866
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17821867
int skip_trailing, int *error, int *maybe_int) {
1868+
char *pc = NULL;
1869+
// 'normalize' representation to C-locale; replace decimal with '.' and
1870+
// remove t(housand)sep.
1871+
char *endptr = NULL;
1872+
if (decimal != '.' || tsep != '\0') {
1873+
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
1874+
}
17831875
// This is called from a nogil block in parsers.pyx
17841876
// so need to explicitly get GIL before Python calls
17851877
PyGILState_STATE gstate;
17861878
gstate = PyGILState_Ensure();
1787-
1788-
double r = PyOS_string_to_double(p, q, 0);
1879+
double r;
1880+
if (pc != NULL) {
1881+
char *endpc = NULL;
1882+
r = PyOS_string_to_double(pc, &endpc, 0);
1883+
// PyOS_string_to_double needs to consume the whole string
1884+
if (endpc == pc + strlen(pc)) {
1885+
if (q != NULL) {
1886+
// report endptr from source string (p)
1887+
*q = (char *) endptr;
1888+
}
1889+
} else {
1890+
*error = -1;
1891+
if (q != NULL) {
1892+
// p and pc are different len due to tsep removal. Can't report
1893+
// how much it has consumed of p. Just rewind to beginning.
1894+
*q = (char *)p;
1895+
}
1896+
}
1897+
} else {
1898+
r = PyOS_string_to_double(p, q, 0);
1899+
}
17891900
if (maybe_int != NULL) *maybe_int = 0;
17901901
if (PyErr_Occurred() != NULL) *error = -1;
17911902
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17921903
PyErr_Clear();
17931904

17941905
PyGILState_Release(gstate);
1906+
free(pc);
17951907
return r;
17961908
}
17971909

pandas/tests/io/parser/test_c_parser_only.py

+43
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,46 @@ def test_unix_style_breaks(c_parser_only):
606606
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
607607
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
608608
tm.assert_frame_equal(result, expected)
609+
610+
611+
@pytest.mark.parametrize(
    "data,thousands,decimal,float_precision",
    [
        (text, tsep, dec, prec)
        for text, tsep, dec in [
            (
                """A|B|C
1|2,334.01|5
10|13|10.
""",
                ",",
                ".",
            ),
            (
                """A|B|C
1|2.334,01|5
10|13|10,
""",
                ".",
                ",",
            ),
        ]
        for prec in [None, "high", "round_trip"]
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    # The same table is written with "," / "." and with "." / "," as
    # thousands / decimal characters; every float_precision setting must
    # produce the same frame.
    read_kwargs = {
        "sep": "|",
        "thousands": thousands,
        "decimal": decimal,
        "float_precision": float_precision,
    }
    result = c_parser_only.read_csv(StringIO(data), **read_kwargs)

    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)