Skip to content

Commit aab2f52

Browse files
committed
Handle decimal and tsep in round_trip converter (#35365)
If the decimal point or thousands separator differs from the C locale's, copy and fix up the source string before passing it to PyOS_string_to_double.
1 parent e673b69 commit aab2f52

File tree

2 files changed

+182
-2
lines changed

2 files changed

+182
-2
lines changed

pandas/_libs/src/parser/tokenizer.c

+139-2
Original file line numberDiff line numberDiff line change
@@ -1778,20 +1778,157 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781+
/* Scan one decimal number starting at `s`, written with `decimal` as the
   decimal point, `tsep` as the thousands separator ('\0' for none) and `sci`
   as the exponent character (matched case-insensitively).

   When `dst` is not NULL the C-locale equivalent is written to it and
   NUL-terminated: `tsep` is dropped and `decimal` becomes '.'. The output is
   never longer than the consumed input, so a buffer of
   (returned pointer - s) + 1 bytes is always large enough. When `dst` is NULL
   the input is only measured.

   Returns a pointer just past the last consumed character of `s`, or NULL if
   there is no digit in the integer or fractional part.
*/
static const char *scan_decimal_str_c(const char *s, char *dst, char decimal,
                                      char tsep, char sci) {
#define IS_TSEP(c) (tsep != '\0' && (c) == tsep)
#define EMIT(c)            \
    do {                   \
        if (dst != NULL) { \
            *dst++ = (c);  \
        }                  \
    } while (0)
    const char *p = s;
    size_t num_digits = 0;

    // Leading sign
    if (*p == '+' || *p == '-') {
        EMIT(*p);
        p++;
    }
    // Integer part; a separator directly after a digit is skipped
    while (isdigit_ascii(*p)) {
        EMIT(*p);
        p++;
        p += IS_TSEP(*p);
        num_digits++;
    }
    // Fractional part. The `decimal != '\0'` guard keeps a NUL `decimal`
    // from matching the string terminator and scanning past it.
    if (decimal != '\0' && *p == decimal) {
        EMIT('.');
        p++;
        while (isdigit_ascii(*p)) {
            EMIT(*p);
            p++;
            p += IS_TSEP(*p);
            num_digits++;
        }
    }
    if (num_digits == 0) {
        return NULL;
    }
    // Exponent part; only consumed when at least one digit follows the
    // exponent character and its optional sign. Otherwise `p` stays on the
    // exponent character so the caller sees it as unconsumed input.
    if (sci != '\0' && toupper_ascii(*p) == toupper_ascii(sci)) {
        const char *e = p + 1;
        const char *sign = NULL;
        if (*e == '+' || *e == '-') {
            sign = e++;
        }
        if (isdigit_ascii(*e)) {
            EMIT(*p);
            if (sign != NULL) {
                EMIT(*sign);
            }
            while (isdigit_ascii(*e)) {
                EMIT(*e);
                e++;
                e += IS_TSEP(*e);
            }
            p = e;
        }
    }
    if (dst != NULL) {
        *dst = '\0';
    }
    return p;
#undef EMIT
#undef IS_TSEP
}

/* Copy a decimal number string that uses `decimal`, `tsep` and `sci` as
   decimal point, thousands separator and exponent character to an
   equivalent C-locale decimal string (stripping `tsep`, replacing `decimal`
   with '.').

   On success returns a newly malloc'ed, NUL-terminated string that the
   caller must free(), and stores in `*endpos` (if `endpos` is not NULL) a
   pointer just past the consumed part of `s`.

   Returns NULL if nothing could be copied (no digits found, or the
   allocation failed); `*endpos` is then set to `s`.
*/
char *str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
                             char tsep, char sci) {
    char *pc = NULL;
    // First pass: measure how many source characters form the number.
    const char *end = scan_decimal_str_c(s, NULL, decimal, tsep, sci);

    if (end != NULL) {
        pc = malloc((size_t)(end - s) + 1);
        if (pc != NULL) {
            // Second pass: write the normalized copy.
            scan_decimal_str_c(s, pc, decimal, tsep, sci);
        }
    }
    if (endpos != NULL) {
        *endpos = (char *)(pc != NULL ? end : s);
    }
    return pc;
}
1890+
17811891
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17821892
int skip_trailing, int *error, int *maybe_int) {
1893+
char *pc = NULL;
1894+
// 'normalize' representation to C-locale; replace decimal with '.' and
1895+
// remove t(housand)sep.
1896+
char *endptr = NULL;
1897+
if (decimal != '.' || tsep != '\0') {
1898+
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
1899+
}
17831900
// This is called from a nogil block in parsers.pyx
17841901
// so need to explicitly get GIL before Python calls
17851902
PyGILState_STATE gstate;
17861903
gstate = PyGILState_Ensure();
1787-
1788-
double r = PyOS_string_to_double(p, q, 0);
1904+
double r;
1905+
if (pc != NULL) {
1906+
char *endpc = NULL;
1907+
r = PyOS_string_to_double(pc, &endpc, 0);
1908+
// PyOS_string_to_double needs to consume the whole string
1909+
if (endpc == pc + strlen(pc)) {
1910+
if (q != NULL) {
1911+
// report endptr from source string (p)
1912+
*q = (char *) endptr;
1913+
}
1914+
} else {
1915+
*error = -1;
1916+
if (q != NULL) {
1917+
// p and pc are different len due to tsep removal. Can't report
1918+
// how much it has consumed of p. Just rewind to beginning.
1919+
*q = (char *)p;
1920+
}
1921+
}
1922+
} else {
1923+
r = PyOS_string_to_double(p, q, 0);
1924+
}
17891925
if (maybe_int != NULL) *maybe_int = 0;
17901926
if (PyErr_Occurred() != NULL) *error = -1;
17911927
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17921928
PyErr_Clear();
17931929

17941930
PyGILState_Release(gstate);
1931+
free(pc);
17951932
return r;
17961933
}
17971934

pandas/tests/io/parser/test_c_parser_only.py

+43
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,46 @@ def test_unix_style_breaks(c_parser_only):
606606
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
607607
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
608608
tm.assert_frame_equal(result, expected)
609+
610+
611+
@pytest.mark.parametrize(
    "data,thousands,decimal,float_precision",
    [
        (csv_text, thousands_sep, decimal_mark, precision)
        for csv_text, thousands_sep, decimal_mark in [
            (
                """A|B|C
1|2,334.01|5
10|13|10.
""",
                ",",
                ".",
            ),
            (
                """A|B|C
1|2.334,01|5
10|13|10,
""",
                ".",
                ",",
            ),
        ]
        for precision in [None, "high", "round_trip"]
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    """Thousands/decimal separators parse the same for every float_precision.

    Both separator conventions ("," + "." and "." + ",") must yield the same
    frame with each of the three ``float_precision`` settings.
    """
    result = c_parser_only.read_csv(
        StringIO(data),
        sep="|",
        thousands=thousands,
        decimal=decimal,
        float_precision=float_precision,
    )

    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)