Skip to content

Commit 7566358

Browse files
committed
Handle decimal and tsep in round_trip converter (pandas-dev#35365)
When a non-C-locale decimal point or a thousands separator (tsep) is in use, copy and fix up the source string before passing it to PyOS_string_to_double.
1 parent e673b69 commit 7566358

File tree

2 files changed

+148
-2
lines changed

2 files changed

+148
-2
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1778,20 +1778,130 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781+
/* Append `c` through `*dst` and advance it; does nothing while `*dst` is NULL
   (i.e. during the measuring pass). */
static void decimal_str_put(char **dst, char c) {
    if (*dst != NULL) {
        *(*dst)++ = c;
    }
}

/* Consume a run of ASCII digits starting at `p`. After every digit, one
   directly following `tsep` is skipped (never when `tsep` is '\0'). Digits are
   appended through `*dst`. Returns the first unconsumed position. */
static const char *decimal_str_digits(const char *p, char **dst, char tsep) {
    while (isdigit_ascii(*p)) {
        decimal_str_put(dst, *p);
        p++;
        if (tsep != '\0' && *p == tsep) {
            p++;
        }
    }
    return p;
}

/* Walk the number at `s` once. With `dst == NULL` this only measures how far
   the number extends; otherwise the C-locale form is written to `dst` and
   NUL-terminated. The output never needs more than (consumed length + 1)
   bytes because every consumed character yields at most one output character.
   Returns the first unconsumed position in `s`. */
static const char *decimal_str_walk(const char *s, char *dst, char decimal,
                                    char tsep, char sci) {
    const char *p = s;

    // Leading sign
    if (*p == '+' || *p == '-') {
        decimal_str_put(&dst, *p);
        p++;
    }
    // Integer part
    p = decimal_str_digits(p, &dst, tsep);
    // Fractional part, replacing `decimal` with '.'.
    // The '\0' test keeps a `decimal` of '\0' from matching the terminator
    // and stepping past the end of the string.
    if (*p != '\0' && *p == decimal) {
        decimal_str_put(&dst, '.');
        p++;
        p = decimal_str_digits(p, &dst, tsep);
    }
    // Exponent part, replacing `sci` (matched case-insensitively) with 'e'.
    // Same terminator guard as above for a `sci` of '\0'.
    if (*p != '\0' && toupper_ascii(*p) == toupper_ascii(sci)) {
        decimal_str_put(&dst, 'e');
        p++;
        // Exponent sign
        if (*p == '+' || *p == '-') {
            decimal_str_put(&dst, *p);
            p++;
        }
        // Exponent digits
        p = decimal_str_digits(p, &dst, tsep);
    }

    if (dst != NULL) {
        *dst = '\0';
    }
    return p;
}

/*
 * Copy a decimal number string that uses `decimal` as decimal point, `tsep`
 * as thousands separator and `sci` as exponent character into an equivalent
 * C-locale decimal string (stripping `tsep`, replacing `decimal` with '.'
 * and `sci` with 'e').
 *
 * Accepted shape: [sign] digits [decimal digits] [sci [sign] digits], where a
 * single `tsep` may follow any digit. Scanning stops at the first character
 * that does not fit this shape.
 *
 * s        NUL-terminated source text.
 * endpos   if non-NULL, receives the position in `s` just past the consumed
 *          characters; set to `s` itself when allocation fails.
 * decimal  decimal point character used in `s`.
 * tsep     thousands separator used in `s`; '\0' disables separator skipping.
 * sci      exponent character used in `s` (either case is accepted).
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if the buffer could not be allocated.
 */
char *str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
                             char tsep, char sci) {
    // First pass: find out how many characters of `s` belong to the number.
    const char *end = decimal_str_walk(s, NULL, decimal, tsep, sci);
    const size_t consumed = (size_t)(end - s);

    char *pc = malloc(consumed + 1);
    if (pc == NULL) {
        if (endpos != NULL) {
            *endpos = (char *)s;
        }
        return NULL;
    }

    // Second pass: same walk, this time emitting the normalized text.
    decimal_str_walk(s, pc, decimal, tsep, sci);

    if (endpos != NULL) {
        *endpos = (char *)end;
    }
    return pc;
}
1863+
17811864
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17821865
int skip_trailing, int *error, int *maybe_int) {
1866+
char *pc = NULL;
1867+
// 'normalize' representation to C-locale; replace decimal with '.' and
1868+
// remove t(housand)sep.
1869+
char *endptr = NULL;
1870+
if (decimal != '.' || tsep != '\0') {
1871+
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
1872+
}
17831873
// This is called from a nogil block in parsers.pyx
17841874
// so need to explicitly get GIL before Python calls
17851875
PyGILState_STATE gstate;
17861876
gstate = PyGILState_Ensure();
1787-
1788-
double r = PyOS_string_to_double(p, q, 0);
1877+
double r;
1878+
if (pc != NULL) {
1879+
char *endpc = NULL;
1880+
r = PyOS_string_to_double(pc, &endpc, 0);
1881+
// PyOS_string_to_double needs to consume the whole string
1882+
if (endpc == pc + strlen(pc)) {
1883+
if (q != NULL) {
1884+
// report endptr from source string (p)
1885+
*q = (char *) endptr;
1886+
}
1887+
} else {
1888+
*error = -1;
1889+
if (q != NULL) {
1890+
// p and pc are different len due to tsep removal. Can't report
1891+
// how much it has consumed of p. Just rewind to beginning.
1892+
*q = (char *)p;
1893+
}
1894+
}
1895+
} else {
1896+
r = PyOS_string_to_double(p, q, 0);
1897+
}
17891898
if (maybe_int != NULL) *maybe_int = 0;
17901899
if (PyErr_Occurred() != NULL) *error = -1;
17911900
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17921901
PyErr_Clear();
17931902

17941903
PyGILState_Release(gstate);
1904+
free(pc);
17951905
return r;
17961906
}
17971907

pandas/tests/io/parser/test_c_parser_only.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,39 @@ def test_unix_style_breaks(c_parser_only):
606606
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
607607
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
608608
tm.assert_frame_equal(result, expected)
609+
610+
@pytest.mark.parametrize(
    "data,thousands,decimal,float_precision",
    [
        (csv_text, thousands_sep, decimal_mark, precision)
        for csv_text, thousands_sep, decimal_mark in [
            (
                """A|B|C
1|2,334.01|5
10|13|10.
""",
                ",",
                ".",
            ),
            (
                """A|B|C
1|2.334,01|5
10|13|10,
""",
                ".",
                ",",
            ),
        ]
        for precision in [None, "high", "round_trip"]
    ],
)
def test_1000_sep_with_decimal(c_parser_only, data, thousands, decimal, float_precision):
    """Thousands and decimal separators must be honoured for every float_precision.

    Each input layout (',' thousands with '.' decimal, and the reverse) is read
    with each of the three float_precision settings and must produce the same
    frame.
    """
    read_options = {
        "sep": "|",
        "thousands": thousands,
        "decimal": decimal,
        "float_precision": float_precision,
    }
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    result = c_parser_only.read_csv(StringIO(data), **read_options)

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)