Skip to content

Commit dcbd5fd

Browse files
committed
Handle decimal and tsep in round_trip converter (#35365)
When the decimal point or thousands separator differs from the C locale, copy and fix up the source string before passing it to PyOS_string_to_double.
1 parent 04e9e0a commit dcbd5fd

File tree

2 files changed

+181
-2
lines changed

2 files changed

+181
-2
lines changed

pandas/_libs/src/parser/tokenizer.c

+143-2
Original file line numberDiff line numberDiff line change
@@ -1778,20 +1778,161 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781+
/*
 * Copy the decimal number at the start of `s` into a newly allocated,
 * NUL-terminated string in C-locale form.
 *
 * The input uses `decimal` as the decimal point, `tsep` as the thousands
 * separator ('\0' means "no separator") and `sci` as the exponent character
 * (compared case-insensitively). In the copy, thousands separators are
 * stripped and `decimal` is replaced with '.'; the sign, digits and exponent
 * are copied verbatim.
 *
 * Accepted shape: [+-] digits-with-optional-tsep [decimal digits]
 *                 [sci [+-] digits]
 * An exponent is only consumed when at least one digit follows it.
 *
 * If `endpos` is not NULL, `*endpos` is set to the first character of `s`
 * that was not consumed (or to `s` itself on failure).
 *
 * Returns a malloc'ed buffer that the caller must free(), or NULL if no
 * digits could be consumed or the allocation failed.
 */
char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
                             char tsep, char sci) {
#define IS_TSEP(c) (tsep != '\0' && (c) == tsep)
    size_t num_digits = 0;
    char has_exponent = 0;
    const char *p = s;

    // Pass 1: find out how many characters of `s` belong to the number.
    // Leading sign
    if (*p == '+' || *p == '-') p++;
    // Integer part; a single separator is skipped after each digit.
    while (isdigit_ascii(*p)) {
        p++;
        p += IS_TSEP(*p);
        num_digits++;
    }
    // Fractional part
    if (*p == decimal) {
        p++;
        while (isdigit_ascii(*p)) {
            p++;
            num_digits++;
        }
    }
    if (num_digits == 0) {
        if (endpos != NULL) {
            *endpos = (char *)s;
        }
        return NULL;
    }
    // Exponent part
    if (toupper_ascii(*p) == toupper_ascii(sci)) {
        const char *p_at_e = p;
        num_digits = 0;
        p++;
        // Exponent sign
        if (*p == '+' || *p == '-') p++;
        // Exponent digits
        while (isdigit_ascii(*p)) {
            p++;
            num_digits++;
        }
        if (num_digits == 0) {
            // No digits after the exponent character: un-consume it and
            // any sign that followed.
            p = p_at_e;
            has_exponent = 0;
        } else {
            has_exponent = 1;
        }
    }

    // The normalized copy is never longer than the consumed span, because
    // separators are dropped and everything else maps one-to-one.
    const size_t size = (size_t)(p - s);
    char *pc = malloc(size + 1);
    if (pc == NULL) {
        // Out of memory: report failure the same way as "nothing parsed".
        if (endpos != NULL) {
            *endpos = (char *)s;
        }
        return NULL;
    }
    // Seed the buffer from the START of the source (`s`). `p` points at the
    // end of the consumed span here, so copying from it would read past the
    // parsed number.
    memcpy(pc, s, size);
    pc[size] = '\0';

    // Pass 2: re-walk the source, writing the normalized characters.
    char *dst = pc;
    p = s;
    num_digits = 0;
    // Copy leading sign
    if (*p == '+' || *p == '-') {
        *dst++ = *p++;
    }
    // Copy integer part, dropping thousands separators
    while (isdigit_ascii(*p)) {
        *dst++ = *p++;
        p += IS_TSEP(*p);
        num_digits++;
    }
    // Copy fractional part, replacing `decimal` with '.'
    if (*p == decimal) {
        *dst++ = '.';
        p++;
        while (isdigit_ascii(*p)) {
            *dst++ = *p++;
            num_digits++;
        }
    }
    assert(num_digits > 0);
    // Copy exponent (only if pass 1 accepted it)
    if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) {
        num_digits = 0;
        *dst++ = *p++;
        // Copy exponent sign
        if (*p == '+' || *p == '-') {
            *dst++ = *p++;
        }
        // Copy exponent digits
        while (isdigit_ascii(*p)) {
            *dst++ = *p++;
            num_digits++;
        }
        assert(num_digits > 0);
    }
    *dst = '\0';
    if (endpos != NULL) {
        *endpos = (char *)p;
    }
    return pc;
#undef IS_TSEP
}
1887+
17811888
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17821889
int skip_trailing, int *error, int *maybe_int) {
1890+
char *pc = NULL;
1891+
// 'normalize' representation to C-locale; replace decimal with '.' and
1892+
// remove t(housand)sep.
1893+
char *endptr = NULL;
1894+
if (decimal != '.' || tsep != '\0') {
1895+
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
1896+
if (pc == NULL) {
1897+
if (q != NULL) {
1898+
*q = (char *)p;
1899+
}
1900+
*error = -1;
1901+
return 0.0;
1902+
}
1903+
}
17831904
// This is called from a nogil block in parsers.pyx
17841905
// so need to explicitly get GIL before Python calls
17851906
PyGILState_STATE gstate;
17861907
gstate = PyGILState_Ensure();
1787-
1788-
double r = PyOS_string_to_double(p, q, 0);
1908+
double r;
1909+
if (pc != NULL) {
1910+
char *endpc = NULL;
1911+
r = PyOS_string_to_double(pc, &endpc, 0);
1912+
// PyOS_string_to_double needs to consume the whole string
1913+
if (endpc == pc + strlen(pc)) {
1914+
if (q != NULL) {
1915+
// report endptr from source string (p)
1916+
*q = (char *) endptr;
1917+
}
1918+
} else {
1919+
*error = -1;
1920+
if (q != NULL) {
1921+
// p and pc are different len due to tsep removal. Can't report
1922+
// how much it has consumed of p. Just rewind to beginning.
1923+
*q = (char *)p;
1924+
}
1925+
}
1926+
} else {
1927+
r = PyOS_string_to_double(p, q, 0);
1928+
}
17891929
if (maybe_int != NULL) *maybe_int = 0;
17901930
if (PyErr_Occurred() != NULL) *error = -1;
17911931
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17921932
PyErr_Clear();
17931933

17941934
PyGILState_Release(gstate);
1935+
free(pc);
17951936
return r;
17961937
}
17971938

pandas/tests/io/parser/test_c_parser_only.py

+38
Original file line numberDiff line numberDiff line change
@@ -606,3 +606,41 @@ def test_unix_style_breaks(c_parser_only):
606606
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
607607
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
608608
tm.assert_frame_equal(result, expected)
609+
610+
611+
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    """Thousands and decimal separators parse alike for every float converter.

    Both parametrized inputs encode the same numbers, once with ',' as the
    thousands separator and '.' as the decimal point, and once with the two
    swapped. Each ``float_precision`` setting must produce the same frame.
    """
    read_options = {
        "sep": "|",
        "thousands": thousands,
        "decimal": decimal,
        "float_precision": float_precision,
    }
    want = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    got = c_parser_only.read_csv(StringIO(data), **read_options)

    tm.assert_frame_equal(got, want)

0 commit comments

Comments
 (0)