Skip to content

Commit 5fab153

Browse files
committed
Simplify str_copy_decimal_str.
1 parent dcbd5fd commit 5fab153

File tree

1 file changed

+24
-103
lines changed

1 file changed

+24
-103
lines changed

pandas/_libs/src/parser/tokenizer.c

+24-103
Original file line numberDiff line numberDiff line change
@@ -1778,128 +1778,49 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17781778
return number;
17791779
}
17801780

1781-
/* copy a decimal number string in form `decimal` and `tsep` and `sci` as
1782-
decimal point, thousands separator and sci exponent character to an
1783-
equivalent c-locale decimal string (stripping tsep, replacing `decimal`
1784-
with '.'). Return NULL if nothing could be copied.
1781+
/* copy a decimal number string with `decimal`, `tsep` as decimal point
1782+
and thousands separator to an equivalent c-locale decimal string (stripping
1783+
`tsep`, replacing `decimal` with '.'). The returned memory should be freed
1784+
with a call to `free`.
17851785
*/
17861786

1787-
char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
1788-
char tsep, char sci) {
1789-
#define IS_TSEP(c) (tsep != '\0' && c == tsep)
1790-
ssize_t size = 0;
1791-
ssize_t num_digits = 0;
1792-
char has_exponent = 0;
1787+
char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
1788+
char tsep) {
17931789
const char *p = s;
1794-
// First count how many characters we can consume.
1795-
// Leading sign
1796-
if (*p == '+' || *p == '-') p++;
1797-
// Integer part
1798-
while (isdigit_ascii(*p)) {
1799-
p++;
1800-
p += IS_TSEP(*p);
1801-
num_digits++;
1802-
}
1803-
// Fractional part
1804-
if (*p == decimal) {
1805-
p++;
1806-
while (isdigit_ascii(*p)) {
1807-
p++;
1808-
num_digits++;
1809-
}
1810-
}
1811-
if (num_digits == 0) {
1812-
if (endpos != NULL) {
1813-
*endpos = (char *)s;
1814-
}
1815-
return NULL;
1816-
}
1817-
// Exponent part
1818-
if (toupper_ascii(*p) == toupper_ascii(sci)) {
1819-
const char * p_at_e = p;
1820-
num_digits = 0;
1821-
p++;
1822-
// Exponent sign
1823-
if (*p == '+' || *p == '-') p++;
1824-
// Exponent
1825-
while (isdigit_ascii(*p)) {
1826-
p++;
1827-
num_digits++;
1828-
}
1829-
if (num_digits == 0) {
1830-
// no digits after exponent; un-consume the (+|-)?
1831-
p = p_at_e;
1832-
has_exponent = 0;
1833-
} else {
1834-
has_exponent = 1;
1835-
}
1836-
}
1837-
1838-
size = p - s;
1839-
char *pc = malloc(size + 1);
1840-
memcpy(pc, p, size);
1841-
pc[size] = '\0';
1842-
char *dst = pc;
1843-
p = s;
1844-
num_digits = 0;
1845-
// Copy leading sign
1790+
size_t length = strlen(s);
1791+
char *s_copy = malloc(length + 1);
1792+
char *dst = s_copy;
1793+
// Copy leading sign
18461794
if (*p == '+' || *p == '-') {
18471795
*dst++ = *p++;
18481796
}
1849-
// Copy integer part
1797+
// Copy integer part dropping `tsep`
18501798
while (isdigit_ascii(*p)) {
18511799
*dst++ = *p++;
1852-
p += IS_TSEP(*p);
1853-
num_digits++;
1800+
p += (tsep != '\0' && *p == tsep);
18541801
}
1855-
// Copy fractional part, replacing `decimal` with '.'
1802+
// Replace `decimal` with '.'
18561803
if (*p == decimal) {
1857-
*dst++ = '.';
1858-
p++;
1859-
while (isdigit_ascii(*p)) {
1860-
*dst++ = *p++;
1861-
num_digits++;
1862-
}
1863-
}
1864-
assert(num_digits > 0);
1865-
// Copy exponent
1866-
if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) {
1867-
num_digits = 0;
1868-
*dst++ = *p++;
1869-
// Copy leading exponent sign
1870-
if (*p == '+' || *p == '-') {
1871-
*dst++ = *p++;
1872-
}
1873-
// Exponent
1874-
while (isdigit_ascii(*p)) {
1875-
*dst++ = *p++;
1876-
num_digits++;
1877-
}
1878-
assert(num_digits > 0);
1879-
}
1880-
*dst = '\0';
1881-
if (endpos != NULL) {
1882-
*endpos = (char *)p;
1883-
}
1884-
return pc;
1885-
#undef IS_TSEP
1804+
*dst++ = '.';
1805+
p++;
1806+
}
1807+
// Copy the remainder of the string as is.
1808+
memcpy(dst, p, length + 1 - (p - s));
1809+
s_copy[length] = '\0';
1810+
if (endpos != NULL)
1811+
*endpos = (char *)(s + length);
1812+
return s_copy;
18861813
}
18871814

1815+
18881816
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
18891817
int skip_trailing, int *error, int *maybe_int) {
18901818
char *pc = NULL;
18911819
// 'normalize' representation to C-locale; replace decimal with '.' and
18921820
// remove t(housand)sep.
18931821
char *endptr = NULL;
18941822
if (decimal != '.' || tsep != '\0') {
1895-
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
1896-
if (pc == NULL) {
1897-
if (q != NULL) {
1898-
*q = (char *)p;
1899-
}
1900-
*error = -1;
1901-
return 0.0;
1902-
}
1823+
pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
19031824
}
19041825
// This is called from a nogil block in parsers.pyx
19051826
// so need to explicitly get GIL before Python calls

0 commit comments

Comments
 (0)