From 2da9aa77f479a634bb6e82af2b636a8361bcf958 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Wed, 22 Jul 2020 12:46:15 +0200 Subject: [PATCH 1/7] Handle `decimal` and `tsep` in `round_trip` converter (#35365) In case of non c-locale decimal and tsep, copy and fixup the source string before passing it to PyOS_string_to_double --- pandas/_libs/src/parser/tokenizer.c | 145 ++++++++++++++++++- pandas/tests/io/parser/test_c_parser_only.py | 38 +++++ 2 files changed, 181 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a195c0daf5271..052f9427efd26 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1778,20 +1778,161 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, return number; } +/* copy a decimal number string in form `decimal` and `tsep` and `sci` as + decimal point, thousands separator and sci exponent character to a an + equivalent c-locale decimal string (striping tsep, replacing `decimal` + with '.'). Return NULL if nothing could be copied. +*/ + +char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal, + char tsep, char sci) { + #define IS_TSEP(c) (tsep != '\0' && c == tsep) + ssize_t size = 0; + ssize_t num_digits = 0; + char has_exponent = 0; + const char *p = s; + // First count how many characters we can consume. + // Leading sign + if (*p == '+' || *p == '-') p++; + // Integer part + while (isdigit_ascii(*p)) { + p++; + p += IS_TSEP(*p); + num_digits++; + } + // Fractional part + if (*p == decimal) { + p++; + while (isdigit_ascii(*p)) { + p++; + num_digits++; + } + } + if (num_digits == 0) { + if (endpos != NULL) { + *endpos = (char *)s; + } + return NULL; + } + // Exponent part + if (toupper_ascii(*p) == toupper_ascii(sci)) { + const char * p_at_e = p; + num_digits = 0; + p++; + // Exponent sign + if (*p == '+' || *p == '-') p++; + // Exponent + while (isdigit_ascii(*p)) { + p++; + num_digits++; + } + if (num_digits == 0) { + // no digits after exponent; un-consume the (+|-)? + p = p_at_e; + has_exponent = 0; + } else { + has_exponent = 1; + } + } + + size = p - s; + char *pc = malloc(size + 1); + memcpy(pc, p, size); + pc[size] = '\0'; + char *dst = pc; + p = s; + num_digits = 0; + // Copy leading sign + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Copy integer part + while (isdigit_ascii(*p)) { + *dst++ = *p++; + p += IS_TSEP(*p); + num_digits++; + } + // Copy factional part, replacing `decimal` with '.' + if (*p == decimal) { + *dst++ = '.'; + p++; + while (isdigit_ascii(*p)) { + *dst++ = *p++; + num_digits++; + } + } + assert(num_digits > 0); + // Copy exponent + if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) { + num_digits = 0; + *dst++ = *p++; + // Copy leading exponent sign + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Exponent + while (isdigit_ascii(*p)) { + *dst++ = *p++; + num_digits++; + } + assert(num_digits > 0); + } + *dst = '\0'; + if (endpos != NULL) { + *endpos = (char *)p; + } + return pc; + #undef IS_TSEP +} + double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { + char *pc = NULL; + // 'normalize' representation to C-locale; replace decimal with '.' and + // remove t(housand)sep. + char *endptr = NULL; + if (decimal != '.' || tsep != '\0') { + pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci); + if (pc == NULL) { + if (q != NULL) { + *q = (char *)p; + } + *error = -1; + return 0.0; + } + } // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate; gstate = PyGILState_Ensure(); - - double r = PyOS_string_to_double(p, q, 0); + double r; + if (pc != NULL) { + char *endpc = NULL; + r = PyOS_string_to_double(pc, &endpc, 0); + // PyOS_string_to_double needs to consume the whole string + if (endpc == pc + strlen(pc)) { + if (q != NULL) { + // report endptr from source string (p) + *q = (char *) endptr; + } + } else { + *error = -1; + if (q != NULL) { + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. + *q = (char *)p; + } + } + } else { + r = PyOS_string_to_double(p, q, 0); + } if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); + free(pc); return r; } diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index d76d01904731a..50911b934781e 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -606,3 +606,41 @@ def test_unix_style_breaks(c_parser_only): result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") expected = DataFrame(columns=["col_1", "col_2", "col_3"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. +""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal( + c_parser_only, data, thousands, decimal, float_precision +): + parser = c_parser_only + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), + sep="|", + thousands=thousands, + decimal=decimal, + float_precision=float_precision, + ) + tm.assert_frame_equal(result, expected) From dcd8de1c6ff9587b5f55d723010841b88cf70a4b Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Mon, 27 Jul 2020 15:55:28 +0200 Subject: [PATCH 2/7] Simplify str_copy_decimal_str. --- pandas/_libs/src/parser/tokenizer.c | 127 ++++++---------------------- 1 file changed, 24 insertions(+), 103 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 052f9427efd26..4da8a360e7422 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1778,113 +1778,41 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, return number; } -/* copy a decimal number string in form `decimal` and `tsep` and `sci` as - decimal point, thousands separator and sci exponent character to a an - equivalent c-locale decimal string (striping tsep, replacing `decimal` - with '.'). Return NULL if nothing could be copied. +/* copy a decimal number string with `decimal`, `tsep` as decimal point + and thousands separator to an equivalent c-locale decimal string (striping + `tsep`, replacing `decimal` with '.'). The returned memory should be free-d + with a call to `free`. */ -char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal, - char tsep, char sci) { - #define IS_TSEP(c) (tsep != '\0' && c == tsep) - ssize_t size = 0; - ssize_t num_digits = 0; - char has_exponent = 0; +char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, + char tsep) { const char *p = s; - // First count how many characters we can consume. - // Leading sign - if (*p == '+' || *p == '-') p++; - // Integer part - while (isdigit_ascii(*p)) { - p++; - p += IS_TSEP(*p); - num_digits++; - } - // Fractional part - if (*p == decimal) { - p++; - while (isdigit_ascii(*p)) { - p++; - num_digits++; - } - } - if (num_digits == 0) { - if (endpos != NULL) { - *endpos = (char *)s; - } - return NULL; - } - // Exponent part - if (toupper_ascii(*p) == toupper_ascii(sci)) { - const char * p_at_e = p; - num_digits = 0; - p++; - // Exponent sign - if (*p == '+' || *p == '-') p++; - // Exponent - while (isdigit_ascii(*p)) { - p++; - num_digits++; - } - if (num_digits == 0) { - // no digits after exponent; un-consume the (+|-)? - p = p_at_e; - has_exponent = 0; - } else { - has_exponent = 1; - } - } - - size = p - s; - char *pc = malloc(size + 1); - memcpy(pc, p, size); - pc[size] = '\0'; - char *dst = pc; - p = s; - num_digits = 0; - // Copy leading sign + size_t length = strlen(s); + char *s_copy = malloc(length + 1); + char *dst = s_copy; + // Copy Leading sign if (*p == '+' || *p == '-') { *dst++ = *p++; } - // Copy integer part + // Copy integer part dropping `tsep` while (isdigit_ascii(*p)) { *dst++ = *p++; - p += IS_TSEP(*p); - num_digits++; + p += (tsep != '\0' && *p == tsep); } - // Copy factional part, replacing `decimal` with '.' + // Replace `decimal` with '.' if (*p == decimal) { - *dst++ = '.'; - p++; - while (isdigit_ascii(*p)) { - *dst++ = *p++; - num_digits++; - } - } - assert(num_digits > 0); - // Copy exponent - if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) { - num_digits = 0; - *dst++ = *p++; - // Copy leading exponent sign - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Exponent - while (isdigit_ascii(*p)) { - *dst++ = *p++; - num_digits++; - } - assert(num_digits > 0); - } - *dst = '\0'; - if (endpos != NULL) { - *endpos = (char *)p; - } - return pc; - #undef IS_TSEP + *dst++ = '.'; + p++; + } + // Copy the remainder of the string as is. + memcpy(dst, p, length + 1 - (p - s)); + s_copy[length] = '\0'; + if (endpos != NULL) + *endpos = (char *)(s + length); + return s_copy; } + double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { char *pc = NULL; @@ -1892,14 +1820,7 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, // remove t(housand)sep. char *endptr = NULL; if (decimal != '.' || tsep != '\0') { - pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci); - if (pc == NULL) { - if (q != NULL) { - *q = (char *)p; - } - *error = -1; - return 0.0; - } + pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); } // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls From cd23ba10e087592945c503056a603acf7675a283 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Mon, 27 Jul 2020 16:13:46 +0200 Subject: [PATCH 3/7] Add test for float_precision parsers equality --- pandas/tests/io/parser/test_c_parser_only.py | 60 ++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 50911b934781e..50179fc1ec4b8 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -644,3 +644,63 @@ def test_1000_sep_with_decimal( float_precision=float_precision, ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "float_precision", [None, "high", "round_trip"], +) +@pytest.mark.parametrize( + "value,expected", + [ + ("-1,0", -1.0), + ("-1,2e0", -1.2), + ("-1e0", -1.0), + ("+1e0", 1.0), + ("+1e+0", 1.0), + ("+1e-1", 0.1), + ("+,1e1", 1.0), + ("+1,e0", 1.0), + ("-,1e1", -1.0), + ("-1,e0", -1.0), + ("0,1", 0.1), + ("1,", 1.0), + (",1", 0.1), + ("-,1", -0.1), + ("1_,", 1.0), + ("1_234,56", 1234.56), + ("1_234,56e0", 1234.56), + # negative cases; must not parse as float + ("_", "_"), + ("-_", "-_"), + ("-_1", "-_1"), + ("-_1e0", "-_1e0"), + ("_1", "_1"), + ("_1,", "_1,"), + ("_1,_", "_1,_"), + ("_1e0", "_1e0"), + ("1,2e_1", "1,2e_1"), + ("1,2e1_0", "1,2e1_0"), + ("1,_2", "1,_2"), + (",1__2", ",1__2"), + (",1e", ",1e"), + ("-,1e", "-,1e"), + ("1_000,000_000", "1_000,000_000"), + ("1,e1_2", "1,e1_2"), + ], +) +def test_1000_sep_decimal_float_precision( + c_parser_only, value, expected, float_precision +): + # test decimal and thousand sep handling in across 'float_precision' + # parsers + parser = c_parser_only + df = parser.read_csv( + StringIO(value), + sep="|", + thousands="_", + decimal=",", + header=None, + float_precision=float_precision, + ) + val = df.iloc[0, 0] + assert val == expected From bee7fdfb776dd45f83b28e498d2af22a76130d41 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Wed, 5 Aug 2020 10:52:25 +0200 Subject: [PATCH 4/7] Simplify round_trip converter Always make a copy of input string. --- pandas/_libs/src/parser/tokenizer.c | 36 +++++++++++------------------ 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4da8a360e7422..ccd7ad34055cc 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1815,37 +1815,29 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { - char *pc = NULL; // 'normalize' representation to C-locale; replace decimal with '.' and // remove t(housand)sep. char *endptr = NULL; - if (decimal != '.' || tsep != '\0') { - pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); - } + char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate; gstate = PyGILState_Ensure(); - double r; - if (pc != NULL) { - char *endpc = NULL; - r = PyOS_string_to_double(pc, &endpc, 0); - // PyOS_string_to_double needs to consume the whole string - if (endpc == pc + strlen(pc)) { - if (q != NULL) { - // report endptr from source string (p) - *q = (char *) endptr; - } - } else { - *error = -1; - if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; - } + char *endpc = NULL; + double r = PyOS_string_to_double(pc, &endpc, 0); + // PyOS_string_to_double needs to consume the whole string + if (endpc == pc + strlen(pc)) { + if (q != NULL) { + // report endptr from source string (p) + *q = (char *) endptr; } } else { - r = PyOS_string_to_double(p, q, 0); + *error = -1; + if (q != NULL) { + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. + *q = (char *)p; + } } if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; From 66f8be525a9667af423fd68cd43b699262056a53 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Mon, 10 Aug 2020 09:27:24 +0200 Subject: [PATCH 5/7] Use strncpy --- pandas/_libs/src/parser/tokenizer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index ccd7ad34055cc..adb25df9d2c87 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1805,8 +1805,7 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, p++; } // Copy the remainder of the string as is. - memcpy(dst, p, length + 1 - (p - s)); - s_copy[length] = '\0'; + strncpy(dst, p, length + 1 - (p - s)); if (endpos != NULL) *endpos = (char *)(s + length); return s_copy; From 42c509c46be6cae87c14c3c5f24b4a9e964270c4 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Mon, 10 Aug 2020 09:37:56 +0200 Subject: [PATCH 6/7] Just define the endptr and endpc --- pandas/_libs/src/parser/tokenizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index adb25df9d2c87..df8ec68986ccb 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1816,13 +1816,13 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove t(housand)sep. - char *endptr = NULL; + char *endptr; char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate; gstate = PyGILState_Ensure(); - char *endpc = NULL; + char *endpc; double r = PyOS_string_to_double(pc, &endpc, 0); // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { From 8703ad6e9c6fe81a83a6778c5bab0b7e22287434 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Mon, 10 Aug 2020 10:53:09 +0200 Subject: [PATCH 7/7] Add whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 33e70daa55e66..2ee15c506d033 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -141,6 +141,7 @@ I/O ^^^ - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) +- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - Plotting