Handle decimal and tsep in round_trip converter (pandas-dev#35365)

ales-erjavec · ales-erjavec · commit 7b2c77cadf5c · 2020-07-22T14:16:12.000+02:00
When `decimal` or `tsep` differs from the C locale, copy and fix up the source
string before passing it to PyOS_string_to_double.
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1778,20 +1778,132 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     return number;
 }
 
+/* Copy the leading decimal number of `s` to a malloc'ed C-locale string:
+   strip `tsep`, replace `decimal` with '.' and `sci` with 'e'. If `endpos` is
+   not NULL, *endpos is set to the first character of `s` not consumed.
+   Returns NULL (without touching *endpos) when the allocation fails;
+   otherwise the caller owns the returned buffer and must free() it.
+*/
+char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep, char sci) {
+    #define IS_TSEP(c) (tsep != '\0' && (c) == tsep)
+    const char *p = s;
+    // First pass: count how many characters of `s` form the number. This is
+    // an upper bound for the copy, because the copy only drops separators.
+    // Leading sign
+    if (*p == '+' || *p == '-') p++;
+    // Integer part
+    while (isdigit_ascii(*p)) {
+        p++;
+        p += IS_TSEP(*p);
+    }
+    // Fractional part
+    if (*p == decimal) {
+        p++;
+        while (isdigit_ascii(*p)) {
+            p++;
+            p += IS_TSEP(*p);
+        }
+    }
+    // Exponent part
+    if (toupper_ascii(*p) == toupper_ascii(sci)) {
+        p++;
+        // Exponent sign
+        if (*p == '+' || *p == '-') p++;
+        // Exponent
+        while (isdigit_ascii(*p)) {
+            p++;
+            p += IS_TSEP(*p);
+        }
+    }
+
+    size_t size = (size_t)(p - s);
+    char *pc = malloc(size + 1);
+    // No pre-fill needed: the second pass writes and terminates the copy.
+    if (pc == NULL) return NULL;
+    char *dst = pc;
+    p = s;
+
+    // Copy leading sign
+    if (*p == '+' || *p == '-') {
+        *dst++ = *p++;
+    }
+    // Copy integer part
+    while (isdigit_ascii(*p)) {
+        *dst++ = *p++;
+        p += IS_TSEP(*p);
+    }
+    // Copy fractional part, replacing `decimal` with '.'
+    if (*p == decimal) {
+        *dst++ = '.';
+        p++;
+        while (isdigit_ascii(*p)) {
+            *dst++ = *p++;
+            p += IS_TSEP(*p);
+        }
+    }
+    // Copy exponent replacing `sci` with 'e'
+    if (toupper_ascii(*p) == toupper_ascii(sci)) {
+        *dst++ = 'e';
+        p++;
+        // Copy leading exponent sign
+        if (*p == '+' || *p == '-') {
+            *dst++ = *p++;
+        }
+        // Exponent
+        while (isdigit_ascii(*p)) {
+            *dst++ = *p++;
+            p += IS_TSEP(*p);
+        }
+    }
+    *dst = '\0';
+    if (endpos != NULL) {
+        *endpos = (char *)p;
+    }
+    return pc;
+    #undef IS_TSEP
+}
+
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
+    char *pc = NULL;
+    // Normalize to C-locale in a heap copy (decimal -> '.', tsep removed);
+    // endptr records where the number ends in the original string p.
+    char *endptr = NULL;
+    if (decimal != '.' || tsep != '\0') {
+        pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
+    }
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
-
-    double r = PyOS_string_to_double(p, q, 0);
+    double r;
+    if (pc != NULL) {
+        char *endpc = NULL;
+        r = PyOS_string_to_double(pc, &endpc, 0);
+        // The copy holds only the number, so a valid parse consumes all of it
+        if (endpc == pc + strlen(pc)) {
+            if (q != NULL) {
+               // report the end position in the source string (p), not in pc
+                *q = (char *) endptr;
+            }
+        } else {
+            *error = -1;
+            if (q != NULL) {
+               // p and pc differ in length due to tsep removal, so an offset
+               // in pc cannot be mapped back to p. Rewind to the beginning.
+                *q = (char *)p;
+            }
+        }
+    } else {
+        r = PyOS_string_to_double(p, q, 0);
+    }
     if (maybe_int != NULL) *maybe_int = 0;
     if (PyErr_Occurred() != NULL) *error = -1;
     else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
     PyErr_Clear();
 
     PyGILState_Release(gstate);
+    free(pc);  // no-op when no normalized copy was made (pc == NULL)
     return r;
 }
 
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -606,3 +606,46 @@ def test_unix_style_breaks(c_parser_only):
         result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
     expected = DataFrame(columns=["col_1", "col_2", "col_3"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data,thousands,decimal,float_precision",
+    [
+        (  # ',' as thousands separator, '.' as decimal point
+            """A|B|C
+1|2,334.01|5
+10|13|10.
+""",
+            ",",  # thousands
+            ".",  # decimal
+            prec,
+        )
+        for prec in [None, "high", "round_trip"]  # one case per float_precision
+    ]
+    + [
+        (  # '.' as thousands separator, ',' as decimal point
+            """A|B|C
+1|2.334,01|5
+10|13|10,
+""",
+            ".",  # thousands
+            ",",  # decimal
+            prec,
+        )
+        for prec in [None, "high", "round_trip"]  # one case per float_precision
+    ],
+)
+def test_1000_sep_with_decimal(
+    c_parser_only, data, thousands, decimal, float_precision
+):
+    parser = c_parser_only
+    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})  # same for both datasets
+
+    result = parser.read_csv(
+        StringIO(data),
+        sep="|",
+        thousands=thousands,
+        decimal=decimal,
+        float_precision=float_precision,
+    )
+    tm.assert_frame_equal(result, expected)