[FIX] Handle decimal and thousand separator in 'round_trip' converer (#35377)

ales-erjavec · web-flow · commit c87e40c23575 · 2020-08-10T20:36:26.000-04:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -155,6 +155,7 @@ I/O
 ^^^
 
 - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
+- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
 -
 
 Plotting
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1778,20 +1778,73 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     return number;
 }
 
+/* copy a decimal number string with `decimal`, `tsep` as decimal point
+   and thousands separator to an equivalent c-locale decimal string (striping
+   `tsep`, replacing `decimal` with '.'). The returned memory should be free-d
+   with a call to `free`.
+*/
+
+char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
+                              char tsep) {
+    const char *p = s;
+    size_t length = strlen(s);
+    char *s_copy = malloc(length + 1);
+    char *dst = s_copy;
+    // Copy Leading sign
+    if (*p == '+' || *p == '-') {
+        *dst++ = *p++;
+    }
+    // Copy integer part dropping `tsep`
+    while (isdigit_ascii(*p)) {
+        *dst++ = *p++;
+        p += (tsep != '\0' && *p == tsep);
+    }
+    // Replace `decimal` with '.'
+    if (*p == decimal) {
+       *dst++ = '.';
+       p++;
+    }
+    // Copy the remainder of the string as is.
+    strncpy(dst, p, length + 1 - (p - s));
+    if (endpos != NULL)
+        *endpos = (char *)(s + length);
+    return s_copy;
+}
+
+
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
+    // 'normalize' representation to C-locale; replace decimal with '.' and
+    // remove t(housand)sep.
+    char *endptr;
+    char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
-
-    double r = PyOS_string_to_double(p, q, 0);
+    char *endpc;
+    double r = PyOS_string_to_double(pc, &endpc, 0);
+    // PyOS_string_to_double needs to consume the whole string
+    if (endpc == pc + strlen(pc)) {
+        if (q != NULL) {
+           // report endptr from source string (p)
+            *q = (char *) endptr;
+        }
+    } else {
+        *error = -1;
+        if (q != NULL) {
+           // p and pc are different len due to tsep removal. Can't report
+           // how much it has consumed of p. Just rewind to beginning.
+            *q = (char *)p;
+        }
+    }
     if (maybe_int != NULL) *maybe_int = 0;
     if (PyErr_Occurred() != NULL) *error = -1;
     else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
     PyErr_Clear();
 
     PyGILState_Release(gstate);
+    free(pc);
     return r;
 }
 
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -606,3 +606,101 @@ def test_unix_style_breaks(c_parser_only):
         result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
     expected = DataFrame(columns=["col_1", "col_2", "col_3"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
+@pytest.mark.parametrize(
+    "data,thousands,decimal",
+    [
+        (
+            """A|B|C
+1|2,334.01|5
+10|13|10.
+""",
+            ",",
+            ".",
+        ),
+        (
+            """A|B|C
+1|2.334,01|5
+10|13|10,
+""",
+            ".",
+            ",",
+        ),
+    ],
+)
+def test_1000_sep_with_decimal(
+    c_parser_only, data, thousands, decimal, float_precision
+):
+    parser = c_parser_only
+    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
+
+    result = parser.read_csv(
+        StringIO(data),
+        sep="|",
+        thousands=thousands,
+        decimal=decimal,
+        float_precision=float_precision,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "float_precision", [None, "high", "round_trip"],
+)
+@pytest.mark.parametrize(
+    "value,expected",
+    [
+        ("-1,0", -1.0),
+        ("-1,2e0", -1.2),
+        ("-1e0", -1.0),
+        ("+1e0", 1.0),
+        ("+1e+0", 1.0),
+        ("+1e-1", 0.1),
+        ("+,1e1", 1.0),
+        ("+1,e0", 1.0),
+        ("-,1e1", -1.0),
+        ("-1,e0", -1.0),
+        ("0,1", 0.1),
+        ("1,", 1.0),
+        (",1", 0.1),
+        ("-,1", -0.1),
+        ("1_,", 1.0),
+        ("1_234,56", 1234.56),
+        ("1_234,56e0", 1234.56),
+        # negative cases; must not parse as float
+        ("_", "_"),
+        ("-_", "-_"),
+        ("-_1", "-_1"),
+        ("-_1e0", "-_1e0"),
+        ("_1", "_1"),
+        ("_1,", "_1,"),
+        ("_1,_", "_1,_"),
+        ("_1e0", "_1e0"),
+        ("1,2e_1", "1,2e_1"),
+        ("1,2e1_0", "1,2e1_0"),
+        ("1,_2", "1,_2"),
+        (",1__2", ",1__2"),
+        (",1e", ",1e"),
+        ("-,1e", "-,1e"),
+        ("1_000,000_000", "1_000,000_000"),
+        ("1,e1_2", "1,e1_2"),
+    ],
+)
+def test_1000_sep_decimal_float_precision(
+    c_parser_only, value, expected, float_precision
+):
+    # test decimal and thousand sep handling in across 'float_precision'
+    # parsers
+    parser = c_parser_only
+    df = parser.read_csv(
+        StringIO(value),
+        sep="|",
+        thousands="_",
+        decimal=",",
+        header=None,
+        float_precision=float_precision,
+    )
+    val = df.iloc[0, 0]
+    assert val == expected

Original file line number	Diff line number	Diff line change
`@@ -155,6 +155,7 @@ I/O`
`155`	`155`	`^^^`
`156`	`156`
`157`	`157`	- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
	`158`	+- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
`158`	`159`	`-`
`159`	`160`
`160`	`161`	`Plotting`