From 2da9aa77f479a634bb6e82af2b636a8361bcf958 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Wed, 22 Jul 2020 12:46:15 +0200
Subject: [PATCH 1/7] Handle `decimal` and `tsep` in `round_trip` converter
 (#35365)

In case of non c-locale decimal and tsep, copy and fixup the source
string before passing it to PyOS_string_to_double
---
 pandas/_libs/src/parser/tokenizer.c          | 145 ++++++++++++++++++-
 pandas/tests/io/parser/test_c_parser_only.py |  38 +++++
 2 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a195c0daf5271..052f9427efd26 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1778,20 +1778,161 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     return number;
 }
 
+/* copy a decimal number string in form `decimal` and `tsep` and `sci` as
+   decimal point, thousands separator and sci exponent character to a an
+   equivalent c-locale decimal string (striping tsep, replacing `decimal`
+   with '.'). Return NULL if nothing could be copied.
+*/
+
+char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
+                             char tsep, char sci) {
+    #define IS_TSEP(c) (tsep != '\0' && c == tsep)
+    ssize_t size = 0;
+    ssize_t num_digits = 0;
+    char has_exponent = 0;
+    const char *p = s;
+    // First count how many characters we can consume.
+    // Leading sign
+    if (*p == '+' || *p == '-') p++;
+    // Integer part
+    while (isdigit_ascii(*p)) {
+        p++;
+        p += IS_TSEP(*p);
+        num_digits++;
+    }
+    // Fractional part
+    if (*p == decimal) {
+        p++;
+        while (isdigit_ascii(*p)) {
+            p++;
+            num_digits++;
+        }
+    }
+    if (num_digits == 0) {
+        if (endpos != NULL) {
+            *endpos = (char *)s;
+        }
+        return NULL;
+    }
+    // Exponent part
+    if (toupper_ascii(*p) == toupper_ascii(sci)) {
+        const char * p_at_e = p;
+        num_digits = 0;
+        p++;
+        // Exponent sign
+        if (*p == '+' || *p == '-') p++;
+        // Exponent
+        while (isdigit_ascii(*p)) {
+            p++;
+            num_digits++;
+        }
+        if (num_digits == 0) {
+            // no digits after exponent; un-consume the (+|-)?
+            p = p_at_e;
+            has_exponent = 0;
+        } else {
+            has_exponent = 1;
+        }
+    }
+
+    size = p - s;
+    char *pc = malloc(size + 1);
+    memcpy(pc, p, size);
+    pc[size] = '\0';
+    char *dst = pc;
+    p = s;
+    num_digits = 0;
+    // Copy leading sign
+    if (*p == '+' || *p == '-') {
+        *dst++ = *p++;
+    }
+    // Copy integer part
+    while (isdigit_ascii(*p)) {
+        *dst++ = *p++;
+        p += IS_TSEP(*p);
+        num_digits++;
+    }
+    // Copy factional part, replacing `decimal` with '.'
+    if (*p == decimal) {
+        *dst++ = '.';
+        p++;
+        while (isdigit_ascii(*p)) {
+            *dst++ = *p++;
+            num_digits++;
+        }
+    }
+    assert(num_digits > 0);
+    // Copy exponent
+    if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) {
+        num_digits = 0;
+        *dst++ = *p++;
+        // Copy leading exponent sign
+        if (*p == '+' || *p == '-') {
+            *dst++ = *p++;
+        }
+        // Exponent
+        while (isdigit_ascii(*p)) {
+            *dst++ = *p++;
+            num_digits++;
+        }
+        assert(num_digits > 0);
+    }
+    *dst = '\0';
+    if (endpos != NULL) {
+        *endpos = (char *)p;
+    }
+    return pc;
+    #undef IS_TSEP
+}
+
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
+    char *pc = NULL;
+    // 'normalize' representation to C-locale; replace decimal with '.' and
+    // remove t(housand)sep.
+    char *endptr = NULL;
+    if (decimal != '.' || tsep != '\0') {
+        pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
+        if (pc == NULL) {
+            if (q != NULL) {
+                *q = (char *)p;
+            }
+            *error = -1;
+            return 0.0;
+        }
+    }
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
-
-    double r = PyOS_string_to_double(p, q, 0);
+    double r;
+    if (pc != NULL) {
+        char *endpc = NULL;
+        r = PyOS_string_to_double(pc, &endpc, 0);
+        // PyOS_string_to_double needs to consume the whole string
+        if (endpc == pc + strlen(pc)) {
+            if (q != NULL) {
+               // report endptr from source string (p)
+                *q = (char *) endptr;
+            }
+        } else {
+            *error = -1;
+            if (q != NULL) {
+               // p and pc are different len due to tsep removal. Can't report
+               // how much it has consumed of p. Just rewind to beginning.
+                *q = (char *)p;
+            }
+        }
+    } else {
+        r = PyOS_string_to_double(p, q, 0);
+    }
     if (maybe_int != NULL) *maybe_int = 0;
     if (PyErr_Occurred() != NULL) *error = -1;
     else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
     PyErr_Clear();
 
     PyGILState_Release(gstate);
+    free(pc);
     return r;
 }
 
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index d76d01904731a..50911b934781e 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -606,3 +606,41 @@ def test_unix_style_breaks(c_parser_only):
         result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
     expected = DataFrame(columns=["col_1", "col_2", "col_3"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
+@pytest.mark.parametrize(
+    "data,thousands,decimal",
+    [
+        (
+            """A|B|C
+1|2,334.01|5
+10|13|10.
+""",
+            ",",
+            ".",
+        ),
+        (
+            """A|B|C
+1|2.334,01|5
+10|13|10,
+""",
+            ".",
+            ",",
+        ),
+    ],
+)
+def test_1000_sep_with_decimal(
+    c_parser_only, data, thousands, decimal, float_precision
+):
+    parser = c_parser_only
+    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
+
+    result = parser.read_csv(
+        StringIO(data),
+        sep="|",
+        thousands=thousands,
+        decimal=decimal,
+        float_precision=float_precision,
+    )
+    tm.assert_frame_equal(result, expected)

From dcd8de1c6ff9587b5f55d723010841b88cf70a4b Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Mon, 27 Jul 2020 15:55:28 +0200
Subject: [PATCH 2/7] Simplify str_copy_decimal_str.

---
 pandas/_libs/src/parser/tokenizer.c | 127 ++++++----------------------
 1 file changed, 24 insertions(+), 103 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 052f9427efd26..4da8a360e7422 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1778,113 +1778,41 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     return number;
 }
 
-/* copy a decimal number string in form `decimal` and `tsep` and `sci` as
-   decimal point, thousands separator and sci exponent character to a an
-   equivalent c-locale decimal string (striping tsep, replacing `decimal`
-   with '.'). Return NULL if nothing could be copied.
+/* copy a decimal number string with `decimal`, `tsep` as decimal point
+   and thousands separator to an equivalent c-locale decimal string (striping
+   `tsep`, replacing `decimal` with '.'). The returned memory should be free-d
+   with a call to `free`.
 */
 
-char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
-                             char tsep, char sci) {
-    #define IS_TSEP(c) (tsep != '\0' && c == tsep)
-    ssize_t size = 0;
-    ssize_t num_digits = 0;
-    char has_exponent = 0;
+char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
+                              char tsep) {
     const char *p = s;
-    // First count how many characters we can consume.
-    // Leading sign
-    if (*p == '+' || *p == '-') p++;
-    // Integer part
-    while (isdigit_ascii(*p)) {
-        p++;
-        p += IS_TSEP(*p);
-        num_digits++;
-    }
-    // Fractional part
-    if (*p == decimal) {
-        p++;
-        while (isdigit_ascii(*p)) {
-            p++;
-            num_digits++;
-        }
-    }
-    if (num_digits == 0) {
-        if (endpos != NULL) {
-            *endpos = (char *)s;
-        }
-        return NULL;
-    }
-    // Exponent part
-    if (toupper_ascii(*p) == toupper_ascii(sci)) {
-        const char * p_at_e = p;
-        num_digits = 0;
-        p++;
-        // Exponent sign
-        if (*p == '+' || *p == '-') p++;
-        // Exponent
-        while (isdigit_ascii(*p)) {
-            p++;
-            num_digits++;
-        }
-        if (num_digits == 0) {
-            // no digits after exponent; un-consume the (+|-)?
-            p = p_at_e;
-            has_exponent = 0;
-        } else {
-            has_exponent = 1;
-        }
-    }
-
-    size = p - s;
-    char *pc = malloc(size + 1);
-    memcpy(pc, p, size);
-    pc[size] = '\0';
-    char *dst = pc;
-    p = s;
-    num_digits = 0;
-    // Copy leading sign
+    size_t length = strlen(s);
+    char *s_copy = malloc(length + 1);
+    char *dst = s_copy;
+    // Copy Leading sign
     if (*p == '+' || *p == '-') {
         *dst++ = *p++;
     }
-    // Copy integer part
+    // Copy integer part dropping `tsep`
     while (isdigit_ascii(*p)) {
         *dst++ = *p++;
-        p += IS_TSEP(*p);
-        num_digits++;
+        p += (tsep != '\0' && *p == tsep);
     }
-    // Copy factional part, replacing `decimal` with '.'
+    // Replace `decimal` with '.'
     if (*p == decimal) {
-        *dst++ = '.';
-        p++;
-        while (isdigit_ascii(*p)) {
-            *dst++ = *p++;
-            num_digits++;
-        }
-    }
-    assert(num_digits > 0);
-    // Copy exponent
-    if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) {
-        num_digits = 0;
-        *dst++ = *p++;
-        // Copy leading exponent sign
-        if (*p == '+' || *p == '-') {
-            *dst++ = *p++;
-        }
-        // Exponent
-        while (isdigit_ascii(*p)) {
-            *dst++ = *p++;
-            num_digits++;
-        }
-        assert(num_digits > 0);
-    }
-    *dst = '\0';
-    if (endpos != NULL) {
-        *endpos = (char *)p;
-    }
-    return pc;
-    #undef IS_TSEP
+       *dst++ = '.';
+       p++;
+    }
+    // Copy the remainder of the string as is.
+    memcpy(dst, p, length + 1 - (p - s));
+    s_copy[length] = '\0';
+    if (endpos != NULL)
+        *endpos = (char *)(s + length);
+    return s_copy;
 }
 
+
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
     char *pc = NULL;
@@ -1892,14 +1820,7 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
     // remove t(housand)sep.
     char *endptr = NULL;
     if (decimal != '.' || tsep != '\0') {
-        pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
-        if (pc == NULL) {
-            if (q != NULL) {
-                *q = (char *)p;
-            }
-            *error = -1;
-            return 0.0;
-        }
+        pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
     }
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls

From cd23ba10e087592945c503056a603acf7675a283 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Mon, 27 Jul 2020 16:13:46 +0200
Subject: [PATCH 3/7] Add test for float_precision parsers equality

---
 pandas/tests/io/parser/test_c_parser_only.py | 60 ++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 50911b934781e..50179fc1ec4b8 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -644,3 +644,63 @@ def test_1000_sep_with_decimal(
         float_precision=float_precision,
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "float_precision", [None, "high", "round_trip"],
+)
+@pytest.mark.parametrize(
+    "value,expected",
+    [
+        ("-1,0", -1.0),
+        ("-1,2e0", -1.2),
+        ("-1e0", -1.0),
+        ("+1e0", 1.0),
+        ("+1e+0", 1.0),
+        ("+1e-1", 0.1),
+        ("+,1e1", 1.0),
+        ("+1,e0", 1.0),
+        ("-,1e1", -1.0),
+        ("-1,e0", -1.0),
+        ("0,1", 0.1),
+        ("1,", 1.0),
+        (",1", 0.1),
+        ("-,1", -0.1),
+        ("1_,", 1.0),
+        ("1_234,56", 1234.56),
+        ("1_234,56e0", 1234.56),
+        # negative cases; must not parse as float
+        ("_", "_"),
+        ("-_", "-_"),
+        ("-_1", "-_1"),
+        ("-_1e0", "-_1e0"),
+        ("_1", "_1"),
+        ("_1,", "_1,"),
+        ("_1,_", "_1,_"),
+        ("_1e0", "_1e0"),
+        ("1,2e_1", "1,2e_1"),
+        ("1,2e1_0", "1,2e1_0"),
+        ("1,_2", "1,_2"),
+        (",1__2", ",1__2"),
+        (",1e", ",1e"),
+        ("-,1e", "-,1e"),
+        ("1_000,000_000", "1_000,000_000"),
+        ("1,e1_2", "1,e1_2"),
+    ],
+)
+def test_1000_sep_decimal_float_precision(
+    c_parser_only, value, expected, float_precision
+):
+    # test decimal and thousand sep handling in across 'float_precision'
+    # parsers
+    parser = c_parser_only
+    df = parser.read_csv(
+        StringIO(value),
+        sep="|",
+        thousands="_",
+        decimal=",",
+        header=None,
+        float_precision=float_precision,
+    )
+    val = df.iloc[0, 0]
+    assert val == expected

From bee7fdfb776dd45f83b28e498d2af22a76130d41 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Wed, 5 Aug 2020 10:52:25 +0200
Subject: [PATCH 4/7] Simplify round_trip converter

Always make a copy of input string.
---
 pandas/_libs/src/parser/tokenizer.c | 36 +++++++++++------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 4da8a360e7422..ccd7ad34055cc 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1815,37 +1815,29 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
 
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
-    char *pc = NULL;
     // 'normalize' representation to C-locale; replace decimal with '.' and
     // remove t(housand)sep.
     char *endptr = NULL;
-    if (decimal != '.' || tsep != '\0') {
-        pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
-    }
+    char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
-    double r;
-    if (pc != NULL) {
-        char *endpc = NULL;
-        r = PyOS_string_to_double(pc, &endpc, 0);
-        // PyOS_string_to_double needs to consume the whole string
-        if (endpc == pc + strlen(pc)) {
-            if (q != NULL) {
-               // report endptr from source string (p)
-                *q = (char *) endptr;
-            }
-        } else {
-            *error = -1;
-            if (q != NULL) {
-               // p and pc are different len due to tsep removal. Can't report
-               // how much it has consumed of p. Just rewind to beginning.
-                *q = (char *)p;
-            }
+    char *endpc = NULL;
+    double r = PyOS_string_to_double(pc, &endpc, 0);
+    // PyOS_string_to_double needs to consume the whole string
+    if (endpc == pc + strlen(pc)) {
+        if (q != NULL) {
+           // report endptr from source string (p)
+            *q = (char *) endptr;
         }
     } else {
-        r = PyOS_string_to_double(p, q, 0);
+        *error = -1;
+        if (q != NULL) {
+           // p and pc are different len due to tsep removal. Can't report
+           // how much it has consumed of p. Just rewind to beginning.
+            *q = (char *)p;
+        }
     }
     if (maybe_int != NULL) *maybe_int = 0;
     if (PyErr_Occurred() != NULL) *error = -1;

From 66f8be525a9667af423fd68cd43b699262056a53 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Mon, 10 Aug 2020 09:27:24 +0200
Subject: [PATCH 5/7] Use strncpy

---
 pandas/_libs/src/parser/tokenizer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index ccd7ad34055cc..adb25df9d2c87 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1805,8 +1805,7 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
        p++;
     }
     // Copy the remainder of the string as is.
-    memcpy(dst, p, length + 1 - (p - s));
-    s_copy[length] = '\0';
+    strncpy(dst, p, length + 1 - (p - s));
     if (endpos != NULL)
         *endpos = (char *)(s + length);
     return s_copy;

From 42c509c46be6cae87c14c3c5f24b4a9e964270c4 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Mon, 10 Aug 2020 09:37:56 +0200
Subject: [PATCH 6/7] Just define the endptr and endpc

---
 pandas/_libs/src/parser/tokenizer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index adb25df9d2c87..df8ec68986ccb 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1816,13 +1816,13 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
     // 'normalize' representation to C-locale; replace decimal with '.' and
     // remove t(housand)sep.
-    char *endptr = NULL;
+    char *endptr;
     char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
-    char *endpc = NULL;
+    char *endpc;
     double r = PyOS_string_to_double(pc, &endpc, 0);
     // PyOS_string_to_double needs to consume the whole string
     if (endpc == pc + strlen(pc)) {

From 8703ad6e9c6fe81a83a6778c5bab0b7e22287434 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Mon, 10 Aug 2020 10:53:09 +0200
Subject: [PATCH 7/7] Add whatsnew entry

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 33e70daa55e66..2ee15c506d033 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -141,6 +141,7 @@ I/O
 ^^^
 
 - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
+- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
 -
 
 Plotting