Skip to content

[FIX] Handle decimal and thousand separator in 'round_trip' converer #35377

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ I/O
^^^

- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
-

Plotting
Expand Down
57 changes: 55 additions & 2 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1778,20 +1778,73 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
return number;
}

/* copy a decimal number string with `decimal`, `tsep` as decimal point
and thousands separator to an equivalent c-locale decimal string (striping
`tsep`, replacing `decimal` with '.'). The returned memory should be free-d
with a call to `free`.
*/

char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
char tsep) {
const char *p = s;
size_t length = strlen(s);
char *s_copy = malloc(length + 1);
char *dst = s_copy;
// Copy Leading sign
if (*p == '+' || *p == '-') {
*dst++ = *p++;
}
// Copy integer part dropping `tsep`
while (isdigit_ascii(*p)) {
*dst++ = *p++;
p += (tsep != '\0' && *p == tsep);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the null byte check here serve a purpose?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. tsep is null when no thousands separator is defined. But the loop must not skip the terminating null byte in the input string (the second condition) which would happen on e.g. '123\0'

}
// Replace `decimal` with '.'
if (*p == decimal) {
*dst++ = '.';
p++;
}
// Copy the remainder of the string as is.
strncpy(dst, p, length + 1 - (p - s));
if (endpos != NULL)
*endpos = (char *)(s + length);
return s_copy;
}


double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int) {
// 'normalize' representation to C-locale; replace decimal with '.' and
// remove t(housand)sep.
char *endptr;
char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
// This is called from a nogil block in parsers.pyx
// so need to explicitly get GIL before Python calls
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();

double r = PyOS_string_to_double(p, q, 0);
char *endpc;
double r = PyOS_string_to_double(pc, &endpc, 0);
// PyOS_string_to_double needs to consume the whole string
if (endpc == pc + strlen(pc)) {
if (q != NULL) {
// report endptr from source string (p)
*q = (char *) endptr;
}
} else {
*error = -1;
if (q != NULL) {
// p and pc are different len due to tsep removal. Can't report
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a test that hits this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. The "1,2e1_0" for instance. I.e something that is prefixed with a number but has extra trailing characters.

// how much it has consumed of p. Just rewind to beginning.
*q = (char *)p;
}
}
if (maybe_int != NULL) *maybe_int = 0;
if (PyErr_Occurred() != NULL) *error = -1;
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
PyErr_Clear();

PyGILState_Release(gstate);
free(pc);
return r;
}

Expand Down
98 changes: 98 additions & 0 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,3 +606,101 @@ def test_unix_style_breaks(c_parser_only):
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize(
"data,thousands,decimal",
[
(
"""A|B|C
1|2,334.01|5
10|13|10.
""",
",",
".",
),
(
"""A|B|C
1|2.334,01|5
10|13|10,
""",
".",
",",
),
],
)
def test_1000_sep_with_decimal(
c_parser_only, data, thousands, decimal, float_precision
):
parser = c_parser_only
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

result = parser.read_csv(
StringIO(data),
sep="|",
thousands=thousands,
decimal=decimal,
float_precision=float_precision,
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"float_precision", [None, "high", "round_trip"],
)
@pytest.mark.parametrize(
"value,expected",
[
("-1,0", -1.0),
("-1,2e0", -1.2),
("-1e0", -1.0),
("+1e0", 1.0),
("+1e+0", 1.0),
("+1e-1", 0.1),
("+,1e1", 1.0),
("+1,e0", 1.0),
("-,1e1", -1.0),
("-1,e0", -1.0),
("0,1", 0.1),
("1,", 1.0),
(",1", 0.1),
("-,1", -0.1),
("1_,", 1.0),
("1_234,56", 1234.56),
("1_234,56e0", 1234.56),
# negative cases; must not parse as float
("_", "_"),
("-_", "-_"),
("-_1", "-_1"),
("-_1e0", "-_1e0"),
("_1", "_1"),
("_1,", "_1,"),
("_1,_", "_1,_"),
("_1e0", "_1e0"),
("1,2e_1", "1,2e_1"),
("1,2e1_0", "1,2e1_0"),
("1,_2", "1,_2"),
(",1__2", ",1__2"),
(",1e", ",1e"),
("-,1e", "-,1e"),
("1_000,000_000", "1_000,000_000"),
("1,e1_2", "1,e1_2"),
],
)
def test_1000_sep_decimal_float_precision(
c_parser_only, value, expected, float_precision
):
# test decimal and thousand sep handling in across 'float_precision'
# parsers
parser = c_parser_only
df = parser.read_csv(
StringIO(value),
sep="|",
thousands="_",
decimal=",",
header=None,
float_precision=float_precision,
)
val = df.iloc[0, 0]
assert val == expected