Skip to content

Commit bd94bb1

Browse files
authored
BUG: round_trip parser initial/trailing whitespace (#43714)
1 parent 4f9b3ea commit bd94bb1

File tree

3 files changed

+50
-3
lines changed

3 files changed

+50
-3
lines changed

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,8 @@ I/O
463463
- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
464464
- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raised an uncontrolled ``IndexError`` (:issue:`43102`)
465465
- Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
466+
- Bug in :func:`read_csv` with :code:`float_precision="round_trip"` where initial/trailing whitespace was not skipped (:issue:`43713`)
467+
-
466468

467469
Period
468470
^^^^^^

pandas/_libs/src/parser/tokenizer.c

+25-3
Original file line numberDiff line numberDiff line change
@@ -1784,6 +1784,8 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
17841784
size_t length = strlen(s);
17851785
char *s_copy = malloc(length + 1);
17861786
char *dst = s_copy;
1787+
// Skip leading whitespace.
1788+
while (isspace_ascii(*p)) p++;
17871789
// Copy Leading sign
17881790
if (*p == '+' || *p == '-') {
17891791
*dst++ = *p++;
@@ -1798,10 +1800,25 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
17981800
*dst++ = '.';
17991801
p++;
18001802
}
1801-
// Copy the remainder of the string as is.
1802-
strncpy(dst, p, length + 1 - (p - s));
1803+
// Copy fractional part after decimal (if any)
1804+
while (isdigit_ascii(*p)) {
1805+
*dst++ = *p++;
1806+
}
1807+
// Copy exponent if any
1808+
if (toupper_ascii(*p) == toupper_ascii('E')) {
1809+
*dst++ = *p++;
1810+
// Copy leading exponent sign (if any)
1811+
if (*p == '+' || *p == '-') {
1812+
*dst++ = *p++;
1813+
}
1814+
// Copy exponent digits
1815+
while (isdigit_ascii(*p)) {
1816+
*dst++ = *p++;
1817+
}
1818+
}
1819+
*dst++ = '\0'; // terminate
18031820
if (endpos != NULL)
1804-
*endpos = (char *)(s + length);
1821+
*endpos = (char *)p;
18051822
return s_copy;
18061823
}
18071824

@@ -1839,6 +1856,11 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
18391856

18401857
PyGILState_Release(gstate);
18411858
free(pc);
1859+
if (skip_trailing && q != NULL && *q != p) {
1860+
while (isspace_ascii(**q)) {
1861+
(*q)++;
1862+
}
1863+
}
18421864
return r;
18431865
}
18441866

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+23
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,11 @@ def test_1000_sep_decimal_float_precision(
204204
# test decimal and thousand sep handling in across 'float_precision'
205205
# parsers
206206
decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision)
207+
text, value = numeric_decimal
208+
text = " " + text + " "
209+
if isinstance(value, str): # the negative cases (parse as text)
210+
value = " " + value + " "
211+
decimal_number_check(c_parser_only, (text, value), thousands, float_precision)
207212

208213

209214
def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
@@ -222,6 +227,24 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
222227
assert val == numeric_decimal[1]
223228

224229

230+
# Checks that numeric fields padded with leading and/or trailing spaces are
# read as float64 values for every parametrized ``float_precision`` setting
# (None, "legacy", "high", "round_trip") of the C parser.
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
231+
def test_skip_whitespace(c_parser_only, float_precision):
232+
# Tab-separated input: the "num" fields carry trailing-only, leading-only,
# leading-only (integer text), and both-sided space padding, in that order.
DATA = """id\tnum\t
233+
1\t1.2 \t
234+
1\t 2.1\t
235+
2\t 1\t
236+
2\t 1.2 \t
237+
"""
238+
# Column position 1 ("num") is explicitly requested as float64.
df = c_parser_only.read_csv(
239+
StringIO(DATA),
240+
float_precision=float_precision,
241+
sep="\t",
242+
header=0,
243+
dtype={1: np.float64},
244+
)
245+
# The second column must equal the unpadded float values, named "num".
tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
246+
247+
225248
def test_true_values_cast_to_bool(all_parsers):
226249
# GH#34655
227250
text = """a,b

0 commit comments

Comments
 (0)