Skip to content

Commit 1e72b2b

Browse files
authored
REGR: read_csv splitting on comma with delim_whitespace (#54954)
1 parent 51135ce commit 1e72b2b

File tree

3 files changed

+29
-1
lines changed

3 files changed

+29
-1
lines changed

doc/source/whatsnew/v2.1.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Fixed regressions
1515
~~~~~~~~~~~~~~~~~
1616
- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
1717
- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtype`` is a dict for ``engine="python"`` (:issue:`54868`)
18+
- Fixed regression in :func:`read_csv` splitting on commas when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
1819
- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
1920
- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
2021
- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)

pandas/_libs/src/parser/tokenizer.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes,
664664
((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
665665

666666
// applied when in a field
667-
#define IS_DELIMITER(c) ((c == delimiter) || (delim_whitespace && isblank(c)))
667+
#define IS_DELIMITER(c) \
668+
((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c)))
668669

669670
#define _TOKEN_CLEANUP() \
670671
self->stream_len = slen; \

pandas/tests/io/parser/test_header.py

+26
Original file line numberDiff line numberDiff line change
@@ -658,3 +658,29 @@ def test_header_missing_rows(all_parsers):
658658
msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
659659
with pytest.raises(ValueError, match=msg):
660660
parser.read_csv(StringIO(data), header=[0, 1, 2])
661+
662+
663+
@skip_pyarrow
664+
def test_header_multiple_whitespaces(all_parsers):
665+
# GH#54931
666+
parser = all_parsers
667+
data = """aa bb(1,1) cc(1,1)
668+
0 2 3.5"""
669+
670+
result = parser.read_csv(StringIO(data), sep=r"\s+")
671+
expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
672+
tm.assert_frame_equal(result, expected)
673+
674+
675+
@skip_pyarrow
676+
def test_header_delim_whitespace(all_parsers):
677+
# GH#54918
678+
parser = all_parsers
679+
data = """a,b
680+
1,2
681+
3,4
682+
"""
683+
684+
result = parser.read_csv(StringIO(data), delim_whitespace=True)
685+
expected = DataFrame({"a,b": ["1,2", "3,4"]})
686+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)