Skip to content

Commit 62d6260

Browse files
committed
BUG: Parse custom terminator with whitespace delimiter
Addresses the bug portion of gh-12912. Closes gh-12912.
1 parent fdbc768 commit 62d6260

File tree

3 files changed

+265
-4
lines changed

3 files changed

+265
-4
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ Bug Fixes
302302
- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
303303
- Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
304304
- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
305+
- Bug in ``read_csv`` when specifying ``delim_whitespace=True`` and ``lineterminator`` simultaneously with the C engine (:issue:`12912`)
305306
- Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`).
306307
- Clean up ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`)
307308
- Bug in ``groupby`` where complex types are coerced to float (:issue:`12902`)

pandas/io/tests/test_parsers.py

+9
Original file line numberDiff line numberDiff line change
@@ -3878,6 +3878,15 @@ def test_buffer_rd_bytes(self):
38783878
except Exception as e:
38793879
pass
38803880

3881+
def test_delim_whitespace_custom_terminator(self):
    """Whitespace-delimited input with a custom line terminator parses row by row.

    Regression test for gh-12912: ``delim_whitespace=True`` combined with
    ``lineterminator='~'`` must split the records on ``~`` and the fields
    on whitespace.
    """
    # See gh-12912
    raw = "a b c~1 2 3~4 5 6~7 8 9"
    result = self.read_csv(StringIO(raw),
                           delim_whitespace=True,
                           lineterminator='~')

    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    expected = DataFrame(rows, columns=['a', 'b', 'c'])

    tm.assert_frame_equal(result, expected)
3889+
38813890

38823891
class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase):
38833892
engine = 'c'

pandas/src/parser/tokenizer.c

+255-4
Original file line numberDiff line numberDiff line change
@@ -1641,6 +1641,251 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
16411641
return 0;
16421642
}
16431643

1644+
// custom line terminator
1645+
int tokenize_whitespace_customterm(parser_t *self, size_t line_limit)
1646+
{
1647+
int i, slen, start_lines;
1648+
long maxstreamsize;
1649+
char c;
1650+
char *stream;
1651+
char *buf = self->data + self->datapos;
1652+
1653+
start_lines = self->lines;
1654+
1655+
if (make_stream_space(self, self->datalen - self->datapos) < 0) {
1656+
self->error_msg = "out of memory";
1657+
return -1;
1658+
}
1659+
1660+
stream = self->stream + self->stream_len;
1661+
slen = self->stream_len;
1662+
maxstreamsize = self->stream_cap;
1663+
1664+
TRACE(("%s\n", buf));
1665+
1666+
for (i = self->datapos; i < self->datalen; ++i)
1667+
{
1668+
// next character in file
1669+
c = *buf++;
1670+
1671+
TRACE(("tokenize_whitespace_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
1672+
i, c, self->file_lines + 1, self->line_fields[self->lines],
1673+
self->state));
1674+
1675+
switch(self->state) {
1676+
1677+
case SKIP_LINE:
1678+
if (c == self->lineterminator) {
1679+
END_LINE();
1680+
}
1681+
break;
1682+
1683+
case WHITESPACE_LINE:
1684+
if (c == self->lineterminator) {
1685+
self->file_lines++;
1686+
self->state = START_RECORD;
1687+
break;
1688+
}
1689+
// fall through
1690+
1691+
case EAT_WHITESPACE:
1692+
if (c == self->lineterminator) {
1693+
END_LINE();
1694+
self->state = START_RECORD;
1695+
break;
1696+
} else if (!IS_WHITESPACE(c)) {
1697+
self->state = START_FIELD;
1698+
// fall through to subsequent state
1699+
} else {
1700+
// if whitespace char, keep slurping
1701+
break;
1702+
}
1703+
1704+
case START_RECORD:
1705+
// start of record
1706+
if (skip_this_line(self, self->file_lines)) {
1707+
self->state = SKIP_LINE;
1708+
if (c == self->lineterminator) {
1709+
END_LINE();
1710+
}
1711+
break;
1712+
} else if (c == self->lineterminator) {
1713+
if (self->skip_empty_lines) {
1714+
self->file_lines++;
1715+
} else {
1716+
END_LINE();
1717+
}
1718+
break;
1719+
} else if (IS_WHITESPACE(c)) {
1720+
if (self->skip_empty_lines)
1721+
self->state = WHITESPACE_LINE;
1722+
else
1723+
self->state = EAT_WHITESPACE;
1724+
break;
1725+
} else if (c == self->commentchar) {
1726+
self->state = EAT_LINE_COMMENT;
1727+
break;
1728+
} else {
1729+
// nominal character - handle as START_FIELD
1730+
self->state = START_FIELD;
1731+
}
1732+
// fall through
1733+
1734+
case START_FIELD:
1735+
// expecting field
1736+
if (c == self->lineterminator) {
1737+
END_FIELD();
1738+
END_LINE();
1739+
} else if (c == self->quotechar &&
1740+
self->quoting != QUOTE_NONE) {
1741+
// start quote field
1742+
self->state = IN_QUOTED_FIELD;
1743+
} else if (c == self->escapechar) {
1744+
// possible escaped character
1745+
self->state = ESCAPED_CHAR;
1746+
} else if (IS_WHITESPACE(c)) {
1747+
self->state = EAT_WHITESPACE;
1748+
} else if (c == self->commentchar) {
1749+
END_FIELD();
1750+
self->state = EAT_COMMENT;
1751+
} else {
1752+
// begin new unquoted field
1753+
if (self->quoting == QUOTE_NONNUMERIC)
1754+
self->numeric_field = 1;
1755+
1756+
PUSH_CHAR(c);
1757+
self->state = IN_FIELD;
1758+
}
1759+
break;
1760+
1761+
case EAT_LINE_COMMENT:
1762+
if (c == self->lineterminator) {
1763+
self->file_lines++;
1764+
self->state = START_RECORD;
1765+
}
1766+
break;
1767+
1768+
case ESCAPED_CHAR:
1769+
PUSH_CHAR(c);
1770+
self->state = IN_FIELD;
1771+
break;
1772+
1773+
case IN_FIELD:
1774+
// in unquoted field
1775+
if (c == self->lineterminator) {
1776+
END_FIELD();
1777+
END_LINE();
1778+
} else if (c == self->escapechar) {
1779+
// possible escaped character
1780+
self->state = ESCAPED_CHAR;
1781+
} else if (IS_WHITESPACE(c)) {
1782+
// end of field (end of line not reached yet)
1783+
END_FIELD();
1784+
self->state = EAT_WHITESPACE;
1785+
} else if (c == self->commentchar) {
1786+
END_FIELD();
1787+
self->state = EAT_COMMENT;
1788+
} else {
1789+
// normal character - save in field
1790+
PUSH_CHAR(c);
1791+
}
1792+
break;
1793+
1794+
case IN_QUOTED_FIELD:
1795+
// in quoted field
1796+
if (c == self->escapechar) {
1797+
// possible escape character
1798+
self->state = ESCAPE_IN_QUOTED_FIELD;
1799+
} else if (c == self->quotechar &&
1800+
self->quoting != QUOTE_NONE) {
1801+
if (self->doublequote) {
1802+
// double quote - " represented by ""
1803+
self->state = QUOTE_IN_QUOTED_FIELD;
1804+
}
1805+
else {
1806+
// end of quote part of field
1807+
self->state = IN_FIELD;
1808+
}
1809+
} else {
1810+
// normal character - save in field
1811+
PUSH_CHAR(c);
1812+
}
1813+
break;
1814+
1815+
case ESCAPE_IN_QUOTED_FIELD:
1816+
PUSH_CHAR(c);
1817+
self->state = IN_QUOTED_FIELD;
1818+
break;
1819+
1820+
case QUOTE_IN_QUOTED_FIELD:
1821+
// double quote - seen a quote in an quoted field
1822+
if (self->quoting != QUOTE_NONE && c == self->quotechar) {
1823+
// save "" as "
1824+
PUSH_CHAR(c);
1825+
self->state = IN_QUOTED_FIELD;
1826+
} else if (IS_WHITESPACE(c)) {
1827+
// end of field (end of line not reached yet)
1828+
END_FIELD();
1829+
self->state = EAT_WHITESPACE;
1830+
} else if (c == self->lineterminator) {
1831+
END_FIELD();
1832+
END_LINE();
1833+
} else if (!self->strict) {
1834+
PUSH_CHAR(c);
1835+
self->state = IN_FIELD;
1836+
} else {
1837+
self->error_msg = (char*) malloc(50);
1838+
sprintf(self->error_msg, "'%c' expected after '%c'",
1839+
self->delimiter, self->quotechar);
1840+
goto parsingerror;
1841+
}
1842+
break;
1843+
1844+
case EAT_CRNL:
1845+
if (c == self->lineterminator) {
1846+
END_LINE();
1847+
} else if (IS_WHITESPACE(c)){
1848+
// Handle \r-delimited files
1849+
END_LINE_STATE(EAT_WHITESPACE);
1850+
} else {
1851+
/* XXX
1852+
* first character of a new record--need to back up and reread
1853+
* to handle properly...
1854+
*/
1855+
i--; buf--; // back up one character (HACK!)
1856+
END_LINE_STATE(START_RECORD);
1857+
}
1858+
break;
1859+
1860+
case EAT_COMMENT:
1861+
if (c == self->lineterminator) {
1862+
END_LINE();
1863+
}
1864+
break;
1865+
1866+
default:
1867+
break;
1868+
}
1869+
}
1870+
1871+
_TOKEN_CLEANUP();
1872+
1873+
TRACE(("Finished tokenizing input\n"))
1874+
1875+
return 0;
1876+
1877+
parsingerror:
1878+
i++;
1879+
_TOKEN_CLEANUP();
1880+
1881+
return -1;
1882+
1883+
linelimit:
1884+
i++;
1885+
_TOKEN_CLEANUP();
1886+
1887+
return 0;
1888+
}
16441889

16451890
static int parser_handle_eof(parser_t *self) {
16461891
TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
@@ -1851,11 +2096,17 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
18512096
int start_lines = self->lines;
18522097

18532098
if (self->delim_whitespace) {
1854-
tokenize_bytes = tokenize_whitespace;
1855-
} else if (self->lineterminator == '\0') {
1856-
tokenize_bytes = tokenize_delimited;
2099+
if (self->lineterminator == '\0') {
2100+
tokenize_bytes = tokenize_whitespace;
2101+
} else {
2102+
tokenize_bytes = tokenize_whitespace_customterm;
2103+
}
18572104
} else {
1858-
tokenize_bytes = tokenize_delim_customterm;
2105+
if (self->lineterminator == '\0') {
2106+
tokenize_bytes = tokenize_delimited;
2107+
} else {
2108+
tokenize_bytes = tokenize_delim_customterm;
2109+
}
18592110
}
18602111

18612112
if (self->state == FINISHED) {

0 commit comments

Comments
 (0)