Skip to content

Commit e55875e

Browse files
gfyoung authored and jreback committed
BUG: Respect 'usecols' parameter even when CSV rows are uneven
Closes #12203 by overriding the row alignment checks for both engines when the `usecols` parameter is passed into `read_csv`.

Author: gfyoung <[email protected]>

Closes #12551 from gfyoung/usecol_long_lines and squashes the following commits:

d3824dc [gfyoung] BUG: Respect 'usecols' parameter even when CSV rows are uneven
1 parent 9fe2dd2 commit e55875e

File tree

6 files changed

+50
-3
lines changed

6 files changed

+50
-3
lines changed

doc/source/whatsnew/v0.18.1.txt

+7
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ API changes
5555

5656

5757

58+
- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
59+
60+
61+
62+
63+
5864

5965

6066

@@ -95,6 +101,7 @@ Performance Improvements
95101

96102
Bug Fixes
97103
~~~~~~~~~
104+
- The ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file are uneven (:issue:`12203`)
98105

99106
- Bug in ``Period`` and ``PeriodIndex`` creation raises ``KeyError`` if ``freq="Minute"`` is specified. Note that "Minute" freq is deprecated in v0.17.0, and recommended to use ``freq="T"`` instead (:issue:`11854`)
100107
- Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`)

pandas/io/parsers.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1984,7 +1984,9 @@ def _rows_to_cols(self, content):
19841984
raise ValueError('skip footer cannot be negative')
19851985

19861986
# Loop through rows to verify lengths are correct.
1987-
if col_len != zip_len and self.index_col is not False:
1987+
if (col_len != zip_len and
1988+
self.index_col is not False and
1989+
self.usecols is None):
19881990
i = 0
19891991
for (i, l) in enumerate(content):
19901992
if len(l) != col_len:

pandas/io/tests/test_parsers.py

+31
Original file line numberDiff line numberDiff line change
@@ -2664,6 +2664,37 @@ def test_empty_header_read(count):
26642664
for count in range(1, 101):
26652665
test_empty_header_read(count)
26662666

2667+
def test_uneven_lines_with_usecols(self):
    """Check that ``usecols`` is respected when CSV rows are uneven.

    See gh-12203. Without ``usecols`` a row with extra fields must still
    raise ``ValueError``; with ``usecols`` the selected columns are parsed
    and the surplus fields on the long row are ignored.
    """
    # Second data row carries two extra fields (5 instead of 3).
    csv = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10
"""

    # make sure that an error is still thrown
    # when the 'usecols' parameter is not provided
    #
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    msg = r"Expected \d+ fields in line \d+, saw \d+"
    with tm.assertRaisesRegexp(ValueError, msg):
        self.read_csv(StringIO(csv))

    expected = DataFrame({
        'a': [0, 3, 8],
        'b': [1, 4, 9]
    })

    # Positional, mixed, and label-based selections must all yield the
    # same frame.
    for usecols in ([0, 1], ['a', 1], ['a', 'b']):
        df = self.read_csv(StringIO(csv), usecols=usecols)
        tm.assert_frame_equal(df, expected)
2697+
26672698

26682699
class TestPythonParser(ParserTests, tm.TestCase):
26692700

pandas/parser.pyx

+5-1
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h":
143143
int allow_embedded_newline
144144
int strict # raise exception on bad CSV */
145145

146+
int usecols
147+
146148
int expected_fields
147149
int error_bad_lines
148150
int warn_bad_lines
@@ -350,6 +352,8 @@ cdef class TextReader:
350352
self.compression = compression
351353
self.memory_map = memory_map
352354

355+
self.parser.usecols = (usecols is not None)
356+
353357
self._setup_parser_source(source)
354358
parser_set_default_options(self.parser)
355359

@@ -1208,7 +1212,7 @@ cdef class TextReader:
12081212
else:
12091213
return None
12101214

1211-
class CParserError(Exception):
1215+
class CParserError(ValueError):
12121216
pass
12131217

12141218

pandas/src/parser/tokenizer.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,8 @@ static int end_line(parser_t *self) {
494494
/* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
495495

496496
if (!(self->lines <= self->header_end + 1)
497-
&& (self->expected_fields < 0 && fields > ex_fields)) {
497+
&& (self->expected_fields < 0 && fields > ex_fields)
498+
&& !(self->usecols)) {
498499
// increment file line count
499500
self->file_lines++;
500501

pandas/src/parser/tokenizer.h

+2
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,8 @@ typedef struct parser_t {
184184
int allow_embedded_newline;
185185
int strict; /* raise exception on bad CSV */
186186

187+
int usecols; // Boolean: 1: usecols provided, 0: none provided
188+
187189
int expected_fields;
188190
int error_bad_lines;
189191
int warn_bad_lines;

0 commit comments

Comments
 (0)