Skip to content

Commit 478bbc7

Browse files
author
Nicholas J Riasanovsky
committed
BUG: Support for checking the first row for errors with index_col=False (#40333)
1 parent c8493e3 commit 478bbc7

File tree

6 files changed

+112
-96
lines changed

6 files changed

+112
-96
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,7 @@ I/O
595595
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
596596
- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
597597
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
598+
- Bug in :func:`pandas.read_csv` failing to raise ``ParserError`` when the first row had too many columns and ``index_col=False`` (:issue:`40333`)
598599

599600
Period
600601
^^^^^^

pandas/_libs/parsers.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,8 @@ cdef extern from "parser/tokenizer.h":
215215
int64_t header_start # header row start
216216
uint64_t header_end # header row end
217217

218+
int allow_leading_cols # Boolean: 1: can infer index col, 0: no index col
219+
218220
void *skipset
219221
PyObject *skipfunc
220222
int64_t skip_first_N_rows
@@ -376,6 +378,7 @@ cdef class TextReader:
376378
self.encoding_errors = PyBytes_AsString(encoding_errors)
377379

378380
self.parser = parser_new()
381+
self.parser.allow_leading_cols = allow_leading_cols
379382
self.parser.chunksize = tokenize_chunksize
380383

381384
self.mangle_dupe_cols = mangle_dupe_cols

pandas/_libs/src/parser/tokenizer.c

+69-76
Original file line numberDiff line numberDiff line change
@@ -217,9 +217,7 @@ void parser_free(parser_t *self) {
217217
parser_cleanup(self);
218218
}
219219

220-
void parser_del(parser_t *self) {
221-
free(self);
222-
}
220+
void parser_del(parser_t *self) { free(self); }
223221

224222
static int make_stream_space(parser_t *self, size_t nbytes) {
225223
uint64_t i, cap, length;
@@ -278,9 +276,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
278276
}
279277

280278
self->words =
281-
(char **)grow_buffer((void *)self->words, length,
282-
&self->words_cap, nbytes,
283-
sizeof(char *), &status);
279+
(char **)grow_buffer((void *)self->words, length, &self->words_cap,
280+
nbytes, sizeof(char *), &status);
284281
TRACE(
285282
("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, "
286283
"%d)\n",
@@ -308,10 +305,9 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
308305
LINE VECTORS
309306
*/
310307
cap = self->lines_cap;
311-
self->line_start =
312-
(int64_t *)grow_buffer((void *)self->line_start, self->lines + 1,
313-
&self->lines_cap, nbytes,
314-
sizeof(int64_t), &status);
308+
self->line_start = (int64_t *)grow_buffer((void *)self->line_start,
309+
self->lines + 1, &self->lines_cap,
310+
nbytes, sizeof(int64_t), &status);
315311
TRACE((
316312
"make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
317313
self->lines + 1, self->lines_cap, nbytes, status))
@@ -445,7 +441,7 @@ static int end_line(parser_t *self) {
445441
return 0;
446442
}
447443

448-
if (!(self->lines <= self->header_end + 1) &&
444+
if (!(self->lines <= self->header_end + self->allow_leading_cols) &&
449445
(self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
450446
// increment file line count
451447
self->file_lines++;
@@ -460,8 +456,9 @@ static int end_line(parser_t *self) {
460456
if (self->error_bad_lines) {
461457
self->error_msg = malloc(bufsize);
462458
snprintf(self->error_msg, bufsize,
463-
"Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n",
464-
ex_fields, self->file_lines, fields);
459+
"Expected %d fields in line %" PRIu64 ", saw %" PRId64
460+
"\n",
461+
ex_fields, self->file_lines, fields);
465462

466463
TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
467464

@@ -472,16 +469,16 @@ static int end_line(parser_t *self) {
472469
// pass up error message
473470
msg = malloc(bufsize);
474471
snprintf(msg, bufsize,
475-
"Skipping line %" PRIu64 ": expected %d fields, saw %"
476-
PRId64 "\n", self->file_lines, ex_fields, fields);
472+
"Skipping line %" PRIu64
473+
": expected %d fields, saw %" PRId64 "\n",
474+
self->file_lines, ex_fields, fields);
477475
append_warning(self, msg);
478476
free(msg);
479477
}
480478
}
481479
} else {
482480
// missing trailing delimiters
483-
if ((self->lines >= self->header_end + 1) &&
484-
fields < ex_fields) {
481+
if ((self->lines >= self->header_end + 1) && fields < ex_fields) {
485482
// might overrun the buffer when closing fields
486483
if (make_stream_space(self, ex_fields - fields) < 0) {
487484
int64_t bufsize = 100;
@@ -592,20 +589,20 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes,
592589
593590
*/
594591

595-
#define PUSH_CHAR(c) \
596-
TRACE( \
597-
("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
598-
c, slen, self->stream_cap, self->stream_len)) \
599-
if (slen >= self->stream_cap) { \
600-
TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
601-
self->stream_cap)) \
602-
int64_t bufsize = 100; \
603-
self->error_msg = malloc(bufsize); \
604-
snprintf(self->error_msg, bufsize, \
605-
"Buffer overflow caught - possible malformed input file.\n");\
606-
return PARSER_OUT_OF_MEMORY; \
607-
} \
608-
*stream++ = c; \
592+
#define PUSH_CHAR(c) \
593+
TRACE( \
594+
("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
595+
c, slen, self->stream_cap, self->stream_len)) \
596+
if (slen >= self->stream_cap) { \
597+
TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
598+
self->stream_cap)) \
599+
int64_t bufsize = 100; \
600+
self->error_msg = malloc(bufsize); \
601+
snprintf(self->error_msg, bufsize, \
602+
"Buffer overflow caught - possible malformed input file.\n"); \
603+
return PARSER_OUT_OF_MEMORY; \
604+
} \
605+
*stream++ = c; \
609606
slen++;
610607

611608
// This is a little bit of a hack but works for now
@@ -647,8 +644,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes,
647644

648645
#define END_LINE() END_LINE_STATE(START_RECORD)
649646

650-
#define IS_TERMINATOR(c) \
651-
(c == line_terminator)
647+
#define IS_TERMINATOR(c) (c == line_terminator)
652648

653649
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
654650

@@ -708,25 +704,24 @@ int skip_this_line(parser_t *self, int64_t rownum) {
708704
}
709705
}
710706

711-
int tokenize_bytes(parser_t *self,
712-
size_t line_limit, uint64_t start_lines) {
707+
int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) {
713708
int64_t i;
714709
uint64_t slen;
715710
int should_skip;
716711
char c;
717712
char *stream;
718713
char *buf = self->data + self->datapos;
719714

720-
const char line_terminator = (self->lineterminator == '\0') ?
721-
'\n' : self->lineterminator;
715+
const char line_terminator =
716+
(self->lineterminator == '\0') ? '\n' : self->lineterminator;
722717

723718
// 1000 is something that couldn't fit in "char"
724719
// thus comparing a char to it would always be "false"
725720
const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
726-
const int comment_symbol = (self->commentchar != '\0') ?
727-
self->commentchar : 1000;
728-
const int escape_symbol = (self->escapechar != '\0') ?
729-
self->escapechar : 1000;
721+
const int comment_symbol =
722+
(self->commentchar != '\0') ? self->commentchar : 1000;
723+
const int escape_symbol =
724+
(self->escapechar != '\0') ? self->escapechar : 1000;
730725

731726
if (make_stream_space(self, self->datalen - self->datapos) < 0) {
732727
int64_t bufsize = 100;
@@ -833,7 +828,7 @@ int tokenize_bytes(parser_t *self,
833828
}
834829
break;
835830
}
836-
// fall through
831+
// fall through
837832

838833
case EAT_WHITESPACE:
839834
if (IS_TERMINATOR(c)) {
@@ -1061,10 +1056,10 @@ int tokenize_bytes(parser_t *self,
10611056
} else {
10621057
if (self->delim_whitespace) {
10631058
/* XXX
1064-
* first character of a new record--need to back up and
1065-
* reread
1066-
* to handle properly...
1067-
*/
1059+
* first character of a new record--need to back up and
1060+
* reread
1061+
* to handle properly...
1062+
*/
10681063
i--;
10691064
buf--; // back up one character (HACK!)
10701065
END_LINE_STATE(START_RECORD);
@@ -1144,8 +1139,8 @@ static int parser_handle_eof(parser_t *self) {
11441139
case IN_QUOTED_FIELD:
11451140
self->error_msg = (char *)malloc(bufsize);
11461141
snprintf(self->error_msg, bufsize,
1147-
"EOF inside string starting at row %" PRIu64,
1148-
self->file_lines);
1142+
"EOF inside string starting at row %" PRIu64,
1143+
self->file_lines);
11491144
return -1;
11501145

11511146
case ESCAPED_CHAR:
@@ -1267,8 +1262,8 @@ int parser_trim_buffers(parser_t *self) {
12671262
if (self->words == NULL) {
12681263
return PARSER_OUT_OF_MEMORY;
12691264
}
1270-
self->word_starts = realloc(self->word_starts,
1271-
new_cap * sizeof(int64_t));
1265+
self->word_starts =
1266+
realloc(self->word_starts, new_cap * sizeof(int64_t));
12721267
if (self->word_starts == NULL) {
12731268
return PARSER_OUT_OF_MEMORY;
12741269
}
@@ -1311,15 +1306,13 @@ int parser_trim_buffers(parser_t *self) {
13111306
new_cap = _next_pow2(self->lines) + 1;
13121307
if (new_cap < self->lines_cap) {
13131308
TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
1314-
newptr = realloc(self->line_start,
1315-
new_cap * sizeof(int64_t));
1309+
newptr = realloc(self->line_start, new_cap * sizeof(int64_t));
13161310
if (newptr == NULL) {
13171311
return PARSER_OUT_OF_MEMORY;
13181312
} else {
13191313
self->line_start = newptr;
13201314
}
1321-
newptr = realloc(self->line_fields,
1322-
new_cap * sizeof(int64_t));
1315+
newptr = realloc(self->line_fields, new_cap * sizeof(int64_t));
13231316
if (newptr == NULL) {
13241317
return PARSER_OUT_OF_MEMORY;
13251318
} else {
@@ -1353,8 +1346,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all,
13531346
if (!all && self->lines - start_lines >= nrows) break;
13541347

13551348
if (self->datapos == self->datalen) {
1356-
status = parser_buffer_bytes(self, self->chunksize,
1357-
encoding_errors);
1349+
status =
1350+
parser_buffer_bytes(self, self->chunksize, encoding_errors);
13581351

13591352
if (status == REACHED_EOF) {
13601353
// close out last line
@@ -1413,11 +1406,11 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
14131406
*/
14141407
int to_boolean(const char *item, uint8_t *val) {
14151408
if (strcasecmp(item, "TRUE") == 0) {
1416-
*val = 1;
1417-
return 0;
1409+
*val = 1;
1410+
return 0;
14181411
} else if (strcasecmp(item, "FALSE") == 0) {
1419-
*val = 0;
1420-
return 0;
1412+
*val = 0;
1413+
return 0;
14211414
}
14221415

14231416
return -1;
@@ -1611,9 +1604,9 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
16111604
return number;
16121605
}
16131606

1614-
double precise_xstrtod(const char *str, char **endptr, char decimal,
1615-
char sci, char tsep, int skip_trailing,
1616-
int *error, int *maybe_int) {
1607+
double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
1608+
char tsep, int skip_trailing, int *error,
1609+
int *maybe_int) {
16171610
double number;
16181611
int exponent;
16191612
int negative;
@@ -1751,7 +1744,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17511744
} else if (exponent > 0) {
17521745
number *= e[exponent];
17531746
} else if (exponent < -308) { // Subnormal
1754-
if (exponent < -616) { // Prevent invalid array access.
1747+
if (exponent < -616) { // Prevent invalid array access.
17551748
number = 0.;
17561749
} else {
17571750
number /= e[-308 - exponent];
@@ -1779,7 +1772,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17791772
with a call to `free`.
17801773
*/
17811774

1782-
char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
1775+
char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
17831776
char tsep) {
17841777
const char *p = s;
17851778
size_t length = strlen(s);
@@ -1796,17 +1789,15 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
17961789
}
17971790
// Replace `decimal` with '.'
17981791
if (*p == decimal) {
1799-
*dst++ = '.';
1800-
p++;
1792+
*dst++ = '.';
1793+
p++;
18011794
}
18021795
// Copy the remainder of the string as is.
18031796
strncpy(dst, p, length + 1 - (p - s));
1804-
if (endpos != NULL)
1805-
*endpos = (char *)(s + length);
1797+
if (endpos != NULL) *endpos = (char *)(s + length);
18061798
return s_copy;
18071799
}
18081800

1809-
18101801
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
18111802
int skip_trailing, int *error, int *maybe_int) {
18121803
// 'normalize' representation to C-locale; replace decimal with '.' and
@@ -1822,20 +1813,22 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
18221813
// PyOS_string_to_double needs to consume the whole string
18231814
if (endpc == pc + strlen(pc)) {
18241815
if (q != NULL) {
1825-
// report endptr from source string (p)
1816+
// report endptr from source string (p)
18261817
*q = endptr;
18271818
}
18281819
} else {
18291820
*error = -1;
18301821
if (q != NULL) {
1831-
// p and pc are different len due to tsep removal. Can't report
1832-
// how much it has consumed of p. Just rewind to beginning.
1833-
*q = (char *)p; // TODO(willayd): this could be undefined behavior
1822+
// p and pc are different len due to tsep removal. Can't report
1823+
// how much it has consumed of p. Just rewind to beginning.
1824+
*q = (char *)p; // TODO(willayd): this could be undefined behavior
18341825
}
18351826
}
18361827
if (maybe_int != NULL) *maybe_int = 0;
1837-
if (PyErr_Occurred() != NULL) *error = -1;
1838-
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
1828+
if (PyErr_Occurred() != NULL)
1829+
*error = -1;
1830+
else if (r == Py_HUGE_VAL)
1831+
*error = (int)Py_HUGE_VAL;
18391832
PyErr_Clear();
18401833

18411834
PyGILState_Release(gstate);

0 commit comments

Comments
 (0)