
Commit 6f1965a

BUG: Corrects stopping logic when nrows argument is supplied (Fixes #7626)
Fixed code formatting; added a test to the C Parser Only suite; added a whatsnew entry.

1 parent 725453d

4 files changed: +38, -5 lines

- doc/source/whatsnew/v0.19.2.txt (+1)
- pandas/io/tests/parser/c_parser_only.py (+17)
- pandas/io/tests/parser/common.py (+17)
- pandas/src/parser/tokenizer.c (+3, -5)

doc/source/whatsnew/v0.19.2.txt (+1)

@@ -67,6 +67,7 @@ Bug Fixes

 - Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
+- Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`)
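
To make the entry concrete, here is a minimal sketch of the behaviour the fix restores; the file contents and sizes are illustrative, not taken from the commit. Previously, with the C engine, inputs larger than one internal 262144-byte buffer could come back with more rows than nrows requested:

    import pandas as pd
    from io import StringIO

    # Build a CSV comfortably larger than one 262144-byte parser buffer.
    header = ','.join('c%d' % i for i in range(10)) + '\n'
    row = ','.join('x' * 25 for _ in range(10)) + '\n'   # 260 bytes per row
    big_csv = header + row * 2000                        # ~520 KB of data rows

    # With the fix, the C engine stops after exactly nrows data rows.
    df = pd.read_csv(StringIO(big_csv), engine='c', nrows=100)
    assert len(df) == 100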

pandas/io/tests/parser/c_parser_only.py (+17)

@@ -371,3 +371,20 @@ def test_internal_null_byte(self):

         result = self.read_csv(StringIO(data), names=names)
         tm.assert_frame_equal(result, expected)
+
+    def test_read_nrows_large(self):
+        # gh-7626 - Read only nrows of data in for large inputs (>262144b)
+        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
+                                   for i in range(10)]) + '\n'
+        data_narrow = '\t'.join(['somedatasomedatasomedata1'
+                                 for i in range(10)]) + '\n'
+        header_wide = '\t'.join(['COL_HEADER_' + str(i)
+                                 for i in range(15)]) + '\n'
+        data_wide = '\t'.join(['somedatasomedatasomedata2'
+                               for i in range(15)]) + '\n'
+        test_input = (header_narrow + data_narrow * 1050 +
+                      header_wide + data_wide * 2)
+
+        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
+
+        self.assertTrue(df.size == 1010 * 10)
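
A note on the sizes in this test: each narrow data row is 10 fields of 25 bytes plus 9 tabs and a newline, i.e. 260 bytes, so the 1050 narrow rows alone span about 273 KB and cross the 262144-byte chunk boundary the comment mentions. The wide 15-field block sits past row 1050, so if the parser overshot nrows=1010 it would presumably hit lines with the wrong field count rather than pass silently. A quick sanity check of that arithmetic (assuming one byte per ASCII character):

    row = '\t'.join(['somedatasomedatasomedata1' for i in range(10)]) + '\n'
    assert len(row) == 260
    assert len(row) * 1050 > 262144   # the narrow block alone spans >1 chunk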

pandas/io/tests/parser/common.py (+17)

@@ -427,6 +427,23 @@ def test_read_nrows(self):
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows='foo')

+    def test_read_nrows_large(self):
+        # GH-7626 - Read only nrows of data in for large inputs (>262144b)
+        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
+                                   for i in range(10)]) + '\n'
+        data_narrow = '\t'.join(['somedatasomedatasomedata1'
+                                 for i in range(10)]) + '\n'
+        header_wide = '\t'.join(['COL_HEADER_' + str(i)
+                                 for i in range(15)]) + '\n'
+        data_wide = '\t'.join(['somedatasomedatasomedata2'
+                               for i in range(15)]) + '\n'
+        test_input = (header_narrow + data_narrow * 1050 +
+                      header_wide + data_wide * 2)
+
+        df = self.read_csv(StringIO(test_input), sep="\t", nrows=1010)
+
+        self.assertTrue(df.size == 1010 * 10)
+
     def test_read_chunksize(self):
         reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
         df = self.read_csv(StringIO(self.data1), index_col=0)
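
The same test is added here in the shared suite as well; common.py tests run against every parser engine, so the Python engine's stopping behaviour is exercised alongside the C engine's (the only difference from the C-only copy is the quoting of the separator).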

pandas/src/parser/tokenizer.c (+3, -5)

@@ -726,16 +726,14 @@ int skip_this_line(parser_t *self, int64_t rownum) {
     }
 }

-int tokenize_bytes(parser_t *self, size_t line_limit)
+int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines)
 {
-    int i, slen, start_lines;
+    int i, slen;
     long maxstreamsize;
     char c;
     char *stream;
     char *buf = self->data + self->datapos;

-    start_lines = self->lines;
-
     if (make_stream_space(self, self->datalen - self->datapos) < 0) {
         self->error_msg = "out of memory";
         return -1;

@@ -1384,7 +1382,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
         TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n",
                self->datalen - self->datapos, self->datalen, self->datapos));

-        status = tokenize_bytes(self, nrows);
+        status = tokenize_bytes(self, nrows, start_lines);

         if (status < 0) {
             // XXX
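
What the C change accomplishes: tokenize_bytes() used to reset start_lines = self->lines at the top of every call, but _tokenize_helper() invokes it once per buffered chunk of input. For inputs spanning multiple chunks the baseline was therefore recomputed mid-read, and the comparison against line_limit (presumably of the form self->lines - start_lines >= line_limit) could never trigger, so nrows was ignored. Threading start_lines in from _tokenize_helper() keeps a single baseline for the whole read. A minimal Python analogue of the pattern, with illustrative names rather than the pandas internals:

    # Stop after `limit` lines across multiple fixed-size chunks.
    def read_lines(chunks, limit):
        lines = 0
        start = lines  # the fix: capture the baseline once per read ...
        out = []
        for chunk in chunks:
            # ... whereas the buggy version effectively recomputed
            # `start = lines` here, at the top of each chunk, so
            # `lines - start` never reached `limit`.
            for line in chunk:
                if lines - start >= limit:
                    return out
                out.append(line)
                lines += 1
        return out

    chunks = [['row'] * 1000 for _ in range(3)]
    assert len(read_lines(chunks, 1010)) == 1010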
