Skip to content

BUG (GH3967) csv parsers would loop infinitely if iterator=True but no chunksize specified #3978

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 21, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ pandas 0.11.1
- Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing
two integer arrays with at least 10000 cells total (:issue:`3764`)
- Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`)
- csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
specified (:issue:`3967`), python parser failing with ``chunksize=1``

.. _Gh3616: https://github.com/pydata/pandas/issues/3616

Expand Down
86 changes: 50 additions & 36 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def _read(filepath_or_buffer, kwds):
kwds['parse_dates'] = True

# Extract some of the arguments (pass chunksize on).
iterator = kwds.pop('iterator', False)
iterator = kwds.get('iterator', False)
nrows = kwds.pop('nrows', None)
chunksize = kwds.get('chunksize', None)

Expand Down Expand Up @@ -569,8 +569,11 @@ def _clean_options(self, options, engine):

def __iter__(self):
try:
while True:
yield self.read(self.chunksize)
if self.chunksize:
while True:
yield self.read(self.chunksize)
else:
yield self.read()
except StopIteration:
pass

Expand Down Expand Up @@ -1594,47 +1597,58 @@ def _rows_to_cols(self, content):
def _get_lines(self, rows=None):
source = self.data
lines = self.buf
new_rows = None

# already fetched some number
if rows is not None:
rows -= len(self.buf)

if isinstance(source, list):
if self.pos > len(source):
raise StopIteration
if rows is None:
lines.extend(source[self.pos:])
self.pos = len(source)
# we already have the lines in the buffer
if len(self.buf) >= rows:
new_rows, self.buf = self.buf[:rows], self.buf[rows:]

# need some lines
else:
lines.extend(source[self.pos:self.pos + rows])
self.pos += rows
else:
new_rows = []
try:
if rows is not None:
for _ in xrange(rows):
new_rows.append(next(source))
lines.extend(new_rows)
rows -= len(self.buf)

if new_rows is None:
if isinstance(source, list):
if self.pos > len(source):
raise StopIteration
if rows is None:
lines.extend(source[self.pos:])
self.pos = len(source)
else:
rows = 0
while True:
try:
lines.extend(source[self.pos:self.pos + rows])
self.pos += rows
else:
new_rows = []
try:
if rows is not None:
for _ in xrange(rows):
new_rows.append(next(source))
rows += 1
except csv.Error, inst:
if 'newline inside string' in str(inst):
row_num = str(self.pos + rows)
msg = ('EOF inside string starting with line '
+ row_num)
raise Exception(msg)
raise
except StopIteration:
lines.extend(new_rows)
if len(lines) == 0:
raise
self.pos += len(new_rows)
lines.extend(new_rows)
else:
rows = 0
while True:
try:
new_rows.append(next(source))
rows += 1
except csv.Error, inst:
if 'newline inside string' in str(inst):
row_num = str(self.pos + rows)
msg = ('EOF inside string starting with line '
+ row_num)
raise Exception(msg)
raise
except StopIteration:
lines.extend(new_rows)
if len(lines) == 0:
raise
self.pos += len(new_rows)

self.buf = []
self.buf = []
else:
lines = new_rows

if self.skip_footer:
lines = lines[:-self.skip_footer]
Expand Down
18 changes: 18 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,24 @@ def test_iterator(self):
iterator=True)
self.assert_(isinstance(treader, TextFileReader))

# stopping iteration when no chunksize is specified, GH 3967
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
reader = self.read_csv(StringIO(data), iterator=True)
result = list(reader)
expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
tm.assert_frame_equal(result[0], expected)

# chunksize = 1
reader = self.read_csv(StringIO(data), chunksize=1)
result = list(reader)
expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
self.assert_(len(result) == 3)
tm.assert_frame_equal(pd.concat(result), expected)

def test_header_not_first_line(self):
data = """got,to,ignore,this,line
got,to,ignore,this,line
Expand Down