Skip to content

Commit 78a71b1

Browse files
committed
Merge pull request #3978 from jreback/parser_iterator
BUG (GH3967) csv parsers would loop infinitely if iterator=True but no chunksize specified
2 parents 40064ec + 79e81a0 commit 78a71b1

File tree

3 files changed

+70
-36
lines changed

3 files changed

+70
-36
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,8 @@ pandas 0.11.1
258258
- Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing
259259
two integer arrays with at least 10000 cells total (:issue:`3764`)
260260
- Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`)
261+
- csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
262+
specified (:issue:`3967`); the python parser would also fail with ``chunksize=1``
261263

262264
.. _Gh3616: https://github.com/pydata/pandas/issues/3616
263265

pandas/io/parsers.py

+50-36
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def _read(filepath_or_buffer, kwds):
186186
kwds['parse_dates'] = True
187187

188188
# Extract some of the arguments (pass chunksize on).
189-
iterator = kwds.pop('iterator', False)
189+
iterator = kwds.get('iterator', False)
190190
nrows = kwds.pop('nrows', None)
191191
chunksize = kwds.get('chunksize', None)
192192

@@ -569,8 +569,11 @@ def _clean_options(self, options, engine):
569569

570570
def __iter__(self):
571571
try:
572-
while True:
573-
yield self.read(self.chunksize)
572+
if self.chunksize:
573+
while True:
574+
yield self.read(self.chunksize)
575+
else:
576+
yield self.read()
574577
except StopIteration:
575578
pass
576579

@@ -1594,47 +1597,58 @@ def _rows_to_cols(self, content):
15941597
def _get_lines(self, rows=None):
15951598
source = self.data
15961599
lines = self.buf
1600+
new_rows = None
15971601

15981602
# already fetched some number
15991603
if rows is not None:
1600-
rows -= len(self.buf)
16011604

1602-
if isinstance(source, list):
1603-
if self.pos > len(source):
1604-
raise StopIteration
1605-
if rows is None:
1606-
lines.extend(source[self.pos:])
1607-
self.pos = len(source)
1605+
# we already have the lines in the buffer
1606+
if len(self.buf) >= rows:
1607+
new_rows, self.buf = self.buf[:rows], self.buf[rows:]
1608+
1609+
# need some lines
16081610
else:
1609-
lines.extend(source[self.pos:self.pos + rows])
1610-
self.pos += rows
1611-
else:
1612-
new_rows = []
1613-
try:
1614-
if rows is not None:
1615-
for _ in xrange(rows):
1616-
new_rows.append(next(source))
1617-
lines.extend(new_rows)
1611+
rows -= len(self.buf)
1612+
1613+
if new_rows is None:
1614+
if isinstance(source, list):
1615+
if self.pos > len(source):
1616+
raise StopIteration
1617+
if rows is None:
1618+
lines.extend(source[self.pos:])
1619+
self.pos = len(source)
16181620
else:
1619-
rows = 0
1620-
while True:
1621-
try:
1621+
lines.extend(source[self.pos:self.pos + rows])
1622+
self.pos += rows
1623+
else:
1624+
new_rows = []
1625+
try:
1626+
if rows is not None:
1627+
for _ in xrange(rows):
16221628
new_rows.append(next(source))
1623-
rows += 1
1624-
except csv.Error, inst:
1625-
if 'newline inside string' in str(inst):
1626-
row_num = str(self.pos + rows)
1627-
msg = ('EOF inside string starting with line '
1628-
+ row_num)
1629-
raise Exception(msg)
1630-
raise
1631-
except StopIteration:
1632-
lines.extend(new_rows)
1633-
if len(lines) == 0:
1634-
raise
1635-
self.pos += len(new_rows)
1629+
lines.extend(new_rows)
1630+
else:
1631+
rows = 0
1632+
while True:
1633+
try:
1634+
new_rows.append(next(source))
1635+
rows += 1
1636+
except csv.Error, inst:
1637+
if 'newline inside string' in str(inst):
1638+
row_num = str(self.pos + rows)
1639+
msg = ('EOF inside string starting with line '
1640+
+ row_num)
1641+
raise Exception(msg)
1642+
raise
1643+
except StopIteration:
1644+
lines.extend(new_rows)
1645+
if len(lines) == 0:
1646+
raise
1647+
self.pos += len(new_rows)
16361648

1637-
self.buf = []
1649+
self.buf = []
1650+
else:
1651+
lines = new_rows
16381652

16391653
if self.skip_footer:
16401654
lines = lines[:-self.skip_footer]

pandas/io/tests/test_parsers.py

+18
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,24 @@ def test_iterator(self):
10371037
iterator=True)
10381038
self.assert_(isinstance(treader, TextFileReader))
10391039

1040+
# stopping iteration when no chunksize is specified, GH 3967
1041+
data = """A,B,C
1042+
foo,1,2,3
1043+
bar,4,5,6
1044+
baz,7,8,9
1045+
"""
1046+
reader = self.read_csv(StringIO(data), iterator=True)
1047+
result = list(reader)
1048+
expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
1049+
tm.assert_frame_equal(result[0], expected)
1050+
1051+
# chunksize = 1
1052+
reader = self.read_csv(StringIO(data), chunksize=1)
1053+
result = list(reader)
1054+
expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
1055+
self.assert_(len(result) == 3)
1056+
tm.assert_frame_equal(pd.concat(result), expected)
1057+
10401058
def test_header_not_first_line(self):
10411059
data = """got,to,ignore,this,line
10421060
got,to,ignore,this,line

0 commit comments

Comments
 (0)