Skip to content

Commit 45a83a0

Browse files
toobaz authored and jreback committed
INT: BaseIterator for parsers' common logic
closes #12173 closes #12153
1 parent 87a9880 commit 45a83a0

File tree

7 files changed

+64
-47
lines changed

7 files changed

+64
-47
lines changed

pandas/io/common.py

+17 -9
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010
from pandas.compat import StringIO, BytesIO, string_types, text_type
1111
from pandas import compat
12-
from pandas.core.common import pprint_thing, is_number
12+
from pandas.core.common import pprint_thing, is_number, AbstractMethodError
1313

1414

1515
try:
@@ -59,6 +59,20 @@ class DtypeWarning(Warning):
5959
pass
6060

6161

62+
class BaseIterator(object):
63+
"""Subclass this and provide a "__next__()" method to obtain an iterator.
64+
Useful only when the object being iterated is non-reusable (e.g. OK for a
65+
parser, not for an in-memory table, yes for its iterator)."""
66+
def __iter__(self):
67+
return self
68+
69+
def __next__(self):
70+
raise AbstractMethodError(self)
71+
72+
if not compat.PY3:
73+
BaseIterator.next = lambda self: self.__next__()
74+
75+
6276
try:
6377
from boto.s3 import key
6478

@@ -394,7 +408,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
394408
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
395409
return csv.writer(f, dialect=dialect, **kwds)
396410
else:
397-
class UnicodeReader:
411+
class UnicodeReader(BaseIterator):
398412

399413
"""
400414
A CSV reader which will iterate over lines in the CSV file "f",
@@ -408,16 +422,10 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
408422
f = UTF8Recoder(f, encoding)
409423
self.reader = csv.reader(f, dialect=dialect, **kwds)
410424

411-
def next(self):
425+
def __next__(self):
412426
row = next(self.reader)
413427
return [compat.text_type(s, "utf-8") for s in row]
414428

415-
# python 3 iterator
416-
__next__ = next
417-
418-
def __iter__(self): # pragma: no cover
419-
return self
420-
421429
class UnicodeWriter:
422430

423431
"""

pandas/io/parsers.py

+7 -16
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,8 @@
1919
from pandas.core.config import get_option
2020
from pandas.io.date_converters import generic_parser
2121
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
22-
_get_handle, UnicodeReader, UTF8Recoder)
22+
_get_handle, UnicodeReader, UTF8Recoder,
23+
BaseIterator)
2324
from pandas.tseries import tools
2425

2526
from pandas.util.decorators import Appender
@@ -545,7 +546,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
545546
])
546547

547548

548-
class TextFileReader(object):
549+
class TextFileReader(BaseIterator):
549550
"""
550551
551552
Passed dialect overrides any of the related parser options
@@ -724,15 +725,8 @@ def _clean_options(self, options, engine):
724725

725726
return result, engine
726727

727-
def __iter__(self):
728-
try:
729-
if self.chunksize:
730-
while True:
731-
yield self.read(self.chunksize)
732-
else:
733-
yield self.read()
734-
except StopIteration:
735-
pass
728+
def __next__(self):
729+
return self.get_chunk()
736730

737731
def _make_engine(self, engine='c'):
738732
if engine == 'c':
@@ -2363,7 +2357,7 @@ def _concat_date_cols(date_cols):
23632357
return rs
23642358

23652359

2366-
class FixedWidthReader(object):
2360+
class FixedWidthReader(BaseIterator):
23672361
"""
23682362
A reader of fixed-width lines.
23692363
"""
@@ -2417,7 +2411,7 @@ def detect_colspecs(self, n=100):
24172411
edges = np.where((mask ^ shifted) == 1)[0]
24182412
return list(zip(edges[::2], edges[1::2]))
24192413

2420-
def next(self):
2414+
def __next__(self):
24212415
if self.buffer is not None:
24222416
try:
24232417
line = next(self.buffer)
@@ -2430,9 +2424,6 @@ def next(self):
24302424
return [line[fromm:to].strip(self.delimiter)
24312425
for (fromm, to) in self.colspecs]
24322426

2433-
# Iterator protocol in Python 3 uses __next__()
2434-
__next__ = next
2435-
24362427

24372428
class FixedWidthFieldParser(PythonParser):
24382429
"""

pandas/io/sas.py

+4 -11
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
from datetime import datetime
1212
import pandas as pd
13-
from pandas.io.common import get_filepath_or_buffer
13+
from pandas.io.common import get_filepath_or_buffer, BaseIterator
1414
from pandas import compat
1515
import struct
1616
import numpy as np
@@ -242,7 +242,7 @@ def _parse_float_vec(vec):
242242
return ieee
243243

244244

245-
class XportReader(object):
245+
class XportReader(BaseIterator):
246246
__doc__ = _xport_reader_doc
247247

248248
def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
@@ -369,15 +369,8 @@ def _read_header(self):
369369
dtype = np.dtype(dtypel)
370370
self._dtype = dtype
371371

372-
def __iter__(self):
373-
try:
374-
if self._chunksize:
375-
while True:
376-
yield self.read(self._chunksize)
377-
else:
378-
yield self.read()
379-
except StopIteration:
380-
pass
372+
def __next__(self):
373+
return self.read(nrows=self._chunksize or 1)
381374

382375
def _record_count(self):
383376
"""

pandas/io/stata.py

+4 -11
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
from pandas.util.decorators import Appender
2626
import pandas as pd
2727
import pandas.core.common as com
28-
from pandas.io.common import get_filepath_or_buffer
28+
from pandas.io.common import get_filepath_or_buffer, BaseIterator
2929
from pandas.lib import max_len_string_array, infer_dtype
3030
from pandas.tslib import NaT, Timestamp
3131

@@ -907,7 +907,7 @@ def _decode_bytes(self, str, errors=None):
907907
return str
908908

909909

910-
class StataReader(StataParser):
910+
class StataReader(StataParser, BaseIterator):
911911
__doc__ = _stata_reader_doc
912912

913913
def __init__(self, path_or_buf, convert_dates=True,
@@ -1377,15 +1377,8 @@ def data(self, **kwargs):
13771377

13781378
return self.read(None, **kwargs)
13791379

1380-
def __iter__(self):
1381-
try:
1382-
if self._chunksize:
1383-
while True:
1384-
yield self.read(self._chunksize)
1385-
else:
1386-
yield self.read()
1387-
except StopIteration:
1388-
pass
1380+
def __next__(self):
1381+
return self.read(nrows=self._chunksize or 1)
13891382

13901383
def get_chunk(self, size=None):
13911384
"""

pandas/io/tests/test_common.py

+23
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99

1010
from pandas.io import common
1111

12+
from pandas import read_csv, concat
13+
1214
try:
1315
from pathlib import Path
1416
except ImportError:
@@ -21,6 +23,14 @@
2123

2224

2325
class TestCommonIOCapabilities(tm.TestCase):
26+
data1 = """index,A,B,C,D
27+
foo,2,3,4,5
28+
bar,7,8,9,10
29+
baz,12,13,14,15
30+
qux,12,13,14,15
31+
foo2,12,13,14,15
32+
bar2,12,13,14,15
33+
"""
2434

2535
def test_expand_user(self):
2636
filename = '~/sometest'
@@ -64,3 +74,16 @@ def test_get_filepath_or_buffer_with_buffer(self):
6474
input_buffer = StringIO()
6575
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
6676
self.assertEqual(filepath_or_buffer, input_buffer)
77+
78+
def test_iterator(self):
79+
reader = read_csv(StringIO(self.data1), chunksize=1)
80+
result = concat(reader, ignore_index=True)
81+
expected = read_csv(StringIO(self.data1))
82+
tm.assert_frame_equal(result, expected)
83+
84+
# GH12153
85+
it = read_csv(StringIO(self.data1), chunksize=1)
86+
first = next(it)
87+
tm.assert_frame_equal(first, expected.iloc[[0]])
88+
expected.index = [0 for i in range(len(expected))]
89+
tm.assert_frame_equal(concat(it), expected.iloc[1:])

pandas/io/tests/test_sas.py

+5
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,11 @@ def test1_incremental(self):
8888

8989
tm.assert_frame_equal(data, data_csv, check_index_type=False)
9090

91+
reader = XportReader(self.file01, index="SEQN", chunksize=1000)
92+
data = pd.concat(reader, axis=0)
93+
94+
tm.assert_frame_equal(data, data_csv, check_index_type=False)
95+
9196
def test2(self):
9297
# Test with SSHSV1_A.XPT
9398

pandas/io/tests/test_stata.py

+4
Original file line number | Diff line number | Diff line change
@@ -1033,6 +1033,10 @@ def test_iterator(self):
10331033
chunk = itr.get_chunk()
10341034
tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
10351035

1036+
# GH12153
1037+
from_chunks = pd.concat(read_stata(fname, chunksize=4))
1038+
tm.assert_frame_equal(parsed, from_chunks)
1039+
10361040
def test_read_chunks_115(self):
10371041
files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
10381042
self.dta14_115, self.dta15_115, self.dta16_115,

0 commit comments

Comments
 (0)