INT: BaseIterator for parsers' common logic

toobaz · jreback · commit 45a83a08c9af · 2016-02-06T15:02:33.000-05:00
closes #12173 closes #12153
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -9,7 +9,7 @@
 
 from pandas.compat import StringIO, BytesIO, string_types, text_type
 from pandas import compat
-from pandas.core.common import pprint_thing, is_number
+from pandas.core.common import pprint_thing, is_number, AbstractMethodError
 
 
 try:
@@ -59,6 +59,20 @@ class DtypeWarning(Warning):
     pass
 
 
+class BaseIterator(object):
+    """Subclass this and define "__next__()" to obtain a one-pass iterator.
+    "__iter__()" returns self, so this only suits objects consumed as they are
+    iterated (e.g. a parser or a table's iterator, not an in-memory table)."""
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise AbstractMethodError(self)  # subclasses must override
+
+if not compat.PY3:  # Python 2 calls next(), so alias it to __next__()
+    BaseIterator.next = lambda self: self.__next__()
+
+
 try:
     from boto.s3 import key
 
@@ -394,7 +408,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
     def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
         return csv.writer(f, dialect=dialect, **kwds)
 else:
-    class UnicodeReader:
+    class UnicodeReader(BaseIterator):
 
         """
         A CSV reader which will iterate over lines in the CSV file "f",
@@ -408,16 +422,10 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
             f = UTF8Recoder(f, encoding)
             self.reader = csv.reader(f, dialect=dialect, **kwds)
 
-        def next(self):
+        def __next__(self):
             row = next(self.reader)
             return [compat.text_type(s, "utf-8") for s in row]
 
-        # python 3 iterator
-        __next__ = next
-
-        def __iter__(self):  # pragma: no cover
-            return self
-
     class UnicodeWriter:
 
         """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -19,7 +19,8 @@
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
-                              _get_handle, UnicodeReader, UTF8Recoder)
+                              _get_handle, UnicodeReader, UTF8Recoder,
+                              BaseIterator)
 from pandas.tseries import tools
 
 from pandas.util.decorators import Appender
@@ -545,7 +546,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
 ])
 
 
-class TextFileReader(object):
+class TextFileReader(BaseIterator):
     """
 
     Passed dialect overrides any of the related parser options
@@ -724,15 +725,8 @@ def _clean_options(self, options, engine):
 
         return result, engine
 
-    def __iter__(self):
-        try:
-            if self.chunksize:
-                while True:
-                    yield self.read(self.chunksize)
-            else:
-                yield self.read()
-        except StopIteration:
-            pass
+    def __next__(self):  # BaseIterator supplies __iter__ and, on Py2, next()
+        return self.get_chunk()
 
     def _make_engine(self, engine='c'):
         if engine == 'c':
@@ -2363,7 +2357,7 @@ def _concat_date_cols(date_cols):
     return rs
 
 
-class FixedWidthReader(object):
+class FixedWidthReader(BaseIterator):
     """
     A reader of fixed-width lines.
     """
@@ -2417,7 +2411,7 @@ def detect_colspecs(self, n=100):
         edges = np.where((mask ^ shifted) == 1)[0]
         return list(zip(edges[::2], edges[1::2]))
 
-    def next(self):
+    def __next__(self):
         if self.buffer is not None:
             try:
                 line = next(self.buffer)
@@ -2430,9 +2424,6 @@ def next(self):
         return [line[fromm:to].strip(self.delimiter)
                 for (fromm, to) in self.colspecs]
 
-    # Iterator protocol in Python 3 uses __next__()
-    __next__ = next
-
 
 class FixedWidthFieldParser(PythonParser):
     """
diff --git a/pandas/io/sas.py b/pandas/io/sas.py
@@ -10,7 +10,7 @@
 
 from datetime import datetime
 import pandas as pd
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, BaseIterator
 from pandas import compat
 import struct
 import numpy as np
@@ -242,7 +242,7 @@ def _parse_float_vec(vec):
     return ieee
 
 
-class XportReader(object):
+class XportReader(BaseIterator):
     __doc__ = _xport_reader_doc
 
     def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
@@ -369,15 +369,8 @@ def _read_header(self):
         dtype = np.dtype(dtypel)
         self._dtype = dtype
 
-    def __iter__(self):
-        try:
-            if self._chunksize:
-                while True:
-                    yield self.read(self._chunksize)
-            else:
-                yield self.read()
-        except StopIteration:
-            pass
+    def __next__(self):
+        return self.read(nrows=self._chunksize or 1)  # 1 row if no chunksize
 
     def _record_count(self):
         """
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -25,7 +25,7 @@
 from pandas.util.decorators import Appender
 import pandas as pd
 import pandas.core.common as com
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, BaseIterator
 from pandas.lib import max_len_string_array, infer_dtype
 from pandas.tslib import NaT, Timestamp
 
@@ -907,7 +907,7 @@ def _decode_bytes(self, str, errors=None):
             return str
 
 
-class StataReader(StataParser):
+class StataReader(StataParser, BaseIterator):
     __doc__ = _stata_reader_doc
 
     def __init__(self, path_or_buf, convert_dates=True,
@@ -1377,15 +1377,8 @@ def data(self, **kwargs):
 
         return self.read(None, **kwargs)
 
-    def __iter__(self):
-        try:
-            if self._chunksize:
-                while True:
-                    yield self.read(self._chunksize)
-            else:
-                yield self.read()
-        except StopIteration:
-            pass
+    def __next__(self):
+        return self.read(nrows=self._chunksize or 1)  # 1 row if no chunksize
 
     def get_chunk(self, size=None):
         """
diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py
@@ -9,6 +9,8 @@
 
 from pandas.io import common
 
+from pandas import read_csv, concat
+
 try:
     from pathlib import Path
 except ImportError:
@@ -21,6 +23,14 @@
 
 
 class TestCommonIOCapabilities(tm.TestCase):
+    data1 = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
 
     def test_expand_user(self):
         filename = '~/sometest'
@@ -64,3 +74,16 @@ def test_get_filepath_or_buffer_with_buffer(self):
         input_buffer = StringIO()
         filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
         self.assertEqual(filepath_or_buffer, input_buffer)
+
+    def test_iterator(self):
+        reader = read_csv(StringIO(self.data1), chunksize=1)
+        result = concat(reader, ignore_index=True)
+        expected = read_csv(StringIO(self.data1))
+        tm.assert_frame_equal(result, expected)
+
+        # GH12153: next() on the reader, then iterate the remainder
+        it = read_csv(StringIO(self.data1), chunksize=1)
+        first = next(it)
+        tm.assert_frame_equal(first, expected.iloc[[0]])
+        expected.index = [0 for i in range(len(expected))]
+        tm.assert_frame_equal(concat(it), expected.iloc[1:])
diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/test_sas.py
@@ -88,6 +88,11 @@ def test1_incremental(self):
 
         tm.assert_frame_equal(data, data_csv, check_index_type=False)
 
+        reader = XportReader(self.file01, index="SEQN", chunksize=1000)
+        data = pd.concat(reader, axis=0)
+
+        tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
     def test2(self):
         # Test with SSHSV1_A.XPT
 
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -1033,6 +1033,10 @@ def test_iterator(self):
         chunk = itr.get_chunk()
         tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
 
+        # GH12153
+        from_chunks = pd.concat(read_stata(fname, chunksize=4))
+        tm.assert_frame_equal(parsed, from_chunks)
+
     def test_read_chunks_115(self):
         files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
                      self.dta14_115, self.dta15_115, self.dta16_115,