From c725222e9ee0b502a71b346f65a98a9152261b24 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 29 Jan 2016 23:15:12 +0100 Subject: [PATCH 1/3] REF: BaseIterator for parsers' common logic --- pandas/io/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c5f433ceaab4b..ef5f73a1c7335 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,7 @@ from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat -from pandas.core.common import pprint_thing, is_number +from pandas.core.common import pprint_thing, is_number, AbstractMethodError try: @@ -59,6 +59,20 @@ class DtypeWarning(Warning): pass +class BaseIterator(object): + """Subclass this and provide a "__next__()" method to obtain an iterator. + Useful only when the object being iterated is non-reusable (e.g. OK for a + parser, not for an in-memory table, yes for its iterator).""" + def __iter__(self): + return self + + def __next__(self): + raise AbstractMethodError(self) + +if not compat.PY3: + BaseIterator.next = lambda self: self.__next__() + + try: from boto.s3 import key From b54d81520e5850d34aa51cb376904233df81c7c0 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 29 Jan 2016 23:21:41 +0100 Subject: [PATCH 2/3] BUG: Inherit iterator parsers from BaseIterator closes #12153 --- pandas/io/common.py | 10 ++-------- pandas/io/parsers.py | 23 +++++++---------------- pandas/io/sas.py | 15 ++++----------- pandas/io/stata.py | 15 ++++----------- 4 files changed, 17 insertions(+), 46 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index ef5f73a1c7335..8c9c348b9a11c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -408,7 +408,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): return csv.writer(f, dialect=dialect, **kwds) else: - class UnicodeReader: + class UnicodeReader(BaseIterator): """ A CSV reader which will iterate over lines in the CSV file "f", @@ -422,16 +422,10 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): f = UTF8Recoder(f, encoding) self.reader = csv.reader(f, dialect=dialect, **kwds) - def next(self): + def __next__(self): row = next(self.reader) return [compat.text_type(s, "utf-8") for s in row] - # python 3 iterator - __next__ = next - - def __iter__(self): # pragma: no cover - return self - class UnicodeWriter: """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dc6923b752ac7..1593716097985 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -19,7 +19,8 @@ from pandas.core.config import get_option from pandas.io.date_converters import generic_parser from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, - _get_handle, UnicodeReader, UTF8Recoder) + _get_handle, UnicodeReader, UTF8Recoder, + BaseIterator) from pandas.tseries import tools from pandas.util.decorators import Appender @@ -545,7 +546,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds): ]) -class TextFileReader(object): +class TextFileReader(BaseIterator): """ Passed dialect overrides any of the related parser options @@ -724,15 +725,8 @@ def _clean_options(self, options, engine): return result, engine - def __iter__(self): - try: - if self.chunksize: - while True: - yield self.read(self.chunksize) - else: - yield self.read() - except StopIteration: - pass + def __next__(self): + return self.get_chunk() def _make_engine(self, engine='c'): if engine == 'c': @@ -2363,7 +2357,7 @@ def _concat_date_cols(date_cols): return rs -class FixedWidthReader(object): +class FixedWidthReader(BaseIterator): """ A reader of fixed-width lines. """ @@ -2417,7 +2411,7 @@ def detect_colspecs(self, n=100): edges = np.where((mask ^ shifted) == 1)[0] return list(zip(edges[::2], edges[1::2])) - def next(self): + def __next__(self): if self.buffer is not None: try: line = next(self.buffer) @@ -2430,9 +2424,6 @@ def next(self): return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] - # Iterator protocol in Python 3 uses __next__() - __next__ = next - class FixedWidthFieldParser(PythonParser): """ diff --git a/pandas/io/sas.py b/pandas/io/sas.py index 39e83b7715cda..49013a98c77ff 100644 --- a/pandas/io/sas.py +++ b/pandas/io/sas.py @@ -10,7 +10,7 @@ from datetime import datetime import pandas as pd -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, BaseIterator from pandas import compat import struct import numpy as np @@ -242,7 +242,7 @@ def _parse_float_vec(vec): return ieee -class XportReader(object): +class XportReader(BaseIterator): __doc__ = _xport_reader_doc def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', @@ -369,15 +369,8 @@ def _read_header(self): dtype = np.dtype(dtypel) self._dtype = dtype - def __iter__(self): - try: - if self._chunksize: - while True: - yield self.read(self._chunksize) - else: - yield self.read() - except StopIteration: - pass + def __next__(self): + return self.read(nrows=self._chunksize or 1) def _record_count(self): """ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8181e69abc60b..e54d0a5c43887 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -25,7 +25,7 @@ from pandas.util.decorators import Appender import pandas as pd import pandas.core.common as com -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, BaseIterator from pandas.lib import max_len_string_array, infer_dtype from pandas.tslib import NaT, Timestamp @@ -907,7 +907,7 @@ def _decode_bytes(self, str, errors=None): return str -class StataReader(StataParser): +class StataReader(StataParser, BaseIterator): __doc__ = _stata_reader_doc def __init__(self, path_or_buf, convert_dates=True, @@ -1377,15 +1377,8 @@ def data(self, **kwargs): return self.read(None, **kwargs) - def __iter__(self): - try: - if self._chunksize: - while True: - yield self.read(self._chunksize) - else: - yield self.read() - except StopIteration: - pass + def __next__(self): + return self.read(nrows=self._chunksize or 1) def get_chunk(self, size=None): """ From 88b5b5721c8bdaa9fe9e21ffd63ac2015d44b5a0 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 29 Jan 2016 11:45:17 +0100 Subject: [PATCH 3/3] TST: Test iterator parsers as iterators --- pandas/io/tests/test_common.py | 23 +++++++++++++++++++++++ pandas/io/tests/test_sas.py | 5 +++++ pandas/io/tests/test_stata.py | 4 ++++ 3 files changed, 32 insertions(+) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 55fe3f3357c05..8615b75d87626 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -9,6 +9,8 @@ from pandas.io import common +from pandas import read_csv, concat + try: from pathlib import Path except ImportError: @@ -21,6 +23,14 @@ class TestCommonIOCapabilities(tm.TestCase): + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" def test_expand_user(self): filename = '~/sometest' @@ -64,3 +74,16 @@ def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) self.assertEqual(filepath_or_buffer, input_buffer) + + def test_iterator(self): + reader = read_csv(StringIO(self.data1), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(self.data1)) + tm.assert_frame_equal(result, expected) + + # GH12153 + it = read_csv(StringIO(self.data1), chunksize=1) + first = next(it) + tm.assert_frame_equal(first, expected.iloc[[0]]) + expected.index = [0 for i in range(len(expected))] + tm.assert_frame_equal(concat(it), expected.iloc[1:]) diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/test_sas.py index bca3594f4b47c..3a235eafe9b2c 100644 --- a/pandas/io/tests/test_sas.py +++ b/pandas/io/tests/test_sas.py @@ -88,6 +88,11 @@ def test1_incremental(self): tm.assert_frame_equal(data, data_csv, check_index_type=False) + reader = XportReader(self.file01, index="SEQN", chunksize=1000) + data = pd.concat(reader, axis=0) + + tm.assert_frame_equal(data, data_csv, check_index_type=False) + def test2(self): # Test with SSHSV1_A.XPT diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index e1e12e47457f9..3eb0e0819e2ca 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1033,6 +1033,10 @@ def test_iterator(self): chunk = itr.get_chunk() tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) + # GH12153 + from_chunks = pd.concat(read_stata(fname, chunksize=4)) + tm.assert_frame_equal(parsed, from_chunks) + def test_read_chunks_115(self): files_115 = [self.dta2_115, self.dta3_115, self.dta4_115, self.dta14_115, self.dta15_115, self.dta16_115,