Skip to content

Iterable iterator #12173

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from pandas.compat import StringIO, BytesIO, string_types, text_type
from pandas import compat
from pandas.core.common import pprint_thing, is_number
from pandas.core.common import pprint_thing, is_number, AbstractMethodError


try:
Expand Down Expand Up @@ -59,6 +59,20 @@ class DtypeWarning(Warning):
pass


class BaseIterator(object):
    """Mix-in supplying the iterator protocol on top of ``__next__``.

    ``__iter__`` returns the object itself, so this is only suitable for
    single-use iterables (e.g. a file parser or its iterator) — not for
    reusable, in-memory tables.
    """

    def __iter__(self):
        return self

    def __next__(self):
        # Subclasses are required to override this.
        raise AbstractMethodError(self)


if not compat.PY3:
    # Python 2's iterator protocol looks up ``next``, not ``__next__``.
    BaseIterator.next = lambda self: self.__next__()


try:
from boto.s3 import key

Expand Down Expand Up @@ -394,7 +408,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
class UnicodeReader:
class UnicodeReader(BaseIterator):

"""
A CSV reader which will iterate over lines in the CSV file "f",
Expand All @@ -408,16 +422,10 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)

def next(self):
def __next__(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]

# python 3 iterator
__next__ = next

def __iter__(self): # pragma: no cover
return self

class UnicodeWriter:

"""
Expand Down
23 changes: 7 additions & 16 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from pandas.core.config import get_option
from pandas.io.date_converters import generic_parser
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
_get_handle, UnicodeReader, UTF8Recoder)
_get_handle, UnicodeReader, UTF8Recoder,
BaseIterator)
from pandas.tseries import tools

from pandas.util.decorators import Appender
Expand Down Expand Up @@ -545,7 +546,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
])


class TextFileReader(object):
class TextFileReader(BaseIterator):
"""

Passed dialect overrides any of the related parser options
Expand Down Expand Up @@ -724,15 +725,8 @@ def _clean_options(self, options, engine):

return result, engine

def __iter__(self):
try:
if self.chunksize:
while True:
yield self.read(self.chunksize)
else:
yield self.read()
except StopIteration:
pass
def __next__(self):
    """Iterator protocol: yield the next parsed chunk."""
    return self.get_chunk()

def _make_engine(self, engine='c'):
if engine == 'c':
Expand Down Expand Up @@ -2363,7 +2357,7 @@ def _concat_date_cols(date_cols):
return rs


class FixedWidthReader(object):
class FixedWidthReader(BaseIterator):
"""
A reader of fixed-width lines.
"""
Expand Down Expand Up @@ -2417,7 +2411,7 @@ def detect_colspecs(self, n=100):
edges = np.where((mask ^ shifted) == 1)[0]
return list(zip(edges[::2], edges[1::2]))

def next(self):
def __next__(self):
if self.buffer is not None:
try:
line = next(self.buffer)
Expand All @@ -2430,9 +2424,6 @@ def next(self):
return [line[fromm:to].strip(self.delimiter)
for (fromm, to) in self.colspecs]

# Iterator protocol in Python 3 uses __next__()
__next__ = next


class FixedWidthFieldParser(PythonParser):
"""
Expand Down
15 changes: 4 additions & 11 deletions pandas/io/sas.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from datetime import datetime
import pandas as pd
from pandas.io.common import get_filepath_or_buffer
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas import compat
import struct
import numpy as np
Expand Down Expand Up @@ -242,7 +242,7 @@ def _parse_float_vec(vec):
return ieee


class XportReader(object):
class XportReader(BaseIterator):
__doc__ = _xport_reader_doc

def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
Expand Down Expand Up @@ -369,15 +369,8 @@ def _read_header(self):
dtype = np.dtype(dtypel)
self._dtype = dtype

def __iter__(self):
try:
if self._chunksize:
while True:
yield self.read(self._chunksize)
else:
yield self.read()
except StopIteration:
pass
def __next__(self):
    """Read the next chunk; fall back to one row when no chunksize is set."""
    rows = self._chunksize or 1
    return self.read(nrows=rows)

def _record_count(self):
"""
Expand Down
15 changes: 4 additions & 11 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pandas.util.decorators import Appender
import pandas as pd
import pandas.core.common as com
from pandas.io.common import get_filepath_or_buffer
from pandas.io.common import get_filepath_or_buffer, BaseIterator
from pandas.lib import max_len_string_array, infer_dtype
from pandas.tslib import NaT, Timestamp

Expand Down Expand Up @@ -907,7 +907,7 @@ def _decode_bytes(self, str, errors=None):
return str


class StataReader(StataParser):
class StataReader(StataParser, BaseIterator):
__doc__ = _stata_reader_doc

def __init__(self, path_or_buf, convert_dates=True,
Expand Down Expand Up @@ -1377,15 +1377,8 @@ def data(self, **kwargs):

return self.read(None, **kwargs)

def __iter__(self):
try:
if self._chunksize:
while True:
yield self.read(self._chunksize)
else:
yield self.read()
except StopIteration:
pass
def __next__(self):
    """Iterator protocol: read the next chunk (single rows if no chunksize)."""
    chunk_rows = self._chunksize or 1
    return self.read(nrows=chunk_rows)

def get_chunk(self, size=None):
"""
Expand Down
23 changes: 23 additions & 0 deletions pandas/io/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from pandas.io import common

from pandas import read_csv, concat

try:
from pathlib import Path
except ImportError:
Expand All @@ -21,6 +23,14 @@


class TestCommonIOCapabilities(tm.TestCase):
data1 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

def test_expand_user(self):
filename = '~/sometest'
Expand Down Expand Up @@ -64,3 +74,16 @@ def test_get_filepath_or_buffer_with_buffer(self):
input_buffer = StringIO()
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
self.assertEqual(filepath_or_buffer, input_buffer)

def test_iterator(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add tests for read_stata & read_sas

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, done

reader = read_csv(StringIO(self.data1), chunksize=1)
result = concat(reader, ignore_index=True)
expected = read_csv(StringIO(self.data1))
tm.assert_frame_equal(result, expected)

# GH12153
it = read_csv(StringIO(self.data1), chunksize=1)
first = next(it)
tm.assert_frame_equal(first, expected.iloc[[0]])
expected.index = [0 for i in range(len(expected))]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each chunk has an index restarting from 0 (also in master). Isn't this a bug?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, that shouldn't be the case

In [46]: s = DataFrame({'A' : range(10)}).to_csv(None)

In [47]: s
Out[47]: ',A\n0,0\n1,1\n2,2\n3,3\n4,4\n5,5\n6,6\n7,7\n8,8\n9,9\n'

In [48]: pd.read_csv(StringIO(s),index_col=0)
Out[48]: 
   A
0  0
1  1
2  2
3  3
4  4
5  5
6  6
7  7
8  8
9  9

In [49]: list(pd.read_csv(StringIO(s),index_col=0,chunksize=4))
Out[49]: 
[   A
 0  0
 1  1
 2  2
 3  3,    A
 4  4
 5  5
 6  6
 7  7,    A
 8  8
 9  9]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean when the index is not loaded:

In [7]: list(pd.read_csv(StringIO(s), chunksize=4))
Out[7]: 
[   Unnamed: 0  A
 0           0  0
 1           1  1
 2           2  2
 3           3  3,    Unnamed: 0  A
 0           4  4
 1           5  5
 2           6  6
 3           7  7,    Unnamed: 0  A
 0           8  8
 1           9  9]

Wouldn't we prefer a consistent index?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm yes I agree

you'd have to keep state in the in the iterators I think though -
I do this is HDFStore iirc

but might be a bit non trivial

if u think u can fix easily - go ahead
otherwise create an issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I'd like to do it but it probably won't be immediate: #12185 .

tm.assert_frame_equal(concat(it), expected.iloc[1:])
5 changes: 5 additions & 0 deletions pandas/io/tests/test_sas.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ def test1_incremental(self):

tm.assert_frame_equal(data, data_csv, check_index_type=False)

reader = XportReader(self.file01, index="SEQN", chunksize=1000)
data = pd.concat(reader, axis=0)

tm.assert_frame_equal(data, data_csv, check_index_type=False)

def test2(self):
# Test with SSHSV1_A.XPT

Expand Down
4 changes: 4 additions & 0 deletions pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,10 @@ def test_iterator(self):
chunk = itr.get_chunk()
tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)

# GH12153
from_chunks = pd.concat(read_stata(fname, chunksize=4))
tm.assert_frame_equal(parsed, from_chunks)

def test_read_chunks_115(self):
files_115 = [self.dta2_115, self.dta3_115, self.dta4_115,
self.dta14_115, self.dta15_115, self.dta16_115,
Expand Down