Skip to content

Commit 5b0d947

Browse files
toobazjreback
authored and committed
BUG: provide chunks with progressively numbered (default) indices
closes #12185 Notice the test I fix was indeed wrong - I had written that line as a workaround waiting for this fix. Author: Pietro Battiston <[email protected]> Closes #12289 from toobaz/csvstate and squashes the following commits: 381e3b3 [Pietro Battiston] BUG: provide chunks with progressively numbered (default) indices
1 parent dcb7bf7 commit 5b0d947

File tree

5 files changed

+60
-6
lines changed

5 files changed

+60
-6
lines changed

doc/source/whatsnew/v0.19.0.txt

+34
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,40 @@ New Behavior:
626626
idx1.difference(idx2)
627627
idx1.symmetric_difference(idx2)
628628

629+
.. _whatsnew_0190.api.autogenerated_chunksize_index:
630+
631+
:func:`read_csv` called with ``chunksize`` will progressively enumerate chunks
632+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
633+
634+
When :func:`read_csv` is called with ``chunksize=n`` and without specifying an index,
635+
each chunk used to have an independently generated index from ``0`` to ``n-1``.
636+
They are now instead given a progressive index, starting from ``0`` for the first chunk,
637+
from ``n`` for the second, and so on, so that, when concatenated, they are identical to
638+
the result of calling :func:`read_csv` without the ``chunksize=`` argument.
639+
(:issue:`12185`)
640+
641+
.. ipython :: python
642+
643+
data = 'A,B\n0,1\n2,3\n4,5\n6,7'
644+
645+
Previous behaviour:
646+
647+
.. code-block:: ipython
648+
649+
In [2]: pd.concat(pd.read_csv(StringIO(data), chunksize=2))
650+
Out[2]:
651+
A B
652+
0 0 1
653+
1 2 3
654+
0 4 5
655+
1 6 7
656+
657+
New behaviour:
658+
659+
.. ipython :: python
660+
661+
pd.concat(pd.read_csv(StringIO(data), chunksize=2))
662+
629663
.. _whatsnew_0190.deprecations:
630664

631665
Deprecations

pandas/io/parsers.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
is_list_like, is_integer_dtype,
1717
is_float,
1818
is_scalar)
19-
from pandas.core.index import Index, MultiIndex
19+
from pandas.core.index import Index, MultiIndex, RangeIndex
2020
from pandas.core.frame import DataFrame
2121
from pandas.core.common import AbstractMethodError
2222
from pandas.core.config import get_option
@@ -700,6 +700,7 @@ def __init__(self, f, engine=None, **kwds):
700700
# miscellanea
701701
self.engine = engine
702702
self._engine = None
703+
self._currow = 0
703704

704705
options = self._get_options_with_defaults(engine)
705706

@@ -913,8 +914,20 @@ def read(self, nrows=None):
913914
# May alter columns / col_dict
914915
index, columns, col_dict = self._create_index(ret)
915916

917+
if index is None:
918+
if col_dict:
919+
# Any column is actually fine:
920+
new_rows = len(compat.next(compat.itervalues(col_dict)))
921+
index = RangeIndex(self._currow, self._currow + new_rows)
922+
else:
923+
new_rows = 0
924+
else:
925+
new_rows = len(index)
926+
916927
df = DataFrame(col_dict, columns=columns, index=index)
917928

929+
self._currow += new_rows
930+
918931
if self.squeeze and len(df.columns) == 1:
919932
return df[df.columns[0]].copy()
920933
return df

pandas/io/tests/parser/common.py

+12
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,18 @@ def test_get_chunk_passed_chunksize(self):
461461
piece = result.get_chunk()
462462
self.assertEqual(len(piece), 2)
463463

464+
def test_read_chunksize_generated_index(self):
465+
# GH 12185
466+
reader = self.read_csv(StringIO(self.data1), chunksize=2)
467+
df = self.read_csv(StringIO(self.data1))
468+
469+
tm.assert_frame_equal(pd.concat(reader), df)
470+
471+
reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0)
472+
df = self.read_csv(StringIO(self.data1), index_col=0)
473+
474+
tm.assert_frame_equal(pd.concat(reader), df)
475+
464476
def test_read_text_list(self):
465477
data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
466478
as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',

pandas/io/tests/parser/test_network.py

-4
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,6 @@ def test_parse_public_s3_bucket_chunked(self):
122122
self.assertFalse(df.empty)
123123
true_df = local_tips.iloc[
124124
chunksize * i_chunk: chunksize * (i_chunk + 1)]
125-
# Chunking doesn't preserve row numbering
126-
true_df = true_df.reset_index().drop('index', axis=1)
127125
tm.assert_frame_equal(true_df, df)
128126

129127
@tm.network
@@ -143,8 +141,6 @@ def test_parse_public_s3_bucket_chunked_python(self):
143141
self.assertFalse(df.empty)
144142
true_df = local_tips.iloc[
145143
chunksize * i_chunk: chunksize * (i_chunk + 1)]
146-
# Chunking doesn't preserve row numbering
147-
true_df = true_df.reset_index().drop('index', axis=1)
148144
tm.assert_frame_equal(true_df, df)
149145

150146
@tm.network

pandas/io/tests/test_common.py

-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ def test_iterator(self):
8686
it = read_csv(StringIO(self.data1), chunksize=1)
8787
first = next(it)
8888
tm.assert_frame_equal(first, expected.iloc[[0]])
89-
expected.index = [0 for i in range(len(expected))]
9089
tm.assert_frame_equal(concat(it), expected.iloc[1:])
9190

9291

0 commit comments

Comments
 (0)