Skip to content

Commit 381e3b3

Browse files
committed
BUG: provide chunks with progressively numbered (default) indices
1 parent 474fd05 commit 381e3b3

File tree

5 files changed

+60
-6
lines changed

5 files changed

+60
-6
lines changed

doc/source/whatsnew/v0.19.0.txt

+34
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,40 @@ New Behavior:
596596
idx1.difference(idx2)
597597
idx1.symmetric_difference(idx2)
598598

599+
.. _whatsnew_0190.api.autogenerated_chunksize_index:
600+
601+
:func:`read_csv` called with ``chunksize`` parameter generates correct index
602+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
603+
604+
When :func:`read_csv` is called with ``chunksize=n`` and without specifying an index,
605+
each chunk used to have an independently generated index from ``0`` to ``n-1``.
606+
They are now given a progressive index instead, starting from ``0`` for the first chunk,
607+
from ``n`` for the second, and so on, so that, when concatenated, they are identical to
608+
the result of calling :func:`read_csv` without the ``chunksize=`` argument.
609+
(:issue:`12185`)
610+
611+
.. ipython :: python
612+
613+
data = 'A,B\n0,1\n2,3\n4,5\n6,7'
614+
615+
Previous Behavior:
616+
617+
.. code-block:: ipython
618+
619+
In [2]: pd.concat(pd.read_csv(StringIO(data), chunksize=2))
620+
Out[2]:
621+
A B
622+
0 0 1
623+
1 2 3
624+
0 4 5
625+
1 6 7
626+
627+
New Behavior:
628+
629+
.. ipython :: python
630+
631+
pd.concat(pd.read_csv(StringIO(data), chunksize=2))
632+
599633
.. _whatsnew_0190.deprecations:
600634

601635
Deprecations

pandas/io/parsers.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
is_list_like, is_integer_dtype,
1717
is_float,
1818
is_scalar)
19-
from pandas.core.index import Index, MultiIndex
19+
from pandas.core.index import Index, MultiIndex, RangeIndex
2020
from pandas.core.frame import DataFrame
2121
from pandas.core.common import AbstractMethodError
2222
from pandas.core.config import get_option
@@ -700,6 +700,7 @@ def __init__(self, f, engine=None, **kwds):
700700
# miscellanea
701701
self.engine = engine
702702
self._engine = None
703+
self._currow = 0
703704

704705
options = self._get_options_with_defaults(engine)
705706

@@ -913,8 +914,20 @@ def read(self, nrows=None):
913914
# May alter columns / col_dict
914915
index, columns, col_dict = self._create_index(ret)
915916

917+
if index is None:
918+
if col_dict:
919+
# Any column is actually fine:
920+
new_rows = len(compat.next(compat.itervalues(col_dict)))
921+
index = RangeIndex(self._currow, self._currow + new_rows)
922+
else:
923+
new_rows = 0
924+
else:
925+
new_rows = len(index)
926+
916927
df = DataFrame(col_dict, columns=columns, index=index)
917928

929+
self._currow += new_rows
930+
918931
if self.squeeze and len(df.columns) == 1:
919932
return df[df.columns[0]].copy()
920933
return df

pandas/io/tests/parser/common.py

+12
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,18 @@ def test_get_chunk_passed_chunksize(self):
461461
piece = result.get_chunk()
462462
self.assertEqual(len(piece), 2)
463463

464+
def test_read_chunksize_generated_index(self):
465+
# GH 12185
466+
reader = self.read_csv(StringIO(self.data1), chunksize=2)
467+
df = self.read_csv(StringIO(self.data1))
468+
469+
tm.assert_frame_equal(pd.concat(reader), df)
470+
471+
reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0)
472+
df = self.read_csv(StringIO(self.data1), index_col=0)
473+
474+
tm.assert_frame_equal(pd.concat(reader), df)
475+
464476
def test_read_text_list(self):
465477
data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
466478
as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',

pandas/io/tests/parser/test_network.py

-4
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,6 @@ def test_parse_public_s3_bucket_chunked(self):
122122
self.assertFalse(df.empty)
123123
true_df = local_tips.iloc[
124124
chunksize * i_chunk: chunksize * (i_chunk + 1)]
125-
# Chunking doesn't preserve row numbering
126-
true_df = true_df.reset_index().drop('index', axis=1)
127125
tm.assert_frame_equal(true_df, df)
128126

129127
@tm.network
@@ -143,8 +141,6 @@ def test_parse_public_s3_bucket_chunked_python(self):
143141
self.assertFalse(df.empty)
144142
true_df = local_tips.iloc[
145143
chunksize * i_chunk: chunksize * (i_chunk + 1)]
146-
# Chunking doesn't preserve row numbering
147-
true_df = true_df.reset_index().drop('index', axis=1)
148144
tm.assert_frame_equal(true_df, df)
149145

150146
@tm.network

pandas/io/tests/test_common.py

-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ def test_iterator(self):
8686
it = read_csv(StringIO(self.data1), chunksize=1)
8787
first = next(it)
8888
tm.assert_frame_equal(first, expected.iloc[[0]])
89-
expected.index = [0 for i in range(len(expected))]
9089
tm.assert_frame_equal(concat(it), expected.iloc[1:])
9190

9291

0 commit comments

Comments
 (0)