BUG: provide chunks with progressively numbered (default) indices

toobaz · jreback · commit 5b0d947e8079 · 2016-07-28T20:13:25.000-04:00
closes #12185. Note that the test I fixed was indeed wrong - I had written that line as a workaround while waiting for this fix. Author: Pietro Battiston <me@pietrobattiston.it> Closes #12289 from toobaz/csvstate and squashes the following commits: 381e3b3 [Pietro Battiston] BUG: provide chunks with progressively numbered (default) indices
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -626,6 +626,40 @@ New Behavior:
    idx1.difference(idx2)
    idx1.symmetric_difference(idx2)
 
+.. _whatsnew_0190.api.autogenerated_chunksize_index:
+
+:func:`read_csv` called with ``chunksize`` will progressively enumerate chunks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When :func:`read_csv` is called with ``chunksize=n`` and without specifying an index,
+each chunk used to have an independently generated index from ``0`` to ``n-1``.
+Each chunk is now given a progressive index instead, starting from ``0`` for the first chunk,
+from ``n`` for the second, and so on, so that, when concatenated, they are identical to
+the result of calling :func:`read_csv` without the ``chunksize=`` argument.
+(:issue:`12185`)
+
+.. ipython :: python
+
+   data = 'A,B\n0,1\n2,3\n4,5\n6,7'
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [2]: pd.concat(pd.read_csv(StringIO(data), chunksize=2))
+   Out[2]:
+      A  B
+   0  0  1
+   1  2  3
+   0  4  5
+   1  6  7
+
+New Behavior:
+
+.. ipython :: python
+
+   pd.concat(pd.read_csv(StringIO(data), chunksize=2))
+
 .. _whatsnew_0190.deprecations:
 
 Deprecations
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -16,7 +16,7 @@
                                  is_list_like, is_integer_dtype,
                                  is_float,
                                  is_scalar)
-from pandas.core.index import Index, MultiIndex
+from pandas.core.index import Index, MultiIndex, RangeIndex
 from pandas.core.frame import DataFrame
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
@@ -700,6 +700,7 @@ def __init__(self, f, engine=None, **kwds):
         # miscellanea
         self.engine = engine
         self._engine = None
+        self._currow = 0
 
         options = self._get_options_with_defaults(engine)
 
@@ -913,8 +914,20 @@ def read(self, nrows=None):
         # May alter columns / col_dict
         index, columns, col_dict = self._create_index(ret)
 
+        if index is None:
+            if col_dict:
+                # Any column is actually fine:
+                new_rows = len(compat.next(compat.itervalues(col_dict)))
+                index = RangeIndex(self._currow, self._currow + new_rows)
+            else:
+                new_rows = 0
+        else:
+            new_rows = len(index)
+
         df = DataFrame(col_dict, columns=columns, index=index)
 
+        self._currow += new_rows
+
         if self.squeeze and len(df.columns) == 1:
             return df[df.columns[0]].copy()
         return df
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -461,6 +461,18 @@ def test_get_chunk_passed_chunksize(self):
         piece = result.get_chunk()
         self.assertEqual(len(piece), 2)
 
+    def test_read_chunksize_generated_index(self):
+        # GH 12185
+        reader = self.read_csv(StringIO(self.data1), chunksize=2)
+        df = self.read_csv(StringIO(self.data1))
+
+        tm.assert_frame_equal(pd.concat(reader), df)
+
+        reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0)
+        df = self.read_csv(StringIO(self.data1), index_col=0)
+
+        tm.assert_frame_equal(pd.concat(reader), df)
+
     def test_read_text_list(self):
         data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
         as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',
diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py
@@ -122,8 +122,6 @@ def test_parse_public_s3_bucket_chunked(self):
                     self.assertFalse(df.empty)
                     true_df = local_tips.iloc[
                         chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                    # Chunking doesn't preserve row numbering
-                    true_df = true_df.reset_index().drop('index', axis=1)
                     tm.assert_frame_equal(true_df, df)
 
     @tm.network
@@ -143,8 +141,6 @@ def test_parse_public_s3_bucket_chunked_python(self):
                 self.assertFalse(df.empty)
                 true_df = local_tips.iloc[
                     chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                # Chunking doesn't preserve row numbering
-                true_df = true_df.reset_index().drop('index', axis=1)
                 tm.assert_frame_equal(true_df, df)
 
     @tm.network
diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py
@@ -86,7 +86,6 @@ def test_iterator(self):
         it = read_csv(StringIO(self.data1), chunksize=1)
         first = next(it)
         tm.assert_frame_equal(first, expected.iloc[[0]])
-        expected.index = [0 for i in range(len(expected))]
         tm.assert_frame_equal(concat(it), expected.iloc[1:])