BUG: Set index when reading Stata file

bashtage · bashtage · commit dd83e7b61769 · 2017-08-28T12:00:10.000+01:00
Ensures index is set when requested during reading of a Stata dta file Rename index to index_col for API consistency closes pandas-dev#16342
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -293,6 +293,7 @@ Other API Changes
 - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`)
 - :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`).
 - :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`)
+- Renamed non-functional `index` to `index_col` in :func:`read_stata` to improve API consistency (:issue:`16342`)
 
 
 .. _whatsnew_0210.deprecations:
@@ -369,6 +370,7 @@ I/O
 - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
 - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`)
 - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
+- Bug in :func:`read_stata` where the index was not set (:issue:`16342`)
 - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)
 
 Plotting
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -160,14 +160,14 @@
 
 @Appender(_read_stata_doc)
 def read_stata(filepath_or_buffer, convert_dates=True,
-               convert_categoricals=True, encoding=None, index=None,
+               convert_categoricals=True, encoding=None, index_col=None,
                convert_missing=False, preserve_dtypes=True, columns=None,
                order_categoricals=True, chunksize=None, iterator=False):
 
     reader = StataReader(filepath_or_buffer,
                          convert_dates=convert_dates,
                          convert_categoricals=convert_categoricals,
-                         index=index, convert_missing=convert_missing,
+                         index_col=index_col, convert_missing=convert_missing,
                          preserve_dtypes=preserve_dtypes,
                          columns=columns,
                          order_categoricals=order_categoricals,
@@ -945,7 +945,7 @@ class StataReader(StataParser, BaseIterator):
     __doc__ = _stata_reader_doc
 
     def __init__(self, path_or_buf, convert_dates=True,
-                 convert_categoricals=True, index=None,
+                 convert_categoricals=True, index_col=None,
                  convert_missing=False, preserve_dtypes=True,
                  columns=None, order_categoricals=True,
                  encoding='latin-1', chunksize=None):
@@ -956,7 +956,7 @@ def __init__(self, path_or_buf, convert_dates=True,
         # calls to read).
         self._convert_dates = convert_dates
         self._convert_categoricals = convert_categoricals
-        self._index = index
+        self._index_col = index_col
         self._convert_missing = convert_missing
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
@@ -1461,7 +1461,7 @@ def get_chunk(self, size=None):
 
     @Appender(_read_method_doc)
     def read(self, nrows=None, convert_dates=None,
-             convert_categoricals=None, index=None,
+             convert_categoricals=None, index_col=None,
              convert_missing=None, preserve_dtypes=None,
              columns=None, order_categoricals=None):
         # Handle empty file or chunk.  If reading incrementally raise
@@ -1486,6 +1486,8 @@ def read(self, nrows=None, convert_dates=None,
             columns = self._columns
         if order_categoricals is None:
             order_categoricals = self._order_categoricals
+        if index_col is None:
+            index_col = self._index_col
 
         if nrows is None:
             nrows = self.nobs
@@ -1524,14 +1526,14 @@ def read(self, nrows=None, convert_dates=None,
             self._read_value_labels()
 
         if len(data) == 0:
-            data = DataFrame(columns=self.varlist, index=index)
+            data = DataFrame(columns=self.varlist)
         else:
-            data = DataFrame.from_records(data, index=index)
+            data = DataFrame.from_records(data)
             data.columns = self.varlist
 
         # If index is not specified, use actual row number rather than
         # restarting at 0 for each chunk.
-        if index is None:
+        if index_col is None:
             ix = np.arange(self._lines_read - read_lines, self._lines_read)
             data = data.set_index(ix)
 
@@ -1553,7 +1555,7 @@ def read(self, nrows=None, convert_dates=None,
         cols_ = np.where(self.dtyplist)[0]
 
         # Convert columns (if needed) to match input type
-        index = data.index
+        ix = data.index
         requires_type_conversion = False
         data_formatted = []
         for i in cols_:
@@ -1563,7 +1565,7 @@ def read(self, nrows=None, convert_dates=None,
                 if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                     requires_type_conversion = True
                     data_formatted.append(
-                        (col, Series(data[col], index, self.dtyplist[i])))
+                        (col, Series(data[col], ix, self.dtyplist[i])))
                 else:
                     data_formatted.append((col, data[col]))
         if requires_type_conversion:
@@ -1606,6 +1608,9 @@ def read(self, nrows=None, convert_dates=None,
             if convert:
                 data = DataFrame.from_items(retyped_data)
 
+        if index_col is not None:
+            data = data.set_index(data.pop(index_col))
+
         return data
 
     def _do_convert_missing(self, data, convert_missing):
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1309,3 +1309,11 @@ def test_value_labels_iterator(self, write_index):
             dta_iter = pd.read_stata(path, iterator=True)
             value_labels = dta_iter.value_labels()
         assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}}
+
+    def test_set_index(self):
+        df = tm.makeDataFrame()
+        df.index.name = 'index'
+        with tm.ensure_clean() as path:
+            df.to_stata(path)
+            reread = pd.read_stata(path, index_col='index')
+        tm.assert_frame_equal(df, reread)