pandas-dev · jreback · Nov 19, 2014 · Nov 18, 2014
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -546,7 +546,8 @@ Getting Data In/Out
 Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented
 in 0.15.2. See :ref:`here <io.hdf5-categorical>` for an example and caveats.
 
-Writing data to/from Stata format files was implemented in 0.15.2.
+Writing data to and reading data from *Stata* format files was implemented in
+0.15.2. See :ref:`here <io.stata-categorical>` for an example and caveats.
 
 Writing to a CSV file will convert the data, effectively removing any information about the
 categorical (categories and ordering). So if you read back the CSV file you have to convert the

diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3204,8 +3204,8 @@ format store like this:
      .. ipython:: python
 
         store_export = HDFStore('export.h5')
-	    store_export.append('df_dc', df_dc, data_columns=df_dc.columns)
-	    store_export
+        store_export.append('df_dc', df_dc, data_columns=df_dc.columns)
+        store_export
 
      .. ipython:: python
         :suppress:
@@ -3240,8 +3240,8 @@ number of options, please see the docstring.
         legacy_store
 
         # copy (and return the new handle)
-	    new_store = legacy_store.copy('store_new.h5')
-	    new_store
+        new_store = legacy_store.copy('store_new.h5')
+        new_store
         new_store.close()
 
      .. ipython:: python
@@ -3651,14 +3651,14 @@ You can access the management console to determine project id's by:
 
 .. _io.stata:
 
-STATA Format
+Stata Format
 ------------
 
 .. versionadded:: 0.12.0
 
 .. _io.stata_writer:
 
-Writing to STATA format
+Writing to Stata format
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame
@@ -3753,6 +3753,53 @@ Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used
    import os
    os.remove('stata.dta')
 
+.. _io.stata-categorical:
+
+Categorical Data
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.15.2
+
+``Categorical`` data can be exported to *Stata* data files as value labeled data.
+The exported data consists of the underlying category codes as integer data values
+and the categories as value labels.  *Stata* does not have an explicit equivalent
+to a ``Categorical`` and information about *whether* the variable is ordered
+is lost when exporting.
+
+.. warning::
+
+    *Stata* only supports string value labels, and so ``str`` is called on the
+    categories when exporting data.  Exporting ``Categorical`` variables with
+    non-string categories produces a warning, and can result in a loss of
+    information if the ``str`` representations of the categories are not unique.
+
+Labeled data can similarly be imported from *Stata* data files as ``Categorical``
+variables using the keyword argument ``convert_categoricals`` (``True`` by default).  
+By default, imported ``Categorical`` variables are ordered according to the 
+underlying numerical data. However, setting ``order_categoricals=False`` will 
+import labeled data as ``Categorical`` variables without an order.
+
+.. note::
+
+    When importing categorical data, the values of the variables in the *Stata*
+    data file are not generally preserved since ``Categorical`` variables always
+    use integer values between ``-1`` and ``n-1`` where ``n`` is the number
+    of categories. If the original values in the *Stata* data file are required,
+    these can be imported by setting ``convert_categoricals=False``, which will
+    import original data (but not the variable labels). The original values can
+    be matched to the imported categorical data since there is a simple mapping
+    between the original *Stata* data values and the category codes of imported
+    Categorical variables: missing values are assigned code ``-1``, and the
+    smallest original value is assigned ``0``, the second smallest is assigned
+    ``1`` and so on until the largest original value is assigned the code ``n-1``.
+
+.. note::
+
+    *Stata* supports partially labeled series.  These series have value labels for
+    some but not all data values. Importing a partially labeled series will produce
+    a ``Categorical`` with string categories for the values that are labeled and
+    numeric categories for values with no label.
+
 .. _io.perf:
 
 Performance Considerations

diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -41,10 +41,11 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
-- Added ability to export Categorical data to Stata (:issue:`8633`).
+- Added ability to export Categorical data to Stata (:issue:`8633`).  See :ref:`here <io.stata-categorical>` for limitations of categorical variables exported to Stata data files.
 - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here <io.hdf5-categorical>` for an example and caveats w.r.t. prior versions of pandas.
 - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
 - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here<remote_data.ga>`.
+- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`).  See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files.
 
 .. _whatsnew_0152.performance:
 
@@ -73,6 +74,7 @@ Bug Fixes
 
 
 
+- Imported categorical variables from Stata files retain the ordinal information in the underlying data (:issue:`8836`).
 
 
 

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -29,7 +29,8 @@
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None,
-               convert_missing=False, preserve_dtypes=True, columns=None):
+               convert_missing=False, preserve_dtypes=True, columns=None,
+               order_categoricals=True):
     """
     Read Stata file into DataFrame
 
@@ -58,11 +59,14 @@ def read_stata(filepath_or_buffer, convert_dates=True,
     columns : list or None
         Columns to retain.  Columns will be returned in the given order.  None
         returns all columns
+    order_categoricals : boolean, defaults to True
+        Flag indicating whether converted categorical data are ordered.
     """
     reader = StataReader(filepath_or_buffer, encoding)
 
     return reader.data(convert_dates, convert_categoricals, index,
-                       convert_missing, preserve_dtypes, columns)
+                       convert_missing, preserve_dtypes, columns,
+                       order_categoricals)
 
 _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
@@ -1136,7 +1140,8 @@ def _read_strls(self):
             self.path_or_buf.read(1)  # zero-termination
 
     def data(self, convert_dates=True, convert_categoricals=True, index=None,
-             convert_missing=False, preserve_dtypes=True, columns=None):
+             convert_missing=False, preserve_dtypes=True, columns=None,
+             order_categoricals=True):
         """
         Reads observations from Stata file, converting them into a dataframe
 
@@ -1161,6 +1166,8 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
         columns : list or None
             Columns to retain.  Columns will be returned in the given order.
             None returns all columns
+        order_categoricals : boolean, defaults to True
+            Flag indicating whether converted categorical data are ordered.
 
         Returns
         -------
@@ -1228,7 +1235,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
 
         for col, typ in zip(data, self.typlist):
             if type(typ) is int:
-                data[col] = data[col].apply(self._null_terminate, convert_dtype=True,)
+                data[col] = data[col].apply(self._null_terminate, convert_dtype=True)
 
         cols_ = np.where(self.dtyplist)[0]
 
@@ -1288,19 +1295,25 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
                 col = data.columns[i]
                 data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i])
 
-        if convert_categoricals:
-            cols = np.where(
-                lmap(lambda x: x in compat.iterkeys(self.value_label_dict),
-                     self.lbllist)
-            )[0]
-            for i in cols:
-                col = data.columns[i]
-                labeled_data = np.copy(data[col])
-                labeled_data = labeled_data.astype(object)
-                for k, v in compat.iteritems(
-                        self.value_label_dict[self.lbllist[i]]):
-                    labeled_data[(data[col] == k).values] = v
-                data[col] = Categorical.from_array(labeled_data)
+        if convert_categoricals and self.value_label_dict:
+            value_labels = list(compat.iterkeys(self.value_label_dict))
+            cat_converted_data = []
+            for col, label in zip(data, self.lbllist):
+                if label in value_labels:
+                    # Explicit call; ordering is controlled by order_categoricals
+                    cat_data = Categorical(data[col], ordered=order_categoricals)
+                    value_label_dict = self.value_label_dict[label]
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in value_label_dict:
+                            categories.append(value_label_dict[category])
+                        else:
+                            categories.append(category)  # Partially labeled
+                    cat_data.categories = categories
+                    cat_converted_data.append((col, cat_data))
+                else:
+                    cat_converted_data.append((col, data[col]))
+            data = DataFrame.from_items(cat_converted_data)
 
         if not preserve_dtypes:
             retyped_data = []

diff --git a/pandas/io/tests/data/stata10_115.dta b/pandas/io/tests/data/stata10_115.dta
diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/io/tests/data/stata10_117.dta
diff --git a/pandas/io/tests/data/stata11_115.dta b/pandas/io/tests/data/stata11_115.dta
diff --git a/pandas/io/tests/data/stata11_117.dta b/pandas/io/tests/data/stata11_117.dta
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -14,6 +14,7 @@
 import pandas as pd
 from pandas.compat import iterkeys
 from pandas.core.frame import DataFrame, Series
+from pandas.core.common import is_categorical_dtype
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
     PossiblePrecisionLoss, StataMissingValue)
@@ -81,6 +82,11 @@ def setUp(self):
         self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta')
         self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta')
 
+        self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta')
+        self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta')
+
+        self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta')
+        self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta')
 
     def read_dta(self, file):
         # Legacy default reader configuration
@@ -817,6 +823,72 @@ def test_categorical_with_stata_missing_values(self):
             written_and_read_again = self.read_dta(path)
             tm.assert_frame_equal(written_and_read_again.set_index('index'), original)
 
+    def test_categorical_order(self):
+        # Directly construct using expected codes
+        # Format is is_cat, col_name, labels (in order), underlying data
+        expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
+                    (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
+                    (True, 'noorder', ['a', 'b', 'c', 'd', 'e'], np.array([2, 1, 4, 0, 3])),
+                    (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
+                    (True, 'float_missing', ['a', 'd', 'e'], np.array([0, 1, 2, -1, -1])),
+                    (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
+                    (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
+        cols = []
+        for is_cat, col, labels, codes in expected:
+            if is_cat:
+                cols.append((col, pd.Categorical.from_codes(codes, labels)))
+            else:
+                cols.append((col, pd.Series(labels, dtype=np.float32)))
+        expected = DataFrame.from_items(cols)
+
+        # Read with and without categoricals, ensure order is identical
+        parsed_115 = read_stata(self.dta19_115)
+        parsed_117 = read_stata(self.dta19_117)
+        tm.assert_frame_equal(expected, parsed_115)
+        tm.assert_frame_equal(expected, parsed_117)
+
+        # Check identity of codes
+        for col in expected:
+            if is_categorical_dtype(expected[col]):
+                print(col)
+                tm.assert_series_equal(expected[col].cat.codes,
+                                       parsed_115[col].cat.codes)
+                tm.assert_index_equal(expected[col].cat.categories,
+                                      parsed_115[col].cat.categories)
+
+    def test_categorical_sorting(self):
+        parsed_115 = read_stata(self.dta20_115)
+        parsed_117 = read_stata(self.dta20_117)
+        # Sort based on codes, not strings
+        parsed_115 = parsed_115.sort("srh")
+        parsed_117 = parsed_117.sort("srh")
+        # Don't sort index
+        parsed_115.index = np.arange(parsed_115.shape[0])
+        parsed_117.index = np.arange(parsed_117.shape[0])
+        codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
+        categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
+        expected = pd.Series(pd.Categorical.from_codes(codes=codes,
+                                                       categories=categories))
+        tm.assert_series_equal(expected, parsed_115["srh"])
+        tm.assert_series_equal(expected, parsed_117["srh"])
+
+    def test_categorical_ordering(self):
+        parsed_115 = read_stata(self.dta19_115)
+        parsed_117 = read_stata(self.dta19_117)
+
+        parsed_115_unordered = read_stata(self.dta19_115,
+                                          order_categoricals=False)
+        parsed_117_unordered = read_stata(self.dta19_117,
+                                          order_categoricals=False)
+        for col in parsed_115:
+            if not is_categorical_dtype(parsed_115[col]):
+                continue
+            tm.assert_equal(True, parsed_115[col].cat.ordered)
+            tm.assert_equal(True, parsed_117[col].cat.ordered)
+            tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
+            tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)