From 6cf2e48c4f280449a50faed788cbb2abf5e2accc Mon Sep 17 00:00:00 2001
From: Kevin Sheppard <kevin.sheppard@economics.ox.ac.uk>
Date: Tue, 18 Nov 2014 16:11:33 +0000
Subject: [PATCH] BUG: Correct importing behavior for Categoricals in
 StataReader

Ensure that category codes have the same order as the underlying Stata data.
Also adds a flag that allows categorical data to be treated as ordered or
unordered when importing.
---
 doc/source/categorical.rst           |   3 +-
 doc/source/io.rst                    |  59 +++++++++++++++++++---
 doc/source/whatsnew/v0.15.2.txt      |   4 +-
 pandas/io/stata.py                   |  47 ++++++++++-------
 pandas/io/tests/data/stata10_115.dta | Bin 0 -> 2298 bytes
 pandas/io/tests/data/stata10_117.dta | Bin 0 -> 2298 bytes
 pandas/io/tests/data/stata11_115.dta | Bin 0 -> 810 bytes
 pandas/io/tests/data/stata11_117.dta | Bin 0 -> 1268 bytes
 pandas/io/tests/test_stata.py        |  72 +++++++++++++++++++++++++++
 9 files changed, 160 insertions(+), 25 deletions(-)
 create mode 100755 pandas/io/tests/data/stata10_115.dta
 create mode 100755 pandas/io/tests/data/stata10_117.dta
 create mode 100755 pandas/io/tests/data/stata11_115.dta
 create mode 100755 pandas/io/tests/data/stata11_117.dta

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 3d7b41a1c4c24..a7091d6ab38fb 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -546,7 +546,8 @@ Getting Data In/Out
 Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented
 in 0.15.2. See :ref:`here <io.hdf5-categorical>` for an example and caveats.
 
-Writing data to/from Stata format files was implemented in 0.15.2.
+Writing data to and reading data from *Stata* format files was implemented in
+0.15.2. See :ref:`here <io.stata-categorical>` for an example and caveats.
 
 Writing to a CSV file will convert the data, effectively removing any information about the
 categorical (categories and ordering). So if you read back the CSV file you have to convert the
diff --git a/doc/source/io.rst b/doc/source/io.rst
index bd6400787ae58..9686a72d43cf8 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -3204,8 +3204,8 @@ format store like this:
      .. ipython:: python
 
         store_export = HDFStore('export.h5')
-	    store_export.append('df_dc', df_dc, data_columns=df_dc.columns)
-	    store_export
+        store_export.append('df_dc', df_dc, data_columns=df_dc.columns)
+        store_export
 
      .. ipython:: python
         :suppress:
@@ -3240,8 +3240,8 @@ number of options, please see the docstring.
         legacy_store
 
         # copy (and return the new handle)
-	    new_store = legacy_store.copy('store_new.h5')
-	    new_store
+        new_store = legacy_store.copy('store_new.h5')
+        new_store
         new_store.close()
 
      .. ipython:: python
@@ -3651,14 +3651,14 @@ You can access the management console to determine project id's by:
 
 .. _io.stata:
 
-STATA Format
+Stata Format
 ------------
 
 .. versionadded:: 0.12.0
 
 .. _io.stata_writer:
 
-Writing to STATA format
+Writing to Stata format
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame
@@ -3753,6 +3753,53 @@ Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used
    import os
    os.remove('stata.dta')
 
+.. _io.stata-categorical:
+
+Categorical Data
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.15.2
+
+``Categorical`` data can be exported to *Stata* data files as value labeled data.
+The exported data consists of the underlying category codes as integer data values
+and the categories as value labels.  *Stata* does not have an explicit equivalent
+to a ``Categorical`` and information about *whether* the variable is ordered
+is lost when exporting.
+
+.. warning::
+
+    *Stata* only supports string value labels, and so ``str`` is called on the
+    categories when exporting data.  Exporting ``Categorical`` variables with
+    non-string categories produces a warning, and can result in a loss of
+    information if the ``str`` representations of the categories are not unique.
+
+Labeled data can similarly be imported from *Stata* data files as ``Categorical``
+variables using the keyword argument ``convert_categoricals`` (``True`` by default).  
+By default, imported ``Categorical`` variables are ordered according to the 
+underlying numerical data. However, setting ``order_categoricals=False`` will 
+import labeled data as ``Categorical`` variables without an order.
+
+.. note::
+
+    When importing categorical data, the values of the variables in the *Stata*
+    data file are not generally preserved since ``Categorical`` variables always
+    use integer codes between ``-1`` and ``n-1`` where ``n`` is the number
+    of categories. If the original values in the *Stata* data file are required,
+    these can be imported by setting ``convert_categoricals=False``, which will
+    import original data (but not the value labels). The original values can
+    be matched to the imported categorical data since there is a simple mapping
+    between the original *Stata* data values and the category codes of imported
+    Categorical variables: missing values are assigned code ``-1``, and the
+    smallest original value is assigned ``0``, the second smallest is assigned
+    ``1`` and so on until the largest original value is assigned the code ``n-1``.
+
+.. note::
+
+    *Stata* supports partially labeled series.  These series have value labels for
+    some but not all data values. Importing a partially labeled series will produce
+    a ``Categorical`` with string categories for the values that are labeled and
+    numeric categories for values with no label.
+
 .. _io.perf:
 
 Performance Considerations
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index 61d18da45e5f0..fedd3ddabf045 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -41,10 +41,11 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
-- Added ability to export Categorical data to Stata (:issue:`8633`).
+- Added ability to export Categorical data to Stata (:issue:`8633`).  See :ref:`here <io.stata-categorical>` for limitations of categorical variables exported to Stata data files.
 - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here <io.hdf5-categorical>` for an example and caveats w.r.t. prior versions of pandas.
 - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
 - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here<remote_data.ga>`.
+- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`).  See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files.
 
 .. _whatsnew_0152.performance:
 
@@ -73,6 +74,7 @@ Bug Fixes
 
 
 
+- Imported categorical variables from Stata files retain the ordinal information in the underlying data (:issue:`8836`).
 
 
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index ab9d330b48988..45d3274088c75 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -29,7 +29,8 @@
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None,
-               convert_missing=False, preserve_dtypes=True, columns=None):
+               convert_missing=False, preserve_dtypes=True, columns=None,
+               order_categoricals=True):
     """
     Read Stata file into DataFrame
 
@@ -58,11 +59,14 @@ def read_stata(filepath_or_buffer, convert_dates=True,
     columns : list or None
         Columns to retain.  Columns will be returned in the given order.  None
         returns all columns
+    order_categoricals : boolean, defaults to True
+        Flag indicating whether converted categorical data are ordered.
     """
     reader = StataReader(filepath_or_buffer, encoding)
 
     return reader.data(convert_dates, convert_categoricals, index,
-                       convert_missing, preserve_dtypes, columns)
+                       convert_missing, preserve_dtypes, columns,
+                       order_categoricals)
 
 _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
@@ -1136,7 +1140,8 @@ def _read_strls(self):
             self.path_or_buf.read(1)  # zero-termination
 
     def data(self, convert_dates=True, convert_categoricals=True, index=None,
-             convert_missing=False, preserve_dtypes=True, columns=None):
+             convert_missing=False, preserve_dtypes=True, columns=None,
+             order_categoricals=True):
         """
         Reads observations from Stata file, converting them into a dataframe
 
@@ -1161,6 +1166,8 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
         columns : list or None
             Columns to retain.  Columns will be returned in the given order.
             None returns all columns
+        order_categoricals : boolean, defaults to True
+            Flag indicating whether converted categorical data are ordered.
 
         Returns
         -------
@@ -1228,7 +1235,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
 
         for col, typ in zip(data, self.typlist):
             if type(typ) is int:
-                data[col] = data[col].apply(self._null_terminate, convert_dtype=True,)
+                data[col] = data[col].apply(self._null_terminate, convert_dtype=True)
 
         cols_ = np.where(self.dtyplist)[0]
 
@@ -1288,19 +1295,25 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
                 col = data.columns[i]
                 data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i])
 
-        if convert_categoricals:
-            cols = np.where(
-                lmap(lambda x: x in compat.iterkeys(self.value_label_dict),
-                     self.lbllist)
-            )[0]
-            for i in cols:
-                col = data.columns[i]
-                labeled_data = np.copy(data[col])
-                labeled_data = labeled_data.astype(object)
-                for k, v in compat.iteritems(
-                        self.value_label_dict[self.lbllist[i]]):
-                    labeled_data[(data[col] == k).values] = v
-                data[col] = Categorical.from_array(labeled_data)
+        if convert_categoricals and self.value_label_dict:
+            value_labels = list(compat.iterkeys(self.value_label_dict))
+            cat_converted_data = []
+            for col, label in zip(data, self.lbllist):
+                if label in value_labels:
+                    # Explicit call so ordering follows order_categoricals
+                    cat_data = Categorical(data[col], ordered=order_categoricals)
+                    value_label_dict = self.value_label_dict[label]
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in value_label_dict:
+                            categories.append(value_label_dict[category])
+                        else:
+                            categories.append(category)  # Partially labeled
+                    cat_data.categories = categories
+                    cat_converted_data.append((col, cat_data))
+                else:
+                    cat_converted_data.append((col, data[col]))
+            data = DataFrame.from_items(cat_converted_data)
 
         if not preserve_dtypes:
             retyped_data = []
diff --git a/pandas/io/tests/data/stata10_115.dta b/pandas/io/tests/data/stata10_115.dta
new file mode 100755
index 0000000000000000000000000000000000000000..b917dde5ad47dfecfc872b5a5ea756ce603eda59
GIT binary patch
literal 2298
zcmeHJy>1jS5O$8A1PUZ1puUEdy$cahqLH)WuL-&aD5zH6TZgQ)KPTH9L_#7GFF;Fo
zqT*?I0-k{)4TvVpc)ZzF?v@&f>}hVspJ!*rW6yZOI^|T7IVIsxt4tdSMi*LHEyv?+
zh?)VUM^4v<6rMeQ3Q;?ReYq|m+LthqSA;-uBB4;JE+on<fXhpbuF9FbIo=*l>iKYU
zG~ODHc6PT$5Vf-qHBppmCefOIJ_mkljfF>n?+tjI1^(?4kH24L{$QQ?tH4hJU)x~(
zN8rB$hlsqwT$x;@g-+bjOk4T$wEg`$h3MQgsaFb@Cw+Q!#<o}e8$H*?>Qz&WRkgR#
zuW(S*%H`GTUJb91l)1G%T2<}0q}PDRD|derExhYniO&G97;J4^kz0pbbt7y7TZZUt
z3Jz*hDre>0oqMDH$X7&k@xqFvEI|-83C5#XywQo@;DmQ5RW_e4z0v{x1%*yHtHr9W
z{$5;kCeJUutz(}ME5+OLD%lU+Lp-_t@gd3a$&0wbX#Q)^3*F*O|M4BxpM7fdFAV+%
zIqY=&aFAensEnFAZLsy_Q!61_LNdHpu`3?$39*Rrj3JTC6iypqNFor+0|Zz<Mtp(P
za=zb=5fez>7RevT4&QmC-{2^3(A*;Cd)bEs)|rAL4!l!0h!$xfM`*EIaM!}IfBEIT
zFO#M`$YHoR#Q<d*Q+?6mC{P-c<4kCg3Pm3SXx+QmMVK3pHgwJOxuDmtBhk}!L2k(n
VVO>`W9ZkK%;H=vgc+t)0&L2ctMtT4M

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/io/tests/data/stata10_117.dta
new file mode 100755
index 0000000000000000000000000000000000000000..b917dde5ad47dfecfc872b5a5ea756ce603eda59
GIT binary patch
literal 2298
zcmeHJy>1jS5O$8A1PUZ1puUEdy$cahqLH)WuL-&aD5zH6TZgQ)KPTH9L_#7GFF;Fo
zqT*?I0-k{)4TvVpc)ZzF?v@&f>}hVspJ!*rW6yZOI^|T7IVIsxt4tdSMi*LHEyv?+
zh?)VUM^4v<6rMeQ3Q;?ReYq|m+LthqSA;-uBB4;JE+on<fXhpbuF9FbIo=*l>iKYU
zG~ODHc6PT$5Vf-qHBppmCefOIJ_mkljfF>n?+tjI1^(?4kH24L{$QQ?tH4hJU)x~(
zN8rB$hlsqwT$x;@g-+bjOk4T$wEg`$h3MQgsaFb@Cw+Q!#<o}e8$H*?>Qz&WRkgR#
zuW(S*%H`GTUJb91l)1G%T2<}0q}PDRD|derExhYniO&G97;J4^kz0pbbt7y7TZZUt
z3Jz*hDre>0oqMDH$X7&k@xqFvEI|-83C5#XywQo@;DmQ5RW_e4z0v{x1%*yHtHr9W
z{$5;kCeJUutz(}ME5+OLD%lU+Lp-_t@gd3a$&0wbX#Q)^3*F*O|M4BxpM7fdFAV+%
zIqY=&aFAensEnFAZLsy_Q!61_LNdHpu`3?$39*Rrj3JTC6iypqNFor+0|Zz<Mtp(P
za=zb=5fez>7RevT4&QmC-{2^3(A*;Cd)bEs)|rAL4!l!0h!$xfM`*EIaM!}IfBEIT
zFO#M`$YHoR#Q<d*Q+?6mC{P-c<4kCg3Pm3SXx+QmMVK3pHgwJOxuDmtBhk}!L2k(n
VVO>`W9ZkK%;H=vgc+t)0&L2ctMtT4M

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/data/stata11_115.dta b/pandas/io/tests/data/stata11_115.dta
new file mode 100755
index 0000000000000000000000000000000000000000..cfcd250f1cd9fd5b3c2a77f1414fea73d407d955
GIT binary patch
literal 810
zcmbVK%}N7749?b{gC_+qg2;dr6l~ckUBn)AQT(|mLcx<zM|W(6rBkL$Q4lQnBI?Wd
z2A+L{9tBa%H{BJHwW5JczRY}+ge2X9M+KUqp?LGRV?cw{hw)hVYRBC~B}&SMltDR4
z6nVHGygh`lkP`;eZ2J@VBZ%axI50g?@nuy=Sqmh+zjkeltnPuIFHC5RBn-b5*UEmA
zh;=L0s$G;7_#<#p`d~P&Ln*iV<)4e0-uoydR;L8bm9$bv$|clwd|tA;V{D=$(b6hT
zS}VfXNL%)o^yQh3W8F?fX<c-TvCAR=l;4A)!EQi*4=52Z_%UGm&<_iY!g1D+$KT)J
z2|AZ<G$VK5^$xRUN}VsW_AsR9r%f1oyuWta@G`SM2_In=Sr=I6SZ7&3H;pl}V%<2j
u<8Egip(AatMAMjvc5&>e3rmOGBFa+pinzxAmYO5Glt_&dPzw6_kJKMl=8vKP

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/data/stata11_117.dta b/pandas/io/tests/data/stata11_117.dta
new file mode 100755
index 0000000000000000000000000000000000000000..79dfffd94483f30bcf6f155711caa392d849f970
GIT binary patch
literal 1268
zcmbtUPiqrF6d$94coTZ@A_FC$V4ZA9(a2+55bL25gn-v&lDsy7Nj9CG26Jk`leb>I
z_3p)Q(6isdqaQ#Jyu|lrXPdRgia0R){>*RQ|5+%!@g^I3qhW+*h)#n;i>5%GB#%K=
z0`wOibH{*Jy<Jc%5%zRLfZEg0)whKpwyVLKK3YwD$$%eEkX%h>`i^YNZa$aWt)wlJ
zCr?`)P^(i=RTL95(_}5I50P$ME16EDr;&b)R5r@<_mOt5m-1nxpCUbv^y`iC{9B};
z*elG98=Jnx%r9nG=<mxPOHh}T44N8BN*8V<9L`Hi>%IiYt^@?*r7<XS=k2)gF?UcG
z3fRxx#P~utJC9liLePFrjhZ%-ZI-X#n@Nii0UNl8ZP)Gvn^E5aRr#Vc?Ga|7i`hlK
zEDPRiSIjqn)&e-+%T`xK555ZBL%YA_a7OjOp`@v>e)Ld=2_*SGz8#>oIQ4Q%%UPbg
z2XcqmL~I@k9)~ZV&$Hz*F=y`@ET8_n!GsO&{Lx@-CSuGm7#U{<9vyXXJSa4%-!wyd
zZ`9nAlxH-h!Anz`#*=3^qbQD%xbWfAvy_4-2&ixo3hyl2jkH!s(%Qb&J-S`*0Nbsv
zTTx*BX5{(VZqU9<`yOqs7l+5sgr5duUsMGbZ_q8|L7orAOaAPdu@kSU1N_=4fM24h
b2IX&2uvhlZQE)W8{x6F9jFkneesn(pwkR(F

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index d97feaea2658a..a99bcf741792f 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -14,6 +14,7 @@
 import pandas as pd
 from pandas.compat import iterkeys
 from pandas.core.frame import DataFrame, Series
+from pandas.core.common import is_categorical_dtype
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
     PossiblePrecisionLoss, StataMissingValue)
@@ -81,6 +82,11 @@ def setUp(self):
         self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta')
         self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta')
 
+        self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta')
+        self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta')
+
+        self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta')
+        self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta')
 
     def read_dta(self, file):
         # Legacy default reader configuration
@@ -817,6 +823,72 @@ def test_categorical_with_stata_missing_values(self):
             written_and_read_again = self.read_dta(path)
             tm.assert_frame_equal(written_and_read_again.set_index('index'), original)
 
+    def test_categorical_order(self):
+        # Directly construct using expected codes
+        # Format is is_cat, col_name, labels (in order), underlying data
+        expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
+                    (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
+                    (True, 'noorder', ['a', 'b', 'c', 'd', 'e'], np.array([2, 1, 4, 0, 3])),
+                    (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
+                    (True, 'float_missing', ['a', 'd', 'e'], np.array([0, 1, 2, -1, -1])),
+                    (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
+                    (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
+        cols = []
+        for is_cat, col, labels, codes in expected:
+            if is_cat:
+                cols.append((col, pd.Categorical.from_codes(codes, labels)))
+            else:
+                cols.append((col, pd.Series(labels, dtype=np.float32)))
+        expected = DataFrame.from_items(cols)
+
+        # Read with and without categoricals, ensure order is identical
+        parsed_115 = read_stata(self.dta19_115)
+        parsed_117 = read_stata(self.dta19_117)
+        tm.assert_frame_equal(expected, parsed_115)
+        tm.assert_frame_equal(expected, parsed_117)
+
+        # Check identity of codes
+        for col in expected:
+            if is_categorical_dtype(expected[col]):
+                print(col)
+                tm.assert_series_equal(expected[col].cat.codes,
+                                       parsed_115[col].cat.codes)
+                tm.assert_index_equal(expected[col].cat.categories,
+                                      parsed_115[col].cat.categories)
+
+    def test_categorical_sorting(self):
+        parsed_115 = read_stata(self.dta20_115)
+        parsed_117 = read_stata(self.dta20_117)
+        # Sort based on codes, not strings
+        parsed_115 = parsed_115.sort("srh")
+        parsed_117 = parsed_117.sort("srh")
+        # Don't sort index
+        parsed_115.index = np.arange(parsed_115.shape[0])
+        parsed_117.index = np.arange(parsed_117.shape[0])
+        codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
+        categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
+        expected = pd.Series(pd.Categorical.from_codes(codes=codes,
+                                                       categories=categories))
+        tm.assert_series_equal(expected, parsed_115["srh"])
+        tm.assert_series_equal(expected, parsed_117["srh"])
+
+    def test_categorical_ordering(self):
+        parsed_115 = read_stata(self.dta19_115)
+        parsed_117 = read_stata(self.dta19_117)
+
+        parsed_115_unordered = read_stata(self.dta19_115,
+                                          order_categoricals=False)
+        parsed_117_unordered = read_stata(self.dta19_117,
+                                          order_categoricals=False)
+        for col in parsed_115:
+            if not is_categorical_dtype(parsed_115[col]):
+                continue
+            tm.assert_equal(True, parsed_115[col].cat.ordered)
+            tm.assert_equal(True, parsed_117[col].cat.ordered)
+            tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
+            tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)