From 6cf2e48c4f280449a50faed788cbb2abf5e2accc Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 18 Nov 2014 16:11:33 +0000 Subject: [PATCH] BUG: Correct importing behavior for Categoricals in StataReader Ensure that category codes have the same order as the underlying Stata data. Also adds a flag that allows categorical data to be treated as ordered or unordered when importing. --- doc/source/categorical.rst | 3 +- doc/source/io.rst | 59 +++++++++++++++++++--- doc/source/whatsnew/v0.15.2.txt | 4 +- pandas/io/stata.py | 47 ++++++++++------- pandas/io/tests/data/stata10_115.dta | Bin 0 -> 2298 bytes pandas/io/tests/data/stata10_117.dta | Bin 0 -> 2298 bytes pandas/io/tests/data/stata11_115.dta | Bin 0 -> 810 bytes pandas/io/tests/data/stata11_117.dta | Bin 0 -> 1268 bytes pandas/io/tests/test_stata.py | 72 +++++++++++++++++++++++++++ 9 files changed, 160 insertions(+), 25 deletions(-) create mode 100755 pandas/io/tests/data/stata10_115.dta create mode 100755 pandas/io/tests/data/stata10_117.dta create mode 100755 pandas/io/tests/data/stata11_115.dta create mode 100755 pandas/io/tests/data/stata11_117.dta diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 3d7b41a1c4c24..a7091d6ab38fb 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -546,7 +546,8 @@ Getting Data In/Out Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented in 0.15.2. See :ref:`here ` for an example and caveats. -Writing data to/from Stata format files was implemented in 0.15.2. +Writing data to and reading data from *Stata* format files was implemented in +0.15.2. See :ref:`here ` for an example and caveats. Writing to a CSV file will convert the data, effectively removing any information about the categorical (categories and ordering). 
So if you read back the CSV file you have to convert the diff --git a/doc/source/io.rst b/doc/source/io.rst index bd6400787ae58..9686a72d43cf8 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3204,8 +3204,8 @@ format store like this: .. ipython:: python store_export = HDFStore('export.h5') - store_export.append('df_dc', df_dc, data_columns=df_dc.columns) - store_export + store_export.append('df_dc', df_dc, data_columns=df_dc.columns) + store_export .. ipython:: python :suppress: @@ -3240,8 +3240,8 @@ number of options, please see the docstring. legacy_store # copy (and return the new handle) - new_store = legacy_store.copy('store_new.h5') - new_store + new_store = legacy_store.copy('store_new.h5') + new_store new_store.close() .. ipython:: python @@ -3651,14 +3651,14 @@ You can access the management console to determine project id's by: .. _io.stata: -STATA Format +Stata Format ------------ .. versionadded:: 0.12.0 .. _io.stata_writer: -Writing to STATA format +Writing to Stata format ~~~~~~~~~~~~~~~~~~~~~~~ The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame @@ -3753,6 +3753,53 @@ Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used import os os.remove('stata.dta') +.. _io.stata-categorical: + +Categorical Data +~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.15.2 + +``Categorical`` data can be exported to *Stata* data files as value labeled data. +The exported data consists of the underlying category codes as integer data values +and the categories as value labels. *Stata* does not have an explicit equivalent +to a ``Categorical`` and information about *whether* the variable is ordered +is lost when exporting. + +.. warning:: + + *Stata* only supports string value labels, and so ``str`` is called on the + categories when exporting data. 
Exporting ``Categorical`` variables with + non-string categories produces a warning, and can result in a loss of + information if the ``str`` representations of the categories are not unique. + +Labeled data can similarly be imported from *Stata* data files as ``Categorical`` +variables using the keyword argument ``convert_categoricals`` (``True`` by default). +By default, imported ``Categorical`` variables are ordered according to the +underlying numerical data. However, setting ``order_categoricals=False`` will +import labeled data as ``Categorical`` variables without an order. + +.. note:: + + When importing categorical data, the values of the variables in the *Stata* + data file are not generally preserved since ``Categorical`` variables always + use integer data types between ``-1`` and ``n-1`` where ``n`` is the number + of categories. If the original values in the *Stata* data file are required, + these can be imported by setting ``convert_categoricals=False``, which will + import original data (but not the variable labels). The original values can + be matched to the imported categorical data since there is a simple mapping + between the original *Stata* data values and the category codes of imported + Categorical variables: missing values are assigned code ``-1``, and the + smallest original value is assigned ``0``, the second smallest is assigned + ``1`` and so on until the largest original value is assigned the code ``n-1``. + +.. note:: + + *Stata* supports partially labeled series. These series have value labels for + some but not all data values. Importing a partially labeled series will produce + a ``Categorical`` with string categories for the values that are labeled and + numeric categories for values with no label. + ..
_io.perf: Performance Considerations diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 61d18da45e5f0..fedd3ddabf045 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -41,10 +41,11 @@ API changes Enhancements ~~~~~~~~~~~~ -- Added ability to export Categorical data to Stata (:issue:`8633`). +- Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here`. +- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. .. _whatsnew_0152.performance: @@ -73,6 +74,7 @@ Bug Fixes +- Imported categorical variables from Stata files retain the ordinal information in the underlying data (:issue:`8836`). 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ab9d330b48988..45d3274088c75 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -29,7 +29,8 @@ def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None, - convert_missing=False, preserve_dtypes=True, columns=None): + convert_missing=False, preserve_dtypes=True, columns=None, + order_categoricals=True): """ Read Stata file into DataFrame @@ -58,11 +59,14 @@ def read_stata(filepath_or_buffer, convert_dates=True, columns : list or None Columns to retain. Columns will be returned in the given order. None returns all columns + order_categoricals : boolean, defaults to True + Flag indicating whether converted categorical data are ordered. """ reader = StataReader(filepath_or_buffer, encoding) return reader.data(convert_dates, convert_categoricals, index, - convert_missing, preserve_dtypes, columns) + convert_missing, preserve_dtypes, columns, + order_categoricals) _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -1136,7 +1140,8 @@ def _read_strls(self): self.path_or_buf.read(1) # zero-termination def data(self, convert_dates=True, convert_categoricals=True, index=None, - convert_missing=False, preserve_dtypes=True, columns=None): + convert_missing=False, preserve_dtypes=True, columns=None, + order_categoricals=True): """ Reads observations from Stata file, converting them into a dataframe @@ -1161,6 +1166,8 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, columns : list or None Columns to retain. Columns will be returned in the given order. None returns all columns + order_categoricals : boolean, defaults to True + Flag indicating whether converted categorical data are ordered. 
Returns ------- @@ -1228,7 +1235,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, for col, typ in zip(data, self.typlist): if type(typ) is int: - data[col] = data[col].apply(self._null_terminate, convert_dtype=True,) + data[col] = data[col].apply(self._null_terminate, convert_dtype=True) cols_ = np.where(self.dtyplist)[0] @@ -1288,19 +1295,25 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, col = data.columns[i] data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) - if convert_categoricals: - cols = np.where( - lmap(lambda x: x in compat.iterkeys(self.value_label_dict), - self.lbllist) - )[0] - for i in cols: - col = data.columns[i] - labeled_data = np.copy(data[col]) - labeled_data = labeled_data.astype(object) - for k, v in compat.iteritems( - self.value_label_dict[self.lbllist[i]]): - labeled_data[(data[col] == k).values] = v - data[col] = Categorical.from_array(labeled_data) + if convert_categoricals and self.value_label_dict: + value_labels = list(compat.iterkeys(self.value_label_dict)) + cat_converted_data = [] + for col, label in zip(data, self.lbllist): + if label in value_labels: + # Explicit call with ordered=True + cat_data = Categorical(data[col], ordered=order_categoricals) + value_label_dict = self.value_label_dict[label] + categories = [] + for category in cat_data.categories: + if category in value_label_dict: + categories.append(value_label_dict[category]) + else: + categories.append(category) # Partially labeled + cat_data.categories = categories + cat_converted_data.append((col, cat_data)) + else: + cat_converted_data.append((col, data[col])) + data = DataFrame.from_items(cat_converted_data) if not preserve_dtypes: retyped_data = [] diff --git a/pandas/io/tests/data/stata10_115.dta b/pandas/io/tests/data/stata10_115.dta new file mode 100755 index 0000000000000000000000000000000000000000..b917dde5ad47dfecfc872b5a5ea756ce603eda59 GIT binary patch literal 2298 
zcmeHJy>1jS5O$8A1PUZ1puUEdy$cahqLH)WuL-&aD5zH6TZgQ)KPTH9L_#7GFF;Fo zqT*?I0-k{)4TvVpc)ZzF?v@&f>}hVspJ!*rW6yZOI^|T7IVIsxt4tdSMi*LHEyv?+ zh?)VUM^4v<6rMeQ3Q;?ReYq|m+LthqSA;-uBB4;JE+oniKYU zG~ODHc6PT$5Vf-qHBppmCefOIJ_mkljfF>n?+tjI1^(?4kH24L{$QQ?tH4hJU)x~( zN8rB$hlsqwT$x;@g-+bjOk4T$wEg`$h3MQgsaFb@Cw+Q!#Qz&WRkgR# zuW(S*%H`GTUJb91l)1G%T2<}0q}PDRD|derExhYniO&G97;J4^kz0pbbt7y7TZZUt z3Jz*hDre>0oqMDH$X7&k@xqFvEI|-83C5#XywQo@;DmQ5RW_e4z0v{x1%*yHtHr9W z{$5;kCeJUutz(}ME5+OLD%lU+Lp-_t@gd3a$&0wbX#Q)^3*F*O|M4BxpM7fdFAV+% zIqY=&aFAensEnFAZLsy_Q!61_LNdHpu`3?$39*Rrj3JTC6iypqNFor+0|Zz7RevT4&QmC-{2^3(A*;Cd)bEs)|rAL4!l!0h!$xfM`*EIaM!}IfBEIT zFO#M`$YHoR#Q`W9ZkK%;H=vgc+t)0&L2ctMtT4M literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/io/tests/data/stata10_117.dta new file mode 100755 index 0000000000000000000000000000000000000000..b917dde5ad47dfecfc872b5a5ea756ce603eda59 GIT binary patch literal 2298 zcmeHJy>1jS5O$8A1PUZ1puUEdy$cahqLH)WuL-&aD5zH6TZgQ)KPTH9L_#7GFF;Fo zqT*?I0-k{)4TvVpc)ZzF?v@&f>}hVspJ!*rW6yZOI^|T7IVIsxt4tdSMi*LHEyv?+ zh?)VUM^4v<6rMeQ3Q;?ReYq|m+LthqSA;-uBB4;JE+oniKYU zG~ODHc6PT$5Vf-qHBppmCefOIJ_mkljfF>n?+tjI1^(?4kH24L{$QQ?tH4hJU)x~( zN8rB$hlsqwT$x;@g-+bjOk4T$wEg`$h3MQgsaFb@Cw+Q!#Qz&WRkgR# zuW(S*%H`GTUJb91l)1G%T2<}0q}PDRD|derExhYniO&G97;J4^kz0pbbt7y7TZZUt z3Jz*hDre>0oqMDH$X7&k@xqFvEI|-83C5#XywQo@;DmQ5RW_e4z0v{x1%*yHtHr9W z{$5;kCeJUutz(}ME5+OLD%lU+Lp-_t@gd3a$&0wbX#Q)^3*F*O|M4BxpM7fdFAV+% zIqY=&aFAensEnFAZLsy_Q!61_LNdHpu`3?$39*Rrj3JTC6iypqNFor+0|Zz7RevT4&QmC-{2^3(A*;Cd)bEs)|rAL4!l!0h!$xfM`*EIaM!}IfBEIT zFO#M`$YHoR#Q`W9ZkK%;H=vgc+t)0&L2ctMtT4M literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata11_115.dta b/pandas/io/tests/data/stata11_115.dta new file mode 100755 index 0000000000000000000000000000000000000000..cfcd250f1cd9fd5b3c2a77f1414fea73d407d955 GIT binary patch literal 810 zcmbVK%}N7749?b{gC_+qg2;dr6l~ckUBn)AQT(|mLcxe3rmOGBFa+pinzxAmYO5Glt_&dPzw6_kJKMl=8vKP literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata11_117.dta 
b/pandas/io/tests/data/stata11_117.dta new file mode 100755 index 0000000000000000000000000000000000000000..79dfffd94483f30bcf6f155711caa392d849f970 GIT binary patch literal 1268 zcmbtUPiqrF6d$94coTZ@A_FC$V4ZA9(a2+55bL25gn-v&lDsy7Nj9CG26Jk`leb>I z_3p)Q(6isdqaQ#Jyu|lrXPdRgia0R){>*RQ|5+%!@g^I3qhW+*h)#n;i>5%GB#%K= z0`wOibH{*Jy`i^YNZa$aWt)wlJ zCr?`)P^(i=RTL95(_}5I50P$ME16EDr;&b)R5r@<_mOt5m-1nxpCUbv^y`iC{9B}; z*elG98=Jnx%r9nG=%IiYt^@?*r7e)Ld=2_*SGz8#>oIQ4Q%%UPbg z2XcqmL~I@k9)~ZV&$Hz*F=y`@ET8_n!GsO&{Lx@-CSuGm7#U{<9vyXXJSa4%-!wyd zZ`9nAlxH-h!Anz`#*=3^qbQD%xbWfAvy_4-2&ixo3hyl2jkH!s(%Qb&J-S`*0Nbsv zTTx*BX5{(VZqU9<`yOqs7l+5sgr5duUsMGbZ_q8|L7orAOaAPdu@kSU1N_=4fM24h b2IX&2uvhlZQE)W8{x6F9jFkneesn(pwkR(F literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index d97feaea2658a..a99bcf741792f 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -14,6 +14,7 @@ import pandas as pd from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series +from pandas.core.common import is_categorical_dtype from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) @@ -81,6 +82,11 @@ def setUp(self): self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta') + self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta') + + self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta') + self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta') def read_dta(self, file): # Legacy default reader configuration @@ -817,6 +823,72 @@ def test_categorical_with_stata_missing_values(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) + def test_categorical_order(self): + # Directly construct using expected codes 
+ # Format is is_cat, col_name, labels (in order), underlying data + expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)), + (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]), + (True, 'noorder', ['a', 'b', 'c', 'd', 'e'], np.array([2, 1, 4, 0, 3])), + (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)), + (True, 'float_missing', ['a', 'd', 'e'], np.array([0, 1, 2, -1, -1])), + (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)), + (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))] + cols = [] + for is_cat, col, labels, codes in expected: + if is_cat: + cols.append((col, pd.Categorical.from_codes(codes, labels))) + else: + cols.append((col, pd.Series(labels, dtype=np.float32))) + expected = DataFrame.from_items(cols) + + # Read with and with out categoricals, ensure order is identical + parsed_115 = read_stata(self.dta19_115) + parsed_117 = read_stata(self.dta19_117) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) + + # Check identity of codes + for col in expected: + if is_categorical_dtype(expected[col]): + print(col) + tm.assert_series_equal(expected[col].cat.codes, + parsed_115[col].cat.codes) + tm.assert_index_equal(expected[col].cat.categories, + parsed_115[col].cat.categories) + + def test_categorical_sorting(self): + parsed_115 = read_stata(self.dta20_115) + parsed_117 = read_stata(self.dta20_117) + # Sort based on codes, not strings + parsed_115 = parsed_115.sort("srh") + parsed_117 = parsed_117.sort("srh") + # Don't sort index + parsed_115.index = np.arange(parsed_115.shape[0]) + parsed_117.index = np.arange(parsed_117.shape[0]) + codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] + categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] + expected = pd.Series(pd.Categorical.from_codes(codes=codes, + categories=categories)) + tm.assert_series_equal(expected, parsed_115["srh"]) + tm.assert_series_equal(expected, parsed_117["srh"]) + + def 
test_categorical_ordering(self): + parsed_115 = read_stata(self.dta19_115) + parsed_117 = read_stata(self.dta19_117) + + parsed_115_unordered = read_stata(self.dta19_115, + order_categoricals=False) + parsed_117_unordered = read_stata(self.dta19_117, + order_categoricals=False) + for col in parsed_115: + if not is_categorical_dtype(parsed_115[col]): + continue + tm.assert_equal(True, parsed_115[col].cat.ordered) + tm.assert_equal(True, parsed_117[col].cat.ordered) + tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) + tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)