diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 3d7b41a1c4c24..a7091d6ab38fb 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -546,7 +546,8 @@ Getting Data In/Out Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented in 0.15.2. See :ref:`here ` for an example and caveats. -Writing data to/from Stata format files was implemented in 0.15.2. +Writing data to and reading data from *Stata* format files was implemented in +0.15.2. See :ref:`here <io.stata-categorical>` for an example and caveats. Writing to a CSV file will convert the data, effectively removing any information about the categorical (categories and ordering). So if you read back the CSV file you have to convert the diff --git a/doc/source/io.rst b/doc/source/io.rst index bd6400787ae58..9686a72d43cf8 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3204,8 +3204,8 @@ format store like this: .. ipython:: python store_export = HDFStore('export.h5') - store_export.append('df_dc', df_dc, data_columns=df_dc.columns) - store_export + store_export.append('df_dc', df_dc, data_columns=df_dc.columns) + store_export .. ipython:: python :suppress: @@ -3240,8 +3240,8 @@ number of options, please see the docstring. legacy_store # copy (and return the new handle) - new_store = legacy_store.copy('store_new.h5') - new_store + new_store = legacy_store.copy('store_new.h5') + new_store new_store.close() .. ipython:: python @@ -3651,14 +3651,14 @@ You can access the management console to determine project id's by: .. _io.stata: -STATA Format +Stata Format ------------ .. versionadded:: 0.12.0 .. _io.stata_writer: -Writing to STATA format +Writing to Stata format ~~~~~~~~~~~~~~~~~~~~~~~ The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame @@ -3753,6 +3753,53 @@ Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used import os os.remove('stata.dta') +.. 
_io.stata-categorical: + +Categorical Data +~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.15.2 + +``Categorical`` data can be exported to *Stata* data files as value labeled data. +The exported data consists of the underlying category codes as integer data values +and the categories as value labels. *Stata* does not have an explicit equivalent +to a ``Categorical`` and information about *whether* the variable is ordered +is lost when exporting. + +.. warning:: + + *Stata* only supports string value labels, and so ``str`` is called on the + categories when exporting data. Exporting ``Categorical`` variables with + non-string categories produces a warning, and can result in a loss of + information if the ``str`` representations of the categories are not unique. + +Labeled data can similarly be imported from *Stata* data files as ``Categorical`` +variables using the keyword argument ``convert_categoricals`` (``True`` by default). +By default, imported ``Categorical`` variables are ordered according to the +underlying numerical data. However, setting ``order_categoricals=False`` will +import labeled data as ``Categorical`` variables without an order. + +.. note:: + + When importing categorical data, the values of the variables in the *Stata* + data file are not generally preserved since ``Categorical`` variables always + use integer data types between ``-1`` and ``n-1`` where ``n`` is the number + of categories. If the original values in the *Stata* data file are required, + these can be imported by setting ``convert_categoricals=False``, which will + import original data (but not the variable labels). 
The original values can + be matched to the imported categorical data since there is a simple mapping + between the original *Stata* data values and the category codes of imported + Categorical variables: missing values are assigned code ``-1``, and the + smallest original value is assigned ``0``, the second smallest is assigned + ``1`` and so on until the largest original value is assigned the code ``n-1``. + +.. note:: + + *Stata* supports partially labeled series. These series have value labels for + some but not all data values. Importing a partially labeled series will produce + a ``Categorical`` with string categories for the values that are labeled and + numeric categories for values with no label. + .. _io.perf: Performance Considerations diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 61d18da45e5f0..fedd3ddabf045 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -41,10 +41,11 @@ API changes Enhancements ~~~~~~~~~~~~ -- Added ability to export Categorical data to Stata (:issue:`8633`). +- Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here <io.stata-categorical>` for limitations of categorical variables exported to Stata data files. - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here`. +- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files. .. 
_whatsnew_0152.performance: @@ -73,6 +74,7 @@ Bug Fixes +- Imported categorical variables from Stata files retain the ordinal information in the underlying data (:issue:`8836`). diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ab9d330b48988..45d3274088c75 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -29,7 +29,8 @@ def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None, - convert_missing=False, preserve_dtypes=True, columns=None): + convert_missing=False, preserve_dtypes=True, columns=None, + order_categoricals=True): """ Read Stata file into DataFrame @@ -58,11 +59,14 @@ def read_stata(filepath_or_buffer, convert_dates=True, columns : list or None Columns to retain. Columns will be returned in the given order. None returns all columns + order_categoricals : boolean, defaults to True + Flag indicating whether converted categorical data are ordered. """ reader = StataReader(filepath_or_buffer, encoding) return reader.data(convert_dates, convert_categoricals, index, - convert_missing, preserve_dtypes, columns) + convert_missing, preserve_dtypes, columns, + order_categoricals) _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -1136,7 +1140,8 @@ def _read_strls(self): self.path_or_buf.read(1) # zero-termination def data(self, convert_dates=True, convert_categoricals=True, index=None, - convert_missing=False, preserve_dtypes=True, columns=None): + convert_missing=False, preserve_dtypes=True, columns=None, + order_categoricals=True): """ Reads observations from Stata file, converting them into a dataframe @@ -1161,6 +1166,8 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, columns : list or None Columns to retain. Columns will be returned in the given order. None returns all columns + order_categoricals : boolean, defaults to True + Flag indicating whether converted categorical data are ordered. 
Returns ------- @@ -1228,7 +1235,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, for col, typ in zip(data, self.typlist): if type(typ) is int: - data[col] = data[col].apply(self._null_terminate, convert_dtype=True,) + data[col] = data[col].apply(self._null_terminate, convert_dtype=True) cols_ = np.where(self.dtyplist)[0] @@ -1288,19 +1295,25 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, col = data.columns[i] data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) - if convert_categoricals: - cols = np.where( - lmap(lambda x: x in compat.iterkeys(self.value_label_dict), - self.lbllist) - )[0] - for i in cols: - col = data.columns[i] - labeled_data = np.copy(data[col]) - labeled_data = labeled_data.astype(object) - for k, v in compat.iteritems( - self.value_label_dict[self.lbllist[i]]): - labeled_data[(data[col] == k).values] = v - data[col] = Categorical.from_array(labeled_data) + if convert_categoricals and self.value_label_dict: + value_labels = list(compat.iterkeys(self.value_label_dict)) + cat_converted_data = [] + for col, label in zip(data, self.lbllist): + if label in value_labels: + # Explicit call with ordered=True + cat_data = Categorical(data[col], ordered=order_categoricals) + value_label_dict = self.value_label_dict[label] + categories = [] + for category in cat_data.categories: + if category in value_label_dict: + categories.append(value_label_dict[category]) + else: + categories.append(category) # Partially labeled + cat_data.categories = categories + cat_converted_data.append((col, cat_data)) + else: + cat_converted_data.append((col, data[col])) + data = DataFrame.from_items(cat_converted_data) if not preserve_dtypes: retyped_data = [] diff --git a/pandas/io/tests/data/stata10_115.dta b/pandas/io/tests/data/stata10_115.dta new file mode 100755 index 0000000000000..b917dde5ad47d Binary files /dev/null and b/pandas/io/tests/data/stata10_115.dta differ diff --git 
a/pandas/io/tests/data/stata10_117.dta b/pandas/io/tests/data/stata10_117.dta new file mode 100755 index 0000000000000..b917dde5ad47d Binary files /dev/null and b/pandas/io/tests/data/stata10_117.dta differ diff --git a/pandas/io/tests/data/stata11_115.dta b/pandas/io/tests/data/stata11_115.dta new file mode 100755 index 0000000000000..cfcd250f1cd9f Binary files /dev/null and b/pandas/io/tests/data/stata11_115.dta differ diff --git a/pandas/io/tests/data/stata11_117.dta b/pandas/io/tests/data/stata11_117.dta new file mode 100755 index 0000000000000..79dfffd94483f Binary files /dev/null and b/pandas/io/tests/data/stata11_117.dta differ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index d97feaea2658a..a99bcf741792f 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -14,6 +14,7 @@ import pandas as pd from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series +from pandas.core.common import is_categorical_dtype from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) @@ -81,6 +82,11 @@ def setUp(self): self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta') + self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta') + + self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta') + self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta') def read_dta(self, file): # Legacy default reader configuration @@ -817,6 +823,72 @@ def test_categorical_with_stata_missing_values(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) + def test_categorical_order(self): + # Directly construct using expected codes + # Format is is_cat, col_name, labels (in order), underlying data + expected 
= [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)), + (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]), + (True, 'noorder', ['a', 'b', 'c', 'd', 'e'], np.array([2, 1, 4, 0, 3])), + (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)), + (True, 'float_missing', ['a', 'd', 'e'], np.array([0, 1, 2, -1, -1])), + (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)), + (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))] + cols = [] + for is_cat, col, labels, codes in expected: + if is_cat: + cols.append((col, pd.Categorical.from_codes(codes, labels))) + else: + cols.append((col, pd.Series(labels, dtype=np.float32))) + expected = DataFrame.from_items(cols) + + # Read with and with out categoricals, ensure order is identical + parsed_115 = read_stata(self.dta19_115) + parsed_117 = read_stata(self.dta19_117) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) + + # Check identity of codes + for col in expected: + if is_categorical_dtype(expected[col]): + print(col) + tm.assert_series_equal(expected[col].cat.codes, + parsed_115[col].cat.codes) + tm.assert_index_equal(expected[col].cat.categories, + parsed_115[col].cat.categories) + + def test_categorical_sorting(self): + parsed_115 = read_stata(self.dta20_115) + parsed_117 = read_stata(self.dta20_117) + # Sort based on codes, not strings + parsed_115 = parsed_115.sort("srh") + parsed_117 = parsed_117.sort("srh") + # Don't sort index + parsed_115.index = np.arange(parsed_115.shape[0]) + parsed_117.index = np.arange(parsed_117.shape[0]) + codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] + categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] + expected = pd.Series(pd.Categorical.from_codes(codes=codes, + categories=categories)) + tm.assert_series_equal(expected, parsed_115["srh"]) + tm.assert_series_equal(expected, parsed_117["srh"]) + + def test_categorical_ordering(self): + parsed_115 = read_stata(self.dta19_115) + parsed_117 = 
read_stata(self.dta19_117) + + parsed_115_unordered = read_stata(self.dta19_115, + order_categoricals=False) + parsed_117_unordered = read_stata(self.dta19_117, + order_categoricals=False) + for col in parsed_115: + if not is_categorical_dtype(parsed_115[col]): + continue + tm.assert_equal(True, parsed_115[col].cat.ordered) + tm.assert_equal(True, parsed_117[col].cat.ordered) + tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) + tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)