diff --git a/doc/source/io.rst b/doc/source/io.rst index d6abed6e9d1ad..4d47d8b77aebf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -452,7 +452,8 @@ Specifying Categorical dtype .. versionadded:: 0.19.0 -``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` +``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or +``dtype=CategoricalDtype(categories, ordered)``. .. ipython:: python @@ -468,12 +469,40 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes +.. versionadded:: 0.21.0 + +Specifying ``dtype='category'`` will result in an unordered ``Categorical`` +whose ``categories`` are the unique values observed in the data. For more +control over the categories and order, create a +:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for +that column's ``dtype``. + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) + pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes + +When using ``dtype=CategoricalDtype``, "unexpected" values outside of +``dtype.categories`` are treated as missing values. + +.. ipython:: python + + dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c' + pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1 + +This matches the behavior of :meth:`Categorical.set_categories`. + .. note:: - The resulting categories will always be parsed as strings (object dtype). - If the categories are numeric they can be converted using the - :func:`to_numeric` function, or as appropriate, another converter - such as :func:`to_datetime`. + With ``dtype='category'``, the resulting categories will always be parsed + as strings (object dtype). If the categories are numeric they can be + converted using the :func:`to_numeric` function, or as appropriate, another + converter such as :func:`to_datetime`. 
+ + When ``dtype`` is a ``CategoricalDtype`` with homogeneous ``categories`` + (all numeric, all datetimes, etc.), the conversion is done automatically. .. ipython:: python diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index dae93feb48b02..72847de135d91 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -119,7 +119,7 @@ expanded to include the ``categories`` and ``ordered`` attributes. A ``CategoricalDtype`` can be used to specify the set of categories and orderedness of an array, independent of the data themselves. This can be useful, e.g., when converting string data to a ``Categorical`` (:issue:`14711`, -:issue:`15078`, :issue:`16015`): +:issue:`15078`, :issue:`16015`, :issue:`17643`): .. ipython:: python @@ -129,8 +129,39 @@ e.g., when converting string data to a ``Categorical`` (:issue:`14711`, dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) s.astype(dtype) +One place that deserves special mention is in :meth:`read_csv`. Previously, with +``dtype={'col': 'category'}``, the returned values and categories would always +be strings. + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +.. ipython:: python + + data = 'A,B\na,1\nb,2\nc,3' + pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories + +Notice the "object" dtype. + +With a ``CategoricalDtype`` of all numerics, datetimes, or +timedeltas, we can automatically convert to the correct type: + +.. ipython:: python + + dtype = {'B': CategoricalDtype([1, 2, 3])} + pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories + +The values have been correctly interpreted as integers. + The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a -``Series`` with categorical type will now return an instance of ``CategoricalDtype``. +``Series`` with categorical type will now return an instance of +``CategoricalDtype``. For the most part, this is backwards compatible, though +the string repr has changed. 
+ If you were previously using ``str(s.dtype) == +'category'`` to detect categorical data, switch to +:func:`pandas.api.types.is_categorical_dtype`, which is compatible with the old +and new ``CategoricalDtype``. See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more. diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5bf9f4ce83cbf..60a646769dd1a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -45,7 +45,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_object_dtype, is_string_dtype, is_datetime64_dtype, pandas_dtype) -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _recode_for_categories from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals from pandas import Index @@ -1267,19 +1267,14 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_categorical_dtype(dtype): + # TODO: I suspect that _categorical_convert could be + # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) - # sort categories and recode if necessary - cats = Index(cats) - if not cats.is_monotonic_increasing: - unsorted = cats.copy() - cats = cats.sort_values() - indexer = cats.get_indexer(unsorted) - codes = take_1d(indexer, codes, fill_value=-1) - - return Categorical(codes, categories=cats, ordered=False, - fastpath=True), na_count + cat = Categorical._from_inferred_categories(cats, codes, dtype) + return cat, na_count + elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) @@ -2230,8 +2225,11 @@ def _concatenate_chunks(list chunks): if common_type == np.object: warning_columns.append(str(name)) - if is_categorical_dtype(dtypes.pop()): - result[name] = union_categoricals(arrs, sort_categories=True) + dtype = dtypes.pop() + if is_categorical_dtype(dtype): 
+ sort_categories = isinstance(dtype, str) + result[name] = union_categoricals(arrs, + sort_categories=sort_categories) else: result[name] = np.concatenate(arrs) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d79937829cf3f..ce71e6fd74326 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -21,6 +21,8 @@ _ensure_platform_int, is_dtype_equal, is_datetimelike, + is_datetime64_dtype, + is_timedelta64_dtype, is_categorical, is_categorical_dtype, is_integer_dtype, @@ -509,6 +511,59 @@ def base(self): """ compat, we are always our own object """ return None + @classmethod + def _from_inferred_categories(cls, inferred_categories, inferred_codes, + dtype): + """Construct a Categorical from inferred values + + For inferred categories (`dtype` is None) the categories are sorted. + For explicit `dtype`, the `inferred_categories` are cast to the + appropriate type. + + Parameters + ---------- + + inferred_categories : Index + inferred_codes : Index + dtype : CategoricalDtype or 'category' + + Returns + ------- + Categorical + """ + from pandas import Index, to_numeric, to_datetime, to_timedelta + + cats = Index(inferred_categories) + + known_categories = (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None) + + if known_categories: + # Convert to a specialzed type with `dtype` if specified + if dtype.categories.is_numeric(): + cats = to_numeric(inferred_categories, errors='coerce') + elif is_datetime64_dtype(dtype.categories): + cats = to_datetime(inferred_categories, errors='coerce') + elif is_timedelta64_dtype(dtype.categories): + cats = to_timedelta(inferred_categories, errors='coerce') + + if known_categories: + # recode from observation oder to dtype.categories order + categories = dtype.categories + codes = _recode_for_categories(inferred_codes, cats, categories) + elif not cats.is_monotonic_increasing: + # sort categories and recode for unknown categories + unsorted = cats.copy() + categories = 
cats.sort_values() + codes = _recode_for_categories(inferred_codes, unsorted, + categories) + dtype = CategoricalDtype(categories, ordered=False) + else: + dtype = CategoricalDtype(cats, ordered=False) + codes = inferred_codes + + return cls(codes, dtype=dtype, fastpath=True) + @classmethod def from_array(cls, data, **kwargs): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index eeb79552477e1..c8b2987d591ef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,6 +21,7 @@ is_float, is_dtype_equal, is_object_dtype, is_string_dtype, is_scalar, is_categorical_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import astype_nansafe from pandas.core.index import (Index, MultiIndex, RangeIndex, @@ -1602,12 +1603,20 @@ def _cast_types(self, values, cast_type, column): """ if is_categorical_dtype(cast_type): - # XXX this is for consistency with - # c-parser which parses all categories - # as strings - if not is_object_dtype(values): + known_cats = (isinstance(cast_type, CategoricalDtype) and + cast_type.categories is not None) + + if not is_object_dtype(values) and not known_cats: + # XXX this is for consistency with + # c-parser which parses all categories + # as strings values = astype_nansafe(values, str) - values = Categorical(values) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type + ) + else: try: values = astype_nansafe(values, cast_type, copy=True) diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 402fa0817595c..7d3df6201a390 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -149,6 +149,105 @@ def test_categorical_dtype_chunksize(self): for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) + @pytest.mark.parametrize('ordered', [False, True]) + 
@pytest.mark.parametrize('categories', [ + ['a', 'b', 'c'], + ['a', 'c', 'b'], + ['a', 'b', 'c', 'd'], + ['c', 'b', 'a'], + ]) + def test_categorical_categoricaldtype(self, categories, ordered): + data = """a,b +1,a +1,b +1,b +2,c""" + expected = pd.DataFrame({ + "a": [1, 1, 1, 2], + "b": Categorical(['a', 'b', 'b', 'c'], + categories=categories, + ordered=ordered) + }) + dtype = {"b": CategoricalDtype(categories=categories, + ordered=ordered)} + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categorical_categoricaldtype_unsorted(self): + data = """a,b +1,a +1,b +1,b +2,c""" + dtype = CategoricalDtype(['c', 'b', 'a']) + expected = pd.DataFrame({ + 'a': [1, 1, 1, 2], + 'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a']) + }) + result = self.read_csv(StringIO(data), dtype={'b': dtype}) + tm.assert_frame_equal(result, expected) + + def test_categoricaldtype_coerces_numeric(self): + dtype = {'b': CategoricalDtype([1, 2, 3])} + data = "b\n1\n1\n2\n3" + expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categoricaldtype_coerces_datetime(self): + dtype = { + 'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS')) + } + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + dtype = { + 'b': CategoricalDtype([pd.Timestamp("2014")]) + } + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categoricaldtype_coerces_timedelta(self): + dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))} + data = "b\n1H\n2H\n3H" + expected = 
pd.DataFrame({'b': Categorical(dtype['b'].categories)}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categoricaldtype_unexpected_categories(self): + dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])} + data = "b\nd\na\nc\nd" # Unexpected c + expected = pd.DataFrame({"b": Categorical(list('dacd'), + dtype=dtype['b'])}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categorical_categoricaldtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ['a', 'b', 'c'] + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'], + categories=cats)}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'], + categories=cats)}, + index=[2, 3])] + dtype = CategoricalDtype(cats) + actuals = self.read_csv(StringIO(data), dtype={'b': dtype}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + def test_empty_pass_dtype(self): data = 'one,two' result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d43901ea091b7..9e3bd40dc275a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -560,6 +560,40 @@ def f(): codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) pd.Categorical.from_codes(codes, categories=["train", "test"]) + @pytest.mark.parametrize('dtype', [None, 'category']) + def test_from_inferred_categories(self, dtype): + cats = ['a', 'b'] + codes = np.array([0, 0, 1, 1], dtype='i8') + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes(codes, cats) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize('dtype', [None, 'category']) + def test_from_inferred_categories_sorts(self, dtype): + cats = ['b', 'a'] + codes = np.array([0, 1, 1, 1], dtype='i8') + result = 
Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) + tm.assert_categorical_equal(result, expected) + + def test_from_inferred_categories_dtype(self): + cats = ['a', 'b', 'd'] + codes = np.array([0, 1, 0, 2], dtype='i8') + dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical(['a', 'b', 'a', 'd'], + categories=['c', 'b', 'a'], + ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_from_inferred_categories_coerces(self): + cats = ['1', '2', 'bad'] + codes = np.array([0, 0, 1, 2], dtype='i8') + dtype = CategoricalDtype([1, 2]) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical([1, 1, 2, np.nan]) + tm.assert_categorical_equal(result, expected) + def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'"