From e83a0b820e1abd2c97685c6ff5a1917f35bd5722 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 31 Aug 2017 13:53:07 -0500 Subject: [PATCH 01/17] ENH: Accept CategoricalDtype in CSV reader --- doc/source/io.rst | 15 ++++++++++- pandas/_libs/parsers.pyx | 23 +++++++++++++--- pandas/io/parsers.py | 7 ++++- pandas/tests/io/parser/dtypes.py | 45 ++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 6 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d6abed6e9d1ad..6a4af0c716f4d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -452,7 +452,8 @@ Specifying Categorical dtype .. versionadded:: 0.19.0 -``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` +``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or +``dtype=CategoricalDtype(categories, ordered)``. .. ipython:: python @@ -468,6 +469,18 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes +Specifying ``dtype='cateogry'`` will result in a ``Categorical`` that is +unordered, and whose ``categories`` are the unique values observed in the data. +For more control on the categories and order, create a +:class:`~pandas.api.types.CategoricalDtype` ahead of time. + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) + pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes + .. note:: The resulting categories will always be parsed as strings (object dtype). 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5bf9f4ce83cbf..9324ca0c76ce3 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1267,6 +1267,8 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_categorical_dtype(dtype): + # TODO: I suspect that this could be optimized when dtype + # is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) @@ -1278,8 +1280,18 @@ cdef class TextReader: indexer = cats.get_indexer(unsorted) codes = take_1d(indexer, codes, fill_value=-1) - return Categorical(codes, categories=cats, ordered=False, - fastpath=True), na_count + cat = Categorical(codes, categories=cats, ordered=False, + fastpath=True) + + if isinstance(dtype, CategoricalDtype): + if dtype.categories is None: + # skip recoding + if dtype.ordered: + cat = cat.set_ordered(ordered=dtype.ordered) + else: + cat = cat.set_categories(dtype.categories, + ordered=dtype.ordered) + return cat, na_count elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) @@ -2230,8 +2242,11 @@ def _concatenate_chunks(list chunks): if common_type == np.object: warning_columns.append(str(name)) - if is_categorical_dtype(dtypes.pop()): - result[name] = union_categoricals(arrs, sort_categories=True) + dtype = dtypes.pop() + if is_categorical_dtype(dtype): + sort_categories = isinstance(dtype, str) + result[name] = union_categoricals(arrs, + sort_categories=sort_categories) else: result[name] = np.concatenate(arrs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ed15d4295d688..2a94c50c91f25 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -21,6 +21,7 @@ is_float, is_dtype_equal, is_object_dtype, is_string_dtype, is_scalar, is_categorical_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna from 
pandas.core.dtypes.cast import astype_nansafe from pandas.core.index import (Index, MultiIndex, RangeIndex, @@ -1607,7 +1608,11 @@ def _cast_types(self, values, cast_type, column): # as strings if not is_object_dtype(values): values = astype_nansafe(values, str) - values = Categorical(values) + if isinstance(cast_type, CategoricalDtype): + values = Categorical(values, categories=cast_type.categories, + ordered=cast_type.ordered) + else: + values = Categorical(values) else: try: values = astype_nansafe(values, cast_type, copy=True) diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 402fa0817595c..10d005fa4a333 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -149,6 +149,51 @@ def test_categorical_dtype_chunksize(self): for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) + @pytest.mark.parametrize('ordered', [False, True]) + @pytest.mark.parametrize('categories', [ + ['a', 'b', 'c'], + ['a', 'c', 'b'], + ['a', 'b', 'c', 'd'], + ]) + def test_categorical_categoricaldtype(self, categories, ordered): + data = """a,b +1,a +1,b +1,b +2,c""" + expected = pd.DataFrame({ + "a": [1, 1, 1, 2], + "b": Categorical(['a', 'b', 'b', 'c'], + categories=categories, + ordered=ordered) + }) + dtype = {"b": CategoricalDtype(categories=categories, + ordered=ordered)} + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categorical_categoricaldtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ['a', 'b', 'c'] + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'], + categories=cats)}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'], + categories=cats)}, + index=[2, 3])] + dtype = CategoricalDtype(cats) + actuals = self.read_csv(StringIO(data), dtype={'b': dtype}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, 
expected) + def test_empty_pass_dtype(self): data = 'one,two' result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) From 388e8a963dc8d0e93a8d910bf17ce565048503df Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 24 Sep 2017 13:26:23 -0500 Subject: [PATCH 02/17] rework --- pandas/_libs/parsers.pyx | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 9324ca0c76ce3..90bbe07b23d55 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1274,23 +1274,25 @@ cdef class TextReader: na_hashset, self.c_encoding) # sort categories and recode if necessary cats = Index(cats) - if not cats.is_monotonic_increasing: + if (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None): + # redcode for dtype.categories + categories = dtype.categories + indexer = cats.get_indexer(categories) + codes = take_1d(codes, categories, fill_value=-1) + elif not cats.is_monotonic_increasing: unsorted = cats.copy() cats = cats.sort_values() indexer = cats.get_indexer(unsorted) codes = take_1d(indexer, codes, fill_value=-1) + else: + categories = cats - cat = Categorical(codes, categories=cats, ordered=False, + cat = Categorical(codes, categories=categories, ordered=False, fastpath=True) - if isinstance(dtype, CategoricalDtype): - if dtype.categories is None: - # skip recoding - if dtype.ordered: - cat = cat.set_ordered(ordered=dtype.ordered) - else: - cat = cat.set_categories(dtype.categories, - ordered=dtype.ordered) + if isinstance(dtype, CategoricalDtype) and dtype.ordered: + cat = cat.set_ordered(ordered=True) return cat, na_count elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, From c5f6e04fdf21de4beede3bf2fb6a12cabc4cd76d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 24 Sep 2017 15:39:20 -0500 Subject: [PATCH 03/17] Fixed basic implementation --- pandas/_libs/parsers.pyx | 15 ++++++++------- 
pandas/tests/io/parser/dtypes.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 90bbe07b23d55..b2a7d15a3692e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1272,27 +1272,28 @@ cdef class TextReader: codes, cats, na_count = _categorical_convert( self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) - # sort categories and recode if necessary cats = Index(cats) if (isinstance(dtype, CategoricalDtype) and dtype.categories is not None): - # redcode for dtype.categories + # recode for dtype.categories categories = dtype.categories - indexer = cats.get_indexer(categories) - codes = take_1d(codes, categories, fill_value=-1) + indexer = categories.get_indexer(cats) + codes = take_1d(indexer, codes, fill_value=-1) + ordered = dtype.ordered elif not cats.is_monotonic_increasing: + # sort categories and recode if necessary unsorted = cats.copy() cats = cats.sort_values() indexer = cats.get_indexer(unsorted) codes = take_1d(indexer, codes, fill_value=-1) + ordered = False else: categories = cats + ordered = False - cat = Categorical(codes, categories=categories, ordered=False, + cat = Categorical(codes, categories=categories, ordered=ordered, fastpath=True) - if isinstance(dtype, CategoricalDtype) and dtype.ordered: - cat = cat.set_ordered(ordered=True) return cat, na_count elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 10d005fa4a333..5f6cf2892a254 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -154,6 +154,7 @@ def test_categorical_dtype_chunksize(self): ['a', 'b', 'c'], ['a', 'c', 'b'], ['a', 'b', 'c', 'd'], + ['c', 'b', 'a'], ]) def test_categorical_categoricaldtype(self, categories, ordered): data = """a,b @@ -172,6 +173,35 @@ def 
test_categorical_categoricaldtype(self, categories, ordered): result = self.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) + def test_categorical_categoricaldtype_unsorted(self): + data = """a,b +1,a +1,b +1,b +2,c""" + dtype = CategoricalDtype(['c', 'b', 'a']) + expected = pd.DataFrame({ + 'a': [1, 1, 1, 2], + 'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a']) + }) + result = self.read_csv(StringIO(data), dtype={'b': dtype}) + tm.assert_frame_equal(result, expected) + +# @pytest.mark.parametrize('ordered', [True, False]) +# def test_categoricaldtype_coerces(self, ordered): +# dtype = {'b': CategoricalDtype([10, 11, 12, 13], ordered=ordered)} +# data = """a,b +# 1,10 +# 1,11 +# 1,12 +# 2,13""" +# expected = pd.DataFrame({ +# 'a': [1, 1, 1, 2], +# 'b': Categorical([10, 11, 12, 13], ordered=ordered), +# }, columns=['a', 'b']) +# result = self.read_csv(StringIO(data), dtype=dtype) +# tm.assert_frame_equal(result, expected) + def test_categorical_categoricaldtype_chunksize(self): # GH 10153 data = """a,b From 4b588cdda8b7fed3d7df084078738a91c9e04be5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Sep 2017 13:22:05 -0500 Subject: [PATCH 04/17] Added casting --- pandas/_libs/parsers.pyx | 24 ++++++++++++++++++--- pandas/io/parsers.py | 21 ++++++++++++++----- pandas/tests/io/parser/dtypes.py | 36 +++++++++++++++++++------------- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b2a7d15a3692e..c76f77e9d047b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -48,7 +48,7 @@ from pandas.core.dtypes.common import ( from pandas.core.categorical import Categorical from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals -from pandas import Index +from pandas import Index, to_numeric, to_datetime, to_timedelta import pandas.io.common as com @@ -1267,12 +1267,30 @@ cdef class TextReader: 
return self._string_convert(i, start, end, na_filter, na_hashset) elif is_categorical_dtype(dtype): - # TODO: I suspect that this could be optimized when dtype - # is an instance of CategoricalDtype + # TODO: I suspect that _categorical_convert could be + # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) cats = Index(cats) + + # Here is where we'll do the casting... + if (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None): + if dtype.categories.is_numeric(): + # is ignore correct? + cats = to_numeric(cats, errors='ignore') + elif dtype.categories.is_all_dates: + # is ignore correct? + if is_datetime64_dtype(dtype.categories): + print("before", cats) + cats = to_datetime(cats, errors='ignore') + print("after", cats) + else: + print("before", cats) + cats = to_timedelta(cats, errors='ignore') + print("after", cats) + if (isinstance(dtype, CategoricalDtype) and dtype.categories is not None): # recode for dtype.categories diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2a94c50c91f25..286e3e2e72d94 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -12,7 +12,7 @@ import numpy as np -from pandas import compat +from pandas import compat, to_numeric, to_timedelta from pandas.compat import (range, lrange, PY3, StringIO, lzip, zip, string_types, map, u) from pandas.core.dtypes.common import ( @@ -20,7 +20,8 @@ is_list_like, is_integer_dtype, is_float, is_dtype_equal, is_object_dtype, is_string_dtype, - is_scalar, is_categorical_dtype) + is_scalar, is_categorical_dtype, + is_datetime64_dtype, is_timedelta64_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import astype_nansafe @@ -1606,11 +1607,21 @@ def _cast_types(self, values, cast_type, column): # XXX this is for consistency with # c-parser which parses all categories # 
as strings - if not is_object_dtype(values): - values = astype_nansafe(values, str) - if isinstance(cast_type, CategoricalDtype): + known_cats = (isinstance(cast_type, CategoricalDtype) and + cast_type.categories is not None) + str_values = is_object_dtype(values) + + if known_cats and str_values: + if cast_type.categories.is_numeric(): + values = to_numeric(values, errors='ignore') + elif is_datetime64_dtype(cast_type.categories): + values = tools.to_datetime(values, errors='ignore') + elif is_timedelta64_dtype(cast_type.categories): + values = to_timedelta(values, errors='ignore') values = Categorical(values, categories=cast_type.categories, ordered=cast_type.ordered) + elif not is_object_dtype(values): + values = astype_nansafe(values, str) else: values = Categorical(values) else: diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 5f6cf2892a254..6b92658f174d1 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -187,20 +187,28 @@ def test_categorical_categoricaldtype_unsorted(self): result = self.read_csv(StringIO(data), dtype={'b': dtype}) tm.assert_frame_equal(result, expected) -# @pytest.mark.parametrize('ordered', [True, False]) -# def test_categoricaldtype_coerces(self, ordered): -# dtype = {'b': CategoricalDtype([10, 11, 12, 13], ordered=ordered)} -# data = """a,b -# 1,10 -# 1,11 -# 1,12 -# 2,13""" -# expected = pd.DataFrame({ -# 'a': [1, 1, 1, 2], -# 'b': Categorical([10, 11, 12, 13], ordered=ordered), -# }, columns=['a', 'b']) -# result = self.read_csv(StringIO(data), dtype=dtype) -# tm.assert_frame_equal(result, expected) + def test_categoricaldtype_coerces_numeric(self): + dtype = {'b': CategoricalDtype([1, 2, 3])} + data = "b\n1\n1\n2\n3" + expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categoricaldtype_coerces_datetime(self): + dtype = { + 'b': 
CategoricalDtype(pd.date_range('2017', '2019', freq='AS')) + } + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categoricaldtype_coerces_timedelta(self): + dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))} + data = "b\n1H\n2H\n3H" + expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) def test_categorical_categoricaldtype_chunksize(self): # GH 10153 From e32d5be89781b6cc4923419a87064778cddd9343 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Sep 2017 13:32:59 -0500 Subject: [PATCH 05/17] Doc and cleanup --- doc/source/io.rst | 20 ++++++++++++-------- doc/source/whatsnew/v0.21.0.txt | 2 ++ pandas/_libs/parsers.pyx | 7 ++----- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6a4af0c716f4d..61659a9571140 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -469,10 +469,11 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes -Specifying ``dtype='cateogry'`` will result in a ``Categorical`` that is -unordered, and whose ``categories`` are the unique values observed in the data. -For more control on the categories and order, create a -:class:`~pandas.api.types.CategoricalDtype` ahead of time. +Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical`` +whose ``categories`` are the unique values observed in the data. For more +control on the categories and order, create a +:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for +that column's ``dtype``. .. ipython:: python @@ -483,10 +484,13 @@ For more control on the categories and order, create a .. 
note:: - The resulting categories will always be parsed as strings (object dtype). - If the categories are numeric they can be converted using the - :func:`to_numeric` function, or as appropriate, another converter - such as :func:`to_datetime`. + With ``dtype='category'``, the resulting categories will always be parsed + as strings (object dtype). If the categories are numeric they can be + converted using the :func:`to_numeric` function, or as appropriate, another + converter such as :func:`to_datetime`. + + When ``dtype`` is a ``CategoricalDtype`` with homogenous ``categoriess`` ( + all numeric, all datetimes, etc.), the conversion is done automatically. .. ipython:: python diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1365901c2ce5e..ab68219a18d8a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -163,6 +163,8 @@ Other Enhancements - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`) - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names +- Pass a :class:`~pandas.api.types.CategoricalDtype` to :meth:`read_csv` to parse categorical + data as numeric, datetimes, or timedeltas, instead of strings. See :ref:`here `. (:issue:`17643`) .. _whatsnew_0210.api_breaking: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c76f77e9d047b..4f14fc365aa77 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1274,7 +1274,8 @@ cdef class TextReader: na_hashset, self.c_encoding) cats = Index(cats) - # Here is where we'll do the casting... 
+ # Determine if we should convert inferred string + # categories to a specialized type if (isinstance(dtype, CategoricalDtype) and dtype.categories is not None): if dtype.categories.is_numeric(): @@ -1283,13 +1284,9 @@ cdef class TextReader: elif dtype.categories.is_all_dates: # is ignore correct? if is_datetime64_dtype(dtype.categories): - print("before", cats) cats = to_datetime(cats, errors='ignore') - print("after", cats) else: - print("before", cats) cats = to_timedelta(cats, errors='ignore') - print("after", cats) if (isinstance(dtype, CategoricalDtype) and dtype.categories is not None): From 508dd1e13f67e06e255d91f708b56a2891fa2184 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Sep 2017 14:42:18 -0500 Subject: [PATCH 06/17] Fixed assignment of categoricals --- pandas/_libs/parsers.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4f14fc365aa77..0c8ca84b6ad7f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1298,8 +1298,8 @@ cdef class TextReader: elif not cats.is_monotonic_increasing: # sort categories and recode if necessary unsorted = cats.copy() - cats = cats.sort_values() - indexer = cats.get_indexer(unsorted) + categories = cats.sort_values() + indexer = categories.get_indexer(unsorted) codes = take_1d(indexer, codes, fill_value=-1) ordered = False else: From 6f175a7f727d44cf819252d8979a38c1b19384b7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 08:51:03 -0500 Subject: [PATCH 07/17] Doc and test unexpected values --- doc/source/io.rst | 10 +++++++++- doc/source/whatsnew/v0.21.0.txt | 29 ++++++++++++++++++++++++++--- pandas/tests/io/parser/dtypes.py | 8 ++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 61659a9571140..233f32123da5b 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -482,6 +482,14 @@ that column's ``dtype``. 
dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes +When using ``dtype=CategoricalDtype``, "unexpected" values outside of +``dtype.categories`` are treated as missing values. + + dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c' + pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1 + +This matches the behavior of :meth:`Categorical.set_categories`. + .. note:: With ``dtype='category'``, the resulting categories will always be parsed @@ -489,7 +497,7 @@ that column's ``dtype``. converted using the :func:`to_numeric` function, or as appropriate, another converter such as :func:`to_datetime`. - When ``dtype`` is a ``CategoricalDtype`` with homogenous ``categoriess`` ( + When ``dtype`` is a ``CategoricalDtype`` with homogenous ``categories`` ( all numeric, all datetimes, etc.), the conversion is done automatically. .. ipython:: python diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ab68219a18d8a..414e8b45ab001 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -119,7 +119,7 @@ expanded to include the ``categories`` and ``ordered`` attributes. A ``CategoricalDtype`` can be used to specify the set of categories and orderedness of an array, independent of the data themselves. This can be useful, e.g., when converting string data to a ``Categorical`` (:issue:`14711`, -:issue:`15078`, :issue:`16015`): +:issue:`15078`, :issue:`16015`, :issue:`17643`): .. ipython:: python @@ -129,8 +129,33 @@ e.g., when converting string data to a ``Categorical`` (:issue:`14711`, dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) s.astype(dtype) +One place that deserves special mention is in :meth:`read_csv`. Previously, with +``dtype={'col': 'category'}``, the returned values and categories would always +be strings. + +.. 
ipython:: python + + from pandas.compat import StringIO + + data = 'A,B\na,1\nb,2\nc,3' + pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories + +Notice the "object" dtype. + +With a ``CategoricalDtype`` of all numerics, datetimes, or +timedeltas, we can automatically convert to the correct type + + dtype = {'B': CategoricalDtype([1, 2, 3])} + pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories + +The values have been correctly interpreted as integers. + The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a ``Series`` with categorical type will now return an instance of ``CategoricalDtype``. +For the most part, this is backwards compatible, though the string repr has changed. +If you were previously using ``str(s.dtype == 'category')`` to detect categorical data, +switch to :func:`api.types.is_categorical_dtype`, which is compatible with the old and +new ``CategoricalDtype``. See the :ref:`CategoricalDtype docs ` for more. @@ -163,8 +188,6 @@ Other Enhancements - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`) - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names -- Pass a :class:`~pandas.api.types.CategoricalDtype` to :meth:`read_csv` to parse categorical - data as numeric, datetimes, or timedeltas, instead of strings. See :ref:`here `. (:issue:`17643`) .. 
_whatsnew_0210.api_breaking: diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 6b92658f174d1..096fcf9a52718 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -210,6 +210,14 @@ def test_categoricaldtype_coerces_timedelta(self): result = self.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) + def test_categoricaldtype_unexpected_categories(self): + dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])} + data = "b\nd\na\nc\nd" # Unexpected c + expected = pd.DataFrame({"b": Categorical(list('dacd'), + dtype=dtype['b'])}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + def test_categorical_categoricaldtype_chunksize(self): # GH 10153 data = """a,b From 1545734ed1b258b7ef08e6788e460bf39a37f650 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 10:25:56 -0500 Subject: [PATCH 08/17] DOC: fixups --- doc/source/io.rst | 2 ++ doc/source/whatsnew/v0.21.0.txt | 14 +++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 233f32123da5b..58e854d16afa2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -485,6 +485,8 @@ that column's ``dtype``. When using ``dtype=CategoricalDtype``, "unexpected" values outside of ``dtype.categories`` are treated as missing values. +.. ipython:: python + dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c' pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 414e8b45ab001..731645f992e6a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -134,9 +134,12 @@ One place that deserves special mention is in :meth:`read_csv`. Previously, with be strings. .. ipython:: python + :suppress: from pandas.compat import StringIO +.. 
ipython:: python + data = 'A,B\na,1\nb,2\nc,3' pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories @@ -151,11 +154,12 @@ timedeltas, we can automatically convert to the correct type The values have been correctly interpreted as integers. The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a -``Series`` with categorical type will now return an instance of ``CategoricalDtype``. -For the most part, this is backwards compatible, though the string repr has changed. -If you were previously using ``str(s.dtype == 'category')`` to detect categorical data, -switch to :func:`api.types.is_categorical_dtype`, which is compatible with the old and -new ``CategoricalDtype``. +``Series`` with categorical type will now return an instance of +``CategoricalDtype``. For the most part, this is backwards compatible, though +the string repr has changed. If you were previously using ``str(s.dtype) == +'category'`` to detect categorical data, switch to +:func:`pandas.api.types.is_categorical_dtype`, which is compatible with the old +and new ``CategoricalDtype``. See the :ref:`CategoricalDtype docs ` for more. 
From b80cff8c2f83559298f723bd092255538b753ba9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 14:17:45 -0500 Subject: [PATCH 09/17] More coercion, use _recode_for_categories --- pandas/_libs/parsers.pyx | 5 ++--- pandas/tests/io/parser/dtypes.py | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 0c8ca84b6ad7f..9db225a6287db 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -45,7 +45,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_object_dtype, is_string_dtype, is_datetime64_dtype, pandas_dtype) -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _recode_for_categories from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals from pandas import Index, to_numeric, to_datetime, to_timedelta @@ -1292,8 +1292,7 @@ cdef class TextReader: dtype.categories is not None): # recode for dtype.categories categories = dtype.categories - indexer = categories.get_indexer(cats) - codes = take_1d(indexer, codes, fill_value=-1) + codes = _recode_for_categories(codes, cats, categories) ordered = dtype.ordered elif not cats.is_monotonic_increasing: # sort categories and recode if necessary diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 096fcf9a52718..7d3df6201a390 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -203,6 +203,14 @@ def test_categoricaldtype_coerces_datetime(self): result = self.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) + dtype = { + 'b': CategoricalDtype([pd.Timestamp("2014")]) + } + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)}) + result = self.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + def 
test_categoricaldtype_coerces_timedelta(self): dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))} data = "b\n1H\n2H\n3H" From b02882714601004fe8a9480d04461574d550dde5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 15:50:31 -0500 Subject: [PATCH 10/17] Refactor with maybe_convert_for_categorical --- doc/source/io.rst | 2 ++ pandas/_libs/parsers.pyx | 20 +++------------ pandas/core/dtypes/cast.py | 38 ++++++++++++++++++++++++++++- pandas/io/parsers.py | 29 +++++++++------------- pandas/tests/dtypes/test_cast.py | 42 +++++++++++++++++++++++++++++++- 5 files changed, 96 insertions(+), 35 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 58e854d16afa2..4d47d8b77aebf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -469,6 +469,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes +.. versionadded:: 0.21.0 + Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. 
For more control on the categories and order, create a diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 9db225a6287db..9587baa3f10f7 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -48,7 +48,8 @@ from pandas.core.dtypes.common import ( from pandas.core.categorical import Categorical, _recode_for_categories from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals -from pandas import Index, to_numeric, to_datetime, to_timedelta +from pandas.core.dtypes.cast import maybe_convert_for_categorical +from pandas import Index import pandas.io.common as com @@ -1274,19 +1275,7 @@ cdef class TextReader: na_hashset, self.c_encoding) cats = Index(cats) - # Determine if we should convert inferred string - # categories to a specialized type - if (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None): - if dtype.categories.is_numeric(): - # is ignore correct? - cats = to_numeric(cats, errors='ignore') - elif dtype.categories.is_all_dates: - # is ignore correct? 
- if is_datetime64_dtype(dtype.categories): - cats = to_datetime(cats, errors='ignore') - else: - cats = to_timedelta(cats, errors='ignore') + cats = maybe_convert_for_categorical(cats, dtype) if (isinstance(dtype, CategoricalDtype) and dtype.categories is not None): @@ -1298,8 +1287,7 @@ cdef class TextReader: # sort categories and recode if necessary unsorted = cats.copy() categories = cats.sort_values() - indexer = categories.get_indexer(unsorted) - codes = take_1d(indexer, codes, fill_value=-1) + codes = _recode_for_categories(codes, unsorted, categories) ordered = False else: categories = cats diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c2cf6afc1a7b5..acc389bcdaa11 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -24,7 +24,8 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _POSSIBLY_CAST_DTYPES) -from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype +from .dtypes import (ExtensionDtype, DatetimeTZDtype, PeriodDtype, + CategoricalDtype) from .generic import (ABCDatetimeIndex, ABCPeriodIndex, ABCSeries) from .missing import isna, notna @@ -604,6 +605,41 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] +def maybe_convert_for_categorical(categories, dtype): + """Convert ``categories`` depending on ``dtype``. + + Converts to numeric, datetime, or timedelta types, when ``dtype`` is + a CategoricalDtype with known, non-object categories. 
+ + Parameters + ---------- + categories : array-like + type : CategoricalDtype + + Returns + ------- + new_categories : array or Index + + Examples + -------- + >>> maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2])) + array([ 1, 2]) + >>> maybe_convert_for_categorical([1, 'a'], CategoricalDtype([1, 2])) + array([ 1., nan]) + """ + if isinstance(dtype, CategoricalDtype) and dtype.categories is not None: + from pandas import to_numeric, to_datetime, to_timedelta + + if dtype.categories.is_numeric(): + categories = to_numeric(categories, errors='coerce') + elif is_datetime64_dtype(dtype.categories): + categories = to_datetime(categories, errors='coerce') + elif is_timedelta64_dtype(dtype.categories): + categories = to_timedelta(categories, errors='coerce') + + return categories + + def astype_nansafe(arr, dtype, copy=True): """ return a view if copy is False, but need to be very careful as the result shape could change! """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 286e3e2e72d94..1ac82d1e83f7c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -12,7 +12,7 @@ import numpy as np -from pandas import compat, to_numeric, to_timedelta +from pandas import compat from pandas.compat import (range, lrange, PY3, StringIO, lzip, zip, string_types, map, u) from pandas.core.dtypes.common import ( @@ -20,11 +20,11 @@ is_list_like, is_integer_dtype, is_float, is_dtype_equal, is_object_dtype, is_string_dtype, - is_scalar, is_categorical_dtype, - is_datetime64_dtype, is_timedelta64_dtype) + is_scalar, is_categorical_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.cast import (astype_nansafe, + maybe_convert_for_categorical) from pandas.core.index import (Index, MultiIndex, RangeIndex, _ensure_index_from_sequences) from pandas.core.series import Series @@ -1609,21 +1609,16 @@ def _cast_types(self, 
values, cast_type, column): # as strings known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) - str_values = is_object_dtype(values) - - if known_cats and str_values: - if cast_type.categories.is_numeric(): - values = to_numeric(values, errors='ignore') - elif is_datetime64_dtype(cast_type.categories): - values = tools.to_datetime(values, errors='ignore') - elif is_timedelta64_dtype(cast_type.categories): - values = to_timedelta(values, errors='ignore') - values = Categorical(values, categories=cast_type.categories, - ordered=cast_type.ordered) + + categories = ordered = None + if known_cats: + values = maybe_convert_for_categorical(values, cast_type) + categories = cast_type.categories + ordered = cast_type.ordered elif not is_object_dtype(values): values = astype_nansafe(values, str) - else: - values = Categorical(values) + values = Categorical(values, categories=categories, + ordered=ordered) else: try: values = astype_nansafe(values, cast_type, copy=True) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d9fb458c83529..f1fe7d720af3b 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, maybe_convert_objects, + maybe_convert_for_categorical, cast_scalar_to_array, infer_dtype_from_scalar, infer_dtype_from_array, @@ -25,7 +26,8 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, - PeriodDtype) + PeriodDtype, + CategoricalDtype) from pandas.core.dtypes.common import ( is_dtype_equal) from pandas.util import testing as tm @@ -299,6 +301,44 @@ def test_maybe_infer_to_datetimelike(self): [NaT, 'b', 1]])) assert result.size == 6 + def test_maybe_convert_for_categorical_noop(self): + expected = ['1', '2'] + result = maybe_convert_for_categorical(expected, None) + assert result == expected + + result = maybe_convert_for_categorical(expected, CategoricalDtype()) 
+ assert result == expected + + result = maybe_convert_for_categorical(expected, 'category') + assert result == expected + + @pytest.mark.parametrize('categories, dtype, expected', [ + (['1', '2'], [1, 2, 3], np.array([1, 2])), + (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan])), + ]) + def test_maybe_convert_for_categorical(self, categories, dtype, expected): + dtype = CategoricalDtype(dtype) + result = maybe_convert_for_categorical(categories, dtype) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('categories, dtype, expected', [ + (['2016', '2017'], pd.to_datetime(['2016', '2017']), + pd.to_datetime(['2016', '2017'])), + (['2016', '2017', 'bad'], pd.to_datetime(['2016', '2017']), + pd.to_datetime(['2016', '2017', 'NaT'])), + + (['1H', '2H'], pd.to_timedelta(['1H', '2H']), + pd.to_timedelta(['1H', '2H'])), + (['1H', '2H', 'bad'], pd.to_timedelta(['1H', '2H']), + pd.to_timedelta(['1H', '2H', 'NaT'])), + + ]) + def test_maybe_convert_for_categorical_dates(self, categories, dtype, + expected): + dtype = CategoricalDtype(dtype) + result = maybe_convert_for_categorical(categories, dtype) + tm.assert_index_equal(result, expected) + class TestConvert(object): From fc34080d91160f1fc9de835cec97b858da467002 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 27 Sep 2017 09:56:56 -0500 Subject: [PATCH 11/17] PEP8 --- pandas/tests/dtypes/test_cast.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index f1fe7d720af3b..b8cab280f3aea 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -26,8 +26,7 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, - PeriodDtype, - CategoricalDtype) + PeriodDtype) from pandas.core.dtypes.common import ( is_dtype_equal) from pandas.util import testing as tm From d100f0c3bb24b94525466c59b408a2ee94cf898b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 
27 Sep 2017 09:58:35 -0500 Subject: [PATCH 12/17] Type for 32bit --- pandas/tests/dtypes/test_cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index b8cab280f3aea..96cb6dd8550c5 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -312,8 +312,8 @@ def test_maybe_convert_for_categorical_noop(self): assert result == expected @pytest.mark.parametrize('categories, dtype, expected', [ - (['1', '2'], [1, 2, 3], np.array([1, 2])), - (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan])), + (['1', '2'], [1, 2, 3], np.array([1, 2], dtype='i8')), + (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan], dtype='f8')), ]) def test_maybe_convert_for_categorical(self, categories, dtype, expected): dtype = CategoricalDtype(dtype) From 8600c505b2ec922b5fae6c776d6cebe8b40f0419 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Sep 2017 07:37:43 -0500 Subject: [PATCH 13/17] REF: refactor to new method --- pandas/_libs/parsers.pyx | 25 ++----------------- pandas/core/categorical.py | 43 ++++++++++++++++++++++++++++++++ pandas/tests/test_categorical.py | 34 +++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 9587baa3f10f7..04f3419f87b85 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1273,30 +1273,9 @@ cdef class TextReader: codes, cats, na_count = _categorical_convert( self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) - cats = Index(cats) - - cats = maybe_convert_for_categorical(cats, dtype) - - if (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None): - # recode for dtype.categories - categories = dtype.categories - codes = _recode_for_categories(codes, cats, categories) - ordered = dtype.ordered - elif not cats.is_monotonic_increasing: - # sort categories and recode if necessary - unsorted 
= cats.copy() - categories = cats.sort_values() - codes = _recode_for_categories(codes, unsorted, categories) - ordered = False - else: - categories = cats - ordered = False - - cat = Categorical(codes, categories=categories, ordered=ordered, - fastpath=True) - + cat = Categorical._from_inferred_categories(cats, codes, dtype) return cat, na_count + elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 8b055e9ae59c3..5b9514a923a06 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -509,6 +509,49 @@ def base(self): """ compat, we are always our own object """ return None + @classmethod + def _from_inferred_categories(cls, inferred_categories, inferred_codes, + dtype): + """Construct a Categorical from inferred values + + For inferred categories (`dtype` is None) the categories are sorted. + For explicit `dtype`, the `inferred_categories` are cast to the + appropriate type. 
+ + Parameters + ---------- + + inferred_categories, inferred_codes : Index + dtype : CategoricalDtype + + Returns + ------- + Categorical + """ + from pandas.core.dtypes.cast import maybe_convert_for_categorical + from pandas import Index + + cats = Index(inferred_categories) + cats = maybe_convert_for_categorical(cats, dtype) + + if (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None): + # recode for dtype.categories + categories = dtype.categories + codes = _recode_for_categories(inferred_codes, cats, categories) + elif not cats.is_monotonic_increasing: + # sort categories and recode if necessary + unsorted = cats.copy() + categories = cats.sort_values() + codes = _recode_for_categories(inferred_codes, unsorted, + categories) + dtype = CategoricalDtype(categories, ordered=False) + else: + dtype = CategoricalDtype(cats, ordered=False) + codes = inferred_codes + + return cls(codes, dtype=dtype, fastpath=True) + @classmethod def from_array(cls, data, **kwargs): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d43901ea091b7..6130ca44ad0cd 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -560,6 +560,40 @@ def f(): codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) pd.Categorical.from_codes(codes, categories=["train", "test"]) + @pytest.mark.parametrize('dtype', [None, 'category']) + def test_from_inferred_categories(self, dtype): + cats = ['a', 'b'] + codes = [0, 0, 1, 1] + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes(codes, cats) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize('dtype', [None, 'category']) + def test_from_inferred_categories_sorts(self, dtype): + cats = ['b', 'a'] + codes = [0, 1, 1, 1] + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) + tm.assert_categorical_equal(result, expected) + + 
def test_from_inferred_categories_dtype(self): + cats = ['a', 'b', 'd'] + codes = [0, 1, 0, 2] + dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical(['a', 'b', 'a', 'd'], + categories=['c', 'b', 'a'], + ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_from_inferred_categories_coerces(self): + cats = ['1', '2', 'bad'] + codes = [0, 0, 1, 2] + dtype = CategoricalDtype([1, 2]) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical([1, 1, 2, np.nan]) + tm.assert_categorical_equal(result, expected) + def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'" From 96d51441c8fe587498d7ec087f49dc389114facb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Sep 2017 10:33:57 -0500 Subject: [PATCH 14/17] py2 compat --- pandas/tests/test_categorical.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 6130ca44ad0cd..9e3bd40dc275a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -563,7 +563,7 @@ def f(): @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b'] - codes = [0, 0, 1, 1] + codes = np.array([0, 0, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical.from_codes(codes, cats) tm.assert_categorical_equal(result, expected) @@ -571,14 +571,14 @@ def test_from_inferred_categories(self, dtype): @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories_sorts(self, dtype): cats = ['b', 'a'] - codes = [0, 1, 1, 1] + codes = np.array([0, 1, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) 
tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_dtype(self): cats = ['a', 'b', 'd'] - codes = [0, 1, 0, 2] + codes = np.array([0, 1, 0, 2], dtype='i8') dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True) result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical(['a', 'b', 'a', 'd'], @@ -588,7 +588,7 @@ def test_from_inferred_categories_dtype(self): def test_from_inferred_categories_coerces(self): cats = ['1', '2', 'bad'] - codes = [0, 0, 1, 2] + codes = np.array([0, 0, 1, 2], dtype='i8') dtype = CategoricalDtype([1, 2]) result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical([1, 1, 2, np.nan]) From 3de75cdb60d3cd05a878450a92471692b7b0e0b6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 29 Sep 2017 06:24:43 -0500 Subject: [PATCH 15/17] Refactored --- pandas/_libs/parsers.pyx | 1 - pandas/core/categorical.py | 17 +++++++++++--- pandas/core/dtypes/cast.py | 38 +------------------------------ pandas/io/parsers.py | 19 ++++++++-------- pandas/tests/dtypes/test_cast.py | 39 -------------------------------- 5 files changed, 24 insertions(+), 90 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 04f3419f87b85..60a646769dd1a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -48,7 +48,6 @@ from pandas.core.dtypes.common import ( from pandas.core.categorical import Categorical, _recode_for_categories from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals -from pandas.core.dtypes.cast import maybe_convert_for_categorical from pandas import Index import pandas.io.common as com diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d173298656e3f..d5a161874891f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -21,6 +21,8 @@ _ensure_platform_int, is_dtype_equal, is_datetimelike, + is_datetime64_dtype, + is_timedelta64_dtype, 
is_categorical, is_categorical_dtype, is_integer_dtype, @@ -528,11 +530,20 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, ------- Categorical """ - from pandas.core.dtypes.cast import maybe_convert_for_categorical - from pandas import Index + from pandas import Index, to_numeric, to_datetime, to_timedelta cats = Index(inferred_categories) - cats = maybe_convert_for_categorical(cats, dtype) + + # Convert to a specialzed type with `dtype` is specified + if (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None): + + if dtype.categories.is_numeric(): + cats = to_numeric(inferred_categories, errors='coerce') + elif is_datetime64_dtype(dtype.categories): + cats = to_datetime(inferred_categories, errors='coerce') + elif is_timedelta64_dtype(dtype.categories): + cats = to_timedelta(inferred_categories, errors='coerce') if (isinstance(dtype, CategoricalDtype) and dtype.categories is not None): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index acc389bcdaa11..c2cf6afc1a7b5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -24,8 +24,7 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _POSSIBLY_CAST_DTYPES) -from .dtypes import (ExtensionDtype, DatetimeTZDtype, PeriodDtype, - CategoricalDtype) +from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype from .generic import (ABCDatetimeIndex, ABCPeriodIndex, ABCSeries) from .missing import isna, notna @@ -605,41 +604,6 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def maybe_convert_for_categorical(categories, dtype): - """Convert ``categories`` depending on ``dtype``. - - Converts to numeric, datetime, or timedelta types, when ``dtype`` is - a CategoricalDtype with known, non-object categories. 
- - Parameters - ---------- - categories : array-like - type : CategoricalDtype - - Returns - ------- - new_categories : array or Index - - Examples - -------- - >>> maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2])) - array([ 1, 2]) - >>> maybe_convert_for_categorical([1, 'a'], CategoricalDtype([1, 2])) - array([ 1., nan]) - """ - if isinstance(dtype, CategoricalDtype) and dtype.categories is not None: - from pandas import to_numeric, to_datetime, to_timedelta - - if dtype.categories.is_numeric(): - categories = to_numeric(categories, errors='coerce') - elif is_datetime64_dtype(dtype.categories): - categories = to_datetime(categories, errors='coerce') - elif is_timedelta64_dtype(dtype.categories): - categories = to_timedelta(categories, errors='coerce') - - return categories - - def astype_nansafe(arr, dtype, copy=True): """ return a view if copy is False, but need to be very careful as the result shape could change! """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f12d8f7c81ab5..42f51407f4e34 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -23,8 +23,7 @@ is_scalar, is_categorical_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.cast import (astype_nansafe, - maybe_convert_for_categorical) +from pandas.core.dtypes.cast import astype_nansafe from pandas.core.index import (Index, MultiIndex, RangeIndex, _ensure_index_from_sequences) from pandas.core.series import Series @@ -1610,15 +1609,15 @@ def _cast_types(self, values, cast_type, column): known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) - categories = ordered = None if known_cats: - values = maybe_convert_for_categorical(values, cast_type) - categories = cast_type.categories - ordered = cast_type.ordered - elif not is_object_dtype(values): - values = astype_nansafe(values, str) - values = Categorical(values, categories=categories, - 
ordered=ordered) + cats = Index(values).unique() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type + ) + else: + if not is_object_dtype(values): + values = astype_nansafe(values, str) + values = Categorical(values, categories=None, ordered=False) else: try: values = astype_nansafe(values, cast_type, copy=True) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 96cb6dd8550c5..d9fb458c83529 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -16,7 +16,6 @@ from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, maybe_convert_objects, - maybe_convert_for_categorical, cast_scalar_to_array, infer_dtype_from_scalar, infer_dtype_from_array, @@ -300,44 +299,6 @@ def test_maybe_infer_to_datetimelike(self): [NaT, 'b', 1]])) assert result.size == 6 - def test_maybe_convert_for_categorical_noop(self): - expected = ['1', '2'] - result = maybe_convert_for_categorical(expected, None) - assert result == expected - - result = maybe_convert_for_categorical(expected, CategoricalDtype()) - assert result == expected - - result = maybe_convert_for_categorical(expected, 'category') - assert result == expected - - @pytest.mark.parametrize('categories, dtype, expected', [ - (['1', '2'], [1, 2, 3], np.array([1, 2], dtype='i8')), - (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan], dtype='f8')), - ]) - def test_maybe_convert_for_categorical(self, categories, dtype, expected): - dtype = CategoricalDtype(dtype) - result = maybe_convert_for_categorical(categories, dtype) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize('categories, dtype, expected', [ - (['2016', '2017'], pd.to_datetime(['2016', '2017']), - pd.to_datetime(['2016', '2017'])), - (['2016', '2017', 'bad'], pd.to_datetime(['2016', '2017']), - pd.to_datetime(['2016', '2017', 'NaT'])), - - (['1H', '2H'], pd.to_timedelta(['1H', '2H']), - pd.to_timedelta(['1H', '2H'])), - (['1H', '2H', 
'bad'], pd.to_timedelta(['1H', '2H']), - pd.to_timedelta(['1H', '2H', 'NaT'])), - - ]) - def test_maybe_convert_for_categorical_dates(self, categories, dtype, - expected): - dtype = CategoricalDtype(dtype) - result = maybe_convert_for_categorical(categories, dtype) - tm.assert_index_equal(result, expected) - class TestConvert(object): From f03798dd242852f67c50d61c7a90567be0159ae7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 29 Sep 2017 10:18:12 -0500 Subject: [PATCH 16/17] More in Categorical --- pandas/io/parsers.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 42f51407f4e34..c8b2987d591ef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1603,21 +1603,20 @@ def _cast_types(self, values, cast_type, column): """ if is_categorical_dtype(cast_type): - # XXX this is for consistency with - # c-parser which parses all categories - # as strings known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) - if known_cats: - cats = Index(values).unique() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type - ) - else: - if not is_object_dtype(values): - values = astype_nansafe(values, str) - values = Categorical(values, categories=None, ordered=False) + if not is_object_dtype(values) and not known_cats: + # XXX this is for consistency with + # c-parser which parses all categories + # as strings + values = astype_nansafe(values, str) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type + ) + else: try: values = astype_nansafe(values, cast_type, copy=True) From 9325a9345f3a4495ff81eb692121c194d8d7c040 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Oct 2017 06:44:59 -0500 Subject: [PATCH 17/17] fixup! 
More in Categorical --- pandas/core/categorical.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d5a161874891f..ce71e6fd74326 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -523,8 +523,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, Parameters ---------- - inferred_categories, inferred_codes : Index - dtype : CategoricalDtype + inferred_categories : Index + inferred_codes : Index + dtype : CategoricalDtype or 'category' Returns ------- @@ -534,10 +535,11 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, cats = Index(inferred_categories) - # Convert to a specialzed type with `dtype` is specified - if (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None): + known_categories = (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None) + if known_categories: + # Convert to a specialized type with `dtype` if specified if dtype.categories.is_numeric(): cats = to_numeric(inferred_categories, errors='coerce') elif is_datetime64_dtype(dtype.categories): @@ -545,13 +547,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, elif is_timedelta64_dtype(dtype.categories): cats = to_timedelta(inferred_categories, errors='coerce') - if (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None): - # recode for dtype.categories + if known_categories: + # recode from observation order to dtype.categories order categories = dtype.categories codes = _recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: - # sort categories and recode if necessary + # sort categories and recode for unknown categories unsorted = cats.copy() categories = cats.sort_values() codes = _recode_for_categories(inferred_codes, unsorted,