From ba955a991665bbcd376e70f0b35a00369d1e77c0 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 19 Aug 2014 20:12:03 -0400 Subject: [PATCH 1/2] ENH: add support dtype='category' in Series constructor --- doc/source/v0.15.0.txt | 4 ++-- pandas/core/common.py | 18 +++++++++++++++--- pandas/core/generic.py | 2 +- pandas/core/series.py | 10 ++++++---- pandas/tests/test_categorical.py | 13 ++++++++++++- 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 6c58e751a6bcc..851a4f4581eee 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -120,7 +120,7 @@ API changes 3 9 4 NaN dtype: float64 - + New behavior (note final value is ``7 = sum([3, 4, NaN])``): .. ipython:: python @@ -346,7 +346,7 @@ Categoricals in Series/DataFrame :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, -:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`). +:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`). For full docs, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. diff --git a/pandas/core/common.py b/pandas/core/common.py index 0274a0f1b3b03..e3a0cf14cfbc1 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2326,19 +2326,31 @@ def is_number(obj): return isinstance(obj, (numbers.Number, np.number)) +def _coerce_to_dtype(dtype): + """ coerce a string / np.dtype to a dtype """ + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + else: + dtype = np.dtype(dtype) + return dtype + def _get_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype - if isinstance(arr_or_dtype, type): + elif isinstance(arr_or_dtype, type): return np.dtype(arr_or_dtype) + elif isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtype() return arr_or_dtype.dtype def _get_dtype_type(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type - if isinstance(arr_or_dtype, type): + elif isinstance(arr_or_dtype, type): return np.dtype(arr_or_dtype).type + elif isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtypeType return arr_or_dtype.dtype.type @@ -2488,7 +2500,7 @@ def _astype_nansafe(arr, dtype, copy=True): """ return a view if copy is False, but need to be very careful as the result shape could change! """ if not isinstance(dtype, np.dtype): - dtype = np.dtype(dtype) + dtype = _coerce_to_dtype(dtype) if is_datetime64_dtype(arr): if dtype == object: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5064545404fb0..ee5016386af4c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -105,7 +105,7 @@ def _validate_dtype(self, dtype): """ validate the passed dtype """ if dtype is not None: - dtype = np.dtype(dtype) + dtype = com._coerce_to_dtype(dtype) # a compound dtype if dtype.kind == 'V': diff --git a/pandas/core/series.py b/pandas/core/series.py index 68f5b4d36392f..a0bbb2c713e56 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,7 @@ is_list_like, _values_from_object, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, _try_sort, - ABCSparseArray, _maybe_match_name, + ABCSparseArray, _maybe_match_name, _coerce_to_dtype, _ensure_object, SettingWithCopyError) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index) @@ -2434,7 +2434,7 @@ def _sanitize_array(data, index, dtype=None, copy=False, """ sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """ if dtype is not None: - dtype = np.dtype(dtype) + dtype = _coerce_to_dtype(dtype) if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) @@ -2455,9 +2455,11 @@ def _try_cast(arr, take_fast_path): arr = _possibly_cast_to_datetime(arr, dtype) subarr = pa.array(arr, dtype=dtype, copy=copy) except (ValueError, TypeError): - if dtype is not None and raise_cast_failure: + if com.is_categorical_dtype(dtype): + subarr = Categorical(arr) + elif dtype is not None and raise_cast_failure: raise - else: # pragma: no cover + else: subarr = pa.array(arr, dtype=object, copy=copy) return subarr diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index fcfee8cf9b1ba..5ee9fdb119b56 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -840,13 +840,24 @@ def test_creation_astype(self): df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) - df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]}) cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd']) exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]}) df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) + def test_construction_series(self): + + l = [1,2,3,1] + exp = Series(l).astype('category') + res = Series(l,dtype='category') + tm.assert_series_equal(res, exp) + + l = ["a","b","c","a"] + exp = Series(l).astype('category') + res = Series(l,dtype='category') + tm.assert_series_equal(res, exp) + def test_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either the series or the From 9159d9804fca85166f6aefde4267be5d0c161082 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 20 Aug 2014 08:44:37 -0400 Subject: [PATCH 2/2] BUG: fix reindexing to an all-nan Categorical (GH8076) --- doc/source/v0.15.0.txt | 2 +- pandas/core/categorical.py | 12 ++++++----- pandas/tests/test_categorical.py | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 851a4f4581eee..b987104ac2408 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -346,7 +346,7 @@ Categoricals in Series/DataFrame :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, -:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`). +:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`). For full docs, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 853feb27d1b21..ec1de70e18b4c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -743,12 +743,14 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): name=self.name, fastpath=True) def take_nd(self, indexer, allow_fill=True, fill_value=None): - """ Take the values by the indexer, fill with the fill_value. """ - if allow_fill and fill_value is None: - fill_value = np.nan + """ Take the codes by the indexer, fill with the fill_value. """ + + # filling must always be None/nan here + # but is passed thru internally + assert isnull(fill_value) - values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = Categorical(values=values, levels=self.levels, ordered=self.ordered, + codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) + result = Categorical(codes, levels=self.levels, ordered=self.ordered, name=self.name, fastpath=True) return result diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5ee9fdb119b56..7bc2eeb97d47a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -858,6 +858,40 @@ def test_construction_series(self): res = Series(l,dtype='category') tm.assert_series_equal(res, exp) + # insert into frame with different index + # GH 8076 + index = pd.date_range('20000101', periods=3) + expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c'])) + expected.index = index + + expected = DataFrame({'x': expected}) + df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index) + tm.assert_frame_equal(df, expected) + + def test_reindex(self): + + index = pd.date_range('20000101', periods=3) + + # reindexing to an invalid Categorical + s = Series(['a', 'b', 'c'],dtype='category') + result = s.reindex(index) + expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c'])) + expected.index = index + tm.assert_series_equal(result, expected) + + # partial reindexing + expected = Series(Categorical(values=['b','c'],levels=['a', 'b', 'c'])) + expected.index = [1,2] + result = s.reindex([1,2]) + tm.assert_series_equal(result, expected) + + expected = Series(Categorical(values=['c',np.nan],levels=['a', 'b', 'c'])) + expected.index = [2,3] + result = s.reindex([2,3]) + tm.assert_series_equal(result, expected) + + + def test_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either the series or the