Merge pull request #8075 from jreback/cat2

jreback · jreback · commit aa5e55ef01ec · 2014-08-21T17:33:10.000-04:00
ENH: add support for dtype='category' in Series constructor
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -120,7 +120,7 @@ API changes
     3     9
     4   NaN
     dtype: float64
-  
+
   New behavior (note final value is ``7 = sum([3, 4, NaN])``):
 
   .. ipython:: python
@@ -346,7 +346,7 @@ Categoricals in Series/DataFrame
 
 :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
 methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
-:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`).
+:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`).
 
 For full docs, see the :ref:`Categorical introduction <categorical>` and the
 :ref:`API documentation <api.categorical>`.
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -743,12 +743,14 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
                            name=self.name, fastpath=True)
 
     def take_nd(self, indexer, allow_fill=True, fill_value=None):
-        """ Take the values by the indexer, fill with the fill_value. """
-        if allow_fill and fill_value is None:
-            fill_value = np.nan
+        """ Take the codes by the indexer, fill with the fill_value. """
+
+        # filling must always be None/nan here
+        # but is passed thru internally
+        assert isnull(fill_value)
 
-        values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
-        result = Categorical(values=values, levels=self.levels, ordered=self.ordered,
+        codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
+        result = Categorical(codes, levels=self.levels, ordered=self.ordered,
                              name=self.name, fastpath=True)
         return result
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2326,19 +2326,31 @@ def is_number(obj):
     return isinstance(obj, (numbers.Number, np.number))
 
 
+def _coerce_to_dtype(dtype):
+    """ coerce a string / np.dtype to a dtype """
+    if is_categorical_dtype(dtype):
+        dtype = CategoricalDtype()
+    else:
+        dtype = np.dtype(dtype)
+    return dtype
+
 def _get_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype
-    if isinstance(arr_or_dtype, type):
+    elif isinstance(arr_or_dtype, type):
         return np.dtype(arr_or_dtype)
+    elif isinstance(arr_or_dtype, CategoricalDtype):
+        return CategoricalDtype()
     return arr_or_dtype.dtype
 
 
 def _get_dtype_type(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype.type
-    if isinstance(arr_or_dtype, type):
+    elif isinstance(arr_or_dtype, type):
         return np.dtype(arr_or_dtype).type
+    elif isinstance(arr_or_dtype, CategoricalDtype):
+        return CategoricalDtypeType
     return arr_or_dtype.dtype.type
 
 
@@ -2488,7 +2500,7 @@ def _astype_nansafe(arr, dtype, copy=True):
     """ return a view if copy is False, but
         need to be very careful as the result shape could change! """
     if not isinstance(dtype, np.dtype):
-        dtype = np.dtype(dtype)
+        dtype = _coerce_to_dtype(dtype)
 
     if is_datetime64_dtype(arr):
         if dtype == object:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -105,7 +105,7 @@ def _validate_dtype(self, dtype):
         """ validate the passed dtype """
 
         if dtype is not None:
-            dtype = np.dtype(dtype)
+            dtype = com._coerce_to_dtype(dtype)
 
             # a compound dtype
             if dtype.kind == 'V':
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -19,7 +19,7 @@
                                 is_list_like, _values_from_object,
                                 _possibly_cast_to_datetime, _possibly_castable,
                                 _possibly_convert_platform, _try_sort,
-                                ABCSparseArray, _maybe_match_name,
+                                ABCSparseArray, _maybe_match_name, _coerce_to_dtype,
                                 _ensure_object, SettingWithCopyError)
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
                                _ensure_index)
@@ -2434,7 +2434,7 @@ def _sanitize_array(data, index, dtype=None, copy=False,
     """ sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """
 
     if dtype is not None:
-        dtype = np.dtype(dtype)
+        dtype = _coerce_to_dtype(dtype)
 
     if isinstance(data, ma.MaskedArray):
         mask = ma.getmaskarray(data)
@@ -2455,9 +2455,11 @@ def _try_cast(arr, take_fast_path):
             arr = _possibly_cast_to_datetime(arr, dtype)
             subarr = pa.array(arr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
-            if dtype is not None and raise_cast_failure:
+            if com.is_categorical_dtype(dtype):
+                subarr = Categorical(arr)
+            elif dtype is not None and raise_cast_failure:
                 raise
-            else:  # pragma: no cover
+            else:
                 subarr = pa.array(arr, dtype=object, copy=copy)
         return subarr
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -840,13 +840,58 @@ def test_creation_astype(self):
         df["cats"] =  df["cats"].astype("category")
         tm.assert_frame_equal(exp_df, df)
 
-
         df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]})
         cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
         exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
         df["cats"] =  df["cats"].astype("category")
         tm.assert_frame_equal(exp_df, df)
 
+    def test_construction_series(self):
+
+        l = [1,2,3,1]
+        exp = Series(l).astype('category')
+        res = Series(l,dtype='category')
+        tm.assert_series_equal(res, exp)
+
+        l = ["a","b","c","a"]
+        exp = Series(l).astype('category')
+        res = Series(l,dtype='category')
+        tm.assert_series_equal(res, exp)
+
+        # insert into frame with different index
+        # GH 8076
+        index = pd.date_range('20000101', periods=3)
+        expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c']))
+        expected.index = index
+
+        expected = DataFrame({'x': expected})
+        df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index)
+        tm.assert_frame_equal(df, expected)
+
+    def test_reindex(self):
+
+        index = pd.date_range('20000101', periods=3)
+
+        # reindexing to an invalid Categorical
+        s = Series(['a', 'b', 'c'],dtype='category')
+        result = s.reindex(index)
+        expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c']))
+        expected.index = index
+        tm.assert_series_equal(result, expected)
+
+        # partial reindexing
+        expected = Series(Categorical(values=['b','c'],levels=['a', 'b', 'c']))
+        expected.index = [1,2]
+        result = s.reindex([1,2])
+        tm.assert_series_equal(result, expected)
+
+        expected = Series(Categorical(values=['c',np.nan],levels=['a', 'b', 'c']))
+        expected.index = [2,3]
+        result = s.reindex([2,3])
+        tm.assert_series_equal(result, expected)
+
+
+
     def test_sideeffects_free(self):
 
         # Passing a categorical to a Series and then changing values in either the series or the