Skip to content

Commit aa5e55e

Browse files
committed
Merge pull request #8075 from jreback/cat2
ENH: add support dtype='category' in Series constructor
2 parents 8dc3c19 + 9159d98 commit aa5e55e

File tree

6 files changed

+77
-16
lines changed

6 files changed

+77
-16
lines changed

doc/source/v0.15.0.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ API changes
120120
3 9
121121
4 NaN
122122
dtype: float64
123-
123+
124124
New behavior (note final value is ``7 = sum([3, 4, NaN])``):
125125

126126
.. ipython:: python
@@ -346,7 +346,7 @@ Categoricals in Series/DataFrame
346346

347347
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
348348
methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
349-
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`).
349+
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`).
350350

351351
For full docs, see the :ref:`Categorical introduction <categorical>` and the
352352
:ref:`API documentation <api.categorical>`.

pandas/core/categorical.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -743,12 +743,14 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
743743
name=self.name, fastpath=True)
744744

745745
def take_nd(self, indexer, allow_fill=True, fill_value=None):
746-
""" Take the values by the indexer, fill with the fill_value. """
747-
if allow_fill and fill_value is None:
748-
fill_value = np.nan
746+
""" Take the codes by the indexer, fill with the fill_value. """
747+
748+
# filling must always be None/nan here
749+
# but is passed thru internally
750+
assert isnull(fill_value)
749751

750-
values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
751-
result = Categorical(values=values, levels=self.levels, ordered=self.ordered,
752+
codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
753+
result = Categorical(codes, levels=self.levels, ordered=self.ordered,
752754
name=self.name, fastpath=True)
753755
return result
754756

pandas/core/common.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -2326,19 +2326,31 @@ def is_number(obj):
23262326
return isinstance(obj, (numbers.Number, np.number))
23272327

23282328

2329+
def _coerce_to_dtype(dtype):
2330+
""" coerce a string / np.dtype to a dtype """
2331+
if is_categorical_dtype(dtype):
2332+
dtype = CategoricalDtype()
2333+
else:
2334+
dtype = np.dtype(dtype)
2335+
return dtype
2336+
23292337
def _get_dtype(arr_or_dtype):
23302338
if isinstance(arr_or_dtype, np.dtype):
23312339
return arr_or_dtype
2332-
if isinstance(arr_or_dtype, type):
2340+
elif isinstance(arr_or_dtype, type):
23332341
return np.dtype(arr_or_dtype)
2342+
elif isinstance(arr_or_dtype, CategoricalDtype):
2343+
return CategoricalDtype()
23342344
return arr_or_dtype.dtype
23352345

23362346

23372347
def _get_dtype_type(arr_or_dtype):
23382348
if isinstance(arr_or_dtype, np.dtype):
23392349
return arr_or_dtype.type
2340-
if isinstance(arr_or_dtype, type):
2350+
elif isinstance(arr_or_dtype, type):
23412351
return np.dtype(arr_or_dtype).type
2352+
elif isinstance(arr_or_dtype, CategoricalDtype):
2353+
return CategoricalDtypeType
23422354
return arr_or_dtype.dtype.type
23432355

23442356

@@ -2488,7 +2500,7 @@ def _astype_nansafe(arr, dtype, copy=True):
24882500
""" return a view if copy is False, but
24892501
need to be very careful as the result shape could change! """
24902502
if not isinstance(dtype, np.dtype):
2491-
dtype = np.dtype(dtype)
2503+
dtype = _coerce_to_dtype(dtype)
24922504

24932505
if is_datetime64_dtype(arr):
24942506
if dtype == object:

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _validate_dtype(self, dtype):
105105
""" validate the passed dtype """
106106

107107
if dtype is not None:
108-
dtype = np.dtype(dtype)
108+
dtype = com._coerce_to_dtype(dtype)
109109

110110
# a compound dtype
111111
if dtype.kind == 'V':

pandas/core/series.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
is_list_like, _values_from_object,
2020
_possibly_cast_to_datetime, _possibly_castable,
2121
_possibly_convert_platform, _try_sort,
22-
ABCSparseArray, _maybe_match_name,
22+
ABCSparseArray, _maybe_match_name, _coerce_to_dtype,
2323
_ensure_object, SettingWithCopyError)
2424
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
2525
_ensure_index)
@@ -2434,7 +2434,7 @@ def _sanitize_array(data, index, dtype=None, copy=False,
24342434
""" sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """
24352435

24362436
if dtype is not None:
2437-
dtype = np.dtype(dtype)
2437+
dtype = _coerce_to_dtype(dtype)
24382438

24392439
if isinstance(data, ma.MaskedArray):
24402440
mask = ma.getmaskarray(data)
@@ -2455,9 +2455,11 @@ def _try_cast(arr, take_fast_path):
24552455
arr = _possibly_cast_to_datetime(arr, dtype)
24562456
subarr = pa.array(arr, dtype=dtype, copy=copy)
24572457
except (ValueError, TypeError):
2458-
if dtype is not None and raise_cast_failure:
2458+
if com.is_categorical_dtype(dtype):
2459+
subarr = Categorical(arr)
2460+
elif dtype is not None and raise_cast_failure:
24592461
raise
2460-
else: # pragma: no cover
2462+
else:
24612463
subarr = pa.array(arr, dtype=object, copy=copy)
24622464
return subarr
24632465

pandas/tests/test_categorical.py

+46-1
Original file line numberDiff line numberDiff line change
@@ -840,13 +840,58 @@ def test_creation_astype(self):
840840
df["cats"] = df["cats"].astype("category")
841841
tm.assert_frame_equal(exp_df, df)
842842

843-
844843
df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]})
845844
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
846845
exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
847846
df["cats"] = df["cats"].astype("category")
848847
tm.assert_frame_equal(exp_df, df)
849848

849+
def test_construction_series(self):
850+
851+
l = [1,2,3,1]
852+
exp = Series(l).astype('category')
853+
res = Series(l,dtype='category')
854+
tm.assert_series_equal(res, exp)
855+
856+
l = ["a","b","c","a"]
857+
exp = Series(l).astype('category')
858+
res = Series(l,dtype='category')
859+
tm.assert_series_equal(res, exp)
860+
861+
# insert into frame with different index
862+
# GH 8076
863+
index = pd.date_range('20000101', periods=3)
864+
expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c']))
865+
expected.index = index
866+
867+
expected = DataFrame({'x': expected})
868+
df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index)
869+
tm.assert_frame_equal(df, expected)
870+
871+
def test_reindex(self):
872+
873+
index = pd.date_range('20000101', periods=3)
874+
875+
# reindexing to an invalid Categorical
876+
s = Series(['a', 'b', 'c'],dtype='category')
877+
result = s.reindex(index)
878+
expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c']))
879+
expected.index = index
880+
tm.assert_series_equal(result, expected)
881+
882+
# partial reindexing
883+
expected = Series(Categorical(values=['b','c'],levels=['a', 'b', 'c']))
884+
expected.index = [1,2]
885+
result = s.reindex([1,2])
886+
tm.assert_series_equal(result, expected)
887+
888+
expected = Series(Categorical(values=['c',np.nan],levels=['a', 'b', 'c']))
889+
expected.index = [2,3]
890+
result = s.reindex([2,3])
891+
tm.assert_series_equal(result, expected)
892+
893+
894+
850895
def test_sideeffects_free(self):
851896

852897
# Passing a categorical to a Series and then changing values in either the series or the

0 commit comments

Comments
 (0)