Skip to content

Commit ba955a9

Browse files
committed
ENH: add support for dtype='category' in Series constructor
1 parent 8dc3c19 commit ba955a9

File tree

5 files changed

+36
-11
lines changed

5 files changed

+36
-11
lines changed

doc/source/v0.15.0.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ API changes
120120
3 9
121121
4 NaN
122122
dtype: float64
123-
123+
124124
New behavior (note final value is ``7 = sum([3, 4, NaN])``):
125125

126126
.. ipython:: python
@@ -346,7 +346,7 @@ Categoricals in Series/DataFrame
346346

347347
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and has gained new
348348
methods for manipulating it. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
349-
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`).
349+
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`).
350350

351351
For full docs, see the :ref:`Categorical introduction <categorical>` and the
352352
:ref:`API documentation <api.categorical>`.

pandas/core/common.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -2326,19 +2326,31 @@ def is_number(obj):
23262326
return isinstance(obj, (numbers.Number, np.number))
23272327

23282328

2329+
def _coerce_to_dtype(dtype):
2330+
""" coerce a string / np.dtype to a dtype """
2331+
if is_categorical_dtype(dtype):
2332+
dtype = CategoricalDtype()
2333+
else:
2334+
dtype = np.dtype(dtype)
2335+
return dtype
2336+
23292337
def _get_dtype(arr_or_dtype):
23302338
if isinstance(arr_or_dtype, np.dtype):
23312339
return arr_or_dtype
2332-
if isinstance(arr_or_dtype, type):
2340+
elif isinstance(arr_or_dtype, type):
23332341
return np.dtype(arr_or_dtype)
2342+
elif isinstance(arr_or_dtype, CategoricalDtype):
2343+
return CategoricalDtype()
23342344
return arr_or_dtype.dtype
23352345

23362346

23372347
def _get_dtype_type(arr_or_dtype):
23382348
if isinstance(arr_or_dtype, np.dtype):
23392349
return arr_or_dtype.type
2340-
if isinstance(arr_or_dtype, type):
2350+
elif isinstance(arr_or_dtype, type):
23412351
return np.dtype(arr_or_dtype).type
2352+
elif isinstance(arr_or_dtype, CategoricalDtype):
2353+
return CategoricalDtypeType
23422354
return arr_or_dtype.dtype.type
23432355

23442356

@@ -2488,7 +2500,7 @@ def _astype_nansafe(arr, dtype, copy=True):
24882500
""" return a view if copy is False, but
24892501
need to be very careful as the result shape could change! """
24902502
if not isinstance(dtype, np.dtype):
2491-
dtype = np.dtype(dtype)
2503+
dtype = _coerce_to_dtype(dtype)
24922504

24932505
if is_datetime64_dtype(arr):
24942506
if dtype == object:

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _validate_dtype(self, dtype):
105105
""" validate the passed dtype """
106106

107107
if dtype is not None:
108-
dtype = np.dtype(dtype)
108+
dtype = com._coerce_to_dtype(dtype)
109109

110110
# a compound dtype
111111
if dtype.kind == 'V':

pandas/core/series.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
is_list_like, _values_from_object,
2020
_possibly_cast_to_datetime, _possibly_castable,
2121
_possibly_convert_platform, _try_sort,
22-
ABCSparseArray, _maybe_match_name,
22+
ABCSparseArray, _maybe_match_name, _coerce_to_dtype,
2323
_ensure_object, SettingWithCopyError)
2424
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
2525
_ensure_index)
@@ -2434,7 +2434,7 @@ def _sanitize_array(data, index, dtype=None, copy=False,
24342434
""" sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """
24352435

24362436
if dtype is not None:
2437-
dtype = np.dtype(dtype)
2437+
dtype = _coerce_to_dtype(dtype)
24382438

24392439
if isinstance(data, ma.MaskedArray):
24402440
mask = ma.getmaskarray(data)
@@ -2455,9 +2455,11 @@ def _try_cast(arr, take_fast_path):
24552455
arr = _possibly_cast_to_datetime(arr, dtype)
24562456
subarr = pa.array(arr, dtype=dtype, copy=copy)
24572457
except (ValueError, TypeError):
2458-
if dtype is not None and raise_cast_failure:
2458+
if com.is_categorical_dtype(dtype):
2459+
subarr = Categorical(arr)
2460+
elif dtype is not None and raise_cast_failure:
24592461
raise
2460-
else: # pragma: no cover
2462+
else:
24612463
subarr = pa.array(arr, dtype=object, copy=copy)
24622464
return subarr
24632465

pandas/tests/test_categorical.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -840,13 +840,24 @@ def test_creation_astype(self):
840840
df["cats"] = df["cats"].astype("category")
841841
tm.assert_frame_equal(exp_df, df)
842842

843-
844843
df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]})
845844
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
846845
exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
847846
df["cats"] = df["cats"].astype("category")
848847
tm.assert_frame_equal(exp_df, df)
849848

849+
def test_construction_series(self):
850+
851+
l = [1,2,3,1]
852+
exp = Series(l).astype('category')
853+
res = Series(l,dtype='category')
854+
tm.assert_series_equal(res, exp)
855+
856+
l = ["a","b","c","a"]
857+
exp = Series(l).astype('category')
858+
res = Series(l,dtype='category')
859+
tm.assert_series_equal(res, exp)
860+
850861
def test_sideeffects_free(self):
851862

852863
# Passing a categorical to a Series and then changing values in either the series or the

0 commit comments

Comments
 (0)