Skip to content

Commit 0b2969f

Browse files
committed
ENH: Implement DataFrame.astype('category')
1 parent 7f4c960 commit 0b2969f

File tree

4 files changed

+161
-36
lines changed

4 files changed

+161
-36
lines changed

doc/source/categorical.rst

+56
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,16 @@ The categorical data type is useful in the following cases:
4545

4646
See also the :ref:`API docs on categoricals<api.categorical>`.
4747

48+
.. _categorical.objectcreation:
49+
4850
Object Creation
4951
---------------
5052

53+
.. _categorical.objectcreation.series:
54+
55+
Creating categories from a ``Series``
56+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
57+
5158
Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways:
5259

5360
By specifying ``dtype="category"`` when constructing a ``Series``:
@@ -143,6 +150,55 @@ constructor to save the factorize step during normal constructor mode:
143150
splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
144151
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
145152
153+
.. _categorical.objectcreation.frame:
154+
155+
Creating categories from a ``DataFrame``
156+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
157+
158+
.. versionadded:: 0.22.0
159+
160+
:meth:`DataFrame.astype` supports simultaneously setting multiple columns as categorical. When setting multiple
161+
columns as categorical, by default each column's dtype will contain categories for all labels present in all columns, even
162+
if a column does not contain all labels:
163+
164+
.. ipython:: python
165+
166+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
167+
df = df.astype('category')
168+
df
169+
df['A'].dtype
170+
df['B'].dtype
171+
172+
Note that this behavior is different from instantiating a ``DataFrame`` with categorical dtype, which will only assign
173+
categories to each column based on the labels present in each column:
174+
175+
.. ipython:: python
176+
177+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
178+
df['A'].dtype
179+
df['B'].dtype
180+
181+
When using ``astype``, you can control the categories that will be present in each column by passing
182+
a ``CategoricalDtype``:
183+
184+
.. ipython:: python
185+
186+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
187+
dtype = CategoricalDtype(categories=list('abdef'), ordered=True)
188+
df = df.astype(dtype)
189+
df
190+
df['A'].dtype
191+
df['B'].dtype
192+
193+
Use subselection if you only want to convert certain columns to categorical. The same behaviors previously
194+
discussed hold with subselection.
195+
196+
.. ipython:: python
197+
198+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e'], 'C': ['x', 'y', 'z']})
199+
df[['A', 'B']] = df[['A', 'B']].astype('category')
200+
df.dtypes
201+
146202
.. _categorical.categoricaldtype:
147203

148204
CategoricalDtype

doc/source/whatsnew/v0.22.0.txt

+19-3
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,25 @@ version.
1313
New features
1414
~~~~~~~~~~~~
1515

16-
-
17-
-
18-
-
16+
.. _whatsnew_0220.enhancements.astype_category:
17+
18+
``DataFrame.astype`` now supports categoricals
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
:meth:`DataFrame.astype` now supports simultaneously setting multiple columns as categorical (:issue:`12860`).
22+
23+
When setting multiple columns as categorical, by default each column's dtype will contain categories for all
24+
labels present in all columns, even if a column does not contain all labels:
25+
26+
.. ipython:: python
27+
28+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
29+
df = df.astype('category')
30+
df
31+
df['A'].dtype
32+
df['B'].dtype
33+
34+
See the :ref:`categorical.objectcreation.frame` section of the documentation for more details and examples.
1935

2036
.. _whatsnew_0220.enhancements.other:
2137

pandas/core/generic.py

+21-3
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@
1818
is_number,
1919
is_integer, is_bool,
2020
is_bool_dtype,
21+
is_categorical_dtype,
2122
is_numeric_dtype,
2223
is_datetime64_dtype,
2324
is_timedelta64_dtype,
2425
is_datetime64tz_dtype,
2526
is_list_like,
2627
is_dict_like,
2728
is_re_compilable,
28-
pandas_dtype)
29+
pandas_dtype,
30+
CategoricalDtype)
2931
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
3032
from pandas.core.dtypes.missing import isna, notna
3133
from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
@@ -3973,14 +3975,30 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
39733975
if col_name not in self:
39743976
raise KeyError('Only a column name can be used for the '
39753977
'key in a dtype mappings argument.')
3976-
from pandas import concat
39773978
results = []
39783979
for col_name, col in self.iteritems():
39793980
if col_name in dtype:
39803981
results.append(col.astype(dtype[col_name], copy=copy))
39813982
else:
39823983
results.append(results.append(col.copy() if copy else col))
3983-
return concat(results, axis=1, copy=False)
3984+
return pd.concat(results, axis=1, copy=False)
3985+
3986+
elif is_categorical_dtype(dtype) and self.ndim > 1:
3987+
# GH 12860
3988+
dtype_with_cat = (isinstance(dtype, CategoricalDtype) and
3989+
dtype.categories is not None)
3990+
if not dtype_with_cat:
3991+
categories = kwargs.get('categories', None)
3992+
ordered = (kwargs.get('ordered', None) or
3993+
getattr(dtype, 'ordered', None))
3994+
3995+
if categories is None:
3996+
categories = algos.unique(self.values.ravel(order='F'))
3997+
3998+
dtype = CategoricalDtype(categories, ordered)
3999+
4000+
results = (self[col].astype(dtype, copy=copy) for col in self)
4001+
return pd.concat(results, axis=1, copy=False)
39844002

39854003
# else, only a single dtype is given
39864004
new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,

pandas/tests/test_categorical.py

+65-30
Original file line numberDiff line numberDiff line change
@@ -2174,51 +2174,86 @@ def test_basic(self):
21742174
result = x.person_name.loc[0]
21752175
assert result == expected
21762176

2177-
def test_creation_astype(self):
2178-
l = ["a", "b", "c", "a"]
2179-
s = pd.Series(l)
2180-
exp = pd.Series(Categorical(l))
2181-
res = s.astype('category')
2177+
def test_series_creation_astype(self):
2178+
labels = list('abca')
2179+
exp = Series(Categorical(labels))
2180+
res = Series(labels).astype('category')
21822181
tm.assert_series_equal(res, exp)
21832182

2184-
l = [1, 2, 3, 1]
2185-
s = pd.Series(l)
2186-
exp = pd.Series(Categorical(l))
2187-
res = s.astype('category')
2183+
labels = [1, 2, 3, 1]
2184+
exp = Series(Categorical(labels))
2185+
res = Series(labels).astype('category')
21882186
tm.assert_series_equal(res, exp)
21892187

2190-
df = pd.DataFrame({"cats": [1, 2, 3, 4, 5, 6],
2191-
"vals": [1, 2, 3, 4, 5, 6]})
2192-
cats = Categorical([1, 2, 3, 4, 5, 6])
2193-
exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
2194-
df["cats"] = df["cats"].astype("category")
2195-
tm.assert_frame_equal(exp_df, df)
2188+
labels_int = [1, 2, 3, 4, 5, 6]
2189+
exp = DataFrame({"cats": Categorical(labels_int), "vals": labels_int})
2190+
res = DataFrame({"cats": labels_int, "vals": labels_int})
2191+
res["cats"] = res["cats"].astype("category")
2192+
tm.assert_frame_equal(res, exp)
21962193

2197-
df = pd.DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
2198-
"vals": [1, 2, 3, 4, 5, 6]})
2199-
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
2200-
exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
2201-
df["cats"] = df["cats"].astype("category")
2202-
tm.assert_frame_equal(exp_df, df)
2194+
labels_str = list('abbaad')
2195+
exp = DataFrame({"cats": Categorical(labels_str), "vals": labels_int})
2196+
res = DataFrame({"cats": labels_str, "vals": labels_int})
2197+
res["cats"] = res["cats"].astype("category")
2198+
tm.assert_frame_equal(res, exp)
22032199

22042200
# with keywords
2205-
l = ["a", "b", "c", "a"]
2206-
s = pd.Series(l)
2207-
exp = pd.Series(Categorical(l, ordered=True))
2201+
labels = list('abca')
2202+
s = Series(labels)
2203+
exp = Series(Categorical(labels, ordered=True))
22082204
res = s.astype(CategoricalDtype(None, ordered=True))
22092205
tm.assert_series_equal(res, exp)
22102206

2211-
exp = pd.Series(Categorical(
2212-
l, categories=list('abcdef'), ordered=True))
2213-
res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
2207+
cats = list('abcdef')
2208+
exp = Series(Categorical(labels, categories=cats, ordered=True))
2209+
res = s.astype(CategoricalDtype(cats, ordered=True))
22142210
tm.assert_series_equal(res, exp)
22152211

2212+
def test_frame_creation_astype(self):
2213+
# GH 12860
2214+
cats = list('abcde')
2215+
x = Categorical(list('abcd'), categories=cats)
2216+
y = Categorical(list('bcde'), categories=cats)
2217+
exp = DataFrame({'x': x, 'y': y})
2218+
2219+
data = {'x': list('abcd'), 'y': list('bcde')}
2220+
res = DataFrame(data).astype('category')
2221+
tm.assert_frame_equal(res, exp)
2222+
2223+
res = DataFrame(data).astype(CategoricalDtype())
2224+
tm.assert_frame_equal(res, exp)
2225+
2226+
# categories keyword
2227+
cats = list('abdef')
2228+
x = Categorical(['a', 'b', np.nan, 'd'], categories=cats)
2229+
y = Categorical(['b', np.nan, 'd', 'e'], categories=cats)
2230+
exp = DataFrame({'x': x, 'y': y})
2231+
2232+
res = DataFrame(data).astype('category', categories=cats)
2233+
tm.assert_frame_equal(res, exp)
2234+
2235+
res = DataFrame(data).astype(CategoricalDtype(categories=cats))
2236+
tm.assert_frame_equal(res, exp)
2237+
2238+
# ordered keyword
2239+
cats = [1, 2, 3, 4, 0]
2240+
x = Categorical(range(1, 5), categories=cats, ordered=True)
2241+
y = Categorical(range(4), categories=cats, ordered=True)
2242+
exp = DataFrame({'x': x, 'y': y})
2243+
2244+
data = {'x': range(1, 5), 'y': range(4)}
2245+
res = DataFrame(data).astype('category', ordered=True)
2246+
tm.assert_frame_equal(res, exp)
2247+
2248+
res = DataFrame(data).astype(CategoricalDtype(ordered=True))
2249+
tm.assert_frame_equal(res, exp)
2250+
22162251
@pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
22172252
def test_empty_astype(self, columns):
22182253
# GH 18004
2219-
msg = '> 1 ndim Categorical are not supported at this time'
2220-
with tm.assert_raises_regex(NotImplementedError, msg):
2221-
DataFrame(columns=columns).astype('category')
2254+
exp = DataFrame({c: Categorical([]) for c in columns}, index=[])
2255+
res = DataFrame(columns=columns).astype('category')
2256+
tm.assert_frame_equal(res, exp)
22222257

22232258
def test_construction_series(self):
22242259

0 commit comments

Comments
 (0)