Skip to content

Commit 78ccfac

Browse files
author
TomAugspurger
committed
ENH: let get_dummies take a DataFrame
implement via 1d fixup docstring, tests add documentation test for dicts
1 parent b82a4e6 commit 78ccfac

File tree

4 files changed

+241
-6
lines changed

4 files changed

+241
-6
lines changed

doc/source/reshaping.rst

+43
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,49 @@ This function is often used along with discretization functions like ``cut``:
480480
481481
See also :func:`Series.str.get_dummies <pandas.core.strings.StringMethods.get_dummies>`.
482482

483+
.. versionadded:: 0.15.0
484+
485+
:func:`get_dummies` also accepts a DataFrame. By default all categorical
486+
variables (categorical in the statistical sense,
487+
those with ``object`` or ``category`` dtype) are encoded as dummy variables.
488+
489+
490+
.. ipython:: python
491+
492+
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
493+
'C': [1, 2, 3]})
494+
pd.get_dummies(df)
495+
496+
All columns that are not encoded are included untouched in the output.
497+
498+
You can control the columns that are encoded with the ``columns`` keyword.
499+
500+
.. ipython:: python
501+
502+
pd.get_dummies(df, columns=['A'])
503+
504+
Notice that the ``B`` column is still included in the output; it just hasn't
505+
been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't
506+
want to include it in the output.
507+
508+
As with the Series version, you can pass values for the ``prefix`` and
509+
``prefix_sep``. By default the column name is used as the prefix, and '_' as
510+
the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:
511+
512+
- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column
513+
to be encoded
514+
- list: Must be the same length as the number of columns being encoded.
515+
- dict: Mapping column name to prefix
516+
517+
.. ipython:: python
518+
519+
simple = pd.get_dummies(df, prefix='new_prefix')
520+
simple
521+
from_list = pd.get_dummies(df, prefix=['from_A', 'from_B'])
522+
from_list
523+
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
524+
from_dict
525+
483526
Factorizing values
484527
------------------
485528

doc/source/v0.15.0.txt

+8
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,15 @@ Enhancements
461461

462462

463463

464+
- The ``get_dummies`` method can now be used on DataFrames. By default only
465+
categorical columns are encoded as 0's and 1's, while other columns are
466+
left untouched.
464467

468+
.. ipython:: python
469+
470+
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
471+
'C': [1, 2, 3]})
472+
pd.get_dummies(df)
465473

466474

467475

pandas/core/reshape.py

+77-6
Original file line numberDiff line numberDiff line change
@@ -981,25 +981,34 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
981981
"""
982982
result = data.drop(cat_variables, axis=1)
983983
for variable in cat_variables:
984-
dummies = get_dummies(data[variable], prefix=variable,
985-
prefix_sep=prefix_sep)
984+
dummies = _get_dummies_1d(data[variable], prefix=variable,
985+
prefix_sep=prefix_sep)
986986
result = result.join(dummies)
987987
return result
988988

989989

990-
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
990+
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
991+
columns=None):
991992
"""
992993
Convert categorical variable into dummy/indicator variables
993994
994995
Parameters
995996
----------
996-
data : array-like or Series
997-
prefix : string, default None
997+
data : array-like, Series, or DataFrame
998+
prefix : string, list of strings, or dict of strings, default None
998999
String to append DataFrame column names
1000+
Pass a list with length equal to the number of columns
1001+
when calling get_dummies on a DataFrame. Alternatively, `prefix`
1002+
can be a dictionary mapping column names to prefixes.
9991003
prefix_sep : string, default '_'
1000-
If appending prefix, separator/delimiter to use
1004+
If appending prefix, separator/delimiter to use. Or pass a
1005+
list or dictionary as with `prefix`.
10011006
dummy_na : bool, default False
10021007
Add a column to indicate NaNs, if False NaNs are ignored.
1008+
columns : list-like, default None
1009+
Column names in the DataFrame to be encoded.
1010+
If `columns` is None then all the columns with
1011+
`object` or `category` dtype will be converted.
10031012
10041013
Returns
10051014
-------
@@ -1031,9 +1040,71 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
10311040
1 0 1 0
10321041
2 0 0 1
10331042
1043+
>>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
1044+
...                 'C': [1, 2, 3]})
1045+
1046+
>>> get_dummies(df, prefix=['col1', 'col2'])
1047+
C col1_a col1_b col2_a col2_b col2_c
1048+
0 1 1 0 0 1 0
1049+
1 2 0 1 1 0 0
1050+
2 3 1 0 0 0 1
1051+
10341052
See also ``Series.str.get_dummies``.
10351053
10361054
"""
1055+
from pandas.tools.merge import concat
1056+
from itertools import cycle
1057+
1058+
if isinstance(data, DataFrame):
1059+
# determine columns being encoded
1060+
1061+
if columns is None:
1062+
columns_to_encode = data.select_dtypes(include=['object',
1063+
'category']).columns
1064+
else:
1065+
columns_to_encode = columns
1066+
1067+
# validate prefixes and separator to avoid silently dropping cols
1068+
def check_len(item, name):
1069+
length_msg = ("Length of '{0}' ({1}) did "
1070+
"not match the length of the columns "
1071+
"being encoded ({2}).")
1072+
1073+
if com.is_list_like(item):
1074+
if not len(item) == len(columns_to_encode):
1075+
raise ValueError(length_msg.format(name, len(item),
1076+
len(columns_to_encode)))
1077+
1078+
check_len(prefix, 'prefix')
1079+
check_len(prefix_sep, 'prefix_sep')
1080+
if isinstance(prefix, compat.string_types):
1081+
prefix = cycle([prefix])
1082+
if isinstance(prefix, dict):
1083+
prefix = [prefix[col] for col in columns_to_encode]
1084+
1085+
if prefix is None:
1086+
prefix = columns_to_encode
1087+
1088+
# validate separators
1089+
if isinstance(prefix_sep, compat.string_types):
1090+
prefix_sep = cycle([prefix_sep])
1091+
elif isinstance(prefix_sep, dict):
1092+
prefix_sep = [prefix_sep[col] for col in columns_to_encode]
1093+
1094+
result = data.drop(columns_to_encode, axis=1)
1095+
with_dummies = [result]
1096+
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
1097+
1098+
dummy = _get_dummies_1d(data[col], prefix=pre,
1099+
prefix_sep=sep, dummy_na=dummy_na)
1100+
with_dummies.append(dummy)
1101+
result = concat(with_dummies, axis=1)
1102+
else:
1103+
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
1104+
return result
1105+
1106+
1107+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
10371108
# Series avoids inconsistent NaN handling
10381109
cat = Categorical.from_array(Series(data))
10391110
levels = cat.levels

pandas/tests/test_reshape.py

+113
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ def test_multiindex(self):
149149

150150

151151
class TestGetDummies(tm.TestCase):
152+
153+
def setUp(self):
154+
self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
155+
'C': [1, 2, 3]})
156+
152157
def test_basic(self):
153158
s_list = list('abc')
154159
s_series = Series(s_list)
@@ -209,6 +214,114 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
209214
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
210215
assert_frame_equal(res, exp)
211216

217+
def test_dataframe_dummies_all_obj(self):
218+
df = self.df[['A', 'B']]
219+
result = get_dummies(df)
220+
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0],
221+
'B_b': [1., 1, 0], 'B_c': [0., 0, 1]})
222+
assert_frame_equal(result, expected)
223+
224+
def test_dataframe_dummies_mix_default(self):
225+
df = self.df
226+
result = get_dummies(df)
227+
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
228+
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
229+
'B_c': [0., 0, 1]})
230+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
231+
assert_frame_equal(result, expected)
232+
233+
def test_dataframe_dummies_prefix_list(self):
234+
prefixes = ['from_A', 'from_B']
235+
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
236+
'C': [1, 2, 3]})
237+
result = get_dummies(df, prefix=prefixes)
238+
expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1],
239+
'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0],
240+
'from_B_c': [0., 0, 1]})
241+
expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
242+
'from_B_c']]
243+
assert_frame_equal(result, expected)
244+
245+
def test_datafrmae_dummies_prefix_str(self):
246+
# not that you should do this...
247+
df = self.df
248+
result = get_dummies(df, prefix='bad')
249+
expected = DataFrame([[1, 1., 0., 1., 0.],
250+
[2, 0., 1., 1., 0.],
251+
[3, 1., 0., 0., 1.]],
252+
columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'])
253+
assert_frame_equal(result, expected)
254+
255+
def test_dataframe_dummies_subset(self):
256+
df = self.df
257+
result = get_dummies(df, prefix=['from_A'],
258+
columns=['A'])
259+
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
260+
'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
261+
assert_frame_equal(result, expected)
262+
263+
def test_dataframe_dummies_prefix_sep(self):
264+
df = self.df
265+
result = get_dummies(df, prefix_sep='..')
266+
expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
267+
'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
268+
'B..c': [0., 0, 1]})
269+
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
270+
assert_frame_equal(result, expected)
271+
272+
result = get_dummies(df, prefix_sep=['..', '__'])
273+
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
274+
assert_frame_equal(result, expected)
275+
276+
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
277+
assert_frame_equal(result, expected)
278+
279+
def test_dataframe_dummies_prefix_bad_length(self):
280+
with tm.assertRaises(ValueError):
281+
get_dummies(self.df, prefix=['too few'])
282+
283+
def test_dataframe_dummies_prefix_sep_bad_length(self):
284+
with tm.assertRaises(ValueError):
285+
get_dummies(self.df, prefix_sep=['bad'])
286+
287+
def test_dataframe_dummies_prefix_dict(self):
288+
prefixes = {'A': 'from_A', 'B': 'from_B'}
289+
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
290+
'C': [1, 2, 3]})
291+
result = get_dummies(df, prefix=prefixes)
292+
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
293+
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
294+
'C': [1, 2, 3]})
295+
assert_frame_equal(result, expected)
296+
297+
def test_dataframe_dummies_with_na(self):
298+
df = self.df
299+
df.loc[3, :] = [np.nan, np.nan, np.nan]
300+
result = get_dummies(df, dummy_na=True)
301+
expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
302+
'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0],
303+
'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]})
304+
expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
305+
'B_nan']]
306+
assert_frame_equal(result, expected)
307+
308+
result = get_dummies(df, dummy_na=False)
309+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
310+
assert_frame_equal(result, expected)
311+
312+
def test_dataframe_dummies_with_categorical(self):
313+
df = self.df
314+
df['cat'] = pd.Categorical(['x', 'y', 'y'])
315+
result = get_dummies(df)
316+
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
317+
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
318+
'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
319+
'cat_y': [0., 1, 1]})
320+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c',
321+
'cat_x', 'cat_y']]
322+
assert_frame_equal(result, expected)
323+
324+
212325
class TestConvertDummies(tm.TestCase):
213326
def test_convert_dummies(self):
214327
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',

0 commit comments

Comments
 (0)