Skip to content

Commit 643d54b

Browse files
committed
+pd.DataFrame.get_dummies, pd.Series.get_dummies
1 parent c7d768c commit 643d54b

File tree

7 files changed

+313
-279
lines changed

7 files changed

+313
-279
lines changed

pandas/core/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
from pandas.core.panel import Panel, WidePanel
1919
from pandas.core.panel4d import Panel4D
2020
from pandas.core.groupby import groupby
21-
from pandas.core.reshape import (pivot_simple as pivot, get_dummies,
22-
lreshape, wide_to_long)
21+
from pandas.core.reshape import (pivot_simple as pivot, lreshape, wide_to_long)
22+
from pandas.core.generic import get_dummies
2323

2424
from pandas.core.indexing import IndexSlice
2525
from pandas.tseries.offsets import DateOffset

pandas/core/frame.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,9 @@
112112
versionadded_crosstab='\n.. versionadded:: 0.20.0\n',
113113
other_crosstab='crosstab',
114114
versionadded_pivot_table='\n.. versionadded:: 0.20.0\n',
115-
other_pivot_table='pivot_table')
115+
other_pivot_table='pivot_table',
116+
versionadded_get_dummies='\n.. versionadded:: 0.20.0\n',
117+
other_get_dummies='get_dummies\nSeries.get_dummies')
116118

117119
_numeric_only_doc = """numeric_only : boolean, default None
118120
Include only float, int, boolean data. If None, will attempt to use
@@ -4314,6 +4316,14 @@ def crosstab(self, columns, values=None, rownames=None, colnames=None,
43144316
colnames=colnames, aggfunc=aggfunc, margins=margins,
43154317
dropna=dropna, normalize=normalize)
43164318

4319+
@Appender(_shared_docs['get_dummies'] % _shared_doc_kwargs)
def get_dummies(self, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False):
    # Thin method wrapper that delegates to the module-level get_dummies
    # so the operation is also available as DataFrame.get_dummies().
    # The full docstring is injected from _shared_docs via @Appender.
    from pandas.core.generic import get_dummies
    # BUG FIX: the delegate call previously passed ``column=columns``.
    # The target function's signature declares ``columns=`` (see the
    # module-level get_dummies), so every invocation of this method
    # raised ``TypeError: unexpected keyword argument 'column'``.
    return get_dummies(self, prefix=prefix, prefix_sep=prefix_sep,
                       dummy_na=dummy_na, columns=columns,
                       sparse=sparse, drop_first=drop_first)
4326+
43174327
# ----------------------------------------------------------------------
43184328
# Time series-related
43194329

pandas/core/generic.py

+284-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import pandas as pd
1212

13-
1413
from pandas.types.common import (_coerce_to_dtype,
1514
_ensure_int64,
1615
needs_i8_conversion,
@@ -64,7 +63,10 @@
6463
args_transpose='axes to permute (int or label for object)',
6564
optional_by="""
6665
by : str or list of str
67-
Name or list of names which refer to the axis items.""")
66+
Name or list of names which refer to the axis items.""",
67+
versionadded_get_dummies="",
68+
other_get_dummies=""
69+
)
6870

6971

7072
def _single_replace(self, to_replace, method, inplace, limit):
@@ -6069,3 +6071,283 @@ def logical_func(self, axis=None, bool_only=None, skipna=None, level=None,
60696071
# install the indexes
60706072
for _name, _indexer in indexing.get_indexers_list():
60716073
NDFrame._create_indexer(_name, _indexer)
6074+
6075+
6076+
# Shared docstring template for pd.get_dummies / DataFrame.get_dummies /
# Series.get_dummies.  The %()s placeholders are filled per-class from
# _shared_doc_kwargs before being attached via @Appender.
# Fixes applied: "Alternativly" typo; stray trailing ")" in the
# drop_first doctest (would make the example raise SyntaxError if run);
# missing "..." doctest continuation marker in the DataFrame example;
# missing blank line before the Returns section (required by
# numpydoc/RST section parsing).
_shared_docs['get_dummies'] = """
Convert categorical variable into dummy/indicator variables

%(versionadded_get_dummies)s

Parameters
----------
data : array-like, Series, or DataFrame
prefix : string, list of strings, or dict of strings, default None
    String to append DataFrame column names
    Pass a list with length equal to the number of columns
    when calling get_dummies on a DataFrame. Alternatively, `prefix`
    can be a dictionary mapping column names to prefixes.
prefix_sep : string, default '_'
    If appending prefix, separator/delimiter to use. Or pass a
    list or dictionary as with `prefix.`
dummy_na : bool, default False
    Add a column to indicate NaNs, if False NaNs are ignored.
columns : list-like, default None
    Column names in the DataFrame to be encoded.
    If `columns` is None then all the columns with
    `object` or `category` dtype will be converted.
sparse : bool, default False
    Whether the dummy columns should be sparse or not.  Returns
    SparseDataFrame if `data` is a Series or if all columns are included.
    Otherwise returns a DataFrame with some SparseBlocks.

    .. versionadded:: 0.16.1
drop_first : bool, default False
    Whether to get k-1 dummies out of k categorical levels by removing the
    first level.

    .. versionadded:: 0.18.0

Returns
-------
dummies : DataFrame or SparseDataFrame

Examples
--------
>>> import pandas as pd
>>> s = pd.Series(list('abca'))

>>> pd.get_dummies(s)
   a  b  c
0  1  0  0
1  0  1  0
2  0  0  1
3  1  0  0

>>> s1 = ['a', 'b', np.nan]

>>> pd.get_dummies(s1)
   a  b
0  1  0
1  0  1
2  0  0

>>> pd.get_dummies(s1, dummy_na=True)
   a  b  NaN
0  1  0    0
1  0  1    0
2  0  0    1

>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
...                    'C': [1, 2, 3]})

>>> pd.get_dummies(df, prefix=['col1', 'col2'])
   C  col1_a  col1_b  col2_a  col2_b  col2_c
0  1       1       0       0       1       0
1  2       0       1       1       0       0
2  3       1       0       0       0       1

>>> pd.get_dummies(pd.Series(list('abcaa')))
   a  b  c
0  1  0  0
1  0  1  0
2  0  0  1
3  1  0  0
4  1  0  0

>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
   b  c
0  0  0
1  1  0
2  0  1
3  0  0
4  0  0

See Also
--------
%(other_get_dummies)s
Series.str.get_dummies
"""
6169+
6170+
6171+
@Appender(_shared_docs['get_dummies'] % _shared_doc_kwargs)
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False):
    # One-hot encode `data`.  For a DataFrame, each selected column is
    # encoded independently via _get_dummies_1d and the results are
    # concatenated; anything else is handed to _get_dummies_1d directly.
    # Docstring is injected from _shared_docs by the @Appender decorator.
    # NOTE: `is_list_like` and `compat` are expected to be module-level
    # names in this file (not imported here).
    from pandas.tools.concat import concat
    from itertools import cycle
    from pandas.core.frame import DataFrame

    if isinstance(data, DataFrame):
        # determine columns being encoded; default to object/category
        # dtypes when the caller did not name columns explicitly

        if columns is None:
            columns_to_encode = data.select_dtypes(
                include=['object', 'category']).columns
        else:
            columns_to_encode = columns

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
            length_msg = ("Length of '{0}' ({1}) did not match the length of "
                          "the columns being encoded ({2}).")

            if is_list_like(item):
                if not len(item) == len(columns_to_encode):
                    raise ValueError(length_msg.format(name, len(item),
                                                       len(columns_to_encode)))

        check_len(prefix, 'prefix')
        check_len(prefix_sep, 'prefix_sep')
        # A single string prefix is repeated for every encoded column
        # (cycle() pairs it with columns_to_encode in the zip below).
        if isinstance(prefix, compat.string_types):
            prefix = cycle([prefix])
        # A dict prefix is resolved to a per-column list; raises KeyError
        # if an encoded column is missing from the mapping.
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in columns_to_encode]

        # No prefix given: use the source column names themselves.
        if prefix is None:
            prefix = columns_to_encode

        # validate separators (same string/dict coercion as for prefix)
        if isinstance(prefix_sep, compat.string_types):
            prefix_sep = cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in columns_to_encode]

        # Columns NOT being encoded are carried through unchanged; when
        # every column is encoded there is nothing to carry.
        if set(columns_to_encode) == set(data.columns):
            with_dummies = []
        else:
            with_dummies = [data.drop(columns_to_encode, axis=1)]

        for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

            dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
                                    dummy_na=dummy_na, sparse=sparse,
                                    drop_first=drop_first)
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
                                 sparse=sparse, drop_first=drop_first)
    return result
6229+
6230+
6231+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    # Encode a single 1-D input into a (Sparse)DataFrame of 0/1 indicator
    # columns, one column per factorized level.  Missing values get code
    # -1 from factorization and are either dropped (default) or given
    # their own NaN column (dummy_na=True).
    from pandas.core.sparse import SparseDataFrame, SparseSeries
    from pandas.sparse.array import SparseArray
    from pandas._sparse import IntIndex
    from pandas.core.series import Series
    from pandas.core.frame import DataFrame

    from pandas.core.categorical import _factorize_from_iterable
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        # Zero-column frame preserving the input's index (or a fresh
        # positional index for non-Series input).
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # if all NaN (no levels and NaNs are being ignored): nothing to encode
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    # copy before mutating: codes may be shared with the factorization
    codes = codes.copy()
    if dummy_na:
        # promote the -1 missing code to a real trailing NaN level
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    # Series input keeps its index on the result; other inputs get the
    # constructor's default index (index=None).
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Sparse path: collect, per level, the row positions where that
        # level occurs, then build one SparseSeries per dummy column.
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs), fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                              dtype=np.uint8)
        return out

    else:
        # Dense path: row i of eye(k) is the indicator vector for level i,
        # so take(codes) expands codes into the full dummy matrix.
        # NOTE(review): codes == -1 indexes the LAST row of eye here
        # (negative take), which is why the reset below is required.
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
6315+
6316+
6317+
def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    frame : DataFrame
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression
        you might call::

            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    from pandas.core.frame import DataFrame
    from pandas.core.categorical import _factorize_from_iterable

    # Map the axis alias to a MultiIndex level number; an unknown value
    # falls through unchanged, so an integer level number is also accepted.
    numbers = {'major': 0, 'minor': 1}
    num = numbers.get(axis, axis)

    # NOTE(review): frame.index.levels / .labels implies `frame` must
    # carry a MultiIndex (e.g. the frame form of a Panel) — confirm
    # against callers.
    items = frame.index.levels[num]
    labels = frame.index.labels[num]
    if transform is not None:
        # Re-factorize after transforming, since the transform may
        # collapse several original labels into one (e.g. dates -> weekday).
        mapped_items = items.map(transform)
        labels, items = _factorize_from_iterable(mapped_items.take(labels))

    # Row i of eye(k) is the indicator vector for level i; take(labels)
    # expands the per-row level codes into the full dummy matrix.
    values = np.eye(len(items), dtype=float)
    values = values.take(labels, axis=0)

    return DataFrame(values, columns=items, index=frame.index)

0 commit comments

Comments
 (0)