Skip to content

Commit f9f9b1c

Browse files
pckSF and yehoshuadimarsky
authored and committed
Move dummy coding related functions from reshape/reshape into separate file (pandas-dev#45215)
1 parent 0a86e01 commit f9f9b1c

File tree

3 files changed

+320
-308
lines changed

3 files changed

+320
-308
lines changed

pandas/core/reshape/api.py

+1-1
Original file line number · Diff line number · Diff line change
@@ -1,6 +1,7 @@
11
# flake8: noqa:F401
22

33
from pandas.core.reshape.concat import concat
4+
from pandas.core.reshape.encoding import get_dummies
45
from pandas.core.reshape.melt import (
56
lreshape,
67
melt,
@@ -16,7 +17,6 @@
1617
pivot,
1718
pivot_table,
1819
)
19-
from pandas.core.reshape.reshape import get_dummies
2020
from pandas.core.reshape.tile import (
2121
cut,
2222
qcut,

pandas/core/reshape/encoding.py

+318
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,318 @@
1+
from __future__ import annotations
2+
3+
import itertools
4+
5+
import numpy as np
6+
7+
from pandas._libs.sparse import IntIndex
8+
from pandas._typing import Dtype
9+
10+
from pandas.core.dtypes.common import (
11+
is_integer_dtype,
12+
is_list_like,
13+
is_object_dtype,
14+
)
15+
16+
from pandas.core.arrays import SparseArray
17+
from pandas.core.arrays.categorical import factorize_from_iterable
18+
from pandas.core.frame import DataFrame
19+
from pandas.core.indexes.api import Index
20+
from pandas.core.series import Series
21+
22+
23+
def get_dummies(
24+
data,
25+
prefix=None,
26+
prefix_sep="_",
27+
dummy_na: bool = False,
28+
columns=None,
29+
sparse: bool = False,
30+
drop_first: bool = False,
31+
dtype: Dtype | None = None,
32+
) -> DataFrame:
33+
"""
34+
Convert categorical variable into dummy/indicator variables.
35+
36+
Parameters
37+
----------
38+
data : array-like, Series, or DataFrame
39+
Data of which to get dummy indicators.
40+
prefix : str, list of str, or dict of str, default None
41+
String to append DataFrame column names.
42+
Pass a list with length equal to the number of columns
43+
when calling get_dummies on a DataFrame. Alternatively, `prefix`
44+
can be a dictionary mapping column names to prefixes.
45+
prefix_sep : str, default '_'
46+
If appending prefix, separator/delimiter to use. Or pass a
47+
list or dictionary as with `prefix`.
48+
dummy_na : bool, default False
49+
Add a column to indicate NaNs, if False NaNs are ignored.
50+
columns : list-like, default None
51+
Column names in the DataFrame to be encoded.
52+
If `columns` is None then all the columns with
53+
`object`, `string`, or `category` dtype will be converted.
54+
sparse : bool, default False
55+
Whether the dummy-encoded columns should be backed by
56+
a :class:`SparseArray` (True) or a regular NumPy array (False).
57+
drop_first : bool, default False
58+
Whether to get k-1 dummies out of k categorical levels by removing the
59+
first level.
60+
dtype : dtype, default np.uint8
61+
Data type for new columns. Only a single dtype is allowed.
62+
63+
Returns
64+
-------
65+
DataFrame
66+
Dummy-coded data.
67+
68+
See Also
69+
--------
70+
Series.str.get_dummies : Convert Series to dummy codes.
71+
72+
Notes
73+
-----
74+
Reference :ref:`the user guide <reshaping.dummies>` for more examples.
75+
76+
Examples
77+
--------
78+
>>> s = pd.Series(list('abca'))
79+
80+
>>> pd.get_dummies(s)
81+
a b c
82+
0 1 0 0
83+
1 0 1 0
84+
2 0 0 1
85+
3 1 0 0
86+
87+
>>> s1 = ['a', 'b', np.nan]
88+
89+
>>> pd.get_dummies(s1)
90+
a b
91+
0 1 0
92+
1 0 1
93+
2 0 0
94+
95+
>>> pd.get_dummies(s1, dummy_na=True)
96+
a b NaN
97+
0 1 0 0
98+
1 0 1 0
99+
2 0 0 1
100+
101+
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
102+
... 'C': [1, 2, 3]})
103+
104+
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
105+
C col1_a col1_b col2_a col2_b col2_c
106+
0 1 1 0 0 1 0
107+
1 2 0 1 1 0 0
108+
2 3 1 0 0 0 1
109+
110+
>>> pd.get_dummies(pd.Series(list('abcaa')))
111+
a b c
112+
0 1 0 0
113+
1 0 1 0
114+
2 0 0 1
115+
3 1 0 0
116+
4 1 0 0
117+
118+
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
119+
b c
120+
0 0 0
121+
1 1 0
122+
2 0 1
123+
3 0 0
124+
4 0 0
125+
126+
>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
127+
a b c
128+
0 1.0 0.0 0.0
129+
1 0.0 1.0 0.0
130+
2 0.0 0.0 1.0
131+
"""
132+
from pandas.core.reshape.concat import concat
133+
134+
dtypes_to_encode = ["object", "string", "category"]
135+
136+
if isinstance(data, DataFrame):
137+
# determine columns being encoded
138+
if columns is None:
139+
data_to_encode = data.select_dtypes(include=dtypes_to_encode)
140+
elif not is_list_like(columns):
141+
raise TypeError("Input must be a list-like for parameter `columns`")
142+
else:
143+
data_to_encode = data[columns]
144+
145+
# validate prefixes and separator to avoid silently dropping cols
146+
def check_len(item, name):
147+
148+
if is_list_like(item):
149+
if not len(item) == data_to_encode.shape[1]:
150+
len_msg = (
151+
f"Length of '{name}' ({len(item)}) did not match the "
152+
"length of the columns being encoded "
153+
f"({data_to_encode.shape[1]})."
154+
)
155+
raise ValueError(len_msg)
156+
157+
check_len(prefix, "prefix")
158+
check_len(prefix_sep, "prefix_sep")
159+
160+
if isinstance(prefix, str):
161+
prefix = itertools.cycle([prefix])
162+
if isinstance(prefix, dict):
163+
prefix = [prefix[col] for col in data_to_encode.columns]
164+
165+
if prefix is None:
166+
prefix = data_to_encode.columns
167+
168+
# validate separators
169+
if isinstance(prefix_sep, str):
170+
prefix_sep = itertools.cycle([prefix_sep])
171+
elif isinstance(prefix_sep, dict):
172+
prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
173+
174+
with_dummies: list[DataFrame]
175+
if data_to_encode.shape == data.shape:
176+
# Encoding the entire df, do not prepend any dropped columns
177+
with_dummies = []
178+
elif columns is not None:
179+
# Encoding only cols specified in columns. Get all cols not in
180+
# columns to prepend to result.
181+
with_dummies = [data.drop(columns, axis=1)]
182+
else:
183+
# Encoding only object and category dtype columns. Get remaining
184+
# columns to prepend to result.
185+
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
186+
187+
for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
188+
# col is (column_name, column), use just column data here
189+
dummy = _get_dummies_1d(
190+
col[1],
191+
prefix=pre,
192+
prefix_sep=sep,
193+
dummy_na=dummy_na,
194+
sparse=sparse,
195+
drop_first=drop_first,
196+
dtype=dtype,
197+
)
198+
with_dummies.append(dummy)
199+
result = concat(with_dummies, axis=1)
200+
else:
201+
result = _get_dummies_1d(
202+
data,
203+
prefix,
204+
prefix_sep,
205+
dummy_na,
206+
sparse=sparse,
207+
drop_first=drop_first,
208+
dtype=dtype,
209+
)
210+
return result
211+
212+
213+
def _get_dummies_1d(
214+
data,
215+
prefix,
216+
prefix_sep="_",
217+
dummy_na: bool = False,
218+
sparse: bool = False,
219+
drop_first: bool = False,
220+
dtype: Dtype | None = None,
221+
) -> DataFrame:
222+
from pandas.core.reshape.concat import concat
223+
224+
# Series avoids inconsistent NaN handling
225+
codes, levels = factorize_from_iterable(Series(data))
226+
227+
if dtype is None:
228+
dtype = np.dtype(np.uint8)
229+
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
230+
# dtype[Any], Type[object]]"; expected "Type[Any]"
231+
dtype = np.dtype(dtype) # type: ignore[arg-type]
232+
233+
if is_object_dtype(dtype):
234+
raise ValueError("dtype=object is not a valid dtype for get_dummies")
235+
236+
def get_empty_frame(data) -> DataFrame:
237+
index: Index | np.ndarray
238+
if isinstance(data, Series):
239+
index = data.index
240+
else:
241+
index = Index(range(len(data)))
242+
return DataFrame(index=index)
243+
244+
# if all NaN
245+
if not dummy_na and len(levels) == 0:
246+
return get_empty_frame(data)
247+
248+
codes = codes.copy()
249+
if dummy_na:
250+
codes[codes == -1] = len(levels)
251+
levels = levels.insert(len(levels), np.nan)
252+
253+
# if dummy_na, we just fake a nan level. drop_first will drop it again
254+
if drop_first and len(levels) == 1:
255+
return get_empty_frame(data)
256+
257+
number_of_cols = len(levels)
258+
259+
if prefix is None:
260+
dummy_cols = levels
261+
else:
262+
dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])
263+
264+
index: Index | None
265+
if isinstance(data, Series):
266+
index = data.index
267+
else:
268+
index = None
269+
270+
if sparse:
271+
272+
fill_value: bool | float | int
273+
if is_integer_dtype(dtype):
274+
fill_value = 0
275+
elif dtype == np.dtype(bool):
276+
fill_value = False
277+
else:
278+
fill_value = 0.0
279+
280+
sparse_series = []
281+
N = len(data)
282+
sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
283+
mask = codes != -1
284+
codes = codes[mask]
285+
n_idx = np.arange(N)[mask]
286+
287+
for ndx, code in zip(n_idx, codes):
288+
sp_indices[code].append(ndx)
289+
290+
if drop_first:
291+
# remove first categorical level to avoid perfect collinearity
292+
# GH12042
293+
sp_indices = sp_indices[1:]
294+
dummy_cols = dummy_cols[1:]
295+
for col, ixs in zip(dummy_cols, sp_indices):
296+
sarr = SparseArray(
297+
np.ones(len(ixs), dtype=dtype),
298+
sparse_index=IntIndex(N, ixs),
299+
fill_value=fill_value,
300+
dtype=dtype,
301+
)
302+
sparse_series.append(Series(data=sarr, index=index, name=col))
303+
304+
return concat(sparse_series, axis=1, copy=False)
305+
306+
else:
307+
# take on axis=1 + transpose to ensure ndarray layout is column-major
308+
dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T
309+
310+
if not dummy_na:
311+
# reset NaN GH4446
312+
dummy_mat[codes == -1] = 0
313+
314+
if drop_first:
315+
# remove first GH12042
316+
dummy_mat = dummy_mat[:, 1:]
317+
dummy_cols = dummy_cols[1:]
318+
return DataFrame(dummy_mat, index=index, columns=dummy_cols)

0 commit comments

Comments
 (0)