Skip to content

Commit ed55bdf

Browse files
authored
Initial draft: from_dummies (#41902)
1 parent 4f566c8 commit ed55bdf

File tree

8 files changed

+651
-1
lines changed

8 files changed

+651
-1
lines changed

doc/source/reference/general_functions.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Data manipulations
2323
merge_asof
2424
concat
2525
get_dummies
26+
from_dummies
2627
factorize
2728
unique
2829
wide_to_long

doc/source/user_guide/reshaping.rst

+24
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,30 @@ To choose another dtype, use the ``dtype`` argument:
706706
707707
pd.get_dummies(df, dtype=bool).dtypes
708708
709+
.. versionadded:: 1.5.0
710+
711+
To convert a "dummy" or "indicator" ``DataFrame`` back into a categorical ``DataFrame``,
712+
for example ``k`` columns of 1s and 0s into a single column with ``k`` distinct
713+
values, use
714+
:func:`~pandas.from_dummies`:
715+
716+
.. ipython:: python
717+
718+
df = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]})
719+
df
720+
721+
pd.from_dummies(df, sep="_")
722+
723+
Dummy coded data only requires ``k - 1`` categories to be included. In this case
724+
the ``k``-th category is the default category, implied by a row not being assigned any of
725+
the other ``k - 1`` categories; it can be passed via ``default_category``.
726+
727+
.. ipython:: python
728+
729+
df = pd.DataFrame({"prefix_a": [0, 1, 0]})
730+
df
731+
732+
pd.from_dummies(df, sep="_", default_category="b")
709733
710734
.. _reshaping.factorize:
711735

doc/source/whatsnew/v1.5.0.rst

+19
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,25 @@ as seen in the following example.
100100
1 2021-01-02 08:00:00 4
101101
2 2021-01-02 16:00:00 5
102102
103+
.. _whatsnew_150.enhancements.from_dummies:
104+
105+
from_dummies
106+
^^^^^^^^^^^^
107+
108+
Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`.
109+
110+
Example:
111+
112+
.. ipython:: python
113+
114+
import pandas as pd
115+
116+
df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
117+
"col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
118+
"col2_c": [0, 0, 1]})
119+
120+
pd.from_dummies(df, sep="_")
121+
103122
.. _whatsnew_150.enhancements.orc:
104123

105124
Writing to ORC files

pandas/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@
128128
pivot,
129129
pivot_table,
130130
get_dummies,
131+
from_dummies,
131132
cut,
132133
qcut,
133134
)
@@ -361,6 +362,7 @@ def __getattr__(name):
361362
"eval",
362363
"factorize",
363364
"get_dummies",
365+
"from_dummies",
364366
"get_option",
365367
"infer_freq",
366368
"interval_range",

pandas/core/reshape/api.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
# flake8: noqa:F401
22

33
from pandas.core.reshape.concat import concat
4-
from pandas.core.reshape.encoding import get_dummies
4+
from pandas.core.reshape.encoding import (
5+
from_dummies,
6+
get_dummies,
7+
)
58
from pandas.core.reshape.melt import (
69
lreshape,
710
melt,

pandas/core/reshape/encoding.py

+202
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

3+
from collections import defaultdict
34
import itertools
5+
from typing import Hashable
46

57
import numpy as np
68

@@ -68,6 +70,7 @@ def get_dummies(
6870
See Also
6971
--------
7072
Series.str.get_dummies : Convert Series to dummy codes.
73+
:func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
7174
7275
Notes
7376
-----
@@ -316,3 +319,202 @@ def get_empty_frame(data) -> DataFrame:
316319
dummy_mat = dummy_mat[:, 1:]
317320
dummy_cols = dummy_cols[1:]
318321
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
322+
323+
324+
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories, i.e. the
        character(s) separating the prefix of a variable from its category name.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
        If ``None``, all columns are treated as categories of one variable
        whose resulting column is named ``""``.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data. The result has one
        column per prefix and carries the same row index as ``data``.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
      col1 col2
    0    a    b
    1    b    a
    2    a    c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
      col1 col2
    0    a    b
    1    b    a
    2    d    e
    """
    # imported locally, as in the rest of this module, to avoid a circular import
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # compute the per-column NA mask once and reuse it for the error message
    col_has_na = data.isna().any()
    if col_has_na.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_has_na.idxmax()}'"
        )

    # casting to the nullable boolean dtype doubles as validation: anything
    # that is not 0/1/True/False is rejected by the cast
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError as err:
        # keep the underlying cause attached for easier debugging
        raise TypeError("Passed DataFrame contains non-dummy data") from err

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # without a separator every column is a category of a single variable
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            # an unchanged name means the separator does not occur in it
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # broadcast the single default to every prefix
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # number of categories flagged per row; exactly one is expected
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        elif any(assigned == 0):
            if isinstance(default_category, dict):
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            # append an indicator column for the default category so that
            # every row has exactly one True entry
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1; each row holds exactly one
        # True, so the column positions come back in row order
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    # keep the row labels of the input instead of resetting to a RangeIndex
    return DataFrame(cat_data, index=data.index)

pandas/tests/api/test_api.py

+1
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ class TestPDApi(Base):
116116
"eval",
117117
"factorize",
118118
"get_dummies",
119+
"from_dummies",
119120
"infer_freq",
120121
"isna",
121122
"isnull",

0 commit comments

Comments
 (0)