Initial draft: from_dummies #41902
Changes from 16 commits
@@ -719,6 +719,58 @@ To choose another dtype, use the ``dtype`` argument:

    pd.get_dummies(df, dtype=bool).dtypes

To convert a "dummy" or "indicator" ``DataFrame`` into a categorical ``DataFrame``
(or a categorical ``Series``), for example to turn ``k`` columns of a ``DataFrame``
containing 1s and 0s into a ``DataFrame`` (or ``Series``) with ``k`` distinct values,
use :func:`~pandas.from_dummies`:

.. ipython:: python

    d = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]})

    pd.from_dummies(d)
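As a point of comparison (not part of the PR), the same decoding can be sketched with operations that already exist in pandas: ``idxmax`` picks the column label of the 1 in each row, and the shared ``"prefix_"`` is then stripped off. A minimal sketch:

    import pandas as pd

    d = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]})
    # column label of the 1 in each row, then drop the shared "prefix_"
    d.idxmax(axis=1).str.replace("prefix_", "", regex=False)
    # 0    b
    # 1    a
    # 2    b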
The ``k`` distinct values can also be represented with a ``dropped_first``
argument, which means that a row with no value assigned implies the value of
the dropped category:

Review comment: can you clarify this sentence, not sure exactly what the second 'be' means
Reply: True, that was weird - I rewrote this part completely

Review comment: vale -> value

.. ipython:: python

    d = pd.DataFrame({"prefix_a": [0, 1, 0]})

    pd.from_dummies(d, dropped_first="b")
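For readers unfamiliar with the dropped-category convention: this is the situation that ``get_dummies(..., drop_first=True)`` produces. A rough stand-alone illustration with existing pandas only (the ``prefix`` name and the data are made up for this sketch):

    import pandas as pd

    s = pd.Series(["b", "a", "b"])
    dummies = pd.get_dummies(s, prefix="prefix", drop_first=True)  # the "a" column is dropped
    dummies
    # an all-zero row can only mean the dropped category "a"
    dummies["prefix_b"].map({1: "b", 0: "a"})
    # 0    b
    # 1    a
    # 2    b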
The function is the inverse of :func:`pandas.get_dummies <pandas.reshape.get_dummies>`.

Review comment: what does this apply to?
Reply: I will remove this, it is a relic from when I tried to strictly invert all functionalities of the ``get_dummies`` function.

All non-dummy columns are included untouched in the output. You can control
which columns are included in the output with the ``columns`` argument.

Review comment: you don't need to show every option here, that's the purpose of the doc-string. i dont mind a complete example but just listing options of what it can do is not so useful.
Review comment: e.g. L769-773 are for the doc-string not here. even L763 is not very useful here.
Reply: True, I removed the parts that are more of a docstring, I hope it is more in line with being a user-guide now (or at least the right direction).
.. ipython:: python

    pd.get_dummies(df, columns=["C", "prefix_A", "prefix_B"])

Review comment: we call this subset elsewhere
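To illustrate the idea of decoding only a subset of columns while passing the rest through untouched, here is a small sketch using only existing pandas operations (the column names are invented for this example):

    import pandas as pd

    df = pd.DataFrame({"C": [1, 2, 3], "col_a": [1, 0, 1], "col_b": [0, 1, 0]})
    dummy_cols = ["col_a", "col_b"]
    # decode the selected dummy columns and keep "C" unchanged
    decoded = df[dummy_cols].idxmax(axis=1).str.replace("col_", "", regex=False)
    df.drop(columns=dummy_cols).assign(col=decoded)
    #    C col
    # 0  1   a
    # 1  2   b
    # 2  3   a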
You can pass values for the ``prefix_sep`` argument depending on how many or
how nested the prefix separators used in the column names are. By default the
prefix separator is assumed to be ``'_'``, however ``prefix_sep`` can be
specified in 3 ways:

Review comment: hmm does this match the naming in wide_to_long (which is called sep)
Reply: Thank you, I will change that and update the explanation to make it more clear (And steal a part of the good description).

* string: Use the same value for ``prefix_sep`` for each column to be decoded.
* list: Variables will be decoded at the first instance of a prefix separator
  from the list that is encountered in the column name.
* dict: Directly map prefix separators to prefixes. Can be used in case mixed
  separators are used within the variable name and to separate the variable from
  the prefix.

Review comment: umm this seems quite complicated. can you show the results here (in a comment)
Reply: Sure, I also updated the description (with examples). It is for a case where some characters are used as separators and as parts of variable names (mixed across different prefixes):

    >>> dummies = DataFrame(
        {
            "col1_a-a": [1, 0, 1],
            "col1_b-b": [0, 1, 0],
            "col_2-a": [0, 1, 0],
            "col_2-b": [1, 0, 0],
            "col_2-c": [0, 0, 1],
        },
    )
    >>> from_dummies(dummies, sep={"col1": "_", "col_2": "-"})
      col1 col_2
    0  a-a     b
    1  b-b     a
    2  a-a     c
.. ipython:: python

    simple = pd.get_dummies(df, prefix_sep="-")
    simple
    from_list = pd.get_dummies(df, prefix_sep=["_", "-"])
    from_list
    from_dict = pd.get_dummies(df, prefix_sep={"prefix1": "-", "prefix2": "_"})
    from_dict
.. _reshaping.factorize:

Factorizing values
@@ -1053,6 +1053,225 @@ def get_empty_frame(data) -> DataFrame:
    return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def from_dummies(
    data: DataFrame,
    to_series: bool = False,
    prefix_sep: str | list[str] | dict[str, str] = "_",
    columns: None | list[str] = None,
    dropped_first: None | str | list[str] | dict[str, str] = None,
) -> Series | DataFrame:

Review comment: we should consider moving get_dummies / from_dummies to a separate file (in /reshape), could be a precursor PR.
Reply: I like that idea to improve clarity. What would be an elegant and obvious name for a collection of "reshape operations that change the data representation" - maybe
Review comment: or if its supposed to be a dummy operations file:

Review comment: let's just always return a DataFrame, much simpler
Reply: Good idea, and in line with this perspective as the

    """
    Create a categorical `Series` or `DataFrame` from a `DataFrame` of dummy
    variables.

    Inverts the operation performed by 'get_dummies'.

Review comment: can you add a ref so this links properly

    Parameters
    ----------
    data : `DataFrame`
        Data which contains dummy-coded variables.
    to_series : bool, default False
        Converts the input data to a categorical `Series`; converts the input data
        to a categorical `DataFrame` if False.
    prefix_sep : str, list of str, or dict of str, default '_'
        Separator/delimiter used in the column names of the dummy categories.
        Pass a list if multiple prefix separators are used in the column names.
        Alternatively, pass a dictionary to map prefix separators to prefixes if
        multiple and/or mixed separators are used in the column names.
    columns : None or list of str, default None
        The columns to convert from dummy-encoding and return as a categorical
        `DataFrame`.
        If `columns` is None then all dummy columns are converted and appended
        to the non-dummy columns.
    dropped_first : None, str, list of str, or dict of str, default None
        The implied value the dummy takes when all values are zero.
        Can be a single value for all variables, a list with a number of values
        equal to the number of dummy variables, or a dict directly mapping the
        dropped value to a prefix of a variable.

    Returns
    -------
    `Series` or `DataFrame`
        Categorical data decoded from the dummy input-data.

    See Also
    --------
    get_dummies : Convert `Series` or `DataFrame` to dummy codes.

Review comment: add a Raises section to show what errors happen
    Examples
    --------
    >>> d = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                   "c": [0, 0, 1, 0]})

    >>> pd.from_dummies(d, to_series=True)
    0    a
    1    b
    2    c
    3    a

    >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 1],
    ...                   "col1_b": [0, 1, 0], "col2_a": [0, 1, 0],
    ...                   "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})

    >>> pd.from_dummies(d)
       C col1 col2
    0  1    a    b
    1  2    b    a
    2  3    a    c

Review comment: this is way too magical on the naming, e.g how is it dropping the _
Reply: True, I will change it to always require a separator input if a separation is required/wanted (This should be solved together with me removing

    >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 0],
    ...                   "col1_b": [0, 1, 0], "col2_a": [0, 1, 0],
    ...                   "col2_b": [1, 0, 0], "col2_c": [0, 0, 0]})

    >>> pd.from_dummies(d, dropped_first=["d", "e"])
       C col1 col2
    0  1    a    b
    1  2    b    a
    2  3    d    e

    >>> d = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0],
    ...                   "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0],
    ...                   "col2-c_c": [0, 0, 1]})

    >>> pd.from_dummies(d, prefix_sep={"col1": "_", "col2": "-"})
      col1 col2
    0  a-a  b-b
    1  b-b  a-a
    2  a-a  c-c
    """
    from pandas.core.reshape.concat import concat

    if data.isna().any().any():
        raise ValueError(
            f"Dummy DataFrame contains NA value in column: "
            f"'{data.columns[data.isna().any().argmax()]}'"
        )

Review comment: is this tested?
Reply: Same as above
Review comment: argmax or idxmax?
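For context on the argmax/idxmax question above, the two differ only in what they return for a boolean Series indexed by column labels; a quick sketch with made-up data:

    import pandas as pd

    has_na = pd.Series([False, True, False], index=["col_a", "col_b", "col_c"])
    has_na.argmax()   # 1 -> integer position of the first True
    has_na.idxmax()   # 'col_b' -> index label of the first True

The draft indexes ``data.columns`` with the argmax position, which ends up at the same label.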
    if to_series:
        return _from_dummies_1d(data, dropped_first)

    data_to_decode: DataFrame
    if columns is None:
        columns = data.columns
    elif not is_list_like(columns):
        raise TypeError("Argument for parameter 'columns' must be list-like")
    # index data with a list of all columns that are dummies
    cat_columns = []
    non_cat_columns = []
    for col in columns:
        if any(ps in col for ps in prefix_sep):
            cat_columns.append(col)
        else:
            non_cat_columns.append(col)
    data_to_decode = data[cat_columns].astype("boolean")
    non_cat_data = data[non_cat_columns]

Comment: Working on fixing this I just realized that this is also quite magic. As I now plan to require a prefix separator argument if a separation is required, I could also set the default behaviour to expect dummy-only input data, which leaves two options:
I would choose option 1. as it is more in line with the actual purpose of the function.

    # get separator for each prefix and lists to slice data for each prefix

Review comment: umm this is very complicated. what are you actually trying to do here?
Reply: I want to get all columns that correspond to a specific prefix such that I can extract the values for each block. I do this here to avoid deep nesting (and checking whether or not a column belongs to a prefix) later on, when the value for each entry is determined.

    if isinstance(prefix_sep, dict):
        variables_slice = {prefix: [] for prefix in prefix_sep}
        for col in data_to_decode.columns:
            for prefix in prefix_sep:
                if prefix in col:
                    variables_slice[prefix].append(col)
    else:
        sep_for_prefix = {}
        variables_slice = {}

Review comment: could remove
Reply: Awesome advice, thank you very much :)

        for col in data_to_decode.columns:
            ps = [ps for ps in prefix_sep if ps in col][0]
            prefix = col.split(ps)[0]
            if prefix not in sep_for_prefix:
                sep_for_prefix[prefix] = ps
            if prefix not in variables_slice:
                variables_slice[prefix] = [col]
            else:
                variables_slice[prefix].append(col)
        prefix_sep = sep_for_prefix
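To unpack what the non-dict branch above is doing, here is a stand-alone sketch, with invented column names, of inferring one separator per prefix from a list of candidate separators and grouping the columns by prefix:

    candidate_seps = ["_", "-"]
    columns = ["col1_a", "col1_b", "col2-a", "col2-b"]

    sep_for_prefix = {}
    variables_slice = {}
    for col in columns:
        # the first candidate separator that occurs in the column name wins
        sep = next(s for s in candidate_seps if s in col)
        prefix = col.split(sep)[0]
        sep_for_prefix.setdefault(prefix, sep)
        variables_slice.setdefault(prefix, []).append(col)

    sep_for_prefix    # {'col1': '_', 'col2': '-'}
    variables_slice   # {'col1': ['col1_a', 'col1_b'], 'col2': ['col2-a', 'col2-b']}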
    # validate number of dropped_first
    def check_len(item, name) -> None:
        if not len(item) == len(variables_slice):
            len_msg = (
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({len(variables_slice)})."
            )
            raise ValueError(len_msg)

    if dropped_first:
        if isinstance(dropped_first, dict):
            check_len(dropped_first, "dropped_first")
        elif is_list_like(dropped_first):
            check_len(dropped_first, "dropped_first")
            dropped_first = dict(zip(variables_slice, dropped_first))
        else:
            dropped_first = dict(
                zip(variables_slice, [dropped_first] * len(variables_slice))
            )
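The normalisation above folds the scalar, list and dict forms of ``dropped_first`` into one dict keyed by prefix. In isolation (hypothetical prefixes):

    variables_slice = {"col1": ["col1_a", "col1_b"], "col2": ["col2_a", "col2_b"]}

    # scalar -> the same dropped value for every prefix
    dict(zip(variables_slice, ["x"] * len(variables_slice)))   # {'col1': 'x', 'col2': 'x'}
    # list -> one dropped value per prefix, in order
    dict(zip(variables_slice, ["x", "y"]))                     # {'col1': 'x', 'col2': 'y'}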
    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        cats = [col[len(prefix + prefix_sep[prefix]) :] for col in prefix_slice]
        assigned = data_to_decode[prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                f"Dummy DataFrame contains multi-assignment(s) for prefix: "
                f"'{prefix}' in row {assigned.argmax()}."
            )
        elif any(assigned == 0):
            if dropped_first:
                cats.append(dropped_first[prefix])
            else:
                cats.append("from_dummies_nan_placeholder_string")
            data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1)
        else:
            data_slice = data_to_decode[prefix_slice]
        cat_data[prefix] = data_slice.dot(cats)

Review comment: Couldn't you check this much earlier with a row sum after the conversion to boolean, e.g. if (data_to_decode.sum(1) > 1).any()?
Reply: Hmm, that only works if there are no prefixes/multiple variables, as each prefix slice has to be checked individually and

Review comment: maybe i am missing something, but this loop can overwrite
Reply: The
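The ``data_slice.dot(cats)`` call above is the actual decoding step: multiplying a 0/1 (or boolean) block by a sequence of string labels repeats each label zero or one times, and the row-wise sum concatenates the pieces, so every one-hot row collapses to its single label. A stand-alone sketch of the same trick with plain pandas (made-up data):

    import pandas as pd

    block = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]})
    # 1 * "a" -> "a", 0 * "b" -> "", and the row-wise sum concatenates the pieces
    block.dot(block.columns)
    # 0    a
    # 1    b
    # 2    c
    # dtype: object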
    categorical_df = concat((non_cat_data, DataFrame(cat_data)), axis=1)
    if dropped_first is None:
        categorical_df.replace(
            "from_dummies_nan_placeholder_string", np.nan, inplace=True
        )
    return categorical_df
def _from_dummies_1d(
    data: DataFrame,
    dropped_first: None | str = None,
) -> Series:
    """
    Helper function for from_dummies.

    Handles the conversion of dummy encoded data to a categorical `Series`.
    For parameters and usage see: from_dummies.
    """
    from pandas.core.reshape.concat import concat

    if dropped_first and not isinstance(dropped_first, str):
        raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.")

    data = data.astype("boolean")
    cats = data.columns.tolist()
    assigned = data.sum(axis=1)
    if any(assigned > 1):
        raise ValueError(
            f"Dummy DataFrame contains multi-assignment in row {assigned.argmax()}."
        )
    elif any(assigned == 0):
        if dropped_first:
            cats.append(dropped_first)
        else:
            cats.append("from_dummies_nan_placeholder_string")
        data = concat((data, assigned == 0), axis=1)

    categorical_series = data.dot(cats)
    if dropped_first is None:
        categorical_series.replace(
            "from_dummies_nan_placeholder_string", np.nan, inplace=True
        )
    return categorical_series
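To see why the helper appends a placeholder category for all-zero rows, here is a compact sketch of the same idea with plain pandas (the ``placeholder`` name is made up; the draft uses its own sentinel string):

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})   # the last row is all zero
    # add a column that is 1 exactly where no category is assigned,
    # so every row has exactly one 1 before the dot product
    augmented = data.assign(placeholder=(data.sum(axis=1) == 0).astype(int))
    augmented.dot(augmented.columns).replace("placeholder", np.nan)
    # 0      a
    # 1      b
    # 2    NaN
    # dtype: object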
def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
Review comment: add a versionadded 1.4.0 tag

Review comment: i find the parens a bit weird here, can you either say (or from a categorical 'Series') or drop them
Reply: Will remove all the Series (and to_series) related parts.