first draft

MarcoGorelli · MarcoGorelli · commit efbe53ff6813 · 2020-02-07T21:58:03.000Z
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -42,7 +42,7 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
--
+- Added :func:`pandas.from_dummies`, the inverse transformation of :func:`pandas.get_dummies` (:issue:`8745`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -135,6 +135,7 @@
     get_dummies,
     cut,
     qcut,
+    from_dummies,
 )
 
 import pandas.api
diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py
@@ -4,5 +4,5 @@
 from pandas.core.reshape.melt import lreshape, melt, wide_to_long
 from pandas.core.reshape.merge import merge, merge_asof, merge_ordered
 from pandas.core.reshape.pivot import crosstab, pivot, pivot_table
-from pandas.core.reshape.reshape import get_dummies
+from pandas.core.reshape.reshape import from_dummies, get_dummies
 from pandas.core.reshape.tile import cut, qcut
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -751,6 +751,138 @@ def _convert_level_number(level_num, columns):
     return result
 
 
+def from_dummies(data, columns=None, prefix_sep="_", dtype="category", fill_first=None):
+    """
+    The inverse transformation of ``pandas.get_dummies``.
+
+    Parameters
+    ----------
+    data : DataFrame
+        Frame containing the dummy columns; it is left unmodified.
+    columns : list-like, default None
+        Prefixes (original column names) of the dummy columns to decode.
+        If `columns` is None, every prefix found before `prefix_sep` is used.
+    prefix_sep : str, default '_'
+        Separator between original column name and dummy variable
+    dtype : dtype, default 'category'
+        Data dtype for new columns - only a single data type is allowed
+    fill_first : str, list, or dict, default None
+        Used to fill rows for which all the dummy variables are 0
+
+    Returns
+    -------
+    transformed : DataFrame
+
+    Raises
+    ------
+    ValueError
+        If a row of a dummy group does not sum to 0 or 1, or if a list-like
+        `fill_first` differs in length from `columns`.
+
+    Examples
+    --------
+    Say we have a dataframe where some variables have been dummified:
+
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "animal_baboon": [0, 0, 1],
+    ...         "animal_lemur": [0, 1, 0],
+    ...         "animal_zebra": [1, 0, 0],
+    ...         "other_col": ["a", "b", "c"],
+    ...     }
+    ... )
+    >>> df
+       animal_baboon  animal_lemur  animal_zebra other_col
+    0              0             0             1         a
+    1              0             1             0         b
+    2              1             0             0         c
+
+    We can recover the original dataframe using `from_dummies`:
+
+    >>> pd.from_dummies(df, columns=['animal'])
+      other_col  animal
+    0         a   zebra
+    1         b   lemur
+    2         c  baboon
+
+    Suppose one level of each dummified column was dropped:
+
+    >>> df = df.drop('animal_zebra', axis=1)
+    >>> df
+       animal_baboon  animal_lemur other_col
+    0              0             0         a
+    1              0             1         b
+    2              1             0         c
+
+    `fill_first` names the dropped level so the original can be recovered:
+
+    >>> pd.from_dummies(df, columns=["animal"], fill_first=["zebra"])
+      other_col  animal
+    0         a   zebra
+    1         b   lemur
+    2         c  baboon
+    """
+    dtype = "category" if dtype is None else dtype
+
+    if columns is None:
+        # dict.fromkeys de-duplicates the prefixes while keeping first-seen order
+        names = [i for i in data.columns if isinstance(i, str) and prefix_sep in i]
+        columns = list(dict.fromkeys(i.split(prefix_sep)[0] for i in names))
+
+    if fill_first is None:
+        fill_first = [None] * len(columns)
+    elif isinstance(fill_first, str):
+        fill_first = itertools.cycle([fill_first])
+    elif isinstance(fill_first, dict):
+        fill_first = [fill_first[col] for col in columns]
+    elif len(fill_first) != len(columns):
+        raise ValueError(
+            f"Length of 'fill_first' ({len(fill_first)}) did not match the "
+            f"length of the columns being decoded ({len(columns)})."
+        )
+
+    out = data.copy()
+    for column, fill_first_ in zip(columns, fill_first):
+        stem = column + prefix_sep
+        cols = [i for i in data.columns if isinstance(i, str) and i.startswith(stem)]
+        if not cols:
+            continue
+        labels = [i[len(stem) :] for i in cols]
+        dummies = data[cols]
+        # Validate each variable on its own: rows of a frame holding several
+        # dummified variables legitimately sum to more than 1 overall.
+        if not dummies.sum(axis=1).isin([0, 1]).all():
+            raise ValueError(
+                "Data cannot be decoded! Each row must contain only 0s and"
+                " 1s, and each row may have at most one 1"
+            )
+        codes = dummies.to_numpy()
+        if fill_first_:
+            # Rebuild the dropped level locally so the caller's `data` is untouched.
+            labels = [fill_first_] + labels
+            codes = np.column_stack([(codes == 0).all(axis=1), codes])
+        out = out.drop(cols, axis=1)
+        out[column] = Series(
+            np.array(labels)[np.argmax(codes, axis=1)], dtype=dtype, index=data.index
+        )
+    return out
+
+
+def _check_len(item, name, data_to_encode):
+    """Ensure a list-like `item` has one entry per column being encoded."""
+    # Values that are not list-like are not length-checked.
+    if not is_list_like(item):
+        return
+
+    n_item = len(item)
+    n_encoded = data_to_encode.shape[1]
+    if n_item != n_encoded:
+        raise ValueError(
+            f"Length of '{name}' ({n_item}) did not match the "
+            f"length of the columns being encoded ({n_encoded})."
+        )
+
+
 def get_dummies(
     data,
     prefix=None,
@@ -871,20 +1003,8 @@ def get_dummies(
         else:
             data_to_encode = data[columns]
 
-        # validate prefixes and separator to avoid silently dropping cols
-        def check_len(item, name):
-
-            if is_list_like(item):
-                if not len(item) == data_to_encode.shape[1]:
-                    len_msg = (
-                        f"Length of '{name}' ({len(item)}) did not match the "
-                        "length of the columns being encoded "
-                        f"({data_to_encode.shape[1]})."
-                    )
-                    raise ValueError(len_msg)
-
-        check_len(prefix, "prefix")
-        check_len(prefix_sep, "prefix_sep")
+        _check_len(prefix, "prefix", data_to_encode)
+        _check_len(prefix_sep, "prefix_sep", data_to_encode)
 
         if isinstance(prefix, str):
             prefix = itertools.cycle([prefix])
diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py
@@ -0,0 +1,63 @@
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "dtype, expected_dict",
+    [
+        ("str", {"col1": ["a", "a", "b"]}),
+        (str, {"col1": ["a", "a", "b"]}),
+        (None, {"col1": ["a", "a", "b"]}),
+    ],
+)
+def test_dtype(dtype, expected_dict):
+    # dtype=None must behave like the default dtype, "category".
+    dummies = pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]})
+    expected = pd.DataFrame(expected_dict)
+    if dtype is None:
+        expected = expected.astype("category")
+    tm.assert_frame_equal(pd.from_dummies(dummies, dtype=dtype), expected)
+
+
+@pytest.mark.parametrize(
+    "fill_first, expected_dict",
+    [
+        ("a", {"col1": ["a", "a", "b"]}),
+        (["a"], {"col1": ["a", "a", "b"]}),
+        ({"col1": "a"}, {"col1": ["a", "a", "b"]}),
+    ],
+)
+def test_fill_first(fill_first, expected_dict):
+    # Only the "b" dummy is present; rows where it is 0 must decode to the
+    # level named by `fill_first`, whichever form (str, list, dict) it takes.
+    dummies = pd.DataFrame({"col1_b": [0, 0, 1]})
+    expected = pd.DataFrame(expected_dict, dtype="category")
+    decoded = pd.from_dummies(dummies, fill_first=fill_first)
+    tm.assert_frame_equal(decoded, expected)
+
+
+def test_malformed():
+    # The first row sets both col1 dummies, so it cannot be decoded.
+    expected_msg = (
+        "Data cannot be decoded! Each row must contain only 0s and 1s"
+        ", and each row may have at most one 1"
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        pd.from_dummies(pd.DataFrame({"col1_a": [1, 1, 0], "col1_b": [1, 0, 1]}))
+
+
+@pytest.mark.parametrize(
+    "prefix_sep, input_dict",
+    [
+        ("_", {"col1_a": [1, 1, 0], "col1_b": [0, 0, 1]}),
+        ("*", {"col1*a": [1, 1, 0], "col1*b": [0, 0, 1]}),
+        (".", {"col1.a": [1, 1, 0], "col1.b": [0, 0, 1]}),
+    ],
+)
+def test_prefix_sep(prefix_sep, input_dict):
+    # The decoded frame must not depend on which separator joined the names.
+    expected = pd.DataFrame({"col1": ["a", "a", "b"]}, dtype="category")
+    decoded = pd.from_dummies(pd.DataFrame(input_dict), prefix_sep=prefix_sep)
+    tm.assert_frame_equal(decoded, expected)

Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ Other enhancements`
`42`	`42`	`^^^^^^^^^^^^^^^^^^`
`43`	`43`
`44`	`44`	- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
`45`		`--`
	`45`	+- Added :func:`pandas.from_dummies`, the inverse transformation of :func:`pandas.get_dummies` (:issue:`8745`)
`46`	`46`	`-`
`47`	`47`
`48`	`48`	`.. ---------------------------------------------------------------------------`
Original file line number	Diff line number	Diff line change
`@@ -135,6 +135,7 @@`
`135`	`135`	`get_dummies,`
`136`	`136`	`cut,`
`137`	`137`	`qcut,`
	`138`	`+ from_dummies,`
`138`	`139`	`)`
`139`	`140`
`140`	`141`	`import pandas.api`