diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst index a42d54b7e50ef..f82d9c9a6482c 100644 --- a/doc/source/reference/general_functions.rst +++ b/doc/source/reference/general_functions.rst @@ -23,6 +23,7 @@ Data manipulations merge_asof concat get_dummies + from_dummies factorize unique wide_to_long diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b24890564d1bf..adca9de6c130a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -706,6 +706,30 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes +.. versionadded:: 1.5.0 + +To convert a "dummy" or "indicator" ``DataFrame`` back into a categorical ``DataFrame``, +use :func:`~pandas.from_dummies`. For example, ``k`` columns of a ``DataFrame`` +containing 1s and 0s are decoded into a single column that takes ``k`` distinct +values: + +.. ipython:: python + + df = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) + df + + pd.from_dummies(df, sep="_") + +Dummy coded data only requires ``k - 1`` categories to be included. In this case +the ``k``-th category is the default category, implied by a row not being assigned any +of the other ``k - 1`` categories; it can be passed via ``default_category``. + +.. ipython:: python + + df = pd.DataFrame({"prefix_a": [0, 1, 0]}) + df + + pd.from_dummies(df, sep="_", default_category="b") .. _reshaping.factorize: diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a76b682f135db..5312d2db73000 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -100,6 +100,25 @@ as seen in the following example. 1 2021-01-02 08:00:00 4 2 2021-01-02 16:00:00 5 +.. _whatsnew_150.enhancements.from_dummies: + +from_dummies +^^^^^^^^^^^^ + +Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`.
+ +Example: + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1]}) + + pd.from_dummies(df, sep="_") + .. _whatsnew_150.enhancements.orc: Writing to ORC files diff --git a/pandas/__init__.py b/pandas/__init__.py index 3645e8744d8af..1c4151372273b 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -128,6 +128,7 @@ pivot, pivot_table, get_dummies, + from_dummies, cut, qcut, ) @@ -361,6 +362,7 @@ def __getattr__(name): "eval", "factorize", "get_dummies", + "from_dummies", "get_option", "infer_freq", "interval_range", diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 7226c57cc27d8..f100cca5c7615 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,7 +1,10 @@ # flake8: noqa:F401 from pandas.core.reshape.concat import concat -from pandas.core.reshape.encoding import get_dummies +from pandas.core.reshape.encoding import ( + from_dummies, + get_dummies, +) from pandas.core.reshape.melt import ( lreshape, melt, diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index f0500ec142955..fc908a5648885 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -1,6 +1,8 @@ from __future__ import annotations +from collections import defaultdict import itertools +from typing import Hashable import numpy as np @@ -68,6 +70,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. + :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's (or boolean columns).
    sep : str, default None
        Separator used in the column names of the dummy categories: the
        character(s) separating the prefix (the name of the decoded variable)
        from the category name. For example, if your column names are
        'prefix_A' and 'prefix_B', you can strip the prefix and underscore by
        specifying sep='_'. If None, all columns are treated as the categories
        of one single variable, which is returned in a column named ``""``.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data, with one object
        column per prefix and the same index as ``data``.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    # Imported locally, as in the original block, so the function does not
    # depend on a module-level import of ``concat``.
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # Compute the per-column NA flags once and reuse them for the message.
    na_columns = data.isna().any()
    if na_columns.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{na_columns.idxmax()}'"
        )

    # Casting to the nullable boolean dtype doubles as validation: values
    # that are not bool-like make the cast raise TypeError.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError as err:
        raise TypeError("Passed DataFrame contains non-dummy data") from err

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            # A non-string label cannot contain the separator; report it the
            # same way instead of leaking an AttributeError from ``split``.
            if not isinstance(col, str):
                raise ValueError(f"Separator not specified for column: {col}")
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Broadcast a single default to every prefix.
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]

        dummies = data_to_decode.loc[:, prefix_slice]
        assigned = dummies.sum(axis=1)

        # Plain numpy masks keep the checks vectorised and let us report the
        # FIRST offending row rather than the row with the largest count.
        multi_assigned = (assigned > 1).to_numpy(dtype=bool)
        if multi_assigned.any():
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.index[multi_assigned.argmax()]}"
            )

        unassigned = assigned == 0
        unassigned_mask = unassigned.to_numpy(dtype=bool)
        if unassigned_mask.any():
            if not isinstance(default_category, dict):
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.index[unassigned_mask.argmax()]}"
                )
            if prefix not in default_category:
                # Documented as ValueError; do not leak a bare KeyError.
                raise ValueError(
                    "'default_category' does not include an implied category "
                    f"for prefix: '{prefix}'"
                )
            cats.append(default_category[prefix])
            # Extra indicator column that is True exactly where the default
            # category applies, matching the category appended above.
            data_slice = concat((dummies, unassigned), axis=1)
        else:
            data_slice = dummies

        cats_array = np.array(cats, dtype="object")
        # Exactly one True per row at this point, so the column indices
        # returned by nonzero() are ordered by row.
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    # Keep the caller's row labels instead of resetting to a RangeIndex.
    return DataFrame(cat_data, index=data.index)
"col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + + +def test_error_wrong_data_type(): + dummies = [0, 1, 0] + with pytest.raises( + TypeError, + match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list", + ): + from_dummies(dummies) + + +def test_error_no_prefix_contains_unassigned(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains unassigned value\(s\); " + r"First instance in row: 2" + ), + ): + from_dummies(dummies) + + +def test_error_no_prefix_wrong_default_category_type(): + dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) + with pytest.raises( + TypeError, + match=( + r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'default_category' of type: list" + ), + ): + from_dummies(dummies, default_category=["c", "d"]) + + +def test_error_no_prefix_multi_assignment(): + dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains multi-assignment\(s\); " + r"First instance in row: 2" + ), + ): + from_dummies(dummies) + + +def test_error_no_prefix_contains_nan(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" + ): + from_dummies(dummies) + + +def test_error_contains_non_dummies(): + dummies = DataFrame( + {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} + ) + with pytest.raises( + TypeError, + match=r"Passed DataFrame contains non-dummy data", + ): + from_dummies(dummies) + + +def test_error_with_prefix_multiple_seperators(): + dummies = DataFrame( + { + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2-a": [0, 1, 0], + "col2-b": [1, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=(r"Separator not specified for column: col2-a"), + ): + from_dummies(dummies, sep="_") + + +def 
test_error_with_prefix_sep_wrong_type(dummies_basic): + + with pytest.raises( + TypeError, + match=( + r"Expected 'sep' to be of type 'str' or 'None'; " + r"Received 'sep' of type: list" + ), + ): + from_dummies(dummies_basic, sep=["_"]) + + +def test_error_with_prefix_contains_unassigned(dummies_with_unassigned): + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains unassigned value\(s\); " + r"First instance in row: 2" + ), + ): + from_dummies(dummies_with_unassigned, sep="_") + + +def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned): + with pytest.raises( + TypeError, + match=( + r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'default_category' of type: list" + ), + ): + from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"]) + + +def test_error_with_prefix_default_category_dict_not_complete( + dummies_with_unassigned, +): + with pytest.raises( + ValueError, + match=( + r"Length of 'default_category' \(1\) did not match " + r"the length of the columns being encoded \(2\)" + ), + ): + from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"}) + + +def test_error_with_prefix_contains_nan(dummies_basic): + dummies_basic["col2_c"][2] = np.nan + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" + ): + from_dummies(dummies_basic, sep="_") + + +def test_error_with_prefix_contains_non_dummies(dummies_basic): + dummies_basic["col2_c"][2] = "str" + with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"): + from_dummies(dummies_basic, sep="_") + + +def test_error_with_prefix_double_assignment(): + dummies = DataFrame( + { + "col1_a": [1, 0, 1], + "col1_b": [1, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains multi-assignment\(s\); " + r"First instance in row: 0" + 
), + ): + from_dummies(dummies, sep="_") + + +def test_roundtrip_series_to_dataframe(): + categories = Series(["a", "b", "c", "a"]) + dummies = get_dummies(categories) + result = from_dummies(dummies) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + tm.assert_frame_equal(result, expected) + + +def test_roundtrip_single_column_dataframe(): + categories = DataFrame({"": ["a", "b", "c", "a"]}) + dummies = get_dummies(categories) + result = from_dummies(dummies, sep="_") + expected = categories + tm.assert_frame_equal(result, expected) + + +def test_roundtrip_with_prefixes(): + categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) + dummies = get_dummies(categories) + result = from_dummies(dummies, sep="_") + expected = categories + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_basic(): + dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_basic_bool_values(): + dummies = DataFrame( + { + "a": [True, False, False, True], + "b": [False, True, False, False], + "c": [False, False, True, False], + } + ) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_basic_mixed_bool_values(): + dummies = DataFrame( + {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]} + ) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_int_cats_basic(): + dummies = DataFrame( + {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]} + ) + expected = DataFrame({"": [1, 25, 2, 5]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_float_cats_basic(): + 
dummies = DataFrame( + {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]} + ) + expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_mixed_cats_basic(): + dummies = DataFrame( + { + 1.23: [1, 0, 0, 0, 0], + "c": [0, 1, 0, 0, 0], + 2: [0, 0, 1, 0, 0], + False: [0, 0, 0, 1, 0], + None: [0, 0, 0, 0, 1], + } + ) + expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_contains_get_dummies_NaN_column(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) + expected = DataFrame({"": ["a", "b", "NaN"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "default_category, expected", + [ + pytest.param( + "c", + DataFrame({"": ["a", "b", "c"]}), + id="default_category is a str", + ), + pytest.param( + 1, + DataFrame({"": ["a", "b", 1]}), + id="default_category is a int", + ), + pytest.param( + 1.25, + DataFrame({"": ["a", "b", 1.25]}), + id="default_category is a float", + ), + pytest.param( + 0, + DataFrame({"": ["a", "b", 0]}), + id="default_category is a 0", + ), + pytest.param( + False, + DataFrame({"": ["a", "b", False]}), + id="default_category is a bool", + ), + pytest.param( + (1, 2), + DataFrame({"": ["a", "b", (1, 2)]}), + id="default_category is a tuple", + ), + ], +) +def test_no_prefix_string_cats_default_category(default_category, expected): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) + result = from_dummies(dummies, default_category=default_category) + tm.assert_frame_equal(result, expected) + + +def test_with_prefix_basic(dummies_basic): + expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) + result = from_dummies(dummies_basic, sep="_") + tm.assert_frame_equal(result, expected) + + 
+def test_with_prefix_contains_get_dummies_NaN_column(): + dummies = DataFrame( + { + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col1_NaN": [0, 0, 1], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + "col2_NaN": [1, 0, 0], + }, + ) + expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]}) + result = from_dummies(dummies, sep="_") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "default_category, expected", + [ + pytest.param( + "x", + DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}), + id="default_category is a str", + ), + pytest.param( + 0, + DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}), + id="default_category is a 0", + ), + pytest.param( + False, + DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}), + id="default_category is a False", + ), + pytest.param( + {"col2": 1, "col1": 2.5}, + DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}), + id="default_category is a dict with int and float values", + ), + pytest.param( + {"col2": None, "col1": False}, + DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}), + id="default_category is a dict with bool and None values", + ), + pytest.param( + {"col2": (1, 2), "col1": [1.25, False]}, + DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}), + id="default_category is a dict with list and tuple values", + ), + ], +) +def test_with_prefix_default_category( + dummies_with_unassigned, default_category, expected +): + result = from_dummies( + dummies_with_unassigned, sep="_", default_category=default_category + ) + tm.assert_frame_equal(result, expected)