From f3e6afef545380317cea849a714394d3d2db64e9 Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 9 Jun 2021 17:41:23 +0100 Subject: [PATCH 01/95] Initial draft: from_dummies --- pandas/core/reshape/reshape.py | 162 ++++++++ pandas/tests/reshape/test_from_dummies.py | 441 ++++++++++++++++++++++ 2 files changed, 603 insertions(+) create mode 100644 pandas/tests/reshape/test_from_dummies.py diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 93859eb11dd44..f56e2a32e4156 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1053,6 +1053,168 @@ def get_empty_frame(data) -> DataFrame: return DataFrame(dummy_mat, index=index, columns=dummy_cols) +def from_dummies( + data, + to_series: bool = False, + variables: None | str | list[str] | dict[str, str] = None, + prefix_sep: str | list[str] | dict[str, str] = "_", + dummy_na: bool = False, + columns: None | list[str] = None, + dropped_first: None | str | list[str] | dict[str, str] = None, +) -> Series | DataFrame: + """ + soon + """ + from pandas.core.reshape.concat import concat + + if to_series: + return _from_dummies_1d(data, dummy_na, dropped_first) + + data_to_decode: DataFrame + if columns is None: + # index data with a list of all columns that are dummies + cat_columns = [] + non_cat_columns = [] + for col in data.columns: + if any(ps in col for ps in prefix_sep): + cat_columns.append(col) + else: + non_cat_columns.append(col) + data_to_decode = data[cat_columns] + non_cat_data = data[non_cat_columns] + elif not is_list_like(columns): + raise TypeError("Input must be a list-like for parameter 'columns'") + else: + data_to_decode = data[columns] + non_cat_data = data[[col for col in data.columns if col not in columns]] + + # get separator for each prefix and lists to slice data for each prefix + if isinstance(prefix_sep, dict): + variables_slice = {prefix: [] for prefix in prefix_sep} + for col in data_to_decode.columns: + for prefix in prefix_sep: + if prefix 
in col: + variables_slice[prefix].append(col) + else: + sep_for_prefix = {} + variables_slice = {} + for col in data_to_decode.columns: + ps = [ps for ps in prefix_sep if ps in col][0] + prefix = col.split(ps)[0] + if prefix not in sep_for_prefix: + sep_for_prefix[prefix] = ps + if prefix not in variables_slice: + variables_slice[prefix] = [col] + else: + variables_slice[prefix].append(col) + prefix_sep = sep_for_prefix + + # validate number of passed arguments + def check_len(item, name) -> None: + if not len(item) == len(variables_slice): + len_msg = ( + f"Length of '{name}' ({len(item)}) did not match the " + "length of the columns being encoded " + f"({len(variables_slice)})." + ) + raise ValueError(len_msg) + + # obtain prefix to category mapping + variables: dict[str, str] + if isinstance(variables, dict): + check_len(variables, "variables") + variables = variables + elif is_list_like(variables): + check_len(variables, "variables") + variables = dict(zip(variables_slice, variables)) + elif isinstance(variables, str): + variables = dict( + zip( + variables_slice, + (f"{variables}{i}" for i in range(len(variables_slice))), + ) + ) + else: + variables = dict(zip(variables_slice, variables_slice)) + + if dropped_first: + if isinstance(dropped_first, dict): + check_len(dropped_first, "dropped_first") + elif is_list_like(dropped_first): + check_len(dropped_first, "dropped_first") + dropped_first = dict(zip(variables_slice, dropped_first)) + else: + dropped_first = dict( + zip(variables_slice, [dropped_first] * len(variables_slice)) + ) + + cat_data = {var: [] for _, var in variables.items()} + for index, row in data.iterrows(): + for prefix, prefix_slice in variables_slice.items(): + slice_sum = row[prefix_slice].sum() + if slice_sum > 1: + raise ValueError( + f"Dummy DataFrame contains multi-assignment(s) for prefix: " + f"'{prefix}' in row {index}." 
+ ) + elif slice_sum == 0: + if dropped_first: + category = dropped_first[prefix] + elif not dummy_na: + category = np.nan + else: + raise ValueError( + f"Dummy DataFrame contains no assignment for prefix: " + f"'{prefix}' in row {index}." + ) + else: + cat_index = row[prefix_slice].argmax() + category = prefix_slice[cat_index].split(prefix_sep[prefix])[1] + if dummy_na and category == "NaN": + category = np.nan + cat_data[variables[prefix]].append(category) + + if columns: + return DataFrame(cat_data) + else: + return concat([non_cat_data, DataFrame(cat_data)], axis=1) + + +def _from_dummies_1d( + data, + dummy_na: bool = False, + dropped_first: None | str = None, +) -> Series: + """ + soon + """ + if dropped_first and not isinstance(dropped_first, str): + raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.") + + cat_data = [] + for index, row in data.iterrows(): + row_sum = row.sum() + if row_sum > 1: + raise ValueError( + f"Dummy DataFrame contains multi-assignment in row {index}." + ) + elif row_sum == 0: + if dropped_first: + category = dropped_first + elif not dummy_na: + category = np.nan + else: + raise ValueError( + f"Dummy DataFrame contains no assignment in row {index}." 
+ ) + else: + category = data.columns[row.argmax()] + if dummy_na and category == "NaN": + category = np.nan + cat_data.append(category) + return Series(cat_data) + + def _reorder_for_extension_array_stack( arr: ExtensionArray, n_rows: int, n_columns: int ) -> ExtensionArray: diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py new file mode 100644 index 0000000000000..da6ff875bb846 --- /dev/null +++ b/pandas/tests/reshape/test_from_dummies.py @@ -0,0 +1,441 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +from pandas.core.reshape.reshape import from_dummies + + +def test_from_dummies_to_series_basic(): + dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) + expected = Series(list("abca")) + result = from_dummies(dummies, to_series=True) + assert all(result == expected) + + +def test_from_dummies_to_series_dummy_na(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) + expected = Series(["a", "b", np.nan]) + result = from_dummies(dummies, to_series=True, dummy_na=True) + assert all(result[:2] == expected[:2]) + assert result[2] is expected[2] + + +def test_from_dummies_to_series_contains_nan(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) + expected = Series(["a", "b", np.nan]) + result = from_dummies(dummies, to_series=True) + assert all(result[:2] == expected[:2]) + assert result[2] is expected[2] + + +def test_from_dummies_to_series_dropped_first(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) + expected = Series(["a", "b", "c"]) + result = from_dummies(dummies, to_series=True, dropped_first="c") + assert all(result == expected) + + +def test_from_dummies_to_series_wrong_dropped_first(): + dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) + with pytest.raises( + ValueError, + match=r"Only one dropped first value possible in 1D dummy DataFrame.", + ): + from_dummies(dummies, to_series=True, 
dropped_first=["c", "d"]) + + +def test_from_dummies_to_series_multi_assignment(): + dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains multi-assignment in row 2." + ): + from_dummies(dummies, to_series=True) + + +def test_from_dummies_to_series_unassigned_row(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains no assignment in row 2." + ): + from_dummies(dummies, to_series=True, dummy_na=True) + + +def test_from_dummies_no_dummies(): + dummies = DataFrame( + {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} + ) + expected = dummies + result = from_dummies(dummies) + assert all(result == expected) + + +def test_from_dummies_to_df_basic(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} + ) + result = from_dummies(dummies) + assert all(result == expected) + + +def test_from_dummies_to_df_variable_string(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "varname0": ["a", "b", "a"], "varname1": ["b", "a", "c"]} + ) + result = from_dummies(dummies, variables="varname") + assert all(result == expected) + + +def test_from_dummies_to_df_variable_list(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "a", "c"]}) + result = from_dummies(dummies, variables=["A", "B"]) + assert all(result == expected) + + +def 
test_from_dummies_to_df_variable_list_not_complete(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Length of 'variables' \(1\) did not match " + r"the length of the columns being encoded \(2\)." + ), + ): + from_dummies(dummies, variables=["A"]) + + +def test_from_dummies_to_df_variable_dict(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame({"C": [1, 2, 3], "A": ["b", "a", "c"], "B": ["a", "b", "a"]}) + result = from_dummies(dummies, variables={"col2": "A", "col1": "B"}) + assert all(result == expected) + + +def test_from_dummies_to_df_variable_dict_not_complete(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Length of 'variables' \(1\) did not match " + r"the length of the columns being encoded \(2\)." 
+ ), + ): + from_dummies(dummies, variables={"col1": "A"}) + + +def test_from_dummies_to_df_prefix_sep_list(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2-a": [0, 1, 0], + "col2-b": [1, 0, 0], + "col2-c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} + ) + result = from_dummies(dummies, prefix_sep=["_", "-"]) + assert all(result == expected) + + +def test_from_dummies_to_df_prefix_sep_dict(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a-a": [1, 0, 1], + "col1_b-a": [0, 1, 0], + "col2-a_a": [0, 1, 0], + "col2-b_b": [1, 0, 0], + "col2-c_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a-a", "b-b", "a-a"], "col2": ["b_b", "a_a", "c_c"]} + ) + result = from_dummies( + dummies, + prefix_sep={ + "col1": "_", + "col2": "-", + }, + ) + assert all(result == expected) + + +def test_from_dummies_to_df_dummy_na(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col1_NaN": [0, 0, 1], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + "col2_NAN": [1, 0, 0], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} + ) + result = from_dummies(dummies, dummy_na=True) + assert all(result["C"] == expected["C"]) + assert all(result["col1"][:2] == expected["col1"][:2]) + assert all(result["col2"][1:] == expected["col2"][1:]) + assert result["col1"][2] is expected["col1"][2] + assert result["col2"][1] is expected["col2"][1] + + +def test_from_dummies_to_df_contains_nan(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} + ) + result = from_dummies(dummies) + assert all(result["C"] == expected["C"]) + 
assert all(result["col1"][:2] == expected["col1"][:2]) + assert all(result["col2"][1:] == expected["col2"][1:]) + assert result["col1"][2] is expected["col1"][2] + assert result["col2"][1] is expected["col2"][1] + + +def test_from_dummies_to_df_columns(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) + result = from_dummies( + dummies, columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"] + ) + assert all(result == expected) + + +def test_from_dummies_to_df_dropped_first_str(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["x", "a", "c"]} + ) + result = from_dummies(dummies, dropped_first="x") + assert all(result == expected) + + +def test_from_dummies_to_df_dropped_first_list(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["y", "a", "c"]} + ) + result = from_dummies(dummies, dropped_first=["x", "y"]) + assert all(result == expected) + + +def test_from_dummies_to_df_dropped_first_list_not_complete(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Length of 'dropped_first' \(1\) did not match " + r"the length of the columns being encoded \(2\)." 
+ ), + ): + from_dummies(dummies, dropped_first=["x"]) + + +def test_from_dummies_to_df_dropped_first_dict(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "y"], "col2": ["x", "a", "c"]} + ) + result = from_dummies(dummies, dropped_first={"col2": "x", "col1": "y"}) + assert all(result == expected) + + +def test_from_dummies_to_df_dropped_first_dict_not_complete(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Length of 'dropped_first' \(1\) did not match " + r"the length of the columns being encoded \(2\)." + ), + ): + from_dummies(dummies, dropped_first={"col1": "x"}) + + +def test_from_dummies_to_df_wrong_column_type(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + TypeError, + match=r"Input must be a list-like for parameter 'columns'", + ): + from_dummies(dummies, columns="col1_a") + + +def test_from_dummies_to_df_double_assignment(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [1, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains multi-assignment\(s\) for prefix: " + r"'col1' in row 0." 
+ ), + ): + from_dummies(dummies) + + +def test_from_dummies_to_df_no_assignment(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [0, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=r"Dummy DataFrame contains no assignment for prefix: 'col1' in row 0.", + ): + from_dummies(dummies, dummy_na=True) From c7c5588cdbf0b956e2baebb0b0843f40c4f0ebe2 Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 9 Jun 2021 19:16:41 +0100 Subject: [PATCH 02/95] Clean-up tests with fixtures --- pandas/tests/reshape/test_from_dummies.py | 244 ++++++---------------- 1 file changed, 62 insertions(+), 182 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da6ff875bb846..ee6cb70e386fa 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -8,6 +8,34 @@ from pandas.core.reshape.reshape import from_dummies +@pytest.fixture +def dummies_basic(): + return DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + + +@pytest.fixture +def dummies_with_unassigned(): + return DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + + def test_from_dummies_to_series_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) expected = Series(list("abca")) @@ -72,69 +100,29 @@ def test_from_dummies_no_dummies(): assert all(result == expected) -def test_from_dummies_to_df_basic(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_basic(dummies_basic): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", 
"b", "a"], "col2": ["b", "a", "c"]} ) - result = from_dummies(dummies) + result = from_dummies(dummies_basic) assert all(result == expected) -def test_from_dummies_to_df_variable_string(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_variable_string(dummies_basic): expected = DataFrame( {"C": [1, 2, 3], "varname0": ["a", "b", "a"], "varname1": ["b", "a", "c"]} ) - result = from_dummies(dummies, variables="varname") + result = from_dummies(dummies_basic, variables="varname") assert all(result == expected) -def test_from_dummies_to_df_variable_list(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_variable_list(dummies_basic): expected = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "a", "c"]}) - result = from_dummies(dummies, variables=["A", "B"]) + result = from_dummies(dummies_basic, variables=["A", "B"]) assert all(result == expected) -def test_from_dummies_to_df_variable_list_not_complete(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_variable_list_not_complete(dummies_basic): with pytest.raises( ValueError, match=( @@ -142,36 +130,16 @@ def test_from_dummies_to_df_variable_list_not_complete(): r"the length of the columns being encoded \(2\)." 
), ): - from_dummies(dummies, variables=["A"]) + from_dummies(dummies_basic, variables=["A"]) -def test_from_dummies_to_df_variable_dict(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_variable_dict(dummies_basic): expected = DataFrame({"C": [1, 2, 3], "A": ["b", "a", "c"], "B": ["a", "b", "a"]}) - result = from_dummies(dummies, variables={"col2": "A", "col1": "B"}) + result = from_dummies(dummies_basic, variables={"col2": "A", "col1": "B"}) assert all(result == expected) -def test_from_dummies_to_df_variable_dict_not_complete(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_variable_dict_not_complete(dummies_basic): with pytest.raises( ValueError, match=( @@ -179,7 +147,7 @@ def test_from_dummies_to_df_variable_dict_not_complete(): r"the length of the columns being encoded \(2\)." 
), ): - from_dummies(dummies, variables={"col1": "A"}) + from_dummies(dummies_basic, variables={"col1": "A"}) def test_from_dummies_to_df_prefix_sep_list(): @@ -248,21 +216,11 @@ def test_from_dummies_to_df_dummy_na(): assert result["col2"][1] is expected["col2"][1] -def test_from_dummies_to_df_contains_nan(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_contains_nan(dummies_with_unassigned): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} ) - result = from_dummies(dummies) + result = from_dummies(dummies_with_unassigned) assert all(result["C"] == expected["C"]) assert all(result["col1"][:2] == expected["col1"][:2]) assert all(result["col2"][1:] == expected["col2"][1:]) @@ -270,71 +228,31 @@ def test_from_dummies_to_df_contains_nan(): assert result["col2"][1] is expected["col2"][1] -def test_from_dummies_to_df_columns(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_columns(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies( - dummies, columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"] + dummies_basic, columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"] ) assert all(result == expected) -def test_from_dummies_to_df_dropped_first_str(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_dropped_first_str(dummies_with_unassigned): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["x", "a", "c"]} ) - result = from_dummies(dummies, dropped_first="x") + result = 
from_dummies(dummies_with_unassigned, dropped_first="x") assert all(result == expected) -def test_from_dummies_to_df_dropped_first_list(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["y", "a", "c"]} ) - result = from_dummies(dummies, dropped_first=["x", "y"]) + result = from_dummies(dummies_with_unassigned, dropped_first=["x", "y"]) assert all(result == expected) -def test_from_dummies_to_df_dropped_first_list_not_complete(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_dropped_first_list_not_complete(dummies_with_unassigned): with pytest.raises( ValueError, match=( @@ -342,38 +260,20 @@ def test_from_dummies_to_df_dropped_first_list_not_complete(): r"the length of the columns being encoded \(2\)." 
), ): - from_dummies(dummies, dropped_first=["x"]) + from_dummies(dummies_with_unassigned, dropped_first=["x"]) -def test_from_dummies_to_df_dropped_first_dict(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", "y"], "col2": ["x", "a", "c"]} ) - result = from_dummies(dummies, dropped_first={"col2": "x", "col1": "y"}) + result = from_dummies( + dummies_with_unassigned, dropped_first={"col2": "x", "col1": "y"} + ) assert all(result == expected) -def test_from_dummies_to_df_dropped_first_dict_not_complete(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassigned): with pytest.raises( ValueError, match=( @@ -381,32 +281,22 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(): r"the length of the columns being encoded \(2\)." 
), ): - from_dummies(dummies, dropped_first={"col1": "x"}) + from_dummies(dummies_with_unassigned, dropped_first={"col1": "x"}) -def test_from_dummies_to_df_wrong_column_type(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_wrong_column_type(dummies_basic): with pytest.raises( TypeError, match=r"Input must be a list-like for parameter 'columns'", ): - from_dummies(dummies, columns="col1_a") + from_dummies(dummies_basic, columns="col1_a") def test_from_dummies_to_df_double_assignment(): dummies = DataFrame( { "C": [1, 2, 3], - "col1_a": [1, 0, 0], + "col1_a": [1, 0, 1], "col1_b": [1, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], @@ -423,19 +313,9 @@ def test_from_dummies_to_df_double_assignment(): from_dummies(dummies) -def test_from_dummies_to_df_no_assignment(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [0, 0, 0], - "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) +def test_from_dummies_to_df_no_assignment(dummies_with_unassigned): with pytest.raises( ValueError, - match=r"Dummy DataFrame contains no assignment for prefix: 'col1' in row 0.", + match=r"Dummy DataFrame contains no assignment for prefix: 'col2' in row 0.", ): - from_dummies(dummies, dummy_na=True) + from_dummies(dummies_with_unassigned, dummy_na=True) From d06540fd8a2521d5a7d4ff741408bfac647397e3 Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 14 Jun 2021 23:35:55 +0100 Subject: [PATCH 03/95] Make tests more elegant --- pandas/tests/reshape/test_from_dummies.py | 53 ++++++++++------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index ee6cb70e386fa..dfc1d29c9e0e8 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -5,6 +5,7 
@@ DataFrame, Series, ) +import pandas._testing as tm from pandas.core.reshape.reshape import from_dummies @@ -40,30 +41,28 @@ def test_from_dummies_to_series_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) expected = Series(list("abca")) result = from_dummies(dummies, to_series=True) - assert all(result == expected) + tm.assert_series_equal(result, expected) def test_from_dummies_to_series_dummy_na(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) expected = Series(["a", "b", np.nan]) result = from_dummies(dummies, to_series=True, dummy_na=True) - assert all(result[:2] == expected[:2]) - assert result[2] is expected[2] + tm.assert_series_equal(result, expected) def test_from_dummies_to_series_contains_nan(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = Series(["a", "b", np.nan]) result = from_dummies(dummies, to_series=True) - assert all(result[:2] == expected[:2]) - assert result[2] is expected[2] + tm.assert_series_equal(result, expected) def test_from_dummies_to_series_dropped_first(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = Series(["a", "b", "c"]) result = from_dummies(dummies, to_series=True, dropped_first="c") - assert all(result == expected) + tm.assert_series_equal(result, expected) def test_from_dummies_to_series_wrong_dropped_first(): @@ -95,9 +94,11 @@ def test_from_dummies_no_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} ) - expected = dummies + expected = DataFrame( + {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} + ) result = from_dummies(dummies) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_basic(dummies_basic): @@ -105,7 +106,7 @@ def test_from_dummies_to_df_basic(dummies_basic): {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} ) result = from_dummies(dummies_basic) - assert all(result == 
expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_variable_string(dummies_basic): @@ -113,13 +114,13 @@ def test_from_dummies_to_df_variable_string(dummies_basic): {"C": [1, 2, 3], "varname0": ["a", "b", "a"], "varname1": ["b", "a", "c"]} ) result = from_dummies(dummies_basic, variables="varname") - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_variable_list(dummies_basic): expected = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "a", "c"]}) result = from_dummies(dummies_basic, variables=["A", "B"]) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_variable_list_not_complete(dummies_basic): @@ -136,7 +137,7 @@ def test_from_dummies_to_df_variable_list_not_complete(dummies_basic): def test_from_dummies_to_df_variable_dict(dummies_basic): expected = DataFrame({"C": [1, 2, 3], "A": ["b", "a", "c"], "B": ["a", "b", "a"]}) result = from_dummies(dummies_basic, variables={"col2": "A", "col1": "B"}) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_variable_dict_not_complete(dummies_basic): @@ -165,7 +166,7 @@ def test_from_dummies_to_df_prefix_sep_list(): {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} ) result = from_dummies(dummies, prefix_sep=["_", "-"]) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_prefix_sep_dict(): @@ -173,7 +174,7 @@ def test_from_dummies_to_df_prefix_sep_dict(): { "C": [1, 2, 3], "col1_a-a": [1, 0, 1], - "col1_b-a": [0, 1, 0], + "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], "col2-c_c": [0, 0, 1], @@ -189,7 +190,7 @@ def test_from_dummies_to_df_prefix_sep_dict(): "col2": "-", }, ) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dummy_na(): @@ -202,18 +203,14 @@ def 
test_from_dummies_to_df_dummy_na(): "col2_a": [0, 1, 0], "col2_b": [0, 0, 0], "col2_c": [0, 0, 1], - "col2_NAN": [1, 0, 0], + "col2_NaN": [1, 0, 0], }, ) expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} ) result = from_dummies(dummies, dummy_na=True) - assert all(result["C"] == expected["C"]) - assert all(result["col1"][:2] == expected["col1"][:2]) - assert all(result["col2"][1:] == expected["col2"][1:]) - assert result["col1"][2] is expected["col1"][2] - assert result["col2"][1] is expected["col2"][1] + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_contains_nan(dummies_with_unassigned): @@ -221,11 +218,7 @@ def test_from_dummies_to_df_contains_nan(dummies_with_unassigned): {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} ) result = from_dummies(dummies_with_unassigned) - assert all(result["C"] == expected["C"]) - assert all(result["col1"][:2] == expected["col1"][:2]) - assert all(result["col2"][1:] == expected["col2"][1:]) - assert result["col1"][2] is expected["col1"][2] - assert result["col2"][1] is expected["col2"][1] + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_columns(dummies_basic): @@ -233,7 +226,7 @@ def test_from_dummies_to_df_columns(dummies_basic): result = from_dummies( dummies_basic, columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"] ) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_str(dummies_with_unassigned): @@ -241,7 +234,7 @@ def test_from_dummies_to_df_dropped_first_str(dummies_with_unassigned): {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["x", "a", "c"]} ) result = from_dummies(dummies_with_unassigned, dropped_first="x") - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): @@ -249,7 +242,7 @@ def 
test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["y", "a", "c"]} ) result = from_dummies(dummies_with_unassigned, dropped_first=["x", "y"]) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_list_not_complete(dummies_with_unassigned): @@ -270,7 +263,7 @@ def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): result = from_dummies( dummies_with_unassigned, dropped_first={"col2": "x", "col1": "y"} ) - assert all(result == expected) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassigned): From 1fa4e8ac7ea3e251f4b054acd6be4564ee13abe6 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 22 Jun 2021 23:19:16 +0100 Subject: [PATCH 04/95] Remove variable argument --- pandas/core/reshape/reshape.py | 25 ++------------ pandas/tests/reshape/test_from_dummies.py | 42 ----------------------- 2 files changed, 3 insertions(+), 64 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f56e2a32e4156..3aa96739afbb5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1056,7 +1056,6 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data, to_series: bool = False, - variables: None | str | list[str] | dict[str, str] = None, prefix_sep: str | list[str] | dict[str, str] = "_", dummy_na: bool = False, columns: None | list[str] = None, @@ -1109,7 +1108,7 @@ def from_dummies( variables_slice[prefix].append(col) prefix_sep = sep_for_prefix - # validate number of passed arguments + # validate number of dropped_first def check_len(item, name) -> None: if not len(item) == len(variables_slice): len_msg = ( @@ -1119,24 +1118,6 @@ def check_len(item, name) -> None: ) raise ValueError(len_msg) - # obtain prefix to category mapping - variables: dict[str, str] - if isinstance(variables, dict): - 
check_len(variables, "variables") - variables = variables - elif is_list_like(variables): - check_len(variables, "variables") - variables = dict(zip(variables_slice, variables)) - elif isinstance(variables, str): - variables = dict( - zip( - variables_slice, - (f"{variables}{i}" for i in range(len(variables_slice))), - ) - ) - else: - variables = dict(zip(variables_slice, variables_slice)) - if dropped_first: if isinstance(dropped_first, dict): check_len(dropped_first, "dropped_first") @@ -1148,7 +1129,7 @@ def check_len(item, name) -> None: zip(variables_slice, [dropped_first] * len(variables_slice)) ) - cat_data = {var: [] for _, var in variables.items()} + cat_data = {prefix: [] for prefix in variables_slice} for index, row in data.iterrows(): for prefix, prefix_slice in variables_slice.items(): slice_sum = row[prefix_slice].sum() @@ -1172,7 +1153,7 @@ def check_len(item, name) -> None: category = prefix_slice[cat_index].split(prefix_sep[prefix])[1] if dummy_na and category == "NaN": category = np.nan - cat_data[variables[prefix]].append(category) + cat_data[prefix].append(category) if columns: return DataFrame(cat_data) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index dfc1d29c9e0e8..dbb012a8bed7e 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -109,48 +109,6 @@ def test_from_dummies_to_df_basic(dummies_basic): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_variable_string(dummies_basic): - expected = DataFrame( - {"C": [1, 2, 3], "varname0": ["a", "b", "a"], "varname1": ["b", "a", "c"]} - ) - result = from_dummies(dummies_basic, variables="varname") - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_to_df_variable_list(dummies_basic): - expected = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "a", "c"]}) - result = from_dummies(dummies_basic, variables=["A", "B"]) - tm.assert_frame_equal(result, 
expected) - - -def test_from_dummies_to_df_variable_list_not_complete(dummies_basic): - with pytest.raises( - ValueError, - match=( - r"Length of 'variables' \(1\) did not match " - r"the length of the columns being encoded \(2\)." - ), - ): - from_dummies(dummies_basic, variables=["A"]) - - -def test_from_dummies_to_df_variable_dict(dummies_basic): - expected = DataFrame({"C": [1, 2, 3], "A": ["b", "a", "c"], "B": ["a", "b", "a"]}) - result = from_dummies(dummies_basic, variables={"col2": "A", "col1": "B"}) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_to_df_variable_dict_not_complete(dummies_basic): - with pytest.raises( - ValueError, - match=( - r"Length of 'variables' \(1\) did not match " - r"the length of the columns being encoded \(2\)." - ), - ): - from_dummies(dummies_basic, variables={"col1": "A"}) - - def test_from_dummies_to_df_prefix_sep_list(): dummies = DataFrame( { From c7f8ec815c1a1695c91f77d62875aec1e76e04d2 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 22 Jun 2021 23:25:52 +0100 Subject: [PATCH 05/95] Remove dummy_na argument --- pandas/core/reshape/reshape.py | 21 +++-------------- pandas/tests/reshape/test_from_dummies.py | 28 +++++------------------ 2 files changed, 9 insertions(+), 40 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 3aa96739afbb5..7cad316e6b817 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1057,7 +1057,6 @@ def from_dummies( data, to_series: bool = False, prefix_sep: str | list[str] | dict[str, str] = "_", - dummy_na: bool = False, columns: None | list[str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, ) -> Series | DataFrame: @@ -1067,7 +1066,7 @@ def from_dummies( from pandas.core.reshape.concat import concat if to_series: - return _from_dummies_1d(data, dummy_na, dropped_first) + return _from_dummies_1d(data, dropped_first) data_to_decode: DataFrame if columns is None: @@ -1141,18 +1140,11 
@@ def check_len(item, name) -> None: elif slice_sum == 0: if dropped_first: category = dropped_first[prefix] - elif not dummy_na: - category = np.nan else: - raise ValueError( - f"Dummy DataFrame contains no assignment for prefix: " - f"'{prefix}' in row {index}." - ) + category = np.nan else: cat_index = row[prefix_slice].argmax() category = prefix_slice[cat_index].split(prefix_sep[prefix])[1] - if dummy_na and category == "NaN": - category = np.nan cat_data[prefix].append(category) if columns: @@ -1163,7 +1155,6 @@ def check_len(item, name) -> None: def _from_dummies_1d( data, - dummy_na: bool = False, dropped_first: None | str = None, ) -> Series: """ @@ -1182,16 +1173,10 @@ def _from_dummies_1d( elif row_sum == 0: if dropped_first: category = dropped_first - elif not dummy_na: - category = np.nan else: - raise ValueError( - f"Dummy DataFrame contains no assignment in row {index}." - ) + category = np.nan else: category = data.columns[row.argmax()] - if dummy_na and category == "NaN": - category = np.nan cat_data.append(category) return Series(cat_data) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index dbb012a8bed7e..718d50442fd81 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -44,10 +44,10 @@ def test_from_dummies_to_series_basic(): tm.assert_series_equal(result, expected) -def test_from_dummies_to_series_dummy_na(): +def test_from_dummies_to_series_contains_get_dummies_NaN_column(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) - expected = Series(["a", "b", np.nan]) - result = from_dummies(dummies, to_series=True, dummy_na=True) + expected = Series(["a", "b", "NaN"]) + result = from_dummies(dummies, to_series=True) tm.assert_series_equal(result, expected) @@ -82,14 +82,6 @@ def test_from_dummies_to_series_multi_assignment(): from_dummies(dummies, to_series=True) -def test_from_dummies_to_series_unassigned_row(): - 
dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - with pytest.raises( - ValueError, match=r"Dummy DataFrame contains no assignment in row 2." - ): - from_dummies(dummies, to_series=True, dummy_na=True) - - def test_from_dummies_no_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} @@ -151,7 +143,7 @@ def test_from_dummies_to_df_prefix_sep_dict(): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_dummy_na(): +def test_from_dummies_to_df_contains_get_dummies_NaN_column(): dummies = DataFrame( { "C": [1, 2, 3], @@ -165,9 +157,9 @@ def test_from_dummies_to_df_dummy_na(): }, ) expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} + {"C": [1, 2, 3], "col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]} ) - result = from_dummies(dummies, dummy_na=True) + result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -262,11 +254,3 @@ def test_from_dummies_to_df_double_assignment(): ), ): from_dummies(dummies) - - -def test_from_dummies_to_df_no_assignment(dummies_with_unassigned): - with pytest.raises( - ValueError, - match=r"Dummy DataFrame contains no assignment for prefix: 'col2' in row 0.", - ): - from_dummies(dummies_with_unassigned, dummy_na=True) From 3cc98ca683a3b1e79c53dfed8ff64b555fd20e73 Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 30 Jun 2021 23:10:06 +0100 Subject: [PATCH 06/95] Remove loop over df rows --- pandas/core/reshape/reshape.py | 69 ++++++++++++----------- pandas/tests/reshape/test_from_dummies.py | 5 +- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7cad316e6b817..59b7fa5655846 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1128,29 +1128,31 @@ def check_len(item, name) -> None: zip(variables_slice, [dropped_first] * len(variables_slice)) ) - cat_data = {prefix: [] for prefix in variables_slice} 
- for index, row in data.iterrows(): - for prefix, prefix_slice in variables_slice.items(): - slice_sum = row[prefix_slice].sum() - if slice_sum > 1: - raise ValueError( - f"Dummy DataFrame contains multi-assignment(s) for prefix: " - f"'{prefix}' in row {index}." - ) - elif slice_sum == 0: - if dropped_first: - category = dropped_first[prefix] - else: - category = np.nan + cat_data = {} + for prefix, prefix_slice in variables_slice.items(): + cats = [col[len(prefix + prefix_sep[prefix]) :] for col in prefix_slice] + assigned = data[prefix_slice].sum(axis=1) + if any(assigned > 1): + raise ValueError( + f"Dummy DataFrame contains multi-assignment(s) for prefix: " + f"'{prefix}' in row {assigned.argmax()}." + ) + elif any(assigned == 0): + if dropped_first: + cats.append(dropped_first[prefix]) else: - cat_index = row[prefix_slice].argmax() - category = prefix_slice[cat_index].split(prefix_sep[prefix])[1] - cat_data[prefix].append(category) + cats.append("nan") + bool_data_slice = concat( + (data[prefix_slice].astype("boolean"), assigned == 0), axis=1 + ) + else: + bool_data_slice = data[prefix_slice].astype("boolean") + cat_data[prefix] = bool_data_slice.dot(cats) if columns: return DataFrame(cat_data) else: - return concat([non_cat_data, DataFrame(cat_data)], axis=1) + return concat((non_cat_data, DataFrame(cat_data)), axis=1) def _from_dummies_1d( @@ -1160,25 +1162,26 @@ def _from_dummies_1d( """ soon """ + from pandas.core.reshape.concat import concat + if dropped_first and not isinstance(dropped_first, str): raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.") - cat_data = [] - for index, row in data.iterrows(): - row_sum = row.sum() - if row_sum > 1: - raise ValueError( - f"Dummy DataFrame contains multi-assignment in row {index}." 
- ) - elif row_sum == 0: - if dropped_first: - category = dropped_first - else: - category = np.nan + cats = data.columns.tolist() + assigned = data.sum(axis=1) + if any(assigned > 1): + raise ValueError( + f"Dummy DataFrame contains multi-assignment in row {assigned.argmax()}." + ) + elif any(assigned == 0): + if dropped_first: + cats.append(dropped_first) else: - category = data.columns[row.argmax()] - cat_data.append(category) - return Series(cat_data) + cats.append("nan") + bool_data_slice = concat((data.astype("boolean"), assigned == 0), axis=1) + else: + bool_data_slice = data.astype("boolean") + return bool_data_slice.dot(cats) def _reorder_for_extension_array_stack( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 718d50442fd81..02cb0a5d5aa13 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,4 +1,3 @@ -import numpy as np import pytest from pandas import ( @@ -53,7 +52,7 @@ def test_from_dummies_to_series_contains_get_dummies_NaN_column(): def test_from_dummies_to_series_contains_nan(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = Series(["a", "b", np.nan]) + expected = Series(["a", "b", "nan"]) result = from_dummies(dummies, to_series=True) tm.assert_series_equal(result, expected) @@ -165,7 +164,7 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): def test_from_dummies_to_df_contains_nan(dummies_with_unassigned): expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} + {"C": [1, 2, 3], "col1": ["a", "b", "nan"], "col2": ["nan", "a", "c"]} ) result = from_dummies(dummies_with_unassigned) tm.assert_frame_equal(result, expected) From 0e131c6dbd8cf28bf9e4d296eb939902951f79a1 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 3 Jul 2021 00:24:49 +0100 Subject: [PATCH 07/95] Add fillna and basic tests --- pandas/core/reshape/reshape.py | 46 ++++++++++----- 
pandas/tests/reshape/test_from_dummies.py | 70 +++++++++++++++++++++++ 2 files changed, 101 insertions(+), 15 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 59b7fa5655846..fac7a416833c6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1054,11 +1054,12 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( - data, + data: DataFrame, to_series: bool = False, prefix_sep: str | list[str] | dict[str, str] = "_", columns: None | list[str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, + fillna: None | bool = None, ) -> Series | DataFrame: """ soon @@ -1066,7 +1067,7 @@ def from_dummies( from pandas.core.reshape.concat import concat if to_series: - return _from_dummies_1d(data, dropped_first) + return _from_dummies_1d(data, dropped_first, fillna) data_to_decode: DataFrame if columns is None: @@ -1078,14 +1079,22 @@ def from_dummies( cat_columns.append(col) else: non_cat_columns.append(col) - data_to_decode = data[cat_columns] + data_to_decode = data[cat_columns].astype("boolean") non_cat_data = data[non_cat_columns] elif not is_list_like(columns): raise TypeError("Input must be a list-like for parameter 'columns'") else: - data_to_decode = data[columns] + data_to_decode = data[columns].astype("boolean") non_cat_data = data[[col for col in data.columns if col not in columns]] + if fillna is not None: + data_to_decode = data_to_decode.fillna(fillna) + elif data_to_decode.isna().any().any(): + raise ValueError( + f"Dummy DataFrame contains NA value in column: " + f"'{data_to_decode.columns[data_to_decode.isna().any().argmax()]}'" + ) + # get separator for each prefix and lists to slice data for each prefix if isinstance(prefix_sep, dict): variables_slice = {prefix: [] for prefix in prefix_sep} @@ -1131,7 +1140,7 @@ def check_len(item, name) -> None: cat_data = {} for prefix, prefix_slice in variables_slice.items(): cats = [col[len(prefix + prefix_sep[prefix]) :] 
for col in prefix_slice] - assigned = data[prefix_slice].sum(axis=1) + assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( f"Dummy DataFrame contains multi-assignment(s) for prefix: " @@ -1142,12 +1151,10 @@ def check_len(item, name) -> None: cats.append(dropped_first[prefix]) else: cats.append("nan") - bool_data_slice = concat( - (data[prefix_slice].astype("boolean"), assigned == 0), axis=1 - ) + data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: - bool_data_slice = data[prefix_slice].astype("boolean") - cat_data[prefix] = bool_data_slice.dot(cats) + data_slice = data_to_decode[prefix_slice] + cat_data[prefix] = data_slice.dot(cats) if columns: return DataFrame(cat_data) @@ -1156,8 +1163,9 @@ def check_len(item, name) -> None: def _from_dummies_1d( - data, + data: DataFrame, dropped_first: None | str = None, + fillna: None | bool = None, ) -> Series: """ soon @@ -1167,6 +1175,15 @@ def _from_dummies_1d( if dropped_first and not isinstance(dropped_first, str): raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.") + data = data.astype("boolean") + if fillna is not None: + data = data.fillna(fillna) + elif data.isna().any().any(): + raise ValueError( + f"Dummy DataFrame contains NA value in column: " + f"'{data.columns[data.isna().any().argmax()]}'" + ) + cats = data.columns.tolist() assigned = data.sum(axis=1) if any(assigned > 1): @@ -1178,10 +1195,9 @@ def _from_dummies_1d( cats.append(dropped_first) else: cats.append("nan") - bool_data_slice = concat((data.astype("boolean"), assigned == 0), axis=1) - else: - bool_data_slice = data.astype("boolean") - return bool_data_slice.dot(cats) + data = concat((data, assigned == 0), axis=1) + + return data.dot(cats) def _reorder_for_extension_array_stack( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 02cb0a5d5aa13..2200788b90087 100644 --- 
a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import ( @@ -64,6 +65,20 @@ def test_from_dummies_to_series_dropped_first(): tm.assert_series_equal(result, expected) +def test_from_dummies_to_series_fillna_True(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) + expected = Series(["a", "b", "b"]) + result = from_dummies(dummies, to_series=True, fillna=True) + tm.assert_series_equal(result, expected) + + +def test_from_dummies_to_series_fillna_False(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) + expected = Series(["a", "b", "nan"]) + result = from_dummies(dummies, to_series=True, fillna=False) + tm.assert_series_equal(result, expected) + + def test_from_dummies_to_series_wrong_dropped_first(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( @@ -81,6 +96,14 @@ def test_from_dummies_to_series_multi_assignment(): from_dummies(dummies, to_series=True) +def test_from_dummies_to_series_no_fillna_but_na_value(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" + ): + from_dummies(dummies, to_series=True) + + def test_from_dummies_no_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} @@ -226,6 +249,24 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassig from_dummies(dummies_with_unassigned, dropped_first={"col1": "x"}) +def test_from_dummies_to_df_fillna_True(dummies_basic): + dummies_basic["col2_c"][2] = np.nan + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} + ) + result = from_dummies(dummies_basic, fillna=True) + tm.assert_frame_equal(result, expected) + + +def test_from_dummies_to_df_fillna_False(dummies_basic): + dummies_basic["col2_c"][2] = np.nan + expected = DataFrame( + 
{"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "nan"]} + ) + result = from_dummies(dummies_basic, fillna=False) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_to_df_wrong_column_type(dummies_basic): with pytest.raises( TypeError, @@ -234,6 +275,14 @@ def test_from_dummies_to_df_wrong_column_type(dummies_basic): from_dummies(dummies_basic, columns="col1_a") +def test_from_dummies_to_df_no_fillna_but_na_value(dummies_basic): + dummies_basic["col2_c"][2] = np.nan + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" + ): + from_dummies(dummies_basic) + + def test_from_dummies_to_df_double_assignment(): dummies = DataFrame( { @@ -253,3 +302,24 @@ def test_from_dummies_to_df_double_assignment(): ), ): from_dummies(dummies) + + +def test_from_dummies_to_df_fillna_True_double_assignment(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [1, np.nan, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains multi-assignment\(s\) for prefix: " + r"'col1' in row 0." 
+ ), + ): + from_dummies(dummies, fillna=True) From 9f74dc76b4351bfa2b0177b89b2f848eedd63f39 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 3 Jul 2021 00:43:15 +0100 Subject: [PATCH 08/95] Fix testnames regarding nan and unassigned --- pandas/tests/reshape/test_from_dummies.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 2200788b90087..7c20b0c3396ae 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -51,7 +51,7 @@ def test_from_dummies_to_series_contains_get_dummies_NaN_column(): tm.assert_series_equal(result, expected) -def test_from_dummies_to_series_contains_nan(): +def test_from_dummies_to_series_contains_unassigned(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = Series(["a", "b", "nan"]) result = from_dummies(dummies, to_series=True) @@ -96,7 +96,7 @@ def test_from_dummies_to_series_multi_assignment(): from_dummies(dummies, to_series=True) -def test_from_dummies_to_series_no_fillna_but_na_value(): +def test_from_dummies_to_series_no_fillna_but_contains_nan(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" @@ -185,7 +185,7 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_contains_nan(dummies_with_unassigned): +def test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", "nan"], "col2": ["nan", "a", "c"]} ) @@ -275,7 +275,7 @@ def test_from_dummies_to_df_wrong_column_type(dummies_basic): from_dummies(dummies_basic, columns="col1_a") -def test_from_dummies_to_df_no_fillna_but_na_value(dummies_basic): +def test_from_dummies_to_df_no_fillna_but_contains_nan(dummies_basic): dummies_basic["col2_c"][2] = np.nan with 
pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" From 442b3404cfa01365b3dc4b039b63c18d51ffcada Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 3 Jul 2021 12:17:10 +0100 Subject: [PATCH 09/95] Remove fillna --- pandas/core/reshape/reshape.py | 26 +++-------- pandas/tests/reshape/test_from_dummies.py | 57 +---------------------- 2 files changed, 9 insertions(+), 74 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fac7a416833c6..47bc1ea8f4079 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1059,15 +1059,20 @@ def from_dummies( prefix_sep: str | list[str] | dict[str, str] = "_", columns: None | list[str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, - fillna: None | bool = None, ) -> Series | DataFrame: """ soon """ from pandas.core.reshape.concat import concat + if data.isna().any().any(): + raise ValueError( + f"Dummy DataFrame contains NA value in column: " + f"'{data.columns[data.isna().any().argmax()]}'" + ) + if to_series: - return _from_dummies_1d(data, dropped_first, fillna) + return _from_dummies_1d(data, dropped_first) data_to_decode: DataFrame if columns is None: @@ -1087,14 +1092,6 @@ def from_dummies( data_to_decode = data[columns].astype("boolean") non_cat_data = data[[col for col in data.columns if col not in columns]] - if fillna is not None: - data_to_decode = data_to_decode.fillna(fillna) - elif data_to_decode.isna().any().any(): - raise ValueError( - f"Dummy DataFrame contains NA value in column: " - f"'{data_to_decode.columns[data_to_decode.isna().any().argmax()]}'" - ) - # get separator for each prefix and lists to slice data for each prefix if isinstance(prefix_sep, dict): variables_slice = {prefix: [] for prefix in prefix_sep} @@ -1165,7 +1162,6 @@ def check_len(item, name) -> None: def _from_dummies_1d( data: DataFrame, dropped_first: None | str = None, - fillna: None | bool = None, ) -> Series: 
""" soon @@ -1176,14 +1172,6 @@ def _from_dummies_1d( raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.") data = data.astype("boolean") - if fillna is not None: - data = data.fillna(fillna) - elif data.isna().any().any(): - raise ValueError( - f"Dummy DataFrame contains NA value in column: " - f"'{data.columns[data.isna().any().argmax()]}'" - ) - cats = data.columns.tolist() assigned = data.sum(axis=1) if any(assigned > 1): diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 7c20b0c3396ae..6980e23963048 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -65,20 +65,6 @@ def test_from_dummies_to_series_dropped_first(): tm.assert_series_equal(result, expected) -def test_from_dummies_to_series_fillna_True(): - dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) - expected = Series(["a", "b", "b"]) - result = from_dummies(dummies, to_series=True, fillna=True) - tm.assert_series_equal(result, expected) - - -def test_from_dummies_to_series_fillna_False(): - dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) - expected = Series(["a", "b", "nan"]) - result = from_dummies(dummies, to_series=True, fillna=False) - tm.assert_series_equal(result, expected) - - def test_from_dummies_to_series_wrong_dropped_first(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( @@ -96,7 +82,7 @@ def test_from_dummies_to_series_multi_assignment(): from_dummies(dummies, to_series=True) -def test_from_dummies_to_series_no_fillna_but_contains_nan(): +def test_from_dummies_to_series_contains_nan(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" @@ -249,24 +235,6 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassig from_dummies(dummies_with_unassigned, dropped_first={"col1": "x"}) -def 
test_from_dummies_to_df_fillna_True(dummies_basic): - dummies_basic["col2_c"][2] = np.nan - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} - ) - result = from_dummies(dummies_basic, fillna=True) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_to_df_fillna_False(dummies_basic): - dummies_basic["col2_c"][2] = np.nan - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "nan"]} - ) - result = from_dummies(dummies_basic, fillna=False) - tm.assert_frame_equal(result, expected) - - def test_from_dummies_to_df_wrong_column_type(dummies_basic): with pytest.raises( TypeError, @@ -275,7 +243,7 @@ def test_from_dummies_to_df_wrong_column_type(dummies_basic): from_dummies(dummies_basic, columns="col1_a") -def test_from_dummies_to_df_no_fillna_but_contains_nan(dummies_basic): +def test_from_dummies_to_df_contains_nan(dummies_basic): dummies_basic["col2_c"][2] = np.nan with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" @@ -302,24 +270,3 @@ def test_from_dummies_to_df_double_assignment(): ), ): from_dummies(dummies) - - -def test_from_dummies_to_df_fillna_True_double_assignment(): - dummies = DataFrame( - { - "C": [1, 2, 3], - "col1_a": [1, 0, 1], - "col1_b": [1, np.nan, 0], - "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1], - }, - ) - with pytest.raises( - ValueError, - match=( - r"Dummy DataFrame contains multi-assignment\(s\) for prefix: " - r"'col1' in row 0." 
- ), - ): - from_dummies(dummies, fillna=True) From 38cf04dc13fd8549909289b3d62e2da715224f56 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 11 Jul 2021 16:42:24 +0100 Subject: [PATCH 10/95] Add from_dummies docstring --- pandas/core/reshape/reshape.py | 82 +++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 47bc1ea8f4079..f97b6fb31d195 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1061,7 +1061,87 @@ def from_dummies( dropped_first: None | str | list[str] | dict[str, str] = None, ) -> Series | DataFrame: """ - soon + Create a categorical `Series` or `DataFrame` from a `DataFrame` of dummy + variables. + + Inverts the operation performed by 'get_dummies'. + + Parameters + ---------- + data : `DataFrame` + Data which contains dummy-coded variables. + to_series : bool, default False + Converts the input data to a categorical `Series`, converts the input data + to a categorical `DataFrame` if False. + prefix_sep : str, list of str, or dict of str, default '_' + Separator/deliminator used in the column names of the dummy categories. + Pass a list if multiple prefix separators are used in the columns names. + Alternatively, pass a dictionary to map prefix separators to prefixes if + multiple and/ mixed separators are used in the column names. + columns : None or list of str, default 'None' + The columns which to convert from dummy-encoding and return as categorical + `DataFrame`. + If `columns` is None then all dummy columns are converted and appended + to the non-dummy columns. + dropped_fist : None, str, list of str, or dict of str, default None + The implied value the dummy takes when all values are zero. + Can be a a single value for all variables, a list with a number of values + equal to the dummy variables, or a dict directly mapping the dropped value + to a prefix of a variable. 
+ + Returns + ------- + `Series` or `DataFrame` + Categorical data decoded from the dummy input-data. + + See Also + -------- + get_dummies : Convert `Series` or `DataFrame` to dummy codes. + + Examples + -------- + >>> d = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], + "c": [0, 0, 1, 0]}) + + >>> pd.from_dummies(s, to_series=True) + 0 a + 1 b + 2 c + 3 a + + >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) + + + >>> pd.from_dummies(d) + C col1 col2 + 0 1 a b + 1 2 b a + 2 3 a c + + >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], "col2_c": [0, 0, 0]}) + + + >>> pd.from_dummies(d, dropped_first=["d", "e"]) + C col1 col2 + 0 1 a b + 1 2 b a + 2 3 d e + + + >>> d = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], + "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], + "col2-c_c": [0, 0, 1]}) + + + >>> pd.from_dummies(d, prefix_sep={"col1": "_", "col2": "-"}) + col1 col2 + 0 a-a b-b + 1 b-b a-a + 2 a-a c-c """ from pandas.core.reshape.concat import concat From 8eccfab2a1eb3f3fe1c22205e7d53ba46a0ccff8 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 11 Jul 2021 17:27:11 +0100 Subject: [PATCH 11/95] Add docstring to _from_dummies_1d --- pandas/core/reshape/reshape.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f97b6fb31d195..8f39d0a4751f4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1113,7 +1113,6 @@ def from_dummies( "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) - >>> pd.from_dummies(d) C col1 col2 0 1 a b @@ -1124,19 +1123,16 @@ def from_dummies( "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 0]}) - >>> pd.from_dummies(d, dropped_first=["d", "e"]) C col1 col2 0 1 a b 1 2 b a 
2 3 d e - >>> d = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], "col2-c_c": [0, 0, 1]}) - >>> pd.from_dummies(d, prefix_sep={"col1": "_", "col2": "-"}) col1 col2 0 a-a b-b @@ -1244,7 +1240,10 @@ def _from_dummies_1d( dropped_first: None | str = None, ) -> Series: """ - soon + Helper function for from_dummies. + + Handles the conversion of dummy encoded data to a categorical `Series`. + For parameters and usage see: from_dummies. """ from pandas.core.reshape.concat import concat From fd027c58306a2692dcb4acace4b0459b6b851ff8 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 11 Jul 2021 18:30:41 +0100 Subject: [PATCH 12/95] Fix column behaviour --- pandas/core/reshape/reshape.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8f39d0a4751f4..0b5b133fee2cb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1152,21 +1152,19 @@ def from_dummies( data_to_decode: DataFrame if columns is None: - # index data with a list of all columns that are dummies - cat_columns = [] - non_cat_columns = [] - for col in data.columns: - if any(ps in col for ps in prefix_sep): - cat_columns.append(col) - else: - non_cat_columns.append(col) - data_to_decode = data[cat_columns].astype("boolean") - non_cat_data = data[non_cat_columns] + columns = data.columns elif not is_list_like(columns): raise TypeError("Input must be a list-like for parameter 'columns'") - else: - data_to_decode = data[columns].astype("boolean") - non_cat_data = data[[col for col in data.columns if col not in columns]] + # index data with a list of all columns that are dummies + cat_columns = [] + non_cat_columns = [] + for col in columns: + if any(ps in col for ps in prefix_sep): + cat_columns.append(col) + else: + non_cat_columns.append(col) + data_to_decode = data[cat_columns].astype("boolean") + non_cat_data = 
data[non_cat_columns] # get separator for each prefix and lists to slice data for each prefix if isinstance(prefix_sep, dict): From 106ff3cf096cd0bb299562b3fb757aa136d525d4 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 11 Jul 2021 18:54:15 +0100 Subject: [PATCH 13/95] Update handling of unassigned rows --- pandas/core/reshape/reshape.py | 23 +++++++++++++++-------- pandas/tests/reshape/test_from_dummies.py | 6 +++--- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0b5b133fee2cb..f191a832cf4a0 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1154,7 +1154,7 @@ def from_dummies( if columns is None: columns = data.columns elif not is_list_like(columns): - raise TypeError("Input must be a list-like for parameter 'columns'") + raise TypeError("Argument for parameter 'columns' must be list-like") # index data with a list of all columns that are dummies cat_columns = [] non_cat_columns = [] @@ -1221,16 +1221,18 @@ def check_len(item, name) -> None: if dropped_first: cats.append(dropped_first[prefix]) else: - cats.append("nan") + cats.append("from_dummies_nan_placeholer_string") data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: data_slice = data_to_decode[prefix_slice] cat_data[prefix] = data_slice.dot(cats) - if columns: - return DataFrame(cat_data) - else: - return concat((non_cat_data, DataFrame(cat_data)), axis=1) + categorical_df = concat((non_cat_data, DataFrame(cat_data)), axis=1) + if dropped_first is None: + categorical_df.replace( + "from_dummies_nan_placeholer_string", np.nan, inplace=True + ) + return categorical_df def _from_dummies_1d( @@ -1259,10 +1261,15 @@ def _from_dummies_1d( if dropped_first: cats.append(dropped_first) else: - cats.append("nan") + cats.append("from_dummies_nan_placeholer_string") data = concat((data, assigned == 0), axis=1) - return data.dot(cats) + categorical_series = data.dot(cats) 
+ if dropped_first is None: + categorical_series.replace( + "from_dummies_nan_placeholer_string", np.nan, inplace=True + ) + return categorical_series def _reorder_for_extension_array_stack( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 6980e23963048..6b54bfa6e34c8 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -53,7 +53,7 @@ def test_from_dummies_to_series_contains_get_dummies_NaN_column(): def test_from_dummies_to_series_contains_unassigned(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = Series(["a", "b", "nan"]) + expected = Series(["a", "b", np.nan]) result = from_dummies(dummies, to_series=True) tm.assert_series_equal(result, expected) @@ -173,7 +173,7 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): def test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "nan"], "col2": ["nan", "a", "c"]} + {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} ) result = from_dummies(dummies_with_unassigned) tm.assert_frame_equal(result, expected) @@ -238,7 +238,7 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassig def test_from_dummies_to_df_wrong_column_type(dummies_basic): with pytest.raises( TypeError, - match=r"Input must be a list-like for parameter 'columns'", + match=r"Argument for parameter 'columns' must be list-like", ): from_dummies(dummies_basic, columns="col1_a") From 20192283e138548cb744a3c52bb6be868b427687 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 17 Jul 2021 22:22:24 +0100 Subject: [PATCH 14/95] Start user_guide entry --- doc/source/user_guide/reshaping.rst | 110 ++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 7d1d03fe020a6..999596d530ec1 100644 --- 
a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -719,6 +719,116 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes +To convert a "dummy" or "indicator" ``DataFrame``, into a categorical ``DataFrame`` +(a categorical ``Series``), for example ``k`` columns of a ``DataFrame`` containing +1s and 0s can derive a ``DataFrame`` (a ``Series``) which has ``k`` distinct values +:func:`~pandas.from_dummies`: + +.. ipython:: python + + d = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) + + pd.from_dummies(d) + +The ``k`` distinct values can also be represented be a ``dropped_first`` which +means that no vale assigned implies a the value of the dropped value: + +.. ipython:: python + + d = pd.DataFrame({"prefix_a": [0, 1, 0]}) + + pd.from_dummies(d, dropped_first="b") + +The function is the inverse of :func:`pandas.get_dummies `. + + + + +################################################################################ +This function is often used along with discretization functions like ``cut``: + +.. ipython:: python + + values = np.random.randn(10) + values + + bins = [0, 0.2, 0.4, 0.6, 0.8, 1] + + pd.get_dummies(pd.cut(values, bins)) + +See also :func:`Series.str.get_dummies `. + +:func:`get_dummies` also accepts a ``DataFrame``. By default all categorical +variables (categorical in the statistical sense, those with ``object`` or +``categorical`` dtype) are encoded as dummy variables. + + +.. ipython:: python + + df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]}) + pd.get_dummies(df) + +All non-object columns are included untouched in the output. You can control +the columns that are encoded with the ``columns`` keyword. + +.. ipython:: python + + pd.get_dummies(df, columns=["A"]) + +Notice that the ``B`` column is still included in the output, it just hasn't +been encoded. 
You can drop ``B`` before calling ``get_dummies`` if you don't +want to include it in the output. + +As with the ``Series`` version, you can pass values for the ``prefix`` and +``prefix_sep``. By default the column name is used as the prefix, and '_' as +the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: + +* string: Use the same value for ``prefix`` or ``prefix_sep`` for each column + to be encoded. +* list: Must be the same length as the number of columns being encoded. +* dict: Mapping column name to prefix. + +.. ipython:: python + + simple = pd.get_dummies(df, prefix="new_prefix") + simple + from_list = pd.get_dummies(df, prefix=["from_A", "from_B"]) + from_list + from_dict = pd.get_dummies(df, prefix={"B": "from_B", "A": "from_A"}) + from_dict + +Sometimes it will be useful to only keep k-1 levels of a categorical +variable to avoid collinearity when feeding the result to statistical models. +You can switch to this mode by turn on ``drop_first``. + +.. ipython:: python + + s = pd.Series(list("abcaa")) + + pd.get_dummies(s) + + pd.get_dummies(s, drop_first=True) + +When a column contains only one level, it will be omitted in the result. + +.. ipython:: python + + df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")}) + + pd.get_dummies(df) + + pd.get_dummies(df, drop_first=True) + +By default new columns will have ``np.uint8`` dtype. +To choose another dtype, use the ``dtype`` argument: + +.. ipython:: python + + df = pd.DataFrame({"A": list("abc"), "B": [1.1, 2.2, 3.3]}) + + pd.get_dummies(df, dtype=bool).dtypes +################################################################################ + .. 
_reshaping.factorize: Factorizing values From be39c056652914126e8b45ed6e6057ef3cd6185e Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 19 Jul 2021 23:47:16 +0100 Subject: [PATCH 15/95] Draft reshaping user_guide entry --- doc/source/user_guide/reshaping.rst | 62 +++++++++-------------------- 1 file changed, 18 insertions(+), 44 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 999596d530ec1..0a032ba5186fe 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -741,62 +741,36 @@ means that no vale assigned implies a the value of the dropped value: The function is the inverse of :func:`pandas.get_dummies `. - - - -################################################################################ -This function is often used along with discretization functions like ``cut``: - -.. ipython:: python - - values = np.random.randn(10) - values - - bins = [0, 0.2, 0.4, 0.6, 0.8, 1] - - pd.get_dummies(pd.cut(values, bins)) - -See also :func:`Series.str.get_dummies `. - -:func:`get_dummies` also accepts a ``DataFrame``. By default all categorical -variables (categorical in the statistical sense, those with ``object`` or -``categorical`` dtype) are encoded as dummy variables. - - -.. ipython:: python - - df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]}) - pd.get_dummies(df) - -All non-object columns are included untouched in the output. You can control -the columns that are encoded with the ``columns`` keyword. +All non-dummy columns are included untouched in the output. You can control +which columns are included in the output with the ``columns`` argument. .. ipython:: python - pd.get_dummies(df, columns=["A"]) - -Notice that the ``B`` column is still included in the output, it just hasn't -been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't -want to include it in the output. 
+ pd.get_dummies(df, columns=["C", "prefix_A", "prefix_B"]) -As with the ``Series`` version, you can pass values for the ``prefix`` and -``prefix_sep``. By default the column name is used as the prefix, and '_' as -the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: +You can pass values for for the ``prefix_sep`` argument depending on how many or +nested prefix separators are used in the column names. By default the prefix +separator is assumed to be a '_', however ``prefix_sep`` can be specified in +3 ways: -* string: Use the same value for ``prefix`` or ``prefix_sep`` for each column - to be encoded. -* list: Must be the same length as the number of columns being encoded. -* dict: Mapping column name to prefix. +* string: Use the same value for ``prefix_sep`` for each column + to be dencoded. +* list: Variables will be decoded by the first instance of prefix separator passed + the list that is encountered in the column name. +* dict: Directly map prefix separators to prefixes. Can be used in case mixed + separators are used within the variable name and to separate the variable from + the prefix. .. ipython:: python - simple = pd.get_dummies(df, prefix="new_prefix") + simple = pd.get_dummies(df, prefix_sep="-") simple - from_list = pd.get_dummies(df, prefix=["from_A", "from_B"]) + from_list = pd.get_dummies(df, prefix_sep=["_", "-"]) from_list - from_dict = pd.get_dummies(df, prefix={"B": "from_B", "A": "from_A"}) + from_dict = pd.get_dummies(df, prefix_sep={"prefix1": "-", "prefix2": "_"}) from_dict +####################s########################################################### Sometimes it will be useful to only keep k-1 levels of a categorical variable to avoid collinearity when feeding the result to statistical models. You can switch to this mode by turn on ``drop_first``. 
From d406227dafcdefe4be73d5b15f02f6794fa2de21 Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 19 Jul 2021 23:49:39 +0100 Subject: [PATCH 16/95] Fix: remove temp workspace separation --- doc/source/user_guide/reshaping.rst | 32 ----------------------------- 1 file changed, 32 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 0a032ba5186fe..94b69c314670a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -770,38 +770,6 @@ separator is assumed to be a '_', however ``prefix_sep`` can be specified in from_dict = pd.get_dummies(df, prefix_sep={"prefix1": "-", "prefix2": "_"}) from_dict -####################s########################################################### -Sometimes it will be useful to only keep k-1 levels of a categorical -variable to avoid collinearity when feeding the result to statistical models. -You can switch to this mode by turn on ``drop_first``. - -.. ipython:: python - - s = pd.Series(list("abcaa")) - - pd.get_dummies(s) - - pd.get_dummies(s, drop_first=True) - -When a column contains only one level, it will be omitted in the result. - -.. ipython:: python - - df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")}) - - pd.get_dummies(df) - - pd.get_dummies(df, drop_first=True) - -By default new columns will have ``np.uint8`` dtype. -To choose another dtype, use the ``dtype`` argument: - -.. ipython:: python - - df = pd.DataFrame({"A": list("abc"), "B": [1.1, 2.2, 3.3]}) - - pd.get_dummies(df, dtype=bool).dtypes -################################################################################ .. 
_reshaping.factorize: From 61a25e080bf906d424343dc34181ef2abee70171 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 5 Aug 2021 20:00:39 +0200 Subject: [PATCH 17/95] Add raise ValueError on unassigned values --- pandas/core/reshape/reshape.py | 22 ++++++++++----------- pandas/tests/reshape/test_from_dummies.py | 24 +++++++++++++---------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f191a832cf4a0..5c2e2c8dfbae0 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1215,23 +1215,22 @@ def check_len(item, name) -> None: if any(assigned > 1): raise ValueError( f"Dummy DataFrame contains multi-assignment(s) for prefix: " - f"'{prefix}' in row {assigned.argmax()}." + f"'{prefix}'; First instance in row: {assigned.argmax()}." ) elif any(assigned == 0): if dropped_first: cats.append(dropped_first[prefix]) else: - cats.append("from_dummies_nan_placeholer_string") + raise ValueError( + f"Dummy DataFrame contains unassigned value(s) for prefix: " + f"'{prefix}'; First instance in row: {assigned.argmin()}." + ) data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: data_slice = data_to_decode[prefix_slice] cat_data[prefix] = data_slice.dot(cats) categorical_df = concat((non_cat_data, DataFrame(cat_data)), axis=1) - if dropped_first is None: - categorical_df.replace( - "from_dummies_nan_placeholer_string", np.nan, inplace=True - ) return categorical_df @@ -1255,20 +1254,19 @@ def _from_dummies_1d( assigned = data.sum(axis=1) if any(assigned > 1): raise ValueError( - f"Dummy DataFrame contains multi-assignment in row {assigned.argmax()}." + f"Dummy DataFrame contains multi-assignment in row: {assigned.argmax()}." 
) elif any(assigned == 0): if dropped_first: cats.append(dropped_first) else: - cats.append("from_dummies_nan_placeholer_string") + raise ValueError( + f"Dummy DataFrame contains unassigned value in row: " + f"{assigned.argmin()}." + ) data = concat((data, assigned == 0), axis=1) categorical_series = data.dot(cats) - if dropped_first is None: - categorical_series.replace( - "from_dummies_nan_placeholer_string", np.nan, inplace=True - ) return categorical_series diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 6b54bfa6e34c8..09911fa04c168 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -53,9 +53,10 @@ def test_from_dummies_to_series_contains_get_dummies_NaN_column(): def test_from_dummies_to_series_contains_unassigned(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = Series(["a", "b", np.nan]) - result = from_dummies(dummies, to_series=True) - tm.assert_series_equal(result, expected) + with pytest.raises( + ValueError, match=r"Dummy DataFrame contains unassigned value in row: 2" + ): + from_dummies(dummies, to_series=True) def test_from_dummies_to_series_dropped_first(): @@ -77,7 +78,7 @@ def test_from_dummies_to_series_wrong_dropped_first(): def test_from_dummies_to_series_multi_assignment(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( - ValueError, match=r"Dummy DataFrame contains multi-assignment in row 2." + ValueError, match=r"Dummy DataFrame contains multi-assignment in row: 2." 
): from_dummies(dummies, to_series=True) @@ -172,11 +173,14 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): def test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", np.nan], "col2": [np.nan, "a", "c"]} - ) - result = from_dummies(dummies_with_unassigned) - tm.assert_frame_equal(result, expected) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains unassigned value\(s\) for prefix: " + r"'col1'; First instance in row: 2" + ), + ): + from_dummies(dummies_with_unassigned) def test_from_dummies_to_df_columns(dummies_basic): @@ -266,7 +270,7 @@ def test_from_dummies_to_df_double_assignment(): ValueError, match=( r"Dummy DataFrame contains multi-assignment\(s\) for prefix: " - r"'col1' in row 0." + r"'col1'; First instance in row: 0." ), ): from_dummies(dummies) From 5bcfbb40325170eaa4d83a5da12c480ef437ec95 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 12 Aug 2021 01:19:53 +0200 Subject: [PATCH 18/95] Fix mypy issues --- pandas/core/reshape/reshape.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c584b709b9a12..c3bcb7a3be17d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1076,7 +1076,7 @@ def from_dummies( data: DataFrame, to_series: bool = False, prefix_sep: str | list[str] | dict[str, str] = "_", - columns: None | list[str] = None, + columns: None | Index | list[str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, ) -> Series | DataFrame: """ @@ -1097,7 +1097,7 @@ def from_dummies( Pass a list if multiple prefix separators are used in the columns names. Alternatively, pass a dictionary to map prefix separators to prefixes if multiple and/ mixed separators are used in the column names. 
- columns : None or list of str, default 'None' + columns : None, Index, or list of str, default 'None' The columns which to convert from dummy-encoding and return as categorical `DataFrame`. If `columns` is None then all dummy columns are converted and appended @@ -1187,7 +1187,7 @@ def from_dummies( # get separator for each prefix and lists to slice data for each prefix if isinstance(prefix_sep, dict): - variables_slice = {prefix: [] for prefix in prefix_sep} + variables_slice: dict[str, list] = {prefix: [] for prefix in prefix_sep} for col in data_to_decode.columns: for prefix in prefix_sep: if prefix in col: @@ -1219,7 +1219,7 @@ def check_len(item, name) -> None: if dropped_first: if isinstance(dropped_first, dict): check_len(dropped_first, "dropped_first") - elif is_list_like(dropped_first): + elif isinstance(dropped_first, list): check_len(dropped_first, "dropped_first") dropped_first = dict(zip(variables_slice, dropped_first)) else: @@ -1237,7 +1237,7 @@ def check_len(item, name) -> None: f"'{prefix}'; First instance in row: {assigned.argmax()}." ) elif any(assigned == 0): - if dropped_first: + if isinstance(dropped_first, dict): cats.append(dropped_first[prefix]) else: raise ValueError( @@ -1255,7 +1255,7 @@ def check_len(item, name) -> None: def _from_dummies_1d( data: DataFrame, - dropped_first: None | str = None, + dropped_first: None | str | list[str] | dict[str, str] = None, ) -> Series: """ Helper function for from_dummies. 
From ca6200e973e7500efc7d9c934ff5429e0ac6c9a9 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 12 Aug 2021 01:25:04 +0200 Subject: [PATCH 19/95] Fix docstring multi-line statements --- pandas/core/reshape/reshape.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c3bcb7a3be17d..a20f05c47de82 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1120,7 +1120,7 @@ def from_dummies( Examples -------- >>> d = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], - "c": [0, 0, 1, 0]}) + ... "c": [0, 0, 1, 0]}) >>> pd.from_dummies(s, to_series=True) 0 a @@ -1129,8 +1129,8 @@ def from_dummies( 3 a >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) + ... "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], + ... "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) >>> pd.from_dummies(d) C col1 col2 @@ -1139,8 +1139,8 @@ def from_dummies( 2 3 a c >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], - "col2_b": [1, 0, 0], "col2_c": [0, 0, 0]}) + ... "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], + ... "col2_b": [1, 0, 0], "col2_c": [0, 0, 0]}) >>> pd.from_dummies(d, dropped_first=["d", "e"]) C col1 col2 @@ -1149,8 +1149,8 @@ def from_dummies( 2 3 d e >>> d = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], - "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], - "col2-c_c": [0, 0, 1]}) + ... "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], + ... 
"col2-c_c": [0, 0, 1]}) >>> pd.from_dummies(d, prefix_sep={"col1": "_", "col2": "-"}) col1 col2 From bf17cdbfb80b66922fd46e31b3c76c37213a1253 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 29 Aug 2021 17:48:41 +0200 Subject: [PATCH 20/95] Add TypeError for wrong dropped_first type --- pandas/core/reshape/reshape.py | 13 +++++++++++-- pandas/tests/reshape/test_from_dummies.py | 20 ++++++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a20f05c47de82..a8ee0ed00d753 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1222,10 +1222,15 @@ def check_len(item, name) -> None: elif isinstance(dropped_first, list): check_len(dropped_first, "dropped_first") dropped_first = dict(zip(variables_slice, dropped_first)) - else: + elif isinstance(dropped_first, str): dropped_first = dict( zip(variables_slice, [dropped_first] * len(variables_slice)) ) + else: + raise TypeError( + f"Expected 'dropped_first' to be of type 'str', 'list', or 'dict'; " + f"Received 'dropped_first' of type: {type(dropped_first).__name__}" + ) cat_data = {} for prefix, prefix_slice in variables_slice.items(): @@ -1266,7 +1271,11 @@ def _from_dummies_1d( from pandas.core.reshape.concat import concat if dropped_first and not isinstance(dropped_first, str): - raise ValueError("Only one dropped first value possible in 1D dummy DataFrame.") + raise TypeError( + f"Only one dropped first value possible in 1D dummy DataFrame: " + f"'dropped_first' should be of type 'str'; " + f"Received 'dropped_first' of type: {type(dropped_first).__name__}" + ) data = data.astype("boolean") cats = data.columns.tolist() diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 09911fa04c168..5296ebdb78b3d 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -69,8 +69,12 @@ def 
test_from_dummies_to_series_dropped_first(): def test_from_dummies_to_series_wrong_dropped_first(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( - ValueError, - match=r"Only one dropped first value possible in 1D dummy DataFrame.", + TypeError, + match=( + r"Only one dropped first value possible in 1D dummy DataFrame: " + r"'dropped_first' should be of type 'str'; " + r"Received 'dropped_first' of type: list" + ), ): from_dummies(dummies, to_series=True, dropped_first=["c", "d"]) @@ -218,6 +222,18 @@ def test_from_dummies_to_df_dropped_first_list_not_complete(dummies_with_unassig from_dummies(dummies_with_unassigned, dropped_first=["x"]) +def test_from_dummies_to_df_dropped_first_wrong_type(dummies_with_unassigned): + + with pytest.raises( + TypeError, + match=( + r"Expected 'dropped_first' to be of type 'str', 'list', or 'dict'; " + r"Received 'dropped_first' of type: tuple" + ), + ): + from_dummies(dummies_with_unassigned, dropped_first=("x", "y")) + + def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): expected = DataFrame( {"C": [1, 2, 3], "col1": ["a", "b", "y"], "col2": ["x", "a", "c"]} From 92b5dae031c41ea7d91491c7f78d3a2c25758fdf Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 6 Sep 2021 20:19:11 +0200 Subject: [PATCH 21/95] Add tests for incomplete seperators --- pandas/tests/reshape/test_from_dummies.py | 75 ++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 5296ebdb78b3d..7b63479ccbe7f 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -66,7 +66,7 @@ def test_from_dummies_to_series_dropped_first(): tm.assert_series_equal(result, expected) -def test_from_dummies_to_series_wrong_dropped_first(): +def test_from_dummies_to_series_wrong_dropped_first_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( 
TypeError, @@ -95,6 +95,13 @@ def test_from_dummies_to_series_contains_nan(): from_dummies(dummies, to_series=True) +def test_from_dummies_to_series_False(): + dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) + expected = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) + result = from_dummies(dummies, to_series=False) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_no_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} @@ -114,6 +121,28 @@ def test_from_dummies_to_df_basic(dummies_basic): tm.assert_frame_equal(result, expected) +def test_from_dummies_to_df_prefix_multiple_seperators(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2-a": [0, 1, 0], + "col2-b": [1, 0, 1], + }, + ) + expected = DataFrame( + { + "C": [1, 2, 3], + "col2-a": [0, 1, 0], + "col2-b": [1, 0, 1], + "col1": ["a", "b", "a"], + } + ) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_to_df_prefix_sep_list(): dummies = DataFrame( { @@ -132,6 +161,28 @@ def test_from_dummies_to_df_prefix_sep_list(): tm.assert_frame_equal(result, expected) +def test_from_dummies_to_df_prefix_sep_list_incomplete(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2-a": [0, 1, 0], + "col2-b": [1, 0, 1], + }, + ) + expected = DataFrame( + { + "C": [1, 2, 3], + "col2-a": [0, 1, 0], + "col2-b": [1, 0, 1], + "col1": ["a", "b", "a"], + } + ) + result = from_dummies(dummies, prefix_sep=["_"]) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_to_df_prefix_sep_dict(): dummies = DataFrame( { @@ -156,6 +207,28 @@ def test_from_dummies_to_df_prefix_sep_dict(): tm.assert_frame_equal(result, expected) +def test_from_dummies_to_df_prefix_sep_dict_incomplete(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a-a": [1, 0, 1], + "col1_b-b": [0, 
1, 0], + "col2-a_a": [0, 1, 0], + "col2-b_b": [1, 0, 1], + }, + ) + expected = DataFrame( + { + "C": [1, 2, 3], + "col2-a_a": [0, 1, 0], + "col2-b_b": [1, 0, 1], + "col1": ["a-a", "b-b", "a-a"], + } + ) + result = from_dummies(dummies, prefix_sep={"col1": "_"}) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_to_df_contains_get_dummies_NaN_column(): dummies = DataFrame( { From c2cd747a082fefae1f8bc76de88eb38d264bb24f Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 7 Sep 2021 00:06:30 +0200 Subject: [PATCH 22/95] Add tests for complex prefix separators --- pandas/core/reshape/reshape.py | 6 +- pandas/tests/reshape/test_from_dummies.py | 68 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a8ee0ed00d753..45af6f680bd8b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1095,8 +1095,10 @@ def from_dummies( prefix_sep : str, list of str, or dict of str, default '_' Separator/deliminator used in the column names of the dummy categories. Pass a list if multiple prefix separators are used in the columns names. - Alternatively, pass a dictionary to map prefix separators to prefixes if - multiple and/ mixed separators are used in the column names. + Will separate the prefix based on the first encountered separator following + the order of the list. Alternatively, pass a dictionary to map prefix + separators to prefixes if multiple and/or mixed separators are used in the + column names. columns : None, Index, or list of str, default 'None' The columns which to convert from dummy-encoding and return as categorical `DataFrame`. 
diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 7b63479ccbe7f..e59a1ee0de5ff 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -207,6 +207,27 @@ def test_from_dummies_to_df_prefix_sep_dict(): tm.assert_frame_equal(result, expected) +def test_from_dummies_to_df_prefix_separators_too_complex_for_sep_list(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a-a": [1, 0, 1], + "col1_b-b": [0, 1, 0], + "col2-a_a": [0, 1, 0], + "col2-b_b": [1, 0, 0], + "col2-c_c": [0, 0, 1], + }, + ) + with pytest.raises( + ValueError, + match=( + r"Dummy DataFrame contains unassigned value\(s\) for prefix: " + r"'col2-a'; First instance in row: 0" + ), + ): + from_dummies(dummies, prefix_sep=["_", "-"]) + + def test_from_dummies_to_df_prefix_sep_dict_incomplete(): dummies = DataFrame( { @@ -363,3 +384,50 @@ def test_from_dummies_to_df_double_assignment(): ), ): from_dummies(dummies) + + +def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col2-a": [0, 1, 0], + "col2-b": [0, 0, 0], + "col2-c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["y", "a", "c"]} + ) + result = from_dummies( + dummies, + prefix_sep=["_", "-"], + dropped_first=["x", "y"], + ) + tm.assert_frame_equal(result, expected) + + +def test_from_dummies_collate_prefix_sep_and_dropped_first_dict(): + dummies = DataFrame( + { + "C": [1, 2, 3], + "col1_a-a": [1, 0, 0], + "col1_b-b": [0, 1, 0], + "col2-a_a": [0, 1, 0], + "col2-b_b": [0, 0, 0], + "col2-c_c": [0, 0, 1], + }, + ) + expected = DataFrame( + {"C": [1, 2, 3], "col1": ["a-a", "b-b", "x"], "col2": ["y", "a_a", "c_c"]} + ) + result = from_dummies( + dummies, + prefix_sep={ + "col1": "_", + "col2": "-", + }, + dropped_first={"col1": "x", "col2": "y"}, + ) + tm.assert_frame_equal(result, expected) From 
dc50464bc11eccea6dc998c9180b487938c6f578 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 9 Sep 2021 22:15:10 +0200 Subject: [PATCH 23/95] Remove magic handling of non-dummy columns --- pandas/core/reshape/reshape.py | 14 +----- pandas/tests/reshape/test_from_dummies.py | 58 ++++------------------- 2 files changed, 11 insertions(+), 61 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 45af6f680bd8b..f507fd4f542fe 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1171,21 +1171,12 @@ def from_dummies( if to_series: return _from_dummies_1d(data, dropped_first) - data_to_decode: DataFrame if columns is None: columns = data.columns elif not is_list_like(columns): raise TypeError("Argument for parameter 'columns' must be list-like") # index data with a list of all columns that are dummies - cat_columns = [] - non_cat_columns = [] - for col in columns: - if any(ps in col for ps in prefix_sep): - cat_columns.append(col) - else: - non_cat_columns.append(col) - data_to_decode = data[cat_columns].astype("boolean") - non_cat_data = data[non_cat_columns] + data_to_decode = data[columns].astype("boolean") # get separator for each prefix and lists to slice data for each prefix if isinstance(prefix_sep, dict): @@ -1256,8 +1247,7 @@ def check_len(item, name) -> None: data_slice = data_to_decode[prefix_slice] cat_data[prefix] = data_slice.dot(cats) - categorical_df = concat((non_cat_data, DataFrame(cat_data)), axis=1) - return categorical_df + return DataFrame(cat_data) def _from_dummies_1d( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index e59a1ee0de5ff..67fcbb14bf166 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -13,7 +13,6 @@ def dummies_basic(): return DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], @@ -27,7 +26,6 @@ def dummies_basic(): def 
dummies_with_unassigned(): return DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 0], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], @@ -95,13 +93,6 @@ def test_from_dummies_to_series_contains_nan(): from_dummies(dummies, to_series=True) -def test_from_dummies_to_series_False(): - dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) - expected = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) - result = from_dummies(dummies, to_series=False) - tm.assert_frame_equal(result, expected) - - def test_from_dummies_no_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} @@ -114,9 +105,7 @@ def test_from_dummies_no_dummies(): def test_from_dummies_to_df_basic(dummies_basic): - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies(dummies_basic) tm.assert_frame_equal(result, expected) @@ -124,7 +113,6 @@ def test_from_dummies_to_df_basic(dummies_basic): def test_from_dummies_to_df_prefix_multiple_seperators(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2-a": [0, 1, 0], @@ -133,7 +121,6 @@ def test_from_dummies_to_df_prefix_multiple_seperators(): ) expected = DataFrame( { - "C": [1, 2, 3], "col2-a": [0, 1, 0], "col2-b": [1, 0, 1], "col1": ["a", "b", "a"], @@ -146,7 +133,6 @@ def test_from_dummies_to_df_prefix_multiple_seperators(): def test_from_dummies_to_df_prefix_sep_list(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2-a": [0, 1, 0], @@ -154,9 +140,7 @@ def test_from_dummies_to_df_prefix_sep_list(): "col2-c": [0, 0, 1], }, ) - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "a"], "col2": ["b", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies(dummies, prefix_sep=["_", "-"]) 
tm.assert_frame_equal(result, expected) @@ -164,7 +148,6 @@ def test_from_dummies_to_df_prefix_sep_list(): def test_from_dummies_to_df_prefix_sep_list_incomplete(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2-a": [0, 1, 0], @@ -173,7 +156,6 @@ def test_from_dummies_to_df_prefix_sep_list_incomplete(): ) expected = DataFrame( { - "C": [1, 2, 3], "col2-a": [0, 1, 0], "col2-b": [1, 0, 1], "col1": ["a", "b", "a"], @@ -186,7 +168,6 @@ def test_from_dummies_to_df_prefix_sep_list_incomplete(): def test_from_dummies_to_df_prefix_sep_dict(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], @@ -194,9 +175,7 @@ def test_from_dummies_to_df_prefix_sep_dict(): "col2-c_c": [0, 0, 1], }, ) - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a-a", "b-b", "a-a"], "col2": ["b_b", "a_a", "c_c"]} - ) + expected = DataFrame({"col1": ["a-a", "b-b", "a-a"], "col2": ["b_b", "a_a", "c_c"]}) result = from_dummies( dummies, prefix_sep={ @@ -210,7 +189,6 @@ def test_from_dummies_to_df_prefix_sep_dict(): def test_from_dummies_to_df_prefix_separators_too_complex_for_sep_list(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], @@ -231,7 +209,6 @@ def test_from_dummies_to_df_prefix_separators_too_complex_for_sep_list(): def test_from_dummies_to_df_prefix_sep_dict_incomplete(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], @@ -240,7 +217,6 @@ def test_from_dummies_to_df_prefix_sep_dict_incomplete(): ) expected = DataFrame( { - "C": [1, 2, 3], "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 1], "col1": ["a-a", "b-b", "a-a"], @@ -253,7 +229,6 @@ def test_from_dummies_to_df_prefix_sep_dict_incomplete(): def test_from_dummies_to_df_contains_get_dummies_NaN_column(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 0], "col1_b": [0, 1, 0], "col1_NaN": [0, 0, 1], @@ -263,9 
+238,7 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): "col2_NaN": [1, 0, 0], }, ) - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -290,17 +263,13 @@ def test_from_dummies_to_df_columns(dummies_basic): def test_from_dummies_to_df_dropped_first_str(dummies_with_unassigned): - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["x", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) result = from_dummies(dummies_with_unassigned, dropped_first="x") tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["y", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) result = from_dummies(dummies_with_unassigned, dropped_first=["x", "y"]) tm.assert_frame_equal(result, expected) @@ -329,9 +298,7 @@ def test_from_dummies_to_df_dropped_first_wrong_type(dummies_with_unassigned): def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "y"], "col2": ["x", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "y"], "col2": ["x", "a", "c"]}) result = from_dummies( dummies_with_unassigned, dropped_first={"col2": "x", "col1": "y"} ) @@ -368,7 +335,6 @@ def test_from_dummies_to_df_contains_nan(dummies_basic): def test_from_dummies_to_df_double_assignment(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 1], "col1_b": [1, 1, 0], "col2_a": [0, 1, 0], @@ -389,7 +355,6 @@ def test_from_dummies_to_df_double_assignment(): def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a": [1, 0, 0], 
"col1_b": [0, 1, 0], "col2-a": [0, 1, 0], @@ -397,9 +362,7 @@ def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): "col2-c": [0, 0, 1], }, ) - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a", "b", "x"], "col2": ["y", "a", "c"]} - ) + expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) result = from_dummies( dummies, prefix_sep=["_", "-"], @@ -411,7 +374,6 @@ def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): def test_from_dummies_collate_prefix_sep_and_dropped_first_dict(): dummies = DataFrame( { - "C": [1, 2, 3], "col1_a-a": [1, 0, 0], "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], @@ -419,9 +381,7 @@ def test_from_dummies_collate_prefix_sep_and_dropped_first_dict(): "col2-c_c": [0, 0, 1], }, ) - expected = DataFrame( - {"C": [1, 2, 3], "col1": ["a-a", "b-b", "x"], "col2": ["y", "a_a", "c_c"]} - ) + expected = DataFrame({"col1": ["a-a", "b-b", "x"], "col2": ["y", "a_a", "c_c"]}) result = from_dummies( dummies, prefix_sep={ From 4d9cfd080e6dbc1853cd3b529f7df6d5e576ce0c Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 9 Sep 2021 23:31:39 +0200 Subject: [PATCH 24/95] Removed to_series argument --- pandas/core/reshape/reshape.py | 86 ++++-------- pandas/tests/reshape/test_from_dummies.py | 154 +++++++++++----------- 2 files changed, 105 insertions(+), 135 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f507fd4f542fe..016ed806b433d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1074,11 +1074,10 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, - to_series: bool = False, - prefix_sep: str | list[str] | dict[str, str] = "_", columns: None | Index | list[str] = None, + prefix_sep: None | str | list[str] | dict[str, str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, -) -> Series | DataFrame: +) -> DataFrame: """ Create a categorical `Series` or `DataFrame` from a `DataFrame` of 
dummy variables. @@ -1089,9 +1088,6 @@ def from_dummies( ---------- data : `DataFrame` Data which contains dummy-coded variables. - to_series : bool, default False - Converts the input data to a categorical `Series`, converts the input data - to a categorical `DataFrame` if False. prefix_sep : str, list of str, or dict of str, default '_' Separator/deliminator used in the column names of the dummy categories. Pass a list if multiple prefix separators are used in the columns names. @@ -1168,18 +1164,20 @@ def from_dummies( f"'{data.columns[data.isna().any().argmax()]}'" ) - if to_series: - return _from_dummies_1d(data, dropped_first) - if columns is None: columns = data.columns elif not is_list_like(columns): raise TypeError("Argument for parameter 'columns' must be list-like") # index data with a list of all columns that are dummies - data_to_decode = data[columns].astype("boolean") + try: + data_to_decode = data[columns].astype("boolean") + except TypeError: + raise TypeError("Passed DataFrame contains non-dummy data") # get separator for each prefix and lists to slice data for each prefix - if isinstance(prefix_sep, dict): + if prefix_sep is None: + variables_slice = {"categories": columns} + elif isinstance(prefix_sep, dict): variables_slice: dict[str, list] = {prefix: [] for prefix in prefix_sep} for col in data_to_decode.columns: for prefix in prefix_sep: @@ -1189,10 +1187,15 @@ def from_dummies( sep_for_prefix = {} variables_slice = {} for col in data_to_decode.columns: - ps = [ps for ps in prefix_sep if ps in col][0] - prefix = col.split(ps)[0] + ps = [ps for ps in prefix_sep if ps in col] + if len(ps) == 0: + raise ValueError( + f"Prefix separator not specified for all columns; " + f"First instance column: '{col}'" + ) + prefix = col.split(ps[0])[0] if prefix not in sep_for_prefix: - sep_for_prefix[prefix] = ps + sep_for_prefix[prefix] = ps[0] if prefix not in variables_slice: variables_slice[prefix] = [col] else: @@ -1205,7 +1208,7 @@ def check_len(item, 
name) -> None: len_msg = ( f"Length of '{name}' ({len(item)}) did not match the " "length of the columns being encoded " - f"({len(variables_slice)})." + f"({len(variables_slice)})" ) raise ValueError(len_msg) @@ -1227,20 +1230,23 @@ def check_len(item, name) -> None: cat_data = {} for prefix, prefix_slice in variables_slice.items(): - cats = [col[len(prefix + prefix_sep[prefix]) :] for col in prefix_slice] + if prefix_sep is None: + cats = columns.tolist() + else: + cats = [col[len(prefix + prefix_sep[prefix]) :] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( - f"Dummy DataFrame contains multi-assignment(s) for prefix: " - f"'{prefix}'; First instance in row: {assigned.argmax()}." + f"Dummy DataFrame contains multi-assignment(s); " + f"First instance in row: {assigned.argmax()}" ) elif any(assigned == 0): if isinstance(dropped_first, dict): cats.append(dropped_first[prefix]) else: raise ValueError( - f"Dummy DataFrame contains unassigned value(s) for prefix: " - f"'{prefix}'; First instance in row: {assigned.argmin()}." + f"Dummy DataFrame contains unassigned value(s); " + f"First instance in row: {assigned.argmin()}" ) data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: @@ -1250,46 +1256,6 @@ def check_len(item, name) -> None: return DataFrame(cat_data) -def _from_dummies_1d( - data: DataFrame, - dropped_first: None | str | list[str] | dict[str, str] = None, -) -> Series: - """ - Helper function for from_dummies. - - Handles the conversion of dummy encoded data to a categorical `Series`. - For parameters and usage see: from_dummies. 
- """ - from pandas.core.reshape.concat import concat - - if dropped_first and not isinstance(dropped_first, str): - raise TypeError( - f"Only one dropped first value possible in 1D dummy DataFrame: " - f"'dropped_first' should be of type 'str'; " - f"Received 'dropped_first' of type: {type(dropped_first).__name__}" - ) - - data = data.astype("boolean") - cats = data.columns.tolist() - assigned = data.sum(axis=1) - if any(assigned > 1): - raise ValueError( - f"Dummy DataFrame contains multi-assignment in row: {assigned.argmax()}." - ) - elif any(assigned == 0): - if dropped_first: - cats.append(dropped_first) - else: - raise ValueError( - f"Dummy DataFrame contains unassigned value in row: " - f"{assigned.argmin()}." - ) - data = concat((data, assigned == 0), axis=1) - - categorical_series = data.dot(cats) - return categorical_series - - def _reorder_for_extension_array_stack( arr: ExtensionArray, n_rows: int, n_columns: int ) -> ExtensionArray: diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 67fcbb14bf166..658d565a5c764 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,10 +1,7 @@ import numpy as np import pytest -from pandas import ( - DataFrame, - Series, -) +from pandas import DataFrame import pandas._testing as tm from pandas.core.reshape.reshape import from_dummies @@ -37,52 +34,59 @@ def dummies_with_unassigned(): def test_from_dummies_to_series_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) - expected = Series(list("abca")) - result = from_dummies(dummies, to_series=True) - tm.assert_series_equal(result, expected) + expected = DataFrame({"categories": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_series_contains_get_dummies_NaN_column(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) - expected = 
Series(["a", "b", "NaN"]) - result = from_dummies(dummies, to_series=True) - tm.assert_series_equal(result, expected) + expected = DataFrame({"categories": ["a", "b", "NaN"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) def test_from_dummies_to_series_contains_unassigned(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) with pytest.raises( - ValueError, match=r"Dummy DataFrame contains unassigned value in row: 2" + ValueError, + match=( + r"Dummy DataFrame contains unassigned value\(s\); " + r"First instance in row: 2" + ), ): - from_dummies(dummies, to_series=True) + from_dummies(dummies) def test_from_dummies_to_series_dropped_first(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = Series(["a", "b", "c"]) - result = from_dummies(dummies, to_series=True, dropped_first="c") - tm.assert_series_equal(result, expected) + expected = DataFrame({"categories": ["a", "b", "c"]}) + result = from_dummies(dummies, dropped_first="c") + tm.assert_frame_equal(result, expected) def test_from_dummies_to_series_wrong_dropped_first_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( - TypeError, + ValueError, match=( - r"Only one dropped first value possible in 1D dummy DataFrame: " - r"'dropped_first' should be of type 'str'; " - r"Received 'dropped_first' of type: list" + r"Length of 'dropped_first' \(2\) did not match the length of the " + r"columns being encoded \(1\)" ), ): - from_dummies(dummies, to_series=True, dropped_first=["c", "d"]) + from_dummies(dummies, dropped_first=["c", "d"]) def test_from_dummies_to_series_multi_assignment(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( - ValueError, match=r"Dummy DataFrame contains multi-assignment in row: 2." 
+ ValueError, + match=( + r"Dummy DataFrame contains multi-assignment\(s\); " + r"First instance in row: 2" + ), ): - from_dummies(dummies, to_series=True) + from_dummies(dummies) def test_from_dummies_to_series_contains_nan(): @@ -90,23 +94,23 @@ def test_from_dummies_to_series_contains_nan(): with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" ): - from_dummies(dummies, to_series=True) + from_dummies(dummies) def test_from_dummies_no_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} ) - expected = DataFrame( - {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} - ) - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) + with pytest.raises( + TypeError, + match=r"Passed DataFrame contains non-dummy data", + ): + from_dummies(dummies, prefix_sep="_") def test_from_dummies_to_df_basic(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - result = from_dummies(dummies_basic) + result = from_dummies(dummies_basic, prefix_sep="_") tm.assert_frame_equal(result, expected) @@ -119,15 +123,14 @@ def test_from_dummies_to_df_prefix_multiple_seperators(): "col2-b": [1, 0, 1], }, ) - expected = DataFrame( - { - "col2-a": [0, 1, 0], - "col2-b": [1, 0, 1], - "col1": ["a", "b", "a"], - } - ) - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) + with pytest.raises( + ValueError, + match=( + r"Prefix separator not specified for all columns; " + r"First instance column: 'col2-a'" + ), + ): + from_dummies(dummies, prefix_sep="_") def test_from_dummies_to_df_prefix_sep_list(): @@ -154,15 +157,14 @@ def test_from_dummies_to_df_prefix_sep_list_incomplete(): "col2-b": [1, 0, 1], }, ) - expected = DataFrame( - { - "col2-a": [0, 1, 0], - "col2-b": [1, 0, 1], - "col1": ["a", "b", "a"], - } - ) - result = from_dummies(dummies, prefix_sep=["_"]) - tm.assert_frame_equal(result, expected) + with 
pytest.raises( + ValueError, + match=( + r"Prefix separator not specified for all columns; " + r"First instance column: 'col2-a'" + ), + ): + from_dummies(dummies, prefix_sep=["_"]) def test_from_dummies_to_df_prefix_sep_dict(): @@ -199,14 +201,14 @@ def test_from_dummies_to_df_prefix_separators_too_complex_for_sep_list(): with pytest.raises( ValueError, match=( - r"Dummy DataFrame contains unassigned value\(s\) for prefix: " - r"'col2-a'; First instance in row: 0" + r"Dummy DataFrame contains unassigned value\(s\); " + r"First instance in row: 0" ), ): from_dummies(dummies, prefix_sep=["_", "-"]) -def test_from_dummies_to_df_prefix_sep_dict_incomplete(): +def test_from_dummies_to_df_prefix_partial_sep_dict(): dummies = DataFrame( { "col1_a-a": [1, 0, 1], @@ -215,13 +217,7 @@ def test_from_dummies_to_df_prefix_sep_dict_incomplete(): "col2-b_b": [1, 0, 1], }, ) - expected = DataFrame( - { - "col2-a_a": [0, 1, 0], - "col2-b_b": [1, 0, 1], - "col1": ["a-a", "b-b", "a-a"], - } - ) + expected = DataFrame({"col1": ["a-a", "b-b", "a-a"]}) result = from_dummies(dummies, prefix_sep={"col1": "_"}) tm.assert_frame_equal(result, expected) @@ -239,7 +235,7 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): }, ) expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]}) - result = from_dummies(dummies) + result = from_dummies(dummies, prefix_sep="_") tm.assert_frame_equal(result, expected) @@ -247,30 +243,34 @@ def test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): with pytest.raises( ValueError, match=( - r"Dummy DataFrame contains unassigned value\(s\) for prefix: " - r"'col1'; First instance in row: 2" + r"Dummy DataFrame contains unassigned value\(s\); " + r"First instance in row: 2" ), ): - from_dummies(dummies_with_unassigned) + from_dummies(dummies_with_unassigned, prefix_sep="_") def test_from_dummies_to_df_columns(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = 
from_dummies( - dummies_basic, columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"] + dummies_basic, + prefix_sep="_", + columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"], ) tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, dropped_first="x") + result = from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first="x") tm.assert_frame_equal(result, expected) def test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, dropped_first=["x", "y"]) + result = from_dummies( + dummies_with_unassigned, prefix_sep="_", dropped_first=["x", "y"] + ) tm.assert_frame_equal(result, expected) @@ -279,10 +279,10 @@ def test_from_dummies_to_df_dropped_first_list_not_complete(dummies_with_unassig ValueError, match=( r"Length of 'dropped_first' \(1\) did not match " - r"the length of the columns being encoded \(2\)." 
+ r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, dropped_first=["x"]) + from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first=["x"]) def test_from_dummies_to_df_dropped_first_wrong_type(dummies_with_unassigned): @@ -294,13 +294,15 @@ def test_from_dummies_to_df_dropped_first_wrong_type(dummies_with_unassigned): r"Received 'dropped_first' of type: tuple" ), ): - from_dummies(dummies_with_unassigned, dropped_first=("x", "y")) + from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first=("x", "y")) def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "y"], "col2": ["x", "a", "c"]}) result = from_dummies( - dummies_with_unassigned, dropped_first={"col2": "x", "col1": "y"} + dummies_with_unassigned, + prefix_sep="_", + dropped_first={"col2": "x", "col1": "y"}, ) tm.assert_frame_equal(result, expected) @@ -310,10 +312,12 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassig ValueError, match=( r"Length of 'dropped_first' \(1\) did not match " - r"the length of the columns being encoded \(2\)." 
+ r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, dropped_first={"col1": "x"}) + from_dummies( + dummies_with_unassigned, prefix_sep="_", dropped_first={"col1": "x"} + ) def test_from_dummies_to_df_wrong_column_type(dummies_basic): @@ -321,7 +325,7 @@ def test_from_dummies_to_df_wrong_column_type(dummies_basic): TypeError, match=r"Argument for parameter 'columns' must be list-like", ): - from_dummies(dummies_basic, columns="col1_a") + from_dummies(dummies_basic, prefix_sep="_", columns="col1_a") def test_from_dummies_to_df_contains_nan(dummies_basic): @@ -329,7 +333,7 @@ def test_from_dummies_to_df_contains_nan(dummies_basic): with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" ): - from_dummies(dummies_basic) + from_dummies(dummies_basic, prefix_sep="_") def test_from_dummies_to_df_double_assignment(): @@ -345,11 +349,11 @@ def test_from_dummies_to_df_double_assignment(): with pytest.raises( ValueError, match=( - r"Dummy DataFrame contains multi-assignment\(s\) for prefix: " - r"'col1'; First instance in row: 0." 
+ r"Dummy DataFrame contains multi-assignment\(s\); " + r"First instance in row: 0" ), ): - from_dummies(dummies) + from_dummies(dummies, prefix_sep="_") def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): From 82d6743d43b7bcca44ceff05dbab1f8fa0aea409 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 9 Sep 2021 23:37:46 +0200 Subject: [PATCH 25/95] Renamed column argument to subset --- pandas/core/reshape/reshape.py | 26 +++++++++++------------ pandas/tests/reshape/test_from_dummies.py | 8 +++---- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 016ed806b433d..9e19db2bae168 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1074,7 +1074,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, - columns: None | Index | list[str] = None, + subset: None | Index | list[str] = None, prefix_sep: None | str | list[str] | dict[str, str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, ) -> DataFrame: @@ -1088,6 +1088,11 @@ def from_dummies( ---------- data : `DataFrame` Data which contains dummy-coded variables. + subset : None, Index, or list of str, default 'None' + The columns which to convert from dummy-encoding and return as categorical + `DataFrame`. + If `columns` is None then all dummy columns are converted and appended + to the non-dummy columns. prefix_sep : str, list of str, or dict of str, default '_' Separator/deliminator used in the column names of the dummy categories. Pass a list if multiple prefix separators are used in the columns names. @@ -1095,11 +1100,6 @@ def from_dummies( the order of the list. Alternatively, pass a dictionary to map prefix separators to prefixes if multiple and/or mixed separators are used in the column names. - columns : None, Index, or list of str, default 'None' - The columns which to convert from dummy-encoding and return as categorical - `DataFrame`. 
- If `columns` is None then all dummy columns are converted and appended - to the non-dummy columns. dropped_fist : None, str, list of str, or dict of str, default None The implied value the dummy takes when all values are zero. Can be a a single value for all variables, a list with a number of values @@ -1164,19 +1164,19 @@ def from_dummies( f"'{data.columns[data.isna().any().argmax()]}'" ) - if columns is None: - columns = data.columns - elif not is_list_like(columns): - raise TypeError("Argument for parameter 'columns' must be list-like") + if subset is None: + subset = data.columns + elif not is_list_like(subset): + raise TypeError("Argument for parameter 'subset' must be list-like") # index data with a list of all columns that are dummies try: - data_to_decode = data[columns].astype("boolean") + data_to_decode = data[subset].astype("boolean") except TypeError: raise TypeError("Passed DataFrame contains non-dummy data") # get separator for each prefix and lists to slice data for each prefix if prefix_sep is None: - variables_slice = {"categories": columns} + variables_slice = {"categories": subset} elif isinstance(prefix_sep, dict): variables_slice: dict[str, list] = {prefix: [] for prefix in prefix_sep} for col in data_to_decode.columns: @@ -1231,7 +1231,7 @@ def check_len(item, name) -> None: cat_data = {} for prefix, prefix_slice in variables_slice.items(): if prefix_sep is None: - cats = columns.tolist() + cats = subset.tolist() else: cats = [col[len(prefix + prefix_sep[prefix]) :] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 658d565a5c764..cbb23d049a981 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -250,12 +250,12 @@ def test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, prefix_sep="_") -def 
test_from_dummies_to_df_columns(dummies_basic): +def test_from_dummies_to_df_subset(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies( dummies_basic, prefix_sep="_", - columns=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"], + subset=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"], ) tm.assert_frame_equal(result, expected) @@ -323,9 +323,9 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassig def test_from_dummies_to_df_wrong_column_type(dummies_basic): with pytest.raises( TypeError, - match=r"Argument for parameter 'columns' must be list-like", + match=r"Argument for parameter 'subset' must be list-like", ): - from_dummies(dummies_basic, prefix_sep="_", columns="col1_a") + from_dummies(dummies_basic, prefix_sep="_", subset="col1_a") def test_from_dummies_to_df_contains_nan(dummies_basic): From 153202d0c5ed32eeb8d413ed3abcba5529cf617c Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 9 Sep 2021 23:39:30 +0200 Subject: [PATCH 26/95] Renamed tests to reflect the removal of to_series --- pandas/tests/reshape/test_from_dummies.py | 56 ++++++++++++----------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index cbb23d049a981..003064d1598d7 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -32,21 +32,21 @@ def dummies_with_unassigned(): ) -def test_from_dummies_to_series_basic(): +def test_from_dummies_no_prefix_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) expected = DataFrame({"categories": ["a", "b", "c", "a"]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) -def test_from_dummies_to_series_contains_get_dummies_NaN_column(): +def test_from_dummies_no_prefix_contains_get_dummies_NaN_column(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 
0, 1]}) expected = DataFrame({"categories": ["a", "b", "NaN"]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) -def test_from_dummies_to_series_contains_unassigned(): +def test_from_dummies_no_prefix_contains_unassigned(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) with pytest.raises( ValueError, @@ -58,14 +58,14 @@ def test_from_dummies_to_series_contains_unassigned(): from_dummies(dummies) -def test_from_dummies_to_series_dropped_first(): +def test_from_dummies_no_prefix_dropped_first(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = DataFrame({"categories": ["a", "b", "c"]}) result = from_dummies(dummies, dropped_first="c") tm.assert_frame_equal(result, expected) -def test_from_dummies_to_series_wrong_dropped_first_type(): +def test_from_dummies_no_prefix_wrong_dropped_first_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( ValueError, @@ -77,7 +77,7 @@ def test_from_dummies_to_series_wrong_dropped_first_type(): from_dummies(dummies, dropped_first=["c", "d"]) -def test_from_dummies_to_series_multi_assignment(): +def test_from_dummies_no_prefix_multi_assignment(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( ValueError, @@ -89,7 +89,7 @@ def test_from_dummies_to_series_multi_assignment(): from_dummies(dummies) -def test_from_dummies_to_series_contains_nan(): +def test_from_dummies_no_prefix_contains_nan(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" @@ -108,13 +108,13 @@ def test_from_dummies_no_dummies(): from_dummies(dummies, prefix_sep="_") -def test_from_dummies_to_df_basic(dummies_basic): +def test_from_dummies_with_prefix_basic(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies(dummies_basic, prefix_sep="_") tm.assert_frame_equal(result, expected) -def 
test_from_dummies_to_df_prefix_multiple_seperators(): +def test_from_dummies_with_prefix_prefix_multiple_seperators(): dummies = DataFrame( { "col1_a": [1, 0, 1], @@ -133,7 +133,7 @@ def test_from_dummies_to_df_prefix_multiple_seperators(): from_dummies(dummies, prefix_sep="_") -def test_from_dummies_to_df_prefix_sep_list(): +def test_from_dummies_with_prefix_prefix_sep_list(): dummies = DataFrame( { "col1_a": [1, 0, 1], @@ -148,7 +148,7 @@ def test_from_dummies_to_df_prefix_sep_list(): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_prefix_sep_list_incomplete(): +def test_from_dummies_with_prefix_prefix_sep_list_incomplete(): dummies = DataFrame( { "col1_a": [1, 0, 1], @@ -167,7 +167,7 @@ def test_from_dummies_to_df_prefix_sep_list_incomplete(): from_dummies(dummies, prefix_sep=["_"]) -def test_from_dummies_to_df_prefix_sep_dict(): +def test_from_dummies_with_prefix_prefix_sep_dict(): dummies = DataFrame( { "col1_a-a": [1, 0, 1], @@ -188,7 +188,7 @@ def test_from_dummies_to_df_prefix_sep_dict(): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_prefix_separators_too_complex_for_sep_list(): +def test_from_dummies_with_prefix_prefix_separators_too_complex_for_sep_list(): dummies = DataFrame( { "col1_a-a": [1, 0, 1], @@ -208,7 +208,7 @@ def test_from_dummies_to_df_prefix_separators_too_complex_for_sep_list(): from_dummies(dummies, prefix_sep=["_", "-"]) -def test_from_dummies_to_df_prefix_partial_sep_dict(): +def test_from_dummies_with_prefix_prefix_partial_sep_dict(): dummies = DataFrame( { "col1_a-a": [1, 0, 1], @@ -222,7 +222,7 @@ def test_from_dummies_to_df_prefix_partial_sep_dict(): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_contains_get_dummies_NaN_column(): +def test_from_dummies_with_prefix_contains_get_dummies_NaN_column(): dummies = DataFrame( { "col1_a": [1, 0, 0], @@ -239,7 +239,7 @@ def test_from_dummies_to_df_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -def 
test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): +def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): with pytest.raises( ValueError, match=( @@ -250,7 +250,7 @@ def test_from_dummies_to_df_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, prefix_sep="_") -def test_from_dummies_to_df_subset(dummies_basic): +def test_from_dummies_with_prefix_subset(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies( dummies_basic, @@ -260,13 +260,13 @@ def test_from_dummies_to_df_subset(dummies_basic): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_dropped_first_str(dummies_with_unassigned): +def test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) result = from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first="x") tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): +def test_from_dummies_with_prefix_dropped_first_list(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) result = from_dummies( dummies_with_unassigned, prefix_sep="_", dropped_first=["x", "y"] @@ -274,7 +274,9 @@ def test_from_dummies_to_df_dropped_first_list(dummies_with_unassigned): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_dropped_first_list_not_complete(dummies_with_unassigned): +def test_from_dummies_with_prefix_dropped_first_list_not_complete( + dummies_with_unassigned, +): with pytest.raises( ValueError, match=( @@ -285,7 +287,7 @@ def test_from_dummies_to_df_dropped_first_list_not_complete(dummies_with_unassig from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first=["x"]) -def test_from_dummies_to_df_dropped_first_wrong_type(dummies_with_unassigned): +def 
test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, @@ -297,7 +299,7 @@ def test_from_dummies_to_df_dropped_first_wrong_type(dummies_with_unassigned): from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first=("x", "y")) -def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): +def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "y"], "col2": ["x", "a", "c"]}) result = from_dummies( dummies_with_unassigned, @@ -307,7 +309,9 @@ def test_from_dummies_to_df_dropped_first_dict(dummies_with_unassigned): tm.assert_frame_equal(result, expected) -def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassigned): +def test_from_dummies_with_prefix_dropped_first_dict_not_complete( + dummies_with_unassigned, +): with pytest.raises( ValueError, match=( @@ -320,7 +324,7 @@ def test_from_dummies_to_df_dropped_first_dict_not_complete(dummies_with_unassig ) -def test_from_dummies_to_df_wrong_column_type(dummies_basic): +def test_from_dummies_with_prefix_wrong_column_type(dummies_basic): with pytest.raises( TypeError, match=r"Argument for parameter 'subset' must be list-like", @@ -328,7 +332,7 @@ def test_from_dummies_to_df_wrong_column_type(dummies_basic): from_dummies(dummies_basic, prefix_sep="_", subset="col1_a") -def test_from_dummies_to_df_contains_nan(dummies_basic): +def test_from_dummies_with_prefix_contains_nan(dummies_basic): dummies_basic["col2_c"][2] = np.nan with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" @@ -336,7 +340,7 @@ def test_from_dummies_to_df_contains_nan(dummies_basic): from_dummies(dummies_basic, prefix_sep="_") -def test_from_dummies_to_df_double_assignment(): +def test_from_dummies_with_prefix_double_assignment(): dummies = DataFrame( { "col1_a": [1, 0, 1], From d3dd9f700315bca9f40cfbe46796ac522c2ef5f2 Mon Sep 17 00:00:00 2001 From: 
pckSF Date: Thu, 9 Sep 2021 23:47:46 +0200 Subject: [PATCH 27/95] Fix input data NA value test to account for subset --- pandas/core/reshape/reshape.py | 72 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9e19db2bae168..89fa71a131159 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1079,8 +1079,7 @@ def from_dummies( dropped_first: None | str | list[str] | dict[str, str] = None, ) -> DataFrame: """ - Create a categorical `Series` or `DataFrame` from a `DataFrame` of dummy - variables. + Create a categorical `DataFrame` from a `DataFrame` of dummy variables. Inverts the operation performed by 'get_dummies'. @@ -1108,7 +1107,7 @@ def from_dummies( Returns ------- - `Series` or `DataFrame` + `DataFrame` Categorical data decoded from the dummy input-data. See Also @@ -1117,36 +1116,36 @@ def from_dummies( Examples -------- - >>> d = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], + >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], ... "c": [0, 0, 1, 0]}) - >>> pd.from_dummies(s, to_series=True) - 0 a - 1 b - 2 c - 3 a - - >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 1], - ... "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], - ... "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) - - >>> pd.from_dummies(d) - C col1 col2 - 0 1 a b - 1 2 b a - 2 3 a c - - >>> d = pd.DataFrame({"C": [1, 2, 3], "col1_a": [1, 0, 0], - ... "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], - ... "col2_b": [1, 0, 0], "col2_c": [0, 0, 0]}) - - >>> pd.from_dummies(d, dropped_first=["d", "e"]) - C col1 col2 - 0 1 a b - 1 2 b a - 2 3 d e - - >>> d = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], + >>> pd.from_dummies(s) + 0 categories + 1 b + 2 c + 3 a + + >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + ... 
"col2_c": [0, 0, 1]}) + + >>> pd.from_dummies(d, prefix_sep="_") + col1 col2 + 0 a b + 1 b a + 2 a c + + >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 0]}) + + >>> pd.from_dummies(d, prefix_sep="_", dropped_first=["d", "e"]) + col1 col2 + 0 a b + 1 b a + 2 d e + + >>> df = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], ... "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], ... "col2-c_c": [0, 0, 1]}) @@ -1158,16 +1157,17 @@ def from_dummies( """ from pandas.core.reshape.concat import concat - if data.isna().any().any(): + if subset is None: + subset = data.columns + elif not is_list_like(subset): + raise TypeError("Argument for parameter 'subset' must be list-like") + + if data[subset].isna().any().any(): raise ValueError( f"Dummy DataFrame contains NA value in column: " f"'{data.columns[data.isna().any().argmax()]}'" ) - if subset is None: - subset = data.columns - elif not is_list_like(subset): - raise TypeError("Argument for parameter 'subset' must be list-like") # index data with a list of all columns that are dummies try: data_to_decode = data[subset].astype("boolean") From e6ec17537acd4ee6b5c372be2346f0f593b17b05 Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 00:13:24 +0200 Subject: [PATCH 28/95] Renamed argument prefix_sep to just sep --- pandas/core/reshape/reshape.py | 28 +++++------ pandas/tests/reshape/test_from_dummies.py | 58 +++++++++++------------ 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 89fa71a131159..ba4f574050cb7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1075,7 +1075,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, subset: None | Index | list[str] = None, - prefix_sep: None | str | list[str] | dict[str, str] = None, + sep: None | str | list[str] | dict[str, str] = None, 
dropped_first: None | str | list[str] | dict[str, str] = None, ) -> DataFrame: """ @@ -1092,7 +1092,7 @@ def from_dummies( `DataFrame`. If `columns` is None then all dummy columns are converted and appended to the non-dummy columns. - prefix_sep : str, list of str, or dict of str, default '_' + sep : str, list of str, or dict of str, default '_' Separator/deliminator used in the column names of the dummy categories. Pass a list if multiple prefix separators are used in the columns names. Will separate the prefix based on the first encountered separator following @@ -1129,7 +1129,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... "col2_c": [0, 0, 1]}) - >>> pd.from_dummies(d, prefix_sep="_") + >>> pd.from_dummies(d, sep="_") col1 col2 0 a b 1 b a @@ -1139,7 +1139,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... "col2_c": [0, 0, 0]}) - >>> pd.from_dummies(d, prefix_sep="_", dropped_first=["d", "e"]) + >>> pd.from_dummies(d, sep="_", dropped_first=["d", "e"]) col1 col2 0 a b 1 b a @@ -1149,7 +1149,7 @@ def from_dummies( ... "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], ... 
"col2-c_c": [0, 0, 1]}) - >>> pd.from_dummies(d, prefix_sep={"col1": "_", "col2": "-"}) + >>> pd.from_dummies(d, sep={"col1": "_", "col2": "-"}) col1 col2 0 a-a b-b 1 b-b a-a @@ -1175,22 +1175,22 @@ def from_dummies( raise TypeError("Passed DataFrame contains non-dummy data") # get separator for each prefix and lists to slice data for each prefix - if prefix_sep is None: + if sep is None: variables_slice = {"categories": subset} - elif isinstance(prefix_sep, dict): - variables_slice: dict[str, list] = {prefix: [] for prefix in prefix_sep} + elif isinstance(sep, dict): + variables_slice: dict[str, list] = {prefix: [] for prefix in sep} for col in data_to_decode.columns: - for prefix in prefix_sep: + for prefix in sep: if prefix in col: variables_slice[prefix].append(col) else: sep_for_prefix = {} variables_slice = {} for col in data_to_decode.columns: - ps = [ps for ps in prefix_sep if ps in col] + ps = [ps for ps in sep if ps in col] if len(ps) == 0: raise ValueError( - f"Prefix separator not specified for all columns; " + f"Separator not specified for all columns; " f"First instance column: '{col}'" ) prefix = col.split(ps[0])[0] @@ -1200,7 +1200,7 @@ def from_dummies( variables_slice[prefix] = [col] else: variables_slice[prefix].append(col) - prefix_sep = sep_for_prefix + sep = sep_for_prefix # validate number of dropped_first def check_len(item, name) -> None: @@ -1230,10 +1230,10 @@ def check_len(item, name) -> None: cat_data = {} for prefix, prefix_slice in variables_slice.items(): - if prefix_sep is None: + if sep is None: cats = subset.tolist() else: - cats = [col[len(prefix + prefix_sep[prefix]) :] for col in prefix_slice] + cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 003064d1598d7..afce3f793b051 100644 --- 
a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -105,12 +105,12 @@ def test_from_dummies_no_dummies(): TypeError, match=r"Passed DataFrame contains non-dummy data", ): - from_dummies(dummies, prefix_sep="_") + from_dummies(dummies, sep="_") def test_from_dummies_with_prefix_basic(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - result = from_dummies(dummies_basic, prefix_sep="_") + result = from_dummies(dummies_basic, sep="_") tm.assert_frame_equal(result, expected) @@ -126,11 +126,11 @@ def test_from_dummies_with_prefix_prefix_multiple_seperators(): with pytest.raises( ValueError, match=( - r"Prefix separator not specified for all columns; " + r"Separator not specified for all columns; " r"First instance column: 'col2-a'" ), ): - from_dummies(dummies, prefix_sep="_") + from_dummies(dummies, sep="_") def test_from_dummies_with_prefix_prefix_sep_list(): @@ -144,7 +144,7 @@ def test_from_dummies_with_prefix_prefix_sep_list(): }, ) expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - result = from_dummies(dummies, prefix_sep=["_", "-"]) + result = from_dummies(dummies, sep=["_", "-"]) tm.assert_frame_equal(result, expected) @@ -160,11 +160,11 @@ def test_from_dummies_with_prefix_prefix_sep_list_incomplete(): with pytest.raises( ValueError, match=( - r"Prefix separator not specified for all columns; " + r"Separator not specified for all columns; " r"First instance column: 'col2-a'" ), ): - from_dummies(dummies, prefix_sep=["_"]) + from_dummies(dummies, sep=["_"]) def test_from_dummies_with_prefix_prefix_sep_dict(): @@ -180,7 +180,7 @@ def test_from_dummies_with_prefix_prefix_sep_dict(): expected = DataFrame({"col1": ["a-a", "b-b", "a-a"], "col2": ["b_b", "a_a", "c_c"]}) result = from_dummies( dummies, - prefix_sep={ + sep={ "col1": "_", "col2": "-", }, @@ -205,7 +205,7 @@ def test_from_dummies_with_prefix_prefix_separators_too_complex_for_sep_list(): r"First 
instance in row: 0" ), ): - from_dummies(dummies, prefix_sep=["_", "-"]) + from_dummies(dummies, sep=["_", "-"]) def test_from_dummies_with_prefix_prefix_partial_sep_dict(): @@ -218,7 +218,7 @@ def test_from_dummies_with_prefix_prefix_partial_sep_dict(): }, ) expected = DataFrame({"col1": ["a-a", "b-b", "a-a"]}) - result = from_dummies(dummies, prefix_sep={"col1": "_"}) + result = from_dummies(dummies, sep={"col1": "_"}) tm.assert_frame_equal(result, expected) @@ -235,7 +235,7 @@ def test_from_dummies_with_prefix_contains_get_dummies_NaN_column(): }, ) expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]}) - result = from_dummies(dummies, prefix_sep="_") + result = from_dummies(dummies, sep="_") tm.assert_frame_equal(result, expected) @@ -247,30 +247,28 @@ def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): r"First instance in row: 2" ), ): - from_dummies(dummies_with_unassigned, prefix_sep="_") + from_dummies(dummies_with_unassigned, sep="_") def test_from_dummies_with_prefix_subset(dummies_basic): expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) result = from_dummies( dummies_basic, - prefix_sep="_", subset=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"], + sep="_", ) tm.assert_frame_equal(result, expected) def test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first="x") + result = from_dummies(dummies_with_unassigned, sep="_", dropped_first="x") tm.assert_frame_equal(result, expected) def test_from_dummies_with_prefix_dropped_first_list(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) - result = from_dummies( - dummies_with_unassigned, prefix_sep="_", dropped_first=["x", "y"] - ) + result = from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x", "y"]) 
tm.assert_frame_equal(result, expected) @@ -284,7 +282,7 @@ def test_from_dummies_with_prefix_dropped_first_list_not_complete( r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first=["x"]) + from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x"]) def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassigned): @@ -296,14 +294,14 @@ def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassign r"Received 'dropped_first' of type: tuple" ), ): - from_dummies(dummies_with_unassigned, prefix_sep="_", dropped_first=("x", "y")) + from_dummies(dummies_with_unassigned, sep="_", dropped_first=("x", "y")) def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "y"], "col2": ["x", "a", "c"]}) result = from_dummies( dummies_with_unassigned, - prefix_sep="_", + sep="_", dropped_first={"col2": "x", "col1": "y"}, ) tm.assert_frame_equal(result, expected) @@ -319,17 +317,19 @@ def test_from_dummies_with_prefix_dropped_first_dict_not_complete( r"the length of the columns being encoded \(2\)" ), ): - from_dummies( - dummies_with_unassigned, prefix_sep="_", dropped_first={"col1": "x"} - ) + from_dummies(dummies_with_unassigned, sep="_", dropped_first={"col1": "x"}) -def test_from_dummies_with_prefix_wrong_column_type(dummies_basic): +def test_from_dummies_with_prefix_wrong_subset_type(dummies_basic): with pytest.raises( TypeError, match=r"Argument for parameter 'subset' must be list-like", ): - from_dummies(dummies_basic, prefix_sep="_", subset="col1_a") + from_dummies( + dummies_basic, + subset="col1_a", + sep="_", + ) def test_from_dummies_with_prefix_contains_nan(dummies_basic): @@ -337,7 +337,7 @@ def test_from_dummies_with_prefix_contains_nan(dummies_basic): with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" ): - from_dummies(dummies_basic, 
prefix_sep="_") + from_dummies(dummies_basic, sep="_") def test_from_dummies_with_prefix_double_assignment(): @@ -357,7 +357,7 @@ def test_from_dummies_with_prefix_double_assignment(): r"First instance in row: 0" ), ): - from_dummies(dummies, prefix_sep="_") + from_dummies(dummies, sep="_") def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): @@ -373,7 +373,7 @@ def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) result = from_dummies( dummies, - prefix_sep=["_", "-"], + sep=["_", "-"], dropped_first=["x", "y"], ) tm.assert_frame_equal(result, expected) @@ -392,7 +392,7 @@ def test_from_dummies_collate_prefix_sep_and_dropped_first_dict(): expected = DataFrame({"col1": ["a-a", "b-b", "x"], "col2": ["y", "a_a", "c_c"]}) result = from_dummies( dummies, - prefix_sep={ + sep={ "col1": "_", "col2": "-", }, From ee6025d61ee845ce9f8d21b75aa6bfc67166659b Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 00:46:31 +0200 Subject: [PATCH 29/95] Improve docstring for sep --- pandas/core/reshape/reshape.py | 5 ++++- pandas/tests/reshape/test_from_dummies.py | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index ba4f574050cb7..c60902efbcb6c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1093,7 +1093,10 @@ def from_dummies( If `columns` is None then all dummy columns are converted and appended to the non-dummy columns. sep : str, list of str, or dict of str, default '_' - Separator/deliminator used in the column names of the dummy categories. + Separator used in the column names of the dummy categories they are + character indicating the separation of the categorical names from the prefixes. + For example, if your column names are 'prefix_A' and 'prefix_B', + you can strip the underscore by specifying sep='_'. 
Pass a list if multiple prefix separators are used in the columns names. Will separate the prefix based on the first encountered separator following the order of the list. Alternatively, pass a dictionary to map prefix diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index afce3f793b051..d51843f94a1da 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -172,17 +172,17 @@ def test_from_dummies_with_prefix_prefix_sep_dict(): { "col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], - "col2-a_a": [0, 1, 0], - "col2-b_b": [1, 0, 0], - "col2-c_c": [0, 0, 1], + "col_2-a": [0, 1, 0], + "col_2-b": [1, 0, 0], + "col_2-c": [0, 0, 1], }, ) - expected = DataFrame({"col1": ["a-a", "b-b", "a-a"], "col2": ["b_b", "a_a", "c_c"]}) + expected = DataFrame({"col1": ["a-a", "b-b", "a-a"], "col_2": ["b", "a", "c"]}) result = from_dummies( dummies, sep={ "col1": "_", - "col2": "-", + "col_2": "-", }, ) tm.assert_frame_equal(result, expected) From 4e741c895f6381a31e24c27a879707db058ea94c Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 00:55:50 +0200 Subject: [PATCH 30/95] Update user guide entry --- doc/source/user_guide/reshaping.rst | 90 ++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 94b69c314670a..87d5b6a406f10 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -718,57 +718,91 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes +.. 
versionadded:: 1.4.0 -To convert a "dummy" or "indicator" ``DataFrame``, into a categorical ``DataFrame`` -(a categorical ``Series``), for example ``k`` columns of a ``DataFrame`` containing -1s and 0s can derive a ``DataFrame`` (a ``Series``) which has ``k`` distinct values +To convert a "dummy" or "indicator" ``DataFrame``, into a categorical ``DataFrame``, +for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a +``DataFrame`` which has ``k`` distinct values :func:`~pandas.from_dummies`: .. ipython:: python - d = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) + df = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) + df - pd.from_dummies(d) + pd.from_dummies(df, prefix_sep="_") The ``k`` distinct values can also be represented be a ``dropped_first`` which -means that no vale assigned implies a the value of the dropped value: +means that no value assigned implies a the value of the dropped value: .. ipython:: python - d = pd.DataFrame({"prefix_a": [0, 1, 0]}) - - pd.from_dummies(d, dropped_first="b") + df = pd.DataFrame({"prefix_a": [0, 1, 0]}) + df -The function is the inverse of :func:`pandas.get_dummies `. + pd.from_dummies(df, prefix_sep="_", dropped_first="b") -All non-dummy columns are included untouched in the output. You can control -which columns are included in the output with the ``columns`` argument. +The ``subset`` argument controls which columns of the input ```DataFrame`` to consider +for the decoding: .. ipython:: python - pd.get_dummies(df, columns=["C", "prefix_A", "prefix_B"]) + df = pd.DataFrame({"C": [1, 2, 3], "prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) + df + + pd.get_dummies(df, subset=["prefix_a", "prefix_b"], prefix_sep="_") -You can pass values for for the ``prefix_sep`` argument depending on how many or -nested prefix separators are used in the column names. 
By default the prefix -separator is assumed to be a '_', however ``prefix_sep`` can be specified in -3 ways: +``sep`` is (or are) character(s) indicating the separation of the categorical names +from the prefixes. For example, if your column names are ``prefix_A`` and ``prefix_B``, +you can strip the underscore by specifying ``sep='_'``. +You can pass values for the ``sep`` argument depending on how many or +nested prefix separators are used in the column names. +If a ``list`` of separators is passed ``from_dummies`` will separate based on the +first encountered separator following the order of the list. * string: Use the same value for ``prefix_sep`` for each column to be dencoded. -* list: Variables will be decoded by the first instance of prefix separator passed - the list that is encountered in the column name. -* dict: Directly map prefix separators to prefixes. Can be used in case mixed - separators are used within the variable name and to separate the variable from - the prefix. +* list: Variables will be decoded by the first prefix separator in the + passed list that is encountered in the column name. +* dict: Directly map prefix separators to prefixes. Can be used in case multiple + separation characters are used to separata the prefixes as well as in the + variable names themself. .. 
ipython:: python - simple = pd.get_dummies(df, prefix_sep="-") - simple - from_list = pd.get_dummies(df, prefix_sep=["_", "-"]) - from_list - from_dict = pd.get_dummies(df, prefix_sep={"prefix1": "-", "prefix2": "_"}) - from_dict + df_simple = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) + df_simple + + simple = pd.get_dummies(df_simple, prefix_sep="_") + simple + + df_multi = pd.DataFrame( + { + "prefix1_a": [0, 1, 0], + "prefix1_b": [1, 0, 1], + "prefix2-a": [0, 1, 0], + "prefix2-b": [1, 0, 1], + "prefix2-c": [0, 1, 0], + } + ) + df_multi + + from_list = pd.get_dummies(df_multi, prefix_sep=["_", "-"]) + from_list + + df_complex = pd.DataFrame( + { + "col1_a-a": [1, 0, 1], + "col1_b-b": [0, 1, 0], + "col2-a_a": [0, 1, 0], + "col2-b_b": [1, 0, 0], + "col2-c_c": [0, 0, 1], + } + ) + df_complex + + from_dict = pd.get_dummies(df_complex, prefix_sep={"prefix1": "-", "prefix2": "_"}) + from_dict .. _reshaping.factorize: From 1b4a8e90c82013ee5ae65d61d3f98e73d2168e47 Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 00:57:11 +0200 Subject: [PATCH 31/95] Fix wrong variable name in docstring: d to df --- pandas/core/reshape/reshape.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c60902efbcb6c..07a42c4457fce 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1122,7 +1122,7 @@ def from_dummies( >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], ... "c": [0, 0, 1, 0]}) - >>> pd.from_dummies(s) + >>> pd.from_dummies(df) 0 categories 1 b 2 c @@ -1132,7 +1132,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... "col2_c": [0, 0, 1]}) - >>> pd.from_dummies(d, sep="_") + >>> pd.from_dummies(df, sep="_") col1 col2 0 a b 1 b a @@ -1142,7 +1142,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... 
"col2_c": [0, 0, 0]}) - >>> pd.from_dummies(d, sep="_", dropped_first=["d", "e"]) + >>> pd.from_dummies(df, sep="_", dropped_first=["d", "e"]) col1 col2 0 a b 1 b a @@ -1152,7 +1152,7 @@ def from_dummies( ... "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], ... "col2-c_c": [0, 0, 1]}) - >>> pd.from_dummies(d, sep={"col1": "_", "col2": "-"}) + >>> pd.from_dummies(df, sep={"col1": "_", "col2": "-"}) col1 col2 0 a-a b-b 1 b-b a-a From 90177be1caf5daefa460819b989c82748925e13b Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 01:18:43 +0200 Subject: [PATCH 32/95] Fix mypy issues --- pandas/core/reshape/reshape.py | 17 +++++++++++------ pandas/tests/reshape/test_from_dummies.py | 5 ++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 07a42c4457fce..cf29ada9b447b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1074,7 +1074,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, - subset: None | Index | list[str] = None, + subset: None | list[str] = None, sep: None | str | list[str] | dict[str, str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, ) -> DataFrame: @@ -1161,9 +1161,14 @@ def from_dummies( from pandas.core.reshape.concat import concat if subset is None: - subset = data.columns - elif not is_list_like(subset): - raise TypeError("Argument for parameter 'subset' must be list-like") + subset = data.columns.tolist() + elif isinstance(subset, Index): + subset = subset.tolist() + elif not isinstance(subset, list): + raise TypeError( + f"Expected 'subset' to be of type 'Index', or 'list'; " + f"Received 'subset' of type: {type(dropped_first).__name__}" + ) if data[subset].isna().any().any(): raise ValueError( @@ -1181,7 +1186,7 @@ def from_dummies( if sep is None: variables_slice = {"categories": subset} elif isinstance(sep, dict): - variables_slice: dict[str, list] = {prefix: [] for prefix in 
sep} + variables_slice = {prefix: [] for prefix in sep} for col in data_to_decode.columns: for prefix in sep: if prefix in col: @@ -1234,7 +1239,7 @@ def check_len(item, name) -> None: cat_data = {} for prefix, prefix_slice in variables_slice.items(): if sep is None: - cats = subset.tolist() + cats = subset.copy() else: cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index d51843f94a1da..22cf26ef9a6ed 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -323,7 +323,10 @@ def test_from_dummies_with_prefix_dropped_first_dict_not_complete( def test_from_dummies_with_prefix_wrong_subset_type(dummies_basic): with pytest.raises( TypeError, - match=r"Argument for parameter 'subset' must be list-like", + match=( + r"Expected 'subset' to be of type 'Index', or 'list'; " + r"Received 'subset' of type: NoneType" + ), ): from_dummies( dummies_basic, From 46457fa434ba02f6738b4373651f2a3e38f1aa22 Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 11:53:29 +0200 Subject: [PATCH 33/95] Fix post upstream merge mypy issues --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 74f13b6128e0d..5dd8e070ac1fa 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1276,7 +1276,7 @@ def check_len(item, name) -> None: data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: data_slice = data_to_decode[prefix_slice] - cat_data[prefix] = data_slice.dot(cats) + cat_data[prefix] = data_slice.dot(np.array(cats)) return DataFrame(cat_data) From 131f42b20c4e158217b0b1b4a0c67addfbb2833b Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 10 Sep 2021 12:15:05 +0200 Subject: [PATCH 34/95] Fix errors in user 
guide --- doc/source/user_guide/reshaping.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 3cba50fbfc352..946324e78d857 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -738,7 +738,7 @@ for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a df = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) df - pd.from_dummies(df, prefix_sep="_") + pd.from_dummies(df, sep="_") The ``k`` distinct values can also be represented be a ``dropped_first`` which means that no value assigned implies a the value of the dropped value: @@ -748,7 +748,7 @@ means that no value assigned implies a the value of the dropped value: df = pd.DataFrame({"prefix_a": [0, 1, 0]}) df - pd.from_dummies(df, prefix_sep="_", dropped_first="b") + pd.from_dummies(df, sep="_", dropped_first="b") The ``subset`` argument controls which columns of the input ```DataFrame`` to consider for the decoding: @@ -758,7 +758,7 @@ for the decoding: df = pd.DataFrame({"C": [1, 2, 3], "prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) df - pd.get_dummies(df, subset=["prefix_a", "prefix_b"], prefix_sep="_") + pd.from_dummies(df, subset=["prefix_a", "prefix_b"], sep="_") ``sep`` is (or are) character(s) indicating the separation of the categorical names from the prefixes. For example, if your column names are ``prefix_A`` and ``prefix_B``, @@ -781,7 +781,7 @@ first encountered separator following the order of the list. df_simple = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) df_simple - simple = pd.get_dummies(df_simple, prefix_sep="_") + simple = pd.from_dummies(df_simple, sep="_") simple df_multi = pd.DataFrame( @@ -789,27 +789,27 @@ first encountered separator following the order of the list. 
"prefix1_a": [0, 1, 0], "prefix1_b": [1, 0, 1], "prefix2-a": [0, 1, 0], - "prefix2-b": [1, 0, 1], - "prefix2-c": [0, 1, 0], + "prefix2-b": [1, 0, 0], + "prefix2-c": [0, 0, 1], } ) df_multi - from_list = pd.get_dummies(df_multi, prefix_sep=["_", "-"]) + from_list = pd.from_dummies(df_multi, sep=["_", "-"]) from_list df_complex = pd.DataFrame( { - "col1_a-a": [1, 0, 1], - "col1_b-b": [0, 1, 0], - "col2-a_a": [0, 1, 0], - "col2-b_b": [1, 0, 0], - "col2-c_c": [0, 0, 1], + "prefix1_a-a": [1, 0, 1], + "prefix1_b-b": [0, 1, 0], + "prefix_2-a": [0, 1, 0], + "prefix_2-b": [1, 0, 0], + "prefix_2-c": [0, 0, 1], } ) df_complex - from_dict = pd.get_dummies(df_complex, prefix_sep={"prefix1": "-", "prefix2": "_"}) + from_dict = pd.from_dummies(df_complex, sep={"prefix1": "_", "prefix_2": "-"}) from_dict From 6dacf533fb5a7529adc3830b7c8a860e0db321c6 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 7 Oct 2021 16:14:04 +0200 Subject: [PATCH 35/95] Allow hashable categories --- pandas/core/reshape/reshape.py | 18 ++++++++----- pandas/tests/reshape/test_from_dummies.py | 33 ++++++++++++++++++++--- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5dd8e070ac1fa..dd979722afe5f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,7 +1,10 @@ from __future__ import annotations import itertools -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Hashable, +) import numpy as np @@ -1091,7 +1094,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, - subset: None | list[str] = None, + subset: None | list[Hashable] = None, sep: None | str | list[str] | dict[str, str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, ) -> DataFrame: @@ -1104,11 +1107,10 @@ def from_dummies( ---------- data : `DataFrame` Data which contains dummy-coded variables. 
- subset : None, Index, or list of str, default 'None' + subset : None, Index, or list of Hashables, default 'None' The columns which to convert from dummy-encoding and return as categorical - `DataFrame`. - If `columns` is None then all dummy columns are converted and appended - to the non-dummy columns. + `DataFrame`. If `columns` is None then all dummy columns are converted and + appended to the non-dummy columns. sep : str, list of str, or dict of str, default '_' Separator used in the column names of the dummy categories they are character indicating the separation of the categorical names from the prefixes. @@ -1276,7 +1278,9 @@ def check_len(item, name) -> None: data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: data_slice = data_to_decode[prefix_slice] - cat_data[prefix] = data_slice.dot(np.array(cats)) + cats = np.array(cats, dtype="object") + # get indices of True entries along axis=1 + cat_data[prefix] = cats[data_slice.to_numpy().nonzero()[1]] return DataFrame(cat_data) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 22cf26ef9a6ed..fea75ca9bce9c 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -32,14 +32,41 @@ def dummies_with_unassigned(): ) -def test_from_dummies_no_prefix_basic(): +def test_from_dummies_no_prefix_string_cats_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) expected = DataFrame({"categories": ["a", "b", "c", "a"]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) -def test_from_dummies_no_prefix_contains_get_dummies_NaN_column(): +def test_from_dummies_no_prefix_int_cats_basic(): + dummies = DataFrame( + {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]} + ) + expected = DataFrame({"categories": [1, 25, 2, 5]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def 
test_from_dummies_no_prefix_float_cats_basic(): + dummies = DataFrame( + {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]} + ) + expected = DataFrame({"categories": [1.0, 25.0, 2.5, 5.84]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_from_dummies_no_prefix_mixed_cats_basic(): + dummies = DataFrame( + {1.23: [1, 0, 0, 0], "c": [0, 1, 0, 0], 2: [0, 0, 1, 0], False: [0, 0, 0, 1]} + ) + expected = DataFrame({"categories": [1.23, "c", 2, False]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_from_dummies_no_prefix_string_cats_contains_get_dummies_NaN_column(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) expected = DataFrame({"categories": ["a", "b", "NaN"]}) result = from_dummies(dummies) @@ -58,7 +85,7 @@ def test_from_dummies_no_prefix_contains_unassigned(): from_dummies(dummies) -def test_from_dummies_no_prefix_dropped_first(): +def test_from_dummies_no_prefix_string_cats_dropped_first(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = DataFrame({"categories": ["a", "b", "c"]}) result = from_dummies(dummies, dropped_first="c") From 61edd30010cea3cc0e6fc27919cbde207a4e9c8a Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 16 Oct 2021 22:48:24 +0200 Subject: [PATCH 36/95] Add None category to mixed_cats_basic test --- pandas/tests/reshape/test_from_dummies.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index fea75ca9bce9c..cf85860a3553b 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -59,9 +59,15 @@ def test_from_dummies_no_prefix_float_cats_basic(): def test_from_dummies_no_prefix_mixed_cats_basic(): dummies = DataFrame( - {1.23: [1, 0, 0, 0], "c": [0, 1, 0, 0], 2: [0, 0, 1, 0], False: [0, 0, 0, 1]} + { + 
1.23: [1, 0, 0, 0, 0], + "c": [0, 1, 0, 0, 0], + 2: [0, 0, 1, 0, 0], + False: [0, 0, 0, 1, 0], + None: [0, 0, 0, 0, 1], + } ) - expected = DataFrame({"categories": [1.23, "c", 2, False]}, dtype="object") + expected = DataFrame({"categories": [1.23, "c", 2, False, None]}, dtype="object") result = from_dummies(dummies) tm.assert_frame_equal(result, expected) From 04f360c37c0f5f9862d557ed363d42dc0d817a2f Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 22 Oct 2021 01:28:07 +0200 Subject: [PATCH 37/95] Add index to argument types and fix resulting mypy issues --- pandas/core/reshape/reshape.py | 12 ++++++------ pandas/tests/reshape/test_from_dummies.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index dd979722afe5f..21578836f5898 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1094,7 +1094,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, - subset: None | list[Hashable] = None, + subset: None | Index | list[Hashable] = None, sep: None | str | list[str] | dict[str, str] = None, dropped_first: None | str | list[str] | dict[str, str] = None, ) -> DataFrame: @@ -1180,13 +1180,13 @@ def from_dummies( from pandas.core.reshape.concat import concat if subset is None: - subset = data.columns.tolist() + subset = list(data.columns) elif isinstance(subset, Index): - subset = subset.tolist() + subset = list(subset) elif not isinstance(subset, list): raise TypeError( f"Expected 'subset' to be of type 'Index', or 'list'; " - f"Received 'subset' of type: {type(dropped_first).__name__}" + f"Received 'subset' of type: {type(subset).__name__}" ) if data[subset].isna().any().any(): @@ -1278,9 +1278,9 @@ def check_len(item, name) -> None: data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: data_slice = data_to_decode[prefix_slice] - cats = np.array(cats, dtype="object") + cats_array = np.array(cats, 
dtype="object") # get indices of True entries along axis=1 - cat_data[prefix] = cats[data_slice.to_numpy().nonzero()[1]] + cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]] return DataFrame(cat_data) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index cf85860a3553b..81b465907cf95 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -358,7 +358,7 @@ def test_from_dummies_with_prefix_wrong_subset_type(dummies_basic): TypeError, match=( r"Expected 'subset' to be of type 'Index', or 'list'; " - r"Received 'subset' of type: NoneType" + r"Received 'subset' of type: str" ), ): from_dummies( From 56ea182794c36397222dc73a7a19907b19b09fe8 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:06:21 +0100 Subject: [PATCH 38/95] Remove list from dropped_first args --- pandas/core/reshape/reshape.py | 14 +++---- pandas/tests/reshape/test_from_dummies.py | 46 ++--------------------- 2 files changed, 9 insertions(+), 51 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5c40603666668..e8b64eb9613a8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1101,7 +1101,7 @@ def from_dummies( data: DataFrame, subset: None | Index | list[Hashable] = None, sep: None | str | list[str] | dict[str, str] = None, - dropped_first: None | str | list[str] | dict[str, str] = None, + dropped_first: None | str | dict[str, str] = None, ) -> DataFrame: """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. @@ -1126,11 +1126,10 @@ def from_dummies( the order of the list. Alternatively, pass a dictionary to map prefix separators to prefixes if multiple and/or mixed separators are used in the column names. 
- dropped_fist : None, str, list of str, or dict of str, default None + dropped_fist : None, str or dict of str, default None The implied value the dummy takes when all values are zero. - Can be a a single value for all variables, a list with a number of values - equal to the dummy variables, or a dict directly mapping the dropped value - to a prefix of a variable. + Can be a a single value for all variables or a dict directly mapping the + dropped value to a prefix of a variable. Returns ------- @@ -1247,16 +1246,13 @@ def check_len(item, name) -> None: if dropped_first: if isinstance(dropped_first, dict): check_len(dropped_first, "dropped_first") - elif isinstance(dropped_first, list): - check_len(dropped_first, "dropped_first") - dropped_first = dict(zip(variables_slice, dropped_first)) elif isinstance(dropped_first, str): dropped_first = dict( zip(variables_slice, [dropped_first] * len(variables_slice)) ) else: raise TypeError( - f"Expected 'dropped_first' to be of type 'str', 'list', or 'dict'; " + f"Expected 'dropped_first' to be of type 'str' or 'dict'; " f"Received 'dropped_first' of type: {type(dropped_first).__name__}" ) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 81b465907cf95..8feb77d1a6e08 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -101,10 +101,10 @@ def test_from_dummies_no_prefix_string_cats_dropped_first(): def test_from_dummies_no_prefix_wrong_dropped_first_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( - ValueError, + TypeError, match=( - r"Length of 'dropped_first' \(2\) did not match the length of the " - r"columns being encoded \(1\)" + r"Expected \'dropped_first\' to be of type \'str\' or \'dict\'; " + r"Received \'dropped_first\' of type: list" ), ): from_dummies(dummies, dropped_first=["c", "d"]) @@ -299,31 +299,12 @@ def 
test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned): tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_dropped_first_list(dummies_with_unassigned): - expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x", "y"]) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_dropped_first_list_not_complete( - dummies_with_unassigned, -): - with pytest.raises( - ValueError, - match=( - r"Length of 'dropped_first' \(1\) did not match " - r"the length of the columns being encoded \(2\)" - ), - ): - from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x"]) - - def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, match=( - r"Expected 'dropped_first' to be of type 'str', 'list', or 'dict'; " + r"Expected 'dropped_first' to be of type 'str' or 'dict'; " r"Received 'dropped_first' of type: tuple" ), ): @@ -396,25 +377,6 @@ def test_from_dummies_with_prefix_double_assignment(): from_dummies(dummies, sep="_") -def test_from_dummies_collate_prefix_sep_and_dropped_first_list(): - dummies = DataFrame( - { - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col2-a": [0, 1, 0], - "col2-b": [0, 0, 0], - "col2-c": [0, 0, 1], - }, - ) - expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["y", "a", "c"]}) - result = from_dummies( - dummies, - sep=["_", "-"], - dropped_first=["x", "y"], - ) - tm.assert_frame_equal(result, expected) - - def test_from_dummies_collate_prefix_sep_and_dropped_first_dict(): dummies = DataFrame( { From 39a0199972f13b87b128b109354516ed9c833860 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:07:29 +0100 Subject: [PATCH 39/95] Remove list from sep args --- pandas/core/reshape/reshape.py | 47 ++++++++------- pandas/tests/reshape/test_from_dummies.py | 69 ++++++----------------- 2 files changed, 43 insertions(+), 73 
deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e8b64eb9613a8..99667449d7bfb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1100,7 +1100,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, subset: None | Index | list[Hashable] = None, - sep: None | str | list[str] | dict[str, str] = None, + sep: None | str | dict[str, str] = None, dropped_first: None | str | dict[str, str] = None, ) -> DataFrame: """ @@ -1116,16 +1116,13 @@ def from_dummies( The columns which to convert from dummy-encoding and return as categorical `DataFrame`. If `columns` is None then all dummy columns are converted and appended to the non-dummy columns. - sep : str, list of str, or dict of str, default '_' + sep : str or dict of str, default '_' Separator used in the column names of the dummy categories they are character indicating the separation of the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - Pass a list if multiple prefix separators are used in the columns names. - Will separate the prefix based on the first encountered separator following - the order of the list. Alternatively, pass a dictionary to map prefix - separators to prefixes if multiple and/or mixed separators are used in the - column names. + Alternatively, pass a dictionary to map prefix separators to prefixes if + multiple and/or mixed separators are used in the column names. dropped_fist : None, str or dict of str, default None The implied value the dummy takes when all values are zero. 
Can be a a single value for all variables or a dict directly mapping the @@ -1207,31 +1204,39 @@ def from_dummies( # get separator for each prefix and lists to slice data for each prefix if sep is None: - variables_slice = {"categories": subset} + variables_slice = {"": subset} elif isinstance(sep, dict): variables_slice = {prefix: [] for prefix in sep} for col in data_to_decode.columns: + sep_available = False for prefix in sep: if prefix in col: + sep_available = True variables_slice[prefix].append(col) - else: - sep_for_prefix = {} - variables_slice = {} + break + if not sep_available: + raise ValueError( + f"Separator not specified for all columns; " + f"First instance column: {col}" + ) + elif isinstance(sep, str): + variables_slice: dict[str, list] = {} for col in data_to_decode.columns: - ps = [ps for ps in sep if ps in col] - if len(ps) == 0: + prefix = col.split(sep)[0] + if len(prefix) == len(col): raise ValueError( f"Separator not specified for all columns; " - f"First instance column: '{col}'" + f"First instance column: {col}" ) - prefix = col.split(ps[0])[0] - if prefix not in sep_for_prefix: - sep_for_prefix[prefix] = ps[0] if prefix not in variables_slice: variables_slice[prefix] = [col] else: variables_slice[prefix].append(col) - sep = sep_for_prefix + else: + raise TypeError( + f"Expected 'sep' to be of type 'str' or 'dict'; " + f"Received 'sep' of type: {type(sep).__name__}" + ) # validate number of dropped_first def check_len(item, name) -> None: @@ -1260,8 +1265,10 @@ def check_len(item, name) -> None: for prefix, prefix_slice in variables_slice.items(): if sep is None: cats = subset.copy() - else: - cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice] + elif isinstance(sep, str): + cats = [col[len(prefix + sep):] for col in prefix_slice] + elif isinstance(sep, dict): + cats = [col[len(prefix + sep[prefix]):] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( diff 
--git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 8feb77d1a6e08..ef714f7ce79ff 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -160,46 +160,12 @@ def test_from_dummies_with_prefix_prefix_multiple_seperators(): ValueError, match=( r"Separator not specified for all columns; " - r"First instance column: 'col2-a'" + r"First instance column: col2-a" ), ): from_dummies(dummies, sep="_") -def test_from_dummies_with_prefix_prefix_sep_list(): - dummies = DataFrame( - { - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2-a": [0, 1, 0], - "col2-b": [1, 0, 0], - "col2-c": [0, 0, 1], - }, - ) - expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - result = from_dummies(dummies, sep=["_", "-"]) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_prefix_sep_list_incomplete(): - dummies = DataFrame( - { - "col1_a": [1, 0, 1], - "col1_b": [0, 1, 0], - "col2-a": [0, 1, 0], - "col2-b": [1, 0, 1], - }, - ) - with pytest.raises( - ValueError, - match=( - r"Separator not specified for all columns; " - r"First instance column: 'col2-a'" - ), - ): - from_dummies(dummies, sep=["_"]) - - def test_from_dummies_with_prefix_prefix_sep_dict(): dummies = DataFrame( { @@ -221,38 +187,35 @@ def test_from_dummies_with_prefix_prefix_sep_dict(): tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_prefix_separators_too_complex_for_sep_list(): +def test_from_dummies_with_prefix_prefix_partial_sep_dict(): dummies = DataFrame( { "col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], "col2-a_a": [0, 1, 0], - "col2-b_b": [1, 0, 0], - "col2-c_c": [0, 0, 1], + "col2-b_b": [1, 0, 1], }, ) with pytest.raises( ValueError, match=( - r"Dummy DataFrame contains unassigned value\(s\); " - r"First instance in row: 0" + r"Separator not specified for all columns; " + r"First instance column: col2-a_a" ), ): - from_dummies(dummies, sep=["_", "-"]) 
+ from_dummies(dummies, sep={"col1": "_"}) -def test_from_dummies_with_prefix_prefix_partial_sep_dict(): - dummies = DataFrame( - { - "col1_a-a": [1, 0, 1], - "col1_b-b": [0, 1, 0], - "col2-a_a": [0, 1, 0], - "col2-b_b": [1, 0, 1], - }, - ) - expected = DataFrame({"col1": ["a-a", "b-b", "a-a"]}) - result = from_dummies(dummies, sep={"col1": "_"}) - tm.assert_frame_equal(result, expected) +def test_from_dummies_with_prefix_sep_wrong_type(dummies_basic): + + with pytest.raises( + TypeError, + match=( + r"Expected 'sep' to be of type 'str' or 'dict'; " + r"Received 'sep' of type: list" + ), + ): + from_dummies(dummies_basic, sep=["_"]) def test_from_dummies_with_prefix_contains_get_dummies_NaN_column(): From e05fe3f08630fde311ac6c51770971e360360fd4 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:08:34 +0100 Subject: [PATCH 40/95] Remove default category name --- pandas/tests/reshape/test_from_dummies.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index ef714f7ce79ff..60295618fcbd1 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -34,7 +34,7 @@ def dummies_with_unassigned(): def test_from_dummies_no_prefix_string_cats_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) - expected = DataFrame({"categories": ["a", "b", "c", "a"]}) + expected = DataFrame({"": ["a", "b", "c", "a"]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -43,7 +43,7 @@ def test_from_dummies_no_prefix_int_cats_basic(): dummies = DataFrame( {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]} ) - expected = DataFrame({"categories": [1, 25, 2, 5]}, dtype="object") + expected = DataFrame({"": [1, 25, 2, 5]}, dtype="object") result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -52,7 +52,7 @@ def 
test_from_dummies_no_prefix_float_cats_basic(): dummies = DataFrame( {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]} ) - expected = DataFrame({"categories": [1.0, 25.0, 2.5, 5.84]}, dtype="object") + expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object") result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -67,14 +67,14 @@ def test_from_dummies_no_prefix_mixed_cats_basic(): None: [0, 0, 0, 0, 1], } ) - expected = DataFrame({"categories": [1.23, "c", 2, False, None]}, dtype="object") + expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object") result = from_dummies(dummies) tm.assert_frame_equal(result, expected) def test_from_dummies_no_prefix_string_cats_contains_get_dummies_NaN_column(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) - expected = DataFrame({"categories": ["a", "b", "NaN"]}) + expected = DataFrame({"": ["a", "b", "NaN"]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -93,7 +93,7 @@ def test_from_dummies_no_prefix_contains_unassigned(): def test_from_dummies_no_prefix_string_cats_dropped_first(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = DataFrame({"categories": ["a", "b", "c"]}) + expected = DataFrame({"": ["a", "b", "c"]}) result = from_dummies(dummies, dropped_first="c") tm.assert_frame_equal(result, expected) From 23f6c07e6a83999668beee6e2509802b3a18dbd1 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:10:12 +0100 Subject: [PATCH 41/95] Adapt docstring examples to removal of list from sep and dropped_first args --- pandas/core/reshape/reshape.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 99667449d7bfb..8c2db539953ec 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1121,11 +1121,11 @@ def from_dummies( character indicating the separation of 
the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - Alternatively, pass a dictionary to map prefix separators to prefixes if + Alternatively, pass a dictionary to map prefix separators to prefixes if multiple and/or mixed separators are used in the column names. dropped_fist : None, str or dict of str, default None The implied value the dummy takes when all values are zero. - Can be a a single value for all variables or a dict directly mapping the + Can be a a single value for all variables or a dict directly mapping the dropped value to a prefix of a variable. Returns @@ -1162,7 +1162,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... "col2_c": [0, 0, 0]}) - >>> pd.from_dummies(df, sep="_", dropped_first=["d", "e"]) + >>> pd.from_dummies(df, sep="_", dropped_first={"col1": "d", "col2": "e"}) col1 col2 0 a b 1 b a @@ -1266,9 +1266,9 @@ def check_len(item, name) -> None: if sep is None: cats = subset.copy() elif isinstance(sep, str): - cats = [col[len(prefix + sep):] for col in prefix_slice] + cats = [col[len(prefix + sep) :] for col in prefix_slice] elif isinstance(sep, dict): - cats = [col[len(prefix + sep[prefix]):] for col in prefix_slice] + cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( From 7190879b3aabc8098ac4484e2055654c079decc7 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:34:23 +0100 Subject: [PATCH 42/95] Update docstring: Remove default category name --- pandas/core/reshape/reshape.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8c2db539953ec..134fd67f23213 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1143,10 +1143,9 @@ def from_dummies( ... 
"c": [0, 0, 1, 0]}) >>> pd.from_dummies(df) - 0 categories - 1 b - 2 c - 3 a + 0 b + 1 c + 2 a >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], From 012a1dd04f8535e6f75c5929a5f4be0641df22c1 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:34:58 +0100 Subject: [PATCH 43/95] Updaterst: Add missing word --- doc/source/user_guide/reshaping.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 946324e78d857..4fac9226e01d3 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -730,7 +730,7 @@ To choose another dtype, use the ``dtype`` argument: To convert a "dummy" or "indicator" ``DataFrame``, into a categorical ``DataFrame``, for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a -``DataFrame`` which has ``k`` distinct values +``DataFrame`` which has ``k`` distinct values using :func:`~pandas.from_dummies`: .. 
ipython:: python From 52ed9097d0cda0942e9b43e83197cf7120bc08e8 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 12:35:31 +0100 Subject: [PATCH 44/95] Add from_dummies to reshaping api --- pandas/core/reshape/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 58d741c2c6988..6cdaf647fb9fc 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -16,7 +16,10 @@ pivot, pivot_table, ) -from pandas.core.reshape.reshape import get_dummies +from pandas.core.reshape.reshape import ( + from_dummies, + get_dummies, +) from pandas.core.reshape.tile import ( cut, qcut, From 0cf35d8d8511fca4838438c540dd056488d32a1c Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 13:10:03 +0100 Subject: [PATCH 45/95] Add: allow dropped_first to be any hashable type --- pandas/core/reshape/reshape.py | 14 +++++----- pandas/tests/reshape/test_from_dummies.py | 31 ++++++++++++++++++----- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 134fd67f23213..1a1478e50d94d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1101,7 +1101,7 @@ def from_dummies( data: DataFrame, subset: None | Index | list[Hashable] = None, sep: None | str | dict[str, str] = None, - dropped_first: None | str | dict[str, str] = None, + dropped_first: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. @@ -1123,7 +1123,7 @@ def from_dummies( you can strip the underscore by specifying sep='_'. Alternatively, pass a dictionary to map prefix separators to prefixes if multiple and/or mixed separators are used in the column names. 
- dropped_fist : None, str or dict of str, default None + dropped_fist : None, Hashable or dict of Hashables, default None The implied value the dummy takes when all values are zero. Can be a a single value for all variables or a dict directly mapping the dropped value to a prefix of a variable. @@ -1219,7 +1219,7 @@ def from_dummies( f"First instance column: {col}" ) elif isinstance(sep, str): - variables_slice: dict[str, list] = {} + variables_slice = {} for col in data_to_decode.columns: prefix = col.split(sep)[0] if len(prefix) == len(col): @@ -1250,13 +1250,13 @@ def check_len(item, name) -> None: if dropped_first: if isinstance(dropped_first, dict): check_len(dropped_first, "dropped_first") - elif isinstance(dropped_first, str): + elif isinstance(dropped_first, Hashable): dropped_first = dict( zip(variables_slice, [dropped_first] * len(variables_slice)) ) else: raise TypeError( - f"Expected 'dropped_first' to be of type 'str' or 'dict'; " + f"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; " f"Received 'dropped_first' of type: {type(dropped_first).__name__}" ) @@ -1264,10 +1264,10 @@ def check_len(item, name) -> None: for prefix, prefix_slice in variables_slice.items(): if sep is None: cats = subset.copy() - elif isinstance(sep, str): - cats = [col[len(prefix + sep) :] for col in prefix_slice] elif isinstance(sep, dict): cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice] + else: + cats = [col[len(prefix + sep) :] for col in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 60295618fcbd1..4f1509870a5f5 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -103,8 +103,8 @@ def test_from_dummies_no_prefix_wrong_dropped_first_type(): with pytest.raises( TypeError, match=( - r"Expected \'dropped_first\' to be of type \'str\' or 
\'dict\'; " - r"Received \'dropped_first\' of type: list" + r"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; " + r"Received 'dropped_first' of type: list" ), ): from_dummies(dummies, dropped_first=["c", "d"]) @@ -263,15 +263,14 @@ def test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned): def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassigned): - with pytest.raises( TypeError, match=( - r"Expected 'dropped_first' to be of type 'str' or 'dict'; " - r"Received 'dropped_first' of type: tuple" + r"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; " + r"Received 'dropped_first' of type: list" ), ): - from_dummies(dummies_with_unassigned, sep="_", dropped_first=("x", "y")) + from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x", "y"]) def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned): @@ -284,6 +283,26 @@ def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned): tm.assert_frame_equal(result, expected) +def test_from_dummies_with_prefix_dropped_first_int_and_float(dummies_with_unassigned): + expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) + result = from_dummies( + dummies_with_unassigned, + sep="_", + dropped_first={"col2": 1, "col1": 2.5}, + ) + tm.assert_frame_equal(result, expected) + + +def test_from_dummies_with_prefix_dropped_first_bool_and_none(dummies_with_unassigned): + expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) + result = from_dummies( + dummies_with_unassigned, + sep="_", + dropped_first={"col2": None, "col1": False}, + ) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_with_prefix_dropped_first_dict_not_complete( dummies_with_unassigned, ): From b9303bc08db44d7d5b523a51c60bcc5e6cbc86cf Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 20 Nov 2021 13:33:46 +0100 Subject: [PATCH 46/95] Add: Temporary mypy fix --- pandas/core/reshape/reshape.py | 10 ++++++++-- 
1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1a1478e50d94d..e85fb0be1a279 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1265,9 +1265,15 @@ def check_len(item, name) -> None: if sep is None: cats = subset.copy() elif isinstance(sep, dict): - cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice] + cats = [ + col[len(prefix + sep[prefix]) :] + for col in prefix_slice + if isinstance(col, str) + ] else: - cats = [col[len(prefix + sep) :] for col in prefix_slice] + cats = [ + col[len(prefix + sep) :] for col in prefix_slice if isinstance(col, str) + ] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( From 55ad27446010c6b1d301c1b41197ec938d9f5786 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 27 Nov 2021 12:56:08 +0100 Subject: [PATCH 47/95] Add from_dummies to pandas __init__ file --- doc/source/reference/general_functions.rst | 1 + pandas/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst index b5832cb8aa591..35c16eb60fb26 100644 --- a/doc/source/reference/general_functions.rst +++ b/doc/source/reference/general_functions.rst @@ -23,6 +23,7 @@ Data manipulations merge_asof concat get_dummies + from_dummies factorize unique wide_to_long diff --git a/pandas/__init__.py b/pandas/__init__.py index 9505d0481ee19..7278dc9c5b66d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -127,6 +127,7 @@ pivot, pivot_table, get_dummies, + from_dummies, cut, qcut, ) @@ -361,6 +362,7 @@ def __getattr__(name): "eval", "factorize", "get_dummies", + "from_dummies", "get_option", "infer_freq", "interval_range", From 1b17815cf9d94e2c6545227fd39efc65dd20654d Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 27 Nov 2021 16:43:24 +0100 Subject: [PATCH 48/95] Add from_dummies to test_api tests --- 
pandas/tests/api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ec20bc49c8a4b..989d61fce925b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -117,6 +117,7 @@ class TestPDApi(Base): "eval", "factorize", "get_dummies", + "from_dummies", "infer_freq", "isna", "isnull", From 00c7b05f79be357028c29f75232e393fdb6cd5bf Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 27 Nov 2021 17:50:58 +0100 Subject: [PATCH 49/95] Fix docstring examples --- pandas/core/reshape/reshape.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e85fb0be1a279..28fe998dd4d4f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1143,9 +1143,10 @@ def from_dummies( ... "c": [0, 0, 1, 0]}) >>> pd.from_dummies(df) - 0 b - 1 c - 2 a + 0 a + 1 b + 2 c + 3 a >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], From 07ba5361b440c193e4509f1424b9e25a253cab4b Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 27 Nov 2021 17:51:56 +0100 Subject: [PATCH 50/95] Adapt documentation to account for removal of list arguments --- doc/source/user_guide/reshaping.rst | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 4fac9226e01d3..4b7cf071e3071 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -765,13 +765,9 @@ from the prefixes. For example, if your column names are ``prefix_A`` and ``pref you can strip the underscore by specifying ``sep='_'``. You can pass values for the ``sep`` argument depending on how many or nested prefix separators are used in the column names. 
-If a ``list`` of separators is passed ``from_dummies`` will separate based on the -first encountered separator following the order of the list. * string: Use the same value for ``prefix_sep`` for each column to be dencoded. -* list: Variables will be decoded by the first prefix separator in the - passed list that is encountered in the column name. * dict: Directly map prefix separators to prefixes. Can be used in case multiple separation characters are used to separata the prefixes as well as in the variable names themself. @@ -784,20 +780,6 @@ first encountered separator following the order of the list. simple = pd.from_dummies(df_simple, sep="_") simple - df_multi = pd.DataFrame( - { - "prefix1_a": [0, 1, 0], - "prefix1_b": [1, 0, 1], - "prefix2-a": [0, 1, 0], - "prefix2-b": [1, 0, 0], - "prefix2-c": [0, 0, 1], - } - ) - df_multi - - from_list = pd.from_dummies(df_multi, sep=["_", "-"]) - from_list - df_complex = pd.DataFrame( { "prefix1_a-a": [1, 0, 1], From bbe41d03d968fea704be2fd1b49ff53944ad9733 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 28 Nov 2021 00:55:33 +0100 Subject: [PATCH 51/95] Fix wrong parenthesis in docstring --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 28fe998dd4d4f..7261f3894f534 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1162,7 +1162,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... 
"col2_c": [0, 0, 0]}) - >>> pd.from_dummies(df, sep="_", dropped_first={"col1": "d", "col2": "e"]) + >>> pd.from_dummies(df, sep="_", dropped_first={"col1": "d", "col2": "e"}) col1 col2 0 a b 1 b a From 329394bfef35561b8a34b7a58e093cba4c7f2714 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 28 Nov 2021 01:58:51 +0100 Subject: [PATCH 52/95] Fix docstring example expected return --- pandas/core/reshape/reshape.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7261f3894f534..551a5a1ff5aa3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1174,9 +1174,9 @@ def from_dummies( >>> pd.from_dummies(df, sep={"col1": "_", "col2": "-"}) col1 col2 - 0 a-a b-b - 1 b-b a-a - 2 a-a c-c + 0 a-a b_b + 1 b-b a_a + 2 a-a c_c """ from pandas.core.reshape.concat import concat From b83ac6a860ce1f06f7759bb34f6acb8aadd6d85a Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 29 Nov 2021 23:47:02 +0100 Subject: [PATCH 53/95] Simplify from_dummies --- pandas/core/reshape/reshape.py | 71 ++----------- pandas/tests/reshape/test_from_dummies.py | 120 +++------------------- 2 files changed, 24 insertions(+), 167 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 551a5a1ff5aa3..7211749873b9a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1099,8 +1099,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, - subset: None | Index | list[Hashable] = None, - sep: None | str | dict[str, str] = None, + sep: None | str = None, dropped_first: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ @@ -1112,17 +1111,11 @@ def from_dummies( ---------- data : `DataFrame` Data which contains dummy-coded variables. - subset : None, Index, or list of Hashables, default 'None' - The columns which to convert from dummy-encoding and return as categorical - `DataFrame`. 
If `columns` is None then all dummy columns are converted and - appended to the non-dummy columns. - sep : str or dict of str, default '_' + sep : str, default None Separator used in the column names of the dummy categories they are character indicating the separation of the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - Alternatively, pass a dictionary to map prefix separators to prefixes if - multiple and/or mixed separators are used in the column names. dropped_fist : None, Hashable or dict of Hashables, default None The implied value the dummy takes when all values are zero. Can be a a single value for all variables or a dict directly mapping the @@ -1167,30 +1160,10 @@ def from_dummies( 0 a b 1 b a 2 d e - - >>> df = pd.DataFrame({"col1_a-a": [1, 0, 1], "col1_b-b": [0, 1, 0], - ... "col2-a_a": [0, 1, 0], "col2-b_b": [1, 0, 0], - ... "col2-c_c": [0, 0, 1]}) - - >>> pd.from_dummies(df, sep={"col1": "_", "col2": "-"}) - col1 col2 - 0 a-a b_b - 1 b-b a_a - 2 a-a c_c """ from pandas.core.reshape.concat import concat - if subset is None: - subset = list(data.columns) - elif isinstance(subset, Index): - subset = list(subset) - elif not isinstance(subset, list): - raise TypeError( - f"Expected 'subset' to be of type 'Index', or 'list'; " - f"Received 'subset' of type: {type(subset).__name__}" - ) - - if data[subset].isna().any().any(): + if data.isna().any().any(): raise ValueError( f"Dummy DataFrame contains NA value in column: " f"'{data.columns[data.isna().any().argmax()]}'" @@ -1198,43 +1171,26 @@ def from_dummies( # index data with a list of all columns that are dummies try: - data_to_decode = data[subset].astype("boolean") + data_to_decode = data.astype("boolean") except TypeError: raise TypeError("Passed DataFrame contains non-dummy data") - # get separator for each prefix and lists to slice data for each prefix + # collect prefixes and get lists to 
slice data for each prefix if sep is None: - variables_slice = {"": subset} - elif isinstance(sep, dict): - variables_slice = {prefix: [] for prefix in sep} - for col in data_to_decode.columns: - sep_available = False - for prefix in sep: - if prefix in col: - sep_available = True - variables_slice[prefix].append(col) - break - if not sep_available: - raise ValueError( - f"Separator not specified for all columns; " - f"First instance column: {col}" - ) + variables_slice = {"": list(data.columns)} elif isinstance(sep, str): variables_slice = {} for col in data_to_decode.columns: prefix = col.split(sep)[0] if len(prefix) == len(col): - raise ValueError( - f"Separator not specified for all columns; " - f"First instance column: {col}" - ) + raise ValueError(f"Separator not specified for column: {col}") if prefix not in variables_slice: variables_slice[prefix] = [col] else: variables_slice[prefix].append(col) else: raise TypeError( - f"Expected 'sep' to be of type 'str' or 'dict'; " + f"Expected 'sep' to be of type 'str' or 'None'; " f"Received 'sep' of type: {type(sep).__name__}" ) @@ -1257,20 +1213,15 @@ def check_len(item, name) -> None: ) else: raise TypeError( - f"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; " + f"Expected 'dropped_first' to be of type " + f"'None', 'Hashable', or 'dict'; " f"Received 'dropped_first' of type: {type(dropped_first).__name__}" ) cat_data = {} for prefix, prefix_slice in variables_slice.items(): if sep is None: - cats = subset.copy() - elif isinstance(sep, dict): - cats = [ - col[len(prefix + sep[prefix]) :] - for col in prefix_slice - if isinstance(col, str) - ] + cats = prefix_slice.copy() else: cats = [ col[len(prefix + sep) :] for col in prefix_slice if isinstance(col, str) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 4f1509870a5f5..d0fa45383783d 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -103,7 
+103,7 @@ def test_from_dummies_no_prefix_wrong_dropped_first_type(): with pytest.raises( TypeError, match=( - r"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; " + r"Expected 'dropped_first' to be of type 'None', 'Hashable', or 'dict'; " r"Received 'dropped_first' of type: list" ), ): @@ -130,7 +130,7 @@ def test_from_dummies_no_prefix_contains_nan(): from_dummies(dummies) -def test_from_dummies_no_dummies(): +def test_from_dummies_contains_non_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} ) @@ -138,7 +138,7 @@ def test_from_dummies_no_dummies(): TypeError, match=r"Passed DataFrame contains non-dummy data", ): - from_dummies(dummies, sep="_") + from_dummies(dummies) def test_from_dummies_with_prefix_basic(dummies_basic): @@ -147,7 +147,7 @@ def test_from_dummies_with_prefix_basic(dummies_basic): tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_prefix_multiple_seperators(): +def test_from_dummies_with_prefix_multiple_seperators(): dummies = DataFrame( { "col1_a": [1, 0, 1], @@ -158,60 +158,17 @@ def test_from_dummies_with_prefix_prefix_multiple_seperators(): ) with pytest.raises( ValueError, - match=( - r"Separator not specified for all columns; " - r"First instance column: col2-a" - ), + match=(r"Separator not specified for column: col2-a"), ): from_dummies(dummies, sep="_") -def test_from_dummies_with_prefix_prefix_sep_dict(): - dummies = DataFrame( - { - "col1_a-a": [1, 0, 1], - "col1_b-b": [0, 1, 0], - "col_2-a": [0, 1, 0], - "col_2-b": [1, 0, 0], - "col_2-c": [0, 0, 1], - }, - ) - expected = DataFrame({"col1": ["a-a", "b-b", "a-a"], "col_2": ["b", "a", "c"]}) - result = from_dummies( - dummies, - sep={ - "col1": "_", - "col_2": "-", - }, - ) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_prefix_partial_sep_dict(): - dummies = DataFrame( - { - "col1_a-a": [1, 0, 1], - "col1_b-b": [0, 1, 0], - "col2-a_a": [0, 1, 0], - "col2-b_b": [1, 
0, 1], - }, - ) - with pytest.raises( - ValueError, - match=( - r"Separator not specified for all columns; " - r"First instance column: col2-a_a" - ), - ): - from_dummies(dummies, sep={"col1": "_"}) - - def test_from_dummies_with_prefix_sep_wrong_type(dummies_basic): with pytest.raises( TypeError, match=( - r"Expected 'sep' to be of type 'str' or 'dict'; " + r"Expected 'sep' to be of type 'str' or 'None'; " r"Received 'sep' of type: list" ), ): @@ -246,16 +203,6 @@ def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, sep="_") -def test_from_dummies_with_prefix_subset(dummies_basic): - expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - result = from_dummies( - dummies_basic, - subset=["col1_a", "col1_b", "col2_a", "col2_b", "col2_c"], - sep="_", - ) - tm.assert_frame_equal(result, expected) - - def test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) result = from_dummies(dummies_with_unassigned, sep="_", dropped_first="x") @@ -266,23 +213,13 @@ def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassign with pytest.raises( TypeError, match=( - r"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; " + r"Expected 'dropped_first' to be of type 'None', 'Hashable', or 'dict'; " r"Received 'dropped_first' of type: list" ), ): from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x", "y"]) -def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned): - expected = DataFrame({"col1": ["a", "b", "y"], "col2": ["x", "a", "c"]}) - result = from_dummies( - dummies_with_unassigned, - sep="_", - dropped_first={"col2": "x", "col1": "y"}, - ) - tm.assert_frame_equal(result, expected) - - def test_from_dummies_with_prefix_dropped_first_int_and_float(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) 
result = from_dummies( @@ -316,21 +253,6 @@ def test_from_dummies_with_prefix_dropped_first_dict_not_complete( from_dummies(dummies_with_unassigned, sep="_", dropped_first={"col1": "x"}) -def test_from_dummies_with_prefix_wrong_subset_type(dummies_basic): - with pytest.raises( - TypeError, - match=( - r"Expected 'subset' to be of type 'Index', or 'list'; " - r"Received 'subset' of type: str" - ), - ): - from_dummies( - dummies_basic, - subset="col1_a", - sep="_", - ) - - def test_from_dummies_with_prefix_contains_nan(dummies_basic): dummies_basic["col2_c"][2] = np.nan with pytest.raises( @@ -339,6 +261,12 @@ def test_from_dummies_with_prefix_contains_nan(dummies_basic): from_dummies(dummies_basic, sep="_") +def test_from_dummies_with_prefix_contains_non_dummies(dummies_basic): + dummies_basic["col2_c"][2] = "str" + with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"): + from_dummies(dummies_basic, sep="_") + + def test_from_dummies_with_prefix_double_assignment(): dummies = DataFrame( { @@ -357,25 +285,3 @@ def test_from_dummies_with_prefix_double_assignment(): ), ): from_dummies(dummies, sep="_") - - -def test_from_dummies_collate_prefix_sep_and_dropped_first_dict(): - dummies = DataFrame( - { - "col1_a-a": [1, 0, 0], - "col1_b-b": [0, 1, 0], - "col2-a_a": [0, 1, 0], - "col2-b_b": [0, 0, 0], - "col2-c_c": [0, 0, 1], - }, - ) - expected = DataFrame({"col1": ["a-a", "b-b", "x"], "col2": ["y", "a_a", "c_c"]}) - result = from_dummies( - dummies, - sep={ - "col1": "_", - "col2": "-", - }, - dropped_first={"col1": "x", "col2": "y"}, - ) - tm.assert_frame_equal(result, expected) From 1f5e1dce9a8d891af4add1cdc34d0ab6891c289d Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 30 Nov 2021 00:04:00 +0100 Subject: [PATCH 54/95] Update user guide entry --- doc/source/user_guide/reshaping.rst | 52 +++-------------------------- 1 file changed, 4 insertions(+), 48 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst 
b/doc/source/user_guide/reshaping.rst index 4b7cf071e3071..b07d3f02b395f 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -740,60 +740,16 @@ for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a pd.from_dummies(df, sep="_") -The ``k`` distinct values can also be represented be a ``dropped_first`` which -means that no value assigned implies a the value of the dropped value: +Dummy coded data only requires ``k - 1`` values to be included, in this case +the ``k`` th value implied by not being assigned any of the other ``k - 1`` values +can be passed via ``implied_value``. .. ipython:: python df = pd.DataFrame({"prefix_a": [0, 1, 0]}) df - pd.from_dummies(df, sep="_", dropped_first="b") - -The ``subset`` argument controls which columns of the input ```DataFrame`` to consider -for the decoding: - -.. ipython:: python - - df = pd.DataFrame({"C": [1, 2, 3], "prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) - df - - pd.from_dummies(df, subset=["prefix_a", "prefix_b"], sep="_") - -``sep`` is (or are) character(s) indicating the separation of the categorical names -from the prefixes. For example, if your column names are ``prefix_A`` and ``prefix_B``, -you can strip the underscore by specifying ``sep='_'``. -You can pass values for the ``sep`` argument depending on how many or -nested prefix separators are used in the column names. - -* string: Use the same value for ``prefix_sep`` for each column - to be dencoded. -* dict: Directly map prefix separators to prefixes. Can be used in case multiple - separation characters are used to separata the prefixes as well as in the - variable names themself. - -.. 
ipython:: python - - df_simple = pd.DataFrame({"prefix_a": [0, 1, 0], "prefix_b": [1, 0, 1]}) - df_simple - - simple = pd.from_dummies(df_simple, sep="_") - simple - - df_complex = pd.DataFrame( - { - "prefix1_a-a": [1, 0, 1], - "prefix1_b-b": [0, 1, 0], - "prefix_2-a": [0, 1, 0], - "prefix_2-b": [1, 0, 0], - "prefix_2-c": [0, 0, 1], - } - ) - df_complex - - from_dict = pd.from_dummies(df_complex, sep={"prefix1": "_", "prefix_2": "-"}) - from_dict - + pd.from_dummies(df, sep="_", implied_value="b") .. _reshaping.factorize: From 8a3421b5ff569f4a4d4bf4de6fc3169cc4599f71 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 30 Nov 2021 00:04:19 +0100 Subject: [PATCH 55/95] Change arg dropped_first to implied_value --- pandas/core/reshape/reshape.py | 28 ++++++++--------- pandas/tests/reshape/test_from_dummies.py | 38 +++++++++++------------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7211749873b9a..ebaaff380c94b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1100,7 +1100,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, sep: None | str = None, - dropped_first: None | Hashable | dict[str, Hashable] = None, + implied_value: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. @@ -1116,7 +1116,7 @@ def from_dummies( character indicating the separation of the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - dropped_fist : None, Hashable or dict of Hashables, default None + implied_value : None, Hashable or dict of Hashables, default None The implied value the dummy takes when all values are zero. Can be a a single value for all variables or a dict directly mapping the dropped value to a prefix of a variable. 
@@ -1155,7 +1155,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... "col2_c": [0, 0, 0]}) - >>> pd.from_dummies(df, sep="_", dropped_first={"col1": "d", "col2": "e"}) + >>> pd.from_dummies(df, sep="_", implied_value={"col1": "d", "col2": "e"}) col1 col2 0 a b 1 b a @@ -1194,7 +1194,7 @@ def from_dummies( f"Received 'sep' of type: {type(sep).__name__}" ) - # validate number of dropped_first + # validate number of implied_value def check_len(item, name) -> None: if not len(item) == len(variables_slice): len_msg = ( @@ -1204,18 +1204,18 @@ def check_len(item, name) -> None: ) raise ValueError(len_msg) - if dropped_first: - if isinstance(dropped_first, dict): - check_len(dropped_first, "dropped_first") - elif isinstance(dropped_first, Hashable): - dropped_first = dict( - zip(variables_slice, [dropped_first] * len(variables_slice)) + if implied_value: + if isinstance(implied_value, dict): + check_len(implied_value, "implied_value") + elif isinstance(implied_value, Hashable): + implied_value = dict( + zip(variables_slice, [implied_value] * len(variables_slice)) ) else: raise TypeError( - f"Expected 'dropped_first' to be of type " + f"Expected 'implied_value' to be of type " f"'None', 'Hashable', or 'dict'; " - f"Received 'dropped_first' of type: {type(dropped_first).__name__}" + f"Received 'implied_value' of type: {type(implied_value).__name__}" ) cat_data = {} @@ -1233,8 +1233,8 @@ def check_len(item, name) -> None: f"First instance in row: {assigned.argmax()}" ) elif any(assigned == 0): - if isinstance(dropped_first, dict): - cats.append(dropped_first[prefix]) + if isinstance(implied_value, dict): + cats.append(implied_value[prefix]) else: raise ValueError( f"Dummy DataFrame contains unassigned value(s); " diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index d0fa45383783d..f3ed0b28ea86e 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ 
-91,23 +91,23 @@ def test_from_dummies_no_prefix_contains_unassigned(): from_dummies(dummies) -def test_from_dummies_no_prefix_string_cats_dropped_first(): +def test_from_dummies_no_prefix_string_cats_implied_value(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = DataFrame({"": ["a", "b", "c"]}) - result = from_dummies(dummies, dropped_first="c") + result = from_dummies(dummies, implied_value="c") tm.assert_frame_equal(result, expected) -def test_from_dummies_no_prefix_wrong_dropped_first_type(): +def test_from_dummies_no_prefix_wrong_implied_value_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( TypeError, match=( - r"Expected 'dropped_first' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'dropped_first' of type: list" + r"Expected 'implied_value' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'implied_value' of type: list" ), ): - from_dummies(dummies, dropped_first=["c", "d"]) + from_dummies(dummies, implied_value=["c", "d"]) def test_from_dummies_no_prefix_multi_assignment(): @@ -203,54 +203,54 @@ def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, sep="_") -def test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_value_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", dropped_first="x") + result = from_dummies(dummies_with_unassigned, sep="_", implied_value="x") tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_value_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, match=( - r"Expected 'dropped_first' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'dropped_first' of type: list" + r"Expected 
'implied_value' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'implied_value' of type: list" ), ): - from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x", "y"]) + from_dummies(dummies_with_unassigned, sep="_", implied_value=["x", "y"]) -def test_from_dummies_with_prefix_dropped_first_int_and_float(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_value_int_and_float(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - dropped_first={"col2": 1, "col1": 2.5}, + implied_value={"col2": 1, "col1": 2.5}, ) tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_dropped_first_bool_and_none(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_value_bool_and_none(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - dropped_first={"col2": None, "col1": False}, + implied_value={"col2": None, "col1": False}, ) tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_dropped_first_dict_not_complete( +def test_from_dummies_with_prefix_implied_value_dict_not_complete( dummies_with_unassigned, ): with pytest.raises( ValueError, match=( - r"Length of 'dropped_first' \(1\) did not match " + r"Length of 'implied_value' \(1\) did not match " r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, sep="_", dropped_first={"col1": "x"}) + from_dummies(dummies_with_unassigned, sep="_", implied_value={"col1": "x"}) def test_from_dummies_with_prefix_contains_nan(dummies_basic): From 16cdaa00e40da13fcd8e7e107718c1f4c3b164e5 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 30 Nov 2021 00:23:24 +0100 Subject: [PATCH 56/95] Add dosctring note and test for boolean dummy values --- pandas/core/reshape/reshape.py | 5 +++++ 
pandas/tests/reshape/test_from_dummies.py | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index ebaaff380c94b..c53e4b1c8c7ff 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1126,6 +1126,11 @@ def from_dummies( `DataFrame` Categorical data decoded from the dummy input-data. + Notes + ----- + The columns of the passed dummy data should only include 1's and 0's, + or boolean values. + See Also -------- get_dummies : Convert `Series` or `DataFrame` to dummy codes. diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index f3ed0b28ea86e..24089927ebc2b 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -39,6 +39,28 @@ def test_from_dummies_no_prefix_string_cats_basic(): tm.assert_frame_equal(result, expected) +def test_from_dummies_no_prefix_string_cats_basic_bool_values(): + dummies = DataFrame( + { + "a": [True, False, False, True], + "b": [False, True, False, False], + "c": [False, False, True, False], + } + ) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_from_dummies_no_prefix_string_cats_basic_mixed_bool_values(): + dummies = DataFrame( + {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]} + ) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + def test_from_dummies_no_prefix_int_cats_basic(): dummies = DataFrame( {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]} From 174df1f5713ac0c38780e6bc31e039af7730c9e4 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 30 Nov 2021 00:41:07 +0100 Subject: [PATCH 57/95] Fix docstring typo --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c53e4b1c8c7ff..0105c504b4a6f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1119,7 +1119,7 @@ def from_dummies( implied_value : None, Hashable or dict of Hashables, default None The implied value the dummy takes when all values are zero. Can be a a single value for all variables or a dict directly mapping the - dropped value to a prefix of a variable. + implied values to a prefix of a variable. Returns ------- From e45d3f880afea3a8aefe39cf09c2a3ae8681a4df Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 30 Nov 2021 00:52:39 +0100 Subject: [PATCH 58/95] Change arg implied_value to implied_category --- doc/source/user_guide/reshaping.rst | 8 ++--- pandas/core/reshape/reshape.py | 33 +++++++++--------- pandas/tests/reshape/test_from_dummies.py | 42 +++++++++++++---------- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b07d3f02b395f..393fedb39bd73 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -740,16 +740,16 @@ for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a pd.from_dummies(df, sep="_") -Dummy coded data only requires ``k - 1`` values to be included, in this case -the ``k`` th value implied by not being assigned any of the other ``k - 1`` values -can be passed via ``implied_value``. +Dummy coded data only requires ``k - 1`` categories to be included, in this case +the ``k`` th categories, implied by not being assigned any of the other ``k - 1`` +categories, can be passed via ``implied_category``. .. ipython:: python df = pd.DataFrame({"prefix_a": [0, 1, 0]}) df - pd.from_dummies(df, sep="_", implied_value="b") + pd.from_dummies(df, sep="_", implied_category="b") .. 
_reshaping.factorize: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0105c504b4a6f..b824dd2d6a25d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1100,7 +1100,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, sep: None | str = None, - implied_value: None | Hashable | dict[str, Hashable] = None, + implied_category: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. @@ -1116,10 +1116,10 @@ def from_dummies( character indicating the separation of the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - implied_value : None, Hashable or dict of Hashables, default None - The implied value the dummy takes when all values are zero. + implied_category : None, Hashable or dict of Hashables, default None + The implied category the dummy takes when all values are zero. Can be a a single value for all variables or a dict directly mapping the - implied values to a prefix of a variable. + implied categories to a prefix of a variable. Returns ------- @@ -1160,7 +1160,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... 
"col2_c": [0, 0, 0]}) - >>> pd.from_dummies(df, sep="_", implied_value={"col1": "d", "col2": "e"}) + >>> pd.from_dummies(df, sep="_", implied_category={"col1": "d", "col2": "e"}) col1 col2 0 a b 1 b a @@ -1199,7 +1199,7 @@ def from_dummies( f"Received 'sep' of type: {type(sep).__name__}" ) - # validate number of implied_value + # validate number of implied_category def check_len(item, name) -> None: if not len(item) == len(variables_slice): len_msg = ( @@ -1209,18 +1209,19 @@ def check_len(item, name) -> None: ) raise ValueError(len_msg) - if implied_value: - if isinstance(implied_value, dict): - check_len(implied_value, "implied_value") - elif isinstance(implied_value, Hashable): - implied_value = dict( - zip(variables_slice, [implied_value] * len(variables_slice)) + if implied_category: + if isinstance(implied_category, dict): + check_len(implied_category, "implied_category") + elif isinstance(implied_category, Hashable): + implied_category = dict( + zip(variables_slice, [implied_category] * len(variables_slice)) ) else: raise TypeError( - f"Expected 'implied_value' to be of type " + f"Expected 'implied_category' to be of type " f"'None', 'Hashable', or 'dict'; " - f"Received 'implied_value' of type: {type(implied_value).__name__}" + f"Received 'implied_category' of type: " + f"{type(implied_category).__name__}" ) cat_data = {} @@ -1238,8 +1239,8 @@ def check_len(item, name) -> None: f"First instance in row: {assigned.argmax()}" ) elif any(assigned == 0): - if isinstance(implied_value, dict): - cats.append(implied_value[prefix]) + if isinstance(implied_category, dict): + cats.append(implied_category[prefix]) else: raise ValueError( f"Dummy DataFrame contains unassigned value(s); " diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 24089927ebc2b..f5defebede6d0 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -113,23 +113,23 @@ def 
test_from_dummies_no_prefix_contains_unassigned(): from_dummies(dummies) -def test_from_dummies_no_prefix_string_cats_implied_value(): +def test_from_dummies_no_prefix_string_cats_implied_category(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = DataFrame({"": ["a", "b", "c"]}) - result = from_dummies(dummies, implied_value="c") + result = from_dummies(dummies, implied_category="c") tm.assert_frame_equal(result, expected) -def test_from_dummies_no_prefix_wrong_implied_value_type(): +def test_from_dummies_no_prefix_wrong_implied_category_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( TypeError, match=( - r"Expected 'implied_value' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'implied_value' of type: list" + r"Expected 'implied_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'implied_category' of type: list" ), ): - from_dummies(dummies, implied_value=["c", "d"]) + from_dummies(dummies, implied_category=["c", "d"]) def test_from_dummies_no_prefix_multi_assignment(): @@ -225,54 +225,58 @@ def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, sep="_") -def test_from_dummies_with_prefix_implied_value_str(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_category_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", implied_value="x") + result = from_dummies(dummies_with_unassigned, sep="_", implied_category="x") tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_implied_value_wrong_type(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_category_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, match=( - r"Expected 'implied_value' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'implied_value' of type: list" + r"Expected 
'implied_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'implied_category' of type: list" ), ): - from_dummies(dummies_with_unassigned, sep="_", implied_value=["x", "y"]) + from_dummies(dummies_with_unassigned, sep="_", implied_category=["x", "y"]) -def test_from_dummies_with_prefix_implied_value_int_and_float(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_category_int_and_float( + dummies_with_unassigned, +): expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - implied_value={"col2": 1, "col1": 2.5}, + implied_category={"col2": 1, "col1": 2.5}, ) tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_implied_value_bool_and_none(dummies_with_unassigned): +def test_from_dummies_with_prefix_implied_category_bool_and_none( + dummies_with_unassigned, +): expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - implied_value={"col2": None, "col1": False}, + implied_category={"col2": None, "col1": False}, ) tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_implied_value_dict_not_complete( +def test_from_dummies_with_prefix_implied_category_dict_not_complete( dummies_with_unassigned, ): with pytest.raises( ValueError, match=( - r"Length of 'implied_value' \(1\) did not match " + r"Length of 'implied_category' \(1\) did not match " r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, sep="_", implied_value={"col1": "x"}) + from_dummies(dummies_with_unassigned, sep="_", implied_category={"col1": "x"}) def test_from_dummies_with_prefix_contains_nan(dummies_basic): From e83faedb55206112749759c80db650329de05d56 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 4 Dec 2021 12:27:21 +0100 Subject: [PATCH 59/95] Fix docstring format mistakes --- pandas/core/reshape/reshape.py | 10 +++++----- 1 file 
changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b824dd2d6a25d..18c2aa67cee9a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1109,7 +1109,7 @@ def from_dummies( Parameters ---------- - data : `DataFrame` + data : DataFrame Data which contains dummy-coded variables. sep : str, default None Separator used in the column names of the dummy categories they are @@ -1123,7 +1123,7 @@ def from_dummies( Returns ------- - `DataFrame` + DataFrame Categorical data decoded from the dummy input-data. Notes @@ -1138,7 +1138,7 @@ def from_dummies( Examples -------- >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], - ... "c": [0, 0, 1, 0]}) + ... "c": [0, 0, 1, 0]}) >>> pd.from_dummies(df) 0 a @@ -1147,8 +1147,8 @@ def from_dummies( 3 a >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], - ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - ... "col2_c": [0, 0, 1]}) + ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + ... 
"col2_c": [0, 0, 1]}) >>> pd.from_dummies(df, sep="_") col1 col2 From 1e12e6a17752113c875752dad9eb39b66865bc9f Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 4 Dec 2021 12:36:37 +0100 Subject: [PATCH 60/95] Replace argmax/min with idxmax/min --- pandas/core/reshape/reshape.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 18c2aa67cee9a..fa2bace88f46b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1171,7 +1171,7 @@ def from_dummies( if data.isna().any().any(): raise ValueError( f"Dummy DataFrame contains NA value in column: " - f"'{data.columns[data.isna().any().argmax()]}'" + f"'{data.isna().any().idxmax()}'" ) # index data with a list of all columns that are dummies @@ -1236,7 +1236,7 @@ def check_len(item, name) -> None: if any(assigned > 1): raise ValueError( f"Dummy DataFrame contains multi-assignment(s); " - f"First instance in row: {assigned.argmax()}" + f"First instance in row: {assigned.idxmax()}" ) elif any(assigned == 0): if isinstance(implied_category, dict): @@ -1244,7 +1244,7 @@ def check_len(item, name) -> None: else: raise ValueError( f"Dummy DataFrame contains unassigned value(s); " - f"First instance in row: {assigned.argmin()}" + f"First instance in row: {assigned.idxmin()}" ) data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) else: From 24e98997f7f01d18bf4067d4dac0f559a2b4c882 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 4 Dec 2021 12:48:01 +0100 Subject: [PATCH 61/95] Reduce complexity by using defaultdict --- pandas/core/reshape/reshape.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fa2bace88f46b..d1f8b6ce19a47 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import defaultdict import 
itertools from typing import ( TYPE_CHECKING, @@ -1181,18 +1182,15 @@ def from_dummies( raise TypeError("Passed DataFrame contains non-dummy data") # collect prefixes and get lists to slice data for each prefix + variables_slice = defaultdict(list) if sep is None: - variables_slice = {"": list(data.columns)} + variables_slice[""] = list(data.columns) elif isinstance(sep, str): - variables_slice = {} for col in data_to_decode.columns: prefix = col.split(sep)[0] if len(prefix) == len(col): raise ValueError(f"Separator not specified for column: {col}") - if prefix not in variables_slice: - variables_slice[prefix] = [col] - else: - variables_slice[prefix].append(col) + variables_slice[prefix].append(col) else: raise TypeError( f"Expected 'sep' to be of type 'str' or 'None'; " From 0ac8fff7826dbcc0c79465fabf11f7c6a3808af8 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 16 Dec 2021 01:22:25 +0100 Subject: [PATCH 62/95] Ignore dependency based mypy errors --- pandas/core/reshape/reshape.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 055c4a628a985..9c329ef35e4eb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1171,9 +1171,11 @@ def from_dummies( """ from pandas.core.reshape.concat import concat - if data.isna().any().any(): + # error: Item "bool" of "Union[Series, bool]" has no attribute "any" + if data.isna().any().any(): # type: ignore[union-attr] + # error: Item "bool" of "Union[Series, bool]" has no attribute "idxmax" raise ValueError( - f"Dummy DataFrame contains NA value in column: " + f"Dummy DataFrame contains NA value in column: " # type: ignore[union-attr] f"'{data.isna().any().idxmax()}'" ) @@ -1229,9 +1231,7 @@ def check_len(item, name) -> None: if sep is None: cats = prefix_slice.copy() else: - cats = [ - col[len(prefix + sep) :] for col in prefix_slice if isinstance(col, str) - ] + cats = [col[len(prefix + sep) :] for col 
in prefix_slice] assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( From 54fdcbd91ea5fbccbac7d361f264a60be5e0ac01 Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 29 Dec 2021 14:59:45 +0100 Subject: [PATCH 63/95] Add Raises section to docstring --- pandas/core/reshape/reshape.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9c329ef35e4eb..e9a1a297e7d76 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1108,7 +1108,7 @@ def from_dummies( """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. - Inverts the operation performed by 'get_dummies'. + Inverts the operation performed by `get_dummies`. Parameters ---------- @@ -1129,6 +1129,22 @@ def from_dummies( DataFrame Categorical data decoded from the dummy input-data. + Raises + ------ + ValueError + * When the input `DataFrame` `data` contains NA values. + * When the input `DataFrame` `data` contains column names with separators + that do not match the separator specified with `sep`. + * When a `dict` passed to `implied_category` does not include an implied + category for each prefix. + * When a value in `data` has more than one category assigned to it. + * When `implied_category=None` and a value in `data` has no category assigned + to it. + TypeError + * When the input `DataFrame` `data` contains non-dummy data. + * When the passed `sep` is of a wrong data type. + * When the passed `implied_category` is of a wrong data type. 
+ Notes ----- The columns of the passed dummy data should only include 1's and 0's, From ced3ed079bfe6b6a6a0d79719500e07faba12bb7 Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 5 Jan 2022 19:38:57 +0100 Subject: [PATCH 64/95] Change implied_category to base_category --- doc/source/user_guide/reshaping.rst | 6 ++-- pandas/core/reshape/reshape.py | 43 ++++++++++++----------- pandas/tests/reshape/test_from_dummies.py | 38 ++++++++++---------- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 393fedb39bd73..a2d6b3aaa1756 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -741,15 +741,15 @@ for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a pd.from_dummies(df, sep="_") Dummy coded data only requires ``k - 1`` categories to be included, in this case -the ``k`` th categories, implied by not being assigned any of the other ``k - 1`` -categories, can be passed via ``implied_category``. +the ``k`` th category is the base category, implied by not being assigned any of +the other ``k - 1`` categories, can be passed via ``base_category``. .. ipython:: python df = pd.DataFrame({"prefix_a": [0, 1, 0]}) df - pd.from_dummies(df, sep="_", implied_category="b") + pd.from_dummies(df, sep="_", base_category="b") .. _reshaping.factorize: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e9a1a297e7d76..8e3297c3872b8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1103,7 +1103,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, sep: None | str = None, - implied_category: None | Hashable | dict[str, Hashable] = None, + base_category: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. 
@@ -1119,10 +1119,11 @@ def from_dummies( character indicating the separation of the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - implied_category : None, Hashable or dict of Hashables, default None - The implied category the dummy takes when all values are zero. - Can be a a single value for all variables or a dict directly mapping the - implied categories to a prefix of a variable. + base_category : None, Hashable or dict of Hashables, default None + The base category is the implied category when a value has non none of the + listed categories specified with a one, i.e. if all dummies in a row are + zero. Can be a a single value for all variables or a dict directly mapping + the base categories to a prefix of a variable. Returns ------- @@ -1135,15 +1136,15 @@ def from_dummies( * When the input `DataFrame` `data` contains NA values. * When the input `DataFrame` `data` contains column names with separators that do not match the separator specified with `sep`. - * When a `dict` passed to `implied_category` does not include an implied + * When a `dict` passed to `base_category` does not include an implied category for each prefix. * When a value in `data` has more than one category assigned to it. - * When `implied_category=None` and a value in `data` has no category assigned + * When `base_category=None` and a value in `data` has no category assigned to it. TypeError * When the input `DataFrame` `data` contains non-dummy data. * When the passed `sep` is of a wrong data type. - * When the passed `implied_category` is of a wrong data type. + * When the passed `base_category` is of a wrong data type. Notes ----- @@ -1179,7 +1180,7 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... 
"col2_c": [0, 0, 0]}) - >>> pd.from_dummies(df, sep="_", implied_category={"col1": "d", "col2": "e"}) + >>> pd.from_dummies(df, sep="_", base_category={"col1": "d", "col2": "e"}) col1 col2 0 a b 1 b a @@ -1217,7 +1218,7 @@ def from_dummies( f"Received 'sep' of type: {type(sep).__name__}" ) - # validate number of implied_category + # validate length of base_category def check_len(item, name) -> None: if not len(item) == len(variables_slice): len_msg = ( @@ -1227,19 +1228,19 @@ def check_len(item, name) -> None: ) raise ValueError(len_msg) - if implied_category: - if isinstance(implied_category, dict): - check_len(implied_category, "implied_category") - elif isinstance(implied_category, Hashable): - implied_category = dict( - zip(variables_slice, [implied_category] * len(variables_slice)) + if base_category: + if isinstance(base_category, dict): + check_len(base_category, "base_category") + elif isinstance(base_category, Hashable): + base_category = dict( + zip(variables_slice, [base_category] * len(variables_slice)) ) else: raise TypeError( - f"Expected 'implied_category' to be of type " + f"Expected 'base_category' to be of type " f"'None', 'Hashable', or 'dict'; " - f"Received 'implied_category' of type: " - f"{type(implied_category).__name__}" + f"Received 'base_category' of type: " + f"{type(base_category).__name__}" ) cat_data = {} @@ -1255,8 +1256,8 @@ def check_len(item, name) -> None: f"First instance in row: {assigned.idxmax()}" ) elif any(assigned == 0): - if isinstance(implied_category, dict): - cats.append(implied_category[prefix]) + if isinstance(base_category, dict): + cats.append(base_category[prefix]) else: raise ValueError( f"Dummy DataFrame contains unassigned value(s); " diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index f5defebede6d0..ef1bcc80f2fe9 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -113,23 +113,23 @@ def 
test_from_dummies_no_prefix_contains_unassigned(): from_dummies(dummies) -def test_from_dummies_no_prefix_string_cats_implied_category(): +def test_from_dummies_no_prefix_string_cats_base_category(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = DataFrame({"": ["a", "b", "c"]}) - result = from_dummies(dummies, implied_category="c") + result = from_dummies(dummies, base_category="c") tm.assert_frame_equal(result, expected) -def test_from_dummies_no_prefix_wrong_implied_category_type(): +def test_from_dummies_no_prefix_wrong_base_category_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( TypeError, match=( - r"Expected 'implied_category' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'implied_category' of type: list" + r"Expected 'base_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'base_category' of type: list" ), ): - from_dummies(dummies, implied_category=["c", "d"]) + from_dummies(dummies, base_category=["c", "d"]) def test_from_dummies_no_prefix_multi_assignment(): @@ -225,58 +225,58 @@ def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, sep="_") -def test_from_dummies_with_prefix_implied_category_str(dummies_with_unassigned): +def test_from_dummies_with_prefix_base_category_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", implied_category="x") + result = from_dummies(dummies_with_unassigned, sep="_", base_category="x") tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_implied_category_wrong_type(dummies_with_unassigned): +def test_from_dummies_with_prefix_base_category_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, match=( - r"Expected 'implied_category' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'implied_category' of type: list" + 
r"Expected 'base_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'base_category' of type: list" ), ): - from_dummies(dummies_with_unassigned, sep="_", implied_category=["x", "y"]) + from_dummies(dummies_with_unassigned, sep="_", base_category=["x", "y"]) -def test_from_dummies_with_prefix_implied_category_int_and_float( +def test_from_dummies_with_prefix_base_category_int_and_float( dummies_with_unassigned, ): expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - implied_category={"col2": 1, "col1": 2.5}, + base_category={"col2": 1, "col1": 2.5}, ) tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_implied_category_bool_and_none( +def test_from_dummies_with_prefix_base_category_bool_and_none( dummies_with_unassigned, ): expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - implied_category={"col2": None, "col1": False}, + base_category={"col2": None, "col1": False}, ) tm.assert_frame_equal(result, expected) -def test_from_dummies_with_prefix_implied_category_dict_not_complete( +def test_from_dummies_with_prefix_base_category_dict_not_complete( dummies_with_unassigned, ): with pytest.raises( ValueError, match=( - r"Length of 'implied_category' \(1\) did not match " + r"Length of 'base_category' \(1\) did not match " r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, sep="_", implied_category={"col1": "x"}) + from_dummies(dummies_with_unassigned, sep="_", base_category={"col1": "x"}) def test_from_dummies_with_prefix_contains_nan(dummies_basic): From 6db77447b5697310c45d193e6544d60b9978595e Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 5 Jan 2022 19:52:51 +0100 Subject: [PATCH 65/95] Add proper reference to get_dummies in docstring --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8e3297c3872b8..6beb162565768 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1108,7 +1108,7 @@ def from_dummies( """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. - Inverts the operation performed by `get_dummies`. + Inverts the operation performed by :func:`~pandas.get_dummies`. Parameters ---------- From c84d973a3f463d0a97b3ed9ca8cd2d9141480aa5 Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 5 Jan 2022 20:03:54 +0100 Subject: [PATCH 66/95] Remove unnecessary copy of input data --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6beb162565768..cdbc9ad1ba1d8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1198,7 +1198,7 @@ def from_dummies( # index data with a list of all columns that are dummies try: - data_to_decode = data.astype("boolean") + data_to_decode = data.astype("boolean", copy=False) except TypeError: raise TypeError("Passed DataFrame contains non-dummy data") From 8f91012b7a96284ad3a1a9ebbf8e176a9f1f6f1a Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 6 Jan 2022 00:44:13 +0100 Subject: [PATCH 67/95] Fix docstring section order --- pandas/core/reshape/reshape.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c9e9fc8e57674..4ceb6e3f183c3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1155,15 +1155,15 @@ def from_dummies( * When the passed `sep` is of a wrong data type. * When the passed `base_category` is of a wrong data type. + See Also + -------- + :func:`~pandas.get_dummies` : Convert `Series` or `DataFrame` to dummy codes. + Notes ----- The columns of the passed dummy data should only include 1's and 0's, or boolean values. 
- See Also - -------- - get_dummies : Convert `Series` or `DataFrame` to dummy codes. - Examples -------- >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], From 84d5bd803de966249f884e2a2d3a17aad94ae470 Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 10 Jan 2022 01:02:20 +0100 Subject: [PATCH 68/95] Remove redundant f-strings --- pandas/core/reshape/reshape.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 4ceb6e3f183c3..6e08a34a64f1e 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1201,7 +1201,7 @@ def from_dummies( if data.isna().any().any(): # type: ignore[union-attr] # error: Item "bool" of "Union[Series, bool]" has no attribute "idxmax" raise ValueError( - f"Dummy DataFrame contains NA value in column: " # type: ignore[union-attr] + "Dummy DataFrame contains NA value in column: " # type: ignore[union-attr] f"'{data.isna().any().idxmax()}'" ) @@ -1223,7 +1223,7 @@ def from_dummies( variables_slice[prefix].append(col) else: raise TypeError( - f"Expected 'sep' to be of type 'str' or 'None'; " + "Expected 'sep' to be of type 'str' or 'None'; " f"Received 'sep' of type: {type(sep).__name__}" ) @@ -1246,9 +1246,9 @@ def check_len(item, name) -> None: ) else: raise TypeError( - f"Expected 'base_category' to be of type " - f"'None', 'Hashable', or 'dict'; " - f"Received 'base_category' of type: " + "Expected 'base_category' to be of type " + "'None', 'Hashable', or 'dict'; " + "Received 'base_category' of type: " f"{type(base_category).__name__}" ) @@ -1261,7 +1261,7 @@ def check_len(item, name) -> None: assigned = data_to_decode[prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( - f"Dummy DataFrame contains multi-assignment(s); " + "Dummy DataFrame contains multi-assignment(s); " f"First instance in row: {assigned.idxmax()}" ) elif any(assigned == 0): @@ -1269,7 +1269,7 @@ def check_len(item, name) -> 
None: cats.append(base_category[prefix]) else: raise ValueError( - f"Dummy DataFrame contains unassigned value(s); " + "Dummy DataFrame contains unassigned value(s); " f"First instance in row: {assigned.idxmin()}" ) data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) From fd0f9856fbd3a6cb030db29fd428787d25415867 Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 10 Jan 2022 01:09:01 +0100 Subject: [PATCH 69/95] Add check for 'data' type --- pandas/core/reshape/reshape.py | 6 ++++++ pandas/tests/reshape/test_from_dummies.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6e08a34a64f1e..526cf168695c8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1197,6 +1197,12 @@ def from_dummies( """ from pandas.core.reshape.concat import concat + if not isinstance(data, DataFrame): + raise TypeError( + "Expected 'data' to be a 'DataFrame'; " + f"Received 'data' of type: {type(data).__name__}" + ) + # error: Item "bool" of "Union[Series, bool]" has no attribute "any" if data.isna().any().any(): # type: ignore[union-attr] # error: Item "bool" of "Union[Series, bool]" has no attribute "idxmax" diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index ef1bcc80f2fe9..a7980fc264b18 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -32,6 +32,15 @@ def dummies_with_unassigned(): ) +def test_from_dummies_wrong_data_type(): + dummies = [0, 1, 0] + with pytest.raises( + TypeError, + match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list", + ): + from_dummies(dummies) + + def test_from_dummies_no_prefix_string_cats_basic(): dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) expected = DataFrame({"": ["a", "b", "c", "a"]}) From 6230d0fa42775cc5f13a9efd3487aaa33100c48c Mon Sep 17 00:00:00 2001 From: pckSF 
Date: Fri, 14 Jan 2022 15:30:08 +0100 Subject: [PATCH 70/95] Add TypeError for wrong data type to docstring --- pandas/core/reshape/reshape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 526cf168695c8..3e5a525b72f53 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1151,6 +1151,7 @@ def from_dummies( * When `base_category=None` and a value in `data` has no category assigned to it. TypeError + * When the input `data` is not of type `DataFrame`. * When the input `DataFrame` `data` contains non-dummy data. * When the passed `sep` is of a wrong data type. * When the passed `base_category` is of a wrong data type. From 84a60f73b84d7709f8a0e511322b93b475228afb Mon Sep 17 00:00:00 2001 From: pckSF Date: Fri, 14 Jan 2022 15:30:27 +0100 Subject: [PATCH 71/95] Add roundtrip tests get_dummies from_dummies --- pandas/tests/reshape/test_from_dummies.py | 34 +++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index a7980fc264b18..1ead0655fe62c 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,9 +1,15 @@ import numpy as np import pytest -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm -from pandas.core.reshape.reshape import from_dummies +from pandas.core.reshape.reshape import ( + from_dummies, + get_dummies, +) @pytest.fixture @@ -32,6 +38,30 @@ def dummies_with_unassigned(): ) +def test_roundtrip_series_to_dataframe(): + categories = Series(["a", "b", "c", "a"]) + dummies = get_dummies(categories) + result = from_dummies(dummies) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + tm.assert_frame_equal(result, expected) + + +def test_roundtrip_single_column_dataframe(): + categories = DataFrame({"": ["a", "b", "c", "a"]}) + dummies = 
get_dummies(categories) + result = from_dummies(dummies, sep="_") + expected = categories + tm.assert_frame_equal(result, expected) + + +def test_roundtrip_with_prefixes(): + categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) + dummies = get_dummies(categories) + result = from_dummies(dummies, sep="_") + expected = categories + tm.assert_frame_equal(result, expected) + + def test_from_dummies_wrong_data_type(): dummies = [0, 1, 0] with pytest.raises( From bc658bab76c3079538f9a14a92a188e1a2e0c44e Mon Sep 17 00:00:00 2001 From: pckSF Date: Sat, 29 Jan 2022 11:55:28 +0100 Subject: [PATCH 72/95] Fix from_dummies import in test file --- pandas/tests/reshape/test_from_dummies.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 1ead0655fe62c..c53e763f27847 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -4,12 +4,10 @@ from pandas import ( DataFrame, Series, -) -import pandas._testing as tm -from pandas.core.reshape.reshape import ( from_dummies, get_dummies, ) +import pandas._testing as tm @pytest.fixture From 9fbca72a431e198234f81d3a00125208baa463a2 Mon Sep 17 00:00:00 2001 From: pckSF Date: Sun, 30 Jan 2022 17:11:34 +0100 Subject: [PATCH 73/95] Update userguide versionadded to 1.5 --- doc/source/user_guide/reshaping.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 173ef82b98b07..7ef920c317e3e 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -701,7 +701,7 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes -.. versionadded:: 1.4.0 +.. 
versionadded:: 1.5.0 To convert a "dummy" or "indicator" ``DataFrame``, into a categorical ``DataFrame``, for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a From 2581fc96a0b9d23ba17d14eda6e2b5aade9a3a9e Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 31 Jan 2022 17:44:09 +0100 Subject: [PATCH 74/95] Draft whats-new entry --- doc/source/whatsnew/v1.5.0.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1d4054d5ea0f1..3844825bde4ed 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -22,6 +22,29 @@ Styler - New method :meth:`.Styler.to_string` for alternative customisable output methods (:issue:`44502`) - Various bug fixes, see below. +.. _whatsnew_150.enhancements.from_dummies: + +from_dummies +^^^^^^^^^^^^ + +Added new function :func:`~pandas.get_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`. + +Example:: + +.. code-block:: python + + >>> import pandas as pd + + >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 1]}) + + >>> pd.from_dummies(df, sep="_") + col1 col2 + 0 a b + 1 b a + 2 a c + .. _whatsnew_150.enhancements.enhancement2: enhancement2 From 85a0ed86a6389218c7e246d7b1aab542129c5203 Mon Sep 17 00:00:00 2001 From: pckSF Date: Mon, 31 Jan 2022 18:56:19 +0100 Subject: [PATCH 75/95] Change code-block to ipython --- doc/source/whatsnew/v1.5.0.rst | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 3844825bde4ed..438ac19669ef6 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -31,19 +31,16 @@ Added new function :func:`~pandas.get_dummies` to convert a dummy coded :class:` Example:: -.. code-block:: python +.. 
ipython:: python + + import pandas as pd - >>> import pandas as pd + df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1]}) - >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], - ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - ... "col2_c": [0, 0, 1]}) + pd.from_dummies(df, sep="_") - >>> pd.from_dummies(df, sep="_") - col1 col2 - 0 a b - 1 b a - 2 a c .. _whatsnew_150.enhancements.enhancement2: From 5b74039d603735d0695be6864c8baa9ecab03a69 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 1 Feb 2022 15:04:16 +0100 Subject: [PATCH 76/95] Improve test names and organization --- pandas/tests/reshape/test_from_dummies.py | 334 +++++++++++----------- 1 file changed, 167 insertions(+), 167 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index c53e763f27847..82df8455ec817 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -36,31 +36,7 @@ def dummies_with_unassigned(): ) -def test_roundtrip_series_to_dataframe(): - categories = Series(["a", "b", "c", "a"]) - dummies = get_dummies(categories) - result = from_dummies(dummies) - expected = DataFrame({"": ["a", "b", "c", "a"]}) - tm.assert_frame_equal(result, expected) - - -def test_roundtrip_single_column_dataframe(): - categories = DataFrame({"": ["a", "b", "c", "a"]}) - dummies = get_dummies(categories) - result = from_dummies(dummies, sep="_") - expected = categories - tm.assert_frame_equal(result, expected) - - -def test_roundtrip_with_prefixes(): - categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - dummies = get_dummies(categories) - result = from_dummies(dummies, sep="_") - expected = categories - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_wrong_data_type(): +def test_error_wrong_data_type(): dummies = [0, 1, 0] with pytest.raises( TypeError, @@ -69,76 +45,7 @@ def 
test_from_dummies_wrong_data_type(): from_dummies(dummies) -def test_from_dummies_no_prefix_string_cats_basic(): - dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) - expected = DataFrame({"": ["a", "b", "c", "a"]}) - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_string_cats_basic_bool_values(): - dummies = DataFrame( - { - "a": [True, False, False, True], - "b": [False, True, False, False], - "c": [False, False, True, False], - } - ) - expected = DataFrame({"": ["a", "b", "c", "a"]}) - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_string_cats_basic_mixed_bool_values(): - dummies = DataFrame( - {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]} - ) - expected = DataFrame({"": ["a", "b", "c", "a"]}) - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_int_cats_basic(): - dummies = DataFrame( - {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]} - ) - expected = DataFrame({"": [1, 25, 2, 5]}, dtype="object") - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_float_cats_basic(): - dummies = DataFrame( - {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]} - ) - expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object") - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_mixed_cats_basic(): - dummies = DataFrame( - { - 1.23: [1, 0, 0, 0, 0], - "c": [0, 1, 0, 0, 0], - 2: [0, 0, 1, 0, 0], - False: [0, 0, 0, 1, 0], - None: [0, 0, 0, 0, 1], - } - ) - expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object") - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def 
test_from_dummies_no_prefix_string_cats_contains_get_dummies_NaN_column(): - dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) - expected = DataFrame({"": ["a", "b", "NaN"]}) - result = from_dummies(dummies) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_contains_unassigned(): +def test_error_no_prefix_contains_unassigned(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) with pytest.raises( ValueError, @@ -150,14 +57,7 @@ def test_from_dummies_no_prefix_contains_unassigned(): from_dummies(dummies) -def test_from_dummies_no_prefix_string_cats_base_category(): - dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = DataFrame({"": ["a", "b", "c"]}) - result = from_dummies(dummies, base_category="c") - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_no_prefix_wrong_base_category_type(): +def test_error_no_prefix_wrong_base_category_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( TypeError, @@ -169,7 +69,7 @@ def test_from_dummies_no_prefix_wrong_base_category_type(): from_dummies(dummies, base_category=["c", "d"]) -def test_from_dummies_no_prefix_multi_assignment(): +def test_error_no_prefix_multi_assignment(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( ValueError, @@ -181,7 +81,7 @@ def test_from_dummies_no_prefix_multi_assignment(): from_dummies(dummies) -def test_from_dummies_no_prefix_contains_nan(): +def test_error_no_prefix_contains_nan(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]}) with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'" @@ -189,7 +89,7 @@ def test_from_dummies_no_prefix_contains_nan(): from_dummies(dummies) -def test_from_dummies_contains_non_dummies(): +def test_error_contains_non_dummies(): dummies = DataFrame( {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]} ) @@ -200,13 +100,7 @@ def 
test_from_dummies_contains_non_dummies(): from_dummies(dummies) -def test_from_dummies_with_prefix_basic(dummies_basic): - expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) - result = from_dummies(dummies_basic, sep="_") - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_multiple_seperators(): +def test_error_with_prefix_multiple_seperators(): dummies = DataFrame( { "col1_a": [1, 0, 1], @@ -222,7 +116,7 @@ def test_from_dummies_with_prefix_multiple_seperators(): from_dummies(dummies, sep="_") -def test_from_dummies_with_prefix_sep_wrong_type(dummies_basic): +def test_error_with_prefix_sep_wrong_type(dummies_basic): with pytest.raises( TypeError, @@ -234,24 +128,7 @@ def test_from_dummies_with_prefix_sep_wrong_type(dummies_basic): from_dummies(dummies_basic, sep=["_"]) -def test_from_dummies_with_prefix_contains_get_dummies_NaN_column(): - dummies = DataFrame( - { - "col1_a": [1, 0, 0], - "col1_b": [0, 1, 0], - "col1_NaN": [0, 0, 1], - "col2_a": [0, 1, 0], - "col2_b": [0, 0, 0], - "col2_c": [0, 0, 1], - "col2_NaN": [1, 0, 0], - }, - ) - expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]}) - result = from_dummies(dummies, sep="_") - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): +def test_error_with_prefix_contains_unassigned(dummies_with_unassigned): with pytest.raises( ValueError, match=( @@ -262,13 +139,7 @@ def test_from_dummies_with_prefix_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, sep="_") -def test_from_dummies_with_prefix_base_category_str(dummies_with_unassigned): - expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", base_category="x") - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_base_category_wrong_type(dummies_with_unassigned): +def 
test_error_with_prefix_base_category_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, match=( @@ -279,31 +150,7 @@ def test_from_dummies_with_prefix_base_category_wrong_type(dummies_with_unassign from_dummies(dummies_with_unassigned, sep="_", base_category=["x", "y"]) -def test_from_dummies_with_prefix_base_category_int_and_float( - dummies_with_unassigned, -): - expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) - result = from_dummies( - dummies_with_unassigned, - sep="_", - base_category={"col2": 1, "col1": 2.5}, - ) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_base_category_bool_and_none( - dummies_with_unassigned, -): - expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) - result = from_dummies( - dummies_with_unassigned, - sep="_", - base_category={"col2": None, "col1": False}, - ) - tm.assert_frame_equal(result, expected) - - -def test_from_dummies_with_prefix_base_category_dict_not_complete( +def test_error_with_prefix_base_category_dict_not_complete( dummies_with_unassigned, ): with pytest.raises( @@ -316,7 +163,7 @@ def test_from_dummies_with_prefix_base_category_dict_not_complete( from_dummies(dummies_with_unassigned, sep="_", base_category={"col1": "x"}) -def test_from_dummies_with_prefix_contains_nan(dummies_basic): +def test_error_with_prefix_contains_nan(dummies_basic): dummies_basic["col2_c"][2] = np.nan with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" @@ -324,13 +171,13 @@ def test_from_dummies_with_prefix_contains_nan(dummies_basic): from_dummies(dummies_basic, sep="_") -def test_from_dummies_with_prefix_contains_non_dummies(dummies_basic): +def test_error_with_prefix_contains_non_dummies(dummies_basic): dummies_basic["col2_c"][2] = "str" with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"): from_dummies(dummies_basic, sep="_") -def 
test_from_dummies_with_prefix_double_assignment(): +def test_error_with_prefix_double_assignment(): dummies = DataFrame( { "col1_a": [1, 0, 1], @@ -348,3 +195,156 @@ def test_from_dummies_with_prefix_double_assignment(): ), ): from_dummies(dummies, sep="_") + + +def test_roundtrip_series_to_dataframe(): + categories = Series(["a", "b", "c", "a"]) + dummies = get_dummies(categories) + result = from_dummies(dummies) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + tm.assert_frame_equal(result, expected) + + +def test_roundtrip_single_column_dataframe(): + categories = DataFrame({"": ["a", "b", "c", "a"]}) + dummies = get_dummies(categories) + result = from_dummies(dummies, sep="_") + expected = categories + tm.assert_frame_equal(result, expected) + + +def test_roundtrip_with_prefixes(): + categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) + dummies = get_dummies(categories) + result = from_dummies(dummies, sep="_") + expected = categories + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_basic(): + dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_basic_bool_values(): + dummies = DataFrame( + { + "a": [True, False, False, True], + "b": [False, True, False, False], + "c": [False, False, True, False], + } + ) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_basic_mixed_bool_values(): + dummies = DataFrame( + {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]} + ) + expected = DataFrame({"": ["a", "b", "c", "a"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_int_cats_basic(): + dummies = DataFrame( + {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 
0, 1, 0], 5: [0, 0, 0, 1]} + ) + expected = DataFrame({"": [1, 25, 2, 5]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_float_cats_basic(): + dummies = DataFrame( + {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]} + ) + expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_mixed_cats_basic(): + dummies = DataFrame( + { + 1.23: [1, 0, 0, 0, 0], + "c": [0, 1, 0, 0, 0], + 2: [0, 0, 1, 0, 0], + False: [0, 0, 0, 1, 0], + None: [0, 0, 0, 0, 1], + } + ) + expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object") + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_contains_get_dummies_NaN_column(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]}) + expected = DataFrame({"": ["a", "b", "NaN"]}) + result = from_dummies(dummies) + tm.assert_frame_equal(result, expected) + + +def test_no_prefix_string_cats_base_category(): + dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) + expected = DataFrame({"": ["a", "b", "c"]}) + result = from_dummies(dummies, base_category="c") + tm.assert_frame_equal(result, expected) + + +def test_with_prefix_basic(dummies_basic): + expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]}) + result = from_dummies(dummies_basic, sep="_") + tm.assert_frame_equal(result, expected) + + +def test_with_prefix_contains_get_dummies_NaN_column(): + dummies = DataFrame( + { + "col1_a": [1, 0, 0], + "col1_b": [0, 1, 0], + "col1_NaN": [0, 0, 1], + "col2_a": [0, 1, 0], + "col2_b": [0, 0, 0], + "col2_c": [0, 0, 1], + "col2_NaN": [1, 0, 0], + }, + ) + expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]}) + result = from_dummies(dummies, sep="_") + tm.assert_frame_equal(result, expected) + + +def 
test_with_prefix_base_category_str(dummies_with_unassigned): + expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) + result = from_dummies(dummies_with_unassigned, sep="_", base_category="x") + tm.assert_frame_equal(result, expected) + + +def test_with_prefix_base_category_int_and_float( + dummies_with_unassigned, +): + expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) + result = from_dummies( + dummies_with_unassigned, + sep="_", + base_category={"col2": 1, "col1": 2.5}, + ) + tm.assert_frame_equal(result, expected) + + +def test_with_prefix_base_category_bool_and_none( + dummies_with_unassigned, +): + expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) + result = from_dummies( + dummies_with_unassigned, + sep="_", + base_category={"col2": None, "col1": False}, + ) + tm.assert_frame_equal(result, expected) From 015ee94ce21836e19ce0e7e452bae4e12940a210 Mon Sep 17 00:00:00 2001 From: pckSF Date: Tue, 1 Feb 2022 15:08:47 +0100 Subject: [PATCH 77/95] Show DataFrames used in docstring examples --- pandas/core/reshape/encoding.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 175cce43ab253..69049da9033b4 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -381,6 +381,13 @@ def from_dummies( >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], ... "c": [0, 0, 1, 0]}) + >>> df + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + >>> pd.from_dummies(df) 0 a 1 b @@ -391,6 +398,12 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... "col2_c": [0, 0, 1]}) + >>> df + col1_a col1_b col2_a col2_b col2_c + 0 1 0 0 1 0 + 1 0 1 1 0 0 + 2 1 0 0 0 1 + >>> pd.from_dummies(df, sep="_") col1 col2 0 a b @@ -401,6 +414,12 @@ def from_dummies( ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], ... 
"col2_c": [0, 0, 0]}) + >>> df + col1_a col1_b col2_a col2_b col2_c + 0 1 0 0 1 0 + 1 0 1 1 0 0 + 2 0 0 0 0 0 + >>> pd.from_dummies(df, sep="_", base_category={"col1": "d", "col2": "e"}) col1 col2 0 a b From ae9f3d28f2d68202d8105292c49a7eedeef9503f Mon Sep 17 00:00:00 2001 From: pckSF Date: Wed, 20 Apr 2022 16:55:59 +0200 Subject: [PATCH 78/95] Fix whatsnew entry typo --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 19a1b0e3125a2..b8f972accba30 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -29,7 +29,7 @@ Styler from_dummies ^^^^^^^^^^^^ -Added new function :func:`~pandas.get_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`. +Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`. Example:: From a59ed4e5f9e28dade1046069fb088c3f40628065 Mon Sep 17 00:00:00 2001 From: pckSF Date: Thu, 28 Apr 2022 12:30:03 +0200 Subject: [PATCH 79/95] Fix whats-new --- doc/source/whatsnew/v1.5.0.rst | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b8f972accba30..508928f1e1ae8 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -24,27 +24,7 @@ Styler - Added a new method :meth:`.Styler.concat` which allows adding customised footer rows to visualise additional calculations on the data, e.g. totals and counts etc. (:issue:`43875`, :issue:`46186`) - :meth:`.Styler.highlight_null` now accepts ``color`` consistently with other builtin methods and deprecates ``null_color`` although this remains backwards compatible (:issue:`45907`) -.. 
_whatsnew_150.enhancements.from_dummies: - -from_dummies -^^^^^^^^^^^^ - -Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`. - -Example:: - -.. ipython:: python - - import pandas as pd - - df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1]}) - - pd.from_dummies(df, sep="_") - - -.. _whatsnew_150.enhancements.enhancement2: +.. _whatsnew_150.enhancements.resample_group_keys: Control of index with ``group_keys`` in :meth:`DataFrame.resample` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -95,6 +75,25 @@ as seen in the following example. 1 2021-01-02 08:00:00 4 2 2021-01-02 16:00:00 5 +.. _whatsnew_150.enhancements.from_dummies: + +from_dummies +^^^^^^^^^^^^ + +Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class:`DataFrame` into a categorical :class:`DataFrame`. + +Example:: + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1]}) + + pd.from_dummies(df, sep="_") + .. 
_whatsnew_150.enhancements.other: Other enhancements From 7fa66b3b7d1fd727a9843f8ed4e6bc6aae425b7e Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Fri, 3 Jun 2022 22:31:10 +0200 Subject: [PATCH 80/95] Change base_category to default_category --- doc/source/user_guide/reshaping.rst | 4 +-- pandas/core/reshape/encoding.py | 36 ++++++++++----------- pandas/tests/reshape/test_from_dummies.py | 38 +++++++++++------------ 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 7ef920c317e3e..080f0ac3fc3f2 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -717,14 +717,14 @@ for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a Dummy coded data only requires ``k - 1`` categories to be included, in this case the ``k`` th category is the base category, implied by not being assigned any of -the other ``k - 1`` categories, can be passed via ``base_category``. +the other ``k - 1`` categories, can be passed via ``default_category``. .. ipython:: python df = pd.DataFrame({"prefix_a": [0, 1, 0]}) df - pd.from_dummies(df, sep="_", base_category="b") + pd.from_dummies(df, sep="_", default_category="b") .. _reshaping.factorize: diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 69049da9033b4..659c9c7b72fea 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -323,7 +323,7 @@ def get_empty_frame(data) -> DataFrame: def from_dummies( data: DataFrame, sep: None | str = None, - base_category: None | Hashable | dict[str, Hashable] = None, + default_category: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ Create a categorical `DataFrame` from a `DataFrame` of dummy variables. @@ -339,7 +339,7 @@ def from_dummies( character indicating the separation of the categorical names from the prefixes. 
For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. - base_category : None, Hashable or dict of Hashables, default None + default_category : None, Hashable or dict of Hashables, default None The base category is the implied category when a value has non none of the listed categories specified with a one, i.e. if all dummies in a row are zero. Can be a a single value for all variables or a dict directly mapping @@ -356,16 +356,16 @@ def from_dummies( * When the input `DataFrame` `data` contains NA values. * When the input `DataFrame` `data` contains column names with separators that do not match the separator specified with `sep`. - * When a `dict` passed to `base_category` does not include an implied + * When a `dict` passed to `default_category` does not include an implied category for each prefix. * When a value in `data` has more than one category assigned to it. - * When `base_category=None` and a value in `data` has no category assigned + * When `default_category=None` and a value in `data` has no category assigned to it. TypeError * When the input `data` is not of type `DataFrame`. * When the input `DataFrame` `data` contains non-dummy data. * When the passed `sep` is of a wrong data type. - * When the passed `base_category` is of a wrong data type. + * When the passed `default_category` is of a wrong data type. 
See Also -------- @@ -420,7 +420,7 @@ def from_dummies( 1 0 1 1 0 0 2 0 0 0 0 0 - >>> pd.from_dummies(df, sep="_", base_category={"col1": "d", "col2": "e"}) + >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"}) col1 col2 0 a b 1 b a @@ -464,7 +464,7 @@ def from_dummies( f"Received 'sep' of type: {type(sep).__name__}" ) - # validate length of base_category + # validate length of default_category def check_len(item, name) -> None: if not len(item) == len(variables_slice): len_msg = ( @@ -474,19 +474,19 @@ def check_len(item, name) -> None: ) raise ValueError(len_msg) - if base_category: - if isinstance(base_category, dict): - check_len(base_category, "base_category") - elif isinstance(base_category, Hashable): - base_category = dict( - zip(variables_slice, [base_category] * len(variables_slice)) + if default_category: + if isinstance(default_category, dict): + check_len(default_category, "default_category") + elif isinstance(default_category, Hashable): + default_category = dict( + zip(variables_slice, [default_category] * len(variables_slice)) ) else: raise TypeError( - "Expected 'base_category' to be of type " + "Expected 'default_category' to be of type " "'None', 'Hashable', or 'dict'; " - "Received 'base_category' of type: " - f"{type(base_category).__name__}" + "Received 'default_category' of type: " + f"{type(default_category).__name__}" ) cat_data = {} @@ -502,8 +502,8 @@ def check_len(item, name) -> None: f"First instance in row: {assigned.idxmax()}" ) elif any(assigned == 0): - if isinstance(base_category, dict): - cats.append(base_category[prefix]) + if isinstance(default_category, dict): + cats.append(default_category[prefix]) else: raise ValueError( "Dummy DataFrame contains unassigned value(s); " diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 82df8455ec817..035b7150d141c 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ 
b/pandas/tests/reshape/test_from_dummies.py @@ -57,16 +57,16 @@ def test_error_no_prefix_contains_unassigned(): from_dummies(dummies) -def test_error_no_prefix_wrong_base_category_type(): +def test_error_no_prefix_wrong_default_category_type(): dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}) with pytest.raises( TypeError, match=( - r"Expected 'base_category' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'base_category' of type: list" + r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'default_category' of type: list" ), ): - from_dummies(dummies, base_category=["c", "d"]) + from_dummies(dummies, default_category=["c", "d"]) def test_error_no_prefix_multi_assignment(): @@ -139,28 +139,28 @@ def test_error_with_prefix_contains_unassigned(dummies_with_unassigned): from_dummies(dummies_with_unassigned, sep="_") -def test_error_with_prefix_base_category_wrong_type(dummies_with_unassigned): +def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned): with pytest.raises( TypeError, match=( - r"Expected 'base_category' to be of type 'None', 'Hashable', or 'dict'; " - r"Received 'base_category' of type: list" + r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; " + r"Received 'default_category' of type: list" ), ): - from_dummies(dummies_with_unassigned, sep="_", base_category=["x", "y"]) + from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"]) -def test_error_with_prefix_base_category_dict_not_complete( +def test_error_with_prefix_default_category_dict_not_complete( dummies_with_unassigned, ): with pytest.raises( ValueError, match=( - r"Length of 'base_category' \(1\) did not match " + r"Length of 'default_category' \(1\) did not match " r"the length of the columns being encoded \(2\)" ), ): - from_dummies(dummies_with_unassigned, sep="_", base_category={"col1": "x"}) + from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"}) 
def test_error_with_prefix_contains_nan(dummies_basic): @@ -290,10 +290,10 @@ def test_no_prefix_string_cats_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -def test_no_prefix_string_cats_base_category(): +def test_no_prefix_string_cats_default_category(): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) expected = DataFrame({"": ["a", "b", "c"]}) - result = from_dummies(dummies, base_category="c") + result = from_dummies(dummies, default_category="c") tm.assert_frame_equal(result, expected) @@ -320,31 +320,31 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -def test_with_prefix_base_category_str(dummies_with_unassigned): +def test_with_prefix_default_category_str(dummies_with_unassigned): expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", base_category="x") + result = from_dummies(dummies_with_unassigned, sep="_", default_category="x") tm.assert_frame_equal(result, expected) -def test_with_prefix_base_category_int_and_float( +def test_with_prefix_default_category_int_and_float( dummies_with_unassigned, ): expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - base_category={"col2": 1, "col1": 2.5}, + default_category={"col2": 1, "col1": 2.5}, ) tm.assert_frame_equal(result, expected) -def test_with_prefix_base_category_bool_and_none( +def test_with_prefix_default_category_bool_and_none( dummies_with_unassigned, ): expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) result = from_dummies( dummies_with_unassigned, sep="_", - base_category={"col2": None, "col1": False}, + default_category={"col2": None, "col1": False}, ) tm.assert_frame_equal(result, expected) From 530889e3c8126eb522b542a0ac0ab8d3e7892c76 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Fri, 3 Jun 2022 23:52:00 +0200 Subject: 
[PATCH 81/95] Add double ticks to render code in docstring --- pandas/core/reshape/encoding.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 659c9c7b72fea..b6466a90903cf 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -326,9 +326,9 @@ def from_dummies( default_category: None | Hashable | dict[str, Hashable] = None, ) -> DataFrame: """ - Create a categorical `DataFrame` from a `DataFrame` of dummy variables. + Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables. - Inverts the operation performed by :func:`~pandas.get_dummies`. + Inverts the operation performed by :func:``~pandas.get_dummies``. Parameters ---------- @@ -353,23 +353,24 @@ def from_dummies( Raises ------ ValueError - * When the input `DataFrame` `data` contains NA values. - * When the input `DataFrame` `data` contains column names with separators - that do not match the separator specified with `sep`. - * When a `dict` passed to `default_category` does not include an implied + * When the input ``DataFrame`` ``data`` contains NA values. + * When the input ``DataFrame`` ``data`` contains column names with separators + that do not match the separator specified with ``sep``. + * When a ``dict`` passed to ``default_category`` does not include an implied category for each prefix. - * When a value in `data` has more than one category assigned to it. - * When `default_category=None` and a value in `data` has no category assigned - to it. + * When a value in ``data`` has more than one category assigned to it. + * When ``default_category=None`` and a value in ``data`` has no category + assigned to it. TypeError - * When the input `data` is not of type `DataFrame`. - * When the input `DataFrame` `data` contains non-dummy data. - * When the passed `sep` is of a wrong data type. 
- * When the passed `default_category` is of a wrong data type. + * When the input ``data`` is not of type ``DataFrame``. + * When the input ``DataFrame`` ``data`` contains non-dummy data. + * When the passed ``sep`` is of a wrong data type. + * When the passed ``default_category`` is of a wrong data type. See Also -------- - :func:`~pandas.get_dummies` : Convert `Series` or `DataFrame` to dummy codes. + :func:``~pandas.get_dummies`` : Convert ``Series`` or ``DataFrame`` to dummy + codes. Notes ----- From 6536c654aa597aa429fa544813cd18e2684948aa Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Fri, 3 Jun 2022 23:57:18 +0200 Subject: [PATCH 82/95] Fix docstring typos and alignments --- doc/source/user_guide/reshaping.rst | 2 +- pandas/core/reshape/encoding.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index c4044c1e55c11..adca9de6c130a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -721,7 +721,7 @@ for example ``k`` columns of a ``DataFrame`` containing 1s and 0s can derive a pd.from_dummies(df, sep="_") Dummy coded data only requires ``k - 1`` categories to be included, in this case -the ``k`` th category is the base category, implied by not being assigned any of +the ``k`` th category is the default category, implied by not being assigned any of the other ``k - 1`` categories, can be passed via ``default_category``. .. ipython:: python diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index b6466a90903cf..c1e415c4b2b9a 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -333,17 +333,18 @@ def from_dummies( Parameters ---------- data : DataFrame - Data which contains dummy-coded variables. + Data which contains dummy-coded variables in form of integer columns of + 1's and 0's. 
sep : str, default None Separator used in the column names of the dummy categories they are character indicating the separation of the categorical names from the prefixes. For example, if your column names are 'prefix_A' and 'prefix_B', you can strip the underscore by specifying sep='_'. default_category : None, Hashable or dict of Hashables, default None - The base category is the implied category when a value has non none of the + The default category is the implied category when a value has none of the listed categories specified with a one, i.e. if all dummies in a row are - zero. Can be a a single value for all variables or a dict directly mapping - the base categories to a prefix of a variable. + zero. Can be a single value for all variables or a dict directly mapping + the default categories to a prefix of a variable. Returns ------- @@ -383,7 +384,7 @@ def from_dummies( ... "c": [0, 0, 1, 0]}) >>> df - a b c + a b c 0 1 0 0 1 0 1 0 2 0 0 1 @@ -400,7 +401,7 @@ def from_dummies( ... "col2_c": [0, 0, 1]}) >>> df - col1_a col1_b col2_a col2_b col2_c + col1_a col1_b col2_a col2_b col2_c 0 1 0 0 1 0 1 0 1 1 0 0 2 1 0 0 0 1 @@ -416,7 +417,7 @@ def from_dummies( ... 
"col2_c": [0, 0, 0]}) >>> df - col1_a col1_b col2_a col2_b col2_c + col1_a col1_b col2_a col2_b col2_c 0 1 0 0 1 0 1 0 1 1 0 0 2 0 0 0 0 0 From 1272a23d9f0ea1549830ee53f43bc4747677e897 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 4 Jun 2022 00:02:58 +0200 Subject: [PATCH 83/95] Inline the check_len check for the default_vategory --- pandas/core/reshape/encoding.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index c1e415c4b2b9a..c4e807670cf01 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -466,19 +466,15 @@ def from_dummies( f"Received 'sep' of type: {type(sep).__name__}" ) - # validate length of default_category - def check_len(item, name) -> None: - if not len(item) == len(variables_slice): - len_msg = ( - f"Length of '{name}' ({len(item)}) did not match the " - "length of the columns being encoded " - f"({len(variables_slice)})" - ) - raise ValueError(len_msg) - if default_category: if isinstance(default_category, dict): - check_len(default_category, "default_category") + if not len(default_category) == len(variables_slice): + len_msg = ( + f"Length of 'default_category' ({len(default_category)}) " + f"did not match the length of the columns being encoded " + f"({len(variables_slice)})" + ) + raise ValueError(len_msg) elif isinstance(default_category, Hashable): default_category = dict( zip(variables_slice, [default_category] * len(variables_slice)) From fd3b1159e119e1b646869856af98ad0728d344ea Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 4 Jun 2022 00:08:02 +0200 Subject: [PATCH 84/95] Fix mypy issues by removing fixed ignores --- pandas/core/reshape/encoding.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index c4e807670cf01..8f56bdc85abc4 100644 --- a/pandas/core/reshape/encoding.py 
+++ b/pandas/core/reshape/encoding.py @@ -436,11 +436,9 @@ def from_dummies( f"Received 'data' of type: {type(data).__name__}" ) - # error: Item "bool" of "Union[Series, bool]" has no attribute "any" - if data.isna().any().any(): # type: ignore[union-attr] - # error: Item "bool" of "Union[Series, bool]" has no attribute "idxmax" + if data.isna().any().any(): raise ValueError( - "Dummy DataFrame contains NA value in column: " # type: ignore[union-attr] + "Dummy DataFrame contains NA value in column: " f"'{data.isna().any().idxmax()}'" ) From bd5a118ddced436b6c5a5ced5960f910c1b6342a Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 4 Jun 2022 10:54:54 +0200 Subject: [PATCH 85/95] Fix error encountered during docstring parsing --- pandas/core/reshape/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 8f56bdc85abc4..6e6a87ca8b39d 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -370,7 +370,7 @@ def from_dummies( See Also -------- - :func:``~pandas.get_dummies`` : Convert ``Series`` or ``DataFrame`` to dummy + :func:``~pandas.get_dummies`` to convert ``Series`` or ``DataFrame`` to dummy codes. Notes From f7d08d00fdc555b588c974a79ac0ec447c5ae336 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 4 Jun 2022 15:03:05 +0200 Subject: [PATCH 86/95] Fix redundant backticks following :func: --- pandas/core/reshape/encoding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 6e6a87ca8b39d..2b03992fbd5bc 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -328,7 +328,7 @@ def from_dummies( """ Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables. - Inverts the operation performed by :func:``~pandas.get_dummies``. 
+ Inverts the operation performed by :func:`~pandas.get_dummies`. Parameters ---------- @@ -370,7 +370,7 @@ def from_dummies( See Also -------- - :func:``~pandas.get_dummies`` to convert ``Series`` or ``DataFrame`` to dummy + :func:`~pandas.get_dummies`: Convert ``Series`` or ``DataFrame`` to dummy codes. Notes From c32e514d401357e6c1407b8a8bc8fc21dbb05ebc Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 4 Jun 2022 15:35:17 +0200 Subject: [PATCH 87/95] Add space before colon for numpydoc --- pandas/core/reshape/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 2b03992fbd5bc..309d80c77316d 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -370,7 +370,7 @@ def from_dummies( See Also -------- - :func:`~pandas.get_dummies`: Convert ``Series`` or ``DataFrame`` to dummy + :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes. Notes From 0fda02fa0aa2345269eda7730ad255b11aa23916 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Mon, 6 Jun 2022 11:17:13 +0200 Subject: [PATCH 88/95] Added pd.Categorical to See Also --- pandas/core/reshape/encoding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 309d80c77316d..755caf686a45e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -372,6 +372,8 @@ def from_dummies( -------- :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes. + :class:`~pandas.Categorical` : Represent a categorical variable in classic + R / S-plus fashion. 
Notes ----- From 62b09aec58f99be82dabae51bfc78f306deda77e Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Mon, 6 Jun 2022 11:19:32 +0200 Subject: [PATCH 89/95] Add version added --- pandas/core/reshape/encoding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 755caf686a45e..cb0786be61a5a 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -330,6 +330,8 @@ def from_dummies( Inverts the operation performed by :func:`~pandas.get_dummies`. + .. versionadded:: 1.5.0 + Parameters ---------- data : DataFrame From 1dcdd9a1389d54e7edf62c77be61337205552f90 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Mon, 6 Jun 2022 11:21:26 +0200 Subject: [PATCH 90/95] Add from_dummies to get_dummies see also --- pandas/core/reshape/encoding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index cb0786be61a5a..c87c6910beb07 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -70,6 +70,8 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. + :func:`~pandas.from_dummies` : Create a categorical ``DataFrame`` from a + ``DataFrame`` of dummy variables. Notes ----- From 3c006904e3814ca948f6b3f2338a0077bd2de598 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Mon, 6 Jun 2022 11:24:45 +0200 Subject: [PATCH 91/95] Fix see also missing period error --- pandas/core/reshape/encoding.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index c87c6910beb07..7970d8b855f6e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -374,10 +374,8 @@ def from_dummies( See Also -------- - :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy - codes. 
- :class:`~pandas.Categorical` : Represent a categorical variable in classic - R / S-plus fashion. + :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes. + :class:`~pandas.Categorical` : Represent a categorical variable in classic. Notes ----- From 4425b4a477873377e18597a6f10a6e2373a65a9c Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Mon, 6 Jun 2022 14:26:03 +0200 Subject: [PATCH 92/95] Fix See Also of get_dummies --- pandas/core/reshape/encoding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 7970d8b855f6e..a47eca190f86e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -70,8 +70,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. - :func:`~pandas.from_dummies` : Create a categorical ``DataFrame`` from a - ``DataFrame`` of dummy variables. + :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame`` Notes ----- From 15503b0d436a02cacac3f08e50738a69bd8628d8 Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Wed, 22 Jun 2022 21:49:36 +0200 Subject: [PATCH 93/95] Fix docs compiler error --- pandas/core/reshape/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index a47eca190f86e..3fe356a458295 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -70,7 +70,7 @@ def get_dummies( See Also -------- Series.str.get_dummies : Convert Series to dummy codes. - :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame`` + :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``. 
Notes ----- From f06a45c9965a6113fe6e7d93b0090ff7675b319d Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 25 Jun 2022 12:04:00 +0200 Subject: [PATCH 94/95] Fix default_category=0 bug and add corresponding tests --- pandas/core/reshape/encoding.py | 2 +- pandas/tests/reshape/test_from_dummies.py | 102 ++++++++++++++++------ 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3fe356a458295..aea5157dab44f 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -467,7 +467,7 @@ def from_dummies( f"Received 'sep' of type: {type(sep).__name__}" ) - if default_category: + if default_category is not None: if isinstance(default_category, dict): if not len(default_category) == len(variables_slice): len_msg = ( diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 035b7150d141c..c52331e54f95e 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -290,10 +290,44 @@ def test_no_prefix_string_cats_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -def test_no_prefix_string_cats_default_category(): +@pytest.mark.parametrize( + "default_category, expected", + [ + pytest.param( + "c", + DataFrame({"": ["a", "b", "c"]}), + id="default_category is a str", + ), + pytest.param( + 1, + DataFrame({"": ["a", "b", 1]}), + id="default_category is a int", + ), + pytest.param( + 1.25, + DataFrame({"": ["a", "b", 1.25]}), + id="default_category is a float", + ), + pytest.param( + 0, + DataFrame({"": ["a", "b", 0]}), + id="default_category is a 0", + ), + pytest.param( + False, + DataFrame({"": ["a", "b", False]}), + id="default_category is a bool", + ), + pytest.param( + (1, 2), + DataFrame({"": ["a", "b", (1, 2)]}), + id="default_category is a tuple", + ), + ], +) +def test_no_prefix_string_cats_default_category(default_category, 
expected): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) - expected = DataFrame({"": ["a", "b", "c"]}) - result = from_dummies(dummies, default_category="c") + result = from_dummies(dummies, default_category=default_category) tm.assert_frame_equal(result, expected) @@ -320,31 +354,45 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -def test_with_prefix_default_category_str(dummies_with_unassigned): - expected = DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}) - result = from_dummies(dummies_with_unassigned, sep="_", default_category="x") - tm.assert_frame_equal(result, expected) - - -def test_with_prefix_default_category_int_and_float( - dummies_with_unassigned, -): - expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}) - result = from_dummies( - dummies_with_unassigned, - sep="_", - default_category={"col2": 1, "col1": 2.5}, - ) - tm.assert_frame_equal(result, expected) - - -def test_with_prefix_default_category_bool_and_none( - dummies_with_unassigned, +@pytest.mark.parametrize( + "default_category, expected", + [ + pytest.param( + "x", + DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}), + id="default_category is a str", + ), + pytest.param( + 0, + DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}), + id="default_category is a 0", + ), + pytest.param( + False, + DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}), + id="default_category is a False", + ), + pytest.param( + {"col2": 1, "col1": 2.5}, + DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}), + id="default_category is a dict with int and float values", + ), + pytest.param( + {"col2": None, "col1": False}, + DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}), + id="default_category is a dict with bool and None values", + ), + pytest.param( + {"col2": (1, 2), "col1": [1.25, False]}, + DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}), + 
id="default_category is a dict with list and tuple values", + ), + ], +) +def test_with_prefix_default_category( + dummies_with_unassigned, default_category, expected ): - expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}) result = from_dummies( - dummies_with_unassigned, - sep="_", - default_category={"col2": None, "col1": False}, + dummies_with_unassigned, sep="_", default_category=default_category ) tm.assert_frame_equal(result, expected) From 23c133f005e5b54092645a5c841fa8f1b19c098f Mon Sep 17 00:00:00 2001 From: Peter Schmitt-Foerster Date: Sat, 25 Jun 2022 12:42:33 +0200 Subject: [PATCH 95/95] Use .loc[:, prefix_slice] instead of [prefix_slice] --- pandas/core/reshape/encoding.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index aea5157dab44f..fc908a5648885 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -494,7 +494,7 @@ def from_dummies( cats = prefix_slice.copy() else: cats = [col[len(prefix + sep) :] for col in prefix_slice] - assigned = data_to_decode[prefix_slice].sum(axis=1) + assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1) if any(assigned > 1): raise ValueError( "Dummy DataFrame contains multi-assignment(s); " @@ -508,9 +508,11 @@ def from_dummies( "Dummy DataFrame contains unassigned value(s); " f"First instance in row: {assigned.idxmin()}" ) - data_slice = concat((data_to_decode[prefix_slice], assigned == 0), axis=1) + data_slice = concat( + (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1 + ) else: - data_slice = data_to_decode[prefix_slice] + data_slice = data_to_decode.loc[:, prefix_slice] cats_array = np.array(cats, dtype="object") # get indices of True entries along axis=1 cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]