|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
| 3 | +from collections import defaultdict |
3 | 4 | import itertools
|
| 5 | +from typing import Hashable |
4 | 6 |
|
5 | 7 | import numpy as np
|
6 | 8 |
|
@@ -68,6 +70,7 @@ def get_dummies(
|
68 | 70 | See Also
|
69 | 71 | --------
|
70 | 72 | Series.str.get_dummies : Convert Series to dummy codes.
|
| 73 | + :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``. |
71 | 74 |
|
72 | 75 | Notes
|
73 | 76 | -----
|
@@ -316,3 +319,202 @@ def get_empty_frame(data) -> DataFrame:
|
316 | 319 | dummy_mat = dummy_mat[:, 1:]
|
317 | 320 | dummy_cols = dummy_cols[1:]
|
318 | 321 | return DataFrame(dummy_mat, index=index, columns=dummy_cols)
|
| 322 | + |
| 323 | + |
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories: the
        character(s) separating the prefix (the variable name) from the
        category name. For example, if your column names are 'prefix_A' and
        'prefix_B', you can strip the prefix and underscore by specifying
        sep='_'. If None, all columns are treated as the categories of a
        single variable whose resulting column name is the empty string.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data. The result has one
        column per prefix and carries the index of ``data``.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0  a
    1  b
    2  c
    3  a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
      col1 col2
    0    a    b
    1    b    a
    2    a    c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
      col1 col2
    0    a    b
    1    b    a
    2    d    e
    """
    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # Compute the per-column NA mask once; idxmax on a boolean Series yields the
    # label of the first column that contains an NA.
    na_per_column = data.isna().any()
    if na_per_column.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{na_per_column.idxmax()}'"
        )

    # Casting to the nullable boolean dtype doubles as validation that every
    # column only holds dummy (0/1 or boolean) data.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError as err:
        raise TypeError("Passed DataFrame contains non-dummy data") from err

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # Without a separator all columns belong to one unnamed variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            # If nothing was split off, the column does not contain ``sep``.
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            if len(default_category) != len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Broadcast a single default to every prefix.
            default_category = dict.fromkeys(variables_slice, default_category)
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = list(prefix_slice)
        else:
            # Strip "<prefix><sep>" from each column name to get the category.
            strip_len = len(prefix) + len(sep)
            cats = [col[strip_len:] for col in prefix_slice]

        # NA values were rejected above, so a plain boolean matrix is safe.
        dummies = data_to_decode.loc[:, prefix_slice].to_numpy(dtype=bool)
        assigned = dummies.sum(axis=1)

        multi_assigned = assigned > 1
        if multi_assigned.any():
            # argmax on a boolean mask gives the position of the first True,
            # i.e. the first offending row (not the row with the largest count).
            first_row = data.index[int(multi_assigned.argmax())]
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {first_row}"
            )

        unassigned = assigned == 0
        if unassigned.any():
            if not isinstance(default_category, dict):
                first_row = data.index[int(unassigned.argmax())]
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {first_row}"
                )
            if prefix not in default_category:
                # Surface the documented ValueError instead of a bare KeyError.
                raise ValueError(
                    "'default_category' does not include an implied category "
                    f"for prefix: '{prefix}'"
                )
            cats.append(default_category[prefix])
            # Add the "no dummy set" mask as an extra column so that every row
            # has exactly one True entry, matching the appended default category.
            dummies = np.column_stack((dummies, unassigned))

        cats_array = np.array(cats, dtype="object")
        # Each row holds exactly one True; nonzero() returns positions in
        # row-major order, so the column indices line up with the rows.
        cat_data[prefix] = cats_array[dummies.nonzero()[1]]

    # Keep the caller's index rather than falling back to a default RangeIndex.
    return DataFrame(cat_data, index=data.index)
0 commit comments