From cd9205c3b745b0180046fef5a731f673c4148bb5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Jul 2023 17:36:45 -0700 Subject: [PATCH 1/3] BUG: from_dummies always returning object data --- doc/source/whatsnew/v2.1.0.rst | 2 ++ pandas/core/reshape/encoding.py | 11 ++++-- pandas/tests/reshape/test_from_dummies.py | 43 +++++++++++++++++++++-- 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0fdec3175f635..4ee883bc2bf63 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -684,6 +684,8 @@ Other - Bug in :class:`DatetimeIndex` where ``repr`` of index passed with time does not print time is midnight and non-day based freq(:issue:`53470`) - Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`) - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`) +- Bug in :func:`from_dumies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`?`) +- Bug in :func:`from_dumies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`?`) - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 8c464c2229515..f2ba9a67d89b8 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -529,8 +529,13 @@ def from_dummies( ) else: data_slice = data_to_decode.loc[:, prefix_slice] - cats_array = np.array(cats, dtype="object") + cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) # get indices of True entries along axis=1 - cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]] + true_values = data_slice.idxmax(axis=1) + indexer = data_slice.columns.get_indexer_for(true_values) + cat_data[prefix] = cats_array.take(indexer).set_axis(data.index) - return DataFrame(cat_data) + result = DataFrame(cat_data) + if sep is not None: + result.columns = result.columns.astype(data.columns.dtype) + return result diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index aab49538a2dcf..bdbb3791ca6b7 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -257,7 +257,7 @@ def test_no_prefix_int_cats_basic(): dummies = DataFrame( {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]} ) - expected = DataFrame({"": [1, 25, 2, 5]}, dtype="object") + expected = DataFrame({"": [1, 25, 2, 5]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -266,7 +266,7 @@ def test_no_prefix_float_cats_basic(): dummies = DataFrame( {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]} ) - expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object") + expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}) result = from_dummies(dummies) tm.assert_frame_equal(result, expected) @@ -399,3 +399,42 @@ def test_with_prefix_default_category( dummies_with_unassigned, sep="_", default_category=default_category ) tm.assert_frame_equal(result, expected) + + +def test_ea_categories(): + df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) + df.columns = df.columns.astype("string[python]") + result = from_dummies(df) + expected = DataFrame({"": Series(list("abca"), dtype="string[python]")}) + tm.assert_frame_equal(result, expected) + + +def test_ea_categories_with_sep(): + df = DataFrame( + { + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + } + ) + df.columns = df.columns.astype("string[python]") + result = from_dummies(df, sep="_") + expected = DataFrame( + { + "col1": Series(list("aba"), dtype="string[python]"), + "col2": Series(list("bac"), dtype="string[python]"), + } + ) + expected.columns = expected.columns.astype("string[python]") + tm.assert_frame_equal(result, expected) + + +def test_maintain_original_index(): + df = DataFrame( + {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd") + ) + result = from_dummies(df) + expected = DataFrame({"": list("abca")}, index=list("abcd")) + tm.assert_frame_equal(result, expected) From 295bfd70f8468273444fc2079a9e0c30e0e6448c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Jul 2023 17:37:39 -0700 Subject: [PATCH 2/3] misspelling --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 4ee883bc2bf63..89b50553a601d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -684,8 +684,8 @@ Other - Bug in :class:`DatetimeIndex` where ``repr`` of index passed with time does not print time is midnight and non-day based freq(:issue:`53470`) - Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`) - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`) -- Bug in :func:`from_dumies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`?`) -- Bug in :func:`from_dumies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`?`) +- Bug in :func:`from_dummies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`?`) +- Bug in :func:`from_dummies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`?`) - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) From 10bafea064f5f0177305bf6d25028502df76ad80 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Jul 2023 17:39:54 -0700 Subject: [PATCH 3/3] Add Gh issue --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- pandas/tests/reshape/test_from_dummies.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 89b50553a601d..260ff19ba16c1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -684,10 +684,10 @@ Other - Bug in :class:`DatetimeIndex` where ``repr`` of index passed with time does not print time is midnight and non-day based freq(:issue:`53470`) - Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`) - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`) -- Bug in :func:`from_dummies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`?`) -- Bug in :func:`from_dummies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`?`) - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`) +- Bug in :func:`from_dummies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`54300`) +- Bug in :func:`from_dummies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`54300`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) - Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`) - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index bdbb3791ca6b7..0074a90d7a51e 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -402,6 +402,7 @@ def test_with_prefix_default_category( def test_ea_categories(): + # GH 54300 df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) df.columns = df.columns.astype("string[python]") result = from_dummies(df) @@ -410,6 +411,7 @@ def test_ea_categories(): def test_ea_categories_with_sep(): + # GH 54300 df = DataFrame( { "col1_a": [1, 0, 1], @@ -432,6 +434,7 @@ def test_ea_categories_with_sep(): def test_maintain_original_index(): + # GH 54300 df = DataFrame( {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd") )