diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1ae76984484af..2843ee7551780 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -247,6 +247,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`45359`) +- Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Sparse diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 75f005489785a..965bd46baa7c7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -838,7 +838,7 @@ def get_dummies( columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with - `object` or `category` dtype will be converted. + `object`, `string`, or `category` dtype will be converted. sparse : bool, default False Whether the dummy-encoded columns should be backed by a :class:`SparseArray` (True) or a regular NumPy array (False). @@ -915,7 +915,7 @@ def get_dummies( """ from pandas.core.reshape.concat import concat - dtypes_to_encode = ["object", "category"] + dtypes_to_encode = ["object", "string", "category"] if isinstance(data, DataFrame): # determine columns being encoded diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 653ea88ed62ac..6c9a60caaa2be 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -196,6 +196,22 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) + def test_dataframe_dummies_string_dtype(self, df): + # GH44965 + df = df[["A", "B"]] + df = df.astype({"A": "object", "B": "string"}) + result = get_dummies(df) + expected = DataFrame( + { + "A_a": [1, 0, 1], + "A_b": [0, 1, 0], + "B_b": [1, 1, 0], + "B_c": [0, 0, 1], + }, + dtype=np.uint8, + ) + tm.assert_frame_equal(result, expected) + def test_dataframe_dummies_mix_default(self, df, sparse, dtype): result = get_dummies(df, sparse=sparse, dtype=dtype) if sparse: