ENH: change get_dummies default dtype to bool (#48022)

kianelbo · MarcoGorelli · web-flow · commit bfdf22313354 · 2022-10-11T09:29:10.000-07:00
* ENH: Warn when dtype is not passed to get_dummies
* Edit get_dummies' dtype warning
* Add whatsnew entry for issue #45848
* Fix dtype warning test
* Suppress warnings in docs
* Edit whatsnew entry

Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>

* Fix find_stack_level in get_dummies dtype warning
* Change the default dtype of get_dummies to bool
* Revert dtype(bool) change
* Move the changelog entry to v1.6.0.rst
* Move whatsnew entry to 'Other API changes'

Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>
Co-authored-by: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -118,6 +118,7 @@ Other API changes
 ^^^^^^^^^^^^^^^^^
 - Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`)
 - :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser.
+- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`)
 - :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
 - :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
 -
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
@@ -66,7 +66,7 @@ def get_dummies(
     drop_first : bool, default False
         Whether to get k-1 dummies out of k categorical levels by removing the
         first level.
-    dtype : dtype, default np.uint8
+    dtype : dtype, default bool
         Data type for new columns. Only a single dtype is allowed.
 
     Returns
@@ -89,50 +89,50 @@ def get_dummies(
     >>> s = pd.Series(list('abca'))
 
     >>> pd.get_dummies(s)
-       a  b  c
-    0  1  0  0
-    1  0  1  0
-    2  0  0  1
-    3  1  0  0
+           a      b      c
+    0   True  False  False
+    1  False   True  False
+    2  False  False   True
+    3   True  False  False
 
     >>> s1 = ['a', 'b', np.nan]
 
     >>> pd.get_dummies(s1)
-       a  b
-    0  1  0
-    1  0  1
-    2  0  0
+           a      b
+    0   True  False
+    1  False   True
+    2  False  False
 
     >>> pd.get_dummies(s1, dummy_na=True)
-       a  b  NaN
-    0  1  0    0
-    1  0  1    0
-    2  0  0    1
+           a      b    NaN
+    0   True  False  False
+    1  False   True  False
+    2  False  False   True
 
     >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
     ...                    'C': [1, 2, 3]})
 
     >>> pd.get_dummies(df, prefix=['col1', 'col2'])
        C  col1_a  col1_b  col2_a  col2_b  col2_c
-    0  1       1       0       0       1       0
-    1  2       0       1       1       0       0
-    2  3       1       0       0       0       1
+    0  1    True   False   False    True   False
+    1  2   False    True    True   False   False
+    2  3    True   False   False   False    True
 
     >>> pd.get_dummies(pd.Series(list('abcaa')))
-       a  b  c
-    0  1  0  0
-    1  0  1  0
-    2  0  0  1
-    3  1  0  0
-    4  1  0  0
+           a      b      c
+    0   True  False  False
+    1  False   True  False
+    2  False  False   True
+    3   True  False  False
+    4   True  False  False
 
     >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
-       b  c
-    0  0  0
-    1  1  0
-    2  0  1
-    3  0  0
-    4  0  0
+           b      c
+    0  False  False
+    1   True  False
+    2  False   True
+    3  False  False
+    4  False  False
 
     >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
          a    b    c
@@ -236,7 +236,7 @@ def _get_dummies_1d(
     codes, levels = factorize_from_iterable(Series(data))
 
     if dtype is None:
-        dtype = np.dtype(np.uint8)
+        dtype = np.dtype(bool)
     # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
     # dtype[Any], Type[object]]"; expected "Type[Any]"
     dtype = np.dtype(dtype)  # type: ignore[arg-type]
diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py
@@ -52,9 +52,7 @@ def test_getitem_list_of_labels_categoricalindex_cols(self):
         # GH#16115
         cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])
 
-        expected = DataFrame(
-            [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats
-        )
+        expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats)
         dummies = get_dummies(cats)
         result = dummies[list(dummies.columns)]
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
@@ -171,7 +171,7 @@ def test_get_dummies_unicode(self, sparse):
         s = [e, eacute, eacute]
         res = get_dummies(s, prefix="letter", sparse=sparse)
         exp = DataFrame(
-            {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
+            {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
         )
         if sparse:
             exp = exp.apply(SparseArray, fill_value=0)
@@ -182,15 +182,15 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
         result = get_dummies(df, sparse=sparse)
         expected = DataFrame(
             {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
-            dtype=np.uint8,
+            dtype=bool,
         )
         if sparse:
             expected = DataFrame(
                 {
-                    "A_a": SparseArray([1, 0, 1], dtype="uint8"),
-                    "A_b": SparseArray([0, 1, 0], dtype="uint8"),
-                    "B_b": SparseArray([1, 1, 0], dtype="uint8"),
-                    "B_c": SparseArray([0, 0, 1], dtype="uint8"),
+                    "A_a": SparseArray([1, 0, 1], dtype="bool"),
+                    "A_b": SparseArray([0, 1, 0], dtype="bool"),
+                    "B_b": SparseArray([1, 1, 0], dtype="bool"),
+                    "B_c": SparseArray([0, 0, 1], dtype="bool"),
                 }
             )
 
@@ -208,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df):
                 "B_b": [1, 1, 0],
                 "B_c": [0, 0, 1],
             },
-            dtype=np.uint8,
+            dtype=bool,
         )
         tm.assert_frame_equal(result, expected)
 
@@ -238,12 +238,11 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
         expected = DataFrame(
             {
                 "C": [1, 2, 3],
-                "from_A_a": [1, 0, 1],
-                "from_A_b": [0, 1, 0],
-                "from_B_b": [1, 1, 0],
-                "from_B_c": [0, 0, 1],
+                "from_A_a": [True, False, True],
+                "from_A_b": [False, True, False],
+                "from_B_b": [True, True, False],
+                "from_B_c": [False, False, True],
             },
-            dtype=np.uint8,
         )
         expected[["C"]] = df[["C"]]
         cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
@@ -258,9 +257,12 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
         result = get_dummies(df, prefix="bad", sparse=sparse)
         bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
         expected = DataFrame(
-            [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
+            [
+                [1, True, False, True, False],
+                [2, False, True, True, False],
+                [3, True, False, False, True],
+            ],
             columns=["C"] + bad_columns,
-            dtype=np.uint8,
         )
         expected = expected.astype({"C": np.int64})
         if sparse:
@@ -269,10 +271,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
             expected = pd.concat(
                 [
                     Series([1, 2, 3], name="C"),
-                    Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"),
-                    Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
-                    Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
-                    Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"),
+                    Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
+                    Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
+                    Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
+                    Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
                 ],
                 axis=1,
             )
@@ -290,30 +292,29 @@ def test_dataframe_dummies_subset(self, df, sparse):
             },
         )
         cols = expected.columns
-        expected[cols[1:]] = expected[cols[1:]].astype(np.uint8)
+        expected[cols[1:]] = expected[cols[1:]].astype(bool)
         expected[["C"]] = df[["C"]]
         if sparse:
             cols = ["from_A_a", "from_A_b"]
-            expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
+            expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
         tm.assert_frame_equal(result, expected)
 
     def test_dataframe_dummies_prefix_sep(self, df, sparse):
         result = get_dummies(df, prefix_sep="..", sparse=sparse)
         expected = DataFrame(
             {
                 "C": [1, 2, 3],
-                "A..a": [1, 0, 1],
-                "A..b": [0, 1, 0],
-                "B..b": [1, 1, 0],
-                "B..c": [0, 0, 1],
+                "A..a": [True, False, True],
+                "A..b": [False, True, False],
+                "B..b": [True, True, False],
+                "B..c": [False, False, True],
             },
-            dtype=np.uint8,
         )
         expected[["C"]] = df[["C"]]
         expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
         if sparse:
             cols = ["A..a", "A..b", "B..b", "B..c"]
-            expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
+            expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
 
         tm.assert_frame_equal(result, expected)
 
@@ -356,9 +357,9 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
         )
 
         columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
-        expected[columns] = expected[columns].astype(np.uint8)
+        expected[columns] = expected[columns].astype(bool)
         if sparse:
-            expected[columns] = expected[columns].astype(SparseDtype("uint8", 0))
+            expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
 
         tm.assert_frame_equal(result, expected)
 
@@ -422,19 +423,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
         [
             (
                 {"data": DataFrame({"ä": ["a"]})},
-                DataFrame({"ä_a": [1]}, dtype=np.uint8),
+                DataFrame({"ä_a": [True]}),
             ),
             (
                 {"data": DataFrame({"x": ["ä"]})},
-                DataFrame({"x_ä": [1]}, dtype=np.uint8),
+                DataFrame({"x_ä": [True]}),
             ),
             (
                 {"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
-                DataFrame({"ä_a": [1]}, dtype=np.uint8),
+                DataFrame({"ä_a": [True]}),
             ),
             (
                 {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
-                DataFrame({"xäa": [1]}, dtype=np.uint8),
+                DataFrame({"xäa": [True]}),
             ),
         ],
     )
@@ -451,7 +452,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
         s_series = Series(s_list)
         s_series_index = Series(s_list, list("ABC"))
 
-        expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)
+        expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
 
         result = get_dummies(s_list, drop_first=True, sparse=sparse)
         if sparse:
@@ -487,14 +488,14 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
         # Test NA handling together with drop_first
         s_NA = ["a", "b", np.nan]
         res = get_dummies(s_NA, drop_first=True, sparse=sparse)
-        exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
+        exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
         if sparse:
             exp = exp.apply(SparseArray, fill_value=0)
 
         tm.assert_frame_equal(res, exp)
 
         res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
-        exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
+        exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
             ["b", np.nan], axis=1
         )
         if sparse:
@@ -510,7 +511,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
     def test_dataframe_dummies_drop_first(self, df, sparse):
         df = df[["A", "B"]]
         result = get_dummies(df, drop_first=True, sparse=sparse)
-        expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
+        expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
         if sparse:
             expected = expected.apply(SparseArray, fill_value=0)
         tm.assert_frame_equal(result, expected)
@@ -522,7 +523,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
             {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
         )
         cols = ["A_b", "B_c", "cat_y"]
-        expected[cols] = expected[cols].astype(np.uint8)
+        expected[cols] = expected[cols].astype(bool)
         expected = expected[["C", "A_b", "B_c", "cat_y"]]
         if sparse:
             for col in cols:
@@ -544,7 +545,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
             }
         )
         cols = ["A_b", "A_nan", "B_c", "B_nan"]
-        expected[cols] = expected[cols].astype(np.uint8)
+        expected[cols] = expected[cols].astype(bool)
         expected = expected.sort_index(axis=1)
         if sparse:
             for col in cols:
@@ -559,13 +560,13 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
     def test_get_dummies_int_int(self):
         data = Series([1, 2, 1])
         result = get_dummies(data)
-        expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
+        expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
         tm.assert_frame_equal(result, expected)
 
         data = Series(Categorical(["a", "b", "a"]))
         result = get_dummies(data)
         expected = DataFrame(
-            [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
+            [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
         )
         tm.assert_frame_equal(result, expected)
 
@@ -616,9 +617,12 @@ def test_get_dummies_duplicate_columns(self, df):
         result = get_dummies(df).sort_index(axis=1)
 
         expected = DataFrame(
-            [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
+            [
+                [1, True, False, True, False],
+                [2, False, True, True, False],
+                [3, True, False, False, True],
+            ],
             columns=["A", "A_a", "A_b", "A_b", "A_c"],
-            dtype=np.uint8,
         ).sort_index(axis=1)
 
         expected = expected.astype({"A": np.int64})
@@ -628,7 +632,7 @@ def test_get_dummies_duplicate_columns(self, df):
     def test_get_dummies_all_sparse(self):
         df = DataFrame({"A": [1, 2]})
         result = get_dummies(df, columns=["A"], sparse=True)
-        dtype = SparseDtype("uint8", 0)
+        dtype = SparseDtype("bool", 0)
         expected = DataFrame(
             {
                 "A_1": SparseArray([1, 0], dtype=dtype),

Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,7 @@ Other API changes`
`118`	`118`	`^^^^^^^^^^^^^^^^^`
`119`	`119`	- Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`)
`120`	`120`	- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser.
	`121`	+- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`)
`121`	`122`	- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
`122`	`123`	- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
`123`	`124`	`-`