Skip to content

Commit bfdf223

Browse files
ENH: change get_dummies default dtype to bool (#48022)
* ENH: Warn when dtype is not passed to get_dummies
* Edit get_dummies' dtype warning
* Add whatsnew entry for issue #45848
* Fix dtype warning test
* Suppress warnings in docs
* Edit whatsnew entry

Co-authored-by: Marco Edward Gorelli <[email protected]>

* Fix find_stack_level in get_dummies dtype warning
* Change the default dtype of get_dummies to bool
* Revert dtype(bool) change
* Move the changelog entry to v1.6.0.rst
* Move whatsnew entry to 'Other API changes'

Co-authored-by: Marco Edward Gorelli <[email protected]>
Co-authored-by: Marco Edward Gorelli <[email protected]>
1 parent b48a73f commit bfdf223

File tree

4 files changed

+79
-76
lines changed

4 files changed

+79
-76
lines changed

doc/source/whatsnew/v1.6.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ Other API changes
118118
^^^^^^^^^^^^^^^^^
119119
- Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`)
120120
- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` now raises ``ParserError`` instead of ``IndexError`` when using the c parser.
121+
- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`)
121122
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
122123
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
123124
-

pandas/core/reshape/encoding.py

+30-30
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_dummies(
6666
drop_first : bool, default False
6767
Whether to get k-1 dummies out of k categorical levels by removing the
6868
first level.
69-
dtype : dtype, default np.uint8
69+
dtype : dtype, default bool
7070
Data type for new columns. Only a single dtype is allowed.
7171
7272
Returns
@@ -89,50 +89,50 @@ def get_dummies(
8989
>>> s = pd.Series(list('abca'))
9090
9191
>>> pd.get_dummies(s)
92-
a b c
93-
0 1 0 0
94-
1 0 1 0
95-
2 0 0 1
96-
3 1 0 0
92+
a b c
93+
0 True False False
94+
1 False True False
95+
2 False False True
96+
3 True False False
9797
9898
>>> s1 = ['a', 'b', np.nan]
9999
100100
>>> pd.get_dummies(s1)
101-
a b
102-
0 1 0
103-
1 0 1
104-
2 0 0
101+
a b
102+
0 True False
103+
1 False True
104+
2 False False
105105
106106
>>> pd.get_dummies(s1, dummy_na=True)
107-
a b NaN
108-
0 1 0 0
109-
1 0 1 0
110-
2 0 0 1
107+
a b NaN
108+
0 True False False
109+
1 False True False
110+
2 False False True
111111
112112
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
113113
... 'C': [1, 2, 3]})
114114
115115
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
116116
C col1_a col1_b col2_a col2_b col2_c
117-
0 1 1 0 0 1 0
118-
1 2 0 1 1 0 0
119-
2 3 1 0 0 0 1
117+
0 1 True False False True False
118+
1 2 False True True False False
119+
2 3 True False False False True
120120
121121
>>> pd.get_dummies(pd.Series(list('abcaa')))
122-
a b c
123-
0 1 0 0
124-
1 0 1 0
125-
2 0 0 1
126-
3 1 0 0
127-
4 1 0 0
122+
a b c
123+
0 True False False
124+
1 False True False
125+
2 False False True
126+
3 True False False
127+
4 True False False
128128
129129
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
130-
b c
131-
0 0 0
132-
1 1 0
133-
2 0 1
134-
3 0 0
135-
4 0 0
130+
b c
131+
0 False False
132+
1 True False
133+
2 False True
134+
3 False False
135+
4 False False
136136
137137
>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
138138
a b c
@@ -236,7 +236,7 @@ def _get_dummies_1d(
236236
codes, levels = factorize_from_iterable(Series(data))
237237

238238
if dtype is None:
239-
dtype = np.dtype(np.uint8)
239+
dtype = np.dtype(bool)
240240
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
241241
# dtype[Any], Type[object]]"; expected "Type[Any]"
242242
dtype = np.dtype(dtype) # type: ignore[arg-type]

pandas/tests/frame/indexing/test_getitem.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,7 @@ def test_getitem_list_of_labels_categoricalindex_cols(self):
5252
# GH#16115
5353
cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])
5454

55-
expected = DataFrame(
56-
[[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats
57-
)
55+
expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats)
5856
dummies = get_dummies(cats)
5957
result = dummies[list(dummies.columns)]
6058
tm.assert_frame_equal(result, expected)

pandas/tests/reshape/test_get_dummies.py

+47-43
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def test_get_dummies_unicode(self, sparse):
171171
s = [e, eacute, eacute]
172172
res = get_dummies(s, prefix="letter", sparse=sparse)
173173
exp = DataFrame(
174-
{"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
174+
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
175175
)
176176
if sparse:
177177
exp = exp.apply(SparseArray, fill_value=0)
@@ -182,15 +182,15 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
182182
result = get_dummies(df, sparse=sparse)
183183
expected = DataFrame(
184184
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
185-
dtype=np.uint8,
185+
dtype=bool,
186186
)
187187
if sparse:
188188
expected = DataFrame(
189189
{
190-
"A_a": SparseArray([1, 0, 1], dtype="uint8"),
191-
"A_b": SparseArray([0, 1, 0], dtype="uint8"),
192-
"B_b": SparseArray([1, 1, 0], dtype="uint8"),
193-
"B_c": SparseArray([0, 0, 1], dtype="uint8"),
190+
"A_a": SparseArray([1, 0, 1], dtype="bool"),
191+
"A_b": SparseArray([0, 1, 0], dtype="bool"),
192+
"B_b": SparseArray([1, 1, 0], dtype="bool"),
193+
"B_c": SparseArray([0, 0, 1], dtype="bool"),
194194
}
195195
)
196196

@@ -208,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df):
208208
"B_b": [1, 1, 0],
209209
"B_c": [0, 0, 1],
210210
},
211-
dtype=np.uint8,
211+
dtype=bool,
212212
)
213213
tm.assert_frame_equal(result, expected)
214214

@@ -238,12 +238,11 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
238238
expected = DataFrame(
239239
{
240240
"C": [1, 2, 3],
241-
"from_A_a": [1, 0, 1],
242-
"from_A_b": [0, 1, 0],
243-
"from_B_b": [1, 1, 0],
244-
"from_B_c": [0, 0, 1],
241+
"from_A_a": [True, False, True],
242+
"from_A_b": [False, True, False],
243+
"from_B_b": [True, True, False],
244+
"from_B_c": [False, False, True],
245245
},
246-
dtype=np.uint8,
247246
)
248247
expected[["C"]] = df[["C"]]
249248
cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
@@ -258,9 +257,12 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
258257
result = get_dummies(df, prefix="bad", sparse=sparse)
259258
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
260259
expected = DataFrame(
261-
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
260+
[
261+
[1, True, False, True, False],
262+
[2, False, True, True, False],
263+
[3, True, False, False, True],
264+
],
262265
columns=["C"] + bad_columns,
263-
dtype=np.uint8,
264266
)
265267
expected = expected.astype({"C": np.int64})
266268
if sparse:
@@ -269,10 +271,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
269271
expected = pd.concat(
270272
[
271273
Series([1, 2, 3], name="C"),
272-
Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"),
273-
Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
274-
Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
275-
Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"),
274+
Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
275+
Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
276+
Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
277+
Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
276278
],
277279
axis=1,
278280
)
@@ -290,30 +292,29 @@ def test_dataframe_dummies_subset(self, df, sparse):
290292
},
291293
)
292294
cols = expected.columns
293-
expected[cols[1:]] = expected[cols[1:]].astype(np.uint8)
295+
expected[cols[1:]] = expected[cols[1:]].astype(bool)
294296
expected[["C"]] = df[["C"]]
295297
if sparse:
296298
cols = ["from_A_a", "from_A_b"]
297-
expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
299+
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
298300
tm.assert_frame_equal(result, expected)
299301

300302
def test_dataframe_dummies_prefix_sep(self, df, sparse):
301303
result = get_dummies(df, prefix_sep="..", sparse=sparse)
302304
expected = DataFrame(
303305
{
304306
"C": [1, 2, 3],
305-
"A..a": [1, 0, 1],
306-
"A..b": [0, 1, 0],
307-
"B..b": [1, 1, 0],
308-
"B..c": [0, 0, 1],
307+
"A..a": [True, False, True],
308+
"A..b": [False, True, False],
309+
"B..b": [True, True, False],
310+
"B..c": [False, False, True],
309311
},
310-
dtype=np.uint8,
311312
)
312313
expected[["C"]] = df[["C"]]
313314
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
314315
if sparse:
315316
cols = ["A..a", "A..b", "B..b", "B..c"]
316-
expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
317+
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
317318

318319
tm.assert_frame_equal(result, expected)
319320

@@ -356,9 +357,9 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
356357
)
357358

358359
columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
359-
expected[columns] = expected[columns].astype(np.uint8)
360+
expected[columns] = expected[columns].astype(bool)
360361
if sparse:
361-
expected[columns] = expected[columns].astype(SparseDtype("uint8", 0))
362+
expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
362363

363364
tm.assert_frame_equal(result, expected)
364365

@@ -422,19 +423,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
422423
[
423424
(
424425
{"data": DataFrame({"ä": ["a"]})},
425-
DataFrame({"ä_a": [1]}, dtype=np.uint8),
426+
DataFrame({"ä_a": [True]}),
426427
),
427428
(
428429
{"data": DataFrame({"x": ["ä"]})},
429-
DataFrame({"x_ä": [1]}, dtype=np.uint8),
430+
DataFrame({"x_ä": [True]}),
430431
),
431432
(
432433
{"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
433-
DataFrame({"ä_a": [1]}, dtype=np.uint8),
434+
DataFrame({"ä_a": [True]}),
434435
),
435436
(
436437
{"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
437-
DataFrame({"xäa": [1]}, dtype=np.uint8),
438+
DataFrame({"xäa": [True]}),
438439
),
439440
],
440441
)
@@ -451,7 +452,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
451452
s_series = Series(s_list)
452453
s_series_index = Series(s_list, list("ABC"))
453454

454-
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)
455+
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
455456

456457
result = get_dummies(s_list, drop_first=True, sparse=sparse)
457458
if sparse:
@@ -487,14 +488,14 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
487488
# Test NA handling together with drop_first
488489
s_NA = ["a", "b", np.nan]
489490
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
490-
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
491+
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
491492
if sparse:
492493
exp = exp.apply(SparseArray, fill_value=0)
493494

494495
tm.assert_frame_equal(res, exp)
495496

496497
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
497-
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
498+
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
498499
["b", np.nan], axis=1
499500
)
500501
if sparse:
@@ -510,7 +511,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
510511
def test_dataframe_dummies_drop_first(self, df, sparse):
511512
df = df[["A", "B"]]
512513
result = get_dummies(df, drop_first=True, sparse=sparse)
513-
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
514+
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
514515
if sparse:
515516
expected = expected.apply(SparseArray, fill_value=0)
516517
tm.assert_frame_equal(result, expected)
@@ -522,7 +523,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
522523
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
523524
)
524525
cols = ["A_b", "B_c", "cat_y"]
525-
expected[cols] = expected[cols].astype(np.uint8)
526+
expected[cols] = expected[cols].astype(bool)
526527
expected = expected[["C", "A_b", "B_c", "cat_y"]]
527528
if sparse:
528529
for col in cols:
@@ -544,7 +545,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
544545
}
545546
)
546547
cols = ["A_b", "A_nan", "B_c", "B_nan"]
547-
expected[cols] = expected[cols].astype(np.uint8)
548+
expected[cols] = expected[cols].astype(bool)
548549
expected = expected.sort_index(axis=1)
549550
if sparse:
550551
for col in cols:
@@ -559,13 +560,13 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
559560
def test_get_dummies_int_int(self):
560561
data = Series([1, 2, 1])
561562
result = get_dummies(data)
562-
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
563+
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
563564
tm.assert_frame_equal(result, expected)
564565

565566
data = Series(Categorical(["a", "b", "a"]))
566567
result = get_dummies(data)
567568
expected = DataFrame(
568-
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
569+
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
569570
)
570571
tm.assert_frame_equal(result, expected)
571572

@@ -616,9 +617,12 @@ def test_get_dummies_duplicate_columns(self, df):
616617
result = get_dummies(df).sort_index(axis=1)
617618

618619
expected = DataFrame(
619-
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
620+
[
621+
[1, True, False, True, False],
622+
[2, False, True, True, False],
623+
[3, True, False, False, True],
624+
],
620625
columns=["A", "A_a", "A_b", "A_b", "A_c"],
621-
dtype=np.uint8,
622626
).sort_index(axis=1)
623627

624628
expected = expected.astype({"A": np.int64})
@@ -628,7 +632,7 @@ def test_get_dummies_duplicate_columns(self, df):
628632
def test_get_dummies_all_sparse(self):
629633
df = DataFrame({"A": [1, 2]})
630634
result = get_dummies(df, columns=["A"], sparse=True)
631-
dtype = SparseDtype("uint8", 0)
635+
dtype = SparseDtype("bool", 0)
632636
expected = DataFrame(
633637
{
634638
"A_1": SparseArray([1, 0], dtype=dtype),

0 commit comments

Comments
 (0)