Skip to content

Commit ccec504

Browse files
TomAugspurger authored and jreback committed
BUG: int dtype for get_dummies
Closes #8725. Changes `get_dummies` to return columns with `uint8` dtypes instead of coercing them to floats when they appear alongside other float columns. Author: Tom Augspurger <[email protected]> Closes #13796 from TomAugspurger/get_dummies_dtype and squashes the following commits: cace0f7 [Tom Augspurger] BUG: int dtype for get_dummies
1 parent 362a561 commit ccec504

File tree

5 files changed

+179
-94
lines changed

5 files changed

+179
-94
lines changed

doc/source/whatsnew/v0.19.0.txt

+27-2
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,32 @@ Previous versions of pandas would permanently silence numpy's ufunc error handli
371371

372372
After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate <http://docs.scipy.org/doc/numpy/reference/generated/numpy.errstate.html>`__ around the source of the ``RuntimeWarning`` to control how these conditions are handled.
373373

374+
.. _whatsnew_0190.get_dummies_dtypes:
375+
376+
get_dummies dtypes
377+
^^^^^^^^^^^^^^^^^^
378+
379+
The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`)
380+
381+
Previous Behavior:
382+
383+
.. code-block:: ipython
384+
385+
In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes
386+
387+
Out[1]:
388+
a float64
389+
b float64
390+
c float64
391+
dtype: object
392+
393+
New Behavior:
394+
395+
.. ipython:: python
396+
397+
pd.get_dummies(['a', 'b', 'a', 'c']).dtypes
398+
399+
374400
.. _whatsnew_0190.enhancements.other:
375401

376402
Other enhancements
@@ -479,7 +505,6 @@ API changes
479505
- ``Series.unique()`` with datetime and timezone now returns an array of ``Timestamp`` with timezone (:issue:`13565`)
480506

481507

482-
483508
.. _whatsnew_0190.api.tolist:
484509

485510
``Series.tolist()`` will now return Python types
@@ -1355,7 +1380,7 @@ Bug Fixes
13551380
- Bug in using a NumPy ufunc with ``PeriodIndex`` to add or subtract an integer raising ``IncompatibleFrequency``. Note that using a standard operator like ``+`` or ``-`` is recommended, because standard operators use a more efficient path (:issue:`13980`)
13561381
- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)
13571382
- Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raising ``ValueError`` when ``axis=None`` (:issue:`13894`)
1358-
- Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`)
1383+
- Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`)
13591384

13601385

13611386
- Bug in ``Index`` raising a ``KeyError`` that displays the incorrect column when the column is not in the df and the columns contain duplicate values (:issue:`13822`)

pandas/core/reshape.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1161,14 +1161,17 @@ def get_empty_Frame(data, sparse):
11611161
sp_indices = sp_indices[1:]
11621162
dummy_cols = dummy_cols[1:]
11631163
for col, ixs in zip(dummy_cols, sp_indices):
1164-
sarr = SparseArray(np.ones(len(ixs)),
1165-
sparse_index=IntIndex(N, ixs), fill_value=0)
1164+
sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
1165+
sparse_index=IntIndex(N, ixs), fill_value=0,
1166+
dtype=np.uint8)
11661167
sparse_series[col] = SparseSeries(data=sarr, index=index)
11671168

1168-
return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
1169+
out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
1170+
dtype=np.uint8)
1171+
return out
11691172

11701173
else:
1171-
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
1174+
dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)
11721175

11731176
if not dummy_na:
11741177
# reset NaN GH4446

pandas/stats/tests/test_ols.py

+2
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,7 @@ def testWithXEffects(self):
645645
exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]],
646646
columns=['x1_30', 'x1_9', 'x2', 'intercept'],
647647
index=res.index, dtype=float)
648+
exp_x[['x1_30', 'x1_9']] = exp_x[['x1_30', 'x1_9']].astype(np.uint8)
648649
assert_frame_equal(res, exp_x.reindex(columns=res.columns))
649650

650651
def testWithXEffectsAndDroppedDummies(self):
@@ -659,6 +660,7 @@ def testWithXEffectsAndDroppedDummies(self):
659660
exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]],
660661
columns=['x1_6', 'x1_9', 'x2', 'intercept'],
661662
index=res.index, dtype=float)
663+
exp_x[['x1_6', 'x1_9']] = exp_x[['x1_6', 'x1_9']].astype(np.uint8)
662664

663665
assert_frame_equal(res, exp_x.reindex(columns=res.columns))
664666

pandas/tests/test_panel.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -2429,18 +2429,18 @@ def test_truncate(self):
24292429
def test_axis_dummies(self):
24302430
from pandas.core.reshape import make_axis_dummies
24312431

2432-
minor_dummies = make_axis_dummies(self.panel, 'minor')
2432+
minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8)
24332433
self.assertEqual(len(minor_dummies.columns),
24342434
len(self.panel.index.levels[1]))
24352435

2436-
major_dummies = make_axis_dummies(self.panel, 'major')
2436+
major_dummies = make_axis_dummies(self.panel, 'major').astype(np.uint8)
24372437
self.assertEqual(len(major_dummies.columns),
24382438
len(self.panel.index.levels[0]))
24392439

24402440
mapping = {'A': 'one', 'B': 'one', 'C': 'two', 'D': 'two'}
24412441

24422442
transformed = make_axis_dummies(self.panel, 'minor',
2443-
transform=mapping.get)
2443+
transform=mapping.get).astype(np.uint8)
24442444
self.assertEqual(len(transformed.columns), 2)
24452445
self.assert_index_equal(transformed.columns, Index(['one', 'two']))
24462446

@@ -2450,7 +2450,7 @@ def test_get_dummies(self):
24502450
from pandas.core.reshape import get_dummies, make_axis_dummies
24512451

24522452
self.panel['Label'] = self.panel.index.labels[1]
2453-
minor_dummies = make_axis_dummies(self.panel, 'minor')
2453+
minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8)
24542454
dummies = get_dummies(self.panel['Label'])
24552455
self.assert_numpy_array_equal(dummies.values, minor_dummies.values)
24562456

0 commit comments

Comments
 (0)