Skip to content

BUG: int dtype for get_dummies #13796

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,17 @@ Previous versions of pandas would permanently silence numpy's ufunc error handli

After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate <http://docs.scipy.org/doc/numpy/reference/generated/numpy.errstate.html>`__ around the source of the ``RuntimeWarning`` to control how these conditions are handled.

get_dummies dtypes
^^^^^^^^^^^^^^^^^^

The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers (``uint8``), rather than floats.

.. ipython:: python

pd.get_dummies(['a', 'b', 'a', 'c']).dtypes

Previously, this would have been a DataFrame of float columns (:issue:`8725`).

.. _whatsnew_0190.enhancements.other:

Other enhancements
Expand Down Expand Up @@ -479,7 +490,6 @@ API changes
- ``Series.unique()`` with datetime and timezone now returns an array of ``Timestamp`` with timezone (:issue:`13565`)



.. _whatsnew_0190.api.tolist:

``Series.tolist()`` will now return Python types
Expand Down
11 changes: 7 additions & 4 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -1161,14 +1161,17 @@ def get_empty_Frame(data, sparse):
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs)),
sparse_index=IntIndex(N, ixs), fill_value=0)
sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
sparse_index=IntIndex(N, ixs), fill_value=0,
dtype=np.uint8)
sparse_series[col] = SparseSeries(data=sarr, index=index)

return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
dtype=np.uint8)
return out

else:
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

if not dummy_na:
# reset NaN GH4446
Expand Down
2 changes: 2 additions & 0 deletions pandas/stats/tests/test_ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ def testWithXEffects(self):
exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]],
columns=['x1_30', 'x1_9', 'x2', 'intercept'],
index=res.index, dtype=float)
exp_x[['x1_30', 'x1_9']] = exp_x[['x1_30', 'x1_9']].astype(np.uint8)
assert_frame_equal(res, exp_x.reindex(columns=res.columns))

def testWithXEffectsAndDroppedDummies(self):
Expand All @@ -659,6 +660,7 @@ def testWithXEffectsAndDroppedDummies(self):
exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]],
columns=['x1_6', 'x1_9', 'x2', 'intercept'],
index=res.index, dtype=float)
exp_x[['x1_6', 'x1_9']] = exp_x[['x1_6', 'x1_9']].astype(np.uint8)

assert_frame_equal(res, exp_x.reindex(columns=res.columns))

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/test_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2429,18 +2429,18 @@ def test_truncate(self):
def test_axis_dummies(self):
from pandas.core.reshape import make_axis_dummies

minor_dummies = make_axis_dummies(self.panel, 'minor')
minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8)
self.assertEqual(len(minor_dummies.columns),
len(self.panel.index.levels[1]))

major_dummies = make_axis_dummies(self.panel, 'major')
major_dummies = make_axis_dummies(self.panel, 'major').astype(np.uint8)
self.assertEqual(len(major_dummies.columns),
len(self.panel.index.levels[0]))

mapping = {'A': 'one', 'B': 'one', 'C': 'two', 'D': 'two'}

transformed = make_axis_dummies(self.panel, 'minor',
transform=mapping.get)
transform=mapping.get).astype(np.uint8)
self.assertEqual(len(transformed.columns), 2)
self.assert_index_equal(transformed.columns, Index(['one', 'two']))

Expand All @@ -2450,7 +2450,7 @@ def test_get_dummies(self):
from pandas.core.reshape import get_dummies, make_axis_dummies

self.panel['Label'] = self.panel.index.labels[1]
minor_dummies = make_axis_dummies(self.panel, 'minor')
minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8)
dummies = get_dummies(self.panel['Label'])
self.assert_numpy_array_equal(dummies.values, minor_dummies.values)

Expand Down
Loading