diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 4365c66237752..9cd92883aa939 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -371,6 +371,17 @@ Previous versions of pandas would permanently silence numpy's ufunc error handli After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate <http://docs.scipy.org/doc/numpy/reference/generated/numpy.errstate.html>`__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. +get_dummies dtypes +^^^^^^^^^^^^^^^^^^ + +The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers (``uint8``), rather than floats. + +.. ipython:: python + + pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + +Previously, this would have been a DataFrame of float columns (:issue:`8725`). + .. _whatsnew_0190.enhancements.other: Other enhancements @@ -479,7 +490,6 @@ API changes - ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) - .. 
_whatsnew_0190.api.tolist: ``Series.tolist()`` will now return Python types diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 4f601a2d377a6..b451f49fce78c 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1161,14 +1161,17 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs)), - sparse_index=IntIndex(N, ixs), fill_value=0) + sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), + sparse_index=IntIndex(N, ixs), fill_value=0, + dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) - return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) + out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, + dtype=np.uint8) + return out else: - dummy_mat = np.eye(number_of_cols).take(codes, axis=0) + dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index bac824f0b4840..770f7b35a02ca 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -645,6 +645,7 @@ def testWithXEffects(self): exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]], columns=['x1_30', 'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) + exp_x[['x1_30', 'x1_9']] = exp_x[['x1_30', 'x1_9']].astype(np.uint8) assert_frame_equal(res, exp_x.reindex(columns=res.columns)) def testWithXEffectsAndDroppedDummies(self): @@ -659,6 +660,7 @@ def testWithXEffectsAndDroppedDummies(self): exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]], columns=['x1_6', 'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) + exp_x[['x1_6', 'x1_9']] = exp_x[['x1_6', 'x1_9']].astype(np.uint8) assert_frame_equal(res, exp_x.reindex(columns=res.columns)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 
10a6693525590..0b266d799cf8c 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2429,18 +2429,18 @@ def test_truncate(self): def test_axis_dummies(self): from pandas.core.reshape import make_axis_dummies - minor_dummies = make_axis_dummies(self.panel, 'minor') + minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) self.assertEqual(len(minor_dummies.columns), len(self.panel.index.levels[1])) - major_dummies = make_axis_dummies(self.panel, 'major') + major_dummies = make_axis_dummies(self.panel, 'major').astype(np.uint8) self.assertEqual(len(major_dummies.columns), len(self.panel.index.levels[0])) mapping = {'A': 'one', 'B': 'one', 'C': 'two', 'D': 'two'} transformed = make_axis_dummies(self.panel, 'minor', - transform=mapping.get) + transform=mapping.get).astype(np.uint8) self.assertEqual(len(transformed.columns), 2) self.assert_index_equal(transformed.columns, Index(['one', 'two'])) @@ -2450,7 +2450,7 @@ def test_get_dummies(self): from pandas.core.reshape import get_dummies, make_axis_dummies self.panel['Label'] = self.panel.index.labels[1] - minor_dummies = make_axis_dummies(self.panel, 'minor') + minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) dummies = get_dummies(self.panel['Label']) self.assert_numpy_array_equal(dummies.values, minor_dummies.values) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 7136d7effc1fc..8bfd6350adc06 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -174,15 +174,15 @@ def test_basic(self): s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - 'c': {0: 0.0, - 1: 0.0, - 2: 1.0}}) + expected = DataFrame({'a': {0: 1, + 1: 0, + 2: 0}, + 'b': {0: 0, + 1: 1, + 2: 0}, + 'c': {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) 
assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -200,7 +200,7 @@ def test_basic_types(self): if not self.sparse: exp_df_type = DataFrame - exp_blk_type = pd.core.internals.FloatBlock + exp_blk_type = pd.core.internals.IntBlock else: exp_df_type = SparseDataFrame exp_blk_type = pd.core.internals.SparseBlock @@ -239,22 +239,24 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, - 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, + 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, - 'a': {0: 1.0, 1: 0.0, 2: 0.0}, - 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, + 'a': {0: 1, 1: 0, 2: 0}, + 'b': {0: 0, 1: 1, 2: 0}}, + dtype=np.uint8) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) - exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan]) + exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], + dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self @@ -264,31 +266,34 @@ def test_unicode(self eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] res = get_dummies(s, prefix='letter', sparse=self.sparse) - exp = DataFrame({'letter_e': {0: 1.0, - 1: 0.0, - 2: 0.0}, - u('letter_%s') % eacute: {0: 0.0, - 1: 1.0, - 2: 1.0}}) + exp = DataFrame({'letter_e': {0: 1, + 1: 0, + 2: 0}, + u('letter_%s') % eacute: {0: 0, + 1: 1, + 2: 1}}, + dtype=np.uint8) assert_frame_equal(res, exp) def 
test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse) - expected = DataFrame({'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1]}) + expected = DataFrame({'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1]}) + 'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1]}) + cols = ['A_a', 'A_b', 'B_b', 'B_c'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -299,10 +304,12 @@ def test_dataframe_dummies_prefix_list(self): 'C': [1, 2, 3]}) result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], - 'from_B_b': [1., 1, 0], - 'from_B_c': [0., 0, 1]}) + 'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], + 'from_B_b': [1, 1, 0], + 'from_B_c': [0, 0, 1]}) + cols = expected.columns[1:] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -311,31 +318,37 @@ def test_dataframe_dummies_prefix_str(self): # not that you should do this... 
df = self.df result = get_dummies(df, prefix='bad', sparse=self.sparse) - expected = DataFrame([[1, 1., 0., 1., 0.], - [2, 0., 1., 1., 0.], - [3, 1., 0., 0., 1.]], - columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']) + expected = DataFrame([[1, 1, 0, 1, 0], + [2, 0, 1, 1, 0], + [3, 1, 0, 0, 1]], + columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], + dtype=np.uint8) + expected = expected.astype({"C": np.int}) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self): df = self.df result = get_dummies(df, prefix=['from_A'], columns=['A'], sparse=self.sparse) - expected = DataFrame({'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], + expected = DataFrame({'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) + cols = ['from_A_a', 'from_A_b'] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): df = self.df result = get_dummies(df, prefix_sep='..', sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A..a': [1., 0, 1], - 'A..b': [0., 1, 0], - 'B..b': [1., 1, 0], - 'B..c': [0., 0, 1]}) + 'A..a': [1, 0, 1], + 'A..b': [0, 1, 0], + 'B..b': [1, 1, 0], + 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + cols = expected.columns[1:] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ -360,11 +373,13 @@ def test_dataframe_dummies_prefix_dict(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) result = get_dummies(df, prefix=prefixes, sparse=self.sparse) - expected = DataFrame({'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], - 'from_B_b': [1., 1, 0], - 'from_B_c': [0., 0, 1], + expected = DataFrame({'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], + 'from_B_b': [1, 1, 0], + 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) + cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + expected[cols] = 
expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): @@ -372,12 +387,14 @@ def test_dataframe_dummies_with_na(self): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_a': [1., 0, 1, 0], - 'A_b': [0., 1, 0, 0], - 'A_nan': [0., 0, 0, 1], - 'B_b': [1., 1, 0, 0], - 'B_c': [0., 0, 1, 0], - 'B_nan': [0., 0, 0, 1]}) + 'A_a': [1, 0, 1, 0], + 'A_b': [0, 1, 0, 0], + 'A_nan': [0, 0, 0, 1], + 'B_b': [1, 1, 0, 0], + 'B_c': [0, 0, 1, 0], + 'B_nan': [0, 0, 0, 1]}) + cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -391,12 +408,14 @@ def test_dataframe_dummies_with_categorical(self): df['cat'] = pd.Categorical(['x', 'y', 'y']) result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1], - 'cat_x': [1., 0, 0], - 'cat_y': [0., 1, 1]}) + 'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1], + 'cat_x': [1, 0, 0], + 'cat_y': [0, 1, 1]}) + cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -408,12 +427,12 @@ def test_basic_drop_first(self): s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - 'c': {0: 0.0, - 1: 0.0, - 2: 1.0}}) + expected = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}, + 'c': {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -449,19 +468,19 @@ def test_basic_drop_first_NA(self): # Test NA 
hadling together with drop_first s_NA = ['a', 'b', np.nan] res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) - exp = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}) + exp = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, drop_first=True) - exp_na = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - nan: {0: 0.0, - 1: 0.0, - 2: 1.0}}).reindex_axis( + exp_na = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}, + nan: {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) @@ -473,8 +492,8 @@ def test_basic_drop_first_NA(self): def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) - expected = DataFrame({'A_b': [0., 1, 0], - 'B_c': [0., 0, 1]}) + expected = DataFrame({'A_b': [0, 1, 0], + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -482,9 +501,11 @@ def test_dataframe_dummies_drop_first_with_categorical(self): df['cat'] = pd.Categorical(['x', 'y', 'y']) result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = DataFrame({'C': [1, 2, 3], - 'A_b': [0., 1, 0], - 'B_c': [0., 0, 1], - 'cat_y': [0., 1, 1]}) + 'A_b': [0, 1, 0], + 'B_c': [0, 0, 1], + 'cat_y': [0, 1, 1]}) + cols = ['A_b', 'B_c', 'cat_y'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) @@ -494,10 +515,13 @@ def test_dataframe_dummies_drop_first_with_na(self): result = get_dummies(df, dummy_na=True, sparse=self.sparse, drop_first=True) expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_b': [0., 1, 0, 0], - 'A_nan': [0., 0, 0, 1], - 'B_c': [0., 0, 1, 0], - 'B_nan': [0., 0, 0, 1]}) + 'A_b': [0, 1, 0, 0], + 'A_nan': [0, 0, 0, 1], + 'B_c': [0, 0, 1, 0], + 'B_nan': [0, 0, 0, 1]}) + cols = ['A_b', 'A_nan', 
'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.uint8) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -506,6 +530,37 @@ def test_dataframe_dummies_drop_first_with_na(self): expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) + def test_int_int(self): + data = Series([1, 2, 1]) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], + dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + data = Series(pd.Categorical(['a', 'b', 'a'])) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=['a', 'b'], + dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + def test_int_df(self): + data = DataFrame( + {'A': [1, 2, 1], + 'B': pd.Categorical(['a', 'b', 'a']), + 'C': [1, 2, 1], + 'D': [1., 2., 1.] + } + ) + columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b'] + expected = DataFrame([ + [1, 1., 1, 0, 1, 0], + [2, 2., 0, 1, 0, 1], + [1, 1., 1, 0, 1, 0] + ], columns=columns) + expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) + result = pd.get_dummies(data, columns=['A', 'B']) + tm.assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True