From a333fa95f2063551a7d3c467e0b30b409a9b804c Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Wed, 15 Nov 2017 21:47:43 +0100 Subject: [PATCH 01/24] TST: get_dummies dtype tests --- pandas/tests/reshape/test_reshape.py | 230 ++++++++++++++------------- 1 file changed, 120 insertions(+), 110 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 2722c3e92d85a..366c70d088bc1 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -217,34 +217,31 @@ def test_multiindex(self): class TestGetDummies(object): - + dtype_str = 'uint8' sparse = False + def setup_class(cls): + cls.dtype = np.dtype(cls.dtype_str) + def setup_method(self, method): self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + 'C': [1, 2, 3]}, dtype=self.dtype) def test_basic(self): s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'a': {0: 1, - 1: 0, - 2: 0}, - 'b': {0: 0, - 1: 1, - 2: 0}, - 'c': {0: 0, - 1: 0, - 2: 1}}, dtype=np.uint8) - assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) - assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) + expected = DataFrame({'a': [1, 0, 0], + 'b': [0, 1, 0], + 'c': [0, 0, 1]}, dtype=self.dtype) + assert_frame_equal(get_dummies(s_list, sparse=self.sparse, dtype=self.dtype), expected) + assert_frame_equal(get_dummies(s_series, sparse=self.sparse, dtype=self.dtype), expected) expected.index = list('ABC') assert_frame_equal( - get_dummies(s_series_index, sparse=self.sparse), expected) + get_dummies(s_series_index, sparse=self.sparse, dtype=self.dtype), expected) def test_basic_types(self): # GH 10531 @@ -257,7 +254,7 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype='uint8', + dtype=self.dtype_str, columns=list('abc')) if not self.sparse: compare = tm.assert_frame_equal @@ 
-265,20 +262,19 @@ def test_basic_types(self): expected = expected.to_sparse(fill_value=0, kind='integer') compare = tm.assert_sp_frame_equal - result = get_dummies(s_list, sparse=self.sparse) + result = get_dummies(s_list, sparse=self.sparse, dtype=self.dtype) compare(result, expected) - result = get_dummies(s_series, sparse=self.sparse) + result = get_dummies(s_series, sparse=self.sparse, dtype=self.dtype) compare(result, expected) - result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) + result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns, dtype=self.dtype) tm.assert_series_equal(result.get_dtype_counts(), - Series({'uint8': 8})) + Series({self.dtype_str: 8})) - result = get_dummies(s_df, sparse=self.sparse, columns=['a']) - expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values() - tm.assert_series_equal(result.get_dtype_counts().sort_values(), - expected) + result = get_dummies(s_df, sparse=self.sparse, columns=['a'], dtype=self.dtype) + expected = Series({self.dtype_str: 3, 'int64': 1, 'object': 1}).sort_values() + tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) def test_just_na(self): just_na_list = [np.nan] @@ -300,25 +296,25 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] - res = get_dummies(s, sparse=self.sparse) + res = get_dummies(s, sparse=self.sparse, dtype=self.dtype) exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) + 'b': {0: 0, 1: 1, 2: 0}}, dtype=self.dtype) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 - res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) + res_na = get_dummies(s, dummy_na=True, sparse=self.sparse, dtype=self.dtype) exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, - dtype=np.uint8) + dtype=self.dtype) exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) 
exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, dtype=self.dtype) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=np.uint8) + dtype=self.dtype) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self @@ -327,35 +323,35 @@ def test_unicode(self e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', sparse=self.sparse) + res = get_dummies(s, prefix='letter', sparse=self.sparse, dtype=self.dtype) exp = DataFrame({'letter_e': {0: 1, 1: 0, 2: 0}, u('letter_%s') % eacute: {0: 0, 1: 1, 2: 1}}, - dtype=np.uint8) + dtype=self.dtype) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] - result = get_dummies(df, sparse=self.sparse) + result = get_dummies(df, sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, dtype=self.dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df - result = get_dummies(df, sparse=self.sparse) + result = get_dummies(df, sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}) + 'B_c': [0, 0, 1]}, dtype=self.dtype) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(self.dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -363,62 +359,61 @@ def test_dataframe_dummies_prefix_list(self): prefixes = ['from_A', 'from_B'] df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes, 
sparse=self.sparse) + 'C': [1, 2, 3]}, dtype=self.dtype) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}) + 'from_B_c': [0, 0, 1]}, dtype=self.dtype) cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', - 'from_B_c']] + expected[cols] = expected[cols].astype(self.dtype) + expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_str(self): # not that you should do this... df = self.df - result = get_dummies(df, prefix='bad', sparse=self.sparse) + result = get_dummies(df, prefix='bad', sparse=self.sparse, dtype=self.dtype) expected = DataFrame([[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=np.uint8) - expected = expected.astype({"C": np.int64}) + dtype=self.dtype) + expected = expected.astype({"C": self.dtype}) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self): df = self.df result = get_dummies(df, prefix=['from_A'], columns=['A'], - sparse=self.sparse) + sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + 'C': [1, 2, 3]}, dtype=self.dtype) cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(self.dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): df = self.df - result = get_dummies(df, prefix_sep='..', sparse=self.sparse) + result = get_dummies(df, prefix_sep='..', sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1, 0, 1], 'A..b': [0, 1, 0], 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}) + 'B..c': [0, 0, 
1]}, dtype=self.dtype) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(self.dtype) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) + result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse, dtype=self.dtype) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep={'A': '..', - 'B': '__'}, sparse=self.sparse) + 'B': '__'}, sparse=self.sparse, dtype=self.dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self): @@ -433,51 +428,46 @@ def test_dataframe_dummies_prefix_dict(self): prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes, sparse=self.sparse) + 'C': [1, 2, 3]}, dtype=self.dtype) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1], - 'C': [1, 2, 3]}) - cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[cols] = expected[cols].astype(np.uint8) + 'C': [1, 2, 3]}, dtype=self.dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): df = self.df df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, sparse=self.sparse) + result = get_dummies(df, dummy_na=True, sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1, 0, 1, 0], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_b': [1, 1, 0, 0], 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}) - cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'A_a', 'A_b', 
'A_nan', - 'B_b', 'B_c', 'B_nan']] + 'B_nan': [0, 0, 0, 1]}, dtype=self.dtype) + expected[['C']] = expected[['C']].astype(np.float64) + expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, sparse=self.sparse) + result = get_dummies(df, dummy_na=False, sparse=self.sparse, dtype=self.dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, sparse=self.sparse) + result = get_dummies(df, sparse=self.sparse, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], 'B_c': [0, 0, 1], 'cat_x': [1, 0, 0], - 'cat_y': [0, 1, 1]}) + 'cat_y': [0, 1, 1]}, dtype=self.dtype) cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -489,22 +479,18 @@ def test_basic_drop_first(self): s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'b': {0: 0, - 1: 1, - 2: 0}, - 'c': {0: 0, - 1: 0, - 2: 1}}, dtype=np.uint8) + expected = DataFrame({'b': [0, 1, 0], + 'c': [0, 0, 1]}, dtype=self.dtype) - result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + result = get_dummies(s_list, sparse=self.sparse, drop_first=True, dtype=self.dtype) assert_frame_equal(result, expected) - result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + result = get_dummies(s_series, sparse=self.sparse, drop_first=True, dtype=self.dtype) assert_frame_equal(result, expected) expected.index = list('ABC') result = get_dummies(s_series_index, sparse=self.sparse, - drop_first=True) + drop_first=True, dtype=self.dtype) assert_frame_equal(result, expected) def 
test_basic_drop_first_one_level(self): @@ -529,45 +515,39 @@ def test_basic_drop_first_one_level(self): def test_basic_drop_first_NA(self): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] - res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) - exp = DataFrame({'b': {0: 0, - 1: 1, - 2: 0}}, dtype=np.uint8) + res = get_dummies(s_NA, sparse=self.sparse, drop_first=True, dtype=self.dtype) + exp = DataFrame({'b': [0, 1, 0]}, dtype=self.dtype) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, - drop_first=True) - exp_na = DataFrame({'b': {0: 0, - 1: 1, - 2: 0}, - nan: {0: 0, - 1: 0, - 2: 1}}, dtype=np.uint8).reindex( - ['b', nan], axis=1) + drop_first=True, dtype=self.dtype) + exp_na = DataFrame({'b': [0, 1, 0], + nan: [0, 0, 1]}, + dtype=self.dtype).reindex(['b', nan], axis=1) assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, - drop_first=True) + drop_first=True, dtype=self.dtype) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] - result = get_dummies(df, sparse=self.sparse, drop_first=True) + result = get_dummies(df, sparse=self.sparse, drop_first=True, dtype=self.dtype) expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, dtype=self.dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, sparse=self.sparse, drop_first=True) + result = get_dummies(df, sparse=self.sparse, drop_first=True, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'A_b': [0, 1, 0], 'B_c': [0, 0, 1], - 'cat_y': [0, 1, 1]}) + 'cat_y': [0, 1, 1]}, dtype=self.dtype) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = 
expected[cols].astype(self.dtype) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) @@ -575,35 +555,41 @@ def test_dataframe_dummies_drop_first_with_na(self): df = self.df df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse, - drop_first=True) + drop_first=True, dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}) + 'B_nan': [0, 0, 0, 1]}, dtype=self.dtype) + expected[['C']] = expected[['C']].astype(np.float64) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(self.dtype) expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, sparse=self.sparse, - drop_first=True) + drop_first=True, dtype=self.dtype) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) def test_int_int(self): data = Series([1, 2, 1]) - result = pd.get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=np.uint8) + result = pd.get_dummies(data, dtype=self.dtype) + expected = DataFrame([[1, 0], + [0, 1], + [1, 0]], + columns=[1, 2], + dtype=self.dtype) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) - result = pd.get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], + result = pd.get_dummies(data, dtype=self.dtype) + expected = DataFrame([[1, 0], + [0, 1], + [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=np.uint8) + dtype=self.dtype) tm.assert_frame_equal(result, expected) def test_int_df(self): @@ -620,8 +606,8 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) - result = pd.get_dummies(data, columns=['A', 'B']) + expected[columns[2:]] = 
expected[columns[2:]].astype(self.dtype) + result = pd.get_dummies(data, columns=['A', 'B'], dtype=self.dtype) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_preserve_categorical_dtype(self): @@ -629,13 +615,13 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): for ordered in [False, True]: cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) - result = get_dummies(cat) + result = get_dummies(cat, dtype=self.dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.dtype) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) - expected = DataFrame(data, columns=cols) + expected = DataFrame(data, columns=cols, dtype=self.dtype) tm.assert_frame_equal(result, expected) @@ -647,6 +633,30 @@ class TestGetDummiesSparse(TestGetDummies): def test_include_na(self): super(TestGetDummiesSparse, self).test_include_na() +class TestGetDummiesDtypeMixin(object): + dtype_str = 'float64' + + @pytest.mark.skip(reason='no element types to test') + def test_just_na(self): + pass + + @pytest.mark.skip(reason='no internal elements to assert type') + def test_dataframe_dummies_prefix_bad_length(self): + pass + + @pytest.mark.skip(reason='no internal elements to assert type') + def test_dataframe_dummies_prefix_sep_bad_length(self): + pass + + @pytest.mark.skip(reason='no internal elements to assert type') + def test_basic_drop_first_one_level(self): + pass + +class TestGetDummiesDtypeFloat(TestGetDummiesDtypeMixin, TestGetDummies): + pass + +class TestGetDummiesSparseDtypeFloat(TestGetDummiesDtypeMixin, TestGetDummiesSparse): + pass class TestMakeAxisDummies(object): From f84f83e03810eaea9e92d2c6b6496225123e0cd5 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Wed, 15 Nov 2017 22:02:44 +0100 Subject: [PATCH 02/24] ENH: add dtype argument to get_dummies --- pandas/core/reshape/reshape.py | 27 +++++++++++++++++++-------- 1 file changed, 19 
insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8b656d8ba25e9..0a266affcbbc6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -697,7 +697,7 @@ def _convert_level_number(level_num, columns): def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False, drop_first=False): + columns=None, sparse=False, drop_first=False, dtype=None): """ Convert categorical variable into dummy/indicator variables @@ -725,6 +725,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. + dtype : dtype, default np.uint8 + Data type to force on a new columns. Only a single dtype is allowed. .. versionadded:: 0.18.0 @@ -783,6 +785,12 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 3 0 0 4 0 0 + >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + See Also -------- Series.str.get_dummies @@ -790,6 +798,9 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.core.reshape.concat import concat from itertools import cycle + if dtype is None: + dtype = np.uint8 + if isinstance(data, DataFrame): # determine columns being encoded @@ -835,17 +846,17 @@ def check_len(item, name): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, - drop_first=drop_first) + drop_first=drop_first, dtype=dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, drop_first=drop_first) + sparse=sparse, drop_first=drop_first, dtype=dtype) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False): + sparse=False, drop_first=False, dtype=np.uint8): # Series 
avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -903,18 +914,18 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), + sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, - dtype=np.uint8) + dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, - dtype=np.uint8) + dtype=dtype) return out else: - dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) + dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 From 2737069aa7ea27838a01e512239dd49a1100e87c Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Thu, 16 Nov 2017 21:24:41 +0100 Subject: [PATCH 03/24] CLN: clean up lint errors --- pandas/core/reshape/reshape.py | 4 +- pandas/tests/reshape/test_reshape.py | 113 ++++++++++++++++++++------- 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0a266affcbbc6..f40e1f25a24e3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -851,7 +851,9 @@ def check_len(item, name): result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, drop_first=drop_first, dtype=dtype) + sparse=sparse, + drop_first=drop_first, + dtype=dtype) return result diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 366c70d088bc1..461a99447d153 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -236,12 +236,17 @@ def test_basic(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=self.dtype) - 
assert_frame_equal(get_dummies(s_list, sparse=self.sparse, dtype=self.dtype), expected) - assert_frame_equal(get_dummies(s_series, sparse=self.sparse, dtype=self.dtype), expected) + result = get_dummies(s_list, sparse=self.sparse, dtype=self.dtype) + assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=self.sparse, dtype=self.dtype) + assert_frame_equal(result, expected) expected.index = list('ABC') - assert_frame_equal( - get_dummies(s_series_index, sparse=self.sparse, dtype=self.dtype), expected) + result = get_dummies(s_series_index, + sparse=self.sparse, + dtype=self.dtype) + assert_frame_equal(result, expected) def test_basic_types(self): # GH 10531 @@ -268,13 +273,22 @@ def test_basic_types(self): result = get_dummies(s_series, sparse=self.sparse, dtype=self.dtype) compare(result, expected) - result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns, dtype=self.dtype) + result = get_dummies(s_df, + sparse=self.sparse, + columns=s_df.columns, + dtype=self.dtype) tm.assert_series_equal(result.get_dtype_counts(), Series({self.dtype_str: 8})) - result = get_dummies(s_df, sparse=self.sparse, columns=['a'], dtype=self.dtype) - expected = Series({self.dtype_str: 3, 'int64': 1, 'object': 1}).sort_values() - tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) + result = get_dummies(s_df, + sparse=self.sparse, + columns=['a'], + dtype=self.dtype) + expected = Series({self.dtype_str: 3, + 'int64': 1, + 'object': 1}).sort_values() + tm.assert_series_equal(result.get_dtype_counts().sort_values(), + expected) def test_just_na(self): just_na_list = [np.nan] @@ -302,7 +316,10 @@ def test_include_na(self): assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 - res_na = get_dummies(s, dummy_na=True, sparse=self.sparse, dtype=self.dtype) + res_na = get_dummies(s, + dummy_na=True, + sparse=self.sparse, + dtype=self.dtype) exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 
1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, @@ -312,7 +329,10 @@ def test_include_na(self): exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, dtype=self.dtype) + res_just_na = get_dummies([nan], + dummy_na=True, + sparse=self.sparse, + dtype=self.dtype) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], dtype=self.dtype) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) @@ -323,7 +343,10 @@ def test_unicode(self e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', sparse=self.sparse, dtype=self.dtype) + res = get_dummies(s, + prefix='letter', + sparse=self.sparse, + dtype=self.dtype) exp = DataFrame({'letter_e': {0: 1, 1: 0, 2: 0}, @@ -360,7 +383,10 @@ def test_dataframe_dummies_prefix_list(self): df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}, dtype=self.dtype) - result = get_dummies(df, prefix=prefixes, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, + prefix=prefixes, + sparse=self.sparse, + dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], @@ -368,13 +394,15 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_c': [0, 0, 1]}, dtype=self.dtype) cols = expected.columns[1:] expected[cols] = expected[cols].astype(self.dtype) - expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] + expected = expected[['C', 'from_A_a', 'from_A_b', + 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_str(self): # not that you should do this... 
df = self.df - result = get_dummies(df, prefix='bad', sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, prefix='bad', sparse=self.sparse, + dtype=self.dtype) expected = DataFrame([[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], @@ -397,7 +425,8 @@ def test_dataframe_dummies_subset(self): def test_dataframe_dummies_prefix_sep(self): df = self.df - result = get_dummies(df, prefix_sep='..', sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, prefix_sep='..', sparse=self.sparse, + dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1, 0, 1], 'A..b': [0, 1, 0], @@ -408,12 +437,17 @@ def test_dataframe_dummies_prefix_sep(self): expected[cols] = expected[cols].astype(self.dtype) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, + prefix_sep=['..', '__'], + sparse=self.sparse, + dtype=self.dtype) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep={'A': '..', - 'B': '__'}, sparse=self.sparse, dtype=self.dtype) + 'B': '__'}, + sparse=self.sparse, + dtype=self.dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self): @@ -429,7 +463,8 @@ def test_dataframe_dummies_prefix_dict(self): df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}, dtype=self.dtype) - result = get_dummies(df, prefix=prefixes, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse, + dtype=self.dtype) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], @@ -440,7 +475,8 @@ def test_dataframe_dummies_prefix_dict(self): def test_dataframe_dummies_with_na(self): df = self.df df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, 
dummy_na=True, sparse=self.sparse, + dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1, 0, 1, 0], 'A_b': [0, 1, 0, 0], @@ -449,10 +485,12 @@ def test_dataframe_dummies_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}, dtype=self.dtype) expected[['C']] = expected[['C']].astype(np.float64) - expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] + expected = expected[['C', 'A_a', 'A_b', 'A_nan', + 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, dummy_na=False, sparse=self.sparse, + dtype=self.dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -467,7 +505,6 @@ def test_dataframe_dummies_with_categorical(self): 'B_c': [0, 0, 1], 'cat_x': [1, 0, 0], 'cat_y': [0, 1, 1]}, dtype=self.dtype) - cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -482,10 +519,16 @@ def test_basic_drop_first(self): expected = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=self.dtype) - result = get_dummies(s_list, sparse=self.sparse, drop_first=True, dtype=self.dtype) + result = get_dummies(s_list, + sparse=self.sparse, + drop_first=True, + dtype=self.dtype) assert_frame_equal(result, expected) - result = get_dummies(s_series, sparse=self.sparse, drop_first=True, dtype=self.dtype) + result = get_dummies(s_series, + sparse=self.sparse, + drop_first=True, + dtype=self.dtype) assert_frame_equal(result, expected) expected.index = list('ABC') @@ -515,7 +558,10 @@ def test_basic_drop_first_one_level(self): def test_basic_drop_first_NA(self): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] - res = get_dummies(s_NA, sparse=self.sparse, drop_first=True, dtype=self.dtype) + res = get_dummies(s_NA, + sparse=self.sparse, + drop_first=True, + 
dtype=self.dtype) exp = DataFrame({'b': [0, 1, 0]}, dtype=self.dtype) assert_frame_equal(res, exp) @@ -533,7 +579,10 @@ def test_basic_drop_first_NA(self): def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] - result = get_dummies(df, sparse=self.sparse, drop_first=True, dtype=self.dtype) + result = get_dummies(df, + sparse=self.sparse, + drop_first=True, + dtype=self.dtype) expected = DataFrame({'A_b': [0, 1, 0], 'B_c': [0, 0, 1]}, dtype=self.dtype) assert_frame_equal(result, expected) @@ -541,7 +590,10 @@ def test_dataframe_dummies_drop_first(self): def test_dataframe_dummies_drop_first_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, sparse=self.sparse, drop_first=True, dtype=self.dtype) + result = get_dummies(df, + sparse=self.sparse, + drop_first=True, + dtype=self.dtype) expected = DataFrame({'C': [1, 2, 3], 'A_b': [0, 1, 0], 'B_c': [0, 0, 1], @@ -633,6 +685,7 @@ class TestGetDummiesSparse(TestGetDummies): def test_include_na(self): super(TestGetDummiesSparse, self).test_include_na() + class TestGetDummiesDtypeMixin(object): dtype_str = 'float64' @@ -652,12 +705,16 @@ def test_dataframe_dummies_prefix_sep_bad_length(self): def test_basic_drop_first_one_level(self): pass + class TestGetDummiesDtypeFloat(TestGetDummiesDtypeMixin, TestGetDummies): pass -class TestGetDummiesSparseDtypeFloat(TestGetDummiesDtypeMixin, TestGetDummiesSparse): + +class TestGetDummiesSparseDtypeFloat(TestGetDummiesDtypeMixin, + TestGetDummiesSparse): pass + class TestMakeAxisDummies(object): def test_preserve_categorical_dtype(self): From c412dae602e1cfcbcee1ce0b3b9c3fc6edb0e66f Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Thu, 16 Nov 2017 21:33:09 +0100 Subject: [PATCH 04/24] DOC: update whatsnew --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4f403ff8053a7..f8119d8fdff94 100644 
--- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -156,7 +156,7 @@ Sparse Reshaping ^^^^^^^^^ -- +- :func:`get_dummies` now supports ``dtype`` argument - - From b869afe1da9d9dcfb49c75c55340603811e01093 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Fri, 17 Nov 2017 21:58:55 +0100 Subject: [PATCH 05/24] DOC: improve get_dummies dtype documentation --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/reshape/reshape.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index f8119d8fdff94..4f403ff8053a7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -156,7 +156,7 @@ Sparse Reshaping ^^^^^^^^^ -- :func:`get_dummies` now supports ``dtype`` argument +- - - diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f40e1f25a24e3..f35b5b74004d8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -725,10 +725,13 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. + + .. versionadded:: 0.18.0 + dtype : dtype, default np.uint8 Data type to force on a new columns. Only a single dtype is allowed. - .. versionadded:: 0.18.0 + .. versionadded:: 0.22.0 Returns ------- From 7038b3102db118e2e8d5ac89ce0788b1164079aa Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sat, 18 Nov 2017 21:31:48 +0100 Subject: [PATCH 06/24] TST: change get_dummies test setup Use pytest fixtures. Add test for dtype=None. 
--- pandas/tests/reshape/test_reshape.py | 386 ++++++++++++--------------- 1 file changed, 168 insertions(+), 218 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 461a99447d153..77949c4535424 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -217,39 +217,47 @@ def test_multiindex(self): class TestGetDummies(object): - dtype_str = 'uint8' - sparse = False - def setup_class(cls): - cls.dtype = np.dtype(cls.dtype_str) + @pytest.fixture + def df(self, dtype): + return DataFrame({'A': ['a', 'b', 'a'], + 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}, dtype=dtype) - def setup_method(self, method): - self.df = DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=self.dtype) + @pytest.fixture(params=['uint8', 'float64']) + def dtype(self, request): + return np.dtype(request.param) + + @pytest.fixture(params=['dense', 'sparse']) + def sparse(self, request): + # params are strings to simplify reading test results, + # e.g. 
TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True] + return request.param == 'sparse' + + def test_basic(self, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} - def test_basic(self): s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], - 'c': [0, 0, 1]}, dtype=self.dtype) - result = get_dummies(s_list, sparse=self.sparse, dtype=self.dtype) + 'c': [0, 0, 1]}, dtype=dtype) + result = get_dummies(s_list, **kwargs) assert_frame_equal(result, expected) - result = get_dummies(s_series, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(s_series, **kwargs) assert_frame_equal(result, expected) expected.index = list('ABC') - result = get_dummies(s_series_index, - sparse=self.sparse, - dtype=self.dtype) + result = get_dummies(s_series_index, **kwargs) assert_frame_equal(result, expected) - def test_basic_types(self): + def test_basic_types(self, sparse, dtype): # GH 10531 + kwargs = {'sparse': sparse, 'dtype': dtype} + s_list = list('abc') s_series = Series(s_list) s_df = DataFrame({'a': [0, 1, 0, 1, 2], @@ -259,46 +267,54 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype=self.dtype_str, + dtype=dtype, columns=list('abc')) - if not self.sparse: + if not sparse: compare = tm.assert_frame_equal else: expected = expected.to_sparse(fill_value=0, kind='integer') compare = tm.assert_sp_frame_equal - result = get_dummies(s_list, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(s_list, **kwargs) compare(result, expected) - result = get_dummies(s_series, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(s_series, **kwargs) compare(result, expected) - result = get_dummies(s_df, - sparse=self.sparse, - columns=s_df.columns, - dtype=self.dtype) + result = get_dummies(s_df, columns=s_df.columns, **kwargs) tm.assert_series_equal(result.get_dtype_counts(), - 
Series({self.dtype_str: 8})) + Series({dtype.name: 8})) - result = get_dummies(s_df, - sparse=self.sparse, - columns=['a'], - dtype=self.dtype) - expected = Series({self.dtype_str: 3, + result = get_dummies(s_df, columns=['a'], **kwargs) + expected = Series({dtype.name: 3, 'int64': 1, 'object': 1}).sort_values() tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) - def test_just_na(self): + def test_dtype_none(self, df, sparse, dtype): + expected = DataFrame({'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1], + 'C': [1, 2, 3]}, + # dummy dimension have type uint8 by default + dtype=np.uint8).sort_index(axis=1) + + # column C must retain it's type + expected[['C']] = expected[['C']].astype(dtype) + + result = get_dummies(df, sparse=sparse, dtype=None).sort_index(axis=1) + assert_frame_equal(result, expected) + + def test_just_na(self, sparse): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=['A']) - res_list = get_dummies(just_na_list, sparse=self.sparse) - res_series = get_dummies(just_na_series, sparse=self.sparse) - res_series_index = get_dummies(just_na_series_index, - sparse=self.sparse) + res_list = get_dummies(just_na_list, sparse=sparse) + res_series = get_dummies(just_na_series, sparse=sparse) + res_series_index = get_dummies(just_na_series_index, sparse=sparse) assert res_list.empty assert res_series.empty @@ -308,343 +324,314 @@ def test_just_na(self): assert res_series.index.tolist() == [0] assert res_series_index.index.tolist() == ['A'] - def test_include_na(self): + def test_include_na(self, sparse, dtype): + if sparse: + pytest.xfail(reason='nan in index is problematic (GH 16894)') + + kwargs = {'sparse': sparse, 'dtype': dtype} s = ['a', 'b', np.nan] - res = get_dummies(s, sparse=self.sparse, dtype=self.dtype) + res = get_dummies(s, **kwargs) exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=self.dtype) + 'b': {0: 
0, 1: 1, 2: 0}}, dtype=dtype) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 - res_na = get_dummies(s, - dummy_na=True, - sparse=self.sparse, - dtype=self.dtype) + res_na = get_dummies(s, dummy_na=True, **kwargs) exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, - dtype=self.dtype) + dtype=dtype) exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], - dummy_na=True, - sparse=self.sparse, - dtype=self.dtype) + res_just_na = get_dummies([nan], dummy_na=True, **kwargs) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=self.dtype) + dtype=dtype) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - def test_unicode(self - ): # See GH 6885 - get_dummies chokes on unicode values + def test_unicode(self, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} + # See GH 6885 - get_dummies chokes on unicode values import unicodedata e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, - prefix='letter', - sparse=self.sparse, - dtype=self.dtype) + res = get_dummies(s, prefix='letter', **kwargs) exp = DataFrame({'letter_e': {0: 1, 1: 0, 2: 0}, u('letter_%s') % eacute: {0: 0, 1: 1, 2: 1}}, - dtype=self.dtype) + dtype=dtype) assert_frame_equal(res, exp) - def test_dataframe_dummies_all_obj(self): - df = self.df[['A', 'B']] - result = get_dummies(df, sparse=self.sparse, dtype=self.dtype) + def test_dataframe_dummies_all_obj(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} + df = df[['A', 'B']] + result = get_dummies(df, **kwargs) expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=self.dtype) + 'B_c': [0, 0, 1]}, dtype=dtype) assert_frame_equal(result, expected) - def 
test_dataframe_dummies_mix_default(self): - df = self.df - result = get_dummies(df, sparse=self.sparse, dtype=self.dtype) + def test_dataframe_dummies_mix_default(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} + result = get_dummies(df, **kwargs) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=self.dtype) + 'B_c': [0, 0, 1]}, dtype=dtype) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(self.dtype) + expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_list(self): + def test_dataframe_dummies_prefix_list(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} prefixes = ['from_A', 'from_B'] df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=self.dtype) - result = get_dummies(df, - prefix=prefixes, - sparse=self.sparse, - dtype=self.dtype) + 'C': [1, 2, 3]}, dtype=dtype) + result = get_dummies(df, prefix=prefixes, **kwargs) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}, dtype=self.dtype) + 'from_B_c': [0, 0, 1]}, dtype=dtype) cols = expected.columns[1:] - expected[cols] = expected[cols].astype(self.dtype) + expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_str(self): + def test_dataframe_dummies_prefix_str(self, df, sparse, dtype): # not that you should do this... 
- df = self.df - result = get_dummies(df, prefix='bad', sparse=self.sparse, - dtype=self.dtype) + kwargs = {'sparse': sparse, 'dtype': dtype} + result = get_dummies(df, prefix='bad', **kwargs) expected = DataFrame([[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=self.dtype) - expected = expected.astype({"C": self.dtype}) + dtype=dtype) + expected = expected.astype({"C": dtype}) assert_frame_equal(result, expected) - def test_dataframe_dummies_subset(self): - df = self.df - result = get_dummies(df, prefix=['from_A'], columns=['A'], - sparse=self.sparse, dtype=self.dtype) + def test_dataframe_dummies_subset(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} + result = get_dummies(df, prefix=['from_A'], columns=['A'], **kwargs) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=self.dtype) + 'C': [1, 2, 3]}, dtype=dtype) cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(self.dtype) + expected[cols] = expected[cols].astype(dtype) assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_sep(self): - df = self.df - result = get_dummies(df, prefix_sep='..', sparse=self.sparse, - dtype=self.dtype) + def test_dataframe_dummies_prefix_sep(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} + result = get_dummies(df, prefix_sep='..', **kwargs) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1, 0, 1], 'A..b': [0, 1, 0], 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}, dtype=self.dtype) + 'B..c': [0, 0, 1]}, dtype=dtype) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(self.dtype) + expected[cols] = expected[cols].astype(dtype) assert_frame_equal(result, expected) - result = get_dummies(df, - prefix_sep=['..', '__'], - sparse=self.sparse, - dtype=self.dtype) + result = get_dummies(df, prefix_sep=['..', 
'__'], **kwargs) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', - 'B': '__'}, - sparse=self.sparse, - dtype=self.dtype) + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, **kwargs) assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_bad_length(self): + def test_dataframe_dummies_prefix_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(self.df, prefix=['too few'], sparse=self.sparse) + get_dummies(df, prefix=['too few'], sparse=sparse) - def test_dataframe_dummies_prefix_sep_bad_length(self): + def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse) + get_dummies(df, prefix_sep=['bad'], sparse=sparse) - def test_dataframe_dummies_prefix_dict(self): + def test_dataframe_dummies_prefix_dict(self, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=self.dtype) - result = get_dummies(df, prefix=prefixes, sparse=self.sparse, - dtype=self.dtype) + 'C': [1, 2, 3]}, dtype=dtype) + result = get_dummies(df, prefix=prefixes, **kwargs) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1], - 'C': [1, 2, 3]}, dtype=self.dtype) + 'C': [1, 2, 3]}, dtype=dtype) assert_frame_equal(result, expected) - def test_dataframe_dummies_with_na(self): - df = self.df + def test_dataframe_dummies_with_na(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, sparse=self.sparse, - dtype=self.dtype) + result = get_dummies(df, dummy_na=True, **kwargs) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1, 0, 1, 0], 'A_b': [0, 1, 0, 0], 
'A_nan': [0, 0, 0, 1], 'B_b': [1, 1, 0, 0], 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}, dtype=self.dtype) + 'B_nan': [0, 0, 0, 1]}, dtype=dtype) expected[['C']] = expected[['C']].astype(np.float64) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, sparse=self.sparse, - dtype=self.dtype) + result = get_dummies(df, dummy_na=False, **kwargs) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_with_categorical(self): - df = self.df + def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, sparse=self.sparse, dtype=self.dtype) + result = get_dummies(df, **kwargs) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], 'B_c': [0, 0, 1], 'cat_x': [1, 0, 0], - 'cat_y': [0, 1, 1]}, dtype=self.dtype) + 'cat_y': [0, 1, 1]}, dtype=dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) - def test_basic_drop_first(self): + def test_basic_drop_first(self, sparse, dtype): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case + kwargs = {'sparse': sparse, 'dtype': dtype} s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame({'b': [0, 1, 0], - 'c': [0, 0, 1]}, dtype=self.dtype) + 'c': [0, 0, 1]}, dtype=dtype) - result = get_dummies(s_list, - sparse=self.sparse, - drop_first=True, - dtype=self.dtype) + result = get_dummies(s_list, drop_first=True, **kwargs) assert_frame_equal(result, expected) - result = get_dummies(s_series, - sparse=self.sparse, - drop_first=True, - dtype=self.dtype) + result = get_dummies(s_series, drop_first=True, **kwargs) assert_frame_equal(result, expected) expected.index = 
list('ABC') - result = get_dummies(s_series_index, sparse=self.sparse, - drop_first=True, dtype=self.dtype) + result = get_dummies(s_series_index, drop_first=True, **kwargs) assert_frame_equal(result, expected) - def test_basic_drop_first_one_level(self): + def test_basic_drop_first_one_level(self, sparse, dtype): # Test the case that categorical variable only has one level. + kwargs = {'sparse': sparse, 'dtype': dtype} s_list = list('aaa') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame(index=np.arange(3)) - result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + result = get_dummies(s_list, drop_first=True, **kwargs) assert_frame_equal(result, expected) - result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + result = get_dummies(s_series, drop_first=True, **kwargs) assert_frame_equal(result, expected) expected = DataFrame(index=list('ABC')) - result = get_dummies(s_series_index, sparse=self.sparse, - drop_first=True) + result = get_dummies(s_series_index, drop_first=True, **kwargs) assert_frame_equal(result, expected) - def test_basic_drop_first_NA(self): + def test_basic_drop_first_NA(self, sparse, dtype): # Test NA hadling together with drop_first + kwargs = {'sparse': sparse, 'dtype': dtype} s_NA = ['a', 'b', np.nan] - res = get_dummies(s_NA, - sparse=self.sparse, - drop_first=True, - dtype=self.dtype) - exp = DataFrame({'b': [0, 1, 0]}, dtype=self.dtype) + res = get_dummies(s_NA, drop_first=True, **kwargs) + exp = DataFrame({'b': [0, 1, 0]}, dtype=dtype) assert_frame_equal(res, exp) - res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, - drop_first=True, dtype=self.dtype) + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, **kwargs) exp_na = DataFrame({'b': [0, 1, 0], nan: [0, 0, 1]}, - dtype=self.dtype).reindex(['b', nan], axis=1) + dtype=dtype).reindex(['b', nan], axis=1) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, 
sparse=self.sparse, - drop_first=True, dtype=self.dtype) + res_just_na = get_dummies([nan], + dummy_na=True, + drop_first=True, + **kwargs) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) - def test_dataframe_dummies_drop_first(self): - df = self.df[['A', 'B']] - result = get_dummies(df, - sparse=self.sparse, - drop_first=True, - dtype=self.dtype) + def test_dataframe_dummies_drop_first(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} + df = df[['A', 'B']] + result = get_dummies(df, drop_first=True, **kwargs) expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=self.dtype) + 'B_c': [0, 0, 1]}, dtype=dtype) assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_categorical(self): - df = self.df + def test_dataframe_dummies_drop_first_with_categorical( + self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, - sparse=self.sparse, - drop_first=True, - dtype=self.dtype) + result = get_dummies(df, drop_first=True, **kwargs) expected = DataFrame({'C': [1, 2, 3], 'A_b': [0, 1, 0], 'B_c': [0, 0, 1], - 'cat_y': [0, 1, 1]}, dtype=self.dtype) + 'cat_y': [0, 1, 1]}, dtype=dtype) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(self.dtype) + expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_na(self): - df = self.df + def test_dataframe_dummies_drop_first_with_na(self, df, sparse, dtype): + kwargs = {'sparse': sparse, 'dtype': dtype} df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, sparse=self.sparse, - drop_first=True, dtype=self.dtype) + result = get_dummies(df, dummy_na=True, drop_first=True, **kwargs) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_c': [0, 0, 
1, 0], - 'B_nan': [0, 0, 0, 1]}, dtype=self.dtype) + 'B_nan': [0, 0, 0, 1]}, dtype=dtype) expected[['C']] = expected[['C']].astype(np.float64) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(self.dtype) + expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, sparse=self.sparse, - drop_first=True, dtype=self.dtype) + result = get_dummies(df, dummy_na=False, drop_first=True, **kwargs) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) - def test_int_int(self): + def test_int_int(self, dtype): data = Series([1, 2, 1]) - result = pd.get_dummies(data, dtype=self.dtype) + result = pd.get_dummies(data, dtype=dtype) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=self.dtype) + dtype=dtype) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) - result = pd.get_dummies(data, dtype=self.dtype) + result = pd.get_dummies(data, dtype=dtype) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=self.dtype) + dtype=dtype) tm.assert_frame_equal(result, expected) - def test_int_df(self): + def test_int_df(self, dtype): data = DataFrame( {'A': [1, 2, 1], 'B': pd.Categorical(['a', 'b', 'a']), @@ -658,63 +645,26 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(self.dtype) - result = pd.get_dummies(data, columns=['A', 'B'], dtype=self.dtype) + expected[columns[2:]] = expected[columns[2:]].astype(dtype) + result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype) tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_preserve_categorical_dtype(self): + def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): # GH13854 for ordered in [False, True]: cat = pd.Categorical(list("xy"), 
categories=list("xyz"), ordered=ordered) - result = get_dummies(cat, dtype=self.dtype) + result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.dtype) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=dtype) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) - expected = DataFrame(data, columns=cols, dtype=self.dtype) + expected = DataFrame(data, columns=cols, dtype=dtype) tm.assert_frame_equal(result, expected) -class TestGetDummiesSparse(TestGetDummies): - sparse = True - - @pytest.mark.xfail(reason='nan in index is problematic (GH 16894)') - def test_include_na(self): - super(TestGetDummiesSparse, self).test_include_na() - - -class TestGetDummiesDtypeMixin(object): - dtype_str = 'float64' - - @pytest.mark.skip(reason='no element types to test') - def test_just_na(self): - pass - - @pytest.mark.skip(reason='no internal elements to assert type') - def test_dataframe_dummies_prefix_bad_length(self): - pass - - @pytest.mark.skip(reason='no internal elements to assert type') - def test_dataframe_dummies_prefix_sep_bad_length(self): - pass - - @pytest.mark.skip(reason='no internal elements to assert type') - def test_basic_drop_first_one_level(self): - pass - - -class TestGetDummiesDtypeFloat(TestGetDummiesDtypeMixin, TestGetDummies): - pass - - -class TestGetDummiesSparseDtypeFloat(TestGetDummiesDtypeMixin, - TestGetDummiesSparse): - pass - - class TestMakeAxisDummies(object): def test_preserve_categorical_dtype(self): From c412be0fa186e01d2483d2e1be5bc674d28c49ef Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 20:35:11 +0100 Subject: [PATCH 07/24] DOC: more info for dtype argument of get_dummies in whatsnew --- doc/source/whatsnew/v0.22.0.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4f403ff8053a7..f42891bccd10b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ 
b/doc/source/whatsnew/v0.22.0.txt @@ -30,6 +30,27 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) +``get_dummies`` now supports ``dtype`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`get_dummies` function now accepts ``dtype`` argument, which forces specific dtype for new columns. When ``dtype`` is not specified or equals to ``None``, new columns will have dtype ``uint8`` (as before), so this change is backwards compatible. (:issue:`18330`) + +**Previous behavior**: + +.. ipython:: python + + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + pd.get_dummies(df, columns=['c']) + +**New behavior**: + +.. ipython:: python + + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + pd.get_dummies(df, columns=['c']) + pd.get_dummies(df, columns=['c'], dtype=bool) + pd.get_dummies(df, columns=['c'], dtype=np.float64) + .. 
_whatsnew_0220.api_breaking: Backwards incompatible API changes From 769b3b65d1419c7776f0c8ea85a15aab420f237e Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 20:58:20 +0100 Subject: [PATCH 08/24] ENH: raise TypeError for object dtype on get_dummies --- pandas/core/reshape/reshape.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f35b5b74004d8..651fd3528bd06 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -804,6 +804,9 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, if dtype is None: dtype = np.uint8 + if np.dtype(dtype) is np.dtype('O'): + raise TypeError("'object' is not a valid type for get_dummies") + if isinstance(data, DataFrame): # determine columns being encoded From 20556f20662da851cc0b5aff3ad14a2341241765 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 21:00:31 +0100 Subject: [PATCH 09/24] TST: better tests for get_dummies dtype --- pandas/tests/reshape/test_reshape.py | 262 +++++++++++++-------------- 1 file changed, 129 insertions(+), 133 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 77949c4535424..31479255eceb5 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -219,12 +219,12 @@ def test_multiindex(self): class TestGetDummies(object): @pytest.fixture - def df(self, dtype): + def df(self): return DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=dtype) + 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) - @pytest.fixture(params=['uint8', 'float64']) + @pytest.fixture(params=['uint8', 'int64', np.float64, bool, None]) def dtype(self, request): return np.dtype(request.param) @@ -234,30 +234,36 @@ def sparse(self, request): # e.g. 
TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True] return request.param == 'sparse' - def test_basic(self, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} + def effective_dtype(self, dtype): + if dtype is None: + return np.uint8 + return dtype + + def test_throws_on_dtype_object(self, df): + with pytest.raises(TypeError): + get_dummies(df, dtype='object') + def test_basic(self, sparse, dtype): s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], - 'c': [0, 0, 1]}, dtype=dtype) - result = get_dummies(s_list, **kwargs) + 'c': [0, 0, 1]}, + dtype=self.effective_dtype(dtype)) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) - result = get_dummies(s_series, **kwargs) + result = get_dummies(s_series, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) expected.index = list('ABC') - result = get_dummies(s_series_index, **kwargs) + result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) def test_basic_types(self, sparse, dtype): # GH 10531 - kwargs = {'sparse': sparse, 'dtype': dtype} - s_list = list('abc') s_series = Series(s_list) s_df = DataFrame({'a': [0, 1, 0, 1, 2], @@ -267,7 +273,7 @@ def test_basic_types(self, sparse, dtype): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype=dtype, + dtype=self.effective_dtype(dtype), columns=list('abc')) if not sparse: compare = tm.assert_frame_equal @@ -275,37 +281,26 @@ def test_basic_types(self, sparse, dtype): expected = expected.to_sparse(fill_value=0, kind='integer') compare = tm.assert_sp_frame_equal - result = get_dummies(s_list, **kwargs) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) compare(result, expected) - result = get_dummies(s_series, **kwargs) + result = get_dummies(s_series, sparse=sparse, dtype=dtype) compare(result, expected) - result = 
get_dummies(s_df, columns=s_df.columns, **kwargs) + result = get_dummies(s_df, columns=s_df.columns, + sparse=sparse, dtype=dtype) tm.assert_series_equal(result.get_dtype_counts(), Series({dtype.name: 8})) - result = get_dummies(s_df, columns=['a'], **kwargs) - expected = Series({dtype.name: 3, - 'int64': 1, - 'object': 1}).sort_values() - tm.assert_series_equal(result.get_dtype_counts().sort_values(), - expected) - - def test_dtype_none(self, df, sparse, dtype): - expected = DataFrame({'A_a': [1, 0, 1], - 'A_b': [0, 1, 0], - 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1], - 'C': [1, 2, 3]}, - # dummy dimension have type uint8 by default - dtype=np.uint8).sort_index(axis=1) + result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype) + dtype_name = self.effective_dtype(dtype).name - # column C must retain it's type - expected[['C']] = expected[['C']].astype(dtype) + expected_counts = { 'int64': 1, 'object': 1 } + expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) - result = get_dummies(df, sparse=sparse, dtype=None).sort_index(axis=1) - assert_frame_equal(result, expected) + expected = Series(expected_counts).sort_values() + tm.assert_series_equal(result.get_dtype_counts().sort_values(), + expected) def test_just_na(self, sparse): just_na_list = [np.nan] @@ -328,81 +323,72 @@ def test_include_na(self, sparse, dtype): if sparse: pytest.xfail(reason='nan in index is problematic (GH 16894)') - kwargs = {'sparse': sparse, 'dtype': dtype} s = ['a', 'b', np.nan] - res = get_dummies(s, **kwargs) - exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=dtype) + res = get_dummies(s, sparse=sparse, dtype=dtype) + exp = DataFrame({'a': [1, 0, 0], + 'b': [0, 1, 0]}, + dtype=self.effective_dtype(dtype)) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 - res_na = get_dummies(s, dummy_na=True, **kwargs) - exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, - 'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 
1, 2: 0}}, - dtype=dtype) + res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) + exp_na = DataFrame({nan: [0, 0, 1], + 'a': [1, 0, 0], + 'b': [0, 1, 0]}, + dtype=self.effective_dtype(dtype)) exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, **kwargs) + res_just_na = get_dummies([nan], dummy_na=True, + sparse=sparse, dtype=dtype) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=dtype) + dtype=self.effective_dtype(dtype)) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} # See GH 6885 - get_dummies chokes on unicode values import unicodedata e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', **kwargs) - exp = DataFrame({'letter_e': {0: 1, - 1: 0, - 2: 0}, - u('letter_%s') % eacute: {0: 0, - 1: 1, - 2: 1}}, - dtype=dtype) + res = get_dummies(s, prefix='letter', sparse=sparse, dtype=dtype) + exp = DataFrame({'letter_e': [1, 0, 0], + u('letter_%s') % eacute: [0, 1, 1]}, + dtype=self.effective_dtype(dtype)) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} df = df[['A', 'B']] - result = get_dummies(df, **kwargs) + result = get_dummies(df, sparse=sparse, dtype=dtype) expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=dtype) + 'B_c': [0, 0, 1]}, + dtype=self.effective_dtype(dtype)) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} - result = get_dummies(df, **kwargs) + result = get_dummies(df, sparse=sparse, dtype=dtype) expected = DataFrame({'C': [1, 2, 3], 'A_a': 
[1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=dtype) + 'B_c': [0, 0, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c'] expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_list(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} prefixes = ['from_A', 'from_B'] - df = DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=dtype) - result = get_dummies(df, prefix=prefixes, **kwargs) + result = get_dummies(df, prefix=prefixes, sparse=sparse, dtype=dtype) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}, dtype=dtype) + 'from_B_c': [0, 0, 1]}) cols = expected.columns[1:] expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'from_A_a', 'from_A_b', @@ -411,45 +397,47 @@ def test_dataframe_dummies_prefix_list(self, df, sparse, dtype): def test_dataframe_dummies_prefix_str(self, df, sparse, dtype): # not that you should do this... 
- kwargs = {'sparse': sparse, 'dtype': dtype} - result = get_dummies(df, prefix='bad', **kwargs) + result = get_dummies(df, prefix='bad', sparse=sparse, dtype=dtype) + bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] expected = DataFrame([[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], - columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=dtype) - expected = expected.astype({"C": dtype}) + columns=['C'] + bad_columns, + dtype=self.effective_dtype(dtype)) + expected['C'] = [1,2,3] assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} - result = get_dummies(df, prefix=['from_A'], columns=['A'], **kwargs) + result = get_dummies(df, prefix=['from_A'], columns=['A'], + sparse=sparse, dtype=dtype) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=dtype) + 'C': [1, 2, 3]}) + expected[['from_A_a', 'from_A_b']] = expected[['from_A_a', 'from_A_b']].astype(dtype) cols = ['from_A_a', 'from_A_b'] expected[cols] = expected[cols].astype(dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} - result = get_dummies(df, prefix_sep='..', **kwargs) + result = get_dummies(df, prefix_sep='..', sparse=sparse, dtype=dtype) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1, 0, 1], 'A..b': [0, 1, 0], 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}, dtype=dtype) + 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(dtype) + expected[cols] = expected[cols].astype(self.effective_dtype(dtype)) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], **kwargs) + result = get_dummies(df, prefix_sep=['..', '__'], + sparse=sparse, dtype=dtype) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) 
assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, **kwargs) + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self, df, sparse): @@ -461,154 +449,160 @@ def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): get_dummies(df, prefix_sep=['bad'], sparse=sparse) def test_dataframe_dummies_prefix_dict(self, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=dtype) - result = get_dummies(df, prefix=prefixes, **kwargs) + 'C': [1, 2, 3]}) + result = get_dummies(df, prefix=prefixes, sparse=sparse, dtype=dtype) + expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1], - 'C': [1, 2, 3]}, dtype=dtype) + 'C': [1, 2, 3]}) + columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + expected[columns] = expected[columns].astype(self.effective_dtype(dtype)) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, **kwargs) + result = get_dummies(df, dummy_na=True, + sparse=sparse, dtype=dtype).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1, 0, 1, 0], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_b': [1, 1, 0, 0], 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}, dtype=dtype) - expected[['C']] = expected[['C']].astype(np.float64) - expected = expected[['C', 'A_a', 'A_b', 'A_nan', - 'B_b', 'B_c', 'B_nan']] + 'B_nan': [0, 0, 0, 1]}).sort_index(axis=1) + + e_dtype = self.effective_dtype(dtype) + columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[columns] = expected[columns].astype(e_dtype) 
assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, **kwargs) + result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, **kwargs) + result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], 'B_c': [0, 0, 1], 'cat_x': [1, 0, 0], - 'cat_y': [0, 1, 1]}, dtype=dtype) - expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', - 'cat_x', 'cat_y']] + 'cat_y': [0, 1, 1]}).sort_index(axis=1) + columns = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] + expected[columns] = expected[columns].astype(self.effective_dtype(dtype)) + expected.sort_index(axis=1) assert_frame_equal(result, expected) def test_basic_drop_first(self, sparse, dtype): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case - kwargs = {'sparse': sparse, 'dtype': dtype} s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame({'b': [0, 1, 0], - 'c': [0, 0, 1]}, dtype=dtype) + 'c': [0, 0, 1]}, dtype=self.effective_dtype(dtype)) - result = get_dummies(s_list, drop_first=True, **kwargs) + result = get_dummies(s_list, drop_first=True, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) - result = get_dummies(s_series, drop_first=True, **kwargs) + result = get_dummies(s_series, drop_first=True, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) expected.index = list('ABC') - result = get_dummies(s_series_index, drop_first=True, **kwargs) + result = get_dummies(s_series_index, drop_first=True, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) def 
test_basic_drop_first_one_level(self, sparse, dtype): # Test the case that categorical variable only has one level. - kwargs = {'sparse': sparse, 'dtype': dtype} s_list = list('aaa') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) expected = DataFrame(index=np.arange(3)) - result = get_dummies(s_list, drop_first=True, **kwargs) + result = get_dummies(s_list, drop_first=True, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) - result = get_dummies(s_series, drop_first=True, **kwargs) + result = get_dummies(s_series, drop_first=True, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) expected = DataFrame(index=list('ABC')) - result = get_dummies(s_series_index, drop_first=True, **kwargs) + result = get_dummies(s_series_index, drop_first=True, + sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) def test_basic_drop_first_NA(self, sparse, dtype): # Test NA hadling together with drop_first - kwargs = {'sparse': sparse, 'dtype': dtype} s_NA = ['a', 'b', np.nan] - res = get_dummies(s_NA, drop_first=True, **kwargs) - exp = DataFrame({'b': [0, 1, 0]}, dtype=dtype) + res = get_dummies(s_NA, drop_first=True, sparse=sparse, dtype=dtype) + exp = DataFrame({'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) assert_frame_equal(res, exp) - res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, **kwargs) - exp_na = DataFrame({'b': [0, 1, 0], - nan: [0, 0, 1]}, - dtype=dtype).reindex(['b', nan], axis=1) + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, + sparse=sparse, dtype=dtype) + exp_na = DataFrame( + {'b': [0, 1, 0], + nan: [0, 0, 1]}, + dtype=self.effective_dtype(dtype)).reindex(['b', nan], axis=1) assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, - **kwargs) + sparse=sparse, dtype=dtype) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self, df, sparse, dtype): - 
kwargs = {'sparse': sparse, 'dtype': dtype} df = df[['A', 'B']] - result = get_dummies(df, drop_first=True, **kwargs) + result = get_dummies(df, drop_first=True, sparse=sparse, dtype=dtype) expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=dtype) + 'B_c': [0, 0, 1]}, + dtype=self.effective_dtype(dtype)) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical( self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, drop_first=True, **kwargs) + result = get_dummies(df, drop_first=True, sparse=sparse, dtype=dtype) expected = DataFrame({'C': [1, 2, 3], 'A_b': [0, 1, 0], 'B_c': [0, 0, 1], - 'cat_y': [0, 1, 1]}, dtype=dtype) + 'cat_y': [0, 1, 1]}) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(dtype) + expected[cols] = expected[cols].astype(self.effective_dtype(dtype)) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self, df, sparse, dtype): - kwargs = {'sparse': sparse, 'dtype': dtype} df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, drop_first=True, **kwargs) + result = get_dummies(df, dummy_na=True, drop_first=True, + sparse=sparse, dtype=dtype).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}, dtype=dtype) - expected[['C']] = expected[['C']].astype(np.float64) + 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(dtype) - - expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] + expected[cols] = expected[cols].astype(self.effective_dtype(dtype)) + expected = expected.sort_index(axis=1) assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, drop_first=True, **kwargs) + result = get_dummies(df, 
dummy_na=False, drop_first=True, + sparse=sparse, dtype=dtype) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) @@ -619,7 +613,7 @@ def test_int_int(self, dtype): [0, 1], [1, 0]], columns=[1, 2], - dtype=dtype) + dtype=self.effective_dtype(dtype)) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) @@ -628,7 +622,7 @@ def test_int_int(self, dtype): [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=dtype) + dtype=self.effective_dtype(dtype)) tm.assert_frame_equal(result, expected) def test_int_df(self, dtype): @@ -656,11 +650,13 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): ordered=ordered) result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=dtype) + data = np.array([[1, 0, 0], [0, 1, 0]], + dtype=self.effective_dtype(dtype)) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) - expected = DataFrame(data, columns=cols, dtype=dtype) + expected = DataFrame(data, columns=cols, + dtype=self.effective_dtype(dtype)) tm.assert_frame_equal(result, expected) From b3ec8854933f3aef79615a7c3e79bf8e067293d5 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 21:08:48 +0100 Subject: [PATCH 10/24] CLN: cleanup reshape test style --- pandas/tests/reshape/test_reshape.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 31479255eceb5..15c61b535fe58 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -221,8 +221,8 @@ class TestGetDummies(object): @pytest.fixture def df(self): return DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) @pytest.fixture(params=['uint8', 'int64', np.float64, bool, None]) def dtype(self, request): @@ -295,7 +295,7 @@ def 
test_basic_types(self, sparse, dtype): result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype) dtype_name = self.effective_dtype(dtype).name - expected_counts = { 'int64': 1, 'object': 1 } + expected_counts = {'int64': 1, 'object': 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) expected = Series(expected_counts).sort_values() @@ -404,7 +404,7 @@ def test_dataframe_dummies_prefix_str(self, df, sparse, dtype): [3, 1, 0, 0, 1]], columns=['C'] + bad_columns, dtype=self.effective_dtype(dtype)) - expected['C'] = [1,2,3] + expected['C'] = [1, 2, 3] assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse, dtype): @@ -414,9 +414,8 @@ def test_dataframe_dummies_subset(self, df, sparse, dtype): 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - expected[['from_A_a', 'from_A_b']] = expected[['from_A_a', 'from_A_b']].astype(dtype) - cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(dtype) + columns = ['from_A_a', 'from_A_b'] + expected[columns] = expected[columns].astype(dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse, dtype): @@ -460,8 +459,10 @@ def test_dataframe_dummies_prefix_dict(self, sparse, dtype): 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) + columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[columns] = expected[columns].astype(self.effective_dtype(dtype)) + effective_dtype = self.effective_dtype(dtype) + expected[columns] = expected[columns].astype(effective_dtype) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self, df, sparse, dtype): @@ -495,8 +496,10 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): 'B_c': [0, 0, 1], 'cat_x': [1, 0, 0], 'cat_y': [0, 1, 1]}).sort_index(axis=1) + columns = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[columns] = expected[columns].astype(self.effective_dtype(dtype)) + 
effective_dtype = self.effective_dtype(dtype) + expected[columns] = expected[columns].astype(effective_dtype) expected.sort_index(axis=1) assert_frame_equal(result, expected) @@ -508,7 +511,8 @@ def test_basic_drop_first(self, sparse, dtype): s_series_index = Series(s_list, list('ABC')) expected = DataFrame({'b': [0, 1, 0], - 'c': [0, 0, 1]}, dtype=self.effective_dtype(dtype)) + 'c': [0, 0, 1]}, + dtype=self.effective_dtype(dtype)) result = get_dummies(s_list, drop_first=True, sparse=sparse, dtype=dtype) From 9e5d0bb77b5c09f02a63b71f8bc89afac13c2700 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 23:43:04 +0100 Subject: [PATCH 11/24] DOC: fix wording in whatsnew for get_dummies dtype argument --- doc/source/whatsnew/v0.22.0.txt | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index f42891bccd10b..12f8996e87c43 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -30,26 +30,16 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) -``get_dummies`` now supports ``dtype`` argument -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``get_dummies`` now supports ``dtype`` argument (:issue:`18330`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`get_dummies` function now accepts ``dtype`` argument, which forces specific dtype for new columns. When ``dtype`` is not specified or equals to ``None``, new columns will have dtype ``uint8`` (as before), so this change is backwards compatible. (:issue:`18330`) - -**Previous behavior**: - -.. 
ipython:: python - - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - pd.get_dummies(df, columns=['c']) - -**New behavior**: +The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a specific dtype for the new columns. When ``dtype`` is not specified or ``None``, the dtype will be ``uint8`` as before. (:issue:`18330`) .. ipython:: python df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - pd.get_dummies(df, columns=['c']) - pd.get_dummies(df, columns=['c'], dtype=bool) - pd.get_dummies(df, columns=['c'], dtype=np.float64) + pd.get_dummies(df, columns=['c']).dtypes + pd.get_dummies(df, columns=['c'], dtype=bool).dtypes .. _whatsnew_0220.api_breaking: From b8ab3651307ccf3ffe215880cb3253f37e035e37 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 23:47:20 +0100 Subject: [PATCH 12/24] CLN: Raise ValueError on invalid dtype --- pandas/core/reshape/reshape.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 651fd3528bd06..1facec06c84a2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_bool_dtype, - needs_i8_conversion, is_sparse) + needs_i8_conversion, is_sparse, is_object_dtype) from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.missing import notna @@ -804,8 +804,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, if dtype is None: dtype = np.uint8 - if np.dtype(dtype) is np.dtype('O'): - raise TypeError("'object' is not a valid type for get_dummies") + if is_object_dtype(dtype): + raise ValueError("dtype=object is not a valid dtype for get_dummies") if isinstance(data, DataFrame): # determine columns being encoded From 9db17f206834034a03d145b5032983a893f472aa Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 23:48:21 +0100 Subject: 
[PATCH 13/24] TST: remove fixtures where not needed --- pandas/tests/reshape/test_reshape.py | 66 ++++++++++++++-------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 15c61b535fe58..7248ca7745bc9 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -224,7 +224,7 @@ def df(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - @pytest.fixture(params=['uint8', 'int64', np.float64, bool, None]) + @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None]) def dtype(self, request): return np.dtype(request.param) @@ -240,7 +240,7 @@ def effective_dtype(self, dtype): return dtype def test_throws_on_dtype_object(self, df): - with pytest.raises(TypeError): + with pytest.raises(ValueError): get_dummies(df, dtype='object') def test_basic(self, sparse, dtype): @@ -347,26 +347,26 @@ def test_include_na(self, sparse, dtype): dtype=self.effective_dtype(dtype)) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - def test_unicode(self, sparse, dtype): + def test_unicode(self, sparse): # See GH 6885 - get_dummies chokes on unicode values import unicodedata e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', sparse=sparse, dtype=dtype) + res = get_dummies(s, prefix='letter', sparse=sparse) exp = DataFrame({'letter_e': [1, 0, 0], u('letter_%s') % eacute: [0, 1, 1]}, - dtype=self.effective_dtype(dtype)) + dtype=np.uint8) assert_frame_equal(res, exp) - def test_dataframe_dummies_all_obj(self, df, sparse, dtype): + def test_dataframe_dummies_all_obj(self, df, sparse): df = df[['A', 'B']] - result = get_dummies(df, sparse=sparse, dtype=dtype) + result = get_dummies(df, sparse=sparse) expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], 'B_c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype)) + dtype=np.uint8) 
assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self, df, sparse, dtype): @@ -381,30 +381,30 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype): expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_list(self, df, sparse, dtype): + def test_dataframe_dummies_prefix_list(self, df, sparse): prefixes = ['from_A', 'from_B'] - result = get_dummies(df, prefix=prefixes, sparse=sparse, dtype=dtype) + result = get_dummies(df, prefix=prefixes, sparse=sparse) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}) - cols = expected.columns[1:] - expected[cols] = expected[cols].astype(dtype) + 'from_B_c': [0, 0, 1]}, + dtype=np.uint8) + expected[['C']] = df[['C']] expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_str(self, df, sparse, dtype): + def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... 
- result = get_dummies(df, prefix='bad', sparse=sparse, dtype=dtype) + result = get_dummies(df, prefix='bad', sparse=sparse) bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] expected = DataFrame([[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C'] + bad_columns, - dtype=self.effective_dtype(dtype)) - expected['C'] = [1, 2, 3] + dtype=np.uint8) + expected[['C']] = df[['C']] assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse, dtype): @@ -418,25 +418,24 @@ def test_dataframe_dummies_subset(self, df, sparse, dtype): expected[columns] = expected[columns].astype(dtype) assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_sep(self, df, sparse, dtype): - result = get_dummies(df, prefix_sep='..', sparse=sparse, dtype=dtype) + def test_dataframe_dummies_prefix_sep(self, df, sparse): + result = get_dummies(df, prefix_sep='..', sparse=sparse) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1, 0, 1], 'A..b': [0, 1, 0], 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}) + 'B..c': [0, 0, 1]}, + dtype=np.uint8) + expected[['C']] = df[['C']] expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] - cols = expected.columns[1:] - expected[cols] = expected[cols].astype(self.effective_dtype(dtype)) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], - sparse=sparse, dtype=dtype) + result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, - sparse=sparse, dtype=dtype) + sparse=sparse) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self, df, sparse): @@ -447,12 +446,12 @@ def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): with pytest.raises(ValueError): get_dummies(df, prefix_sep=['bad'], sparse=sparse) - def test_dataframe_dummies_prefix_dict(self, sparse, dtype): + 
def test_dataframe_dummies_prefix_dict(self, sparse): prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes, sparse=sparse, dtype=dtype) + result = get_dummies(df, prefix=prefixes, sparse=sparse) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], @@ -461,8 +460,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse, dtype): 'C': [1, 2, 3]}) columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - effective_dtype = self.effective_dtype(dtype) - expected[columns] = expected[columns].astype(effective_dtype) + expected[columns] = expected[columns].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self, df, sparse, dtype): @@ -610,23 +608,23 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse, dtype): expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) - def test_int_int(self, dtype): + def test_int_int(self): data = Series([1, 2, 1]) - result = pd.get_dummies(data, dtype=dtype) + result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=self.effective_dtype(dtype)) + dtype=np.uint8) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) - result = pd.get_dummies(data, dtype=dtype) + result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=self.effective_dtype(dtype)) + dtype=np.uint8) tm.assert_frame_equal(result, expected) def test_int_df(self, dtype): From ef7a473ad540abecc8a94ade5aba34468a1d8407 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Sun, 19 Nov 2017 23:51:52 +0100 Subject: [PATCH 14/24] TST: remove dtype fixture from subset test --- pandas/tests/reshape/test_reshape.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py 
b/pandas/tests/reshape/test_reshape.py index 7248ca7745bc9..b77a001a5bef8 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -407,15 +407,14 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): expected[['C']] = df[['C']] assert_frame_equal(result, expected) - def test_dataframe_dummies_subset(self, df, sparse, dtype): + def test_dataframe_dummies_subset(self, df, sparse): result = get_dummies(df, prefix=['from_A'], columns=['A'], - sparse=sparse, dtype=dtype) + sparse=sparse) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) - columns = ['from_A_a', 'from_A_b'] - expected[columns] = expected[columns].astype(dtype) + 'C': [1, 2, 3]}, dtype=np.uint8) + expected[['C']] = df[['C']] assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): From 67d346d63577f2d00c55a81674dd36f9698e7cad Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Mon, 20 Nov 2017 09:14:59 +0100 Subject: [PATCH 15/24] TST: fix bug in get_dummy tests under python3 --- pandas/tests/reshape/test_reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index b77a001a5bef8..2c54a93bbd1b9 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -397,6 +397,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... 
+ df[['C']] = df[['C']].astype(np.uint8) result = get_dummies(df, prefix='bad', sparse=sparse) bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] expected = DataFrame([[1, 1, 0, 1, 0], @@ -404,7 +405,6 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): [3, 1, 0, 0, 1]], columns=['C'] + bad_columns, dtype=np.uint8) - expected[['C']] = df[['C']] assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse): From 367e7539e5112c52cb2ab4a4cbc17e7f4dd0375b Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Mon, 20 Nov 2017 22:09:42 +0100 Subject: [PATCH 16/24] TST: Remove dtype fixture where not needed --- pandas/tests/reshape/test_reshape.py | 58 ++++++++++++---------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 2c54a93bbd1b9..8dbcc2bcd1810 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -500,7 +500,7 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): expected.sort_index(axis=1) assert_frame_equal(result, expected) - def test_basic_drop_first(self, sparse, dtype): + def test_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case s_list = list('abc') @@ -509,22 +509,19 @@ def test_basic_drop_first(self, sparse, dtype): expected = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype)) + dtype=np.uint8) - result = get_dummies(s_list, drop_first=True, - sparse=sparse, dtype=dtype) + result = get_dummies(s_list, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - result = get_dummies(s_series, drop_first=True, - sparse=sparse, dtype=dtype) + result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) expected.index = list('ABC') - result = get_dummies(s_series_index, drop_first=True, - sparse=sparse, dtype=dtype) + result = 
get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - def test_basic_drop_first_one_level(self, sparse, dtype): + def test_basic_drop_first_one_level(self, sparse): # Test the case that categorical variable only has one level. s_list = list('aaa') s_series = Series(s_list) @@ -532,78 +529,73 @@ def test_basic_drop_first_one_level(self, sparse, dtype): expected = DataFrame(index=np.arange(3)) - result = get_dummies(s_list, drop_first=True, - sparse=sparse, dtype=dtype) + result = get_dummies(s_list, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - result = get_dummies(s_series, drop_first=True, - sparse=sparse, dtype=dtype) + result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) expected = DataFrame(index=list('ABC')) - result = get_dummies(s_series_index, drop_first=True, - sparse=sparse, dtype=dtype) + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - def test_basic_drop_first_NA(self, sparse, dtype): + def test_basic_drop_first_NA(self, sparse): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] - res = get_dummies(s_NA, drop_first=True, sparse=sparse, dtype=dtype) - exp = DataFrame({'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) + res = get_dummies(s_NA, drop_first=True, sparse=sparse) + exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, - sparse=sparse, dtype=dtype) + sparse=sparse) exp_na = DataFrame( {'b': [0, 1, 0], nan: [0, 0, 1]}, - dtype=self.effective_dtype(dtype)).reindex(['b', nan], axis=1) + dtype=np.uint8).reindex(['b', nan], axis=1) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], - dummy_na=True, - drop_first=True, - sparse=sparse, dtype=dtype) + res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, + sparse=sparse) exp_just_na = 
DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) - def test_dataframe_dummies_drop_first(self, df, sparse, dtype): + def test_dataframe_dummies_drop_first(self, df, sparse): df = df[['A', 'B']] - result = get_dummies(df, drop_first=True, sparse=sparse, dtype=dtype) + result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({'A_b': [0, 1, 0], 'B_c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype)) + dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical( self, df, sparse, dtype): df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, drop_first=True, sparse=sparse, dtype=dtype) + result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({'C': [1, 2, 3], 'A_b': [0, 1, 0], 'B_c': [0, 0, 1], 'cat_y': [0, 1, 1]}) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(self.effective_dtype(dtype)) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_na(self, df, sparse, dtype): + def test_dataframe_dummies_drop_first_with_na(self, df, sparse): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, drop_first=True, - sparse=sparse, dtype=dtype).sort_index(axis=1) + sparse=sparse).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(self.effective_dtype(dtype)) + expected[cols] = expected[cols].astype(np.uint8) expected = expected.sort_index(axis=1) assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, drop_first=True, - sparse=sparse, dtype=dtype) + sparse=sparse) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) From 
4e478607486dc778b83147670b565bdc5a2a099a Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Mon, 20 Nov 2017 22:11:16 +0100 Subject: [PATCH 17/24] CLN: move dtype logic to internal function in get_dummies --- pandas/core/reshape/reshape.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1facec06c84a2..eacb82d39ae46 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -801,12 +801,6 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.core.reshape.concat import concat from itertools import cycle - if dtype is None: - dtype = np.uint8 - - if is_object_dtype(dtype): - raise ValueError("dtype=object is not a valid dtype for get_dummies") - if isinstance(data, DataFrame): # determine columns being encoded @@ -864,10 +858,18 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False, dtype=np.uint8): + sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) + if dtype is None: + dtype = np.uint8 + else: + dtype = np.dtype(dtype) + + if is_object_dtype(dtype): + raise ValueError("dtype=object is not a valid dtype for get_dummies") + def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index From bf8327cbb432d8bf57e950184e0c11879142d92b Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Mon, 20 Nov 2017 22:12:18 +0100 Subject: [PATCH 18/24] DOC: add ref to get_dummies entry in whatsnew --- doc/source/whatsnew/v0.22.0.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 12f8996e87c43..fa71d8c3a989b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -29,9 +29,12 @@ Other Enhancements - 
:class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) +- :func:`get_dummies` now supports ``dtype`` argument, see :ref:`here <whatsnew_0220.enhancements.get_dummies_dtype>` for more (:issue: `18330`) -``get_dummies`` now supports ``dtype`` argument (:issue:`18330`) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. _whatsnew_0220.enhancements.get_dummies_dtype + +``get_dummies`` now supports ``dtype`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a specific dtype for the new columns. When ``dtype`` is not specified or ``None``, the dtype will be ``uint8`` as before. (:issue:`18330`) From f3abd2bcc342572f85e32a7942ad48f062899e4d Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Tue, 21 Nov 2017 01:21:01 +0100 Subject: [PATCH 19/24] DOC: remove extra space in whatsnew --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index fa71d8c3a989b..c8a18f5953361 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -29,7 +29,7 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) -- :func:`get_dummies` now supports ``dtype`` argument, see :ref:`here <whatsnew_0220.enhancements.get_dummies_dtype>` for more (:issue: `18330`) +- 
:func:`get_dummies` now supports ``dtype`` argument, see :ref:`here <whatsnew_0220.enhancements.get_dummies_dtype>` for more (:issue:`18330`) .. _whatsnew_0220.enhancements.get_dummies_dtype From 649d3032ec3c58465e028d581cd0993c3dbabd36 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Tue, 21 Nov 2017 01:21:51 +0100 Subject: [PATCH 20/24] TST: change dtype on expected output instead of input --- pandas/tests/reshape/test_reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 8dbcc2bcd1810..56fdd89a60643 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -397,7 +397,6 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... - df[['C']] = df[['C']].astype(np.uint8) result = get_dummies(df, prefix='bad', sparse=sparse) bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] expected = DataFrame([[1, 1, 0, 1, 0], @@ -405,6 +404,7 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): [3, 1, 0, 0, 1]], columns=['C'] + bad_columns, dtype=np.uint8) + expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse): From a7a60b7e9dc58f8377ff3e84abfecb760ff9f08e Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Wed, 22 Nov 2017 07:50:22 +0100 Subject: [PATCH 21/24] DOC: update whatsnew, change test name --- doc/source/whatsnew/v0.22.0.txt | 29 ++++++++++++++-------------- pandas/tests/reshape/test_reshape.py | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c8a18f5953361..658ec5a35c05f 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -17,21 +17,8 @@ New features - - -.. 
_whatsnew_0220.enhancements.other: -Other Enhancements -^^^^^^^^^^^^^^^^^^ - -- Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`) -- :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. (:issue:`18171`) -- :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`) -- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`) -- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) -- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) -- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) -- :func:`get_dummies` now supports ``dtype`` argument, see :ref:`here <whatsnew_0220.enhancements.get_dummies_dtype>` for more (:issue:`18330`) - -.. _whatsnew_0220.enhancements.get_dummies_dtype +.. _whatsnew_0210.enhancements.get_dummies_dtype: ``get_dummies`` now supports ``dtype`` argument ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -44,6 +31,20 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a spec pd.get_dummies(df, columns=['c']).dtypes pd.get_dummies(df, columns=['c'], dtype=bool).dtypes + +.. _whatsnew_0220.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ + +- Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`) +- :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. 
(:issue:`18171`) +- :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`) +- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`) +- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) +- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) +- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) + .. _whatsnew_0220.api_breaking: Backwards incompatible API changes diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 56fdd89a60643..5d4aa048ae303 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -239,7 +239,7 @@ def effective_dtype(self, dtype): return np.uint8 return dtype - def test_throws_on_dtype_object(self, df): + def test_raises_on_dtype_object(self, df): with pytest.raises(ValueError): get_dummies(df, dtype='object') From bc192fd976c8ba87eb854e80110beb3c8be5f14d Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Wed, 22 Nov 2017 12:06:20 +0100 Subject: [PATCH 22/24] DOC: add get_dummies dtype argument description to reshaping.rst --- doc/source/reshaping.rst | 13 ++++++++++++- pandas/core/reshape/reshape.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 1209c4a8d6be8..1b81d83bb76c7 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -240,7 +240,7 @@ values will be set to ``NaN``. df3 df3.unstack() -.. versionadded: 0.18.0 +.. versionadded:: 0.18.0 Alternatively, unstack takes an optional ``fill_value`` argument, for specifying the value of missing data. 
@@ -634,6 +634,17 @@ When a column contains only one level, it will be omitted in the result. pd.get_dummies(df, drop_first=True) +By default new columns will have ``np.uint8`` dtype. To choose another dtype use ``dtype`` argument: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abc'), 'B': [1.1, 2.2, 3.3]}) + + pd.get_dummies(df, dtype=bool).dtypes + +.. versionadded:: 0.22.0 + + .. _reshaping.factorize: Factorizing values diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index eacb82d39ae46..78bae58c52b2a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -729,7 +729,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, .. versionadded:: 0.18.0 dtype : dtype, default np.uint8 - Data type to force on a new columns. Only a single dtype is allowed. + Data type for new columns. Only a single dtype is allowed. .. versionadded:: 0.22.0 From d19d81fa6060b3c5bc4a9cb09c5aa05364b84d37 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Wed, 22 Nov 2017 12:21:35 +0100 Subject: [PATCH 23/24] DOC: update whatsnew style, minor codestyle change --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/reshape/reshape.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 658ec5a35c05f..162a1ca2bfcb3 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -23,7 +23,7 @@ New features ``get_dummies`` now supports ``dtype`` argument ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a specific dtype for the new columns. When ``dtype`` is not specified or ``None``, the dtype will be ``uint8`` as before. (:issue:`18330`) +The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) .. 
ipython:: python diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 78bae58c52b2a..5bb86885c0875 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -864,8 +864,7 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, if dtype is None: dtype = np.uint8 - else: - dtype = np.dtype(dtype) + dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") From 158a317aedc738b7b740a3b480a01b4289778847 Mon Sep 17 00:00:00 2001 From: Andrew Savchyn Date: Wed, 22 Nov 2017 14:15:22 +0100 Subject: [PATCH 24/24] DOC: fix typo and trigger tests --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cbc259ba7bceb..782971a742b54 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -965,7 +965,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): inplace : bool whether to modify `self` directly or return a copy - .. versionadded: 0.21.0 + .. versionadded:: 0.21.0 Returns -------