diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index f850bfba4f90e..46e8a0a5b2d3f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4875,26 +4875,27 @@ def describe_numeric_1d(series, percentiles):
         def describe_categorical_1d(data):
             names = ['count', 'unique']
             objcounts = data.value_counts()
-            result = [data.count(), len(objcounts[objcounts != 0])]
-            if result[1] > 0:
+            count_unique = len(objcounts[objcounts != 0])
+            result = [data.count(), count_unique]
+            if count_unique > 0:
                 top, freq = objcounts.index[0], objcounts.iloc[0]
 
-                if (data.dtype == object or
-                        com.is_categorical_dtype(data.dtype)):
-                    names += ['top', 'freq']
-                    result += [top, freq]
-
-                elif com.is_datetime64_dtype(data):
+                if com.is_datetime64_dtype(data):
                     asint = data.dropna().values.view('i8')
                     names += ['top', 'freq', 'first', 'last']
                     result += [lib.Timestamp(top), freq,
                                lib.Timestamp(asint.min()),
                                lib.Timestamp(asint.max())]
+                else:
+                    names += ['top', 'freq']
+                    result += [top, freq]
 
             return pd.Series(result, index=names, name=data.name)
 
         def describe_1d(data, percentiles):
-            if com.is_numeric_dtype(data):
+            if com.is_bool_dtype(data):
+                return describe_categorical_1d(data)
+            elif com.is_numeric_dtype(data):
                 return describe_numeric_1d(data, percentiles)
             elif com.is_timedelta64_dtype(data):
                 return describe_numeric_1d(data, percentiles)
@@ -4906,7 +4907,7 @@ def describe_1d(data, percentiles):
         elif (include is None) and (exclude is None):
-            if len(self._get_numeric_data()._info_axis) > 0:
-                # when some numerics are found, keep only numerics
-                data = self.select_dtypes(include=[np.number, np.bool])
-            else:
+            # keep only np.number columns; bool columns are described as
+            # categorical, so fall back to all columns when none remain
+            data = self.select_dtypes(include=[np.number])
+            if len(data._info_axis) == 0:
                 data = self
         elif include == 'all':
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 4154c24f227f9..fed845827bf92 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -241,24 +241,15 @@ def test_bool_describe_in_mixed_frame(self):
             'int_data': [10, 20, 30, 40, 50],
         })
 
-        # Boolean data and integer data is included in .describe() output,
-        # string data isn't
-        self.assert_numpy_array_equal(df.describe().columns, [
-                                      'bool_data', 'int_data'])
+        # Integer data are included in .describe() output,
+        # Boolean and string data are not.
+        self.assert_numpy_array_equal(df.describe().columns, ['int_data'])
 
-        bool_describe = df.describe()['bool_data']
+        bool_describe = df.describe(include='all')['bool_data']
 
-        # Both the min and the max values should stay booleans
-        self.assertEqual(bool_describe['min'].dtype, np.bool_)
-        self.assertEqual(bool_describe['max'].dtype, np.bool_)
-
-        self.assertFalse(bool_describe['min'])
-        self.assertTrue(bool_describe['max'])
-
-        # For numeric operations, like mean or median, the values True/False
-        # are cast to the integer values 1 and 0
-        assert_almost_equal(bool_describe['mean'], 0.4)
-        assert_almost_equal(bool_describe['50%'], 0)
+        # Top value is a boolean value that is False
+        self.assertIsInstance(bool_describe['top'], bool)
+        self.assertFalse(bool_describe['top'])
 
     def test_reduce_mixed_frame(self):
         # GH 6806
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index 4c7510783eda0..375902b782098 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -1486,9 +1486,9 @@ def test_describe_typefiltering_category_bool(self):
                         'D_num': np.arange(24.) + .5,
                         'E_ts': tm.makeTimeSeries()[:24].index})
 
-        # bool is considered numeric in describe, although not an np.number
+        # bool is described as categorical, so only np.number columns remain
         desc = df.describe()
-        expected_cols = ['C_bool', 'D_num']
+        expected_cols = ['D_num']
         expected = DataFrame(dict((k, df[k].describe())
                                   for k in expected_cols),
                              columns=expected_cols)