Skip to content

Commit bcc987c

Browse files
John Fremlin authored and jreback committed
BUG: describe() outputs bool similarly to categorical data
closes #6625 closes #12458
1 parent 7d1499c commit bcc987c

File tree

4 files changed

+27
-36
lines changed

4 files changed

+27
-36
lines changed

doc/source/whatsnew/v0.18.0.txt

+1-8
Original file line numberDiff line numberDiff line change
@@ -856,21 +856,14 @@ Other API Changes
856856
ValueError: Cannot convert arg ['20150101 07:00:00'] to a time.
857857

858858
- ``.memory_usage()`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
859-
860859
- ``DataFrame.to_latex()`` now supports non-ascii encodings (e.g. utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)
861-
862860
- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)
863-
864861
- ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit, specifying ``fill_value`` will preserve the data type of the original stacked data. (:issue:`9746`)
865-
866862
- As part of the new API for :ref:`window functions <whatsnew_0180.enhancements.moments>` and :ref:`resampling <whatsnew_0180.breaking.resample>`, aggregation functions have been clarified, raising more informative error messages on invalid aggregations. (:issue:`9052`). A full set of examples is presented in :ref:`groupby <groupby.aggregation>`.
867-
868863
- Statistical functions for ``NDFrame`` objects will now raise if non-numpy-compatible arguments are passed in for ``**kwargs`` (:issue:`12301`)
869-
870864
- ``.to_latex`` and ``.to_html`` gain a ``decimal`` parameter like ``.to_csv``; the default is ``'.'`` (:issue:`12031`)
871-
872865
- More helpful error message when constructing a ``DataFrame`` with empty data but with indices (:issue:`8020`)
873-
866+
- ``.describe()`` will now properly handle bool dtype as a categorical (:issue:`6625`)
874867

875868
.. _whatsnew_0180.deprecations:
876869

pandas/core/generic.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -4875,26 +4875,27 @@ def describe_numeric_1d(series, percentiles):
48754875
def describe_categorical_1d(data):
48764876
names = ['count', 'unique']
48774877
objcounts = data.value_counts()
4878-
result = [data.count(), len(objcounts[objcounts != 0])]
4878+
count_unique = len(objcounts[objcounts != 0])
4879+
result = [data.count(), count_unique]
48794880
if result[1] > 0:
48804881
top, freq = objcounts.index[0], objcounts.iloc[0]
48814882

4882-
if (data.dtype == object or
4883-
com.is_categorical_dtype(data.dtype)):
4884-
names += ['top', 'freq']
4885-
result += [top, freq]
4886-
4887-
elif com.is_datetime64_dtype(data):
4883+
if com.is_datetime64_dtype(data):
48884884
asint = data.dropna().values.view('i8')
48894885
names += ['top', 'freq', 'first', 'last']
48904886
result += [lib.Timestamp(top), freq,
48914887
lib.Timestamp(asint.min()),
48924888
lib.Timestamp(asint.max())]
4889+
else:
4890+
names += ['top', 'freq']
4891+
result += [top, freq]
48934892

48944893
return pd.Series(result, index=names, name=data.name)
48954894

48964895
def describe_1d(data, percentiles):
4897-
if com.is_numeric_dtype(data):
4896+
if com.is_bool_dtype(data):
4897+
return describe_categorical_1d(data)
4898+
elif com.is_numeric_dtype(data):
48984899
return describe_numeric_1d(data, percentiles)
48994900
elif com.is_timedelta64_dtype(data):
49004901
return describe_numeric_1d(data, percentiles)
@@ -4906,7 +4907,7 @@ def describe_1d(data, percentiles):
49064907
elif (include is None) and (exclude is None):
49074908
if len(self._get_numeric_data()._info_axis) > 0:
49084909
# when some numerics are found, keep only numerics
4909-
data = self.select_dtypes(include=[np.number, np.bool])
4910+
data = self.select_dtypes(include=[np.number])
49104911
else:
49114912
data = self
49124913
elif include == 'all':

pandas/tests/frame/test_analytics.py

+13-16
Original file line numberDiff line numberDiff line change
@@ -241,24 +241,21 @@ def test_bool_describe_in_mixed_frame(self):
241241
'int_data': [10, 20, 30, 40, 50],
242242
})
243243

244-
# Boolean data and integer data is included in .describe() output,
245-
# string data isn't
246-
self.assert_numpy_array_equal(df.describe().columns, [
247-
'bool_data', 'int_data'])
248-
249-
bool_describe = df.describe()['bool_data']
250-
251-
# Both the min and the max values should stay booleans
252-
self.assertEqual(bool_describe['min'].dtype, np.bool_)
253-
self.assertEqual(bool_describe['max'].dtype, np.bool_)
244+
# Integer data are included in .describe() output,
245+
# Boolean and string data are not.
246+
result = df.describe()
247+
expected = DataFrame({'int_data' : [5, 30, df.int_data.std(),
248+
10, 20, 30, 40, 50]},
249+
index=['count', 'mean', 'std', 'min', '25%',
250+
'50%', '75%', 'max'])
251+
assert_frame_equal(result, expected)
254252

255-
self.assertFalse(bool_describe['min'])
256-
self.assertTrue(bool_describe['max'])
253+
# Top value is a boolean value that is False
254+
result = df.describe(include=['bool'])
257255

258-
# For numeric operations, like mean or median, the values True/False
259-
# are cast to the integer values 1 and 0
260-
assert_almost_equal(bool_describe['mean'], 0.4)
261-
assert_almost_equal(bool_describe['50%'], 0)
256+
expected = DataFrame({'bool_data' : [5, 2, False, 3]},
257+
index=['count', 'unique', 'top', 'freq'])
258+
assert_frame_equal(result, expected)
262259

263260
def test_reduce_mixed_frame(self):
264261
# GH 6806

pandas/tests/test_generic.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,8 @@ def test_describe_objects(self):
956956
s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
957957
result = s.describe()
958958
expected = Series({'count': 7, 'unique': 4,
959-
'top': 'a', 'freq': 3}, index=result.index)
959+
'top': 'a', 'freq': 3, 'second': 'b',
960+
'second_freq': 2}, index=result.index)
960961
assert_series_equal(result, expected)
961962

962963
dt = list(self.ts.index)
@@ -1487,9 +1488,8 @@ def test_describe_typefiltering_category_bool(self):
14871488
'D_num': np.arange(24.) + .5,
14881489
'E_ts': tm.makeTimeSeries()[:24].index})
14891490

1490-
# bool is considered numeric in describe, although not an np.number
14911491
desc = df.describe()
1492-
expected_cols = ['C_bool', 'D_num']
1492+
expected_cols = ['D_num']
14931493
expected = DataFrame(dict((k, df[k].describe())
14941494
for k in expected_cols),
14951495
columns=expected_cols)

0 commit comments

Comments
 (0)