Skip to content

Commit c8b3690

Browse files
John Fremlinvii
John Fremlin
authored and committed
describe() outputs bool similarly to categorical data
1 parent fe584e7 commit c8b3690

File tree

3 files changed

+19
-27
lines changed

3 files changed

+19
-27
lines changed

pandas/core/generic.py

+10 -9
Original file line number | Diff line number | Diff line change
@@ -4875,26 +4875,27 @@ def describe_numeric_1d(series, percentiles):
48754875
def describe_categorical_1d(data):
48764876
names = ['count', 'unique']
48774877
objcounts = data.value_counts()
4878-
result = [data.count(), len(objcounts[objcounts != 0])]
4878+
count_unique = len(objcounts[objcounts != 0])
4879+
result = [data.count(), count_unique]
48794880
if result[1] > 0:
48804881
top, freq = objcounts.index[0], objcounts.iloc[0]
48814882

4882-
if (data.dtype == object or
4883-
com.is_categorical_dtype(data.dtype)):
4884-
names += ['top', 'freq']
4885-
result += [top, freq]
4886-
4887-
elif com.is_datetime64_dtype(data):
4883+
if com.is_datetime64_dtype(data):
48884884
asint = data.dropna().values.view('i8')
48894885
names += ['top', 'freq', 'first', 'last']
48904886
result += [lib.Timestamp(top), freq,
48914887
lib.Timestamp(asint.min()),
48924888
lib.Timestamp(asint.max())]
4889+
else:
4890+
names += ['top', 'freq']
4891+
result += [top, freq]
48934892

48944893
return pd.Series(result, index=names, name=data.name)
48954894

48964895
def describe_1d(data, percentiles):
4897-
if com.is_numeric_dtype(data):
4896+
if com.is_bool_dtype(data):
4897+
return describe_categorical_1d(data)
4898+
elif com.is_numeric_dtype(data):
48984899
return describe_numeric_1d(data, percentiles)
48994900
elif com.is_timedelta64_dtype(data):
49004901
return describe_numeric_1d(data, percentiles)
@@ -4906,7 +4907,7 @@ def describe_1d(data, percentiles):
49064907
elif (include is None) and (exclude is None):
49074908
if len(self._get_numeric_data()._info_axis) > 0:
49084909
# when some numerics are found, keep only numerics
4909-
data = self.select_dtypes(include=[np.number, np.bool])
4910+
data = self.select_dtypes(include=[np.number])
49104911
else:
49114912
data = self
49124913
elif include == 'all':

pandas/tests/frame/test_analytics.py

+7 -15
Original file line number | Diff line number | Diff line change
@@ -241,24 +241,16 @@ def test_bool_describe_in_mixed_frame(self):
241241
'int_data': [10, 20, 30, 40, 50],
242242
})
243243

244-
# Boolean data and integer data is included in .describe() output,
245-
# string data isn't
246-
self.assert_numpy_array_equal(df.describe().columns, [
247-
'bool_data', 'int_data'])
244+
# Integer data are included in .describe() output,
245+
# Boolean and string data are not.
246+
self.assert_numpy_array_equal(df.describe().columns, ['int_data'])
248247

249-
bool_describe = df.describe()['bool_data']
248+
bool_describe = df.describe(include='all')['bool_data']
250249

251-
# Both the min and the max values should stay booleans
252-
self.assertEqual(bool_describe['min'].dtype, np.bool_)
253-
self.assertEqual(bool_describe['max'].dtype, np.bool_)
250+
# Top value is a boolean value that is False
251+
self.assertTrue(isinstance(bool_describe['top'] , bool))
252+
self.assertFalse(bool_describe['top'])
254253

255-
self.assertFalse(bool_describe['min'])
256-
self.assertTrue(bool_describe['max'])
257-
258-
# For numeric operations, like mean or median, the values True/False
259-
# are cast to the integer values 1 and 0
260-
assert_almost_equal(bool_describe['mean'], 0.4)
261-
assert_almost_equal(bool_describe['50%'], 0)
262254

263255
def test_reduce_mixed_frame(self):
264256
# GH 6806

pandas/tests/test_generic.py

+2 -3
Original file line number | Diff line number | Diff line change
@@ -955,7 +955,7 @@ def test_describe_objects(self):
955955
s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
956956
result = s.describe()
957957
expected = Series({'count': 7, 'unique': 4,
958-
'top': 'a', 'freq': 3}, index=result.index)
958+
'top': 'a', 'freq': 3,'second':'b', 'second_freq': 2}, index=result.index)
959959
assert_series_equal(result, expected)
960960

961961
dt = list(self.ts.index)
@@ -1486,9 +1486,8 @@ def test_describe_typefiltering_category_bool(self):
14861486
'D_num': np.arange(24.) + .5,
14871487
'E_ts': tm.makeTimeSeries()[:24].index})
14881488

1489-
# bool is considered numeric in describe, although not an np.number
14901489
desc = df.describe()
1491-
expected_cols = ['C_bool', 'D_num']
1490+
expected_cols = ['D_num']
14921491
expected = DataFrame(dict((k, df[k].describe())
14931492
for k in expected_cols),
14941493
columns=expected_cols)

0 commit comments

Comments
 (0)