describe() summarizes bool data like categorical data (count, unique, top, freq)

John Fremlin · vii · commit c8b3690c4ba9 · 2016-02-26T10:50:42.000-08:00
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4875,26 +4875,27 @@ def describe_numeric_1d(series, percentiles):
         def describe_categorical_1d(data):
             names = ['count', 'unique']
             objcounts = data.value_counts()
-            result = [data.count(), len(objcounts[objcounts != 0])]
+            count_unique = len(objcounts[objcounts != 0])
+            result = [data.count(), count_unique]
             if result[1] > 0:
                 top, freq = objcounts.index[0], objcounts.iloc[0]
 
-                if (data.dtype == object or
-                        com.is_categorical_dtype(data.dtype)):
-                    names += ['top', 'freq']
-                    result += [top, freq]
-
-                elif com.is_datetime64_dtype(data):
+                if com.is_datetime64_dtype(data):
                     asint = data.dropna().values.view('i8')
                     names += ['top', 'freq', 'first', 'last']
                     result += [lib.Timestamp(top), freq,
                                lib.Timestamp(asint.min()),
                                lib.Timestamp(asint.max())]
+                else:
+                    names += ['top', 'freq']
+                    result += [top, freq]
 
             return pd.Series(result, index=names, name=data.name)
 
         def describe_1d(data, percentiles):
-            if com.is_numeric_dtype(data):
+            if com.is_bool_dtype(data):
+                return describe_categorical_1d(data)
+            elif com.is_numeric_dtype(data):
                 return describe_numeric_1d(data, percentiles)
             elif com.is_timedelta64_dtype(data):
                 return describe_numeric_1d(data, percentiles)
@@ -4906,7 +4907,7 @@ def describe_1d(data, percentiles):
         elif (include is None) and (exclude is None):
             if len(self._get_numeric_data()._info_axis) > 0:
                 # when some numerics are found, keep only numerics
-                data = self.select_dtypes(include=[np.number, np.bool])
+                data = self.select_dtypes(include=[np.number])
             else:
                 data = self
         elif include == 'all':
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -241,24 +241,16 @@ def test_bool_describe_in_mixed_frame(self):
             'int_data': [10, 20, 30, 40, 50],
         })
 
-        # Boolean data and integer data is included in .describe() output,
-        # string data isn't
-        self.assert_numpy_array_equal(df.describe().columns, [
-                                      'bool_data', 'int_data'])
+        # Integer data are included in .describe() output,
+        # Boolean and string data are not.
+        self.assert_numpy_array_equal(df.describe().columns, ['int_data'])
 
-        bool_describe = df.describe()['bool_data']
+        bool_describe = df.describe(include='all')['bool_data']
 
-        # Both the min and the max values should stay booleans
-        self.assertEqual(bool_describe['min'].dtype, np.bool_)
-        self.assertEqual(bool_describe['max'].dtype, np.bool_)
+        # Top value is a boolean value that is False
+        self.assertTrue(isinstance(bool_describe['top'] , bool))
+        self.assertFalse(bool_describe['top'])
 
-        self.assertFalse(bool_describe['min'])
-        self.assertTrue(bool_describe['max'])
-
-        # For numeric operations, like mean or median, the values True/False
-        # are cast to the integer values 1 and 0
-        assert_almost_equal(bool_describe['mean'], 0.4)
-        assert_almost_equal(bool_describe['50%'], 0)
 
     def test_reduce_mixed_frame(self):
         # GH 6806
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -955,7 +955,7 @@ def test_describe_objects(self):
         s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
         result = s.describe()
         expected = Series({'count': 7, 'unique': 4,
-                           'top': 'a', 'freq': 3}, index=result.index)
+                           'top': 'a', 'freq': 3,'second':'b', 'second_freq': 2}, index=result.index)
         assert_series_equal(result, expected)
 
         dt = list(self.ts.index)
@@ -1486,9 +1486,8 @@ def test_describe_typefiltering_category_bool(self):
                         'D_num': np.arange(24.) + .5,
                         'E_ts': tm.makeTimeSeries()[:24].index})
 
-        # bool is considered numeric in describe, although not an np.number
         desc = df.describe()
-        expected_cols = ['C_bool', 'D_num']
+        expected_cols = ['D_num']
         expected = DataFrame(dict((k, df[k].describe())
                                   for k in expected_cols),
                              columns=expected_cols)