BUG: describe() outputs bool similarly to categorical data

John Fremlin · jreback · commit bcc987c9df06 · 2016-02-27T10:33:24.000-05:00
closes #6625 closes #12458
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -856,21 +856,14 @@ Other API Changes
      ValueError: Cannot convert arg ['20150101 07:00:00'] to a time.
 
 - ``.memory_usage()`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
-
 - ``DataFrame.to_latex()`` now supports non-ascii encodings (eg utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)
-
 - ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)
-
 - ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit, specifying ``fill_value`` will preserve the data type of the original stacked data.  (:issue:`9746`)
-
 - As part of the new API for :ref:`window functions <whatsnew_0180.enhancements.moments>` and :ref:`resampling <whatsnew_0180.breaking.resample>`, aggregation functions have been clarified, raising more informative error messages on invalid aggregations. (:issue:`9052`). A full set of examples are presented in :ref:`groupby <groupby.aggregation>`.
-
 - Statistical functions for ``NDFrame`` objects will now raise if non-numpy-compatible arguments are passed in for ``**kwargs`` (:issue:`12301`)
-
 - ``.to_latex`` and ``.to_html`` gain a ``decimal`` parameter like ``.to_csv``; the default is ``'.'`` (:issue:`12031`)
-
 - More helpful error message when constructing a ``DataFrame`` with empty data but with indices (:issue:`8020`)
-
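+- ``.describe()`` now summarizes ``bool`` dtype data like categorical data (``count``, ``unique``, ``top``, ``freq``) instead of as numeric data, and ``bool`` columns are no longer included in the default output of ``DataFrame.describe()`` when numeric columns are present (:issue:`6625`)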
 
 .. _whatsnew_0180.deprecations:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4875,26 +4875,27 @@ def describe_numeric_1d(series, percentiles):
         def describe_categorical_1d(data):
             names = ['count', 'unique']
             objcounts = data.value_counts()
-            result = [data.count(), len(objcounts[objcounts != 0])]
+            count_unique = len(objcounts[objcounts != 0])
+            result = [data.count(), count_unique]
             if result[1] > 0:
                 top, freq = objcounts.index[0], objcounts.iloc[0]
 
-                if (data.dtype == object or
-                        com.is_categorical_dtype(data.dtype)):
-                    names += ['top', 'freq']
-                    result += [top, freq]
-
-                elif com.is_datetime64_dtype(data):
+                if com.is_datetime64_dtype(data):
                     asint = data.dropna().values.view('i8')
                     names += ['top', 'freq', 'first', 'last']
                     result += [lib.Timestamp(top), freq,
                                lib.Timestamp(asint.min()),
                                lib.Timestamp(asint.max())]
+                else:
+                    names += ['top', 'freq']
+                    result += [top, freq]
 
             return pd.Series(result, index=names, name=data.name)
 
         def describe_1d(data, percentiles):
-            if com.is_numeric_dtype(data):
+            if com.is_bool_dtype(data):
+                return describe_categorical_1d(data)
+            elif com.is_numeric_dtype(data):
                 return describe_numeric_1d(data, percentiles)
             elif com.is_timedelta64_dtype(data):
                 return describe_numeric_1d(data, percentiles)
@@ -4906,7 +4907,7 @@ def describe_1d(data, percentiles):
         elif (include is None) and (exclude is None):
             if len(self._get_numeric_data()._info_axis) > 0:
                 # when some numerics are found, keep only numerics
-                data = self.select_dtypes(include=[np.number, np.bool])
+                data = self.select_dtypes(include=[np.number])
             else:
                 data = self
         elif include == 'all':
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -241,24 +241,21 @@ def test_bool_describe_in_mixed_frame(self):
             'int_data': [10, 20, 30, 40, 50],
         })
 
-        # Boolean data and integer data is included in .describe() output,
-        # string data isn't
-        self.assert_numpy_array_equal(df.describe().columns, [
-                                      'bool_data', 'int_data'])
-
-        bool_describe = df.describe()['bool_data']
-
-        # Both the min and the max values should stay booleans
-        self.assertEqual(bool_describe['min'].dtype, np.bool_)
-        self.assertEqual(bool_describe['max'].dtype, np.bool_)
+        # Integer data are included in .describe() output,
+        # Boolean and string data are not.
+        result = df.describe()
+        expected = DataFrame({'int_data' : [5, 30, df.int_data.std(),
+                                            10, 20, 30, 40, 50]},
+                             index=['count', 'mean', 'std', 'min', '25%',
+                                    '50%', '75%', 'max'])
+        assert_frame_equal(result, expected)
 
-        self.assertFalse(bool_describe['min'])
-        self.assertTrue(bool_describe['max'])
+        # bool data is described categorically; the top value is False
+        result = df.describe(include=['bool'])
 
-        # For numeric operations, like mean or median, the values True/False
-        # are cast to the integer values 1 and 0
-        assert_almost_equal(bool_describe['mean'], 0.4)
-        assert_almost_equal(bool_describe['50%'], 0)
+        expected = DataFrame({'bool_data' : [5, 2, False, 3]},
+                              index=['count', 'unique', 'top', 'freq'])
+        assert_frame_equal(result, expected)
 
     def test_reduce_mixed_frame(self):
         # GH 6806
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -956,7 +956,8 @@ def test_describe_objects(self):
         s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
         result = s.describe()
         expected = Series({'count': 7, 'unique': 4,
-                           'top': 'a', 'freq': 3}, index=result.index)
+                           'top': 'a', 'freq': 3, 'second': 'b',
+                           'second_freq': 2}, index=result.index)
         assert_series_equal(result, expected)
 
         dt = list(self.ts.index)
@@ -1487,9 +1488,8 @@ def test_describe_typefiltering_category_bool(self):
                         'D_num': np.arange(24.) + .5,
                         'E_ts': tm.makeTimeSeries()[:24].index})
 
-        # bool is considered numeric in describe, although not an np.number
         desc = df.describe()
-        expected_cols = ['C_bool', 'D_num']
+        expected_cols = ['D_num']
         expected = DataFrame(dict((k, df[k].describe())
                                   for k in expected_cols),
                              columns=expected_cols)