Merge pull request #7088 from TomAugspurger/describe-quantiles

Tom Augspurger · Tom Augspurger · commit f26e668839ce · 2014-05-13T19:45:22.000-05:00
ENH/API: accept list-like percentiles in describe (WIP)
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -454,6 +454,7 @@ non-null values:
    series[10:20]  = 5
    series.nunique()
 
+.. _basics.describe:
 
 Summarizing data: describe
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -471,7 +472,13 @@ course):
     frame.ix[::2] = np.nan
     frame.describe()
 
-.. _basics.describe:
+You can select specific percentiles to include in the output:
+
+.. ipython:: python
+
+    series.describe(percentiles=[.05, .25, .75, .95])
+
+The median is always included in the output, even if it is not listed in ``percentiles``.
 
 For a non-numerical Series object, `describe` will give a simple summary of the
 number of unique values and most frequently occurring values:
@@ -482,6 +489,7 @@ number of unique values and most frequently occurring values:
    s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
    s.describe()
 
+
 There also is a utility function, ``value_range`` which takes a DataFrame and
 returns a series with the minimum/maximum values in the DataFrame.
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -204,6 +204,8 @@ API Changes
 - Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python
   parser when no options are ignored (:issue:`6607`)
 - Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)
+- :meth:`DataFrame.describe` on a DataFrame with a mix of Timestamp and string-like objects
+  returns a different Index (:issue:`7088`). Previously the index was unintentionally sorted.
 
 Deprecations
 ~~~~~~~~~~~~
@@ -250,6 +252,10 @@ Deprecations
 - The support for the 'mysql' flavor when using DBAPI connection objects has been deprecated.
   MySQL will be further supported with SQLAlchemy engines (:issue:`6900`).
 
+- The ``percentile_width`` keyword argument in :meth:`~DataFrame.describe` has been deprecated.
+  Use the ``percentiles`` keyword instead, which takes a list of percentiles to display. The
+  default output is unchanged.
+
 Prior Version Deprecations/Changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -339,6 +345,7 @@ Improvements to existing features
 - ``boxplot`` now supports ``layout`` keyword (:issue:`6769`)
 - Regression in the display of a MultiIndexed Series with ``display.max_rows`` is less than the
   length of the series (:issue:`7101`)
+- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -196,6 +196,8 @@ API changes
 - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`), this was a regression
   from 0.13.1
 - Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)
+- ``describe`` on a DataFrame with a mix of Timestamp and string-like objects returns a different Index (:issue:`7088`).
+  Previously the index was unintentionally sorted.
 
 .. _whatsnew_0140.display:
 
@@ -511,6 +513,10 @@ Deprecations
 - The support for the 'mysql' flavor when using DBAPI connection objects has been deprecated.
   MySQL will be further supported with SQLAlchemy engines (:issue:`6900`).
 
+- The ``percentile_width`` keyword argument in :meth:`~DataFrame.describe` has been deprecated.
+  Use the ``percentiles`` keyword instead, which takes a list of percentiles to display. The
+  default output is unchanged.
+
 .. _whatsnew_0140.enhancements:
 
 Enhancements
@@ -577,6 +583,7 @@ Enhancements
 - ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
 - :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of
   quantiles.
+- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`)
 - ``pivot_table`` can now accept ``Grouper`` by ``index`` and ``columns`` keywords (:issue:`6913`)
 
   .. ipython:: python
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3808,54 +3808,6 @@ def corrwith(self, other, axis=0, drop=False):
 
         return correl
 
-    def describe(self, percentile_width=50):
-        """
-        Generate various summary statistics of each column, excluding
-        NaN values. These include: count, mean, std, min, max, and
-        lower%/50%/upper% percentiles
-
-        Parameters
-        ----------
-        percentile_width : float, optional
-            width of the desired uncertainty interval, default is 50,
-            which corresponds to lower=25, upper=75
-
-        Returns
-        -------
-        DataFrame of summary statistics
-        """
-        numdata = self._get_numeric_data()
-
-        if len(numdata.columns) == 0:
-            return DataFrame(dict((k, v.describe())
-                                  for k, v in compat.iteritems(self)),
-                             columns=self.columns)
-
-        lb = .5 * (1. - percentile_width / 100.)
-        ub = 1. - lb
-
-        def pretty_name(x):
-            x *= 100
-            if x == int(x):
-                return '%.0f%%' % x
-            else:
-                return '%.1f%%' % x
-
-        destat_columns = ['count', 'mean', 'std', 'min',
-                          pretty_name(lb), '50%', pretty_name(ub),
-                          'max']
-
-        destat = []
-
-        for i in range(len(numdata.columns)):
-            series = numdata.iloc[:, i]
-            destat.append([series.count(), series.mean(), series.std(),
-                           series.min(), series.quantile(lb), series.median(),
-                           series.quantile(ub), series.max()])
-
-        return self._constructor(lmap(list, zip(*destat)),
-                                 index=destat_columns, columns=numdata.columns)
-
     #----------------------------------------------------------------------
     # ndarray-like stats methods
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -19,7 +19,7 @@
 import pandas.core.common as com
 import pandas.core.datetools as datetools
 from pandas import compat, _np_version_under1p7
-from pandas.compat import map, zip, lrange, string_types, isidentifier
+from pandas.compat import map, zip, lrange, string_types, isidentifier, lmap
 from pandas.core.common import (isnull, notnull, is_list_like,
                                 _values_from_object, _maybe_promote, _maybe_box_datetimelike,
                                 ABCSeries, SettingWithCopyError, SettingWithCopyWarning)
@@ -3478,6 +3478,154 @@ def _convert_timedeltas(x):
 
         return np.abs(self)
 
+    _shared_docs['describe'] = """
+        Generate various summary statistics, excluding NaN values.
+
+        Parameters
+        ----------
+        percentile_width : float, deprecated
+            Width of the desired uncertainty interval; the default of 50
+            corresponds to lower=25, upper=75. This argument will be
+            removed in a future version. Use ``percentiles`` instead.
+            Cannot be combined with ``percentiles``.
+        percentiles : array-like, optional
+            The percentiles to include in the output. Should all
+            be in the interval [0, 1]. By default `percentiles` is
+            [.25, .5, .75], returning the 25th, 50th, and 75th percentiles.
+
+        Returns
+        -------
+        summary: %(klass)s of summary statistics
+
+        Notes
+        -----
+        For numeric dtypes the index includes: count, mean, std, min,
+        max, and the requested percentiles (the median is always included).
+
+        If self is of object dtypes (e.g. timestamps or strings), the output
+        will include the count, unique, most common, and frequency of the
+        most common. Timestamps also include the first and last items.
+
+        If multiple values have the highest count, then the
+        ``top`` and ``freq`` pair will be arbitrarily chosen from
+        among those with the highest count.
+        """
+
+    @Appender(_shared_docs['describe'] % _shared_doc_kwargs)
+    def describe(self, percentile_width=None, percentiles=None):
+        if self.ndim >= 3:
+            msg = "describe is not implemented on Panel or PanelND objects."
+            raise NotImplementedError(msg)
+
+        if percentile_width is not None and percentiles is not None:
+            msg = "Cannot specify both 'percentile_width' and 'percentiles.'"
+            raise ValueError(msg)
+        if percentiles is not None:
+            # percentiles must be in [0, 1]; reject values above 1 and suggest rescaled ones
+            percentiles = np.asarray(percentiles)
+            if (percentiles > 1).any():
+                percentiles = percentiles / 100.0
+                msg = ("percentiles should all be in the interval [0, 1]. "
+                       "Try {0} instead.")
+                raise ValueError(msg.format(list(percentiles)))
+        else:
+            # only warn if they change the default
+            if percentile_width is not None:
+                do_warn = True
+            else:
+                do_warn = False
+            percentile_width = percentile_width or 50
+            lb = .5 * (1. - percentile_width / 100.)
+            ub = 1. - lb
+            percentiles = np.array([lb, 0.5, ub])
+            if do_warn:
+                msg = ("The `percentile_width` keyword is deprecated. "
+                       "Use percentiles={0} instead".format(list(percentiles)))
+                warnings.warn(msg, FutureWarning)
+
+        # median should always be included
+        if (percentiles != 0.5).all():  # median isn't included
+            lh = percentiles[percentiles < .5]
+            uh = percentiles[percentiles > .5]
+            percentiles = np.hstack([lh, 0.5, uh])
+
+        # dtypes: numeric only, numeric mixed, objects only
+        data = self._get_numeric_data()
+        if self.ndim > 1:
+            if len(data._info_axis) == 0:
+                is_object = True
+            else:
+                is_object = False
+        else:
+            is_object = not self._is_numeric_mixed_type
+
+        def pretty_name(x):
+            x *= 100
+            if x == int(x):
+                return '%.0f%%' % x
+            else:
+                return '%.1f%%' % x
+
+        def describe_numeric_1d(series, percentiles):
+                return ([series.count(), series.mean(), series.std(),
+                         series.min()] +
+                        [series.quantile(x) for x in percentiles] +
+                        [series.max()])
+
+        def describe_categorical_1d(data):
+            if data.dtype == object:
+                names = ['count', 'unique']
+                objcounts = data.value_counts()
+                result = [data.count(), len(objcounts)]
+                if result[1] > 0:
+                    names += ['top', 'freq']
+                    top, freq = objcounts.index[0], objcounts.iloc[0]
+                    result += [top, freq]
+
+            elif issubclass(data.dtype.type, np.datetime64):
+                names = ['count', 'unique']
+                asint = data.dropna().values.view('i8')
+                objcounts = compat.Counter(asint)
+                result = [data.count(), len(objcounts)]
+                if result[1] > 0:
+                    top, freq = objcounts.most_common(1)[0]
+                    names += ['first', 'last', 'top', 'freq']
+                    result += [lib.Timestamp(asint.min()),
+                               lib.Timestamp(asint.max()),
+                               lib.Timestamp(top), freq]
+
+            return pd.Series(result, index=names)
+
+        if is_object:
+            if data.ndim == 1:
+                return describe_categorical_1d(self)
+            else:
+                result = pd.DataFrame(dict((k, describe_categorical_1d(v))
+                                           for k, v in compat.iteritems(self)),
+                                      columns=self._info_axis,
+                                      index=['count', 'unique', 'first', 'last',
+                                             'top', 'freq'])
+                # just objects, no datetime
+                if pd.isnull(result.loc['first']).all():
+                    result = result.drop(['first', 'last'], axis=0)
+                return result
+        else:
+            stat_index = (['count', 'mean', 'std', 'min'] +
+                          [pretty_name(x) for x in percentiles] +
+                          ['max'])
+            if data.ndim == 1:
+                return pd.Series(describe_numeric_1d(data, percentiles),
+                                 index=stat_index)
+            else:
+                destat = []
+                for i in range(len(data._info_axis)):  # BAD
+                    series = data.iloc[:, i]
+                    destat.append(describe_numeric_1d(series, percentiles))
+
+                return self._constructor(lmap(list, zip(*destat)),
+                                         index=stat_index,
+                                         columns=data._info_axis)
+
     _shared_docs['pct_change'] = """
         Percent change over given number of periods.
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1267,67 +1267,6 @@ def multi(values, qs):
     def ptp(self, axis=None, out=None):
         return _values_from_object(self).ptp(axis, out)
 
-    def describe(self, percentile_width=50):
-        """
-        Generate various summary statistics of Series, excluding NaN
-        values. These include: count, mean, std, min, max, and
-        lower%/50%/upper% percentiles
-
-        Parameters
-        ----------
-        percentile_width : float, optional
-            width of the desired uncertainty interval, default is 50,
-            which corresponds to lower=25, upper=75
-
-        Returns
-        -------
-        desc : Series
-        """
-        from pandas.compat import Counter
-
-        if self.dtype == object:
-            names = ['count', 'unique']
-            objcounts = Counter(self.dropna().values)
-            data = [self.count(), len(objcounts)]
-            if data[1] > 0:
-                names += ['top', 'freq']
-                top, freq = objcounts.most_common(1)[0]
-                data += [top, freq]
-
-        elif issubclass(self.dtype.type, np.datetime64):
-            names = ['count', 'unique']
-            asint = self.dropna().values.view('i8')
-            objcounts = Counter(asint)
-            data = [self.count(), len(objcounts)]
-            if data[1] > 0:
-                top, freq = objcounts.most_common(1)[0]
-                names += ['first', 'last', 'top', 'freq']
-                data += [lib.Timestamp(asint.min()),
-                         lib.Timestamp(asint.max()),
-                         lib.Timestamp(top), freq]
-        else:
-
-            lb = .5 * (1. - percentile_width / 100.)
-            ub = 1. - lb
-
-            def pretty_name(x):
-                x *= 100
-                if x == int(x):
-                    return '%.0f%%' % x
-                else:
-                    return '%.1f%%' % x
-
-            names = ['count']
-            data = [self.count()]
-            names += ['mean', 'std', 'min', pretty_name(lb), '50%',
-                      pretty_name(ub), 'max']
-            data += [self.mean(), self.std(), self.min(),
-                     self.quantile(
-                         lb), self.median(), self.quantile(ub),
-                     self.max()]
-
-        return self._constructor(data, index=names).__finalize__(self)
-
     def corr(self, other, method='pearson',
              min_periods=None):
         """
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py