DOC: Clarified and expanded describe documentation (#14995)

palewire · jorisvandenbossche · commit 8a78a2d382e1 · 2017-01-02T10:05:47.000+01:00
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -5201,60 +5201,222 @@ def abs(self):
         """
         return np.abs(self)
 
-    _shared_docs['describe'] = """
-        Generate various summary statistics, excluding NaN values.
+    def describe(self, percentiles=None, include=None, exclude=None):
+        """
+        Generates descriptive statistics that summarize the central tendency,
+        dispersion and shape of a dataset's distribution, excluding
+        ``NaN`` values.
+
+        Analyzes both numeric and object series, as well
+        as ``DataFrame`` column sets of mixed data types. The output
+        will vary depending on what is provided. Refer to the notes
+        below for more detail.
 
         Parameters
         ----------
-        percentiles : array-like, optional
-            The percentiles to include in the output. Should all
-            be in the interval [0, 1]. By default `percentiles` is
-            [.25, .5, .75], returning the 25th, 50th, and 75th percentiles.
-        include, exclude : list-like, 'all', or None (default)
-            Specify the form of the returned result. Either:
-
-            - None to both (default). The result will include only
-              numeric-typed columns or, if none are, only categorical columns.
-            - A list of dtypes or strings to be included/excluded.
-              To select all numeric types use numpy numpy.number. To select
-              categorical objects use type object. See also the select_dtypes
-              documentation. eg. df.describe(include=['O'])
-            - If include is the string 'all', the output column-set will
-              match the input one.
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output. All should
+            fall between 0 and 1. The default is
+            ``[.25, .5, .75]``, which returns the 25th, 50th, and
+            75th percentiles.
+        include : 'all', list-like of dtypes or None (default), optional
+            A white list of data types to include in the result. Ignored
+            for ``Series``. Here are the options:
+
+            - 'all' : All columns of the input will be included in the output.
+            - A list-like of dtypes : Limits the results to the
+              provided data types.
+              To limit the result to numeric types submit
+              ``numpy.number``. To limit it instead to categorical
+              objects submit the ``numpy.object`` data type. Strings
+              can also be used in the style of
+              ``select_dtypes`` (e.g. ``df.describe(include=['O'])``)
+            - None (default) : The result will include all numeric columns.
+        exclude : list-like of dtypes or None (default), optional
+            A black list of data types to omit from the result. Ignored
+            for ``Series``. Here are the options:
+
+            - A list-like of dtypes : Excludes the provided data types
+              from the result. To exclude numeric types submit
+              ``numpy.number``. To exclude categorical objects submit the data
+              type ``numpy.object``. Strings can also be used in the style of
+              ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``)
+            - None (default) : The result will exclude nothing.
 
         Returns
         -------
-        summary: %(klass)s of summary statistics
+        summary : Series/DataFrame of summary statistics
 
         Notes
         -----
-        The output DataFrame index depends on the requested dtypes:
-
-        For numeric dtypes, it will include: count, mean, std, min,
-        max, and lower, 50, and upper percentiles.
+        For numeric data, the result's index will include ``count``,
+        ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
+        upper percentiles. By default the lower percentile is ``25`` and the
+        upper percentile is ``75``. The ``50`` percentile is the
+        same as the median.
+
+        For object data (e.g. strings or timestamps), the result's index
+        will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
+        is the most common value. The ``freq`` is the most common value's
+        frequency. Timestamps also include the ``first`` and ``last`` items.
+
+        If multiple object values have the highest count, then the
+        ``top`` and ``freq`` results will be arbitrarily chosen from
+        among those with the highest count.
 
-        For object dtypes (e.g. timestamps or strings), the index
-        will include the count, unique, most common, and frequency of the
-        most common. Timestamps also include the first and last items.
+        For mixed data types provided via a ``DataFrame``, the default is to
+        return only an analysis of numeric columns. If ``include='all'``
+        is provided as an option, the result will include a union of
+        attributes of each type.
 
-        For mixed dtypes, the index will be the union of the corresponding
-        output types. Non-applicable entries will be filled with NaN.
-        Note that mixed-dtype outputs can only be returned from mixed-dtype
-        inputs and appropriate use of the include/exclude arguments.
+        The `include` and `exclude` parameters can be used to limit
+        which columns in a ``DataFrame`` are analyzed for the output.
+        The parameters are ignored when analyzing a ``Series``.
 
-        If multiple values have the highest count, then the
-        `count` and `most common` pair will be arbitrarily chosen from
-        among those with the highest count.
+        Examples
+        --------
+        Describing a numeric ``Series``.
 
-        The include, exclude arguments are ignored for Series.
+        >>> import pandas as pd
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.describe()
+        count    3.0
+        mean     2.0
+        std      1.0
+        min      1.0
+        25%      1.5
+        50%      2.0
+        75%      2.5
+        max      3.0
+
+        Describing a categorical ``Series``.
+
+        >>> s = pd.Series(['a', 'a', 'b', 'c'])
+        >>> s.describe()
+        count     4
+        unique    3
+        top       a
+        freq      2
+        dtype: object
+
+        Describing a timestamp ``Series``.
+
+        >>> import numpy as np
+        >>> s = pd.Series([
+        ...   np.datetime64("2000-01-01"),
+        ...   np.datetime64("2010-01-01"),
+        ...   np.datetime64("2010-01-01")
+        ... ])
+        >>> s.describe()
+        count                       3
+        unique                      2
+        top       2010-01-01 00:00:00
+        freq                        2
+        first     2000-01-01 00:00:00
+        last      2010-01-01 00:00:00
+        dtype: object
+
+        Describing a ``DataFrame``. By default only numeric fields
+        are returned.
+
+        >>> df = pd.DataFrame(
+        ...   [[1, 'a'], [2, 'b'], [3, 'c']],
+        ...   columns=['numeric', 'object']
+        ... )
+        >>> df.describe()
+               numeric
+        count      3.0
+        mean       2.0
+        std        1.0
+        min        1.0
+        25%        1.5
+        50%        2.0
+        75%        2.5
+        max        3.0
+
+        Describing all columns of a ``DataFrame`` regardless of data type.
+
+        >>> df.describe(include='all')
+                numeric object
+        count       3.0      3
+        unique      NaN      3
+        top         NaN      b
+        freq        NaN      1
+        mean        2.0    NaN
+        std         1.0    NaN
+        min         1.0    NaN
+        25%         1.5    NaN
+        50%         2.0    NaN
+        75%         2.5    NaN
+        max         3.0    NaN
+
+        Describing a column from a ``DataFrame`` by accessing it as
+        an attribute.
+
+        >>> df.numeric.describe()
+        count    3.0
+        mean     2.0
+        std      1.0
+        min      1.0
+        25%      1.5
+        50%      2.0
+        75%      2.5
+        max      3.0
+        Name: numeric, dtype: float64
+
+        Including only numeric columns in a ``DataFrame`` description.
+
+        >>> df.describe(include=[np.number])
+               numeric
+        count      3.0
+        mean       2.0
+        std        1.0
+        min        1.0
+        25%        1.5
+        50%        2.0
+        75%        2.5
+        max        3.0
+
+        Including only string columns in a ``DataFrame`` description.
+
+        >>> df.describe(include=[np.object])
+               object
+        count       3
+        unique      3
+        top         b
+        freq        1
+
+        Excluding numeric columns from a ``DataFrame`` description.
+
+        >>> df.describe(exclude=[np.number])
+               object
+        count       3
+        unique      3
+        top         b
+        freq        1
+
+        Excluding object columns from a ``DataFrame`` description.
+
+        >>> df.describe(exclude=[np.object])
+               numeric
+        count      3.0
+        mean       2.0
+        std        1.0
+        min        1.0
+        25%        1.5
+        50%        2.0
+        75%        2.5
+        max        3.0
 
         See Also
         --------
+        DataFrame.count
+        DataFrame.max
+        DataFrame.min
+        DataFrame.mean
+        DataFrame.std
         DataFrame.select_dtypes
         """
-
-    @Appender(_shared_docs['describe'] % _shared_doc_kwargs)
-    def describe(self, percentiles=None, include=None, exclude=None):
         if self.ndim >= 3:
             msg = "describe is not implemented on Panel or PanelND objects."
             raise NotImplementedError(msg)