MOVE: describe to pandas/core/describe.py (#39102)

ivanovmg · web-flow · commit c4f15e4d0e86 · 2021-01-11T14:58:27.000-05:00
diff --git a/pandas/core/describe.py b/pandas/core/describe.py
@@ -0,0 +1,205 @@
+"""
+Module responsible for execution of NDFrame.describe() method.
+
+Method NDFrame.describe() delegates actual execution to function describe_ndframe().
+"""
+
+from typing import TYPE_CHECKING, List, Optional, Sequence, Union
+import warnings
+
+import numpy as np
+
+from pandas._libs.tslibs import Timestamp
+from pandas._typing import FrameOrSeries, Hashable
+from pandas.util._validators import validate_percentile
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_datetime64_any_dtype,
+    is_numeric_dtype,
+    is_timedelta64_dtype,
+)
+
+from pandas.core.reshape.concat import concat
+
+from pandas.io.formats.format import format_percentiles
+
+if TYPE_CHECKING:
+    from pandas import Series
+
+
+def describe_ndframe(
+    *,
+    obj: FrameOrSeries,
+    include: Optional[Union[str, Sequence[str]]],
+    exclude: Optional[Union[str, Sequence[str]]],
+    datetime_is_numeric: bool,
+    percentiles: Optional[Sequence[float]],
+) -> FrameOrSeries:
+    """Describe series or dataframe.
+
+    Called from pandas.core.generic.NDFrame.describe()
+
+    Parameters
+    ----------
+    obj : DataFrame or Series
+        Either dataframe or series to be described.
+    include : 'all', list-like of dtypes or None (default), optional
+        A white list of data types to include in the result. Ignored for ``Series``.
+    exclude : list-like of dtypes or None (default), optional
+        A black list of data types to omit from the result. Ignored for ``Series``.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
+    percentiles : list-like of numbers, optional
+        The percentiles to include in the output. All should fall between 0 and 1.
+        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
+        75th percentiles.
+
+    Returns
+    -------
+    Dataframe or series description.
+    """
+    if obj.ndim == 2 and obj.columns.size == 0:
+        raise ValueError("Cannot describe a DataFrame without columns")
+
+    if percentiles is not None:
+        # explicit conversion of `percentiles` to list
+        percentiles = list(percentiles)
+
+        # validate that all percentiles are in [0, 1]
+        validate_percentile(percentiles)
+
+        # median should always be included
+        if 0.5 not in percentiles:
+            percentiles.append(0.5)
+        percentiles = np.asarray(percentiles)
+    else:
+        percentiles = np.array([0.25, 0.5, 0.75])
+
+    # sort and check for duplicates
+    unique_pcts = np.unique(percentiles)
+    assert percentiles is not None
+    if len(unique_pcts) < len(percentiles):
+        raise ValueError("percentiles cannot contain duplicates")
+    percentiles = unique_pcts
+
+    formatted_percentiles = format_percentiles(percentiles)
+
+    def describe_numeric_1d(series) -> "Series":
+        from pandas import Series
+
+        stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
+        d = (
+            [series.count(), series.mean(), series.std(), series.min()]
+            + series.quantile(percentiles).tolist()
+            + [series.max()]
+        )
+        return Series(d, index=stat_index, name=series.name)
+
+    def describe_categorical_1d(data) -> "Series":
+        names = ["count", "unique"]
+        objcounts = data.value_counts()
+        count_unique = len(objcounts[objcounts != 0])
+        result = [data.count(), count_unique]
+        dtype = None
+        if result[1] > 0:
+            top, freq = objcounts.index[0], objcounts.iloc[0]
+            if is_datetime64_any_dtype(data.dtype):
+                if obj.ndim == 1:
+                    stacklevel = 5
+                else:
+                    stacklevel = 6
+                warnings.warn(
+                    "Treating datetime data as categorical rather than numeric in "
+                    "`.describe` is deprecated and will be removed in a future "
+                    "version of pandas. Specify `datetime_is_numeric=True` to "
+                    "silence this warning and adopt the future behavior now.",
+                    FutureWarning,
+                    stacklevel=stacklevel,
+                )
+                tz = data.dt.tz
+                asint = data.dropna().values.view("i8")
+                top = Timestamp(top)
+                if top.tzinfo is not None and tz is not None:
+                    # Don't tz_localize(None) if key is already tz-aware
+                    top = top.tz_convert(tz)
+                else:
+                    top = top.tz_localize(tz)
+                names += ["top", "freq", "first", "last"]
+                result += [
+                    top,
+                    freq,
+                    Timestamp(asint.min(), tz=tz),
+                    Timestamp(asint.max(), tz=tz),
+                ]
+            else:
+                names += ["top", "freq"]
+                result += [top, freq]
+
+        # If no value has a nonzero count (e.g. empty data), set 'top' and
+        # 'freq' to NaN to maintain output shape consistency
+        else:
+            names += ["top", "freq"]
+            result += [np.nan, np.nan]
+            dtype = "object"
+
+        from pandas import Series
+
+        return Series(result, index=names, name=data.name, dtype=dtype)
+
+    def describe_timestamp_1d(data) -> "Series":
+        # GH-30164
+        from pandas import Series
+
+        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
+        d = (
+            [data.count(), data.mean(), data.min()]
+            + data.quantile(percentiles).tolist()
+            + [data.max()]
+        )
+        return Series(d, index=stat_index, name=data.name)
+
+    def describe_1d(data) -> "Series":
+        if is_bool_dtype(data.dtype):
+            return describe_categorical_1d(data)
+        elif is_numeric_dtype(data):
+            return describe_numeric_1d(data)
+        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
+            return describe_timestamp_1d(data)
+        elif is_timedelta64_dtype(data.dtype):
+            return describe_numeric_1d(data)
+        else:
+            return describe_categorical_1d(data)
+
+    if obj.ndim == 1:
+        # Incompatible return value type
+        #  (got "Series", expected "FrameOrSeries")  [return-value]
+        return describe_1d(obj)  # type:ignore[return-value]
+    elif (include is None) and (exclude is None):
+        # when some numerics are found, keep only numerics
+        default_include = [np.number]
+        if datetime_is_numeric:
+            default_include.append("datetime")
+        data = obj.select_dtypes(include=default_include)
+        if len(data.columns) == 0:
+            data = obj
+    elif include == "all":
+        if exclude is not None:
+            msg = "exclude must be None when include is 'all'"
+            raise ValueError(msg)
+        data = obj
+    else:
+        data = obj.select_dtypes(include=include, exclude=exclude)
+
+    ldesc = [describe_1d(s) for _, s in data.items()]
+    # set a convenient order for rows
+    names: List[Hashable] = []
+    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
+    for idxnames in ldesc_indexes:
+        for name in idxnames:
+            if name not in names:
+                names.append(name)
+
+    d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
+    d.columns = data.columns.copy()
+    return d
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -57,11 +57,7 @@
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError, InvalidIndexError
 from pandas.util._decorators import doc, rewrite_axis_style_signature
-from pandas.util._validators import (
-    validate_bool_kwarg,
-    validate_fillna_kwargs,
-    validate_percentile,
-)
+from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
 
 from pandas.core.dtypes.common import (
     ensure_int64,
@@ -95,6 +91,7 @@
 from pandas.core.base import PandasObject, SelectionMixin
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype, extract_array
+from pandas.core.describe import describe_ndframe
 from pandas.core.flags import Flags
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import (
@@ -113,11 +110,7 @@
 from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window
 
 from pandas.io.formats import format as fmt
-from pandas.io.formats.format import (
-    DataFrameFormatter,
-    DataFrameRenderer,
-    format_percentiles,
-)
+from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer
 from pandas.io.formats.printing import pprint_thing
 
 if TYPE_CHECKING:
@@ -10084,145 +10077,13 @@ def describe(
         75%            NaN      2.5
         max            NaN      3.0
         """
-        if self.ndim == 2 and self.columns.size == 0:
-            raise ValueError("Cannot describe a DataFrame without columns")
-
-        if percentiles is not None:
-            # explicit conversion of `percentiles` to list
-            percentiles = list(percentiles)
-
-            # get them all to be in [0, 1]
-            validate_percentile(percentiles)
-
-            # median should always be included
-            if 0.5 not in percentiles:
-                percentiles.append(0.5)
-            percentiles = np.asarray(percentiles)
-        else:
-            percentiles = np.array([0.25, 0.5, 0.75])
-
-        # sort and check for duplicates
-        unique_pcts = np.unique(percentiles)
-        if len(unique_pcts) < len(percentiles):
-            raise ValueError("percentiles cannot contain duplicates")
-        percentiles = unique_pcts
-
-        formatted_percentiles = format_percentiles(percentiles)
-
-        def describe_numeric_1d(series) -> "Series":
-            stat_index = (
-                ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
-            )
-            d = (
-                [series.count(), series.mean(), series.std(), series.min()]
-                + series.quantile(percentiles).tolist()
-                + [series.max()]
-            )
-            return pd.Series(d, index=stat_index, name=series.name)
-
-        def describe_categorical_1d(data) -> "Series":
-            names = ["count", "unique"]
-            objcounts = data.value_counts()
-            count_unique = len(objcounts[objcounts != 0])
-            result = [data.count(), count_unique]
-            dtype = None
-            if result[1] > 0:
-                top, freq = objcounts.index[0], objcounts.iloc[0]
-                if is_datetime64_any_dtype(data.dtype):
-                    if self.ndim == 1:
-                        stacklevel = 4
-                    else:
-                        stacklevel = 5
-                    warnings.warn(
-                        "Treating datetime data as categorical rather than numeric in "
-                        "`.describe` is deprecated and will be removed in a future "
-                        "version of pandas. Specify `datetime_is_numeric=True` to "
-                        "silence this warning and adopt the future behavior now.",
-                        FutureWarning,
-                        stacklevel=stacklevel,
-                    )
-                    tz = data.dt.tz
-                    asint = data.dropna().values.view("i8")
-                    top = Timestamp(top)
-                    if top.tzinfo is not None and tz is not None:
-                        # Don't tz_localize(None) if key is already tz-aware
-                        top = top.tz_convert(tz)
-                    else:
-                        top = top.tz_localize(tz)
-                    names += ["top", "freq", "first", "last"]
-                    result += [
-                        top,
-                        freq,
-                        Timestamp(asint.min(), tz=tz),
-                        Timestamp(asint.max(), tz=tz),
-                    ]
-                else:
-                    names += ["top", "freq"]
-                    result += [top, freq]
-
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                names += ["top", "freq"]
-                result += [np.nan, np.nan]
-                dtype = "object"
-
-            return pd.Series(result, index=names, name=data.name, dtype=dtype)
-
-        def describe_timestamp_1d(data) -> "Series":
-            # GH-30164
-            stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
-            d = (
-                [data.count(), data.mean(), data.min()]
-                + data.quantile(percentiles).tolist()
-                + [data.max()]
-            )
-            return pd.Series(d, index=stat_index, name=data.name)
-
-        def describe_1d(data) -> "Series":
-            if is_bool_dtype(data.dtype):
-                return describe_categorical_1d(data)
-            elif is_numeric_dtype(data):
-                return describe_numeric_1d(data)
-            elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
-                return describe_timestamp_1d(data)
-            elif is_timedelta64_dtype(data.dtype):
-                return describe_numeric_1d(data)
-            else:
-                return describe_categorical_1d(data)
-
-        if self.ndim == 1:
-            # Incompatible return value type
-            #  (got "Series", expected "FrameOrSeries")  [return-value]
-            return describe_1d(self)  # type:ignore[return-value]
-        elif (include is None) and (exclude is None):
-            # when some numerics are found, keep only numerics
-            default_include = [np.number]
-            if datetime_is_numeric:
-                default_include.append("datetime")
-            data = self.select_dtypes(include=default_include)
-            if len(data.columns) == 0:
-                data = self
-        elif include == "all":
-            if exclude is not None:
-                msg = "exclude must be None when include is 'all'"
-                raise ValueError(msg)
-            data = self
-        else:
-            data = self.select_dtypes(include=include, exclude=exclude)
-
-        ldesc = [describe_1d(s) for _, s in data.items()]
-        # set a convenient order for rows
-        names: List[Hashable] = []
-        ldesc_indexes = sorted((x.index for x in ldesc), key=len)
-        for idxnames in ldesc_indexes:
-            for name in idxnames:
-                if name not in names:
-                    names.append(name)
-
-        d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
-        d.columns = data.columns.copy()
-        return d
+        return describe_ndframe(
+            obj=self,
+            include=include,
+            exclude=exclude,
+            datetime_is_numeric=datetime_is_numeric,
+            percentiles=percentiles,
+        )
 
     @final
     def pct_change(