From 126075fe8f14180db25d344eb9ec7bd20c22e187 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Tue, 12 Jan 2021 13:37:21 +0700
Subject: [PATCH 1/2] REF: eliminate inner functions in describe

---
 pandas/core/describe.py | 194 +++++++++++++++++++++-------------------
 1 file changed, 104 insertions(+), 90 deletions(-)

diff --git a/pandas/core/describe.py b/pandas/core/describe.py
index 1b5fbaf0e78f9..f6cbeb2283b57 100644
--- a/pandas/core/describe.py
+++ b/pandas/core/describe.py
@@ -83,98 +83,15 @@ def describe_ndframe(
         raise ValueError("percentiles cannot contain duplicates")
     percentiles = unique_pcts
 
-    formatted_percentiles = format_percentiles(percentiles)
-
-    def describe_numeric_1d(series) -> "Series":
-        from pandas import Series
-
-        stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
-        d = (
-            [series.count(), series.mean(), series.std(), series.min()]
-            + series.quantile(percentiles).tolist()
-            + [series.max()]
-        )
-        return Series(d, index=stat_index, name=series.name)
-
-    def describe_categorical_1d(data) -> "Series":
-        names = ["count", "unique"]
-        objcounts = data.value_counts()
-        count_unique = len(objcounts[objcounts != 0])
-        result = [data.count(), count_unique]
-        dtype = None
-        if result[1] > 0:
-            top, freq = objcounts.index[0], objcounts.iloc[0]
-            if is_datetime64_any_dtype(data.dtype):
-                if obj.ndim == 1:
-                    stacklevel = 5
-                else:
-                    stacklevel = 6
-                warnings.warn(
-                    "Treating datetime data as categorical rather than numeric in "
-                    "`.describe` is deprecated and will be removed in a future "
-                    "version of pandas. Specify `datetime_is_numeric=True` to "
-                    "silence this warning and adopt the future behavior now.",
-                    FutureWarning,
-                    stacklevel=stacklevel,
-                )
-                tz = data.dt.tz
-                asint = data.dropna().values.view("i8")
-                top = Timestamp(top)
-                if top.tzinfo is not None and tz is not None:
-                    # Don't tz_localize(None) if key is already tz-aware
-                    top = top.tz_convert(tz)
-                else:
-                    top = top.tz_localize(tz)
-                names += ["top", "freq", "first", "last"]
-                result += [
-                    top,
-                    freq,
-                    Timestamp(asint.min(), tz=tz),
-                    Timestamp(asint.max(), tz=tz),
-                ]
-            else:
-                names += ["top", "freq"]
-                result += [top, freq]
-
-        # If the DataFrame is empty, set 'top' and 'freq' to None
-        # to maintain output shape consistency
-        else:
-            names += ["top", "freq"]
-            result += [np.nan, np.nan]
-            dtype = "object"
-
-        from pandas import Series
-
-        return Series(result, index=names, name=data.name, dtype=dtype)
-
-    def describe_timestamp_1d(data) -> "Series":
-        # GH-30164
-        from pandas import Series
-
-        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
-        d = (
-            [data.count(), data.mean(), data.min()]
-            + data.quantile(percentiles).tolist()
-            + [data.max()]
-        )
-        return Series(d, index=stat_index, name=data.name)
-
-    def describe_1d(data) -> "Series":
-        if is_bool_dtype(data.dtype):
-            return describe_categorical_1d(data)
-        elif is_numeric_dtype(data):
-            return describe_numeric_1d(data)
-        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
-            return describe_timestamp_1d(data)
-        elif is_timedelta64_dtype(data.dtype):
-            return describe_numeric_1d(data)
-        else:
-            return describe_categorical_1d(data)
-
     if obj.ndim == 1:
         # Incompatible return value type
         #  (got "Series", expected "FrameOrSeries")  [return-value]
-        return describe_1d(obj)  # type:ignore[return-value]
+        return describe_1d(
+            obj,
+            percentiles,
+            datetime_is_numeric,
+            is_series=True,
+        )  # type:ignore[return-value]
     elif (include is None) and (exclude is None):
         # when some numerics are found, keep only numerics
         default_include = [np.number]
@@ -191,7 +108,10 @@ def describe_1d(data) -> "Series":
     else:
         data = obj.select_dtypes(include=include, exclude=exclude)
 
-    ldesc = [describe_1d(s) for _, s in data.items()]
+    ldesc = [
+        describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
+        for _, s in data.items()
+    ]
     # set a convenient order for rows
     names: List[Hashable] = []
     ldesc_indexes = sorted((x.index for x in ldesc), key=len)
@@ -203,3 +123,97 @@ def describe_1d(data) -> "Series":
     d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
     d.columns = data.columns.copy()
     return d
+
+
+def describe_numeric_1d(series, percentiles) -> "Series":
+    from pandas import Series
+
+    formatted_percentiles = format_percentiles(percentiles)
+
+    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
+    d = (
+        [series.count(), series.mean(), series.std(), series.min()]
+        + series.quantile(percentiles).tolist()
+        + [series.max()]
+    )
+    return Series(d, index=stat_index, name=series.name)
+
+
+def describe_categorical_1d(data, is_series) -> "Series":
+    names = ["count", "unique"]
+    objcounts = data.value_counts()
+    count_unique = len(objcounts[objcounts != 0])
+    result = [data.count(), count_unique]
+    dtype = None
+    if result[1] > 0:
+        top, freq = objcounts.index[0], objcounts.iloc[0]
+        if is_datetime64_any_dtype(data.dtype):
+            if is_series:
+                stacklevel = 5
+            else:
+                stacklevel = 6
+            warnings.warn(
+                "Treating datetime data as categorical rather than numeric in "
+                "`.describe` is deprecated and will be removed in a future "
+                "version of pandas. Specify `datetime_is_numeric=True` to "
+                "silence this warning and adopt the future behavior now.",
+                FutureWarning,
+                stacklevel=stacklevel,
+            )
+            tz = data.dt.tz
+            asint = data.dropna().values.view("i8")
+            top = Timestamp(top)
+            if top.tzinfo is not None and tz is not None:
+                # Don't tz_localize(None) if key is already tz-aware
+                top = top.tz_convert(tz)
+            else:
+                top = top.tz_localize(tz)
+            names += ["top", "freq", "first", "last"]
+            result += [
+                top,
+                freq,
+                Timestamp(asint.min(), tz=tz),
+                Timestamp(asint.max(), tz=tz),
+            ]
+        else:
+            names += ["top", "freq"]
+            result += [top, freq]
+
+    # If there are no non-null values to count, set 'top' and 'freq' to NaN
+    # to maintain output shape consistency
+    else:
+        names += ["top", "freq"]
+        result += [np.nan, np.nan]
+        dtype = "object"
+
+    from pandas import Series
+
+    return Series(result, index=names, name=data.name, dtype=dtype)
+
+
+def describe_timestamp_1d(data, percentiles) -> "Series":
+    # GH-30164
+    from pandas import Series
+
+    formatted_percentiles = format_percentiles(percentiles)
+
+    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
+    d = (
+        [data.count(), data.mean(), data.min()]
+        + data.quantile(percentiles).tolist()
+        + [data.max()]
+    )
+    return Series(d, index=stat_index, name=data.name)
+
+
+def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series":
+    if is_bool_dtype(data.dtype):
+        return describe_categorical_1d(data, is_series)
+    elif is_numeric_dtype(data):
+        return describe_numeric_1d(data, percentiles)
+    elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
+        return describe_timestamp_1d(data, percentiles)
+    elif is_timedelta64_dtype(data.dtype):
+        return describe_numeric_1d(data, percentiles)
+    else:
+        return describe_categorical_1d(data, is_series)

From 5d56ad8089ce105e7dcae9f6ba89f46402d0a09e Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Tue, 12 Jan 2021 21:40:29 +0700
Subject: [PATCH 2/2] DOC: add docstrings

---
 pandas/core/describe.py | 46 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/pandas/core/describe.py b/pandas/core/describe.py
index f6cbeb2283b57..4a67725449ca8 100644
--- a/pandas/core/describe.py
+++ b/pandas/core/describe.py
@@ -126,6 +126,15 @@ def describe_ndframe(
 
 
 def describe_numeric_1d(series, percentiles) -> "Series":
+    """Describe series containing numerical data.
+
+    Parameters
+    ----------
+    series : Series
+        Series to be described.
+    percentiles : list-like of numbers
+        The percentiles to include in the output.
+    """
     from pandas import Series
 
     formatted_percentiles = format_percentiles(percentiles)
@@ -140,6 +149,16 @@ def describe_numeric_1d(series, percentiles) -> "Series":
 
 
 def describe_categorical_1d(data, is_series) -> "Series":
+    """Describe series containing categorical data.
+
+    Parameters
+    ----------
+    data : Series
+        Series to be described.
+    is_series : bool
+        True if the original object is a Series.
+        False if a single column of a DataFrame is being described.
+    """
     names = ["count", "unique"]
     objcounts = data.value_counts()
     count_unique = len(objcounts[objcounts != 0])
@@ -192,6 +211,15 @@ def describe_categorical_1d(data, is_series) -> "Series":
 
 
 def describe_timestamp_1d(data, percentiles) -> "Series":
+    """Describe series containing datetime64 dtype.
+
+    Parameters
+    ----------
+    data : Series
+        Series to be described.
+    percentiles : list-like of numbers
+        The percentiles to include in the output.
+    """
     # GH-30164
     from pandas import Series
 
@@ -207,6 +235,24 @@ def describe_timestamp_1d(data, percentiles) -> "Series":
 
 
 def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series":
+    """Describe series.
+
+    Parameters
+    ----------
+    data : Series
+        Series to be described.
+    percentiles : list-like of numbers
+        The percentiles to include in the output.
+    datetime_is_numeric : bool
+        Whether to treat datetime dtypes as numeric.
+    is_series : bool
+        True if the original object is a Series.
+        False if a single column of a DataFrame is being described.
+
+    Returns
+    -------
+    Series
+    """
     if is_bool_dtype(data.dtype):
         return describe_categorical_1d(data, is_series)
     elif is_numeric_dtype(data):