From 126075fe8f14180db25d344eb9ec7bd20c22e187 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 12 Jan 2021 13:37:21 +0700 Subject: [PATCH 1/2] REF: eliminate inner functions in describe --- pandas/core/describe.py | 194 +++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 90 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 1b5fbaf0e78f9..f6cbeb2283b57 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -83,98 +83,15 @@ def describe_ndframe( raise ValueError("percentiles cannot contain duplicates") percentiles = unique_pcts - formatted_percentiles = format_percentiles(percentiles) - - def describe_numeric_1d(series) -> "Series": - from pandas import Series - - stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] - d = ( - [series.count(), series.mean(), series.std(), series.min()] - + series.quantile(percentiles).tolist() - + [series.max()] - ) - return Series(d, index=stat_index, name=series.name) - - def describe_categorical_1d(data) -> "Series": - names = ["count", "unique"] - objcounts = data.value_counts() - count_unique = len(objcounts[objcounts != 0]) - result = [data.count(), count_unique] - dtype = None - if result[1] > 0: - top, freq = objcounts.index[0], objcounts.iloc[0] - if is_datetime64_any_dtype(data.dtype): - if obj.ndim == 1: - stacklevel = 5 - else: - stacklevel = 6 - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. 
Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=stacklevel, - ) - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] - else: - names += ["top", "freq"] - result += [top, freq] - - # If the DataFrame is empty, set 'top' and 'freq' to None - # to maintain output shape consistency - else: - names += ["top", "freq"] - result += [np.nan, np.nan] - dtype = "object" - - from pandas import Series - - return Series(result, index=names, name=data.name, dtype=dtype) - - def describe_timestamp_1d(data) -> "Series": - # GH-30164 - from pandas import Series - - stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] - d = ( - [data.count(), data.mean(), data.min()] - + data.quantile(percentiles).tolist() - + [data.max()] - ) - return Series(d, index=stat_index, name=data.name) - - def describe_1d(data) -> "Series": - if is_bool_dtype(data.dtype): - return describe_categorical_1d(data) - elif is_numeric_dtype(data): - return describe_numeric_1d(data) - elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: - return describe_timestamp_1d(data) - elif is_timedelta64_dtype(data.dtype): - return describe_numeric_1d(data) - else: - return describe_categorical_1d(data) - if obj.ndim == 1: # Incompatible return value type # (got "Series", expected "FrameOrSeries") [return-value] - return describe_1d(obj) # type:ignore[return-value] + return describe_1d( + obj, + percentiles, + datetime_is_numeric, + is_series=True, + ) # type:ignore[return-value] elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics 
default_include = [np.number] @@ -191,7 +108,10 @@ def describe_1d(data) -> "Series": else: data = obj.select_dtypes(include=include, exclude=exclude) - ldesc = [describe_1d(s) for _, s in data.items()] + ldesc = [ + describe_1d(s, percentiles, datetime_is_numeric, is_series=False) + for _, s in data.items() + ] # set a convenient order for rows names: List[Hashable] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) @@ -203,3 +123,97 @@ def describe_1d(data) -> "Series": d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) d.columns = data.columns.copy() return d + + +def describe_numeric_1d(series, percentiles) -> "Series": + from pandas import Series + + formatted_percentiles = format_percentiles(percentiles) + + stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] + d = ( + [series.count(), series.mean(), series.std(), series.min()] + + series.quantile(percentiles).tolist() + + [series.max()] + ) + return Series(d, index=stat_index, name=series.name) + + +def describe_categorical_1d(data, is_series) -> "Series": + names = ["count", "unique"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + result = [data.count(), count_unique] + dtype = None + if result[1] > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + if is_datetime64_any_dtype(data.dtype): + if is_series: + stacklevel = 5 + else: + stacklevel = 6 + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. 
Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=stacklevel, + ) + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + else: + names += ["top", "freq"] + result += [top, freq] + + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + names += ["top", "freq"] + result += [np.nan, np.nan] + dtype = "object" + + from pandas import Series + + return Series(result, index=names, name=data.name, dtype=dtype) + + +def describe_timestamp_1d(data, percentiles) -> "Series": + # GH-30164 + from pandas import Series + + formatted_percentiles = format_percentiles(percentiles) + + stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] + d = ( + [data.count(), data.mean(), data.min()] + + data.quantile(percentiles).tolist() + + [data.max()] + ) + return Series(d, index=stat_index, name=data.name) + + +def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series": + if is_bool_dtype(data.dtype): + return describe_categorical_1d(data, is_series) + elif is_numeric_dtype(data): + return describe_numeric_1d(data, percentiles) + elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: + return describe_timestamp_1d(data, percentiles) + elif is_timedelta64_dtype(data.dtype): + return describe_numeric_1d(data, percentiles) + else: + return describe_categorical_1d(data, is_series) From 5d56ad8089ce105e7dcae9f6ba89f46402d0a09e Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 12 Jan 2021 21:40:29 +0700 Subject: [PATCH 2/2] DOC: add docstrings --- pandas/core/describe.py | 
46 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index f6cbeb2283b57..4a67725449ca8 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -126,6 +126,15 @@ def describe_ndframe( def describe_numeric_1d(series, percentiles) -> "Series": + """Describe series containing numerical data. + + Parameters + ---------- + series : Series + Series to be described. + percentiles : list-like of numbers, optional + The percentiles to include in the output. + """ from pandas import Series formatted_percentiles = format_percentiles(percentiles) @@ -140,6 +149,16 @@ def describe_numeric_1d(series, percentiles) -> "Series": def describe_categorical_1d(data, is_series) -> "Series": + """Describe series containing categorical data. + + Parameters + ---------- + data : Series + Series to be described. + is_series : bool + True if the original object is a Series. + False if the one column of the DataFrame is described. + """ names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) @@ -192,6 +211,15 @@ def describe_categorical_1d(data, is_series) -> "Series": def describe_timestamp_1d(data, percentiles) -> "Series": + """Describe series containing datetime64 dtype. + + Parameters + ---------- + data : Series + Series to be described. + percentiles : list-like of numbers, optional + The percentiles to include in the output. + """ # GH-30164 from pandas import Series @@ -207,6 +235,24 @@ def describe_timestamp_1d(data, percentiles) -> "Series": def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series": + """Describe series. + + Parameters + ---------- + data : Series + Series to be described. + percentiles : list-like of numbers, optional + The percentiles to include in the output. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. 
+ is_series : bool + True if the original object is a Series. + False if a single column of a DataFrame is being described. + + Returns + ------- + Series + """ if is_bool_dtype(data.dtype): return describe_categorical_1d(data, is_series) elif is_numeric_dtype(data):