diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 1b5fbaf0e78f9..4a67725449ca8 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -83,98 +83,15 @@ def describe_ndframe( raise ValueError("percentiles cannot contain duplicates") percentiles = unique_pcts - formatted_percentiles = format_percentiles(percentiles) - - def describe_numeric_1d(series) -> "Series": - from pandas import Series - - stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] - d = ( - [series.count(), series.mean(), series.std(), series.min()] - + series.quantile(percentiles).tolist() - + [series.max()] - ) - return Series(d, index=stat_index, name=series.name) - - def describe_categorical_1d(data) -> "Series": - names = ["count", "unique"] - objcounts = data.value_counts() - count_unique = len(objcounts[objcounts != 0]) - result = [data.count(), count_unique] - dtype = None - if result[1] > 0: - top, freq = objcounts.index[0], objcounts.iloc[0] - if is_datetime64_any_dtype(data.dtype): - if obj.ndim == 1: - stacklevel = 5 - else: - stacklevel = 6 - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=stacklevel, - ) - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] - else: - names += ["top", "freq"] - result += [top, freq] - - # If the DataFrame is empty, set 'top' and 'freq' to None - # to maintain output shape consistency - else: - names += ["top", "freq"] - result += [np.nan, np.nan] - dtype = "object" - - from pandas import Series - - return Series(result, index=names, name=data.name, dtype=dtype) - - def describe_timestamp_1d(data) -> "Series": - # GH-30164 - from pandas import Series - - stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] - d = ( - [data.count(), data.mean(), data.min()] - + data.quantile(percentiles).tolist() - + [data.max()] - ) - return Series(d, index=stat_index, name=data.name) - - def describe_1d(data) -> "Series": - if is_bool_dtype(data.dtype): - return describe_categorical_1d(data) - elif is_numeric_dtype(data): - return describe_numeric_1d(data) - elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: - return describe_timestamp_1d(data) - elif is_timedelta64_dtype(data.dtype): - return describe_numeric_1d(data) - else: - return describe_categorical_1d(data) - if obj.ndim == 1: # Incompatible return value type # (got "Series", expected "FrameOrSeries") [return-value] - return describe_1d(obj) # type:ignore[return-value] + return describe_1d( + obj, + percentiles, + datetime_is_numeric, + is_series=True, + ) # type:ignore[return-value] elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics default_include = [np.number] @@ -191,7 +108,10 @@ def describe_1d(data) -> "Series": else: data = obj.select_dtypes(include=include, exclude=exclude) - ldesc = [describe_1d(s) for _, s in data.items()] + ldesc = [ + describe_1d(s, percentiles, datetime_is_numeric, is_series=False) + for _, s in data.items() + ] # set a convenient order for rows names: List[Hashable] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) @@ -203,3 +123,143 @@ def describe_1d(data) -> "Series": d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) d.columns = data.columns.copy() return d + + +def describe_numeric_1d(series, percentiles) -> "Series": + """Describe series containing numerical data. + + Parameters + ---------- + series : Series + Series to be described. + percentiles : list-like of numbers, optional + The percentiles to include in the output. + """ + from pandas import Series + + formatted_percentiles = format_percentiles(percentiles) + + stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] + d = ( + [series.count(), series.mean(), series.std(), series.min()] + + series.quantile(percentiles).tolist() + + [series.max()] + ) + return Series(d, index=stat_index, name=series.name) + + +def describe_categorical_1d(data, is_series) -> "Series": + """Describe series containing categorical data. + + Parameters + ---------- + data : Series + Series to be described. + is_series : bool + True if the original object is a Series. + False if the one column of the DataFrame is described. + """ + names = ["count", "unique"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + result = [data.count(), count_unique] + dtype = None + if result[1] > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + if is_datetime64_any_dtype(data.dtype): + if is_series: + stacklevel = 5 + else: + stacklevel = 6 + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=stacklevel, + ) + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + else: + names += ["top", "freq"] + result += [top, freq] + + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + names += ["top", "freq"] + result += [np.nan, np.nan] + dtype = "object" + + from pandas import Series + + return Series(result, index=names, name=data.name, dtype=dtype) + + +def describe_timestamp_1d(data, percentiles) -> "Series": + """Describe series containing datetime64 dtype. + + Parameters + ---------- + data : Series + Series to be described. + percentiles : list-like of numbers, optional + The percentiles to include in the output. + """ + # GH-30164 + from pandas import Series + + formatted_percentiles = format_percentiles(percentiles) + + stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] + d = ( + [data.count(), data.mean(), data.min()] + + data.quantile(percentiles).tolist() + + [data.max()] + ) + return Series(d, index=stat_index, name=data.name) + + +def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series": + """Describe series. + + Parameters + ---------- + data : Series + Series to be described. + percentiles : list-like of numbers, optional + The percentiles to include in the output. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. + is_series : bool + True if the original object is a Series. + False if the one column of the DataFrame is described. + + Returns + ------- + Series + """ + if is_bool_dtype(data.dtype): + return describe_categorical_1d(data, is_series) + elif is_numeric_dtype(data): + return describe_numeric_1d(data, percentiles) + elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: + return describe_timestamp_1d(data, percentiles) + elif is_timedelta64_dtype(data.dtype): + return describe_numeric_1d(data, percentiles) + else: + return describe_categorical_1d(data, is_series)