Skip to content

Commit c4f15e4

Browse files
authored
MOVE: describe to pandas/core/describe.py (#39102)
1 parent 46cec9f commit c4f15e4

File tree

2 files changed

+215
-149
lines changed

2 files changed

+215
-149
lines changed

pandas/core/describe.py

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
"""
2+
Module responsible for execution of NDFrame.describe() method.
3+
4+
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
5+
"""
6+
7+
from typing import TYPE_CHECKING, List, Optional, Sequence, Union
8+
import warnings
9+
10+
import numpy as np
11+
12+
from pandas._libs.tslibs import Timestamp
13+
from pandas._typing import FrameOrSeries, Hashable
14+
from pandas.util._validators import validate_percentile
15+
16+
from pandas.core.dtypes.common import (
17+
is_bool_dtype,
18+
is_datetime64_any_dtype,
19+
is_numeric_dtype,
20+
is_timedelta64_dtype,
21+
)
22+
23+
from pandas.core.reshape.concat import concat
24+
25+
from pandas.io.formats.format import format_percentiles
26+
27+
if TYPE_CHECKING:
28+
from pandas import Series
29+
30+
31+
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj : DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles. The median (``0.5``) is always added when missing.

    Returns
    -------
    DataFrame or Series
        Description of ``obj``: a Series when ``obj`` is one-dimensional,
        otherwise a DataFrame with one column per described column.

    Raises
    ------
    ValueError
        If ``obj`` is a DataFrame without columns, if ``percentiles`` contains
        duplicates, or if ``exclude`` is given together with ``include='all'``.
    """
    # Imported at call time (as the nested helpers originally did) rather than
    # at module level.
    from pandas import Series

    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        pct_list = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(pct_list)

        # median should always be included
        if 0.5 not in pct_list:
            pct_list.append(0.5)
        pct_array = np.asarray(pct_list)
    else:
        pct_array = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates (np.unique returns the sorted unique values)
    unique_pcts = np.unique(pct_array)
    if len(unique_pcts) < len(pct_array):
        raise ValueError("percentiles cannot contain duplicates")
    pct_array = unique_pcts

    formatted_percentiles = format_percentiles(pct_array)

    def describe_numeric_1d(series) -> "Series":
        # count / mean / std / min / requested percentiles / max
        stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
        d = (
            [series.count(), series.mean(), series.std(), series.min()]
            + series.quantile(pct_array).tolist()
            + [series.max()]
        )
        return Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data) -> "Series":
        names = ["count", "unique"]
        objcounts = data.value_counts()
        # only values that actually occur count towards "unique"
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if count_unique > 0:
            # "top"/"freq" are taken from the first entry of value_counts()
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_any_dtype(data.dtype):
                # NOTE(review): the frame path uses one extra stack level,
                # presumably for the comprehension that calls describe_1d;
                # confirm the warning still points at the user's call site.
                if obj.ndim == 1:
                    stacklevel = 5
                else:
                    stacklevel = 6
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                # integer view of the non-null values, used for first/last
                asint = data.dropna().values.view("i8")
                top = Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    Timestamp(asint.min(), tz=tz),
                    Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]

        # If no value has a non-zero count, set 'top' and 'freq' to NaN
        # to maintain output shape consistency
        else:
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"

        return Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data) -> "Series":
        # GH-30164
        # same layout as the numeric description, minus "std"
        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = (
            [data.count(), data.mean(), data.min()]
            + data.quantile(pct_array).tolist()
            + [data.max()]
        )
        return Series(d, index=stat_index, name=data.name)

    def describe_1d(data) -> "Series":
        # Dispatch on dtype; bool is checked before numeric so that boolean
        # data gets the categorical description.
        if is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data.dtype):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if obj.ndim == 1:
        # Incompatible return value type
        # (got "Series", expected "FrameOrSeries")  [return-value]
        return describe_1d(obj)  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            # nothing numeric: fall back to describing every column
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # set a convenient order for rows: labels of the shortest descriptions
    # come first (sorted() is stable), each label kept at its first occurrence
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    names: List[Hashable] = list(
        dict.fromkeys(name for idxnames in ldesc_indexes for name in idxnames)
    )

    d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d

pandas/core/generic.py

+10-149
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,7 @@
5757
from pandas.compat.numpy import function as nv
5858
from pandas.errors import AbstractMethodError, InvalidIndexError
5959
from pandas.util._decorators import doc, rewrite_axis_style_signature
60-
from pandas.util._validators import (
61-
validate_bool_kwarg,
62-
validate_fillna_kwargs,
63-
validate_percentile,
64-
)
60+
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
6561

6662
from pandas.core.dtypes.common import (
6763
ensure_int64,
@@ -95,6 +91,7 @@
9591
from pandas.core.base import PandasObject, SelectionMixin
9692
import pandas.core.common as com
9793
from pandas.core.construction import create_series_with_explicit_dtype, extract_array
94+
from pandas.core.describe import describe_ndframe
9895
from pandas.core.flags import Flags
9996
from pandas.core.indexes import base as ibase
10097
from pandas.core.indexes.api import (
@@ -113,11 +110,7 @@
113110
from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window
114111

115112
from pandas.io.formats import format as fmt
116-
from pandas.io.formats.format import (
117-
DataFrameFormatter,
118-
DataFrameRenderer,
119-
format_percentiles,
120-
)
113+
from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer
121114
from pandas.io.formats.printing import pprint_thing
122115

123116
if TYPE_CHECKING:
@@ -10084,145 +10077,13 @@ def describe(
1008410077
75% NaN 2.5
1008510078
max NaN 3.0
1008610079
"""
10087-
if self.ndim == 2 and self.columns.size == 0:
10088-
raise ValueError("Cannot describe a DataFrame without columns")
10089-
10090-
if percentiles is not None:
10091-
# explicit conversion of `percentiles` to list
10092-
percentiles = list(percentiles)
10093-
10094-
# get them all to be in [0, 1]
10095-
validate_percentile(percentiles)
10096-
10097-
# median should always be included
10098-
if 0.5 not in percentiles:
10099-
percentiles.append(0.5)
10100-
percentiles = np.asarray(percentiles)
10101-
else:
10102-
percentiles = np.array([0.25, 0.5, 0.75])
10103-
10104-
# sort and check for duplicates
10105-
unique_pcts = np.unique(percentiles)
10106-
if len(unique_pcts) < len(percentiles):
10107-
raise ValueError("percentiles cannot contain duplicates")
10108-
percentiles = unique_pcts
10109-
10110-
formatted_percentiles = format_percentiles(percentiles)
10111-
10112-
def describe_numeric_1d(series) -> "Series":
10113-
stat_index = (
10114-
["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
10115-
)
10116-
d = (
10117-
[series.count(), series.mean(), series.std(), series.min()]
10118-
+ series.quantile(percentiles).tolist()
10119-
+ [series.max()]
10120-
)
10121-
return pd.Series(d, index=stat_index, name=series.name)
10122-
10123-
def describe_categorical_1d(data) -> "Series":
10124-
names = ["count", "unique"]
10125-
objcounts = data.value_counts()
10126-
count_unique = len(objcounts[objcounts != 0])
10127-
result = [data.count(), count_unique]
10128-
dtype = None
10129-
if result[1] > 0:
10130-
top, freq = objcounts.index[0], objcounts.iloc[0]
10131-
if is_datetime64_any_dtype(data.dtype):
10132-
if self.ndim == 1:
10133-
stacklevel = 4
10134-
else:
10135-
stacklevel = 5
10136-
warnings.warn(
10137-
"Treating datetime data as categorical rather than numeric in "
10138-
"`.describe` is deprecated and will be removed in a future "
10139-
"version of pandas. Specify `datetime_is_numeric=True` to "
10140-
"silence this warning and adopt the future behavior now.",
10141-
FutureWarning,
10142-
stacklevel=stacklevel,
10143-
)
10144-
tz = data.dt.tz
10145-
asint = data.dropna().values.view("i8")
10146-
top = Timestamp(top)
10147-
if top.tzinfo is not None and tz is not None:
10148-
# Don't tz_localize(None) if key is already tz-aware
10149-
top = top.tz_convert(tz)
10150-
else:
10151-
top = top.tz_localize(tz)
10152-
names += ["top", "freq", "first", "last"]
10153-
result += [
10154-
top,
10155-
freq,
10156-
Timestamp(asint.min(), tz=tz),
10157-
Timestamp(asint.max(), tz=tz),
10158-
]
10159-
else:
10160-
names += ["top", "freq"]
10161-
result += [top, freq]
10162-
10163-
# If the DataFrame is empty, set 'top' and 'freq' to None
10164-
# to maintain output shape consistency
10165-
else:
10166-
names += ["top", "freq"]
10167-
result += [np.nan, np.nan]
10168-
dtype = "object"
10169-
10170-
return pd.Series(result, index=names, name=data.name, dtype=dtype)
10171-
10172-
def describe_timestamp_1d(data) -> "Series":
10173-
# GH-30164
10174-
stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
10175-
d = (
10176-
[data.count(), data.mean(), data.min()]
10177-
+ data.quantile(percentiles).tolist()
10178-
+ [data.max()]
10179-
)
10180-
return pd.Series(d, index=stat_index, name=data.name)
10181-
10182-
def describe_1d(data) -> "Series":
10183-
if is_bool_dtype(data.dtype):
10184-
return describe_categorical_1d(data)
10185-
elif is_numeric_dtype(data):
10186-
return describe_numeric_1d(data)
10187-
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
10188-
return describe_timestamp_1d(data)
10189-
elif is_timedelta64_dtype(data.dtype):
10190-
return describe_numeric_1d(data)
10191-
else:
10192-
return describe_categorical_1d(data)
10193-
10194-
if self.ndim == 1:
10195-
# Incompatible return value type
10196-
# (got "Series", expected "FrameOrSeries") [return-value]
10197-
return describe_1d(self) # type:ignore[return-value]
10198-
elif (include is None) and (exclude is None):
10199-
# when some numerics are found, keep only numerics
10200-
default_include = [np.number]
10201-
if datetime_is_numeric:
10202-
default_include.append("datetime")
10203-
data = self.select_dtypes(include=default_include)
10204-
if len(data.columns) == 0:
10205-
data = self
10206-
elif include == "all":
10207-
if exclude is not None:
10208-
msg = "exclude must be None when include is 'all'"
10209-
raise ValueError(msg)
10210-
data = self
10211-
else:
10212-
data = self.select_dtypes(include=include, exclude=exclude)
10213-
10214-
ldesc = [describe_1d(s) for _, s in data.items()]
10215-
# set a convenient order for rows
10216-
names: List[Hashable] = []
10217-
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
10218-
for idxnames in ldesc_indexes:
10219-
for name in idxnames:
10220-
if name not in names:
10221-
names.append(name)
10222-
10223-
d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
10224-
d.columns = data.columns.copy()
10225-
return d
10080+
return describe_ndframe(
10081+
obj=self,
10082+
include=include,
10083+
exclude=exclude,
10084+
datetime_is_numeric=datetime_is_numeric,
10085+
percentiles=percentiles,
10086+
)
1022610087

1022710088
@final
1022810089
def pct_change(

0 commit comments

Comments
 (0)