Skip to content

Commit dba7641

Browse files
authored
REF: eliminate inner functions in describe (#39121)
1 parent 64ef85f commit dba7641

File tree

1 file changed

+150
-90
lines changed

1 file changed

+150
-90
lines changed

pandas/core/describe.py

+150-90
Original file line numberDiff line numberDiff line change
@@ -83,98 +83,15 @@ def describe_ndframe(
8383
raise ValueError("percentiles cannot contain duplicates")
8484
percentiles = unique_pcts
8585

86-
formatted_percentiles = format_percentiles(percentiles)
87-
88-
def describe_numeric_1d(series) -> "Series":
89-
from pandas import Series
90-
91-
stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
92-
d = (
93-
[series.count(), series.mean(), series.std(), series.min()]
94-
+ series.quantile(percentiles).tolist()
95-
+ [series.max()]
96-
)
97-
return Series(d, index=stat_index, name=series.name)
98-
99-
def describe_categorical_1d(data) -> "Series":
100-
names = ["count", "unique"]
101-
objcounts = data.value_counts()
102-
count_unique = len(objcounts[objcounts != 0])
103-
result = [data.count(), count_unique]
104-
dtype = None
105-
if result[1] > 0:
106-
top, freq = objcounts.index[0], objcounts.iloc[0]
107-
if is_datetime64_any_dtype(data.dtype):
108-
if obj.ndim == 1:
109-
stacklevel = 5
110-
else:
111-
stacklevel = 6
112-
warnings.warn(
113-
"Treating datetime data as categorical rather than numeric in "
114-
"`.describe` is deprecated and will be removed in a future "
115-
"version of pandas. Specify `datetime_is_numeric=True` to "
116-
"silence this warning and adopt the future behavior now.",
117-
FutureWarning,
118-
stacklevel=stacklevel,
119-
)
120-
tz = data.dt.tz
121-
asint = data.dropna().values.view("i8")
122-
top = Timestamp(top)
123-
if top.tzinfo is not None and tz is not None:
124-
# Don't tz_localize(None) if key is already tz-aware
125-
top = top.tz_convert(tz)
126-
else:
127-
top = top.tz_localize(tz)
128-
names += ["top", "freq", "first", "last"]
129-
result += [
130-
top,
131-
freq,
132-
Timestamp(asint.min(), tz=tz),
133-
Timestamp(asint.max(), tz=tz),
134-
]
135-
else:
136-
names += ["top", "freq"]
137-
result += [top, freq]
138-
139-
# If the DataFrame is empty, set 'top' and 'freq' to None
140-
# to maintain output shape consistency
141-
else:
142-
names += ["top", "freq"]
143-
result += [np.nan, np.nan]
144-
dtype = "object"
145-
146-
from pandas import Series
147-
148-
return Series(result, index=names, name=data.name, dtype=dtype)
149-
150-
def describe_timestamp_1d(data) -> "Series":
151-
# GH-30164
152-
from pandas import Series
153-
154-
stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
155-
d = (
156-
[data.count(), data.mean(), data.min()]
157-
+ data.quantile(percentiles).tolist()
158-
+ [data.max()]
159-
)
160-
return Series(d, index=stat_index, name=data.name)
161-
162-
def describe_1d(data) -> "Series":
163-
if is_bool_dtype(data.dtype):
164-
return describe_categorical_1d(data)
165-
elif is_numeric_dtype(data):
166-
return describe_numeric_1d(data)
167-
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
168-
return describe_timestamp_1d(data)
169-
elif is_timedelta64_dtype(data.dtype):
170-
return describe_numeric_1d(data)
171-
else:
172-
return describe_categorical_1d(data)
173-
17486
if obj.ndim == 1:
17587
# Incompatible return value type
17688
# (got "Series", expected "FrameOrSeries") [return-value]
177-
return describe_1d(obj) # type:ignore[return-value]
89+
return describe_1d(
90+
obj,
91+
percentiles,
92+
datetime_is_numeric,
93+
is_series=True,
94+
) # type:ignore[return-value]
17895
elif (include is None) and (exclude is None):
17996
# when some numerics are found, keep only numerics
18097
default_include = [np.number]
@@ -191,7 +108,10 @@ def describe_1d(data) -> "Series":
191108
else:
192109
data = obj.select_dtypes(include=include, exclude=exclude)
193110

194-
ldesc = [describe_1d(s) for _, s in data.items()]
111+
ldesc = [
112+
describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
113+
for _, s in data.items()
114+
]
195115
# set a convenient order for rows
196116
names: List[Hashable] = []
197117
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
@@ -203,3 +123,143 @@ def describe_1d(data) -> "Series":
203123
d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
204124
d.columns = data.columns.copy()
205125
return d
126+
127+
128+
def describe_numeric_1d(series: "Series", percentiles) -> "Series":
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output, expressed as fractions
        accepted by ``Series.quantile``.

    Returns
    -------
    Series
        Statistics indexed by ``count``, ``mean``, ``std``, ``min``, one
        label per requested percentile, and ``max``; it carries the name
        of the input series.
    """
    # Imported locally, as the other describe helpers in this module do.
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    # Labels and values are assembled in the same order so they line up.
    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    return Series(d, index=stat_index, name=series.name)
149+
150+
151+
def describe_categorical_1d(data: "Series", is_series: bool) -> "Series":
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    is_series : bool
        True if the original object is a Series.
        False if a single column of a DataFrame is being described.

    Returns
    -------
    Series
        Indexed by ``count``, ``unique``, ``top`` and ``freq``. Datetime
        data additionally gets ``first`` and ``last``. When there are no
        non-zero value counts, ``top`` and ``freq`` are NaN and the result
        has object dtype.
    """
    # Imported locally, as the other describe helpers in this module do.
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    # Ignore entries whose count is zero when counting distinct values.
    count_unique = len(objcounts[objcounts != 0])
    result = [data.count(), count_unique]
    dtype = None

    if count_unique > 0:
        # The first entry of value_counts() is reported as the top value.
        top, freq = objcounts.index[0], objcounts.iloc[0]
        if is_datetime64_any_dtype(data.dtype):
            # NOTE(review): the stack level differs by one between the Series
            # and DataFrame entry points, presumably because the DataFrame
            # path adds a call frame; re-check if the call chain changes.
            stacklevel = 5 if is_series else 6
            warnings.warn(
                "Treating datetime data as categorical rather than numeric in "
                "`.describe` is deprecated and will be removed in a future "
                "version of pandas. Specify `datetime_is_numeric=True` to "
                "silence this warning and adopt the future behavior now.",
                FutureWarning,
                stacklevel=stacklevel,
            )
            tz = data.dt.tz
            top = Timestamp(top)
            if top.tzinfo is not None and tz is not None:
                # Don't tz_localize(None) if key is already tz-aware
                top = top.tz_convert(tz)
            else:
                top = top.tz_localize(tz)
            names += ["top", "freq", "first", "last"]
            # min()/max() skip missing values and keep the timezone, without
            # reinterpreting the raw integers as nanoseconds.
            result += [top, freq, data.min(), data.max()]
        else:
            names += ["top", "freq"]
            result += [top, freq]
    else:
        # Nothing to report: set 'top' and 'freq' to NaN to keep the output
        # shape consistent, and force object dtype for the result.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)
211+
212+
213+
def describe_timestamp_1d(data: "Series", percentiles) -> "Series":
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output, expressed as fractions
        accepted by ``Series.quantile``.

    Returns
    -------
    Series
        Statistics indexed by ``count``, ``mean``, ``min``, one label per
        requested percentile, and ``max``; it carries the name of the
        input series. No ``std`` entry is produced for datetime data.
    """
    # GH-30164
    # Imported locally, as the other describe helpers in this module do.
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    # Labels and values are assembled in the same order so they line up.
    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = (
        [data.count(), data.mean(), data.min()]
        + data.quantile(percentiles).tolist()
        + [data.max()]
    )
    return Series(d, index=stat_index, name=data.name)
235+
236+
237+
def describe_1d(
    data: "Series", percentiles, datetime_is_numeric: bool, *, is_series: bool
) -> "Series":
    """Describe series.

    Dispatches on the dtype of ``data`` to the numeric, timestamp or
    categorical describe helper.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output. Only used by the numeric
        and timestamp helpers.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.
    is_series : bool
        True if the original object is a Series.
        False if a single column of a DataFrame is being described.

    Returns
    -------
    Series
    """
    dtype = data.dtype

    # Booleans are checked first so they are described as categorical
    # rather than falling into the numeric branch.
    if is_bool_dtype(dtype):
        return describe_categorical_1d(data, is_series)
    if is_numeric_dtype(dtype) or is_timedelta64_dtype(dtype):
        return describe_numeric_1d(data, percentiles)
    if is_datetime64_any_dtype(dtype) and datetime_is_numeric:
        return describe_timestamp_1d(data, percentiles)
    # Everything else, including datetimes when datetime_is_numeric is
    # False, is described as categorical.
    return describe_categorical_1d(data, is_series)

0 commit comments

Comments
 (0)