Skip to content

REF: eliminate inner functions in describe #39121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 13, 2021
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 104 additions & 90 deletions pandas/core/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,98 +83,15 @@ def describe_ndframe(
raise ValueError("percentiles cannot contain duplicates")
percentiles = unique_pcts

formatted_percentiles = format_percentiles(percentiles)

def describe_numeric_1d(series) -> "Series":
from pandas import Series

stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
d = (
[series.count(), series.mean(), series.std(), series.min()]
+ series.quantile(percentiles).tolist()
+ [series.max()]
)
return Series(d, index=stat_index, name=series.name)

def describe_categorical_1d(data) -> "Series":
names = ["count", "unique"]
objcounts = data.value_counts()
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
if is_datetime64_any_dtype(data.dtype):
if obj.ndim == 1:
stacklevel = 5
else:
stacklevel = 6
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
names += ["top", "freq"]
result += [np.nan, np.nan]
dtype = "object"

from pandas import Series

return Series(result, index=names, name=data.name, dtype=dtype)

def describe_timestamp_1d(data) -> "Series":
# GH-30164
from pandas import Series

stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
d = (
[data.count(), data.mean(), data.min()]
+ data.quantile(percentiles).tolist()
+ [data.max()]
)
return Series(d, index=stat_index, name=data.name)

def describe_1d(data) -> "Series":
if is_bool_dtype(data.dtype):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
else:
return describe_categorical_1d(data)

if obj.ndim == 1:
# Incompatible return value type
# (got "Series", expected "FrameOrSeries") [return-value]
return describe_1d(obj) # type:ignore[return-value]
return describe_1d(
obj,
percentiles,
datetime_is_numeric,
is_series=True,
) # type:ignore[return-value]
elif (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
Expand All @@ -191,7 +108,10 @@ def describe_1d(data) -> "Series":
else:
data = obj.select_dtypes(include=include, exclude=exclude)

ldesc = [describe_1d(s) for _, s in data.items()]
ldesc = [
describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
for _, s in data.items()
]
# set a convenient order for rows
names: List[Hashable] = []
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
Expand All @@ -203,3 +123,97 @@ def describe_1d(data) -> "Series":
d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
d.columns = data.columns.copy()
return d


def describe_numeric_1d(series, percentiles) -> "Series":
from pandas import Series
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move imports to the top

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ideally add doc-strings where you can

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added docstrings.


formatted_percentiles = format_percentiles(percentiles)

stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
d = (
[series.count(), series.mean(), series.std(), series.min()]
+ series.quantile(percentiles).tolist()
+ [series.max()]
)
return Series(d, index=stat_index, name=series.name)


def describe_categorical_1d(data, is_series) -> "Series":
    """
    Describe a series by treating its values as categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    is_series : bool
        Truthy when the object originally being described was a Series
        rather than a DataFrame. Only used to choose the ``stacklevel`` of
        the datetime deprecation warning.

    Returns
    -------
    Series
        Indexed by "count", "unique", "top" and "freq"; datetime data
        additionally gets "first" and "last". Named after ``data``.

    Warns
    -----
    FutureWarning
        If ``data`` has a datetime64 dtype and at least one non-null value.
    """
    from pandas import Series

    value_counts = data.value_counts()
    # Categoricals may report unobserved categories with a zero count;
    # those must not be counted as unique values.
    count_unique = len(value_counts[value_counts != 0])

    names = ["count", "unique"]
    result = [data.count(), count_unique]
    dtype = None

    if count_unique > 0:
        top, freq = value_counts.index[0], value_counts.iloc[0]
        if is_datetime64_any_dtype(data.dtype):
            # NOTE(review): the stacklevel values presumably make the warning
            # point at the user's ``.describe`` call, with the DataFrame path
            # one frame deeper -- confirm against the callers.
            stacklevel = 5 if is_series else 6
            warnings.warn(
                "Treating datetime data as categorical rather than numeric in "
                "`.describe` is deprecated and will be removed in a future "
                "version of pandas. Specify `datetime_is_numeric=True` to "
                "silence this warning and adopt the future behavior now.",
                FutureWarning,
                stacklevel=stacklevel,
            )
            tz = data.dt.tz
            top = Timestamp(top)
            if top.tzinfo is not None and tz is not None:
                # Don't tz_localize(None) if key is already tz-aware
                top = top.tz_convert(tz)
            else:
                top = top.tz_localize(tz)
            names += ["top", "freq", "first", "last"]
            # Series.min/max skip missing values and respect both the
            # timezone and the stored resolution. Viewing the raw values as
            # int64 would silently assume nanosecond resolution.
            result += [top, freq, data.min(), data.max()]
        else:
            names += ["top", "freq"]
            result += [top, freq]
    else:
        # If there are no non-null values, set 'top' and 'freq' to NaN
        # to maintain output shape consistency; object dtype keeps the
        # integer counts from being upcast to float.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)


def describe_timestamp_1d(data, percentiles) -> "Series":
    """
    Describe a series by treating its values as numeric timestamps.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        Percentiles to include, passed to ``data.quantile`` and to
        ``format_percentiles`` for the row labels.

    Returns
    -------
    Series
        Indexed by "count", "mean", "min", the formatted percentiles and
        "max"; named after ``data``.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    stats = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(stats, index=labels, name=data.name)


def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series":
    """
    Describe a series, dispatching on its dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        Percentiles forwarded to the numeric and timestamp describers.
    datetime_is_numeric : bool
        When truthy, datetime64 data goes to ``describe_timestamp_1d``
        instead of falling through to the categorical describer.
    is_series : bool
        Forwarded to ``describe_categorical_1d``.

    Returns
    -------
    Series
        The result of whichever describer was selected.
    """
    dtype = data.dtype

    # Booleans are checked first so they are summarised as categorical
    # before the numeric check below is reached.
    if is_bool_dtype(dtype):
        return describe_categorical_1d(data, is_series)
    if is_numeric_dtype(data):
        return describe_numeric_1d(data, percentiles)
    if is_datetime64_any_dtype(dtype) and datetime_is_numeric:
        return describe_timestamp_1d(data, percentiles)
    if is_timedelta64_dtype(dtype):
        return describe_numeric_1d(data, percentiles)
    return describe_categorical_1d(data, is_series)