Skip to content

REF/TYP: extract function/type args in describe.py #39165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 14, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 49 additions & 29 deletions pandas/core/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
"""

from typing import TYPE_CHECKING, List, Optional, Sequence, Union
from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast
import warnings

import numpy as np
Expand Down Expand Up @@ -62,32 +62,14 @@ def describe_ndframe(
if obj.ndim == 2 and obj.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")

if percentiles is not None:
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)

# get them all to be in [0, 1]
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)
percentiles = np.asarray(percentiles)
else:
percentiles = np.array([0.25, 0.5, 0.75])

# sort and check for duplicates
unique_pcts = np.unique(percentiles)
assert percentiles is not None
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")
percentiles = unique_pcts
percentiles = _refine_percentiles(percentiles)

if obj.ndim == 1:
series = cast("Series", obj)
# Incompatible return value type
# (got "Series", expected "FrameOrSeries") [return-value]
return describe_1d(
obj,
series,
percentiles,
datetime_is_numeric,
is_series=True,
Expand Down Expand Up @@ -125,14 +107,14 @@ def describe_ndframe(
return d


def describe_numeric_1d(series, percentiles) -> "Series":
def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series":
"""Describe series containing numerical data.

Parameters
----------
series : Series
Series to be described.
percentiles : list-like of numbers, optional
percentiles : list-like of numbers
The percentiles to include in the output.
"""
from pandas import Series
Expand All @@ -148,7 +130,7 @@ def describe_numeric_1d(series, percentiles) -> "Series":
return Series(d, index=stat_index, name=series.name)


def describe_categorical_1d(data, is_series) -> "Series":
def describe_categorical_1d(data: "Series", is_series: bool) -> "Series":
"""Describe series containing categorical data.

Parameters
Expand Down Expand Up @@ -210,14 +192,14 @@ def describe_categorical_1d(data, is_series) -> "Series":
return Series(result, index=names, name=data.name, dtype=dtype)


def describe_timestamp_1d(data, percentiles) -> "Series":
def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> "Series":
"""Describe series containing datetime64 dtype.

Parameters
----------
data : Series
Series to be described.
percentiles : list-like of numbers, optional
percentiles : list-like of numbers
The percentiles to include in the output.
"""
# GH-30164
Expand All @@ -234,14 +216,20 @@ def describe_timestamp_1d(data, percentiles) -> "Series":
return Series(d, index=stat_index, name=data.name)


def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series":
def describe_1d(
data: "Series",
percentiles: Sequence[float],
datetime_is_numeric: bool,
*,
is_series: bool,
) -> "Series":
"""Describe series.

Parameters
----------
data : Series
Series to be described.
percentiles : list-like of numbers, optional
percentiles : list-like of numbers
The percentiles to include in the output.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric.
Expand All @@ -263,3 +251,35 @@ def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series
return describe_numeric_1d(data, percentiles)
else:
return describe_categorical_1d(data, is_series)


def _refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these dont' need to be private but nbd

"""Ensure that percentiles are unique and sorted.

Parameters
----------
percentiles : list-like of numbers, optional
The percentiles to include in the output.
"""
if percentiles is None:
return np.array([0.25, 0.5, 0.75])

# explicit conversion of `percentiles` to list
percentiles = list(percentiles)

# get them all to be in [0, 1]
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)

percentiles = np.asarray(percentiles)

# sort and check for duplicates
unique_pcts = np.unique(percentiles)
assert percentiles is not None
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")

return unique_pcts