Skip to content

REF: extract classes in pandas/core/describe.py #39186

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 19, 2021
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 131 additions & 103 deletions pandas/core/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
"""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast
import warnings

import numpy as np

from pandas._libs.tslibs import Timestamp
from pandas._typing import FrameOrSeries, Hashable
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Hashable
from pandas.util._validators import validate_percentile

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -60,107 +61,155 @@ def describe_ndframe(
Dataframe or series description.
"""
percentiles = refine_percentiles(percentiles)
describer = create_describer(
obj,
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
)
result = describer.describe(percentiles=percentiles)
return cast(FrameOrSeries, result)


def create_describer(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i would just inline this to above

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

obj: "FrameOrSeries",
include: Optional[Union[str, Sequence[str]]],
exclude: Optional[Union[str, Sequence[str]]],
datetime_is_numeric: bool,
) -> "NDFrameDescriberAbstract":
"""
Create concrete NDFrameDescriberAbstract instance suitable for the object.
"""
describer: NDFrameDescriberAbstract
if obj.ndim == 1:
result_series = describe_series(
cast("Series", obj),
percentiles,
datetime_is_numeric,
return SeriesDescriber(
series=cast("Series", obj),
datetime_is_numeric=datetime_is_numeric,
)
else:
return DataFrameDescriber(
frame=cast("DataFrame", obj),
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
)
return cast(FrameOrSeries, result_series)

frame = cast("DataFrame", obj)

if frame.ndim == 2 and frame.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")
class NDFrameDescriberAbstract(ABC):
"""Abstract class for describing dataframe or series."""

result_frame = describe_frame(
frame=frame,
include=include,
exclude=exclude,
percentiles=percentiles,
datetime_is_numeric=datetime_is_numeric,
)
return cast(FrameOrSeries, result_frame)
@abstractmethod
def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion":
"""Do describe either series or dataframe.

Parameters
----------
percentiles : list-like of numbers
The percentiles to include in the output.
"""

def describe_series(
series: "Series",
percentiles: Sequence[float],
datetime_is_numeric: bool,
) -> "Series":
"""Describe series.

The reason for the delegation to ``describe_1d`` only:
to allow for a proper stacklevel of the FutureWarning.
class SeriesDescriber(NDFrameDescriberAbstract):
"""Class responsible for creating series description.

Parameters
----------
series : Series
data : Series
Series to be described.
percentiles : list-like of numbers
The percentiles to include in the output.
datetime_is_numeric : bool, default False
datetime_is_numeric : bool
Whether to treat datetime dtypes as numeric.

Returns
-------
Series
"""
return describe_1d(
series,
percentiles,
datetime_is_numeric,
is_series=True,
)

def __init__(
self,
series: "Series",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this a common method in NDFrame (and changes series -> data) to match dataframe

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will check that. But I guess it is going to violate Liskov principle.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like nothing is violated. It seems to work.

*,
datetime_is_numeric: bool,
):
self.series = series
self.datetime_is_numeric = datetime_is_numeric

def describe(self, percentiles: Sequence[float]) -> "Series":
return describe_1d(
self.series,
percentiles=percentiles,
datetime_is_numeric=self.datetime_is_numeric,
is_series=True,
)

def describe_frame(
frame: "DataFrame",
include: Optional[Union[str, Sequence[str]]],
exclude: Optional[Union[str, Sequence[str]]],
percentiles: Sequence[float],
datetime_is_numeric: bool,
) -> "DataFrame":
"""Describe DataFrame.

class DataFrameDescriber(NDFrameDescriberAbstract):
"""Class responsible for creating dataframe description.

Parameters
----------
frame : DataFrame
DataFrame to be described.
include : 'all', list-like of dtypes or None (default), optional
data : DataFrame
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

match series for data / datetime_is_numeric and add other args

Dataframe to be described.
include : 'all', list-like of dtypes or None
A white list of data types to include in the result.
exclude : list-like of dtypes or None (default), optional,
exclude : list-like of dtypes or None
A black list of data types to omit from the result.
percentiles : list-like of numbers
The percentiles to include in the output.
datetime_is_numeric : bool, default False
datetime_is_numeric : bool
Whether to treat datetime dtypes as numeric.

Returns
-------
DataFrame
"""
data = select_columns(
frame=frame,
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
)

ldesc = [
describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
for _, s in data.items()
]

col_names = reorder_columns(ldesc)
d = concat(
[x.reindex(col_names, copy=False) for x in ldesc],
axis=1,
sort=False,
)
d.columns = data.columns.copy()
return d
def __init__(
self,
frame: "DataFrame",
*,
include: Optional[Union[str, Sequence[str]]],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can do super() for first 2 args

exclude: Optional[Union[str, Sequence[str]]],
datetime_is_numeric: bool,
):
validate_frame(frame)
self.frame = frame
self.include = include
self.exclude = exclude
self.datetime_is_numeric = datetime_is_numeric

def describe(self, percentiles: Sequence[float]) -> "DataFrame":
data = self._select_data()

ldesc = [
describe_1d(
series,
percentiles=percentiles,
datetime_is_numeric=self.datetime_is_numeric,
is_series=False,
)
for _, series in data.items()
]

col_names = reorder_columns(ldesc)
d = concat(
[x.reindex(col_names, copy=False) for x in ldesc],
axis=1,
sort=False,
)
d.columns = data.columns.copy()
return d

def _select_data(self):
"""Select columns to be described."""
if (self.include is None) and (self.exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
if self.datetime_is_numeric:
default_include.append("datetime")
data = self.frame.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self.frame
elif self.include == "all":
if self.exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
data = self.frame
else:
data = self.frame.select_dtypes(
include=self.include,
exclude=self.exclude,
)
return data


def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]:
Expand All @@ -174,32 +223,6 @@ def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]:
return names


def select_columns(
frame: "DataFrame",
include: Optional[Union[str, Sequence[str]]],
exclude: Optional[Union[str, Sequence[str]]],
datetime_is_numeric: bool,
) -> "DataFrame":
"""Select columns to be described."""
if (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = frame.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = frame
elif include == "all":
if exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
data = frame
else:
data = frame.select_dtypes(include=include, exclude=exclude)

return data


def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series":
"""Describe series containing numerical data.

Expand Down Expand Up @@ -376,3 +399,8 @@ def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float
raise ValueError("percentiles cannot contain duplicates")

return unique_pcts


def validate_frame(frame: "DataFrame"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a reason not to do this in the constructor? e.g. L148

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I though of simplifying the constructor.
Ideally I would create a method _initialize_frame, which will perform the validation under the hood and return the frame.

self.frame = self._initialize_frame(frame)

But that will be a static method, which you prefer not to use.

If you consider that this extraction is unnecessary, then I can definitely inline it into the constructor.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved into the constructor.

if frame.ndim == 2 and frame.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")