-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
REF: extract classes in pandas/core/describe.py #39186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
97f136b
d5f2815
ce13d32
ab2051a
cc9544c
e1003f3
7097dd3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,13 +4,14 @@ | |
Method NDFrame.describe() delegates actual execution to function describe_ndframe(). | ||
""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._libs.tslibs import Timestamp | ||
from pandas._typing import FrameOrSeries, Hashable | ||
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Hashable | ||
from pandas.util._validators import validate_percentile | ||
|
||
from pandas.core.dtypes.common import ( | ||
|
@@ -60,107 +61,155 @@ def describe_ndframe( | |
Dataframe or series description. | ||
""" | ||
percentiles = refine_percentiles(percentiles) | ||
describer = create_describer( | ||
obj, | ||
include=include, | ||
exclude=exclude, | ||
datetime_is_numeric=datetime_is_numeric, | ||
) | ||
result = describer.describe(percentiles=percentiles) | ||
return cast(FrameOrSeries, result) | ||
|
||
|
||
def create_describer( | ||
obj: "FrameOrSeries", | ||
include: Optional[Union[str, Sequence[str]]], | ||
exclude: Optional[Union[str, Sequence[str]]], | ||
datetime_is_numeric: bool, | ||
) -> "NDFrameDescriberAbstract": | ||
""" | ||
Create concrete NDFrameDescriberAbstract instance suitable for the object. | ||
""" | ||
describer: NDFrameDescriberAbstract | ||
if obj.ndim == 1: | ||
result_series = describe_series( | ||
cast("Series", obj), | ||
percentiles, | ||
datetime_is_numeric, | ||
return SeriesDescriber( | ||
series=cast("Series", obj), | ||
datetime_is_numeric=datetime_is_numeric, | ||
) | ||
else: | ||
return DataFrameDescriber( | ||
frame=cast("DataFrame", obj), | ||
include=include, | ||
exclude=exclude, | ||
datetime_is_numeric=datetime_is_numeric, | ||
) | ||
return cast(FrameOrSeries, result_series) | ||
|
||
frame = cast("DataFrame", obj) | ||
|
||
if frame.ndim == 2 and frame.columns.size == 0: | ||
raise ValueError("Cannot describe a DataFrame without columns") | ||
class NDFrameDescriberAbstract(ABC): | ||
"""Abstract class for describing dataframe or series.""" | ||
|
||
result_frame = describe_frame( | ||
frame=frame, | ||
include=include, | ||
exclude=exclude, | ||
percentiles=percentiles, | ||
datetime_is_numeric=datetime_is_numeric, | ||
) | ||
return cast(FrameOrSeries, result_frame) | ||
@abstractmethod | ||
def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion": | ||
"""Do describe either series or dataframe. | ||
|
||
Parameters | ||
---------- | ||
percentiles : list-like of numbers | ||
The percentiles to include in the output. | ||
""" | ||
|
||
def describe_series( | ||
series: "Series", | ||
percentiles: Sequence[float], | ||
datetime_is_numeric: bool, | ||
) -> "Series": | ||
"""Describe series. | ||
|
||
The reason for the delegation to ``describe_1d`` only: | ||
to allow for a proper stacklevel of the FutureWarning. | ||
class SeriesDescriber(NDFrameDescriberAbstract): | ||
"""Class responsible for creating series description. | ||
|
||
Parameters | ||
---------- | ||
series : Series | ||
data : Series | ||
Series to be described. | ||
percentiles : list-like of numbers | ||
The percentiles to include in the output. | ||
datetime_is_numeric : bool, default False | ||
datetime_is_numeric : bool | ||
Whether to treat datetime dtypes as numeric. | ||
|
||
Returns | ||
------- | ||
Series | ||
""" | ||
return describe_1d( | ||
series, | ||
percentiles, | ||
datetime_is_numeric, | ||
is_series=True, | ||
) | ||
|
||
def __init__( | ||
self, | ||
series: "Series", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you make this a common method in NDFrame (and changes series -> data) to match dataframe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will check that. But I guess it is going to violate Liskov principle. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like nothing is violated. It seems to work. |
||
*, | ||
datetime_is_numeric: bool, | ||
): | ||
self.series = series | ||
self.datetime_is_numeric = datetime_is_numeric | ||
|
||
def describe(self, percentiles: Sequence[float]) -> "Series": | ||
return describe_1d( | ||
self.series, | ||
percentiles=percentiles, | ||
datetime_is_numeric=self.datetime_is_numeric, | ||
is_series=True, | ||
) | ||
|
||
def describe_frame( | ||
frame: "DataFrame", | ||
include: Optional[Union[str, Sequence[str]]], | ||
exclude: Optional[Union[str, Sequence[str]]], | ||
percentiles: Sequence[float], | ||
datetime_is_numeric: bool, | ||
) -> "DataFrame": | ||
"""Describe DataFrame. | ||
|
||
class DataFrameDescriber(NDFrameDescriberAbstract): | ||
"""Class responsible for creating dataframe description. | ||
|
||
Parameters | ||
---------- | ||
frame : DataFrame | ||
DataFrame to be described. | ||
include : 'all', list-like of dtypes or None (default), optional | ||
data : DataFrame | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. match series for data / datetime_is_numeric and add other args |
||
Dataframe to be described. | ||
include : 'all', list-like of dtypes or None | ||
A white list of data types to include in the result. | ||
exclude : list-like of dtypes or None (default), optional, | ||
exclude : list-like of dtypes or None | ||
A black list of data types to omit from the result. | ||
percentiles : list-like of numbers | ||
The percentiles to include in the output. | ||
datetime_is_numeric : bool, default False | ||
datetime_is_numeric : bool | ||
Whether to treat datetime dtypes as numeric. | ||
|
||
Returns | ||
------- | ||
DataFrame | ||
""" | ||
data = select_columns( | ||
frame=frame, | ||
include=include, | ||
exclude=exclude, | ||
datetime_is_numeric=datetime_is_numeric, | ||
) | ||
|
||
ldesc = [ | ||
describe_1d(s, percentiles, datetime_is_numeric, is_series=False) | ||
for _, s in data.items() | ||
] | ||
|
||
col_names = reorder_columns(ldesc) | ||
d = concat( | ||
[x.reindex(col_names, copy=False) for x in ldesc], | ||
axis=1, | ||
sort=False, | ||
) | ||
d.columns = data.columns.copy() | ||
return d | ||
def __init__( | ||
self, | ||
frame: "DataFrame", | ||
*, | ||
include: Optional[Union[str, Sequence[str]]], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can do super() for first 2 args |
||
exclude: Optional[Union[str, Sequence[str]]], | ||
datetime_is_numeric: bool, | ||
): | ||
validate_frame(frame) | ||
self.frame = frame | ||
self.include = include | ||
self.exclude = exclude | ||
self.datetime_is_numeric = datetime_is_numeric | ||
|
||
def describe(self, percentiles: Sequence[float]) -> "DataFrame": | ||
data = self._select_data() | ||
|
||
ldesc = [ | ||
describe_1d( | ||
series, | ||
percentiles=percentiles, | ||
datetime_is_numeric=self.datetime_is_numeric, | ||
is_series=False, | ||
) | ||
for _, series in data.items() | ||
] | ||
|
||
col_names = reorder_columns(ldesc) | ||
d = concat( | ||
[x.reindex(col_names, copy=False) for x in ldesc], | ||
axis=1, | ||
sort=False, | ||
) | ||
d.columns = data.columns.copy() | ||
return d | ||
|
||
def _select_data(self): | ||
"""Select columns to be described.""" | ||
if (self.include is None) and (self.exclude is None): | ||
# when some numerics are found, keep only numerics | ||
default_include = [np.number] | ||
if self.datetime_is_numeric: | ||
default_include.append("datetime") | ||
data = self.frame.select_dtypes(include=default_include) | ||
if len(data.columns) == 0: | ||
data = self.frame | ||
elif self.include == "all": | ||
if self.exclude is not None: | ||
msg = "exclude must be None when include is 'all'" | ||
raise ValueError(msg) | ||
data = self.frame | ||
else: | ||
data = self.frame.select_dtypes( | ||
include=self.include, | ||
exclude=self.exclude, | ||
) | ||
return data | ||
|
||
|
||
def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]: | ||
|
@@ -174,32 +223,6 @@ def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]: | |
return names | ||
|
||
|
||
def select_columns( | ||
frame: "DataFrame", | ||
include: Optional[Union[str, Sequence[str]]], | ||
exclude: Optional[Union[str, Sequence[str]]], | ||
datetime_is_numeric: bool, | ||
) -> "DataFrame": | ||
"""Select columns to be described.""" | ||
if (include is None) and (exclude is None): | ||
# when some numerics are found, keep only numerics | ||
default_include = [np.number] | ||
if datetime_is_numeric: | ||
default_include.append("datetime") | ||
data = frame.select_dtypes(include=default_include) | ||
if len(data.columns) == 0: | ||
data = frame | ||
elif include == "all": | ||
if exclude is not None: | ||
msg = "exclude must be None when include is 'all'" | ||
raise ValueError(msg) | ||
data = frame | ||
else: | ||
data = frame.select_dtypes(include=include, exclude=exclude) | ||
|
||
return data | ||
|
||
|
||
def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series": | ||
"""Describe series containing numerical data. | ||
|
||
|
@@ -376,3 +399,8 @@ def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float | |
raise ValueError("percentiles cannot contain duplicates") | ||
|
||
return unique_pcts | ||
|
||
|
||
def validate_frame(frame: "DataFrame"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a reason not to do this in the constructor? e.g. L148 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I though of simplifying the constructor.
But that will be a static method, which you prefer not to use. If you consider that this extraction is unnecessary, then I can definitely inline it into the constructor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved into the constructor. |
||
if frame.ndim == 2 and frame.columns.size == 0: | ||
raise ValueError("Cannot describe a DataFrame without columns") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i would just inline this to above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done