From 97f136b5e0fa01b78fc3c2304bfb0d24e24d3c1c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 15 Jan 2021 19:04:40 +0700 Subject: [PATCH 1/6] REF: extract classes in pandas/core/describe.py --- pandas/core/describe.py | 220 +++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 103 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 5a4c0deb7503c..de9c08f78c3f3 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -4,13 +4,14 @@ Method NDFrame.describe() delegates actual execution to function describe_ndframe(). """ +from abc import ABC, abstractmethod from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast import warnings import numpy as np from pandas._libs.tslibs import Timestamp -from pandas._typing import FrameOrSeries, Hashable +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Hashable from pandas.util._validators import validate_percentile from pandas.core.dtypes.common import ( @@ -61,106 +62,140 @@ def describe_ndframe( """ percentiles = refine_percentiles(percentiles) + describer: NDFrameDescriberAbstract + if obj.ndim == 1: - result_series = describe_series( - cast("Series", obj), - percentiles, - datetime_is_numeric, + describer = SeriesDescriber( + series=cast("Series", obj), + datetime_is_numeric=datetime_is_numeric, + ) + else: + describer = DataFrameDescriber( + frame=cast("DataFrame", obj), + include=include, + exclude=exclude, + datetime_is_numeric=datetime_is_numeric, ) - return cast(FrameOrSeries, result_series) - frame = cast("DataFrame", obj) + result = describer.describe(percentiles=percentiles) + return cast(FrameOrSeries, result) - if frame.ndim == 2 and frame.columns.size == 0: - raise ValueError("Cannot describe a DataFrame without columns") - result_frame = describe_frame( - frame=frame, - include=include, - exclude=exclude, - percentiles=percentiles, - datetime_is_numeric=datetime_is_numeric, - ) - return cast(FrameOrSeries, result_frame) +class NDFrameDescriberAbstract(ABC): + """Abstract class for describing dataframe or series.""" + @abstractmethod + def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion": + """Do describe either series or dataframe. + + Parameters + ---------- + percentiles : list-like of numbers + The percentiles to include in the output. + """ -def describe_series( - series: "Series", - percentiles: Sequence[float], - datetime_is_numeric: bool, -) -> "Series": - """Describe series. - The reason for the delegation to ``describe_1d`` only: - to allow for a proper stacklevel of the FutureWarning. +class SeriesDescriber(NDFrameDescriberAbstract): + """Class responsible for creating series description. Parameters ---------- - series : Series + data : Series Series to be described. - percentiles : list-like of numbers - The percentiles to include in the output. - datetime_is_numeric : bool, default False + datetime_is_numeric : bool Whether to treat datetime dtypes as numeric. - - Returns - ------- - Series """ - return describe_1d( - series, - percentiles, - datetime_is_numeric, - is_series=True, - ) + def __init__( + self, + series: "Series", + *, + datetime_is_numeric: bool, + ): + self.series = series + self.datetime_is_numeric = datetime_is_numeric + + def describe(self, percentiles: Sequence[float]) -> "Series": + return describe_1d( + self.series, + percentiles=percentiles, + datetime_is_numeric=self.datetime_is_numeric, + is_series=True, + ) -def describe_frame( - frame: "DataFrame", - include: Optional[Union[str, Sequence[str]]], - exclude: Optional[Union[str, Sequence[str]]], - percentiles: Sequence[float], - datetime_is_numeric: bool, -) -> "DataFrame": - """Describe DataFrame. + +class DataFrameDescriber(NDFrameDescriberAbstract): + """Class responsible for creating dataframe description. Parameters ---------- - frame : DataFrame - DataFrame to be described. - include : 'all', list-like of dtypes or None (default), optional + data : DataFrame + Dataframe to be described. + include : 'all', list-like of dtypes or None A white list of data types to include in the result. - exclude : list-like of dtypes or None (default), optional, + exclude : list-like of dtypes or None A black list of data types to omit from the result. - percentiles : list-like of numbers - The percentiles to include in the output. - datetime_is_numeric : bool, default False + datetime_is_numeric : bool Whether to treat datetime dtypes as numeric. - - Returns - ------- - DataFrame """ - data = select_columns( - frame=frame, - include=include, - exclude=exclude, - datetime_is_numeric=datetime_is_numeric, - ) - - ldesc = [ - describe_1d(s, percentiles, datetime_is_numeric, is_series=False) - for _, s in data.items() - ] - col_names = reorder_columns(ldesc) - d = concat( - [x.reindex(col_names, copy=False) for x in ldesc], - axis=1, - sort=False, - ) - d.columns = data.columns.copy() - return d + def __init__( + self, + frame: "DataFrame", + *, + include: Optional[Union[str, Sequence[str]]], + exclude: Optional[Union[str, Sequence[str]]], + datetime_is_numeric: bool, + ): + validate_frame(frame) + self.frame = frame + self.include = include + self.exclude = exclude + self.datetime_is_numeric = datetime_is_numeric + + def describe(self, percentiles: Sequence[float]) -> "DataFrame": + data = self._select_data() + + ldesc = [ + describe_1d( + series, + percentiles=percentiles, + datetime_is_numeric=self.datetime_is_numeric, + is_series=False, + ) + for _, series in data.items() + ] + + col_names = reorder_columns(ldesc) + d = concat( + [x.reindex(col_names, copy=False) for x in ldesc], + axis=1, + sort=False, + ) + d.columns = data.columns.copy() + return d + + def _select_data(self): + """Select columns to be described.""" + if (self.include is None) and (self.exclude is None): + # when some numerics are found, keep only numerics + default_include = [np.number] + if self.datetime_is_numeric: + default_include.append("datetime") + data = self.frame.select_dtypes(include=default_include) + if len(data.columns) == 0: + data = self.frame + elif self.include == "all": + if self.exclude is not None: + msg = "exclude must be None when include is 'all'" + raise ValueError(msg) + data = self.frame + else: + data = self.frame.select_dtypes( + include=self.include, + exclude=self.exclude, + ) + return data def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]: @@ -174,32 +209,6 @@ def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]: return names -def select_columns( - frame: "DataFrame", - include: Optional[Union[str, Sequence[str]]], - exclude: Optional[Union[str, Sequence[str]]], - datetime_is_numeric: bool, -) -> "DataFrame": - """Select columns to be described.""" - if (include is None) and (exclude is None): - # when some numerics are found, keep only numerics - default_include = [np.number] - if datetime_is_numeric: - default_include.append("datetime") - data = frame.select_dtypes(include=default_include) - if len(data.columns) == 0: - data = frame - elif include == "all": - if exclude is not None: - msg = "exclude must be None when include is 'all'" - raise ValueError(msg) - data = frame - else: - data = frame.select_dtypes(include=include, exclude=exclude) - - return data - - def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series": """Describe series containing numerical data. @@ -376,3 +385,8 @@ def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float raise ValueError("percentiles cannot contain duplicates") return unique_pcts + + +def validate_frame(frame: "DataFrame"): + if frame.ndim == 2 and frame.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") From d5f28157c1de97b34c48c412dcf7a6f335278139 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 15 Jan 2021 19:09:50 +0700 Subject: [PATCH 2/6] REF: extract function create_describer --- pandas/core/describe.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index de9c08f78c3f3..e9fffb3570b4e 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -61,25 +61,39 @@ def describe_ndframe( Dataframe or series description. """ percentiles = refine_percentiles(percentiles) + describer = create_describer( + obj, + include=include, + exclude=exclude, + datetime_is_numeric=datetime_is_numeric, + ) + result = describer.describe(percentiles=percentiles) + return cast(FrameOrSeries, result) - describer: NDFrameDescriberAbstract +def create_describer( + obj: "FrameOrSeries", + include: Optional[Union[str, Sequence[str]]], + exclude: Optional[Union[str, Sequence[str]]], + datetime_is_numeric: bool, +) -> "NDFrameDescriberAbstract": + """ + Create concrete NDFrameDescriberAbstract instance suitable for the object. + """ + describer: NDFrameDescriberAbstract if obj.ndim == 1: - describer = SeriesDescriber( + return SeriesDescriber( series=cast("Series", obj), datetime_is_numeric=datetime_is_numeric, ) else: - describer = DataFrameDescriber( + return DataFrameDescriber( frame=cast("DataFrame", obj), include=include, exclude=exclude, datetime_is_numeric=datetime_is_numeric, ) - result = describer.describe(percentiles=percentiles) - return cast(FrameOrSeries, result) - class NDFrameDescriberAbstract(ABC): """Abstract class for describing dataframe or series.""" From ce13d32c958b627e923db1c72278384e1c895237 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 15 Jan 2021 21:52:28 +0700 Subject: [PATCH 3/6] Revert "REF: extract function create_describer" This reverts commit d5f28157c1de97b34c48c412dcf7a6f335278139. --- pandas/core/describe.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index e9fffb3570b4e..de9c08f78c3f3 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -61,39 +61,25 @@ def describe_ndframe( Dataframe or series description. """ percentiles = refine_percentiles(percentiles) - describer = create_describer( - obj, - include=include, - exclude=exclude, - datetime_is_numeric=datetime_is_numeric, - ) - result = describer.describe(percentiles=percentiles) - return cast(FrameOrSeries, result) - -def create_describer( - obj: "FrameOrSeries", - include: Optional[Union[str, Sequence[str]]], - exclude: Optional[Union[str, Sequence[str]]], - datetime_is_numeric: bool, -) -> "NDFrameDescriberAbstract": - """ - Create concrete NDFrameDescriberAbstract instance suitable for the object. - """ describer: NDFrameDescriberAbstract + if obj.ndim == 1: - return SeriesDescriber( + describer = SeriesDescriber( series=cast("Series", obj), datetime_is_numeric=datetime_is_numeric, ) else: - return DataFrameDescriber( + describer = DataFrameDescriber( frame=cast("DataFrame", obj), include=include, exclude=exclude, datetime_is_numeric=datetime_is_numeric, ) + result = describer.describe(percentiles=percentiles) + return cast(FrameOrSeries, result) + class NDFrameDescriberAbstract(ABC): """Abstract class for describing dataframe or series.""" From ab2051a88249193cbecd91836d57228021ecabf3 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 15 Jan 2021 22:26:13 +0700 Subject: [PATCH 4/6] REF: move constructor to abstract class --- pandas/core/describe.py | 61 +++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index de9c08f78c3f3..7123ae5202a7f 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -63,15 +63,14 @@ def describe_ndframe( percentiles = refine_percentiles(percentiles) describer: NDFrameDescriberAbstract - if obj.ndim == 1: describer = SeriesDescriber( - series=cast("Series", obj), + obj=cast("Series", obj), datetime_is_numeric=datetime_is_numeric, ) else: describer = DataFrameDescriber( - frame=cast("DataFrame", obj), + obj=cast("DataFrame", obj), include=include, exclude=exclude, datetime_is_numeric=datetime_is_numeric, @@ -82,7 +81,19 @@ def describe_ndframe( class NDFrameDescriberAbstract(ABC): - """Abstract class for describing dataframe or series.""" + """Abstract class for describing dataframe or series. + + Parameters + ---------- + obj : Series or DataFrame + Object to be described. + datetime_is_numeric : bool + Whether to treat datetime dtypes as numeric. + """ + + def __init__(self, obj: "FrameOrSeriesUnion", datetime_is_numeric: bool): + self.obj = obj + self.datetime_is_numeric = datetime_is_numeric @abstractmethod def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion": @@ -96,28 +107,13 @@ def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion": class SeriesDescriber(NDFrameDescriberAbstract): - """Class responsible for creating series description. - - Parameters - ---------- - data : Series - Series to be described. - datetime_is_numeric : bool - Whether to treat datetime dtypes as numeric. - """ + """Class responsible for creating series description.""" - def __init__( - self, - series: "Series", - *, - datetime_is_numeric: bool, - ): - self.series = series - self.datetime_is_numeric = datetime_is_numeric + obj: "Series" def describe(self, percentiles: Sequence[float]) -> "Series": return describe_1d( - self.series, + self.obj, percentiles=percentiles, datetime_is_numeric=self.datetime_is_numeric, is_series=True, @@ -125,12 +121,12 @@ def describe(self, percentiles: Sequence[float]) -> "Series": class DataFrameDescriber(NDFrameDescriberAbstract): - """Class responsible for creating dataframe description. + """Class responsible for creating dataobj description. Parameters ---------- - data : DataFrame - Dataframe to be described. + obj : DataFrame + DataFrame to be described. include : 'all', list-like of dtypes or None A white list of data types to include in the result. exclude : list-like of dtypes or None @@ -141,17 +137,16 @@ class DataFrameDescriber(NDFrameDescriberAbstract): def __init__( self, - frame: "DataFrame", + obj: "DataFrame", *, include: Optional[Union[str, Sequence[str]]], exclude: Optional[Union[str, Sequence[str]]], datetime_is_numeric: bool, ): - validate_frame(frame) - self.frame = frame self.include = include self.exclude = exclude - self.datetime_is_numeric = datetime_is_numeric + validate_frame(obj) + super().__init__(obj, datetime_is_numeric=datetime_is_numeric) def describe(self, percentiles: Sequence[float]) -> "DataFrame": data = self._select_data() @@ -182,16 +177,16 @@ def _select_data(self): default_include = [np.number] if self.datetime_is_numeric: default_include.append("datetime") - data = self.frame.select_dtypes(include=default_include) + data = self.obj.select_dtypes(include=default_include) if len(data.columns) == 0: - data = self.frame + data = self.obj elif self.include == "all": if self.exclude is not None: msg = "exclude must be None when include is 'all'" raise ValueError(msg) - data = self.frame + data = self.obj else: - data = self.frame.select_dtypes( + data = self.obj.select_dtypes( include=self.include, exclude=self.exclude, ) From e1003f3482f656e584a005982aa695ee02303e6d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 18 Jan 2021 10:08:15 +0700 Subject: [PATCH 5/6] REF: inline frame validation --- pandas/core/describe.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 670f55afa7cb4..837cd11918bc2 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -146,7 +146,10 @@ def __init__( ): self.include = include self.exclude = exclude - validate_frame(obj) + + if obj.ndim == 2 and obj.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") + super().__init__(obj, datetime_is_numeric=datetime_is_numeric) def describe(self, percentiles: Sequence[float]) -> DataFrame: @@ -381,8 +384,3 @@ def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float raise ValueError("percentiles cannot contain duplicates") return unique_pcts - - -def validate_frame(frame: "DataFrame") -> None: - if frame.ndim == 2 and frame.columns.size == 0: - raise ValueError("Cannot describe a DataFrame without columns") From 7097dd3d09f62f30a61dfa2c3a52d87d7b42f80f Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 18 Jan 2021 10:29:38 +0700 Subject: [PATCH 6/6] CLN: fix - new type annotation --- pandas/core/describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 837cd11918bc2..09862b72c4a4f 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -97,7 +97,7 @@ def __init__(self, obj: "FrameOrSeriesUnion", datetime_is_numeric: bool): self.datetime_is_numeric = datetime_is_numeric @abstractmethod - def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion": + def describe(self, percentiles: Sequence[float]) -> FrameOrSeriesUnion: """Do describe either series or dataframe. Parameters