Skip to content

MOVE: describe to pandas/core/describe.py #39102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions pandas/core/describe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
"""
Module responsible for execution of NDFrame.describe() method.

Method NDFrame.describe() delegates actual execution to function describe_ndframe().
"""

from typing import TYPE_CHECKING, List, Optional, Sequence, Union
import warnings

import numpy as np

from pandas._libs.tslibs import Timestamp
from pandas._typing import FrameOrSeries, Hashable
from pandas.util._validators import validate_percentile

from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_any_dtype,
is_numeric_dtype,
is_timedelta64_dtype,
)

from pandas.core.reshape.concat import concat

from pandas.io.formats.format import format_percentiles

if TYPE_CHECKING:
from pandas import Series


def describe_ndframe(
*,
obj: FrameOrSeries,
include: Optional[Union[str, Sequence[str]]],
exclude: Optional[Union[str, Sequence[str]]],
datetime_is_numeric: bool,
percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
"""Describe series or dataframe.

Called from pandas.core.generic.NDFrame.describe()

Parameters
----------
obj: DataFrame or Series
Either dataframe or series to be described.
include : 'all', list-like of dtypes or None (default), optional
A white list of data types to include in the result. Ignored for ``Series``.
exclude : list-like of dtypes or None (default), optional,
A black list of data types to omit from the result. Ignored for ``Series``.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric.
percentiles : list-like of numbers, optional
The percentiles to include in the output. All should fall between 0 and 1.
The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
75th percentiles.

Returns
-------
Dataframe or series description.
"""
if obj.ndim == 2 and obj.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")

if percentiles is not None:
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)

# get them all to be in [0, 1]
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)
percentiles = np.asarray(percentiles)
else:
percentiles = np.array([0.25, 0.5, 0.75])

# sort and check for duplicates
unique_pcts = np.unique(percentiles)
assert percentiles is not None
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")
percentiles = unique_pcts

formatted_percentiles = format_percentiles(percentiles)

def describe_numeric_1d(series) -> "Series":
from pandas import Series

stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
d = (
[series.count(), series.mean(), series.std(), series.min()]
+ series.quantile(percentiles).tolist()
+ [series.max()]
)
return Series(d, index=stat_index, name=series.name)

def describe_categorical_1d(data) -> "Series":
names = ["count", "unique"]
objcounts = data.value_counts()
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
if is_datetime64_any_dtype(data.dtype):
if obj.ndim == 1:
stacklevel = 5
else:
stacklevel = 6
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
names += ["top", "freq"]
result += [np.nan, np.nan]
dtype = "object"

from pandas import Series

return Series(result, index=names, name=data.name, dtype=dtype)

def describe_timestamp_1d(data) -> "Series":
# GH-30164
from pandas import Series

stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
d = (
[data.count(), data.mean(), data.min()]
+ data.quantile(percentiles).tolist()
+ [data.max()]
)
return Series(d, index=stat_index, name=data.name)

def describe_1d(data) -> "Series":
if is_bool_dtype(data.dtype):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
else:
return describe_categorical_1d(data)

if obj.ndim == 1:
# Incompatible return value type
# (got "Series", expected "FrameOrSeries") [return-value]
return describe_1d(obj) # type:ignore[return-value]
elif (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = obj.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = obj
elif include == "all":
if exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
data = obj
else:
data = obj.select_dtypes(include=include, exclude=exclude)

ldesc = [describe_1d(s) for _, s in data.items()]
# set a convenient order for rows
names: List[Hashable] = []
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
for idxnames in ldesc_indexes:
for name in idxnames:
if name not in names:
names.append(name)

d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
d.columns = data.columns.copy()
return d
159 changes: 10 additions & 149 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,7 @@
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError, InvalidIndexError
from pandas.util._decorators import doc, rewrite_axis_style_signature
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_percentile,
)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

from pandas.core.dtypes.common import (
ensure_int64,
Expand Down Expand Up @@ -95,6 +91,7 @@
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype, extract_array
from pandas.core.describe import describe_ndframe
from pandas.core.flags import Flags
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
Expand All @@ -113,11 +110,7 @@
from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window

from pandas.io.formats import format as fmt
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
format_percentiles,
)
from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
Expand Down Expand Up @@ -10084,145 +10077,13 @@ def describe(
75% NaN 2.5
max NaN 3.0
"""
if self.ndim == 2 and self.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")

if percentiles is not None:
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)

# get them all to be in [0, 1]
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)
percentiles = np.asarray(percentiles)
else:
percentiles = np.array([0.25, 0.5, 0.75])

# sort and check for duplicates
unique_pcts = np.unique(percentiles)
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")
percentiles = unique_pcts

formatted_percentiles = format_percentiles(percentiles)

def describe_numeric_1d(series) -> "Series":
stat_index = (
["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
)
d = (
[series.count(), series.mean(), series.std(), series.min()]
+ series.quantile(percentiles).tolist()
+ [series.max()]
)
return pd.Series(d, index=stat_index, name=series.name)

def describe_categorical_1d(data) -> "Series":
names = ["count", "unique"]
objcounts = data.value_counts()
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
if is_datetime64_any_dtype(data.dtype):
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
names += ["top", "freq"]
result += [np.nan, np.nan]
dtype = "object"

return pd.Series(result, index=names, name=data.name, dtype=dtype)

def describe_timestamp_1d(data) -> "Series":
# GH-30164
stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
d = (
[data.count(), data.mean(), data.min()]
+ data.quantile(percentiles).tolist()
+ [data.max()]
)
return pd.Series(d, index=stat_index, name=data.name)

def describe_1d(data) -> "Series":
if is_bool_dtype(data.dtype):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
else:
return describe_categorical_1d(data)

if self.ndim == 1:
# Incompatible return value type
# (got "Series", expected "FrameOrSeries") [return-value]
return describe_1d(self) # type:ignore[return-value]
elif (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = self.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self
elif include == "all":
if exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
data = self
else:
data = self.select_dtypes(include=include, exclude=exclude)

ldesc = [describe_1d(s) for _, s in data.items()]
# set a convenient order for rows
names: List[Hashable] = []
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
for idxnames in ldesc_indexes:
for name in idxnames:
if name not in names:
names.append(name)

d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
d.columns = data.columns.copy()
return d
return describe_ndframe(
obj=self,
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
percentiles=percentiles,
)

@final
def pct_change(
Expand Down