Skip to content

CLN: Centralised _check_percentile #27584

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Oct 3, 2019
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
intended for public consumption
"""
from textwrap import dedent
from typing import Dict
from typing import Dict, Iterable, Union
from warnings import catch_warnings, simplefilter, warn

import numpy as np
Expand Down Expand Up @@ -1102,6 +1102,27 @@ def _get_score(at):
return result


def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would this make more sense in validators?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's reasonable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I shift it to util/_validators.py?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea let's do that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would renaming it to validate_percentile be better?

"""
Validate percentiles (used by describe and quantile).
Args:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just as an FYI this technically isn't a valid docstring as it doesn't follow the docstring guide:

https://pandas.pydata.org/pandas-docs/stable/development/contributing_docstring.html#docstring

Though since it's not part of the public API, I suppose it isn't failing CI. It would be nice to update it if you have to make other changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion, I did not know this. I'll update the docstring.

q: A single percentile or an iterable of percentiles.
Returns:
ndarray
An ndarray of the percentiles if valid.
Raises: ValueError if percentiles are not in given interval([0, 1]).
"""
msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead."
q_arr = np.asarray(q)
if q_arr.ndim == 0:
if not 0 <= q_arr <= 1:
raise ValueError(msg.format(q_arr / 100.0))
else:
if not all(0 <= qs <= 1 for qs in q_arr):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a large array this is going to be much less performant than `((0 <= q_arr) & (q_arr <= 1)).all()`.

raise ValueError(msg.format(q_arr / 100.0))
return q_arr


# --------------- #
# select n #
# --------------- #
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8225,7 +8225,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
C 1 days 12:00:00
Name: 0.5, dtype: object
"""
self._check_percentile(q)
algorithms.check_percentile(q)

data = self._get_numeric_data() if numeric_only else self
axis = self._get_axis_number(axis)
Expand Down
17 changes: 1 addition & 16 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10169,7 +10169,7 @@ def describe(self, percentiles=None, include=None, exclude=None):
percentiles = list(percentiles)

# get them all to be in [0, 1]
self._check_percentile(percentiles)
algos.check_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
Expand Down Expand Up @@ -10273,21 +10273,6 @@ def describe_1d(data):
d.columns = data.columns.copy()
return d

def _check_percentile(self, q):
"""
Validate percentiles (used by describe and quantile).
"""

msg = "percentiles should all be in the interval [0, 1]. Try {0} instead."
q = np.asarray(q)
if q.ndim == 0:
if not 0 <= q <= 1:
raise ValueError(msg.format(q / 100.0))
else:
if not all(0 <= qs <= 1 for qs in q):
raise ValueError(msg.format(q / 100.0))
return q

_shared_docs[
"pct_change"
] = """
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2353,7 +2353,7 @@ def quantile(self, q=0.5, interpolation="linear"):
dtype: float64
"""

self._check_percentile(q)
algorithms.check_percentile(q)

# We dispatch to DataFrame so that core.internals only has to worry
# about 2D cases.
Expand Down