
CLN: Centralised _check_percentile #27584

Merged · 21 commits · merged on Oct 3, 2019
Changes from 13 commits
48 changes: 22 additions & 26 deletions pandas/core/algorithms.py
@@ -3,7 +3,7 @@
intended for public consumption
"""
from textwrap import dedent
from typing import Dict
from typing import Dict, Iterable, Union
from warnings import catch_warnings, simplefilter, warn

import numpy as np
@@ -114,10 +114,10 @@ def _ensure_data(values, dtype=None):

# datetimelike
if (
needs_i8_conversion(values)
or is_period_dtype(dtype)
or is_datetime64_any_dtype(dtype)
or is_timedelta64_dtype(dtype)
needs_i8_conversion(values)
Contributor: Why these changes?

Contributor Author: Sorry for these

or is_period_dtype(dtype)
or is_datetime64_any_dtype(dtype)
or is_timedelta64_dtype(dtype)
):
if is_period_dtype(values) or is_period_dtype(dtype):
from pandas import PeriodIndex
@@ -146,7 +146,7 @@ def _ensure_data(values, dtype=None):
return values.asi8, dtype, "int64"

elif is_categorical_dtype(values) and (
is_categorical_dtype(dtype) or dtype is None
is_categorical_dtype(dtype) or dtype is None
):
values = getattr(values, "values", values)
values = values.codes
@@ -248,7 +248,6 @@ def _get_hashtable_algo(values):


def _get_data_algo(values, func_map):

if is_categorical_dtype(values):
values = values._values_for_rank()

@@ -299,7 +298,6 @@ def match(to_match, values, na_sentinel=-1):
result = table.lookup(to_match)

if na_sentinel != -1:

# replace but return a numpy array
# use a Series because it handles dtype conversions properly
from pandas import Series
@@ -657,9 +655,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
values, dtype, _ = _ensure_data(values)

if (
is_datetime64_any_dtype(original)
or is_timedelta64_dtype(original)
or is_period_dtype(original)
is_datetime64_any_dtype(original)
or is_timedelta64_dtype(original)
or is_period_dtype(original)
):
na_value = na_value_for_dtype(original.dtype)
else:
@@ -690,7 +688,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):


def value_counts(
values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
):
"""
Compute a histogram of the counts of non-null values.
@@ -993,10 +991,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
else:
to_raise = (
((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
or (
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
).any()
((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
or (
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
).any()
)

if to_raise:
@@ -1129,8 +1127,8 @@ def is_valid_dtype_n_method(dtype):
nsmallest/nlargest methods
"""
return (
is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
) or needs_i8_conversion(dtype)
is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
) or needs_i8_conversion(dtype)


class SelectNSeries(SelectN):
@@ -1165,7 +1163,6 @@ def compute(self, method):

# slow method
if n >= len(self.obj):

Contributor: can you revert these files as they are not actually changed

Contributor Author (@hedonhermdev, Oct 1, 2019): isort fails if I don't commit them

Contributor: ok that's odd

reverse_it = self.keep == "last" or method == "nlargest"
ascending = method == "nsmallest"
slc = np.s_[::-1] if reverse_it else np.s_[:]
@@ -1603,7 +1600,7 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None):


def take_nd(
arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
):
"""
Specialized Cython take which sets NaN values in one pass
@@ -1720,7 +1717,7 @@ def take_nd(


def take_2d_multi(
arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
):
"""
Specialized Cython take which sets NaN values in one pass
@@ -1779,7 +1776,6 @@ def take_2d_multi(
if func is not None:
func = _convert_wrapper(func, out.dtype)
if func is None:

def func(arr, indexer, out, fill_value=np.nan):
_take_2d_multi_object(
arr, indexer, out, fill_value=fill_value, mask_info=mask_info
@@ -1842,9 +1838,9 @@ def searchsorted(arr, value, side="left", sorter=None):
sorter = ensure_platform_int(sorter)

if (
isinstance(arr, np.ndarray)
and is_integer_dtype(arr)
and (is_integer(value) or is_integer_dtype(value))
isinstance(arr, np.ndarray)
and is_integer_dtype(arr)
and (is_integer(value) or is_integer_dtype(value))
):
# if `arr` and `value` have different dtypes, `arr` would be
# recast by numpy, causing a slow search.
@@ -1864,7 +1860,7 @@ def searchsorted(arr, value, side="left", sorter=None):
else:
value = array(value, dtype=dtype)
elif not (
is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
Contributor: can you not add unrelated changes (all of this whitespace)

Contributor Author: Sorry for that.

):
from pandas.core.series import Series

4 changes: 2 additions & 2 deletions pandas/core/frame.py
@@ -32,7 +32,7 @@
deprecate_kwarg,
rewrite_axis_style_signature,
)
from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg
from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg, validate_percentile

from pandas.core.dtypes.cast import (
cast_scalar_to_array,
@@ -8225,7 +8225,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
C 1 days 12:00:00
Name: 0.5, dtype: object
"""
self._check_percentile(q)
validate_percentile(q)

data = self._get_numeric_data() if numeric_only else self
axis = self._get_axis_number(axis)
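With this change, DataFrame.quantile delegates validation of q to the shared validate_percentile helper. A minimal sketch of the behaviour this preserves (example data assumed for illustration):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4]})
df.quantile(0.25)   # q is validated first, then the quantile is computed as before
df.quantile(25)     # ValueError: percentiles should all be in the interval [0, 1]. Try 0.25 instead.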
19 changes: 2 additions & 17 deletions pandas/core/generic.py
@@ -31,7 +31,7 @@
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs, validate_percentile

from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
from pandas.core.dtypes.common import (
@@ -10169,7 +10169,7 @@ def describe(self, percentiles=None, include=None, exclude=None):
percentiles = list(percentiles)

# get them all to be in [0, 1]
self._check_percentile(percentiles)
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
@@ -10273,21 +10273,6 @@ def describe_1d(data):
d.columns = data.columns.copy()
return d

def _check_percentile(self, q):
"""
Validate percentiles (used by describe and quantile).
"""

msg = "percentiles should all be in the interval [0, 1]. Try {0} instead."
q = np.asarray(q)
if q.ndim == 0:
if not 0 <= q <= 1:
raise ValueError(msg.format(q / 100.0))
else:
if not all(0 <= qs <= 1 for qs in q):
raise ValueError(msg.format(q / 100.0))
return q

_shared_docs[
"pct_change"
] = """
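describe now routes its percentiles through the same validator before adding the median if it is missing. A short sketch, with example data assumed:

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5])
s.describe(percentiles=[0.05, 0.95])   # valid; 0.5 (the median) is included automatically
s.describe(percentiles=[5, 95])        # ValueError: percentiles should all be in the interval [0, 1]. Try [0.05 0.95] instead.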
4 changes: 2 additions & 2 deletions pandas/core/series.py
@@ -16,7 +16,7 @@
from pandas.compat import PY36
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, Substitution, deprecate
from pandas.util._validators import validate_bool_kwarg
from pandas.util._validators import validate_bool_kwarg, validate_percentile

from pandas.core.dtypes.common import (
_is_unorderable_exception,
@@ -2353,7 +2353,7 @@ def quantile(self, q=0.5, interpolation="linear"):
dtype: float64
"""

self._check_percentile(q)
validate_percentile(q)

# We dispatch to DataFrame so that core.internals only has to worry
# about 2D cases.
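Series.quantile validates q with the same helper and then dispatches to the DataFrame implementation, as noted in the comment above. For example (values assumed):

import pandas as pd

s = pd.Series([10, 20, 30])
s.quantile(0.5)        # 20.0 — validated, then computed via a 1-column DataFrame
s.quantile([10, 90])   # ValueError: percentiles should all be in the interval [0, 1]. Try [0.1 0.9] instead.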
34 changes: 34 additions & 0 deletions pandas/util/_validators.py
@@ -3,6 +3,8 @@
for validating data or function arguments
"""
import warnings
import numpy as np
from typing import Union, Iterable

from pandas.core.dtypes.common import is_bool

@@ -370,3 +372,35 @@ def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True):
raise ValueError("Cannot specify both 'value' and 'method'.")

return value, method


def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray:
"""
Validate percentiles (used by describe and quantile).

This function checks whether the given float or iterable of floats is a valid percentile;
otherwise, a ValueError is raised.

Parameters
----------
q : float or iterable of floats
A single percentile or an iterable of percentiles.

Returns
-------
ndarray
An ndarray of the percentiles if valid.

Raises
------
ValueError if percentiles are not in the interval [0, 1].
"""
msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead."
Member: Extra quotation marks before 'Try'

q_arr = np.asarray(q)
if q_arr.ndim == 0:
if not 0 <= q_arr <= 1:
raise ValueError(msg.format(q_arr / 100.0))
else:
if not all(0 <= qs <= 1 for qs in q_arr):
raise ValueError(msg.format(q_arr / 100.0))
return q_arr
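
For reference, a brief usage sketch of the new helper as defined above (expected results shown in comments):

from pandas.util._validators import validate_percentile

validate_percentile(0.5)            # returns array(0.5), a 0-d ndarray
validate_percentile([0.25, 0.75])   # returns array([0.25, 0.75])
validate_percentile(50)             # ValueError: percentiles should all be in the interval [0, 1]. Try 0.5 instead.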