-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: Centralised _check_percentile #27584
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
69ef619
715ac7d
de8a4ab
7db44a4
350a624
4b4ca39
79c407d
146ebc5
8807706
b0a02e4
d4d0e88
7870f75
93a7970
5b0122f
946ee3f
3c56c6b
786e172
d81a08d
631a049
f66f314
4e399c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
intended for public consumption | ||
""" | ||
from textwrap import dedent | ||
from typing import Dict | ||
from typing import Dict, Iterable, Union | ||
from warnings import catch_warnings, simplefilter, warn | ||
|
||
import numpy as np | ||
|
@@ -114,10 +114,10 @@ def _ensure_data(values, dtype=None): | |
|
||
# datetimelike | ||
if ( | ||
needs_i8_conversion(values) | ||
or is_period_dtype(dtype) | ||
or is_datetime64_any_dtype(dtype) | ||
or is_timedelta64_dtype(dtype) | ||
needs_i8_conversion(values) | ||
or is_period_dtype(dtype) | ||
or is_datetime64_any_dtype(dtype) | ||
or is_timedelta64_dtype(dtype) | ||
): | ||
if is_period_dtype(values) or is_period_dtype(dtype): | ||
from pandas import PeriodIndex | ||
|
@@ -146,7 +146,7 @@ def _ensure_data(values, dtype=None): | |
return values.asi8, dtype, "int64" | ||
|
||
elif is_categorical_dtype(values) and ( | ||
is_categorical_dtype(dtype) or dtype is None | ||
is_categorical_dtype(dtype) or dtype is None | ||
): | ||
values = getattr(values, "values", values) | ||
values = values.codes | ||
|
@@ -248,7 +248,6 @@ def _get_hashtable_algo(values): | |
|
||
|
||
def _get_data_algo(values, func_map): | ||
|
||
if is_categorical_dtype(values): | ||
values = values._values_for_rank() | ||
|
||
|
@@ -299,7 +298,6 @@ def match(to_match, values, na_sentinel=-1): | |
result = table.lookup(to_match) | ||
|
||
if na_sentinel != -1: | ||
|
||
# replace but return a numpy array | ||
# use a Series because it handles dtype conversions properly | ||
from pandas import Series | ||
|
@@ -657,9 +655,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
values, dtype, _ = _ensure_data(values) | ||
|
||
if ( | ||
is_datetime64_any_dtype(original) | ||
or is_timedelta64_dtype(original) | ||
or is_period_dtype(original) | ||
is_datetime64_any_dtype(original) | ||
or is_timedelta64_dtype(original) | ||
or is_period_dtype(original) | ||
): | ||
na_value = na_value_for_dtype(original.dtype) | ||
else: | ||
|
@@ -690,7 +688,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
|
||
|
||
def value_counts( | ||
values, sort=True, ascending=False, normalize=False, bins=None, dropna=True | ||
values, sort=True, ascending=False, normalize=False, bins=None, dropna=True | ||
): | ||
""" | ||
Compute a histogram of the counts of non-null values. | ||
|
@@ -993,10 +991,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): | |
to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() | ||
else: | ||
to_raise = ( | ||
((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() | ||
or ( | ||
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] | ||
).any() | ||
((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() | ||
or ( | ||
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] | ||
).any() | ||
) | ||
|
||
if to_raise: | ||
|
@@ -1129,8 +1127,8 @@ def is_valid_dtype_n_method(dtype): | |
nsmallest/nlargest methods | ||
""" | ||
return ( | ||
is_numeric_dtype(dtype) and not is_complex_dtype(dtype) | ||
) or needs_i8_conversion(dtype) | ||
is_numeric_dtype(dtype) and not is_complex_dtype(dtype) | ||
) or needs_i8_conversion(dtype) | ||
|
||
|
||
class SelectNSeries(SelectN): | ||
|
@@ -1165,7 +1163,6 @@ def compute(self, method): | |
|
||
# slow method | ||
if n >= len(self.obj): | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you revert these files, as they are not actually changed? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. isort fails if I don't commit them. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, that's odd. |
||
reverse_it = self.keep == "last" or method == "nlargest" | ||
ascending = method == "nsmallest" | ||
slc = np.s_[::-1] if reverse_it else np.s_[:] | ||
|
@@ -1603,7 +1600,7 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): | |
|
||
|
||
def take_nd( | ||
arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True | ||
arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True | ||
): | ||
""" | ||
Specialized Cython take which sets NaN values in one pass | ||
|
@@ -1720,7 +1717,7 @@ def take_nd( | |
|
||
|
||
def take_2d_multi( | ||
arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True | ||
arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True | ||
): | ||
""" | ||
Specialized Cython take which sets NaN values in one pass | ||
|
@@ -1779,7 +1776,6 @@ def take_2d_multi( | |
if func is not None: | ||
func = _convert_wrapper(func, out.dtype) | ||
if func is None: | ||
|
||
def func(arr, indexer, out, fill_value=np.nan): | ||
_take_2d_multi_object( | ||
arr, indexer, out, fill_value=fill_value, mask_info=mask_info | ||
|
@@ -1842,9 +1838,9 @@ def searchsorted(arr, value, side="left", sorter=None): | |
sorter = ensure_platform_int(sorter) | ||
|
||
if ( | ||
isinstance(arr, np.ndarray) | ||
and is_integer_dtype(arr) | ||
and (is_integer(value) or is_integer_dtype(value)) | ||
isinstance(arr, np.ndarray) | ||
and is_integer_dtype(arr) | ||
and (is_integer(value) or is_integer_dtype(value)) | ||
): | ||
# if `arr` and `value` have different dtypes, `arr` would be | ||
# recast by numpy, causing a slow search. | ||
|
@@ -1864,7 +1860,7 @@ def searchsorted(arr, value, side="left", sorter=None): | |
else: | ||
value = array(value, dtype=dtype) | ||
elif not ( | ||
is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) | ||
is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you not add unrelated changes (all of this whitespace)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry for that. |
||
): | ||
from pandas.core.series import Series | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
for validating data or function arguments | ||
""" | ||
import warnings | ||
import numpy as np | ||
from typing import Union, Iterable | ||
|
||
from pandas.core.dtypes.common import is_bool | ||
|
||
|
@@ -370,3 +372,35 @@ def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): | |
raise ValueError("Cannot specify both 'value' and 'method'.") | ||
|
||
return value, method | ||
|
||
|
||
def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray:
    """
    Validate percentiles (used by describe and quantile).

    Check that the given float, or every float in the given iterable, lies
    in the closed interval [0, 1]; otherwise raise a ValueError.

    Parameters
    ----------
    q : float or iterable of floats
        A single percentile or an iterable of percentiles.

    Returns
    -------
    ndarray
        ``np.asarray(q)``, returned unchanged once every value has been
        checked.

    Raises
    ------
    ValueError
        If any percentile is outside the interval [0, 1] (NaN included,
        because it fails both comparisons).
    """
    msg = "percentiles should all be in the interval [0, 1]. Try {0} instead."

    q_arr = np.asarray(q)
    # ravel() gives a flat view for any dimensionality, so a 0-d scalar, a
    # 1-d sequence and nested input all go through the same element-wise
    # check instead of separate (and duplicated) branches.
    if not all(0 <= value <= 1 for value in q_arr.ravel()):
        # Out-of-range input is typically given on a 0-100 scale, so the
        # message suggests the same values divided by 100.
        raise ValueError(msg.format(q_arr / 100.0))
    return q_arr
hedonhermdev marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why these changes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for these