-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[ENH] nargsort handles EA with its _values_for_argsort #26854
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
8f9637c
4460191
2f37a63
707d9e3
5ce8a29
42d7092
f84aad8
7543a84
a65fea3
365afef
fc99129
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,4 @@ | ||
""" miscellaneous sorting / groupby utilities """ | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import algos, hashtable, lib | ||
|
@@ -9,7 +7,8 @@ | |
from pandas.core.dtypes.cast import infer_dtype_from_array | ||
from pandas.core.dtypes.common import ( | ||
ensure_int64, ensure_platform_int, is_categorical_dtype, | ||
is_extension_array_dtype, is_list_like) | ||
is_extension_array_dtype, is_list_like, is_sparse) | ||
from pandas.core.dtypes.generic import ABCIndexClass | ||
from pandas.core.dtypes.missing import isna | ||
|
||
import pandas.core.algorithms as algorithms | ||
|
@@ -239,12 +238,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): | |
GH #6399, #5231 | ||
""" | ||
|
||
mask = isna(items) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. change this to
|
||
# specially handle Categorical | ||
if is_categorical_dtype(items): | ||
if na_position not in {'first', 'last'}: | ||
raise ValueError('invalid na_position: {!r}'.format(na_position)) | ||
|
||
mask = isna(items) | ||
cnt_null = mask.sum() | ||
sorted_idx = items.argsort(ascending=ascending, kind=kind) | ||
if ascending and na_position == 'last': | ||
|
@@ -255,15 +254,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): | |
sorted_idx = np.roll(sorted_idx, cnt_null) | ||
return sorted_idx | ||
|
||
with warnings.catch_warnings(): | ||
# https://github.com/pandas-dev/pandas/issues/25439 | ||
# can be removed once ExtensionArrays are properly handled by nargsort | ||
warnings.filterwarnings( | ||
"ignore", category=FutureWarning, | ||
message="Converting timezone-aware DatetimeArray to") | ||
if (not isinstance(items, ABCIndexClass) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One more question: why do we have this check for ABCIndexClass here? Shouldn't they be treated the same? I think it'd be more correct to do
on all inputs. This will get an ExtensionArray from an Index or Series. Then if the new There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
and is_extension_array_dtype(items)): | ||
|
||
if is_sparse(items): | ||
# The conversion to np.ndarray is due to the fact that | ||
# SparseArray.isna() is also a SparseArray | ||
mask = np.array(isna(items)) | ||
|
||
items = items._values_for_argsort() | ||
else: | ||
items = np.asanyarray(items) | ||
|
||
idx = np.arange(len(items)) | ||
mask = isna(items) | ||
non_nans = items[~mask] | ||
non_nan_idx = idx[~mask] | ||
nan_idx = np.nonzero(mask)[0] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
import pytest | ||
|
||
import pandas as pd | ||
from pandas.core.sorting import nargsort | ||
import pandas.util.testing as tm | ||
|
||
from .base import BaseExtensionTests | ||
|
@@ -44,6 +45,15 @@ def test_argsort_missing(self, data_missing_for_sorting): | |
expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) | ||
self.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('na_position, expected', [ | ||
('last', np.array([2, 0, 1])), | ||
('first', np.array([1, 2, 0])) | ||
]) | ||
def test_nargsort(self, data_missing_for_sorting, na_position, expected): | ||
simonjayhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# GH 25439 | ||
result = nargsort(data_missing_for_sorting, na_position=na_position) | ||
tm.assert_numpy_array_equal(result, expected, check_dtype=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is If you change the expected to |
||
|
||
@pytest.mark.parametrize('ascending', [True, False]) | ||
def test_sort_values(self, data_for_sorting, ascending): | ||
ser = pd.Series(data_for_sorting) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,8 +6,7 @@ | |
from numpy import nan | ||
import pytest | ||
|
||
from pandas import ( | ||
DataFrame, MultiIndex, Series, array, concat, merge, to_datetime) | ||
from pandas import DataFrame, MultiIndex, Series, array, concat, merge | ||
from pandas.core import common as com | ||
from pandas.core.sorting import ( | ||
decons_group_index, get_group_index, is_int64_overflow_possible, | ||
|
@@ -181,13 +180,6 @@ def test_nargsort(self): | |
exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) | ||
tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) | ||
|
||
def test_nargsort_datetimearray_warning(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we not leave this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW, I don't see the warning on master. It seems to have changed in the meantime. Probably fine to just remove. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure that's cool |
||
# https://github.com/pandas-dev/pandas/issues/25439 | ||
# can be removed once the FutureWarning for np.array(DTA) is removed | ||
data = to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels') | ||
with tm.assert_produces_warning(None): | ||
nargsort(data) | ||
|
||
|
||
class TestMerge: | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC you had this different before, can you say that EA now argsorts Nans at the end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Referencing :meth:
pandas.api.extensions.ExtensionArray.argsort
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@makbigc can you update this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#26354 But the PR in which
ExtensionArray.argsort
places Nans at the end bynargsort
hasn't been merged. Should we add the reference in the whatsnew entry?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah got them mixed up. You can just remove this note then.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed.