diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8db9be21ca4ef..71e8a1cc3cfd3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -276,6 +276,62 @@ Now null-values are no longer mangled. res +.. _whatsnew_140.notable_bug_fixes.argsort_bug_fix: + +Argsort now returns NumPy like +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The current implementation of :meth:`Series.argsort` suffers from two main issues: + + - Returning a useful argsort array which is only applicable to the values *after* null elements have been removed. + - A confusing and static index for the returned Series object which has, in general, random argsort indexes. + +For example with: + +.. ipython:: python + + ser = pd.Series([2, np.nan, np.nan, 1], index=["high", "null", "null2", "low"]) + +*Previous behavior*: + +.. code-block:: ipython + + In [5]: ser.argsort() + Out[5]: + high 1 + null -1 + null2 -1 + low 0 + dtype: int64 + +Firstly we observe that, for elements that are not null, the argsort has returned *1* and *0*. These are the +correct sorting indices when nulls are removed, i.e. the lowest element has index *1* and the highest element has index *0*. +If the nulls are not removed the sorting indices for the lowest element is actually *3* and the highest is *0*. + +Secondly, the argsort values for null elements returned is *-1* and those values have been positioned in the array +associated with the previous index labels for "null" and "null2". As indicated above, however, the relationship +between the index and the argsort values does not hold for the not nulls values, since the index is not sorted. + +*New behaviour*: + +The new behaviour replicates the behaviour in ``numpy.argsort``, whilst extending the functionality to work +with pandas nullable data types, and providing a *-1* return value for part backwards compatibility, and also +sorting the index to maintain a sensible and useful association with the index and argsort value. + +.. ipython:: python + + ser.argsort() + +The additional ``na_position`` argument allows ``numpy`` replication with an associated series index. + +.. ipython:: python + + ser.argsort(na_position="first") + +.. ipython:: python + + ser.argsort(na_position="last") + .. _whatsnew_140.notable_bug_fixes.notable_bug_fix3: notable_bug_fix3 @@ -575,6 +631,7 @@ Indexing - Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`) - Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`) - Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`) +- Bug in :class:`Series.argsort` where the index was misaligned and the indexed values were not correct (:issue:`12694`, :issue:`42056`) - Missing diff --git a/pandas/core/series.py b/pandas/core/series.py index b3c9167bfbbab..90e859e56eaa1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3649,12 +3649,12 @@ def sort_index( key, ) - def argsort(self, axis=0, kind="quicksort", order=None) -> Series: + def argsort(self, axis=0, kind="quicksort", order=None, na_position=None) -> Series: """ Return the integer indices that would sort the Series values. - Override ndarray.argsort. Argsorts the value, omitting NA/null values, - and places the result in the same locations as the non-NA values. + Override ndarray.argsort. The index is also sorted so that index labels + correspond to the integer indices. Parameters ---------- @@ -3665,29 +3665,107 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + na_position : {None, "first", "last"} + Puts NaNs at the beginning if *first*; *last* puts NaNs at the end. + Defaults to *None*, which puts NaNs at the end an gives them all a sorting + index of '-1'. + + .. versionadded:: 1.4.0 Returns ------- Series[np.intp] - Positions of values within the sort order with -1 indicating - nan values. + Positions of values within the sort order with associated sorted index. See Also -------- numpy.ndarray.argsort : Returns the indices that would sort this array. - """ - values = self._values - mask = isna(values) + Series.idxmax : Return the row label of the maximum value. + Series.idxmin : Return the row label of the minimum value. - if mask.any(): - result = np.full(len(self), -1, dtype=np.intp) - notmask = ~mask - result[notmask] = np.argsort(values[notmask], kind=kind) + Examples + -------- + Argsorting a basic Series. + + >>> series = Series([30, 10, 20], index=["high", "low", "mid"], name="xy") + >>> series.argsort() + low 1 + mid 2 + high 0 + Name: xy, dtype: int64 + + Argsorting a Series with null values. + + >>> series = Series([30, 10, np.nan, 20], name="xy", + ... index=["high", "low", "null", "mid"]) + >>> series.argsort() + low 1 + mid 3 + high 0 + null -1 + Name: xy, dtype: int64 + + Argsorting a Series using ``na_position`` + + >>> series.argsort(na_position="first") + null 2 + low 1 + mid 3 + high 0 + Name: xy, dtype: int64 + """ + values = self.values + na_mask = isna(values) + n_na = na_mask.sum() + if n_na == 0: # number of NaN values is zero + res = np.argsort(values, kind=kind) + res_ser = Series(res, index=self.index[res], dtype=np.intp, name=self.name) + return res_ser.__finalize__(self, method="argsort") else: - result = np.argsort(values, kind=kind) + # GH 42090 + if isinstance(na_mask, pandas.core.arrays.sparse.SparseArray): + # avoid RecursionError + na_mask = np.asarray(na_mask) + + # Do the not_na argsort: + # count the missing index values within arrays added to not_na results + notna_na_cumsum = na_mask.cumsum()[~na_mask] + # argsort the values excluding the nans + notna_argsort = np.argsort(values[~na_mask]) + # add to these the indexes where nans have been removed + notna_argsort += notna_na_cumsum[notna_argsort] + + # Do the na argsort: + if na_position is None: + na_argsort = -1 + elif na_position == "first" or na_position == "last": + # count the missing index values within arrays added to na results + na_notna_cumsum = (~na_mask).cumsum()[na_mask] + # argsort the nans + na_argsort = np.arange(n_na) + # add to these the indexes where not nans have been removed + na_argsort += na_notna_cumsum + else: + raise ValueError("`na_position` must be one of {'first', 'last', None}") - res = self._constructor(result, index=self.index, name=self.name, dtype=np.intp) - return res.__finalize__(self, method="argsort") + # create and combine the Series: + na_res_ser = Series( + na_argsort, index=self.index[na_mask], dtype=np.intp, name=self.name + ) + notna_res_ser = Series( + notna_argsort, + index=self.index[notna_argsort], + dtype="int64", + name=self.name, + ) + from pandas.core.reshape.concat import concat + + concat_order = [notna_res_ser, na_res_ser] + if na_position == "first": + concat_order = [na_res_ser, notna_res_ser] + ret_ser = concat(concat_order).__finalize__(self, method="argsort") + assert isinstance(ret_ser, Series) # mypy: concat 2 Series so is OK + return ret_ser def nlargest(self, n=5, keep="first") -> Series: """ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c96e2fb49e397..02d5e96cb0e88 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -72,7 +72,7 @@ def test_apply_simple_series(self, data): def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() # argsort result gets passed to take, so should be np.intp - expected = pd.Series(np.array([2, 0, 1], dtype=np.intp)) + expected = pd.Series(np.array([2, 0, 1], dtype=np.intp), index=[2, 0, 1]) self.assert_series_equal(result, expected) def test_argsort_missing_array(self, data_missing_for_sorting): @@ -83,7 +83,7 @@ def test_argsort_missing_array(self, data_missing_for_sorting): def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) + expected = pd.Series(np.array([2, 0, -1], dtype=np.intp), index=[2, 0, 1]) self.assert_series_equal(result, expected) def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 1fbc9ed787e11..2776120dad83c 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -20,8 +20,9 @@ def _check_accum_op(self, name, ser, check_dtype=True): ts = ser.copy() ts[::2] = np.NaN - result = func(ts)[1::2] - expected = func(np.array(ts.dropna())) + result = func(ts) + expected = func(ts.to_numpy()) + expected[int(len(expected) / 2) :] = -1 tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) @@ -53,8 +54,12 @@ def test_argsort_stable(self): mexpected = np.argsort(s.values, kind="mergesort") qexpected = np.argsort(s.values, kind="quicksort") - tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected)) - tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) + tm.assert_series_equal( + mindexer.astype(np.intp), Series(mexpected, index=s.index[mexpected]) + ) + tm.assert_series_equal( + qindexer.astype(np.intp), Series(qexpected, index=s.index[qexpected]) + ) msg = ( r"ndarray Expected type , " r"found instead" @@ -65,3 +70,20 @@ def test_argsort_stable(self): def test_argsort_preserve_name(self, datetime_series): result = datetime_series.argsort() assert result.name == datetime_series.name + + def test_na_pos_raises(self): + s = Series([0, np.nan]) + with pytest.raises(ValueError, match="`na_position` must be one of"): + s.argsort(na_position="bad input") + + @pytest.mark.parametrize( + "na_position, expected", + [ + (None, Series([2, 0, -1, -1], index=["c", "a", "b", "d"], dtype=np.intp)), + ("first", Series([1, 3, 2, 0], index=["b", "d", "c", "a"], dtype=np.intp)), + ("last", Series([2, 0, 1, 3], index=["c", "a", "b", "d"], dtype=np.intp)), + ], + ) + def test_na_position(self, na_position, expected): + s = Series([2, np.nan, 1, np.nan], index=["a", "b", "c", "d"]) + tm.assert_series_equal(s.argsort(na_position=na_position), expected)