Skip to content

ENH: Add ignore_index for df.sort_values and series.sort_values #30402

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Dec 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
1f9a4ce
Merge remote-tracking branch 'upstream/master' into fix_issue_30114
charlesdong1991 Dec 9, 2019
5f924c3
Merge remote-tracking branch 'upstream/master' into fix_issue_30114
charlesdong1991 Dec 22, 2019
d0a134f
Add ignore index for sort values
charlesdong1991 Dec 22, 2019
6d52765
black reformat
charlesdong1991 Dec 22, 2019
a31797d
add tests
charlesdong1991 Dec 22, 2019
b80f380
remove type hint to see if test passes
charlesdong1991 Dec 22, 2019
b997d3f
code change based on WA review
charlesdong1991 Dec 23, 2019
12d1260
restore reformat change on other parts
charlesdong1991 Dec 23, 2019
4ff2493
revert change
charlesdong1991 Dec 23, 2019
e9d63f4
change bool
charlesdong1991 Dec 23, 2019
70ffec7
remove annotation
charlesdong1991 Dec 23, 2019
b4245d7
remove for series
charlesdong1991 Dec 23, 2019
f9e7ec2
add ignore_index for series
charlesdong1991 Dec 23, 2019
d95a89f
keep consistency
charlesdong1991 Dec 23, 2019
f241e67
revert change
charlesdong1991 Dec 23, 2019
7f9846a
resolve conflict
charlesdong1991 Dec 23, 2019
bbb4754
restore change
charlesdong1991 Dec 23, 2019
3c37eb9
code change based on WA and JR reviews
charlesdong1991 Dec 24, 2019
4ce9f43
better english
charlesdong1991 Dec 24, 2019
0f89aa2
skip annotation
charlesdong1991 Dec 24, 2019
d02b651
code change on JR review
charlesdong1991 Dec 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ Other enhancements
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)
- DataFrame constructor preserves ``ExtensionArray`` dtype with ``ExtensionArray`` (:issue:`11363`)
- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained the ``ignore_index`` keyword to reset the index after sorting (:issue:`30114`)


Build Changes
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4704,6 +4704,7 @@ def sort_values(
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
inplace = validate_bool_kwarg(inplace, "inplace")
axis = self._get_axis_number(axis)
Expand Down Expand Up @@ -4737,6 +4738,9 @@ def sort_values(
indexer, axis=self._get_block_manager_axis(axis), verify=False
)

if ignore_index:
new_data.axes[1] = ibase.default_index(len(indexer))

if inplace:
return self._update_inplace(new_data)
else:
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4087,6 +4087,7 @@ def sort_values(
inplace: bool_t = False,
kind: str = "quicksort",
na_position: str = "last",
ignore_index: bool_t = False,
):
"""
Sort by the values along either axis.
Expand All @@ -4109,6 +4110,10 @@ def sort_values(
na_position : {'first', 'last'}, default 'last'
Puts NaNs at the beginning if `first`; `last` puts NaNs at the
end.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.

.. versionadded:: 1.0.0

Returns
-------
Expand Down
20 changes: 14 additions & 6 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2693,6 +2693,7 @@ def sort_values(
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
"""
Sort by the values.
Expand All @@ -2715,6 +2716,10 @@ def sort_values(
na_position : {'first', 'last'}, default 'last'
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
the end.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.

.. versionadded:: 1.0.0

Returns
-------
Expand Down Expand Up @@ -2820,7 +2825,7 @@ def _try_kind_sort(arr):
return arr.argsort(kind="quicksort")

arr = self._values
sortedIdx = np.empty(len(self), dtype=np.int32)
sorted_index = np.empty(len(self), dtype=np.int32)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Orthogonal to this, but I wonder why this is specified as np.int32; it might be a bug for input with more than 2 ** 31 - 1 rows.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numpy uses int32 for argsort, so this is the best we can do (to be honest, 2B is enough)


bad = isna(arr)

Expand All @@ -2844,16 +2849,19 @@ def _try_kind_sort(arr):

if na_position == "last":
n = good.sum()
sortedIdx[:n] = idx[good][argsorted]
sortedIdx[n:] = idx[bad]
sorted_index[:n] = idx[good][argsorted]
sorted_index[n:] = idx[bad]
elif na_position == "first":
n = bad.sum()
sortedIdx[n:] = idx[good][argsorted]
sortedIdx[:n] = idx[bad]
sorted_index[n:] = idx[good][argsorted]
sorted_index[:n] = idx[bad]
else:
raise ValueError(f"invalid na_position: {na_position}")

result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx])
result = self._constructor(arr[sorted_index], index=self.index[sorted_index])

if ignore_index:
result.index = ibase.default_index(len(sorted_index))

if inplace:
self._update_inplace(result)
Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/frame/methods/test_sort_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,3 +460,45 @@ def test_sort_values_na_position_with_categories_raises(self):

with pytest.raises(ValueError):
df.sort_values(by="c", ascending=False, na_position="bad_position")

# Each parametrized case is a tuple of:
#   original_dict - column data used to build the input DataFrame
#   sorted_dict   - column data expected after sorting by "A" descending
#   ignore_index  - value passed to sort_values(ignore_index=...)
#   output_index  - index labels expected on the sorted result
@pytest.mark.parametrize(
"original_dict, sorted_dict, ignore_index, output_index",
[
# Single column, ignore_index=True: result is relabeled 0, 1, 2.
({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]),
# Single column, ignore_index=False: rows keep their original labels.
({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]),
# Two columns, ignore_index=True.
(
{"A": [1, 2, 3], "B": [2, 3, 4]},
{"A": [3, 2, 1], "B": [4, 3, 2]},
True,
[0, 1, 2],
),
# Two columns, ignore_index=False.
(
{"A": [1, 2, 3], "B": [2, 3, 4]},
{"A": [3, 2, 1], "B": [4, 3, 2]},
False,
[2, 1, 0],
),
],
)
def test_sort_values_ignore_index(
self, original_dict, sorted_dict, ignore_index, output_index
):
# Checks DataFrame.sort_values with the ignore_index keyword, both when a
# new frame is returned and when sorting with inplace=True.
# GH 30114
df = DataFrame(original_dict)
expected = DataFrame(sorted_dict, index=output_index)

# Test when inplace is False
sorted_df = df.sort_values("A", ascending=False, ignore_index=ignore_index)
tm.assert_frame_equal(sorted_df, expected)

# The non-inplace call must leave the source frame as it was built.
tm.assert_frame_equal(df, DataFrame(original_dict))

# Test when inplace is True
# Sort a copy so that `df` can still be compared against the original data.
copied_df = df.copy()

copied_df.sort_values(
"A", ascending=False, ignore_index=ignore_index, inplace=True
)
tm.assert_frame_equal(copied_df, expected)

# Sorting the copy in place must not have modified `df`.
tm.assert_frame_equal(df, DataFrame(original_dict))
27 changes: 27 additions & 0 deletions pandas/tests/series/methods/test_sort_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,30 @@ def test_sort_values_categorical(self):
result = df.sort_values(by=["grade", "id"])
expected = df.iloc[[2, 1, 5, 4, 3, 0]]
tm.assert_frame_equal(result, expected)

# Each parametrized case is a tuple of:
#   original_list - values used to build the input Series
#   sorted_list   - values expected after a descending sort
#   ignore_index  - value passed to sort_values(ignore_index=...)
#   output_index  - index labels expected on the sorted result
@pytest.mark.parametrize(
"original_list, sorted_list, ignore_index, output_index",
[
# ignore_index=True: result is relabeled 0, 1, 2, 3.
([2, 3, 6, 1], [6, 3, 2, 1], True, [0, 1, 2, 3]),
# ignore_index=False: values keep their original positional labels.
([2, 3, 6, 1], [6, 3, 2, 1], False, [2, 1, 0, 3]),
],
)
def test_sort_values_ignore_index(
self, original_list, sorted_list, ignore_index, output_index
):
# Checks Series.sort_values with the ignore_index keyword, both when a
# new Series is returned and when sorting with inplace=True.
# GH 30114
sr = Series(original_list)
expected = Series(sorted_list, index=output_index)

# Test when inplace is False
sorted_sr = sr.sort_values(ascending=False, ignore_index=ignore_index)
tm.assert_series_equal(sorted_sr, expected)

# The non-inplace call must leave the source Series as it was built.
tm.assert_series_equal(sr, Series(original_list))

# Test when inplace is True
# Sort a copy so that `sr` can still be compared against the original data.
copied_sr = sr.copy()
copied_sr.sort_values(ascending=False, ignore_index=ignore_index, inplace=True)
tm.assert_series_equal(copied_sr, expected)

# Sorting the copy in place must not have modified `sr`.
tm.assert_series_equal(sr, Series(original_list))