Skip to content

Commit b7aaaf9

Browse files
authored
Backport PR pandas-dev#55621 on branch 2.1.x (BUG: mode not sorting values for arrow backed strings) (pandas-dev#55680)
BUG: mode not sorting values for arrow backed strings (pandas-dev#55621)

* BUG: mode not sorting values for arrow backed strings
* Fix tests
* Change to pa_installed variable
* Update pyarrow.py
* Fix
* Fix

(cherry picked from commit bb2d2e0)
1 parent 96ecfec commit b7aaaf9

File tree

4 files changed

+15
-9
lines changed

4 files changed

+15
-9
lines changed

doc/source/whatsnew/v2.1.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Bug fixes
3232
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
3333
- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`)
3434
- Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`)
35+
- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`)
3536
- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
3637
- Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`)
3738
- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)

pandas/core/arrays/arrow/array.py

+1
Original file line numberDiff line numberDiff line change
@@ -1886,6 +1886,7 @@ def _mode(self, dropna: bool = True) -> Self:
18861886
if pa.types.is_temporal(pa_type):
18871887
most_common = most_common.cast(pa_type)
18881888

1889+
most_common = most_common.take(pc.array_sort_indices(most_common))
18891890
return type(self)(most_common)
18901891

18911892
def _maybe_convert_setitem_value(self, value):

pandas/tests/extension/test_arrow.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1474,7 +1474,7 @@ def test_quantile(data, interpolation, quantile, request):
14741474

14751475
@pytest.mark.parametrize(
14761476
"take_idx, exp_idx",
1477-
[[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]],
1477+
[[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
14781478
ids=["multi_mode", "single_mode"],
14791479
)
14801480
def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx):
@@ -1492,7 +1492,7 @@ def test_mode_dropna_false_mode_na(data):
14921492
expected = pd.Series([None], dtype=data.dtype)
14931493
tm.assert_series_equal(result, expected)
14941494

1495-
expected = pd.Series([None, data[0]], dtype=data.dtype)
1495+
expected = pd.Series([data[0], None], dtype=data.dtype)
14961496
result = expected.mode(dropna=False)
14971497
tm.assert_series_equal(result, expected)
14981498

pandas/tests/groupby/test_groupby.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
import numpy as np
66
import pytest
77

8-
from pandas.compat import pa_version_under7p0
98
from pandas.errors import (
109
PerformanceWarning,
1110
SpecificationError,
1211
)
12+
import pandas.util._test_decorators as td
1313

1414
import pandas as pd
1515
from pandas import (
@@ -2518,10 +2518,7 @@ def test_groupby_column_index_name_lost(func):
25182518
"infer_string",
25192519
[
25202520
False,
2521-
pytest.param(
2522-
True,
2523-
marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
2524-
),
2521+
pytest.param(True, marks=td.skip_if_no("pyarrow")),
25252522
],
25262523
)
25272524
def test_groupby_duplicate_columns(infer_string):
@@ -2751,13 +2748,20 @@ def test_rolling_wrong_param_min_period():
27512748
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
27522749

27532750

2754-
def test_by_column_values_with_same_starting_value():
2751+
@pytest.mark.parametrize(
2752+
"dtype",
2753+
[
2754+
object,
2755+
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
2756+
],
2757+
)
2758+
def test_by_column_values_with_same_starting_value(dtype):
27552759
# GH29635
27562760
df = DataFrame(
27572761
{
27582762
"Name": ["Thomas", "Thomas", "Thomas John"],
27592763
"Credit": [1200, 1300, 900],
2760-
"Mood": ["sad", "happy", "happy"],
2764+
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
27612765
}
27622766
)
27632767
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}

0 commit comments

Comments
 (0)