Skip to content

Commit 3fd020c

Browse files
authored
BUG: ArrowExtensionArray.mode(dropna=False) not respecting NAs (#50986)
* BUG: ArrowExtensionArray.mode(dropna=False) not respecting NAs
* Fix tests
* remove parameterization
* Fix whatsnew
* Use value_counts
* Remove unneeded xfail
* Dropna after
* Revert "Dropna after"
  This reverts commit 1a680a8.
* Remove unused request
1 parent 2070bb8 commit 3fd020c

File tree

3 files changed

+23
-29
lines changed

3 files changed

+23
-29
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1229,6 +1229,7 @@ Numeric
12291229
- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` where the result would not be coerced to float (:issue:`49551`)
12301230
- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`)
12311231
- Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`)
1232+
- Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there were ``NA`` values (:issue:`50982`)
12321233
- Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` where column names ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`)
12331234

12341235
Conversion

pandas/core/arrays/arrow/array.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -1370,7 +1370,6 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
13701370
----------
13711371
dropna : bool, default True
13721372
Don't consider counts of NA values.
1373-
Not implemented by pyarrow.
13741373
13751374
Returns
13761375
-------
@@ -1389,12 +1388,13 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
13891388
else:
13901389
data = self._data
13911390

1392-
modes = pc.mode(data, pc.count_distinct(data).as_py())
1393-
values = modes.field(0)
1394-
counts = modes.field(1)
1395-
# counts sorted descending i.e counts[0] = max
1396-
mask = pc.equal(counts, counts[0])
1397-
most_common = values.filter(mask)
1391+
if dropna:
1392+
data = data.drop_null()
1393+
1394+
res = pc.value_counts(data)
1395+
most_common = res.field("values").filter(
1396+
pc.equal(res.field("counts"), pc.max(res.field("counts")))
1397+
)
13981398

13991399
if pa.types.is_temporal(pa_type):
14001400
most_common = most_common.cast(pa_type)

pandas/tests/extension/test_arrow.py

+15-22
Original file line numberDiff line numberDiff line change
@@ -1341,38 +1341,31 @@ def test_quantile(data, interpolation, quantile, request):
13411341
tm.assert_series_equal(result, expected)
13421342

13431343

1344-
@pytest.mark.parametrize("dropna", [True, False])
13451344
@pytest.mark.parametrize(
13461345
"take_idx, exp_idx",
1347-
[[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
1346+
[[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]],
13481347
ids=["multi_mode", "single_mode"],
13491348
)
1350-
def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request):
1351-
pa_dtype = data_for_grouping.dtype.pyarrow_dtype
1352-
if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype):
1353-
request.node.add_marker(
1354-
pytest.mark.xfail(
1355-
raises=pa.ArrowNotImplementedError,
1356-
reason=f"mode not supported by pyarrow for {pa_dtype}",
1357-
)
1358-
)
1359-
elif (
1360-
pa.types.is_boolean(pa_dtype)
1361-
and "multi_mode" in request.node.nodeid
1362-
and pa_version_under9p0
1363-
):
1364-
request.node.add_marker(
1365-
pytest.mark.xfail(
1366-
reason="https://issues.apache.org/jira/browse/ARROW-17096",
1367-
)
1368-
)
1349+
def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx):
13691350
data = data_for_grouping.take(take_idx)
13701351
ser = pd.Series(data)
1371-
result = ser.mode(dropna=dropna)
1352+
result = ser.mode(dropna=True)
13721353
expected = pd.Series(data_for_grouping.take(exp_idx))
13731354
tm.assert_series_equal(result, expected)
13741355

13751356

1357+
def test_mode_dropna_false_mode_na(data):
1358+
# GH 50982
1359+
more_nans = pd.Series([None, None, data[0]], dtype=data.dtype)
1360+
result = more_nans.mode(dropna=False)
1361+
expected = pd.Series([None], dtype=data.dtype)
1362+
tm.assert_series_equal(result, expected)
1363+
1364+
expected = pd.Series([None, data[0]], dtype=data.dtype)
1365+
result = expected.mode(dropna=False)
1366+
tm.assert_series_equal(result, expected)
1367+
1368+
13761369
def test_is_bool_dtype():
13771370
# GH 22667
13781371
data = ArrowExtensionArray(pa.array([True, False, True]))

0 commit comments

Comments
 (0)