Skip to content

Commit d88b90d

Browse files
authored
BUG: Fix segfault in GroupBy.count and DataFrame.count (#32842)
1 parent a44ac34 commit d88b90d

File tree

5 files changed

+57
-15
lines changed

5 files changed

+57
-15
lines changed

doc/source/whatsnew/v1.1.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ Numeric
326326
- Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`)
327327
- Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`)
328328
- Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`)
329-
-
329+
- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes a segmentation fault (:issue:`21824`)
330330

331331
Conversion
332332
^^^^^^^^^^
@@ -424,6 +424,7 @@ Groupby/resample/rolling
424424

425425
- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
426426
- Bug in :meth:`DataFrameGroupBy.transform` produces an incorrect result with transformation functions (:issue:`30918`)
427+
- Bug in :meth:`GroupBy.count` causes a segmentation fault when the grouped-by column contains NaNs (:issue:`32841`)
427428
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`)
428429
- Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
429430

pandas/_libs/lib.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -798,14 +798,16 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
798798
with nogil:
799799
for i in range(n):
800800
for j in range(k):
801-
counts[labels[i], j] += mask[i, j]
801+
if mask[i, j]:
802+
counts[labels[i], j] += 1
802803

803804
else: # axis == 1
804805
counts = np.zeros((n, max_bin), dtype='i8')
805806
with nogil:
806807
for i in range(n):
807808
for j in range(k):
808-
counts[i, labels[j]] += mask[i, j]
809+
if mask[i, j]:
810+
counts[i, labels[j]] += 1
809811

810812
return counts
811813

pandas/core/frame.py

+13-11
Original file line numberDiff line numberDiff line change
@@ -7891,34 +7891,36 @@ def _count_level(self, level, axis=0, numeric_only=False):
78917891
f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
78927892
)
78937893

7894+
# Mask NaNs: Mask rows or columns where the index level is NaN, and all
7895+
# values in the DataFrame that are NaN
78947896
if frame._is_mixed_type:
78957897
# Since we have mixed types, calling notna(frame.values) might
78967898
# upcast everything to object
7897-
mask = notna(frame).values
7899+
values_mask = notna(frame).values
78987900
else:
78997901
# But use the speedup when we have homogeneous dtypes
7900-
mask = notna(frame.values)
7902+
values_mask = notna(frame.values)
79017903

7904+
index_mask = notna(count_axis.get_level_values(level=level))
79027905
if axis == 1:
7903-
# We're transposing the mask rather than frame to avoid potential
7904-
# upcasts to object, which induces a ~20x slowdown
7905-
mask = mask.T
7906+
mask = index_mask & values_mask
7907+
else:
7908+
mask = index_mask.reshape(-1, 1) & values_mask
79067909

79077910
if isinstance(level, str):
79087911
level = count_axis._get_level_number(level)
79097912

79107913
level_name = count_axis._names[level]
79117914
level_index = count_axis.levels[level]._shallow_copy(name=level_name)
79127915
level_codes = ensure_int64(count_axis.codes[level])
7913-
counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)
7914-
7915-
result = DataFrame(counts, index=level_index, columns=agg_axis)
7916+
counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis)
79167917

79177918
if axis == 1:
7918-
# Undo our earlier transpose
7919-
return result.T
7919+
result = DataFrame(counts, index=agg_axis, columns=level_index)
79207920
else:
7921-
return result
7921+
result = DataFrame(counts, index=level_index, columns=agg_axis)
7922+
7923+
return result
79227924

79237925
def _reduce(
79247926
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds

pandas/tests/groupby/test_counting.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44
import pytest
55

6-
from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
6+
from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp
77
import pandas._testing as tm
88

99

@@ -220,3 +220,12 @@ def test_count_with_only_nans_in_first_group(self):
220220
mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
221221
expected = Series([], index=mi, dtype=np.int64, name="C")
222222
tm.assert_series_equal(result, expected, check_index_type=False)
223+
224+
def test_count_groupby_column_with_nan_in_groupby_column(self):
225+
# https://github.com/pandas-dev/pandas/issues/32841
226+
df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]})
227+
res = df.groupby(["B"]).count()
228+
expected = DataFrame(
229+
index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
230+
)
231+
tm.assert_frame_equal(expected, res)

pandas/tests/test_multilevel.py

+28
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,34 @@ def _check_counts(frame, axis=0):
248248
result = self.frame.count(level=0, numeric_only=True)
249249
tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp"))
250250

251+
def test_count_index_with_nan(self):
252+
# https://github.com/pandas-dev/pandas/issues/21824
253+
df = DataFrame(
254+
{
255+
"Person": ["John", "Myla", None, "John", "Myla"],
256+
"Age": [24.0, 5, 21.0, 33, 26],
257+
"Single": [False, True, True, True, False],
258+
}
259+
)
260+
261+
# count on row labels
262+
res = df.set_index(["Person", "Single"]).count(level="Person")
263+
expected = DataFrame(
264+
index=Index(["John", "Myla"], name="Person"),
265+
columns=Index(["Age"]),
266+
data=[2, 2],
267+
)
268+
tm.assert_frame_equal(res, expected)
269+
270+
# count on column labels
271+
res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1)
272+
expected = DataFrame(
273+
columns=Index(["John", "Myla"], name="Person"),
274+
index=Index(["Age"]),
275+
data=[[2, 2]],
276+
)
277+
tm.assert_frame_equal(res, expected)
278+
251279
def test_count_level_series(self):
252280
index = MultiIndex(
253281
levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],

0 commit comments

Comments
 (0)