diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 570a5e3ce97ab..35501be97dc10 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -305,7 +305,7 @@ Numeric - Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) - Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) -- +- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes a segmentation fault (:issue:`21824`) Conversion ^^^^^^^^^^ @@ -403,6 +403,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`GroupBy.count` causes a segmentation fault when the grouped-by column contains NaNs (:issue:`32841`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6c6f6a8600ba2..e011024dec966 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -793,14 +793,16 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, with nogil: for i in range(n): for j in range(k): - counts[labels[i], j] += mask[i, j] + if mask[i, j]: + counts[labels[i], j] += 1 else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: for i in range(n): for j in range(k): - counts[i, labels[j]] += mask[i, j] + if mask[i, j]: + counts[i, labels[j]] += 1 return counts diff --git a/pandas/core/frame.py b/pandas/core/frame.py 
index 1e9f8995b6bed..6ddb00db350af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7893,18 +7893,21 @@ def _count_level(self, level, axis=0, numeric_only=False): f"Can only count levels on hierarchical {self._get_axis_name(axis)}." ) + # Mask NaNs: Mask rows or columns where the index level is NaN, and all + # values in the DataFrame that are NaN if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might # upcast everything to object - mask = notna(frame).values + values_mask = notna(frame).values else: # But use the speedup when we have homogeneous dtypes - mask = notna(frame.values) + values_mask = notna(frame.values) + index_mask = notna(count_axis.get_level_values(level=level)) if axis == 1: - # We're transposing the mask rather than frame to avoid potential - # upcasts to object, which induces a ~20x slowdown - mask = mask.T + mask = index_mask & values_mask + else: + mask = index_mask.reshape(-1, 1) & values_mask if isinstance(level, str): level = count_axis._get_level_number(level) @@ -7912,15 +7915,14 @@ def _count_level(self, level, axis=0, numeric_only=False): level_name = count_axis._names[level] level_index = count_axis.levels[level]._shallow_copy(name=level_name) level_codes = ensure_int64(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) - - result = DataFrame(counts, index=level_index, columns=agg_axis) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) if axis == 1: - # Undo our earlier transpose - return result.T + result = DataFrame(counts, index=agg_axis, columns=level_index) else: - return result + result = DataFrame(counts, index=level_index, columns=agg_axis) + + return result def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index b4239d7d34a90..56a18757da6e7 100644 --- 
a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp import pandas._testing as tm @@ -220,3 +220,12 @@ def test_count_with_only_nans_in_first_group(self): mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) expected = Series([], index=mi, dtype=np.int64, name="C") tm.assert_series_equal(result, expected, check_index_type=False) + + def test_count_groupby_column_with_nan_in_groupby_column(self): + # https://github.com/pandas-dev/pandas/issues/32841 + df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]}) + res = df.groupby(["B"]).count() + expected = DataFrame( + index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} + ) + tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 84279d874bae1..f025abd5628cf 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -248,6 +248,34 @@ def _check_counts(frame, axis=0): result = self.frame.count(level=0, numeric_only=True) tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) + def test_count_index_with_nan(self): + # https://github.com/pandas-dev/pandas/issues/21824 + df = DataFrame( + { + "Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24.0, 5, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + + # count on row labels + res = df.set_index(["Person", "Single"]).count(level="Person") + expected = DataFrame( + index=Index(["John", "Myla"], name="Person"), + columns=Index(["Age"]), + data=[2, 2], + ) + tm.assert_frame_equal(res, expected) + + # count on column labels + res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1) + expected = DataFrame( + columns=Index(["John", 
"Myla"], name="Person"), + index=Index(["Age"]), + data=[[2, 2]], + ) + tm.assert_frame_equal(res, expected) + def test_count_level_series(self): index = MultiIndex( levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],