Skip to content

Commit 432cd68

Browse files
committed
Fix masking logic to avoid SegFaults with DataFrame.count().
1 parent 3bea5af commit 432cd68

File tree

2 files changed

+34
-29
lines changed

2 files changed

+34
-29
lines changed

pandas/core/frame.py

+12-24
Original file line number | Diff line number | Diff line change
@@ -7820,42 +7820,30 @@ def _count_level(self, level, axis=0, numeric_only=False):
78207820
f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
78217821
)
78227822

7823-
# Mask NaNs: Mask rows where the index level is NaN and all values in
7824-
# the DataFrame that are NaN
7825-
if frame._is_mixed_type:
7826-
# Since we have mixed types, calling notna(frame.values) might
7827-
# upcast everything to object
7828-
mask = (
7829-
notna(frame.index.get_level_values(level=level)).reshape(-1, 1) &
7830-
notna(frame).values
7831-
)
7832-
else:
7833-
# But use the speedup when we have homogeneous dtypes
7834-
mask = (
7835-
notna(frame.index.get_level_values(level=level)).reshape(-1, 1) &
7836-
notna(frame.values)
7837-
)
7823+
# Mask NaNs: Mask rows or columns where the index level is NaN, and all
7824+
# values in the DataFrame that are NaN
7825+
values_mask = notna(frame.values)
78387826

7827+
index_mask = notna(count_axis.get_level_values(level=level))
78397828
if axis == 1:
7840-
# We're transposing the mask rather than frame to avoid potential
7841-
# upcasts to object, which induces a ~20x slowdown
7842-
mask = mask.T
7829+
mask = index_mask & values_mask
7830+
else:
7831+
mask = index_mask.reshape(-1, 1) & values_mask
78437832

78447833
if isinstance(level, str):
78457834
level = count_axis._get_level_number(level)
78467835

78477836
level_name = count_axis._names[level]
78487837
level_index = count_axis.levels[level]._shallow_copy(name=level_name)
78497838
level_codes = ensure_int64(count_axis.codes[level])
7850-
counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)
7851-
7852-
result = DataFrame(counts, index=level_index, columns=agg_axis)
7839+
counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis)
78537840

78547841
if axis == 1:
7855-
# Undo our earlier transpose
7856-
return result.T
7842+
result = DataFrame(counts, index=agg_axis, columns=level_index)
78577843
else:
7858-
return result
7844+
result = DataFrame(counts, index=level_index, columns=agg_axis)
7845+
7846+
return result
78597847

78607848
def _reduce(
78617849
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds

pandas/tests/test_multilevel.py

+22-5
Original file line number | Diff line number | Diff line change
@@ -250,14 +250,31 @@ def _check_counts(frame, axis=0):
250250

251251
def test_count_index_with_nan(self):
252252
# https://github.com/pandas-dev/pandas/issues/21824
253-
df = DataFrame({"Person": ["John", "Myla", None, "John", "Myla"],
254-
"Age": [24., 5, 21., 33, 26],
255-
"Single": [False, True, True, True, False]})
253+
df = DataFrame(
254+
{
255+
"Person": ["John", "Myla", None, "John", "Myla"],
256+
"Age": [24.0, 5, 21.0, 33, 26],
257+
"Single": [False, True, True, True, False],
258+
}
259+
)
256260

261+
# count on row labels
257262
res = df.set_index(["Person", "Single"]).count(level="Person")
258-
expected = DataFrame(index=Index(["John", "Myla"], name="Person"),
259-
data={"Age": [2, 2]})
263+
expected = DataFrame(
264+
index=Index(["John", "Myla"], name="Person"),
265+
columns=Index(["Age"]),
266+
data=np.array([2, 2]),
267+
)
268+
tm.assert_frame_equal(res, expected)
260269

270+
# count on column labels
271+
res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1)
272+
expected = DataFrame(
273+
columns=Index(["John", "Myla"], name="Person"),
274+
index=Index(["Age"]),
275+
data=np.array([[2, 2]]),
276+
)
277+
tm.assert_frame_equal(res, expected)
261278

262279
def test_count_level_series(self):
263280
index = MultiIndex(

0 commit comments

Comments
 (0)