Skip to content

Commit 42affd5

Browse files
committed
BUG: GH13629 Binned groupby median function calculates median on empty bins and outputs random numbers
1 parent 9554195 commit 42affd5

File tree

4 files changed

+50
-3
lines changed

4 files changed

+50
-3
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1572,3 +1572,4 @@ Bug Fixes
15721572
- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`)
15731573
- Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`)
15741574
- ``PeriodIndex`` can now accept ``list`` and ``array`` which contain ``pd.NaT`` (:issue:`13430`)
1575+
- Bug in ``df.groupby`` where ``.median()`` returns arbitrary values if the grouped dataframe contains empty bins (:issue:`13629`)

pandas/algos.pyx

+4-1
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,7 @@ def is_lexsorted(list list_of_arrays):
992992
def groupby_indices(dict ids, ndarray[int64_t] labels,
993993
ndarray[int64_t] counts):
994994
"""
995-
turn group_labels output into a combined indexer maping the labels to
995+
turn group_labels output into a combined indexer mapping the labels to
996996
indexers
997997
998998
Parameters
@@ -1313,6 +1313,9 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
13131313
cdef float64_t result
13141314
cdef float64_t* tmp
13151315

1316+
if n == 0:
1317+
return NaN
1318+
13161319
# count NAs
13171320
for i in range(n):
13181321
if a[i] != a[i]:

pandas/core/groupby.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4424,12 +4424,13 @@ def _reorder_by_uniques(uniques, labels):
44244424
def _groupby_indices(values):
44254425

44264426
if is_categorical_dtype(values):
4427-
44284427
# we have a categorical, so we can do quite a bit
44294428
# better than factorizing again
44304429
reverse = dict(enumerate(values.categories))
44314430
codes = values.codes.astype('int64')
4432-
_, counts = _hash.value_count_int64(codes, False)
4431+
4432+
mask = 0 <= codes
4433+
counts = np.bincount(codes[mask], minlength=values.categories.size)
44334434
else:
44344435
reverse, codes, counts = _algos.group_labels(
44354436
_values_from_object(_ensure_object(values)))

pandas/tests/test_groupby.py

+42
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,17 @@ def test_get_group(self):
799799
self.assertRaises(ValueError,
800800
lambda: g.get_group(('foo', 'bar', 'baz')))
801801

802+
def test_get_group_empty_bins(self):
803+
d = pd.DataFrame([3, 1, 7, 6])
804+
bins = [0, 5, 10, 15]
805+
g = d.groupby(pd.cut(d[0], bins))
806+
807+
result = g.get_group('(0, 5]')
808+
expected = DataFrame([3, 1], index=[0, 1])
809+
assert_frame_equal(result, expected)
810+
811+
self.assertRaises(KeyError, lambda: g.get_group('(10, 15]'))
812+
802813
def test_get_group_grouped_by_tuple(self):
803814
# GH 8121
804815
df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
@@ -4415,6 +4426,16 @@ def test_cython_median(self):
44154426
xp = df.groupby(labels).median()
44164427
assert_frame_equal(rs, xp)
44174428

4429+
def test_median_empty_bins(self):
4430+
df = pd.DataFrame(np.random.randint(0, 44, 500))
4431+
4432+
grps = range(0, 55, 5)
4433+
bins = pd.cut(df[0], grps)
4434+
4435+
result = df.groupby(bins).median()
4436+
expected = df.groupby(bins).agg(lambda x: x.median())
4437+
assert_frame_equal(result, expected)
4438+
44184439
def test_groupby_categorical_no_compress(self):
44194440
data = Series(np.random.randn(9))
44204441

@@ -6123,6 +6144,27 @@ def test__cython_agg_general(self):
61236144
exc.args += ('operation: %s' % op, )
61246145
raise
61256146

6147+
def test_cython_agg_empty_buckets(self):
6148+
ops = [('mean', np.mean),
6149+
('median', np.median),
6150+
('var', lambda x: np.var(x, ddof=1)),
6151+
('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
6152+
('prod', np.prod),
6153+
('min', np.min),
6154+
('max', np.max), ]
6155+
6156+
df = pd.DataFrame([11, 12, 13])
6157+
grps = range(0, 55, 5)
6158+
6159+
for op, targop in ops:
6160+
result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
6161+
expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
6162+
try:
6163+
tm.assert_frame_equal(result, expected)
6164+
except BaseException as exc:
6165+
exc.args += ('operation: %s' % op,)
6166+
raise
6167+
61266168
def test_cython_group_transform_algos(self):
61276169
# GH 4095
61286170
dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,

0 commit comments

Comments
 (0)