diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index d3239c4562765..6933cbedb5d67 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -1572,3 +1572,4 @@ Bug Fixes
 - Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`)
 - Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`)
 - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`)
+- Bug in ``df.groupby`` where ``.median()`` returns arbitrary values if the grouped dataframe contains empty bins (:issue:`13629`)
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index de5c5fc661d4d..8710ef34504d1 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -992,7 +992,7 @@ def is_lexsorted(list list_of_arrays):
 def groupby_indices(dict ids, ndarray[int64_t] labels,
                     ndarray[int64_t] counts):
     """
-    turn group_labels output into a combined indexer maping the labels to
+    turn group_labels output into a combined indexer mapping the labels to
    indexers
 
     Parameters
@@ -1313,6 +1313,9 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
     cdef float64_t result
     cdef float64_t* tmp
 
+    if n == 0:
+        return NaN
+
     # count NAs
     for i in range(n):
         if a[i] != a[i]:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 66e30229cd52b..7ed84b970d9c3 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -4424,12 +4424,13 @@ def _reorder_by_uniques(uniques, labels):
 def _groupby_indices(values):
 
     if is_categorical_dtype(values):
-
         # we have a categorical, so we can do quite a bit
         # bit better than factorizing again
         reverse = dict(enumerate(values.categories))
         codes = values.codes.astype('int64')
-        _, counts = _hash.value_count_int64(codes, False)
+
+        mask = 0 <= codes
+        counts = np.bincount(codes[mask], minlength=values.categories.size)
     else:
         reverse, codes, counts = _algos.group_labels(
             _values_from_object(_ensure_object(values)))
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 9d8873d843642..492326d0898f0 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -799,6 +799,17 @@ def test_get_group(self):
         self.assertRaises(ValueError,
                           lambda: g.get_group(('foo', 'bar', 'baz')))
 
+    def test_get_group_empty_bins(self):
+        d = pd.DataFrame([3, 1, 7, 6])
+        bins = [0, 5, 10, 15]
+        g = d.groupby(pd.cut(d[0], bins))
+
+        result = g.get_group('(0, 5]')
+        expected = DataFrame([3, 1], index=[0, 1])
+        assert_frame_equal(result, expected)
+
+        self.assertRaises(KeyError, lambda: g.get_group('(10, 15]'))
+
     def test_get_group_grouped_by_tuple(self):
         # GH 8121
         df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
@@ -4415,6 +4426,16 @@ def test_cython_median(self):
         xp = df.groupby(labels).median()
         assert_frame_equal(rs, xp)
 
+    def test_median_empty_bins(self):
+        df = pd.DataFrame(np.random.randint(0, 44, 500))
+
+        grps = range(0, 55, 5)
+        bins = pd.cut(df[0], grps)
+
+        result = df.groupby(bins).median()
+        expected = df.groupby(bins).agg(lambda x: x.median())
+        assert_frame_equal(result, expected)
+
     def test_groupby_categorical_no_compress(self):
         data = Series(np.random.randn(9))
 
@@ -6123,6 +6144,27 @@ def test__cython_agg_general(self):
             exc.args += ('operation: %s' % op, )
             raise
 
+    def test_cython_agg_empty_buckets(self):
+        ops = [('mean', np.mean),
+               ('median', np.median),
+               ('var', lambda x: np.var(x, ddof=1)),
+               ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
+               ('prod', np.prod),
+               ('min', np.min),
+               ('max', np.max), ]
+
+        df = pd.DataFrame([11, 12, 13])
+        grps = range(0, 55, 5)
+
+        for op, targop in ops:
+            result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
+            expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
+            try:
+                tm.assert_frame_equal(result, expected)
+            except BaseException as exc:
+                exc.args += ('operation: %s' % op,)
+                raise
+
     def test_cython_group_transform_algos(self):
         # GH 4095
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
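
For context, here is a small end-to-end sketch (not part of the patch) of the scenario the new tests exercise. It assumes a pandas build that includes this change; the frame values, bin edges, and variable names are illustrative only.

```python
import numpy as np
import pandas as pd

# Values deliberately stop well short of 50, so cutting into 5-wide
# buckets up to 50 leaves the upper bins empty.
df = pd.DataFrame([3, 1, 7, 6, 11, 12, 13])
bins = pd.cut(df[0], range(0, 55, 5))

# Fast (cython) path: before this patch, empty bins could come back with
# arbitrary values; with the n == 0 guard in _median_linear and the
# bincount-based counts for categorical codes, they come back as NaN.
result = df.groupby(bins).median()

# Slow per-group path, used as the reference in the new tests.
expected = df.groupby(bins).agg(lambda x: x.median())

print(result.equals(expected))  # True once the two paths agree
```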