Skip to content

BUG: GH13629 Binned groupby median function calculates median on empt… #14225

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 18, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1572,3 +1572,4 @@ Bug Fixes
- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`)
- Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`)
- ``PeriodIndex`` can now accept ``list`` and ``array`` which contain ``pd.NaT`` (:issue:`13430`)
- Bug in ``df.groupby`` where ``.median()`` returns arbitrary values if the grouped dataframe contains empty bins (:issue:`13629`)
5 changes: 4 additions & 1 deletion pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -992,7 +992,7 @@ def is_lexsorted(list list_of_arrays):
def groupby_indices(dict ids, ndarray[int64_t] labels,
ndarray[int64_t] counts):
"""
turn group_labels output into a combined indexer maping the labels to
turn group_labels output into a combined indexer mapping the labels to
indexers

Parameters
Expand Down Expand Up @@ -1313,6 +1313,9 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
cdef float64_t result
cdef float64_t* tmp

if n == 0:
return NaN

# count NAs
for i in range(n):
if a[i] != a[i]:
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4424,12 +4424,13 @@ def _reorder_by_uniques(uniques, labels):
def _groupby_indices(values):

if is_categorical_dtype(values):

# we have a categorical, so we can do quite a
# bit better than factorizing again
reverse = dict(enumerate(values.categories))
codes = values.codes.astype('int64')
_, counts = _hash.value_count_int64(codes, False)

mask = 0 <= codes
counts = np.bincount(codes[mask], minlength=values.categories.size)
else:
reverse, codes, counts = _algos.group_labels(
_values_from_object(_ensure_object(values)))
Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,17 @@ def test_get_group(self):
self.assertRaises(ValueError,
lambda: g.get_group(('foo', 'bar', 'baz')))

def test_get_group_empty_bins(self):
    # GH 13629: get_group on a binned groupby in which some bins are empty
    frame = pd.DataFrame([3, 1, 7, 6])
    edges = [0, 5, 10, 15]
    binned = pd.cut(frame[0], edges)
    grouped = frame.groupby(binned)

    # a populated bin returns exactly the rows that fall inside it
    expected = DataFrame([3, 1], index=[0, 1])
    result = grouped.get_group('(0, 5]')
    assert_frame_equal(result, expected)

    # none of the values lies in (10, 15], so that bin cannot be fetched
    self.assertRaises(KeyError, grouped.get_group, '(10, 15]')

def test_get_group_grouped_by_tuple(self):
# GH 8121
df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
Expand Down Expand Up @@ -4415,6 +4426,16 @@ def test_cython_median(self):
xp = df.groupby(labels).median()
assert_frame_equal(rs, xp)

def test_median_empty_bins(self):
df = pd.DataFrame(np.random.randint(0, 44, 500))

grps = range(0, 55, 5)
bins = pd.cut(df[0], grps)

result = df.groupby(bins).median()
expected = df.groupby(bins).agg(lambda x: x.median())
assert_frame_equal(result, expected)

def test_groupby_categorical_no_compress(self):
data = Series(np.random.randn(9))

Expand Down Expand Up @@ -6123,6 +6144,27 @@ def test__cython_agg_general(self):
exc.args += ('operation: %s' % op, )
raise

def test_cython_agg_empty_buckets(self):
    # pairs of (cython aggregation name, reference reduction); the
    # reference is applied group by group through ``agg``
    ops = [
        ('mean', np.mean),
        ('median', np.median),
        ('var', lambda x: np.var(x, ddof=1)),
        ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
        ('prod', np.prod),
        ('min', np.min),
        ('max', np.max),
    ]

    frame = pd.DataFrame([11, 12, 13])
    edges = range(0, 55, 5)

    def bucketed():
        # every value falls in (10, 15]; the other buckets stay empty
        return frame.groupby(pd.cut(frame[0], edges))

    for name, reference in ops:
        result = bucketed()._cython_agg_general(name)
        expected = bucketed().agg(lambda x: reference(x))
        try:
            tm.assert_frame_equal(result, expected)
        except BaseException as exc:
            # tag the failure with the operation that produced it
            exc.args += ('operation: %s' % name,)
            raise

def test_cython_group_transform_algos(self):
# GH 4095
dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
Expand Down