Skip to content

Commit e96e386

Browse files
committed
API: Sum / Prod of empty / all-NA (groupby)
1 parent a87f1f9 commit e96e386

File tree

5 files changed

+61
-9
lines changed

5 files changed

+61
-9
lines changed

pandas/_libs/groupby_helper.pxi.in

+2 -2
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
8989
for i in range(ncounts):
9090
for j in range(K):
9191
if nobs[i, j] == 0:
92-
out[i, j] = NAN
92+
out[i, j] = 0
9393
else:
9494
out[i, j] = sumx[i, j]
9595

@@ -148,7 +148,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
148148
for i in range(ncounts):
149149
for j in range(K):
150150
if nobs[i, j] == 0:
151-
out[i, j] = NAN
151+
out[i, j] = 1
152152
else:
153153
out[i, j] = prodx[i, j]
154154

pandas/tests/groupby/test_aggregate.py

+31 -2
Original file line number | Diff line number | Diff line change
@@ -813,8 +813,6 @@ def test_cython_agg_empty_buckets(self):
813813
ops = [('mean', np.mean),
814814
('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
815815
('var', lambda x: np.var(x, ddof=1)),
816-
('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
817-
('prod', np.prod),
818816
('min', np.min),
819817
('max', np.max), ]
820818

@@ -830,6 +828,23 @@ def test_cython_agg_empty_buckets(self):
830828
exc.args += ('operation: %s' % op,)
831829
raise
832830

831+
def test_cython_agg_empty_buckets_nanops(self):
832+
# Bug in python agg func not being evaluated on empty buckets
833+
df = pd.DataFrame([11, 12, 13], columns=['a'])
834+
grps = range(0, 25, 5)
835+
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
836+
intervals = pd.interval_range(0, 20, freq=5)
837+
expected = pd.DataFrame(
838+
{"a": [0, 0, 36, 0]},
839+
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
840+
tm.assert_frame_equal(result, expected)
841+
842+
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
843+
expected = pd.DataFrame(
844+
{"a": [1, 1, 1716, 1]},
845+
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
846+
tm.assert_frame_equal(result, expected)
847+
833848
def test_agg_over_numpy_arrays(self):
834849
# GH 3788
835850
df = pd.DataFrame([[1, np.array([10, 20, 30])],
@@ -925,3 +940,17 @@ def test_agg_structs_series(self, structure, expected):
925940
result = df.groupby('A')['C'].aggregate(structure)
926941
expected.index.name = 'A'
927942
assert_series_equal(result, expected)
943+
944+
@pytest.mark.xfail(reason="agg functions not called on empty groups")
945+
def test_agg_category_nansum(self):
946+
categories = ['a', 'b', 'c']
947+
df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
948+
categories=categories),
949+
'B': [1, 2, 3]})
950+
result = df.groupby("A").B.agg(np.nansum)
951+
expected = pd.Series([3, 3, 0],
952+
index=pd.CategoricalIndex(['a', 'b', 'c'],
953+
categories=categories,
954+
name='A'),
955+
name='B')
956+
tm.assert_series_equal(result, expected)

pandas/tests/groupby/test_categorical.py

+23 -1
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ def test_groupby(self):
3737
# single grouper
3838
gb = df.groupby("A")
3939
exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
40-
expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)})
40+
expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
4141
result = gb.sum()
4242
tm.assert_frame_equal(result, expected)
4343

@@ -662,3 +662,25 @@ def test_groupby_categorical_two_columns(self):
662662
"C3": [nan, nan, nan, nan, 10, 100,
663663
nan, nan, nan, nan, 200, 34]}, index=idx)
664664
tm.assert_frame_equal(res, exp)
665+
666+
def test_sum_zero(self):
667+
df = pd.DataFrame({"A": pd.Categorical(['a', 'b', 'a'],
668+
categories=['a', 'b', 'c']),
669+
'B': [1, 2, 1]})
670+
result = df.groupby("A").B.sum()
671+
expected = pd.Series([2, 2, 0],
672+
index=pd.CategoricalIndex(['a', 'b', 'c'],
673+
name='A'),
674+
name='B')
675+
tm.assert_series_equal(result, expected)
676+
677+
def test_prod_one(self):
678+
df = pd.DataFrame({"A": pd.Categorical(['a', 'b', 'a'],
679+
categories=['a', 'b', 'c']),
680+
'B': [1, 2, 1]})
681+
result = df.groupby("A").B.prod()
682+
expected = pd.Series([1, 2, 1],
683+
index=pd.CategoricalIndex(['a', 'b', 'c'],
684+
name='A'),
685+
name='B')
686+
tm.assert_series_equal(result, expected)

pandas/tests/groupby/test_groupby.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2704,7 +2704,7 @@ def h(df, arg3):
27042704

27052705
# Assert the results here
27062706
index = pd.Index(['A', 'B', 'C'], name='group')
2707-
expected = pd.Series([-79.5160891089, -78.4839108911, None],
2707+
expected = pd.Series([-79.5160891089, -78.4839108911, -80],
27082708
index=index)
27092709

27102710
assert_series_equal(expected, result)

pandas/tests/groupby/test_timegrouper.py

+4 -3
Original file line number | Diff line number | Diff line change
@@ -41,12 +41,12 @@ def test_groupby_with_timegrouper(self):
4141
df = df.set_index(['Date'])
4242

4343
expected = DataFrame(
44-
{'Quantity': np.nan},
44+
{'Quantity': 0},
4545
index=date_range('20130901 13:00:00',
4646
'20131205 13:00:00', freq='5D',
4747
name='Date', closed='left'))
4848
expected.iloc[[0, 6, 18], 0] = np.array(
49-
[24., 6., 9.], dtype='float64')
49+
[24, 6, 9], dtype='int64')
5050

5151
result1 = df.resample('5D') .sum()
5252
assert_frame_equal(result1, expected)
@@ -261,9 +261,10 @@ def test_timegrouper_with_reg_groups(self):
261261
for freq in ['D', 'M', 'A', 'Q-APR']:
262262
expected = df.groupby('user_id')[
263263
'whole_cost'].resample(
264-
freq).sum().dropna().reorder_levels(
264+
freq).sum().reorder_levels(
265265
['date', 'user_id']).sort_index().astype('int64')
266266
expected.name = 'whole_cost'
267+
expected = expected[expected > 0]
267268

268269
result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
269270
'user_id'])['whole_cost'].sum()

0 commit comments

Comments
 (0)