
Commit cf6cbb2

ENH: use size instead of cythonized count for fallback cases
1 parent 668f0f7 commit cf6cbb2

5 files changed: +52 -229 lines


pandas/core/groupby.py

+1 -2
@@ -722,8 +722,7 @@ def size(self):
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
 
-    _count = _groupby_function('_count', 'count',
-                               lambda x, axis=0: notnull(x).sum(axis=axis),
+    _count = _groupby_function('_count', 'count', lambda x, axis=0: x.size(),
                                numeric_only=False)
 
     def count(self, axis=0):
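To illustrate the intent of this change (outside the commit): for dtypes that end up on the non-cython fallback path, such as the low-precision integers exercised by the new test further down, values cannot hold NaN, so a group's size equals its non-null count. A minimal sketch with invented data, using the Series.size attribute rather than the x.size() call shown in the diff:

import numpy as np
import pandas as pd

# Loose sketch of the fallback idea, not the actual _groupby_function wiring.
# Integer dtypes cannot hold NaN, so each group's size is its non-null count.
s = pd.Series(np.array([10, 20, 30, 40], dtype=np.int8))
key = ['a', 'b', 'a', 'b']

fallback = lambda x, axis=0: x.size   # mirrors the intent of the new lambda
print(s.groupby(key).agg(fallback))
# a    2
# b    2
# dtype: int64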

pandas/src/generate_code.py

+17 -10
@@ -2219,18 +2219,21 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
 #-------------------------------------------------------------------------
 # Generators
 
-def generate_put_template(template, use_ints = True, use_floats = True,
-                          use_objects=False):
+def generate_put_template(template, use_ints=True, use_floats=True,
+                          use_objects=False, use_datelikes=False):
     floats_list = [
         ('float64', 'float64_t', 'float64_t', 'np.float64'),
         ('float32', 'float32_t', 'float32_t', 'np.float32'),
-        ]
+    ]
     ints_list = [
         ('int8', 'int8_t', 'float32_t', 'np.float32'),
         ('int16', 'int16_t', 'float32_t', 'np.float32'),
         ('int32', 'int32_t', 'float64_t', 'np.float64'),
         ('int64', 'int64_t', 'float64_t', 'np.float64'),
-        ]
+    ]
+    date_like_list = [
+        ('int64', 'int64_t', 'float64_t', 'np.float64'),
+    ]
     object_list = [('object', 'object', 'float64_t', 'np.float64')]
     function_list = []
     if use_floats:
@@ -2239,14 +2242,16 @@ def generate_put_template(template, use_ints = True, use_floats = True,
         function_list.extend(ints_list)
     if use_objects:
         function_list.extend(object_list)
+    if use_datelikes:
+        function_list.extend(date_like_list)
 
     output = StringIO()
     for name, c_type, dest_type, dest_dtype in function_list:
-        func = template % {'name' : name,
-                           'c_type' : c_type,
-                           'dest_type' : dest_type.replace('_t', ''),
-                           'dest_type2' : dest_type,
-                           'dest_dtype' : dest_dtype}
+        func = template % {'name': name,
+                           'c_type': c_type,
+                           'dest_type': dest_type.replace('_t', ''),
+                           'dest_type2': dest_type,
+                           'dest_dtype': dest_dtype}
         output.write(func)
     return output.getvalue()
 
@@ -2372,7 +2377,9 @@ def generate_take_cython_file(path='generated.pyx'):
         print(generate_put_template(template, use_ints=False), file=f)
 
         for template in groupby_count:
-            print(generate_put_template(template, use_objects=True), file=f)
+            print(generate_put_template(template, use_ints=False,
+                                        use_datelikes=True, use_objects=True),
+                  file=f)
 
         # for template in templates_1d_datetime:
         #     print >> f, generate_from_template_datetime(template)
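For context, generate_put_template emits one Cython function per (name, c_type, dest_type, dest_dtype) tuple by %-substituting into a template string. A toy, self-contained illustration of that pattern; the template string below is invented for the example and is not the real groupby_count template:

from io import StringIO

# Invented stand-in template; the real groupby_count templates live in
# pandas/src/generate_code.py.
template = '''
def group_count_%(name)s(out, counts, values, labels):
    # values: ndarray[%(c_type)s, ndim=2]; out: ndarray[%(dest_type2)s, ndim=2]
    pass
'''

# One tuple per generated function, mirroring the lists in the diff above.
function_list = [
    ('int64', 'int64_t', 'float64_t', 'np.float64'),   # the new "date-like" entry
    ('object', 'object', 'float64_t', 'np.float64'),   # the object entry
]

output = StringIO()
for name, c_type, dest_type, dest_dtype in function_list:
    output.write(template % {'name': name,
                             'c_type': c_type,
                             'dest_type': dest_type.replace('_t', ''),
                             'dest_type2': dest_type,
                             'dest_dtype': dest_dtype})
print(output.getvalue())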

pandas/src/generated.pyx

+6 -216
@@ -6697,89 +6697,17 @@ def group_count_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_count_int8(ndarray[float32_t, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[int8_t, ndim=2] values,
-                     ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int8_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(len(counts)):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_int16(ndarray[float32_t, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[int16_t, ndim=2] values,
-                      ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int16_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(len(counts)):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_int32(ndarray[float64_t, ndim=2] out,
+def group_count_object(ndarray[float64_t, ndim=2] out,
                       ndarray[int64_t] counts,
-                      ndarray[int32_t, ndim=2] values,
+                      ndarray[object, ndim=2] values,
                       ndarray[int64_t] labels):
     '''
     Only aggregates on axis=0
     '''
     cdef:
         Py_ssize_t i, j, lab
         Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int32_t val
+        object val
         ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
                                                  dtype=np.int64)
@@ -6839,42 +6767,6 @@ def group_count_int64(ndarray[float64_t, ndim=2] out,
             out[i, j] = nobs[i, j]
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_object(ndarray[float64_t, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[object, ndim=2] values,
-                       ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        object val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(len(counts)):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -6946,85 +6838,17 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_count_bin_int8(ndarray[float32_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[int8_t, ndim=2] values,
-                         ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, ngroups
-        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        int8_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    ngroups = len(bins) + (bins[len(bins) - 1] != N)
-
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[b, j] += val == val and val != iNaT
-
-    for i in range(ngroups):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_bin_int16(ndarray[float32_t, ndim=2] out,
-                          ndarray[int64_t] counts,
-                          ndarray[int16_t, ndim=2] values,
-                          ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, ngroups
-        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        int16_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    ngroups = len(bins) + (bins[len(bins) - 1] != N)
-
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[b, j] += val == val and val != iNaT
-
-    for i in range(ngroups):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_bin_int32(ndarray[float64_t, ndim=2] out,
+def group_count_bin_object(ndarray[float64_t, ndim=2] out,
                           ndarray[int64_t] counts,
-                          ndarray[int32_t, ndim=2] values,
+                          ndarray[object, ndim=2] values,
                           ndarray[int64_t] bins):
     '''
     Only aggregates on axis=0
    '''
     cdef:
         Py_ssize_t i, j, ngroups
         Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        int32_t val
+        object val
         ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
                                                  dtype=np.int64)
@@ -7080,40 +6904,6 @@ def group_count_bin_int64(ndarray[float64_t, ndim=2] out,
             out[i, j] = nobs[i, j]
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_bin_object(ndarray[float64_t, ndim=2] out,
-                           ndarray[int64_t] counts,
-                           ndarray[object, ndim=2] values,
-                           ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, ngroups
-        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        object val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    ngroups = len(bins) + (bins[len(bins) - 1] != N)
-
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[b, j] += val == val and val != iNaT
-
-    for i in range(ngroups):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
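For readers who prefer not to trace the Cython, the kernels that remain (float32/float64, int64, object, plus the newly generated date-like variant) all share the loop pattern visible in the diff above. Below is a plain NumPy/Python rendering of that per-group non-null count; the function name and the small check at the end are invented for illustration:

import numpy as np

iNaT = np.iinfo(np.int64).min   # pandas' sentinel for missing datetimes (NaT)

def group_count_py(values, labels, ngroups):
    """Python rendering of the loop in group_count_object above:
    per (group, column) count of entries that are neither NaN nor iNaT."""
    N, K = values.shape
    counts = np.zeros(ngroups, dtype=np.int64)
    nobs = np.zeros((ngroups, K), dtype=np.int64)
    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")
    for i in range(N):
        lab = labels[i]
        if lab < 0:              # label -1 means "not in any group"
            continue
        counts[lab] += 1
        for j in range(K):
            val = values[i, j]
            # val == val is False only for NaN; iNaT marks missing datetimes
            nobs[lab, j] += (val == val) and (val != iNaT)
    return nobs, counts

# Tiny check with invented data: one NaN in column 1 of group 0.
vals = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]], dtype=object)
labels = np.array([0, 0, 1], dtype=np.int64)
print(group_count_py(vals, labels, ngroups=2))
# nobs -> [[2, 1], [1, 1]], counts -> [2, 1]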

pandas/tests/test_groupby.py

+13
@@ -4202,6 +4202,19 @@ def test_datetime_count(self):
                           name='dates')
         tm.assert_series_equal(result, expected)
 
+    def test_lower_int_prec_count(self):
+        df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
+                        'b': np.array([1, 2, 3, 6], np.uint32),
+                        'c': np.array([4, 5, 6, 8], np.int16),
+                        'grp': list('ab' * 2)})
+        result = df.groupby('grp').count()
+        expected = DataFrame({'a': [2, 2],
+                              'b': [2, 2],
+                              'c': [2, 2]}, index=pd.Index(list('ab'),
+                                                           name='grp'))
+        tm.assert_frame_equal(result, expected)
+
+
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
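Run standalone, the new test's scenario looks like this (same data as in the test; these low-precision integer columns no longer have dedicated group_count kernels after this commit):

import numpy as np
import pandas as pd

# Standalone version of test_lower_int_prec_count above.
df = pd.DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
                   'b': np.array([1, 2, 3, 6], np.uint32),
                   'c': np.array([4, 5, 6, 8], np.int16),
                   'grp': list('ab' * 2)})
result = df.groupby('grp').count()
print(result)   # each of the two groups has 2 non-null values per column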

vb_suite/groupby.py

+15 -1
@@ -133,21 +133,35 @@ def f():
 value2 = np.random.randn(n)
 value2[np.random.rand(n) > 0.5] = np.nan
 
-obj = pd.util.testing.choice(['a', 'b'], size=n).astype(object)
+obj = tm.choice(list('ab'), size=n).astype(object)
 obj[np.random.randn(n) > 0.5] = np.nan
 
 df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                 'key2': np.random.randint(0, 100, size=n),
                 'dates': dates,
                 'value2' : value2,
                 'value3' : np.random.randn(n),
+                'ints': np.random.randint(0, 1000, size=n),
                 'obj': obj,
                 'offsets': offsets})
 """
 
 groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()",
                                 setup, name='groupby_multi_count',
                                 start_date=datetime(2014, 5, 5))
+
+setup = common_setup + """
+n = 10000
+
+df = DataFrame({'key1': randint(0, 500, size=n),
+                'key2': randint(0, 100, size=n),
+                'ints': randint(0, 1000, size=n),
+                'ints2': randint(0, 1000, size=n)})
+"""
+
+groupby_int_count = Benchmark("df.groupby(['key1', 'key2']).count()",
+                              setup, name='groupby_int_count',
+                              start_date=datetime(2014, 5, 6))
 #----------------------------------------------------------------------
 # Series.value_counts
 