Skip to content

Commit d968aab

Browse files
committed
CLN: removes cython implementation of groupby count
1 parent 76520d9 commit d968aab

File tree

7 files changed, with 73 additions (+73) and 260 deletions (-260).

7 files changed

+73
-260
lines changed

pandas/core/frame.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -4562,7 +4562,7 @@ def _count_level(self, level, axis=0, numeric_only=False):
45624562

45634563
level_index = count_axis.levels[level]
45644564
labels = com._ensure_int64(count_axis.labels[level])
4565-
counts = lib.count_level_2d(mask, labels, len(level_index))
4565+
counts = lib.count_level_2d(mask, labels, len(level_index), axis=0)
45664566

45674567
result = DataFrame(counts, index=level_index,
45684568
columns=agg_axis)

pandas/core/groupby.py

+19 -10
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,7 @@
6969
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
7070
'resample',
7171
'describe',
72-
'rank', 'quantile', 'count',
72+
'rank', 'quantile',
7373
'fillna',
7474
'mad',
7575
'any', 'all',
@@ -149,9 +149,6 @@ def _last(x):
149149
return _last(x)
150150

151151

152-
def _count_compat(x, axis=0):
153-
return x.count() # .size != .count(); count excludes nan
154-
155152
class Grouper(object):
156153
"""
157154
A Grouper allows the user to specify a groupby instruction for a target object
@@ -801,11 +798,6 @@ def size(self):
801798
numeric_only=False, _convert=True)
802799
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
803800
_convert=True)
804-
_count = _groupby_function('_count', 'count', _count_compat,
805-
numeric_only=False)
806-
807-
def count(self, axis=0):
808-
return self._count().astype('int64')
809801

810802
def ohlc(self):
811803
"""
@@ -1463,7 +1455,6 @@ def get_group_levels(self):
14631455
'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
14641456
},
14651457
'last': 'group_last',
1466-
'count': 'group_count',
14671458
}
14681459

14691460
_cython_arity = {
@@ -3468,6 +3459,24 @@ def _apply_to_column_groupbys(self, func):
34683459
in self._iterate_column_groupbys()),
34693460
keys=self._selected_obj.columns, axis=1)
34703461

3462+
def count(self):
3463+
from functools import partial
3464+
from pandas.lib import count_level_2d
3465+
from pandas.core.common import _isnull_ndarraylike as isnull
3466+
3467+
data, _ = self._get_data_to_aggregate()
3468+
ids, _, ngroups = self.grouper.group_info
3469+
mask = ids != -1
3470+
3471+
val = ((mask & ~isnull(blk.get_values())) for blk in data.blocks)
3472+
loc = (blk.mgr_locs for blk in data.blocks)
3473+
3474+
counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1)
3475+
blk = map(make_block, map(counter, val), loc)
3476+
3477+
return self._wrap_agged_blocks(data.items, list(blk))
3478+
3479+
34713480
from pandas.tools.plotting import boxplot_frame_groupby
34723481
DataFrameGroupBy.boxplot = boxplot_frame_groupby
34733482

pandas/lib.pyx

+19 -6
Original file line number | Diff line number | Diff line change
@@ -1253,19 +1253,32 @@ def lookup_values(ndarray[object] values, dict mapping):
12531253
return maybe_convert_objects(result)
12541254

12551255

1256+
@cython.boundscheck(False)
1257+
@cython.wraparound(False)
12561258
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
1257-
ndarray[int64_t] labels, Py_ssize_t max_bin):
1259+
ndarray[int64_t, ndim=1] labels,
1260+
Py_ssize_t max_bin,
1261+
int axis):
12581262
cdef:
12591263
Py_ssize_t i, j, k, n
12601264
ndarray[int64_t, ndim=2] counts
12611265

1266+
assert(axis == 0 or axis == 1)
12621267
n, k = (<object> mask).shape
1263-
counts = np.zeros((max_bin, k), dtype='i8')
12641268

1265-
for i from 0 <= i < n:
1266-
for j from 0 <= j < k:
1267-
if mask[i, j]:
1268-
counts[labels[i], j] += 1
1269+
if axis == 0:
1270+
counts = np.zeros((max_bin, k), dtype='i8')
1271+
with nogil:
1272+
for i from 0 <= i < n:
1273+
for j from 0 <= j < k:
1274+
counts[labels[i], j] += mask[i, j]
1275+
1276+
else: # axis == 1
1277+
counts = np.zeros((n, max_bin), dtype='i8')
1278+
with nogil:
1279+
for i from 0 <= i < n:
1280+
for j from 0 <= j < k:
1281+
counts[i, labels[j]] += mask[i, j]
12691282

12701283
return counts
12711284

pandas/src/generate_code.py

-46
Original file line number | Diff line number | Diff line change
@@ -971,44 +971,6 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
971971
972972
"""
973973

974-
group_count_template = """@cython.boundscheck(False)
975-
@cython.wraparound(False)
976-
def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
977-
ndarray[int64_t] counts,
978-
ndarray[%(c_type)s, ndim=2] values,
979-
ndarray[int64_t] labels):
980-
'''
981-
Only aggregates on axis=0
982-
'''
983-
cdef:
984-
Py_ssize_t i, j, lab, ncounts = len(counts)
985-
Py_ssize_t N = values.shape[0], K = values.shape[1]
986-
%(c_type)s val
987-
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
988-
dtype=np.int64)
989-
990-
if len(values) != len(labels):
991-
raise AssertionError("len(index) != len(labels)")
992-
993-
994-
%(nogil)s
995-
%(tab)sfor i in range(N):
996-
%(tab)s lab = labels[i]
997-
%(tab)s if lab < 0:
998-
%(tab)s continue
999-
1000-
%(tab)s counts[lab] += 1
1001-
%(tab)s for j in range(K):
1002-
%(tab)s val = values[i, j]
1003-
1004-
%(tab)s # not nan
1005-
%(tab)s nobs[lab, j] += val == val and val != iNaT
1006-
1007-
%(tab)sfor i in range(ncounts):
1008-
%(tab)s for j in range(K):
1009-
%(tab)s out[i, j] = nobs[i, j]
1010-
"""
1011-
1012974
# add passing bin edges, instead of labels
1013975

1014976

@@ -1995,8 +1957,6 @@ def generate_from_template(template, exclude=None):
19951957
groupby_min_max = [group_min_template,
19961958
group_max_template]
19971959

1998-
groupby_count = [group_count_template]
1999-
20001960
templates_1d = [map_indices_template,
20011961
pad_template,
20021962
backfill_template,
@@ -2051,12 +2011,6 @@ def generate_take_cython_file():
20512011
print(generate_put_min_max_template(template, use_ints=True),
20522012
file=f)
20532013

2054-
for template in groupby_count:
2055-
print(generate_put_selection_template(template, use_ints=True,
2056-
use_datelikes=True,
2057-
use_objects=True),
2058-
file=f)
2059-
20602014
for template in nobool_1d_templates:
20612015
print(generate_from_template(template, exclude=['bool']), file=f)
20622016

pandas/src/generated.pyx

-186
Original file line number | Diff line number | Diff line change
@@ -7930,192 +7930,6 @@ def group_max_int64(ndarray[int64_t, ndim=2] out,
79307930
out[i, j] = maxx[i, j]
79317931

79327932

7933-
@cython.boundscheck(False)
7934-
@cython.wraparound(False)
7935-
def group_count_float64(ndarray[float64_t, ndim=2] out,
7936-
ndarray[int64_t] counts,
7937-
ndarray[float64_t, ndim=2] values,
7938-
ndarray[int64_t] labels):
7939-
'''
7940-
Only aggregates on axis=0
7941-
'''
7942-
cdef:
7943-
Py_ssize_t i, j, lab, ncounts = len(counts)
7944-
Py_ssize_t N = values.shape[0], K = values.shape[1]
7945-
float64_t val
7946-
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
7947-
dtype=np.int64)
7948-
7949-
if len(values) != len(labels):
7950-
raise AssertionError("len(index) != len(labels)")
7951-
7952-
7953-
with nogil:
7954-
for i in range(N):
7955-
lab = labels[i]
7956-
if lab < 0:
7957-
continue
7958-
7959-
counts[lab] += 1
7960-
for j in range(K):
7961-
val = values[i, j]
7962-
7963-
# not nan
7964-
nobs[lab, j] += val == val and val != iNaT
7965-
7966-
for i in range(ncounts):
7967-
for j in range(K):
7968-
out[i, j] = nobs[i, j]
7969-
7970-
@cython.boundscheck(False)
7971-
@cython.wraparound(False)
7972-
def group_count_float32(ndarray[float32_t, ndim=2] out,
7973-
ndarray[int64_t] counts,
7974-
ndarray[float32_t, ndim=2] values,
7975-
ndarray[int64_t] labels):
7976-
'''
7977-
Only aggregates on axis=0
7978-
'''
7979-
cdef:
7980-
Py_ssize_t i, j, lab, ncounts = len(counts)
7981-
Py_ssize_t N = values.shape[0], K = values.shape[1]
7982-
float32_t val
7983-
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
7984-
dtype=np.int64)
7985-
7986-
if len(values) != len(labels):
7987-
raise AssertionError("len(index) != len(labels)")
7988-
7989-
7990-
with nogil:
7991-
for i in range(N):
7992-
lab = labels[i]
7993-
if lab < 0:
7994-
continue
7995-
7996-
counts[lab] += 1
7997-
for j in range(K):
7998-
val = values[i, j]
7999-
8000-
# not nan
8001-
nobs[lab, j] += val == val and val != iNaT
8002-
8003-
for i in range(ncounts):
8004-
for j in range(K):
8005-
out[i, j] = nobs[i, j]
8006-
8007-
@cython.boundscheck(False)
8008-
@cython.wraparound(False)
8009-
def group_count_int64(ndarray[int64_t, ndim=2] out,
8010-
ndarray[int64_t] counts,
8011-
ndarray[int64_t, ndim=2] values,
8012-
ndarray[int64_t] labels):
8013-
'''
8014-
Only aggregates on axis=0
8015-
'''
8016-
cdef:
8017-
Py_ssize_t i, j, lab, ncounts = len(counts)
8018-
Py_ssize_t N = values.shape[0], K = values.shape[1]
8019-
int64_t val
8020-
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
8021-
dtype=np.int64)
8022-
8023-
if len(values) != len(labels):
8024-
raise AssertionError("len(index) != len(labels)")
8025-
8026-
8027-
with nogil:
8028-
for i in range(N):
8029-
lab = labels[i]
8030-
if lab < 0:
8031-
continue
8032-
8033-
counts[lab] += 1
8034-
for j in range(K):
8035-
val = values[i, j]
8036-
8037-
# not nan
8038-
nobs[lab, j] += val == val and val != iNaT
8039-
8040-
for i in range(ncounts):
8041-
for j in range(K):
8042-
out[i, j] = nobs[i, j]
8043-
8044-
@cython.boundscheck(False)
8045-
@cython.wraparound(False)
8046-
def group_count_object(ndarray[object, ndim=2] out,
8047-
ndarray[int64_t] counts,
8048-
ndarray[object, ndim=2] values,
8049-
ndarray[int64_t] labels):
8050-
'''
8051-
Only aggregates on axis=0
8052-
'''
8053-
cdef:
8054-
Py_ssize_t i, j, lab, ncounts = len(counts)
8055-
Py_ssize_t N = values.shape[0], K = values.shape[1]
8056-
object val
8057-
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
8058-
dtype=np.int64)
8059-
8060-
if len(values) != len(labels):
8061-
raise AssertionError("len(index) != len(labels)")
8062-
8063-
8064-
8065-
for i in range(N):
8066-
lab = labels[i]
8067-
if lab < 0:
8068-
continue
8069-
8070-
counts[lab] += 1
8071-
for j in range(K):
8072-
val = values[i, j]
8073-
8074-
# not nan
8075-
nobs[lab, j] += val == val and val != iNaT
8076-
8077-
for i in range(ncounts):
8078-
for j in range(K):
8079-
out[i, j] = nobs[i, j]
8080-
8081-
@cython.boundscheck(False)
8082-
@cython.wraparound(False)
8083-
def group_count_int64(ndarray[int64_t, ndim=2] out,
8084-
ndarray[int64_t] counts,
8085-
ndarray[int64_t, ndim=2] values,
8086-
ndarray[int64_t] labels):
8087-
'''
8088-
Only aggregates on axis=0
8089-
'''
8090-
cdef:
8091-
Py_ssize_t i, j, lab, ncounts = len(counts)
8092-
Py_ssize_t N = values.shape[0], K = values.shape[1]
8093-
int64_t val
8094-
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
8095-
dtype=np.int64)
8096-
8097-
if len(values) != len(labels):
8098-
raise AssertionError("len(index) != len(labels)")
8099-
8100-
8101-
with nogil:
8102-
for i in range(N):
8103-
lab = labels[i]
8104-
if lab < 0:
8105-
continue
8106-
8107-
counts[lab] += 1
8108-
for j in range(K):
8109-
val = values[i, j]
8110-
8111-
# not nan
8112-
nobs[lab, j] += val == val and val != iNaT
8113-
8114-
for i in range(ncounts):
8115-
for j in range(K):
8116-
out[i, j] = nobs[i, j]
8117-
8118-
81197933
@cython.wraparound(False)
81207934
@cython.boundscheck(False)
81217935
def left_join_indexer_unique_float64(ndarray[float64_t] left,

0 commit comments

Comments
 (0)