@@ -89,6 +89,122 @@ cdef class Reducer:
89
89
raise ValueError (' function does not reduce' )
90
90
return result
91
91
92
cdef class Grouper:
    '''
    Performs generic grouping operation while avoiding ndarray construction
    overhead

    A single ndarray object (the "dummy") is re-pointed at successive slices
    of the input buffer and handed to ``f`` once per group, so no new array
    is built for each group.  A group is a run of consecutive equal entries
    in ``labels``.

    Parameters
    ----------
    arr : 1-dimensional ndarray
        Values to aggregate.  Copied if it is not contiguous.
    f : callable
        Called as ``f(chunk)`` for every group; must return a scalar-like
        value (not an ndarray, and not a list as long as the chunk).
    labels : int32 ndarray
        Group label for each element of ``arr``; entries ``0 .. len(arr)-1``
        are read.  Labels are used directly as positions in the result.
        NOTE(review): labels are not range-checked against ``ngroups``.
    ngroups : int
        Length of the result and of the per-group counts array.
    dummy : ndarray, optional
        Array object to reuse as the per-group view.  Must have the same
        dtype as ``arr``.  An empty array is created when omitted.
    '''
    cdef:
        # nresults is declared but not used by any method of this class
        Py_ssize_t nresults, ngroups
        # counts holds the size of each group, filled in by get_result()
        object arr, dummy, f, labels, counts
        # records whether the caller supplied its own dummy
        bint passed_dummy

    def __init__(self, object arr, object f, object labels, ngroups,
                 dummy=None):
        assert(arr.ndim == 1)

        # get_result() walks the raw buffer, so it has to be contiguous
        if not arr.flags.contiguous:
            arr = arr.copy()

        self.labels = labels
        self.f = f
        self.arr = arr
        self.dummy = self._check_dummy(dummy)
        self.passed_dummy = dummy is not None

        self.counts = np.zeros(ngroups, dtype='i4')

        self.ngroups = ngroups

    def _check_dummy(self, dummy=None):
        '''
        Return the array object that get_result() will re-point at each
        group, creating an empty one of the right dtype when none is given.

        Raises
        ------
        ValueError
            If ``dummy`` does not have the same dtype as the input array.
        '''
        if dummy is None:
            dummy = np.empty(0, dtype=self.arr.dtype)
        else:
            if dummy.dtype != self.arr.dtype:
                raise ValueError('Dummy array must be same dtype')
            # No length check: groups have varying sizes and get_result()
            # overwrites the length for every group anyway.
            # The dummy's strides are kept when its data pointer is swapped,
            # so it has to be contiguous just like the input array.
            if not dummy.flags.contiguous:
                dummy = dummy.copy()

        return dummy

    def get_result(self):
        '''
        Call ``f`` on every group and collect the returned values.

        Group sizes are stored in ``self.counts`` as a side effect; they are
        not returned.

        Returns
        -------
        ndarray of length ``ngroups``.  The values are collected in an
        object array which is passed through ``maybe_convert_objects``
        before being returned.

        Raises
        ------
        ValueError
            If ``f`` does not reduce, or if the input array is empty so that
            no group was evaluated.
        '''
        cdef:
            char* dummy_buf
            npy_intp dummy_len
            ndarray arr, result, chunk
            ndarray[int32_t] labels, counts
            Py_ssize_t i, group_size, n, lab, itemsize
            flatiter it
            object res
            bint initialized = 0
            tuple args
            object kwds

        labels = self.labels
        counts = self.counts

        arr = self.arr
        chunk = self.dummy

        group_size = 0
        n = len(arr)
        # the data pointer is a char*, so it must be advanced in bytes
        itemsize = self.arr.dtype.itemsize

        # Build the argument tuple once; chunk is mutated in place between
        # calls.  PyTuple_SET_ITEM steals a reference, hence the INCREF.
        args = cpython.PyTuple_New(1)
        kwds = {}
        cpython.PyTuple_SET_ITEM(args, 0, chunk)
        cpython.Py_INCREF(chunk)

        # Remember the dummy's own buffer and length, then point it at the
        # input data.  Nothing that can fail sits between the swap and the
        # try block, so the finally clause always undoes it.
        dummy_buf = chunk.data
        dummy_len = chunk.shape[0]
        chunk.data = arr.data

        try:
            for i in range(n):
                group_size += 1

                lab = labels[i]

                # end of a run of equal labels: evaluate the group
                if i == n - 1 or lab != labels[i + 1]:
                    chunk.shape[0] = group_size

                    res = cpython.PyObject_Call(self.f, args, kwds)

                    if not initialized:
                        result = self._get_result_array(res)
                        it = <flatiter> PyArray_IterNew(result)
                        initialized = 1

                    PyArray_ITER_GOTO1D(it, lab)
                    PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
                    counts[lab] = group_size

                    # move to the first element of the next group
                    chunk.data = chunk.data + group_size * itemsize
                    group_size = 0
        finally:
            # so we don't free the wrong memory
            chunk.shape[0] = dummy_len
            chunk.data = dummy_buf

        if not initialized:
            # empty input: f was never called, so there is no result array
            raise ValueError('No result: input array is empty')

        if result.dtype == np.object_:
            result = maybe_convert_objects(result)

        return result

    def _get_result_array(self, object res):
        '''
        Check that ``res`` (the first value returned by ``f``) looks like a
        reduced value and allocate the object array that holds one result
        per group.

        Raises
        ------
        ValueError
            If ``res`` is an ndarray, or a list with as many elements as the
            chunk it was computed from.
        '''
        # Explicit checks rather than asserts, which can be compiled out.
        if isinstance(res, np.ndarray):
            raise ValueError('function does not reduce')
        if isinstance(res, list) and len(res) == len(self.dummy):
            raise ValueError('function does not reduce')

        return np.empty(self.ngroups, dtype='O')
92
208
def reduce(arr, f, axis=0, dummy=None):
    '''
    Build a Reducer for ``arr`` and ``f`` and return its get_result().

    ``axis`` and ``dummy`` are forwarded to the Reducer constructor
    unchanged.
    '''
    return Reducer(arr, f, axis=axis, dummy=dummy).get_result()
0 commit comments