Commit 9bb210c

ENH: multi-GroupBy refactoring to be less nested, reuse more code, getting toward addressing #496
1 parent b16517e commit 9bb210c

File tree: 3 files changed, +74 -178 lines changed

pandas/core/groupby.py
pandas/src/groupby.pyx
pandas/src/reduce.pyx

pandas/core/groupby.py

Lines changed: 44 additions & 102 deletions
@@ -232,20 +232,15 @@ def _multi_iter(self):
         elif isinstance(self.obj, Series):
             tipo = Series
 
-        def flatten(gen, level=0, shape_axis=0):
-            ids = self.groupings[level].ids
-            for cat, subgen in gen:
-                if subgen is None:
-                    continue
-
-                if isinstance(subgen, tipo):
-                    yield (ids[cat],), subgen
-                else:
-                    for subcat, data in flatten(subgen, level=level+1,
-                                                shape_axis=shape_axis):
-                        yield (ids[cat],) + subcat, data
+        id_list = [ping.ids for ping in self.groupings]
+        shape = tuple(len(ids) for ids in id_list)
 
-        return flatten(self._generator_factory(data), shape_axis=self.axis)
+        for label, group in self._generator_factory(data):
+            if group is None:
+                continue
+            unraveled = np.unravel_index(label, shape)
+            key = tuple(id_list[i][j] for i, j in enumerate(unraveled))
+            yield key, group
 
     def apply(self, func, *args, **kwargs):
         """
@@ -387,51 +382,31 @@ def _python_agg_general(self, func, *args, **kwargs):
         group_shape = self._group_shape
         counts = np.zeros(group_shape, dtype=int)
 
-        # want to cythonize?
-        def _doit(reschunk, ctchunk, gen, shape_axis=0):
-            for i, (_, subgen) in enumerate(gen):
-                # TODO: fixme
-                if subgen is None:
+        # todo: cythonize?
+        def _aggregate(output, counts, generator, shape_axis=0):
+            for label, group in generator:
+                if group is None:
                     continue
+                counts[label] = group.shape[shape_axis]
+                output[label] = func(group, *args, **kwargs)
 
-                if isinstance(subgen, PandasObject):
-                    size = subgen.shape[shape_axis]
-                    ctchunk[i] = size
-                    reschunk[i] = func(subgen, *args, **kwargs)
-                else:
-                    _doit(reschunk[i], ctchunk[i], subgen,
-                          shape_axis=shape_axis)
-
-        gen_factory = self._generator_factory
-
-        try:
-            stride_shape = self._agg_stride_shape
-            output = np.empty(group_shape + stride_shape, dtype=float)
-            output.fill(np.nan)
-            obj = self._obj_with_exclusions
-            _doit(output, counts, gen_factory(obj), shape_axis=self.axis)
-            mask = counts.ravel() > 0
-            output = output.reshape((np.prod(group_shape),) + stride_shape)
-            output = output[mask]
-        except Exception:
-            # we failed, try to go slice-by-slice / column-by-column
-
-            result = np.empty(group_shape, dtype=float)
-            result.fill(np.nan)
-            # iterate through "columns" ex exclusions to populate output dict
-            output = {}
-            for name, obj in self._iterate_slices():
-                try:
-                    _doit(result, counts, gen_factory(obj))
-                    # TODO: same mask for every column...
-                    output[name] = result.ravel().copy()
-                    result.fill(np.nan)
-                except TypeError:
-                    continue
+        result = np.empty(group_shape, dtype=float)
+        result.fill(np.nan)
+        # iterate through "columns" ex exclusions to populate output dict
+        output = {}
+        for name, obj in self._iterate_slices():
+            try:
+                _aggregate(result.ravel(), counts.ravel(),
+                           self._generator_factory(obj))
+                # TODO: same mask for every column...
+                output[name] = result.ravel().copy()
+                result.fill(np.nan)
+            except TypeError:
+                continue
 
-            mask = counts.ravel() > 0
-            for name, result in output.iteritems():
-                output[name] = result[mask]
+        mask = counts.ravel() > 0
+        for name, result in output.iteritems():
+            output[name] = result[mask]
 
         return self._wrap_aggregated_output(output, mask)
 
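Note on the hunk above: with groups delivered flat, _aggregate indexes raveled views of the result and counts arrays directly instead of recursing through nested chunks. A self-contained sketch of that pattern (labels and group arrays invented for illustration):

    import numpy as np

    group_shape = (2, 3)
    result = np.empty(group_shape, dtype=float)
    result.fill(np.nan)
    counts = np.zeros(group_shape, dtype=int)

    out, cts = result.ravel(), counts.ravel()   # views into result/counts
    for label, group in [(0, np.array([1., 2.])), (4, np.array([5.]))]:
        cts[label] = group.shape[0]
        out[label] = group.sum()                # stand-in for func(group)

    mask = counts.ravel() > 0
    print(result.ravel()[mask])                 # [3. 5.] -- empty groups dropped
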
@@ -869,7 +844,7 @@ class DataFrameGroupBy(GroupBy):
     def _agg_stride_shape(self):
         if self._column is not None:
             # ffffff
-            return 1
+            return 1,
 
         if self.axis == 0:
             n = len(self.obj.columns)
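
Note on the hunk above: _agg_stride_shape is consumed as a tuple (e.g. group_shape + stride_shape in the old _python_agg_general), so the trailing comma is the fix: 1, is the one-tuple (1,), while the bare integer 1 cannot be concatenated:

    group_shape = (2, 3)
    print(group_shape + (1,))   # (2, 3, 1)
    # group_shape + 1           # TypeError: can only concatenate tuple (not "int")
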
@@ -1322,8 +1297,14 @@ def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
     -------
     generator
     """
-    indexer = _get_group_sorter(label_list, shape)
-    sorted_labels = [labels.take(indexer) for labels in label_list]
+    group_index = get_group_index(label_list, shape)
+    na_mask = np.zeros(len(label_list[0]), dtype=bool)
+    for arr in label_list:
+        na_mask |= arr == -1
+    group_index[na_mask] = -1
+    indexer = lib.groupsort_indexer(group_index.astype('i4'),
+                                    np.prod(shape))
+    group_index = group_index.take(indexer)
 
     if isinstance(data, BlockManager):
         # this is sort of wasteful but...
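
Note on the hunk above: get_group_index flattens the per-key label arrays into one row-major group index, and rows with a missing value (-1) in any key are forced to -1 before sorting. A small worked example of the same computation (data invented for illustration):

    import numpy as np

    labels1 = np.array([0, 1, 1, 0, -1])    # -1 marks a missing key value
    labels2 = np.array([2, 0, 2, 1, 1])
    shape = (2, 3)

    group_index = labels1 * shape[1] + labels2   # row-major flat index
    na_mask = (labels1 == -1) | (labels2 == -1)
    group_index[na_mask] = -1
    print(group_index)                           # [ 2  3  5  1 -1]
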
@@ -1335,29 +1316,6 @@ def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
     elif isinstance(data, DataFrame):
         sorted_data = data.take(indexer, axis=axis)
 
-    gen = _generate_groups(sorted_data, sorted_labels, shape,
-                           0, len(label_list[0]), axis=axis, which=0,
-                           factory=factory)
-    for key, group in gen:
-        yield key, group
-
-def _get_group_sorter(label_list, shape):
-    group_index = get_group_index(label_list, shape)
-    na_mask = np.zeros(len(label_list[0]), dtype=bool)
-    for arr in label_list:
-        na_mask |= arr == -1
-    group_index[na_mask] = -1
-    indexer = lib.groupsort_indexer(group_index.astype('i4'),
-                                    np.prod(shape))
-
-    return indexer
-
-def _generate_groups(data, labels, shape, start, end, axis=0, which=0,
-                     factory=lambda x: x):
-    axis_labels = labels[which][start:end]
-    edges = axis_labels.searchsorted(np.arange(1, shape[which] + 1),
-                                     side='left')
-
     if isinstance(data, DataFrame):
         def slicer(data, slob):
             if axis == 0:
@@ -1371,29 +1329,13 @@ def slicer(data, slob):
         def slicer(data, slob):
             return data[slob]
 
-    do_slice = which == len(labels) - 1
+    starts, ends = lib.generate_slices(group_index, np.prod(shape))
 
-    # omit -1 values at beginning-- NA values
-    left = axis_labels.searchsorted(0)
-
-    # time to actually aggregate
-    for i, right in enumerate(edges):
-        if do_slice:
-            slob = slice(start + left, start + right)
-
-            # skip empty groups in the cartesian product
-            if left == right:
-                yield i, None
-                continue
-
-            yield i, slicer(data, slob)
+    for i, (start, end) in enumerate(zip(starts, ends)):
+        if start == end:
+            yield i, None
         else:
-            # yield subgenerators, yikes
-            yield i, _generate_groups(data, labels, shape, start + left,
-                                      start + right, axis=axis,
-                                      which=which + 1, factory=factory)
-
-            left = right
+            yield i, slicer(sorted_data, slice(start, end))
 
 def get_group_index(label_list, shape):
     n = len(label_list[0])
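
Note on the two hunks above: once the data is sorted by the flat group index, every group occupies one contiguous run of rows, so generate_groups can yield plain slices instead of nested subgenerators. A rough end-to-end sketch, with np.argsort standing in for lib.groupsort_indexer and searchsorted standing in for lib.generate_slices (both substitutions are assumptions; the real helpers also handle -1/NA labels):

    import numpy as np

    labels1 = np.array([1, 0, 1, 0])
    labels2 = np.array([0, 1, 1, 1])
    shape = (2, 2)

    group_index = labels1 * shape[1] + labels2        # get_group_index, row-major
    indexer = np.argsort(group_index, kind='stable')  # ~ lib.groupsort_indexer
    group_index = group_index.take(indexer)           # [1 1 2 3]
    data = np.array([10., 20., 30., 40.]).take(indexer)

    ngroups = np.prod(shape)
    starts = group_index.searchsorted(np.arange(ngroups), side='left')
    ends = group_index.searchsorted(np.arange(ngroups), side='right')

    for i, (start, end) in enumerate(zip(starts, ends)):
        print(i, None if start == end else data[start:end])
    # 0 None
    # 1 [20. 40.]
    # 2 [10.]
    # 3 [30.]
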

pandas/src/groupby.pyx

Lines changed: 28 additions & 75 deletions
@@ -261,44 +261,6 @@ def get_unique_labels(ndarray[object] values, dict idMap):
 
     return fillVec
 
-# from libcpp.set cimport set as stlset
-
-# cdef fast_unique_int32(ndarray arr):
-#     cdef:
-#         cdef stlset[int] table
-
-#         Py_ssize_t i, n = len(arr)
-#         int32_t* values
-#         list uniques = []
-#         int32_t val
-
-#     values = <int32_t*> arr.data
-
-#     for i from 0 <= i < n:
-#         val = values[i]
-#         if table.count(val) == 0:
-#             table.insert(val)
-#             uniques.append(val)
-#     return np.asarray(sorted(uniques), dtype=object)
-
-
-def _group_reorder(values, label_list, shape):
-    # group_index = np.zeros(len(label_list[0]), dtype='i4')
-    # for i in xrange(len(shape)):
-    #     stride = np.prod([x for x in shape[i+1:]], dtype='i4')
-    #     group_index += label_list[i] * stride
-    # na_mask = np.zeros(len(label_list[0]), dtype=bool)
-    # for arr in label_list:
-    #     na_mask |= arr == -1
-    # group_index[na_mask] = -1
-
-    # indexer = groupsort_indexer(group_index, np.prod(shape))
-
-    indexer = np.lexsort(label_list[::-1])
-    sorted_labels = [labels.take(indexer) for labels in label_list]
-    sorted_values = values.take(indexer)
-    return sorted_values, sorted_labels
-
 @cython.wraparound(False)
 def groupsort_indexer(ndarray[int32_t] index, Py_ssize_t ngroups):
     cdef:
@@ -326,39 +288,10 @@ def groupsort_indexer(ndarray[int32_t] index, Py_ssize_t ngroups):
     return result
 
 
-# cdef int _aggregate_group(float64_t *out, int32_t *counts, float64_t *values,
-#                           list labels, int start, int end, tuple shape,
-#                           Py_ssize_t which, Py_ssize_t offset,
-#                           agg_func func) except -1:
-#     cdef:
-#         ndarray[int32_t] axis
-#         cdef Py_ssize_t stride
-
-#     # time to actually aggregate
-#     if which == len(labels) - 1:
-#         axis = labels[which]
-
-#         while start < end and axis[start] == -1:
-#             start += 1
-#         func(out, counts, values, <int32_t*> axis.data, start, end, offset)
-#     else:
-#         axis = labels[which][start:end]
-#         stride = np.prod(shape[which+1:])
-#         # get group counts on axisp
-#         edges = axis.searchsorted(np.arange(1, shape[which] + 1), side='left')
-#         # print edges, axis
-
-#         left = axis.searchsorted(0) # ignore NA values coded as -1
-
-#         # aggregate each subgroup
-#         for right in edges:
-#             _aggregate_group(out, counts, values, labels, start + left,
-#                              start + right, shape, which + 1, offset, func)
-#             offset += stride
-#             left = right
 
 # TODO: aggregate multiple columns in single pass
 
+@cython.boundscheck(False)
 @cython.wraparound(False)
 def group_add(ndarray[float64_t] out,
               ndarray[int32_t] counts,
@@ -391,6 +324,7 @@ def group_add(ndarray[float64_t] out,
         else:
             out[i] = sumx[i]
 
+@cython.boundscheck(False)
 @cython.wraparound(False)
 def group_mean(ndarray[float64_t] out,
                ndarray[int32_t] counts,
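
Note on the hunks above and below: the kernels themselves are unchanged here apart from the added @cython.boundscheck(False); for orientation, a rough pure-Python sketch of what a kernel like group_add computes (the exact NA handling is an assumption, not shown in this diff):

    import numpy as np

    def group_add_py(out, counts, values, labels):
        # accumulate per-label sums and observation counts
        sumx = np.zeros(len(out))
        nobs = np.zeros(len(out), dtype=int)
        for val, lab in zip(values, labels):
            if lab == -1 or np.isnan(val):   # assumed: skip NA labels/values
                continue
            nobs[lab] += 1
            sumx[lab] += val
        for i in range(len(out)):
            counts[i] = nobs[i]
            out[i] = np.nan if nobs[i] == 0 else sumx[i]  # cf. the else: above

    out = np.empty(3)
    counts = np.zeros(3, dtype='i4')
    group_add_py(out, counts, np.array([1., 2., 4.]), np.array([0, 0, 2]))
    print(out, counts)   # [ 3. nan  4.] [2 0 1]
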
@@ -424,6 +358,7 @@ def group_mean(ndarray[float64_t] out,
         else:
             out[i] = sumx[i] / count
 
+@cython.boundscheck(False)
 @cython.wraparound(False)
 def group_var(ndarray[float64_t] out,
               ndarray[int32_t] counts,
@@ -460,13 +395,6 @@ def group_var(ndarray[float64_t] out,
             out[i] = ((ct * sumxx[i] - sumx[i] * sumx[i]) /
                       (ct * ct - ct))
 
-def _result_shape(label_list):
-    # assumed sorted
-    shape = []
-    for labels in label_list:
-        shape.append(1 + labels[-1])
-    return tuple(shape)
-
 def reduce_mean(ndarray[object] indices,
                 ndarray[object] buckets,
                 ndarray[float64_t] values,
@@ -584,6 +512,31 @@ def duplicated(list values, take_last=False):
 
     return result.view(np.bool_)
 
+
+def generate_slices(ndarray[Py_ssize_t] labels, Py_ssize_t ngroups):
+    cdef:
+        Py_ssize_t i, group_size, n, lab, start
+        object slobj
+        ndarray[int32_t] starts
+
+    n = len(labels)
+
+    starts = np.zeros(ngroups, dtype='i4')
+    ends = np.zeros(ngroups, dtype='i4')
+
+    start = 0
+    group_size = 0
+    for i in range(n):
+        group_size += 1
+        lab = labels[i]
+        if i == n - 1 or lab != labels[i + 1]:
+            starts[lab] = start
+            ends[lab] = start + group_size
+            start += group_size
+            group_size = 0
+
+    return starts, ends
+
 '''
 
 def ts_upsample_mean(ndarray[object] indices,
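
Note on the hunk above: generate_slices scans the group-sorted labels once and records each group's contiguous [start, end) run; groups that never appear keep start == end == 0, which generate_groups turns into a None yield. A pure-Python rendering of the same loop, handy for checking the logic:

    import numpy as np

    def generate_slices_py(labels, ngroups):
        n = len(labels)
        starts = np.zeros(ngroups, dtype='i4')
        ends = np.zeros(ngroups, dtype='i4')
        start = 0
        group_size = 0
        for i in range(n):
            group_size += 1
            lab = labels[i]
            if i == n - 1 or lab != labels[i + 1]:   # end of a run
                starts[lab] = start
                ends[lab] = start + group_size
                start += group_size
                group_size = 0
        return starts, ends

    starts, ends = generate_slices_py(np.array([0, 0, 1, 3, 3]), 4)
    print(starts, ends)   # [0 2 0 3] [2 3 0 5] -- group 2 is empty
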

pandas/src/reduce.pyx

Lines changed: 2 additions & 1 deletion
@@ -99,7 +99,8 @@ cdef class Grouper:
         object arr, dummy, f, labels, counts
         bint passed_dummy
 
-    def __init__(self, object arr, object f, object labels, ngroups, dummy=None):
+    def __init__(self, object arr, object index, object f,
+                 object labels, ngroups, dummy=None):
         n = len(arr)
 
         assert(arr.ndim == 1)
