new: groupby procedure

semio · semio · commit 2c51b6d48db2 · 2016-12-12T16:18:38.000+08:00
related issue: #25. See tests/recipes/test_groupby.yaml for an example recipe.
diff --git a/ddf_utils/chef/helpers.py b/ddf_utils/chef/helpers.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 
-from functools import wraps
+from functools import wraps, partial
+import numpy as np
+from .. import ops
 import logging
 
 
@@ -12,6 +14,14 @@ def read_opt(options, key, required=False, default=None):
     return default
 
 
+def mkfunc(options):
+    """Build a callable: np.<name> for a str, else ops.<function> with the other keys bound."""
+    if isinstance(options, str):
+        return getattr(np, options)
+    kwargs = dict(options)  # copy: pop() on the caller's dict would break re-using the recipe
+    return partial(getattr(ops, kwargs.pop('function')), **kwargs)
+
+
 def log_shape(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
diff --git a/ddf_utils/chef/procedure.py b/ddf_utils/chef/procedure.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import numpy as np
 from . ingredient import BaseIngredient, Ingredient, ProcedureResult
-from .helpers import read_opt
+from .helpers import read_opt, mkfunc
 from .. import config
 from .. import transformer
 import time
@@ -420,33 +420,71 @@ def align(to_align: BaseIngredient, base: Ingredient, *, result=None, **options)
         return ProcedureResult(result, to_replace, data=ing_data)
 
 
-def groupby(ingredient: BaseIngredient, *, result=None, **options) -> ProcedureResult:
+def groupby(ingredient: BaseIngredient, *, result, **options) -> ProcedureResult:
     """group ingredient data by column(s) and run aggregate function
 
     available options:
-        by: the column(s) to group, can be a list or a string
-        aggregate: the function to aggregate. Default: sum
-    """
+        groupby: the column(s) to group, can be a list or a string
+        aggregate/transform/filter: the function to run. only one of them should be supplied.
 
-    data = ingredient.get_data()
-    by = options.pop('by')
+    The function block maps each column to a function, in one of these formats:
+
+    aggregate:
+      column1: funcname1
+      column2: funcname2
+
+    or
+
+    aggregate:
+      column:
+        function: funcname
+        param1: foo
+        param2: bar
+
+    a plain funcname is looked up in numpy, a `function` block in ddf_utils.ops
+    (see mkfunc). other columns not mentioned will be dropped. only aggregate
+    changes the key of the result (to the groupby columns). raises ValueError
+    when options do not contain exactly one of aggregate/transform/filter.
+    """
 
     logger.info("groupby: " + ingredient.ingred_id)
 
-    try:
-        agg = options.pop('aggregate')
-    except KeyError:
-        logger.warning("no aggregate function found, assuming sum()")
-        agg = 'sum'
+    data = ingredient.get_data()
+    by = options.pop('groupby')
 
-    for k, df in data.items():
-        df = df.groupby(by=by).agg({k: agg})
-        newkey = ','.join(df.index.names)
-        data[k] = df.reset_index()
+    # exactly one of aggregate/transform/filter must remain in options. raise
+    # instead of assert so the check is not stripped under `python -O`.
+    if len(options) != 1 or next(iter(options)) not in ('aggregate', 'transform', 'filter'):
+        raise ValueError("expect exactly one of aggregate/transform/filter, got: "
+                         + str(list(options)))
+    comp_type, funcs = next(iter(options.items()))
 
-    if not result:
-        result = ingredient.ingred_id + '-agg'
-    return ProcedureResult(result, newkey, data=data)
+    # groupby may be a single column name or a list of them; normalize to a list
+    # for every computation type (a list used to get wrapped in another list
+    # for transform/filter).
+    if not isinstance(by, list):
+        by = [by]
+
+    if comp_type == 'aggregate':  # only aggregate should change the key of ingredient
+        newkey = ','.join(by)
+        logger.debug("changing the key to: " + str(newkey))
+    else:
+        newkey = ingredient.key
+
+    newdata = dict()
+
+    for k, func in funcs.items():
+        func = mkfunc(func)
+        if comp_type == 'aggregate':
+            newdata[k] = data[k].groupby(by=by).agg({k: func}).reset_index()
+        else:  # transform/filter keep the ingredient's key: group on index levels
+            df = data[k].set_index(ingredient.key_to_list())
+            levels = [df.index.names.index(x) for x in by]
+            grouped = df.groupby(level=levels)[k]
+            run = grouped.transform if comp_type == 'transform' else grouped.filter
+            newdata[k] = run(func).reset_index()
+
+    return ProcedureResult(result, newkey, data=newdata)
 
 
 def accumulate(ingredient: BaseIngredient, *, result=None, **options) -> ProcedureResult:
@@ -563,3 +601,9 @@ def extract_concepts(*ingredients: List[BaseIngredient],
     return ProcedureResult(result, 'concept', data=concepts.reset_index())
 
 
+def trend_bridge(ingredient: BaseIngredient, result, **options) -> ProcedureResult:
+    """run trend bridge on ingredients
+
+    not implemented yet: calling this procedure always raises NotImplementedError.
+    """
+    raise NotImplementedError('trend_bridge procedure is not implemented yet')
diff --git a/tests/recipes/test_groupby.yaml b/tests/recipes/test_groupby.yaml
@@ -0,0 +1,46 @@
+info:
+    id: test_groupby
+
+ingredients:
+    - id: datapoints-ilo
+      dataset: ddf--ilo--kilm_employment_sector
+      key: "country,sex,year"
+      value: "*"
+
+cooking:
+    datapoints:
+        - procedure: groupby
+          ingredients:
+              - datapoints-ilo
+          options:
+              groupby:
+                  - country
+                  - year
+              aggregate:  # result key becomes country,year; columns not listed are dropped
+                  agriculture_thousands: sum  # plain name: resolved as a numpy function
+                  agriculture_percentage: sum
+          result: datapoints-by-country-year  # input of the two groupby steps below
+        - procedure: groupby
+          ingredients:
+              - datapoints-by-country-year
+          options:
+              groupby: country  # a single column may be given as a plain string
+              filter:
+                agriculture_percentage:
+                      function: gt  # looked up in ddf_utils.ops; how/val are passed as kwargs
+                      how: all
+                      val: 100
+          result: grouped-datapoints-1
+        - procedure: groupby
+          ingredients:
+              - datapoints-by-country-year
+          options:
+              groupby: country
+              transform:
+                  agriculture_thousands:
+                      function: zcore  # NOTE(review): must match a name in ddf_utils.ops - confirm spelling
+          result: grouped-datapoints-2
+
+serving:
+    - grouped-datapoints-1
+    - grouped-datapoints-2