Skip to content

Commit 2c51b6d

Browse files
committed
new: groupby procedure
related issue: #25 see the test_groupby.yaml for an example recipe
1 parent 92afbb5 commit 2c51b6d

File tree

3 files changed

+120
-20
lines changed

3 files changed

+120
-20
lines changed

ddf_utils/chef/helpers.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# -*- coding: utf-8 -*-
22

3-
from functools import wraps
3+
from functools import wraps, partial
4+
import numpy as np
5+
from .. import ops
46
import logging
57

68

@@ -12,6 +14,14 @@ def read_opt(options, key, required=False, default=None):
1214
return default
1315

1416

17+
def mkfunc(options):
    """Build a callable from a recipe function specification.

    ``options`` is either:

    * a string, which is looked up as an attribute of numpy
      (``"sum"`` gives ``np.sum``); or
    * a mapping with a ``function`` entry naming an attribute of ``ops``.
      Every other entry of the mapping is bound to that callable as a
      keyword argument through ``functools.partial``.

    The mapping is copied before ``function`` is taken out of it, so the
    caller's recipe is left untouched and the same specification can be
    passed in again.

    Raises:
        AttributeError: if the named function does not exist on the module
            it is looked up in.
        KeyError: if a mapping is given without a ``function`` entry.
    """
    if isinstance(options, str):
        return getattr(np, options)

    # Work on a shallow copy: popping from the original would strip the
    # 'function' entry from the recipe and break any later reuse of it.
    params = dict(options)
    func = getattr(ops, params.pop('function'))
    return partial(func, **params)
23+
24+
1525
def log_shape(func):
1626
@wraps(func)
1727
def wrapper(*args, **kwargs):

ddf_utils/chef/procedure.py

+63-19
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pandas as pd
66
import numpy as np
77
from . ingredient import BaseIngredient, Ingredient, ProcedureResult
8-
from .helpers import read_opt
8+
from .helpers import read_opt, mkfunc
99
from .. import config
1010
from .. import transformer
1111
import time
@@ -420,33 +420,71 @@ def align(to_align: BaseIngredient, base: Ingredient, *, result=None, **options)
420420
return ProcedureResult(result, to_replace, data=ing_data)
421421

422422

423-
def groupby(ingredient: BaseIngredient, *, result=None, **options) -> ProcedureResult:
423+
def groupby(ingredient: BaseIngredient, *, result, **options) -> ProcedureResult:
    """Group ingredient data by column(s) and run a function on each group.

    Available options:

    groupby
        the column(s) to group by; a single column name or a list of names.
    aggregate / transform / filter
        the computation to run on the groups. Exactly one of them must be
        supplied.

    The computation block maps each data column to a function, either by
    name::

        aggregate:
            column1: funcname1
            column2: funcname2

    or with parameters::

        aggregate:
            column:
                function: funcname
                param1: foo
                param2: bar

    Each function specification is turned into a callable by ``mkfunc``.
    Columns not mentioned in the block are dropped from the result.

    Only ``aggregate`` changes the key of the ingredient (it becomes the
    grouping columns joined by commas); ``transform`` and ``filter`` keep
    ``ingredient.key``.

    Raises:
        KeyError: if the ``groupby`` option is missing.
        ValueError: if the remaining options are not exactly one of
            ``aggregate``, ``transform`` or ``filter``. (These checks used
            to be ``assert`` statements, which are removed under
            ``python -O``.)
    """
    logger.info("groupby: " + ingredient.ingred_id)

    data = ingredient.get_data()
    by = options.pop('groupby')

    # After removing 'groupby', exactly one computation block must remain.
    if len(options) != 1:
        raise ValueError(
            "groupby expects exactly one of aggregate/transform/filter, "
            "got: " + str(sorted(options)))
    comp_type = next(iter(options))
    if comp_type not in ('aggregate', 'transform', 'filter'):
        raise ValueError("unknown groupby computation: " + str(comp_type))

    # Normalise the grouping columns to a list once, for every computation
    # type. Wrapping unconditionally would turn an already-list option into
    # a nested list and break the index-level lookup below.
    if not isinstance(by, list):
        by = [by]

    if comp_type == 'aggregate':
        # only aggregate should change the key of ingredient
        newkey = ','.join(by)
        logger.debug("changing the key to: " + str(newkey))
    else:
        newkey = ingredient.key

    newdata = dict()

    for col, spec in options[comp_type].items():
        # mkfunc may consume entries of a dict spec; hand it a copy so the
        # recipe options are left intact.
        func = mkfunc(dict(spec) if isinstance(spec, dict) else spec)

        if comp_type == 'aggregate':
            newdata[col] = data[col].groupby(by=by).agg({col: func}).reset_index()
        else:
            # transform/filter keep the original key: index by the key
            # columns, group on the index levels named in `by`, and restore
            # the key columns afterwards.
            df = data[col].set_index(ingredient.key_to_list())
            levels = [df.index.names.index(x) for x in by]
            grouped = df.groupby(level=levels)[col]
            if comp_type == 'transform':
                newdata[col] = grouped.transform(func).reset_index()
            else:
                newdata[col] = grouped.filter(func).reset_index()

    return ProcedureResult(result, newkey, data=newdata)
450488

451489

452490
def accumulate(ingredient: BaseIngredient, *, result=None, **options) -> ProcedureResult:
@@ -563,3 +601,9 @@ def extract_concepts(*ingredients: List[BaseIngredient],
563601
return ProcedureResult(result, 'concept', data=concepts.reset_index())
564602

565603

604+
def trend_bridge(ingredient: BaseIngredient, result, **options) -> ProcedureResult:
    """Run trend bridge on ingredients (placeholder, not implemented yet).

    The signature is reserved for the future procedure; calling it always
    fails.

    Raises:
        NotImplementedError: always.
    """
    # TODO: implement on top of the transformer package's trend_bridge
    # (presumably reachable through the module-level `transformer` import;
    # confirm when implementing). The previous function-local import was
    # unused, so it is dropped until there is code that needs it.
    raise NotImplementedError('trend_bridge procedure is not implemented yet')

tests/recipes/test_groupby.yaml

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
info:
2+
id: test_groupby
3+
4+
ingredients:
5+
- id: datapoints-ilo
6+
dataset: ddf--ilo--kilm_employment_sector
7+
key: "country,sex,year"
8+
value: "*"
9+
10+
cooking:
11+
datapoints:
12+
- procedure: groupby
13+
ingredients:
14+
- datapoints-ilo
15+
options:
16+
groupby:
17+
- country
18+
- year
19+
aggregate:
20+
agriculture_thousands: sum
21+
agriculture_percentage: sum
22+
result: datapoints-by-country-year
23+
- procedure: groupby
24+
ingredients:
25+
- datapoints-by-country-year
26+
options:
27+
groupby: country
28+
filter:
29+
agriculture_percentage:
30+
function: gt
31+
how: all
32+
val: 100
33+
result: grouped-datapoints-1
34+
- procedure: groupby
35+
ingredients:
36+
- datapoints-by-country-year
37+
options:
38+
groupby: country
39+
transform:
40+
agriculture_thousands:
41+
function: zcore
42+
result: grouped-datapoints-2
43+
44+
serving:
45+
- grouped-datapoints-1
46+
- grouped-datapoints-2

0 commit comments

Comments
 (0)