Skip to content

Commit e1b64c1

Browse files
samukwekusamuel.oranyeli
andauthored
[ENH] Add mutate function (#1448)
* add mutate support for dict * add mutate support for tuple * changelog * add mutate to docs * add support for groupby object * add support for groupby object * update docs * update docs * update docs * add copy argument * add support for callable * add support for callable * handle coverage --------- Co-authored-by: samuel.oranyeli <[email protected]>
1 parent 3675115 commit e1b64c1

File tree

5 files changed

+457
-0
lines changed

5 files changed

+457
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## [Unreleased]
44

55
- [ENH] Added support for pd.Series.select - Issue #1394 @samukweku
6+
- [ENH] Added support for janitor.mutate - Issue #1226 @samukweku
67

78
## [v0.30.0] - 2024-12-04
89

janitor/functions/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
from .limit_column_characters import limit_column_characters
5858
from .min_max_scale import min_max_scale
5959
from .move import move
60+
from .mutate import mutate
6061
from .pivot import (
6162
pivot_longer,
6263
pivot_longer_spec,
@@ -140,6 +141,7 @@
140141
"limit_column_characters",
141142
"min_max_scale",
142143
"move",
144+
"mutate",
143145
"pivot_longer",
144146
"pivot_longer_spec",
145147
"pivot_wider",

janitor/functions/mutate.py

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
"""Implementation of mutate."""
2+
from __future__ import annotations

from collections.abc import Callable
from functools import singledispatch
from typing import Any

import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_scalar
from pandas.core.common import apply_if_callable
from pandas.core.groupby.generic import DataFrameGroupBy

from janitor.functions.select import get_index_labels
from janitor.utils import check
16+
17+
@pf.register_dataframe_method
def mutate(
    df: pd.DataFrame,
    *args: dict | tuple | Callable,
    by: Any = None,
    copy: bool = True,
) -> pd.DataFrame:
    """Create new columns, or modify existing ones, in a DataFrame.

    !!! info "New in version 0.31.0"

    !!!note

        Before reaching for `mutate`, try `pd.DataFrame.assign`.

    mutate creates new columns that are functions of existing columns.
    It can also modify columns (if the name is the same as an existing column).

    Each argument provided to *args* should be either a dictionary,
    a tuple or a callable. Arguments are applied in the order given.

    - **dictionary argument**:
    If the argument is a dictionary,
    the value in the `{key:value}` pairing
    should be either a string, a callable or a tuple.

        - If the value in the dictionary
        is a string or a callable,
        the key of the dictionary
        should be an existing column name.

        !!!note

            - If the value is a string,
            the string should be a pandas string function,
            e.g. "sum", "mean", etc.

        - If the value of the dictionary is a tuple,
        it should be of length 2, and of the form
        `(column_name, mutation_func)`,
        where `column_name` should exist in the DataFrame,
        and `mutation_func` should be either a string or a callable.
        The key in the dictionary can be a new column name.

        !!!note

            - If `mutation_func` is a string,
            the string should be a pandas string function,
            e.g. "sum", "mean", etc.

    - **tuple argument**:
    If the argument is a tuple, it should be of length 2,
    and of the form
    `(column_name, mutation_func)`,
    where `column_name` should exist in the DataFrame,
    and `mutation_func` should be either a string or a callable.

        !!!note

            - If `mutation_func` is a string,
            the string should be a pandas string function,
            e.g. "sum", "mean", etc.

        !!!note

            - `column_name` can be anything supported by the
            [`select`][janitor.functions.select.select] syntax;
            as such multiple columns can be processed here -
            they will be processed individually.

    - **callable argument**:
    If the argument is a callable, the callable is applied
    on the DataFrame or GroupBy object.
    The result from the callable should be a named pandas Series
    or a DataFrame.

    `by` can be a `DataFrameGroupBy` object; it is assumed that
    `by` was created from `df` - the onus is on the user to
    ensure that, or the aggregations may yield incorrect results.

    `by` accepts anything supported by `pd.DataFrame.groupby`.

    Arguments supported in `pd.DataFrame.groupby`
    can also be passed to `by` via a dictionary.

    Mutation does not occur on the original DataFrame;
    change this behaviour by passing `copy=False`.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> import janitor
        >>> df = pd.DataFrame({
        ...     "col1": [5, 10, 15],
        ...     "col2": [3, 6, 9],
        ...     "col3": [10, 100, 1_000],
        ... })

        Transformation via a dictionary:
        >>> df.mutate(
        ...     {"col4": ('col1',np.log10),
        ...      "col1": np.log10}
        ... )
               col1  col2  col3      col4
        0  0.698970     3    10  0.698970
        1  1.000000     6   100  1.000000
        2  1.176091     9  1000  1.176091

        Transformation via a tuple:
        >>> df.mutate(("col1", np.log10))
               col1  col2  col3
        0  0.698970     3    10
        1  1.000000     6   100
        2  1.176091     9  1000
        >>> df.mutate(("col*", np.log10))
               col1      col2  col3
        0  0.698970  0.477121   1.0
        1  1.000000  0.778151   2.0
        2  1.176091  0.954243   3.0

        Transformation via a callable:
        >>> df.mutate(lambda df: df.sum(axis=1).rename('total'))
           col1  col2  col3  total
        0     5     3    10     18
        1    10     6   100    116
        2    15     9  1000   1024

        Transformation in the presence of a groupby:
        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
        ...         'avg_run': [3, 4, 1, 3, 2, 4],
        ...         'combine_id': [100200, 100200,
        ...                        101200, 101200,
        ...                        102201, 103202]}
        >>> df = pd.DataFrame(data)
        >>> df.mutate({"avg_run_2":("avg_run","mean")}, by='combine_id')
           avg_jump  avg_run  combine_id  avg_run_2
        0         3        3      100200        3.5
        1         4        4      100200        3.5
        2         1        1      101200        2.0
        3         2        3      101200        2.0
        4         3        2      102201        2.0
        5         4        4      103202        4.0

    Args:
        df: A pandas DataFrame.
        args: Any number of dictionaries, tuples or callables,
            each describing one mutation.
        by: Column(s) to group by. Either anything accepted by
            `pd.DataFrame.groupby` (grouped with `sort=False` and
            `observed=True`), a dictionary of keyword arguments for
            `pd.DataFrame.groupby`, or a `DataFrameGroupBy` object
            created from `df`.
        copy: If `True`, the mutations are applied to a copy of `df`.
            If `False`, `df` itself is modified and returned.

    Raises:
        ValueError: If a tuple is passed and the length is not 2,
            or if a callable returns a pandas Series without a name.
        NotImplementedError: If an argument is not a dictionary,
            a tuple or a callable.
        TypeError: If the output of a callable argument is neither
            a pandas Series nor a DataFrame.

    Returns:
        A pandas DataFrame with the new and/or modified columns.
    """  # noqa: E501
    check("copy", copy, [bool])
    if isinstance(by, dict):
        # A dictionary carries keyword arguments for ``groupby``.
        by = df.groupby(**by)
    elif by is not None and not isinstance(by, DataFrameGroupBy):
        # A ready-made DataFrameGroupBy is used as-is: it is assumed to
        # have been created from ``df``; anything else is grouped here.
        if is_scalar(by):
            by = [by]
        by = df.groupby(by, sort=False, observed=True)
    if copy:
        # The grouper (if any) was built above from the original frame;
        # only the frame that receives the new columns is copied.
        df = df.copy(deep=None)
    for arg in args:
        df = _mutator(arg, df=df, by=by)
    return df
192+
193+
@singledispatch
def _mutator(arg, df, by):
    """Apply one mutation argument to ``df``; base case handles callables.

    Dictionaries and tuples are handled by the registered dispatch
    functions; any other argument lands here and must be callable.

    Args:
        arg: The mutation argument.
        df: The DataFrame that receives the new/modified columns.
        by: A groupby object, or None. When provided, the callable is
            applied on it instead of on ``df``.

    Raises:
        NotImplementedError: If ``arg`` is not callable.
        ValueError: If the callable returns a Series whose name is None.
        TypeError: If the callable returns neither a Series
            nor a DataFrame.

    Returns:
        ``df``, with the columns produced by the callable assigned to it.
    """
    if not callable(arg):
        raise NotImplementedError(
            f"janitor.mutate is not supported for {type(arg)}"
        )
    target = df if by is None else by
    outcome = _process_maybe_callable(func=arg, obj=target)
    if isinstance(outcome, pd.Series):
        # Only a missing name is an error; falsy labels such as ``0``
        # are legitimate column names and must not be rejected.
        if outcome.name is None:
            raise ValueError("Ensure the pandas Series object has a name")
        df[outcome.name] = outcome
        return df
    if isinstance(outcome, pd.DataFrame):
        for column in outcome:
            df[column] = outcome[column]
        return df
    raise TypeError(
        "The output from a callable should be a named Series or a DataFrame"
    )
217+
218+
@_mutator.register(dict)
def _(arg, df, by):
    """Dispatch function for dictionary.

    Each ``{key: value}`` pair produces the column ``key`` in ``df``.
    If ``value`` is a tuple, it is read as ``(source_column, func)``;
    otherwise ``value`` is the function and ``key`` is also the source
    column. The source column is read from ``by`` when a groupby is
    provided, else from ``df``.

    Args:
        arg: Mapping of output column name to mutation.
        df: The DataFrame that receives the new/modified columns.
        by: A groupby object, or None.

    Raises:
        ValueError: If a tuple value is not of length 2.

    Returns:
        ``df``, with one column assigned per dictionary entry.
    """
    source = df if by is None else by
    for column_name, mutator in arg.items():
        if isinstance(mutator, tuple):
            # Same explicit message as the tuple dispatcher, instead of
            # Python's generic unpacking error.
            if len(mutator) != 2:
                raise ValueError("the tuple has to be a length of 2")
            source_column, func = mutator
        else:
            source_column, func = column_name, mutator
        df[column_name] = _process_within_dict(
            mutator=func, obj=source[source_column]
        )
    return df
236+
237+
@_mutator.register(tuple)
def _(arg, df, by):
    """Dispatch function for tuple.

    The tuple is read as ``(column selection, mutation)``. The selection
    is resolved against the columns of ``df`` with ``get_index_labels``,
    and the same mutation is then applied to every resulting label by
    delegating to the dictionary dispatcher.

    Raises:
        ValueError: If the tuple is not of length 2.
    """
    if len(arg) != 2:
        raise ValueError("the tuple has to be a length of 2")
    selection, func = arg
    labels = get_index_labels(arg=[selection], df=df, axis="columns")
    # One entry per selected label, all sharing the same mutation.
    return _mutator(dict.fromkeys(labels, func), df=df, by=by)
247+
248+
def _process_maybe_callable(func: Any, obj):
    """Function to handle callables.

    ``obj.transform(func)`` is tried first. If that raises, ``func`` is
    instead handed to pandas' ``apply_if_callable`` together with ``obj``.

    Args:
        func: The mutation to apply.
        obj: The pandas object (or groupby object) to apply it on.

    Returns:
        The result of ``obj.transform(func)``, or of the fallback
        ``apply_if_callable`` call.
    """
    try:
        column = obj.transform(func)
    # Deliberately broad best-effort fallback, but restricted to
    # ``Exception`` so that KeyboardInterrupt/SystemExit still propagate
    # instead of triggering a second call of ``func``.
    except Exception:
        column = apply_if_callable(maybe_callable=func, obj=obj)
    return column
256+
257+
def _process_maybe_string(func: str, obj):
    """Apply a pandas string function to ``obj`` via ``transform``.

    Args:
        func: Name of the function, e.g. "sum" or "mean".
        obj: The pandas object (or groupby object) to transform.

    Returns:
        Whatever ``obj.transform(func)`` returns.
    """
    # The string is treated as a pandas approved string function:
    # https://pandas.pydata.org/docs/user_guide/groupby.html#built-in-aggregation-methods
    transformed = obj.transform(func)
    return transformed
263+
264+
def _process_within_dict(mutator, obj):
    """Route a dictionary mutation to the string or callable handler.

    Strings go to ``_process_maybe_string``; everything else goes to
    ``_process_maybe_callable``.
    """
    handler = (
        _process_maybe_string
        if isinstance(mutator, str)
        else _process_maybe_callable
    )
    return handler(func=mutator, obj=obj)

mkdocs/api/functions.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
- limit_column_characters
4444
- min_max_scale
4545
- move
46+
- mutate
4647
- pivot
4748
- process_text
4849
- remove_columns

0 commit comments

Comments
 (0)