|
5 | 5 |
|
6 | 6 | from collections import defaultdict
|
7 | 7 | from functools import partial
|
8 |
| -from typing import Any, Callable, DefaultDict, List, Sequence, Tuple, Union |
| 8 | +from typing import ( |
| 9 | + Any, |
| 10 | + Callable, |
| 11 | + DefaultDict, |
| 12 | + Dict, |
| 13 | + List, |
| 14 | + Optional, |
| 15 | + Sequence, |
| 16 | + Tuple, |
| 17 | + Union, |
| 18 | +) |
| 19 | + |
| 20 | +from pandas._typing import Label |
9 | 21 |
|
10 | 22 | from pandas.core.dtypes.common import is_dict_like, is_list_like
|
11 | 23 |
|
| 24 | +from pandas.core.base import SpecificationError |
12 | 25 | import pandas.core.common as com
|
13 | 26 | from pandas.core.indexes.api import Index
|
| 27 | +from pandas.core.series import FrameOrSeriesUnion, Series |
| 28 | + |
| 29 | +# types of `func` kwarg for DataFrame.aggregate and Series.aggregate |
| 30 | +AggFuncTypeBase = Union[Callable, str] |
| 31 | +AggFuncType = Union[ |
| 32 | + AggFuncTypeBase, |
| 33 | + List[AggFuncTypeBase], |
| 34 | + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], |
| 35 | +] |
| 36 | + |
| 37 | + |
def reconstruct_func(
    func: Optional[AggFuncType], **kwargs
) -> Tuple[
    bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]],
]:
    """
    Reconstruct ``func`` depending on whether relabeling is applied, and
    normalize the keywords to get the new order of columns.

    If named aggregation is applied, `func` will be None, and kwargs contains the
    column and aggregation function information to be parsed;
    If named aggregation is not applied, `func` is either string (e.g. 'min') or
    Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name
    and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]})

    If relabeling is True, will return relabeling, reconstructed func, column
    names, and the reconstructed order of columns.
    If relabeling is False, the columns and order will be None.

    Parameters
    ----------
    func : agg function (e.g. 'min' or Callable) or list of agg functions
        (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
    **kwargs : dict
        Keyword arguments passed to is_multi_agg_with_relabel and
        normalize_keyword_aggregation for relabelling.

    Returns
    -------
    relabelling : bool
        Whether relabelling is applied.
    func : normalized and mangled func
    columns : list of column names, or None when there is no relabelling
    order : list of column indices, or None when there is no relabelling

    Raises
    ------
    SpecificationError
        If `func` is a list containing duplicated entries and no new column
        names are assigned.
    TypeError
        If `func` is None and the keyword arguments do not describe a
        relabelling aggregation.

    Notes
    -----
    NOTE(review): the first example below shows `columns` as a tuple and `order`
    as an array, although both are annotated as lists; confirm against
    normalize_keyword_aggregation.

    Examples
    --------
    >>> reconstruct_func(None, **{"foo": ("col", "min")})
    (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0]))

    >>> reconstruct_func("min")
    (False, 'min', None, None)
    """
    relabeling = func is None and is_multi_agg_with_relabel(**kwargs)
    columns: Optional[List[str]] = None
    order: Optional[List[int]] = None

    if relabeling:
        func, columns, order = normalize_keyword_aggregation(kwargs)
    else:
        if isinstance(func, list) and len(func) > len(set(func)):
            # GH 28426 will raise error if duplicated function names are used and
            # there is no reassigned name
            raise SpecificationError(
                "Function names must be unique if there is no new column names "
                "assigned"
            )
        elif func is None:
            # nicer error message
            raise TypeError("Must provide 'func' or tuples of '(column, aggfunc)'.")

    # Mangling is applied on both the relabelling and the plain path.
    func = maybe_mangle_lambdas(func)

    return relabeling, func, columns, order
14 | 101 |
|
15 | 102 |
|
16 | 103 | def is_multi_agg_with_relabel(**kwargs) -> bool:
|
@@ -198,6 +285,79 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any:
|
198 | 285 | return mangled_aggspec
|
199 | 286 |
|
200 | 287 |
|
def relabel_result(
    result: FrameOrSeriesUnion,
    func: Dict[str, List[Union[Callable, str]]],
    columns: Tuple,
    order: List[int],
) -> Dict[Label, Series]:
    """
    Reorder the aggregation result when relabelling is used in ``DataFrame.agg``.

    Parameters
    ----------
    result : Result from aggregation
    func : Dict of (column name, funcs)
    columns : New columns name for relabelling
    order : New order for relabelling

    Returns
    -------
    dict
        Maps each key of `func` to a Series that is labelled by the new names
        and reindexed to `columns`.

    Examples
    --------
    >>> result = DataFrame(
    ...     {"A": [2, np.nan, np.nan], "C": [6, np.nan, np.nan],
    ...      "B": [np.nan, 4, 2.5]},
    ...     index=["max", "mean", "min"],
    ... )  # doctest: +SKIP
    >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
    >>> columns = ("foo", "aab", "bar", "dat")
    >>> order = [0, 1, 2, 3]
    >>> relabel_result(result, funcs, columns, order)  # doctest: +SKIP
    dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         B=Series([NaN, NaN, 4.0, 2.5], index=["foo", "aab", "bar", "dat"]))
    """
    # New names sorted by their requested position.
    reordered_indexes = [
        pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1])
    ]
    reordered_result_in_dict: Dict[Label, Series] = {}
    # Offset into `reordered_indexes` of the first name of the current column.
    idx = 0

    reorder_mask = not isinstance(result, Series) and len(result.columns) > 1
    for col, fun in func.items():
        s = result[col].dropna()

        # In the `_aggregate`, the callable names are obtained and used in `result`,
        # and these names are ordered alphabetically. e.g.
        #           C2   C1
        # <lambda>   1  NaN
        # amax     NaN  4.0
        # max      NaN  4.0
        # sum     18.0  6.0
        # Therefore, the order of functions for each column could be shuffled
        # accordingly, so we need to get the callable name if it is not already a
        # string, and reorder the aggregated result for each column.
        # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order
        # is [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need
        # to reorder so that aggregated values map to their functions in order.

        # However, if only one column is used for aggregation, there is no need to
        # reorder since the index is not sorted and is kept as in `func`, e.g.
        #         A
        # min   1.0
        # mean  1.5
        # mean  1.5
        if reorder_mask:
            fun = [
                com.get_callable_name(f) if not isinstance(f, str) else f for f in fun
            ]
            col_idx_order = Index(s.index).get_indexer(fun)
            # `col_idx_order` holds integer positions, so select positionally;
            # plain `[]` on a label-indexed Series is ambiguous for integer keys.
            # NOTE(review): get_indexer yields -1 for a name missing from
            # `s.index`, which would select the last row - presumably every
            # name is present; confirm.
            s = s.iloc[col_idx_order]

        # assign the new user-provided "named aggregation" as index names, and
        # reindex it based on the whole user-provided names.
        s.index = reordered_indexes[idx : idx + len(fun)]
        reordered_result_in_dict[col] = s.reindex(columns, copy=False)
        idx = idx + len(fun)
    return reordered_result_in_dict
| 359 | + |
| 360 | + |
201 | 361 | def validate_func_kwargs(
|
202 | 362 | kwargs: dict,
|
203 | 363 | ) -> Tuple[List[str], List[Union[str, Callable[..., Any]]]]:
|
|
0 commit comments