Skip to content

Commit 8370e39

Browse files
committed
REF/ENH: Refactor NDFrame finalization
In preparation for pandas-dev#27108 (disallowing duplicates), we need to enhance our metadata propagation. *We need a way for a particular attribute to determine how it's propagated for a particular method*. Our current method of metadata propagation lacked two features: 1. It only copies an attribute from a source NDFrame to a new NDFrame. There is no way to propagate metadata from a collection of NDFrames (say from `pd.concat`) to a new NDFrame. 2. It only and always copies the attribute. This is not always appropriate when dealing with a collection of input NDFrames, as the source attributes may differ. The resolution of conflicts will differ by attribute (for `Series.name` we might throw away the name; for `Series.allow_duplicates`, any Series disallowing duplicates should mean the output disallows duplicates).
1 parent 0bde7ce commit 8370e39

File tree

3 files changed

+198
-4
lines changed

3 files changed

+198
-4
lines changed

pandas/core/_meta.py

+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""
2+
Metadata propagation through pandas operations.
3+
4+
This module contains the infrastructure for propagating ``NDFrame._metadata``
5+
through operations. We perform an operation (say :meth:`pandas.Series.copy`) that
6+
returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``)
7+
from ``self`` to the new ``NDFrame``.
8+
9+
.. note::
10+
11+
Currently, pandas doesn't provide a clean, documented API on
12+
13+
* which methods call finalize
14+
* the types passed to finalize for each method
15+
16+
This is a known limitation we would like to address in the future.
17+
"""
18+
from collections import defaultdict
19+
from functools import wraps
20+
from typing import TYPE_CHECKING, Any, Callable, Union
21+
22+
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
23+
24+
if TYPE_CHECKING:
25+
from pandas.core.generic import NDFrame
26+
27+
# Registry of finalizers: ``dispatch[key_of(method)][metadata_name]`` holds the
# function registered for that (pandas method, metadata attribute) pair.
dispatch = defaultdict(dict)
28+
# How a pandas method may be identified when registering or dispatching:
# either the callable itself or a plain string name.
dispatch_method_type = Union[Callable[..., "NDFrame"], str]
29+
30+
31+
def key_of(method):
    """
    Compute the registry key used to look up finalizers for ``method``.

    Parameters
    ----------
    method : callable, str, or None
        The pandas method being finalized, or a string standing in for it.

    Returns
    -------
    str, tuple of (str, str), or None
        ``method`` itself when it is a string, ``(__module__, __name__)``
        for any other truthy object, and ``None`` for a falsy non-string.
    """
    if isinstance(method, str):
        # TODO: figure out if this is OK. May be necessary when we have
        # things like pd.merge and DataFrame.merge that hit the same finalize.
        return method
    if not method:
        return None
    return method.__module__, method.__name__
38+
39+
40+
class PandasMetadata:
    """
    Dispatch metadata finalization for pandas metadata.

    Users should instantiate a single `PandasMetadata` instance
    for their piece of metadata and register finalizers for various
    pandas methods using :meth:`PandasMetadata.register`.

    Parameters
    ----------
    name : str
        The name of the attribute being finalized.

    Notes
    -----
    Finalizers are looked up by ``key_of(method)``, so the object passed to
    :meth:`register` must produce the same key as the ``method`` that pandas
    hands to ``NDFrame.__finalize__``. A callable and a string naming it do
    *not* share a key.

    Examples
    --------
    >>> maxmeta = PandasMetadata("attr")

    Register a finalizer for a given pandas method:

    >>> @maxmeta.register("concat")
    ... def _(new, concatenator):
    ...     new.attr = max(x.attr for x in concatenator.objs)

    >>> pd.DataFrame._metadata = ['attr']
    >>> x = pd.DataFrame({"x": [1]}); x.attr = 1
    >>> y = pd.DataFrame({"y": [2]}); y.attr = 2
    >>> pd.concat([x, y]).attr
    2
    """

    def __init__(self, name: str):
        self.name = name

    def register(self, pandas_method: dispatch_method_type):
        """
        A decorator to register a finalizer for a specific pandas method.

        Parameters
        ----------
        pandas_method : callable or str
            A pandas method, like :meth:`pandas.concat`, that this finalizer
            should be used for. The function being decorated will be called
            with the relevant arguments (typically the output and the source
            NDFrame). When `NDFrame.__finalize__` is called as a result of
            `pandas_method`, the registered finalizer will be called.

        Returns
        -------
        callable
            A decorator. It stores the decorated function in the module-level
            ``dispatch`` registry under ``key_of(pandas_method)`` and this
            instance's ``name``, replacing any previous entry, and returns a
            thin wrapper around the function.
        """

        def decorate(func):
            # TODO: warn of collisions?
            # The undecorated function is what gets stored and dispatched to.
            dispatch[key_of(pandas_method)][self.name] = func

            @wraps(func)
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)

            return wrapper

        return decorate
98+
99+
100+
def default_finalizer(new: "NDFrame", other: Any, *, name: str):
    """
    Copy one metadata attribute from ``other`` onto ``new``.

    This is the fallback used when no finalizer has been registered for the
    method / attribute combination. ``new`` is modified inplace and nothing
    is returned.

    Parameters
    ----------
    new : NDFrame
        The newly created NDFrame being finalized.
    other : NDFrame
        The source NDFrame the attribute is read from. If it has no
        attribute called ``name``, ``None`` is stored on ``new``.
    name : str
        The name of the metadata attribute to copy.
    """
    value = getattr(other, name, None)
    # Set via ``object.__setattr__`` so any ``__setattr__`` override on the
    # type of ``new`` is bypassed.
    object.__setattr__(new, name, value)
115+
116+
117+
# ----------------------------------------------------------------------------
118+
# Pandas Internals.
119+
120+
121+
def ndframe_finalize(new: "NDFrame", other: Any, method: dispatch_method_type):
    """
    Finalize a new NDFrame.

    The finalizer is looked up from finalizers registered with PandasMetadata.
    `new` is modified inplace, and nothing is returned.

    For every name in ``new._metadata``, the finalizer registered for
    ``(method, name)`` is called as ``finalizer(new, other)``. When none is
    registered and ``other`` is a Series or DataFrame, the attribute is
    copied with :func:`default_finalizer`; otherwise that name is skipped.

    Parameters
    ----------
    new : NDFrame
        The object being finalized.
    other : NDFrame or object
        Most of the time the source NDFrame, but in some cases (e.g. concat)
        another object such as a ``_Concatenator``.
    method : callable or str
        Identifies the pandas operation that produced ``new``; converted to a
        registry key with ``key_of``.
    """
    names = new._metadata
    if not names:
        # Nothing to propagate, so skip the registry lookup entirely.
        return

    # The registered finalizers depend only on `method`, so resolve them once
    # instead of once per metadata name. Using `.get` (not indexing) avoids
    # inserting an empty entry into the registry for unregistered methods.
    finalizers = dispatch.get(key_of(method), {})

    # To avoid one isinstance per _metadata name, we check up front.
    other_is_ndframe = isinstance(other, (ABCSeries, ABCDataFrame))

    for name in names:
        finalizer = finalizers.get(name)

        if finalizer is not None:
            finalizer(new, other)
        elif other_is_ndframe:
            default_finalizer(new, other, name=name)

pandas/core/generic.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import pandas as pd
6363
from pandas._typing import Dtype, FilePathOrBuffer
6464
from pandas.core import missing, nanops
65+
from pandas.core._meta import ndframe_finalize
6566
import pandas.core.algorithms as algos
6667
from pandas.core.base import PandasObject, SelectionMixin
6768
import pandas.core.common as com
@@ -5175,9 +5176,7 @@ def __finalize__(self, other, method=None, **kwargs):
51755176
types of propagation actions based on this
51765177
51775178
"""
5178-
if isinstance(other, NDFrame):
5179-
for name in self._metadata:
5180-
object.__setattr__(self, name, getattr(other, name, None))
5179+
ndframe_finalize(self, other, method)
51815180
return self
51825181

51835182
def __getattr__(self, name):
@@ -6016,7 +6015,7 @@ def copy(self, deep=True):
60166015
dtype: object
60176016
"""
60186017
data = self._data.copy(deep=deep)
6019-
return self._constructor(data).__finalize__(self)
6018+
return self._constructor(data).__finalize__(self, NDFrame.copy)
60206019

60216020
def __copy__(self, deep=True):
60226021
return self.copy(deep=deep)

pandas/tests/generic/test_metadata.py

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pytest
2+
3+
import pandas as pd
4+
from pandas.core._meta import PandasMetadata
5+
6+
# Dispatcher for the custom ``attr`` metadata attribute exercised in this module.
mymeta = PandasMetadata("attr")
7+
8+
9+
@mymeta.register(pd.core.generic.NDFrame.copy)
def _(new, other):
    """Finalizer for ``NDFrame.copy``: the copy gets the source's ``attr`` plus one."""
    bumped = other.attr + 1
    new.attr = bumped
12+
13+
14+
@mymeta.register("concat")
def _(new, other):
    """Finalizer for ``"concat"``: the result's ``attr`` is the sum over the inputs."""
    # For concat, `other` is the concatenator object rather than an NDFrame.
    assert isinstance(other, pd.core.reshape.concat._Concatenator)
    total = sum(obj.attr for obj in other.objs)
    new.attr = total
18+
19+
20+
@pytest.fixture
def custom_meta(monkeypatch):
    """
    Temporarily add ``"attr"`` to ``_metadata`` on Series and DataFrame.

    The patch is applied through ``monkeypatch``, which undoes it when the
    test finishes, so no manual bookkeeping of the original values is needed.
    """
    for cls in (pd.Series, pd.DataFrame):
        # Install a new list so the class-level ``_metadata`` list is never
        # mutated in place.
        monkeypatch.setattr(cls, "_metadata", [*cls._metadata, "attr"])
30+
31+
32+
def test_custom_finalizer(custom_meta):
    """``copy`` should run the finalizer registered for it, which adds one to ``attr``."""
    source = pd.DataFrame({"A": [1, 2]})
    source.attr = 0

    copied = source.copy()

    assert copied.attr == 1
39+
40+
41+
def test_concat(custom_meta):
    """``pd.concat`` should run the ``"concat"`` finalizer, which sums ``attr``."""
    left = pd.DataFrame({"A": [1, 2]})
    right = pd.DataFrame({"A": [1, 2]})
    left.attr = 2
    right.attr = 3

    combined = pd.concat([left, right])

    assert combined.attr == 5

0 commit comments

Comments
 (0)