Skip to content

REF/ENH: Refactor NDFrame finalization #28334

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions pandas/core/_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""
Metadata propagation through pandas operations.

This module contains the infrastructure for propagating ``NDFrame._metadata``
through operations. We perform an operation (say :meth:`pandas.Series.copy`) that
returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``)
from ``self`` to the new ``NDFrame``.

.. note::

Currently, pandas doesn't provide a clean, documented API on

* which methods call finalize
* the types passed to finalize for each method

This is a known limitation we would like to address in the future.
"""
from collections import defaultdict
from functools import wraps
from typing import TYPE_CHECKING, Any, Callable, Union

from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries

if TYPE_CHECKING:
from pandas.core.generic import NDFrame
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you want FrameOrSeries from pandas._typing


# Registry of finalizers, keyed first by ``key_of(pandas_method)`` and then by
# the metadata attribute name; the innermost values are the finalizer callables.
dispatch = defaultdict(dict)
# How a pandas method may be identified when registering or looking up a
# finalizer: the callable itself, or a string alias for it.
dispatch_method_type = Union[Callable[..., "NDFrame"], str]


def key_of(method):
    """
    Build the key under which finalizers for ``method`` are stored in ``dispatch``.

    Parameters
    ----------
    method : callable, str, or None
        The pandas method that triggered finalization, or a string alias for it.

    Returns
    -------
    str, tuple of (str, str), or None
        ``method`` itself when it is a string, ``(__module__, __name__)`` for any
        other truthy object, and None when ``method`` is falsy (e.g. None).
    """
    if isinstance(method, str):
        # TODO: figure out if this is OK. May be necessary when we have
        # things like pd.merge and DataFrame.merge that hit the same finalize.
        return method
    if method:
        return method.__module__, method.__name__
    # No method was given; make the "no key" result explicit.
    return None


class PandasMetadata:
"""
Dispatch metadata finalization for pandas metadata.

Users should instantiate a single `PandasMetadata` instance
for their piece of metadata and register finalizers for various
pandas methods using :meth:`PandasMetadata.register`.

Parameters
----------
name : str
The name of the attribute being finalized.

Examples
--------
>>> maxmeta = PandasMetadata("attr")

Register a finalizer for a given pandas method:

>>> @maxmeta.register(pd.concat)
... def _(new, concatenator):
... new.attr = max(x.attr for x in concatenator.objs)

>>> pd.DataFrame._metadata = ['attr']
>>> x = pd.DataFrame({"x"}); x.attr = 1
>>> y = pd.DataFrame({"y"}); y.attr = 2
>>> pd.concat([x, y]).attr
2
"""

def __init__(self, name: str):
self.name = name

def register(self, pandas_method: dispatch_method_type):
"""
A decorator to register a finalizer for a specific pandas method.

Parameters
----------
pandas_method : callable or str
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this looks like you can register a single finalizer? but we already have internal ones, shouldn't this just append to a list of finalizers? how is the default done if we have 1 or more finalizers?

Copy link
Contributor Author

@TomAugspurger TomAugspurger Sep 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea was to register one finalizer per pandas method. I think Brock's subclass-based approach will make this clearer.

Pandas will provide a default implementation, which the subclass can override.

how is the default done if we have 1 or more finalizers?

Previously, the __finalize__ iterated over each metadata and applied the "default finalizer" (copy from self to new).

Now we iterate over metadata attributes, look up the finalizer for that attribute, and then apply that finalizer. This gives you potentially different finalization behavior for different attributes (which we need for .name vs. .allows_duplicates).

A pandas method, like :meth:`pandas.concat`, that this finalizer
should be used for. The function being decorated will be called
with the relevant arguments (typically the output and the source NDFrame).
When `NDFrame.__finalize__` is called as a result of `pandas_method`,
the registered finalizer will be called.
"""

def decorate(func):
# TODO: warn of collisions?
dispatch[key_of(pandas_method)][self.name] = func

@wraps(func)
def wrapper(*args, **kwargs):
return func(*args, **kwargs)

return wrapper

return decorate


def default_finalizer(new: "NDFrame", other: Any, *, name: str) -> None:
    """
    The default finalizer, used when no finalizer has been registered for
    this method and attribute.

    This copies the attribute ``name`` from ``other`` to ``new``, modifying
    ``new`` inplace. If ``other`` has no attribute ``name``, ``None`` is set
    on ``new`` instead.

    Parameters
    ----------
    new : NDFrame
        The newly created NDFrame being finalized.
    other : NDFrame
        The source NDFrame attributes will be extracted from.
    name : str
        The name of the metadata attribute to copy. Keyword-only.
    """
    # object.__setattr__ bypasses any ``__setattr__`` defined on ``type(new)``,
    # so the value is stored as a plain instance attribute.
    object.__setattr__(new, name, getattr(other, name, None))


# ----------------------------------------------------------------------------
# Pandas Internals.


def ndframe_finalize(new: "NDFrame", other: Any, method: dispatch_method_type):
"""
Finalize a new NDFrame.

The finalizer is looked up from finalizers registered with PandasMetadata.
`new` is modified inplace, and nothing is returned.

Parameters
----------
new : NDFrame
other : NDFrame
Or a list of them? TBD
method : callable or str
"""
# To avoid one isinstance per _metadata name, we check up front.
# Most of the time `other` is an ndframe, but in some cases (e.g. concat)
# it's a `_Concatenator` object
other_is_ndframe = isinstance(other, (ABCSeries, ABCDataFrame))

for name in new._metadata:
finalizer = dispatch.get(key_of(method), {}).get(name)

if finalizer:
finalizer(new, other)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should not these return new?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a style choice. All these operations are inplace. My hope is that by returning None, we make it clearer that you can't return a new object.

elif other_is_ndframe:
default_finalizer(new, other, name=name)
7 changes: 3 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import pandas as pd
from pandas._typing import Dtype, FilePathOrBuffer
from pandas.core import missing, nanops
from pandas.core._meta import ndframe_finalize
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
Expand Down Expand Up @@ -5175,9 +5176,7 @@ def __finalize__(self, other, method=None, **kwargs):
types of propagation actions based on this

"""
if isinstance(other, NDFrame):
for name in self._metadata:
object.__setattr__(self, name, getattr(other, name, None))
ndframe_finalize(self, other, method)
return self

def __getattr__(self, name):
Expand Down Expand Up @@ -6016,7 +6015,7 @@ def copy(self, deep=True):
dtype: object
"""
data = self._data.copy(deep=deep)
return self._constructor(data).__finalize__(self)
return self._constructor(data).__finalize__(self, NDFrame.copy)

def __copy__(self, deep=True):
return self.copy(deep=deep)
Expand Down
49 changes: 49 additions & 0 deletions pandas/tests/generic/test_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

import pandas as pd
from pandas.core._meta import PandasMetadata

mymeta = PandasMetadata("attr")


@mymeta.register(pd.core.generic.NDFrame.copy)
def _(new, other):
    # Finalizer for NDFrame.copy: the result's ``attr`` is the source's plus one.
    bumped = other.attr + 1
    new.attr = bumped


@mymeta.register("concat")
def _(new, other):
    # Finalizer for "concat": ``other`` must be the concatenator, and the
    # result's ``attr`` is the sum of ``attr`` over the objects it holds.
    assert isinstance(other, pd.core.reshape.concat._Concatenator)
    total = sum(obj.attr for obj in other.objs)
    new.attr = total


@pytest.fixture
def custom_meta(monkeypatch):
    """
    Add "attr" to ``_metadata`` on Series and DataFrame for one test.

    Each class's ``_metadata`` is replaced, via ``monkeypatch``, with a copy
    that has "attr" appended.
    """
    for cls in [pd.Series, pd.DataFrame]:
        # Extend a copy so the class's own list object is never mutated.
        custom_metadata = cls._metadata.copy()
        custom_metadata.append("attr")

        monkeypatch.setattr(cls, "_metadata", custom_metadata)


def test_custom_finalizer(custom_meta):
    # Copying should run the finalizer registered for NDFrame.copy,
    # which sets ``attr`` to the source value plus one.
    frame = pd.DataFrame({"A": [1, 2]})
    frame.attr = 0

    copied = frame.copy()
    assert copied.attr == 1


def test_concat(custom_meta):
    # Concatenating should run the finalizer registered under "concat",
    # which sums ``attr`` over the inputs.
    left = pd.DataFrame({"A": [1, 2]})
    left.attr = 2

    right = pd.DataFrame({"A": [1, 2]})
    right.attr = 3

    combined = pd.concat([left, right])
    assert combined.attr == 5