Skip to content

Commit 7cc4d53

Browse files
committed
Fix pandas-devGH-29442 DataFrame.groupby doesn't preserve _metadata
This bug is a regression in v1.1.0 and was introduced by the fix for pandas-devGH-34214 in commit [6f065b]. The underlying cause is that the `*Splitter` classes neither use the `._constructor` property nor call `__finalize__`. Please note that the `method` value passed to the `__finalize__` calls (`'groupby'`) is my best guess, since documentation for that argument has been hard to find. [6f065b]: pandas-dev@6f065b6
1 parent 8380708 commit 7cc4d53

File tree

2 files changed

+86
-2
lines changed

2 files changed

+86
-2
lines changed

pandas/core/groupby/ops.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,8 @@ class SeriesSplitter(DataSplitter):
955955
def _chop(self, sdata: Series, slice_obj: slice) -> Series:
956956
# fastpath equivalent to `sdata.iloc[slice_obj]`
957957
mgr = sdata._mgr.get_slice(slice_obj)
958-
return type(sdata)(mgr, name=sdata.name, fastpath=True)
958+
return sdata._constructor(mgr, name=sdata.name, fastpath=True)\
959+
.__finalize__(sdata, method='groupby')
959960

960961

961962
class FrameSplitter(DataSplitter):
@@ -971,7 +972,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
971972
# else:
972973
# return sdata.iloc[:, slice_obj]
973974
mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis)
974-
return type(sdata)(mgr)
975+
return sdata._constructor(mgr).__finalize__(sdata, method='groupby')
975976

976977

977978
def get_splitter(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
Test metadata propagation in groupby
3+
4+
The PandasTable class below is implemented according to the [guidelines], and as such would
5+
expect `__finalize__` to always be called so that the `_pandastable_metadata` is always populated.
6+
7+
[guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties
8+
"""
9+
10+
import pytest
11+
import pandas as pd
12+
from warnings import warn
13+
from typing import List
14+
15+
16+
# Name of the instance attribute in which PandasTable stores its metadata.
_TABLE_METADATA_FIELD_NAME = '_pandastable_metadata'


def _combine_metadata(data: List[str]) -> str:
    """
    Join the given metadata strings with '+' (a mock implementation for testing).

    An empty list yields the empty string.
    """
    return '+'.join(data)


class PandasTable(pd.DataFrame):
    """
    A pandas dataframe subclass with associated table metadata.
    """

    _metadata = [_TABLE_METADATA_FIELD_NAME]  # Register metadata fieldnames here

    @property
    def _constructor(self):
        return PandasTable

    def __finalize__(self, other, method=None, **kwargs):
        """
        Populate the metadata of a newly created table from ``other``.

        The method argument is subject to change, so it is only used for
        reporting here.

        Returns ``self`` with the metadata set when ``other`` is a PandasTable
        that carries metadata. Otherwise a warning is issued and a plain
        ``pd.DataFrame`` built from ``self`` is returned.
        """
        src = [other]  # more logic here in actual implementation
        found = [d.get_metadata() for d in src if isinstance(d, PandasTable)]
        # A source without metadata reports None; drop it so that the join
        # below cannot fail and the fallback branch is taken instead.
        metadata = _combine_metadata([m for m in found if m is not None])

        if not metadata:
            warn(f'__finalize__ unable to combine metadata for method "{method}", falling back to DataFrame')
            return pd.DataFrame(self)
        # object.__setattr__ writes the attribute directly, without going
        # through the DataFrame attribute-assignment hook.
        object.__setattr__(self, _TABLE_METADATA_FIELD_NAME, metadata)
        return self

    def get_metadata(self):
        """
        Return the stored metadata, or None (with a warning) when none is set.
        """
        metadata = getattr(self, _TABLE_METADATA_FIELD_NAME, None)
        if metadata is None:
            warn('PandasTable object not correctly initialized: no metadata')
        return metadata

    @staticmethod
    def from_table_data(df: pd.DataFrame, metadata) -> 'PandasTable':
        """
        Wrap ``df`` in a PandasTable and attach ``metadata`` to it.
        """
        df = PandasTable(df)
        object.__setattr__(df, _TABLE_METADATA_FIELD_NAME, metadata)
        return df
64+
65+
66+
@pytest.fixture
def dft():
    """
    A three-row PandasTable with columns 'a', 'b', 'g' and metadata 'My metadata'.
    """
    # Column labels are given as a list: a set literal has no defined
    # iteration order, so the labels could land on different data columns.
    df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns=['a', 'b', 'g'])
    return PandasTable.from_table_data(df, 'My metadata')
70+
71+
72+
def test_initial_metadata(dft):
    """
    The fixture must expose the metadata it was created with.
    """
    expected = 'My metadata'
    assert dft.get_metadata() == expected
74+
75+
76+
def test_basic_propagation(dft):
    """
    Selecting rows through ``.loc`` must carry the metadata over.
    """
    mask = dft.g == 0
    selected = dft.loc[mask, :]
    assert selected.get_metadata() == 'My metadata'
79+
80+
81+
def test_groupby(dft):
    """
    Every sub-frame yielded by iterating a groupby must keep the metadata.
    """
    groups = [frame for _, frame in dft.groupby('g')]
    # Guard against a vacuous pass if the iteration yields nothing.
    assert groups
    for frame in groups:
        assert isinstance(frame, PandasTable)
        assert frame.get_metadata() == 'My metadata'

0 commit comments

Comments
 (0)