Skip to content

PR: Add metadata attribute to DataFrame and Column #43

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion protocol/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,13 @@ def null_count(self) -> Optional[int]:
"""
pass

@property
def metadata(self) -> Dict[str, Any]:
    """
    Metadata attached to this column, as a dictionary with string keys.

    See `DataFrame.metadata` for what the entries are meant for and how
    keys should be named.
    """

def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
Expand Down Expand Up @@ -350,6 +357,19 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
"version": 0 # Version number of the protocol
}

@property
def metadata(self) -> Dict[str, Any]:
    """
    Metadata for the data frame, as a dictionary with string keys.

    The values stored in `metadata` may be anything. They are meant for a
    library to store information that it needs, e.g., to roundtrip
    losslessly, or for two implementations to share data that is not (yet)
    part of the interchange protocol specification.

    To avoid collisions with other entries, name each key with the name of
    the library followed by a period and the desired name, e.g.,
    ``pandas.indexcol``.
    """

def num_columns(self) -> int:
"""
Return the number of columns in the DataFrame
Expand Down Expand Up @@ -417,4 +437,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
before yielding it.
"""
pass

32 changes: 31 additions & 1 deletion protocol/pandas_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,13 @@ def null_count(self) -> int:
"""
return self._col.isna().sum()

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata of the column.

    No column-level metadata is stored by this implementation, so the
    result is always a new, empty dictionary.
    """
    column_metadata: Dict[str, Any] = {}
    return column_metadata

def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
Expand Down Expand Up @@ -495,6 +502,10 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
# dtypes is added, this value should be propagated to columns.
self._nan_as_null = nan_as_null

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata of the data frame.

    The result is a dictionary with a single entry, ``"pandas.index"``,
    whose value is the index of the wrapped pandas DataFrame. The index is
    handed out as-is; no copy is made here.
    """
    return {"pandas.index": self._df.index}

def num_columns(self) -> int:
    """
    Return the number of columns in the wrapped data frame.
    """
    column_labels = self._df.columns
    return len(column_labels)

Expand Down Expand Up @@ -578,9 +589,28 @@ def test_categorical_dtype():
tm.assert_frame_equal(df, df2)


def test_metadata():
    """
    Check the `metadata` property of the data frame and column interchange
    objects, then check that the frame still roundtrips through
    `from_dataframe`.
    """
    df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]})
    dfx = df.__dataframe__()

    # Check the metadata from the dataframe. Compare the key sets first:
    # looping over the *actual* keys alone would pass vacuously if the
    # metadata came back empty, and would raise KeyError (instead of failing
    # an assertion) on an unexpected key.
    df_metadata = dfx.metadata
    expected = {"pandas.index": df.index}
    assert set(df_metadata) == set(expected)
    for key, expected_index in expected.items():
        # Unlike all(a == b), this also fails cleanly on a length mismatch
        # and is not vacuously true for an empty index.
        tm.assert_index_equal(df_metadata[key], expected_index)

    # Check the metadata from the column: it must be exactly empty.
    col_metadata = dfx.get_column(0).metadata
    assert col_metadata == {}

    df2 = from_dataframe(df)
    tm.assert_frame_equal(df, df2)


if __name__ == '__main__':
    # When executed as a script, run the listed tests one after another,
    # in this order.
    for run_test in (
        test_categorical_dtype,
        test_float_only,
        test_mixed_intfloat,
        test_noncontiguous_columns,
        test_metadata,
    ):
        run_test()