Skip to content

PR: Add metadata attribute to DataFrame and Column #43

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion protocol/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,13 @@ def null_count(self) -> Optional[int]:
"""
pass

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata specific to the column.

    The metadata is a dictionary with string keys. This is an interface
    declaration only: it carries no implementation here.
    """
    pass

def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
Expand Down Expand Up @@ -350,6 +357,13 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
"version": 0 # Version number of the protocol
}

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata specific to the DataFrame.

    The metadata is a dictionary with string keys. This is an interface
    declaration only: it carries no implementation here.
    """
    pass

def num_columns(self) -> int:
"""
Return the number of columns in the DataFrame
Expand Down Expand Up @@ -417,4 +431,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
before yielding it.
"""
pass

33 changes: 32 additions & 1 deletion protocol/pandas_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,13 @@ def null_count(self) -> int:
"""
return self._col.isna().sum()

@property
def metadata(self) -> Dict[str, Any]:
    """
    Metadata describing this column.

    The dictionary holds a single entry, ``"num_chunks"``, whose value is
    whatever ``self.num_chunks()`` returns at the time of access.
    """
    chunk_count = self.num_chunks()
    return dict(num_chunks=chunk_count)

def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
Expand Down Expand Up @@ -495,6 +502,11 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
# dtypes is added, this value should be propagated to columns.
self._nan_as_null = nan_as_null

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata specific to the DataFrame.

    The dictionary has two entries:

    - ``"num_chunks"``: the value returned by ``self.num_chunks()``
    - ``"num_columns"``: the value returned by ``self.num_columns()``

    Both values are computed on every access.
    """
    return {
        "num_chunks": self.num_chunks(),
        "num_columns": self.num_columns(),
    }

def num_columns(self) -> int:
    """
    Return the number of columns, i.e. the length of ``self._df.columns``.
    """
    column_labels = self._df.columns
    return len(column_labels)

Expand Down Expand Up @@ -578,9 +590,28 @@ def test_categorical_dtype():
tm.assert_frame_equal(df, df2)


def test_metadata():
    """
    Check the ``metadata`` property on the DataFrame and Column objects
    obtained through ``__dataframe__()``, then round-trip the frame.

    Each metadata dict is compared as a whole against its own expected
    dict, so a missing or extra key fails the test instead of being
    skipped by a key-by-key loop over whatever happened to be returned.
    """
    df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]))

    # Check the metadata from the dataframe
    df_metadata = df.__dataframe__().metadata
    expected_df_metadata = {"num_chunks": 1, "num_columns": 3}
    assert df_metadata == expected_df_metadata

    # Check the metadata from the column. This uses a dedicated expected
    # dict: comparing against the DataFrame's expected values would only
    # pass by coincidence of the shared "num_chunks" entry.
    col_metadata = df.__dataframe__().get_column(0).metadata
    expected_col_metadata = {"num_chunks": 1}
    assert col_metadata == expected_col_metadata

    # Converting through the protocol must still reproduce the frame.
    df2 = from_dataframe(df)
    tm.assert_frame_equal(df, df2)


if __name__ == '__main__':
    # Run every test in this module, in the same order as before, when the
    # file is executed directly as a script.
    for run_test in (
        test_categorical_dtype,
        test_float_only,
        test_mixed_intfloat,
        test_noncontiguous_columns,
        test_metadata,
    ):
        run_test()