Skip to content

Commit d9419b1

Browse files
steff456 authored and rgommers committed
Add metadata attribute to DataFrame and Column
This PR adds a metadata attribute that can be used to store library-specific things. For example, libraries like Vaex will be able to store expressions for their virtual columns there.
- Add metadata attribute to the DataFrame class
- Add metadata attribute to the Column class
- Add a test
1 parent 916f1af commit d9419b1

File tree

2 files changed

+50
-3
lines changed

2 files changed

+50
-3
lines changed

protocol/dataframe_protocol.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,13 @@ def null_count(self) -> Optional[int]:
288288
"""
289289
pass
290290

291+
@property
292+
def metadata(self) -> Dict[str, Any]:
293+
"""
294+
The metadata for the column. See `DataFrame.metadata` for more details.
295+
"""
296+
pass
297+
291298
def num_chunks(self) -> int:
292299
"""
293300
Return the number of chunks the column consists of.
@@ -362,6 +369,19 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
362369
"version": 0 # Version number of the protocol
363370
}
364371

372+
@property
373+
def metadata(self) -> Dict[str, Any]:
374+
"""
375+
The metadata for the data frame, as a dictionary with string keys. The
376+
contents of `metadata` may be anything, they are meant for a library
377+
to store information that it needs to, e.g., roundtrip losslessly or
378+
for two implementations to share data that is not (yet) part of the
379+
interchange protocol specification. For avoiding collisions with other
380+
entries, please name the keys with the name of the library
381+
followed by a period and the desired name, e.g., ``pandas.indexcol``.
382+
"""
383+
pass
384+
365385
def num_columns(self) -> int:
366386
"""
367387
Return the number of columns in the DataFrame.
@@ -429,4 +449,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
429449
before yielding it.
430450
"""
431451
pass
432-

protocol/pandas_implementation.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ def convert_string_column(col : ColumnObject) -> np.ndarray:
196196
v = mbuf[i/8]
197197
if null_value == 1:
198198
v = ~v
199-
199+
200200
if v & (1<<(i%8)):
201201
str_list.append(np.nan)
202202
continue
@@ -498,6 +498,13 @@ def null_count(self) -> int:
498498
"""
499499
return self._col.isna().sum()
500500

501+
@property
502+
def metadata(self) -> Dict[str, Any]:
503+
"""
504+
Store specific metadata of the column.
505+
"""
506+
return {}
507+
501508
def num_chunks(self) -> int:
502509
"""
503510
Return the number of chunks the column consists of.
@@ -682,6 +689,12 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
682689
# dtypes is added, this value should be propagated to columns.
683690
self._nan_as_null = nan_as_null
684691

692+
@property
693+
def metadata(self):
694+
# `index` isn't a regular column, and the protocol doesn't support row
695+
# labels - so we export it as Pandas-specific metadata here.
696+
return {"pandas.index": self._df.index}
697+
685698
def num_columns(self) -> int:
686699
return len(self._df.columns)
687700

@@ -777,6 +790,21 @@ def test_string_dtype():
777790
assert col.describe_null == (4, 0)
778791
assert col.num_chunks() == 1
779792

793+
def test_metadata():
794+
df = pd.DataFrame({'A': [1, 2, 3, 4],'B': [1, 2, 3, 4]})
795+
796+
# Check the metadata from the dataframe
797+
df_metadata = df.__dataframe__().metadata
798+
expected = {"pandas.index": df.index}
799+
for key in df_metadata:
800+
assert all(df_metadata[key] == expected[key])
801+
802+
# Check the metadata from the column
803+
col_metadata = df.__dataframe__().get_column(0).metadata
804+
expected = {}
805+
for key in col_metadata:
806+
assert col_metadata[key] == expected[key]
807+
780808
df2 = from_dataframe(df)
781809
tm.assert_frame_equal(df, df2)
782810

@@ -787,4 +815,4 @@ def test_string_dtype():
787815
test_mixed_intfloat()
788816
test_noncontiguous_columns()
789817
test_string_dtype()
790-
818+
test_metadata()

0 commit comments

Comments
 (0)