Skip to content

Commit f2e9513

Browse files
mroeschke authored and noatamir committed
REF: PandasColumn.describe_categorical returns categories instead of mapping (pandas-dev#47886)
1 parent 5b39d00 commit f2e9513

File tree

4 files changed

+27
-18
lines changed

4 files changed

+27
-18
lines changed

pandas/core/interchange/column.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -146,15 +146,18 @@ def describe_categorical(self):
146146
"""
147147
If the dtype is categorical, there are two options:
148148
- There are only values in the data buffer.
149-
- There is a separate dictionary-style encoding for categorical values.
150-
Raises RuntimeError if the dtype is not categorical
149+
- There is a separate non-categorical Column encoding for categorical values.
150+
151+
Raises TypeError if the dtype is not categorical
152+
151153
Content of returned dict:
152154
- "is_ordered" : bool, whether the ordering of dictionary indices is
153155
semantically meaningful.
154156
- "is_dictionary" : bool, whether a dictionary-style mapping of
155157
categorical values to other objects exists
156-
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
157-
None if not a dictionary-style categorical.
158+
- "categories" : Column representing the (implicit) mapping of indices to
159+
category values (e.g. an array of cat1, cat2, ...).
160+
None if not a dictionary-style categorical.
158161
"""
159162
if not self.dtype[0] == DtypeKind.CATEGORICAL:
160163
raise TypeError(
@@ -164,7 +167,7 @@ def describe_categorical(self):
164167
return {
165168
"is_ordered": self._col.cat.ordered,
166169
"is_dictionary": True,
167-
"mapping": dict(enumerate(self._col.cat.categories)),
170+
"categories": PandasColumn(pd.Series(self._col.cat.categories)),
168171
}
169172

170173
@property

pandas/core/interchange/dataframe_protocol.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ class CategoricalDescription(TypedDict):
110110
is_dictionary: bool
111111
# Python-level only (e.g. ``{int: str}``).
112112
# None if not a dictionary-style categorical.
113-
mapping: dict | None
113+
categories: Column | None
114114

115115

116116
class Buffer(ABC):
@@ -274,17 +274,18 @@ def describe_categorical(self) -> CategoricalDescription:
274274
"""
275275
If the dtype is categorical, there are two options:
276276
- There are only values in the data buffer.
277-
- There is a separate dictionary-style encoding for categorical values.
277+
- There is a separate non-categorical Column encoding for categorical values.
278278
279279
Raises TypeError if the dtype is not categorical
280280
281281
Returns the dictionary with description on how to interpret the data buffer:
282282
- "is_ordered" : bool, whether the ordering of dictionary indices is
283283
semantically meaningful.
284-
- "is_dictionary" : bool, whether a dictionary-style mapping of
284+
- "is_dictionary" : bool, whether a mapping of
285285
categorical values to other objects exists
286-
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
287-
None if not a dictionary-style categorical.
286+
- "categories" : Column representing the (implicit) mapping of indices to
287+
category values (e.g. an array of cat1, cat2, ...).
288+
None if not a dictionary-style categorical.
288289
289290
TBD: are there any other in-memory representations that are needed?
290291
"""

pandas/core/interchange/from_dataframe.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88

99
import pandas as pd
10+
from pandas.core.interchange.column import PandasColumn
1011
from pandas.core.interchange.dataframe_protocol import (
1112
Buffer,
1213
Column,
@@ -179,9 +180,10 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
179180
if not categorical["is_dictionary"]:
180181
raise NotImplementedError("Non-dictionary categoricals not supported yet")
181182

182-
mapping = categorical["mapping"]
183-
assert isinstance(mapping, dict), "Categorical mapping must be a dict"
184-
categories = np.array(tuple(mapping[k] for k in sorted(mapping)))
183+
cat_column = categorical["categories"]
184+
# for mypy/pyright
185+
assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn"
186+
categories = np.array(cat_column._col)
185187
buffers = col.get_buffers()
186188

187189
codes_buff, codes_dtype = buffers["data"]

pandas/tests/interchange/test_impl.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import pandas as pd
1010
import pandas._testing as tm
11+
from pandas.core.interchange.column import PandasColumn
1112
from pandas.core.interchange.dataframe_protocol import (
1213
ColumnNullType,
1314
DtypeKind,
@@ -61,11 +62,13 @@ def test_categorical_dtype(data):
6162
assert col.null_count == 0
6263
assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
6364
assert col.num_chunks() == 1
64-
assert col.describe_categorical == {
65-
"is_ordered": data[1],
66-
"is_dictionary": True,
67-
"mapping": {0: "a", 1: "d", 2: "e", 3: "s", 4: "t"},
68-
}
65+
desc_cat = col.describe_categorical
66+
assert desc_cat["is_ordered"] == data[1]
67+
assert desc_cat["is_dictionary"] is True
68+
assert isinstance(desc_cat["categories"], PandasColumn)
69+
tm.assert_series_equal(
70+
desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
71+
)
6972

7073
tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
7174

0 commit comments

Comments
 (0)