Skip to content

Commit f0cdc7c

Browse files
phofl and MarcoGorelli authored
Support large strings in interchange protocol (pandas-dev#56772)
* Support large strings in interchange protocol
* Update test_impl.py
* fixup buffer dtype, add todo
* add whatsnew

---------

Co-authored-by: MarcoGorelli <[email protected]>
1 parent b7e2202 commit f0cdc7c

File tree

5 files changed

+17
-7
lines changed

5 files changed

+17
-7
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,7 @@ Sparse
907907

908908
Other
909909
^^^^^
910+
- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
910911
- Bug in :func:`DataFrame.describe` where the percentile 99.999% was rounded to 100% when formatting percentiles in the result (:issue:`55765`)
911912
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
912913
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)

pandas/core/dtypes/dtypes.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype:
21902190
# This can be removed if/when pyarrow addresses it:
21912191
# https://github.com/apache/arrow/issues/34462
21922192
return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]")
2193-
if pa.types.is_string(self.pyarrow_dtype):
2193+
if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string(
2194+
self.pyarrow_dtype
2195+
):
21942196
# pa.string().to_pandas_dtype() = object which we don't want
21952197
return np.dtype(str)
21962198
try:

pandas/core/interchange/column.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -301,12 +301,9 @@ def _get_data_buffer(
301301
buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))
302302

303303
# Define the dtype for the returned buffer
304-
dtype = (
305-
DtypeKind.STRING,
306-
8,
307-
ArrowCTypes.STRING,
308-
Endianness.NATIVE,
309-
) # note: currently only support native endianness
304+
# TODO: this will need correcting
305+
# https://github.com/pandas-dev/pandas/issues/54781
306+
dtype = self.dtype
310307
else:
311308
raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
312309

pandas/core/interchange/utils.py

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"float": "f", # float32
3838
"double": "g", # float64
3939
"string": "u",
40+
"large_string": "U",
4041
"binary": "z",
4142
"time32[s]": "tts",
4243
"time32[ms]": "ttm",

pandas/tests/interchange/test_impl.py

+9
Original file line numberDiff line numberDiff line change
@@ -353,3 +353,12 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
353353
interchange.get_column_by_name = lambda _: column
354354
monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
355355
pd.api.interchange.from_dataframe(df)
356+
357+
358+
def test_large_string():
359+
# GH#56702
360+
pytest.importorskip("pyarrow")
361+
df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
362+
result = pd.api.interchange.from_dataframe(df.__dataframe__())
363+
expected = pd.DataFrame({"a": ["x"]}, dtype="object")
364+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)