Support large strings in interchange protocol (pandas-dev#56772)

phofl · MarcoGorelli · web-flow · commit f0cdc7c2c60c · 2024-01-09T08:41:07.000Z
* Support large strings in interchange protocol

* Update test_impl.py

* fixup buffer dtype, add todo

* add whatsnew

---------

Co-authored-by: MarcoGorelli &lt;33491632+MarcoGorelli@users.noreply.github.com&gt;
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -907,6 +907,7 @@ Sparse
 
 Other
 ^^^^^
+- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
 - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`)
 - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
 - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype:
             # This can be removed if/when pyarrow addresses it:
             # https://github.com/apache/arrow/issues/34462
             return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]")
-        if pa.types.is_string(self.pyarrow_dtype):
+        if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string(
+            self.pyarrow_dtype
+        ):
             # pa.string().to_pandas_dtype() = object which we don't want
             return np.dtype(str)
         try:
diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py
@@ -301,12 +301,9 @@ def _get_data_buffer(
             buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))
 
             # Define the dtype for the returned buffer
-            dtype = (
-                DtypeKind.STRING,
-                8,
-                ArrowCTypes.STRING,
-                Endianness.NATIVE,
-            )  # note: currently only support native endianness
+            # TODO: this will need correcting
+            # https://github.com/pandas-dev/pandas/issues/54781
+            dtype = self.dtype
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
 
diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
@@ -37,6 +37,7 @@
     "float": "f",  # float32
     "double": "g",  # float64
     "string": "u",
+    "large_string": "U",
     "binary": "z",
     "time32[s]": "tts",
     "time32[ms]": "ttm",
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
@@ -353,3 +353,12 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     interchange.get_column_by_name = lambda _: column
     monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
     pd.api.interchange.from_dataframe(df)
+
+
+def test_large_string():
+    # GH#56702
+    pytest.importorskip("pyarrow")
+    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
+    result = pd.api.interchange.from_dataframe(df.__dataframe__())
+    expected = pd.DataFrame({"a": ["x"]}, dtype="object")
+    tm.assert_frame_equal(result, expected)