From b163dd2c51957368f2765e17549b4e3336350c38 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 8 Jan 2024 20:24:09 +0000 Subject: [PATCH 1/7] Handle non-string object dtypes in DataFrame interchange protocol --- pandas/core/interchange/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index acfbc5d9e6c62..41dcdb6a70ea3 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -116,7 +116,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string": + if infer_dtype(self._col) == "string" or infer_dtype(self._col) == "empty": return ( DtypeKind.STRING, 8, From 75b422ef49238b8cd216e021965c1965685d27df Mon Sep 17 00:00:00 2001 From: root Date: Mon, 8 Jan 2024 21:21:56 +0000 Subject: [PATCH 2/7] Add test --- pandas/tests/interchange/test_impl.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 6d8cc501ade6c..7f08c3bb5d582 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -353,3 +353,10 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_empty_string_column(): + df = pd.Series([], name="a", dtype=str).to_frame() + df2 = df.__dataframe__() + result = pd.api.interchange.from_dataframe(df2) + tm.assert_frame_equal(df, result) From de04996f50a0a026727ab68b7a067d454b26126b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 8 Jan 2024 22:06:51 +0000 Subject: [PATCH 3/7] Add 'whats new' --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b138e91b41661..9f1dbab265c4d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -907,6 +907,7 @@ Other - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) +- Bug in :func:`pd.api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) @@ -915,7 +916,6 @@ Other - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: From b55a183eefe00c5d1368f5d0ee83dec12c4e7e7e Mon Sep 17 00:00:00 2001 From: yashb <74137864+roadrollerdafjorst@users.noreply.github.com> Date: Tue, 9 Jan 2024 22:22:01 +0530 Subject: [PATCH 4/7] Update pandas/core/interchange/column.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/interchange/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 41dcdb6a70ea3..4f2fd7578738f 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -116,7 +116,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string" or infer_dtype(self._col) == "empty": + if infer_dtype(self._col) in ("string", "empty"): return ( DtypeKind.STRING, 8, From 42aa771158d1c231fea1b20d8f22fe295a4f25c3 Mon Sep 17 00:00:00 2001 From: yashb <74137864+roadrollerdafjorst@users.noreply.github.com> Date: Tue, 9 Jan 2024 22:22:18 +0530 Subject: [PATCH 5/7] Update pandas/tests/interchange/test_impl.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/interchange/test_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 7f08c3bb5d582..c45cf4aa5317a 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -356,7 +356,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: def test_empty_string_column(): - df = pd.Series([], name="a", dtype=str).to_frame() + df = pd.DataFrame({"a": []}, dtype=str) df2 = df.__dataframe__() result = pd.api.interchange.from_dataframe(df2) tm.assert_frame_equal(df, result) From 858b95b1555db17205b3f44b9a214befec13cd64 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 9 Jan 2024 17:59:05 +0000 Subject: [PATCH 6/7] resolve checks --- pandas/core/frame.py | 6 ++++-- pandas/tests/interchange/test_impl.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 021c7b74adb7f..38445198c8fc6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6703,7 +6703,8 @@ def drop_duplicates( ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by - default use all of the columns. + default use all of the columns. Columns with mutable/unhashable dtype + like lists are not supported. keep : {'first', 'last', ``False``}, default 'first' Determines which duplicates (if any) to keep. @@ -6796,7 +6797,8 @@ def duplicated( ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by - default use all of the columns. + default use all of the columns. Columns with mutable/unhashable dtype + like lists are not supported. keep : {'first', 'last', False}, default 'first' Determines which duplicates (if any) to mark. diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 078d5efb8030e..2bc488fbb1dd1 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -362,7 +362,7 @@ def test_empty_string_column(): result = pd.api.interchange.from_dataframe(df2) tm.assert_frame_equal(df, result) - + def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") From b45f640e411a91be75a6a1ca544aa5e4c453d745 Mon Sep 17 00:00:00 2001 From: yashb <74137864+roadrollerdafjorst@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:41:58 +0530 Subject: [PATCH 7/7] Update not needed --- pandas/core/frame.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 38445198c8fc6..021c7b74adb7f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6703,8 +6703,7 @@ def drop_duplicates( ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by - default use all of the columns. Columns with mutable/unhashable dtype - like lists are not supported. + default use all of the columns. keep : {'first', 'last', ``False``}, default 'first' Determines which duplicates (if any) to keep. @@ -6797,8 +6796,7 @@ def duplicated( ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by - default use all of the columns. Columns with mutable/unhashable dtype - like lists are not supported. + default use all of the columns. keep : {'first', 'last', False}, default 'first' Determines which duplicates (if any) to mark.