From a0d3b44cf3ff562f72c93480e1ed66069bddea3b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Apr 2023 18:12:11 +0100 Subject: [PATCH 1/8] round-trip categorical pyarrow --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/interchange/from_dataframe.py | 3 +-- pandas/tests/interchange/test_impl.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index c64f7a46d3058..4ab1f1af2794d 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -26,6 +26,7 @@ Bug fixes ~~~~~~~~~ - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 065387e1acefd..9e12d5feed3ae 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import ( Buffer, Column, @@ -182,7 +181,7 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: cat_column =
categorical["categories"] # for mypy/pyright - assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn" + assert hasattr(cat_column, "_col"), "categories must have a `.col` attribute" categories = np.array(cat_column._col) buffers = col.get_buffers() diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 078a17510d502..d0abf04cfd4d8 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -2,6 +2,7 @@ import random import numpy as np +import pyarrow as pa import pytest from pandas._libs.tslibs import iNaT @@ -74,6 +75,19 @@ def test_categorical_dtype(data): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) +def test_categorical_pyarrow(): + # GH 49889 + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] + table = pa.table({"weekday": pa.array(arr).dictionary_encode()}) + exchange_df = table.__dataframe__() + result = from_dataframe(exchange_df) + weekday = pd.Categorical( + arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + ) + expected = pd.DataFrame({"weekday": weekday}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "data", [int_data, uint_data, float_data, bool_data, datetime_data] ) From 0e176b52343a39cab1120d51798af47128d7a682 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Apr 2023 18:36:22 +0100 Subject: [PATCH 2/8] fixup for jobs where pyarrow is not present --- pandas/tests/interchange/test_impl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d0abf04cfd4d8..5f7a401061bf8 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -2,7 +2,6 @@ import random import numpy as np -import pyarrow as pa import pytest from pandas._libs.tslibs import iNaT @@ -75,8 +74,11 @@ def test_categorical_dtype(data): 
tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) +@td.skip_if_no("pyarrow") def test_categorical_pyarrow(): # GH 49889 + import pyarrow as pa + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] table = pa.table({"weekday": pa.array(arr).dictionary_encode()}) exchange_df = table.__dataframe__() From e709c4c655de498a1a856429a13ab571a8b1eb47 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Apr 2023 22:43:56 +0100 Subject: [PATCH 3/8] use importorskip --- pandas/tests/interchange/test_impl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 5f7a401061bf8..518d0ee03c05d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -74,10 +74,9 @@ def test_categorical_dtype(data): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -@td.skip_if_no("pyarrow") def test_categorical_pyarrow(): # GH 49889 - import pyarrow as pa + pa = pytest.importorskip("pyarrow") arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] table = pa.table({"weekday": pa.array(arr).dictionary_encode()}) From fbef6c6a01b31e5e075bf7fec1ad1bcc8a7a7784 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Apr 2023 08:29:15 +0100 Subject: [PATCH 4/8] set minimum pyarrow version --- pandas/tests/interchange/test_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 518d0ee03c05d..abdfb4e79cb20 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -76,7 +76,7 @@ def test_categorical_dtype(data): def test_categorical_pyarrow(): # GH 49889 - pa = pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow", "11.0.0") arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] table = pa.table({"weekday": 
pa.array(arr).dictionary_encode()}) From 936e6cd918f1a93222463df9f4ae231e2c5a7bd8 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Apr 2023 10:00:36 +0100 Subject: [PATCH 5/8] just type: ignore --- pandas/core/interchange/from_dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 9e12d5feed3ae..b9ebd1ebf6f54 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -180,9 +180,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: raise NotImplementedError("Non-dictionary categoricals not supported yet") cat_column = categorical["categories"] - # for mypy/pyright - assert hasattr(cat_column, "_col"), "categories must have a `.col` attribute" - categories = np.array(cat_column._col) + # Item "Column" of "Optional[Column]" has no attribute "_col" + # Item "None" of "Optional[Column]" has no attribute "_col" + categories = np.array(cat_column._col) # type: ignore[union-attr] buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] From 2cc531594d4c3655bfb2894e40dba8c70e1025a9 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Apr 2023 10:30:47 +0100 Subject: [PATCH 6/8] raise notimplementederror --- pandas/core/interchange/from_dataframe.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index b9ebd1ebf6f54..cc6b80d23a80b 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -182,7 +182,13 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: cat_column = categorical["categories"] # Item "Column" of "Optional[Column]" has no attribute "_col" # Item "None" of "Optional[Column]" has no attribute "_col" - categories = np.array(cat_column._col) # type: 
ignore[union-attr] + if hasattr(cat_column, "_col"): + categories = np.array(cat_column._col) # type: ignore[union-attr] + else: + raise NotImplementedError( + "`from_dataframe` when `cat_column` does not have `_col` " + "attribute is not yet implemented" + ) buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] From 66637141b0fe500f421c6fceae7e90dc40ff68a2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Apr 2023 10:31:31 +0100 Subject: [PATCH 7/8] raise notimplementederror --- pandas/core/interchange/from_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index cc6b80d23a80b..5250da5e11f11 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -180,9 +180,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: raise NotImplementedError("Non-dictionary categoricals not supported yet") cat_column = categorical["categories"] - # Item "Column" of "Optional[Column]" has no attribute "_col" - # Item "None" of "Optional[Column]" has no attribute "_col" if hasattr(cat_column, "_col"): + # Item "Column" of "Optional[Column]" has no attribute "_col" + # Item "None" of "Optional[Column]" has no attribute "_col" categories = np.array(cat_column._col) # type: ignore[union-attr] else: raise NotImplementedError( From 1d27e7dfd29fc31edb73239ed8727d006f4c9c47 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 19 Apr 2023 11:00:46 +0100 Subject: [PATCH 8/8] Update pandas/core/interchange/from_dataframe.py Co-authored-by: Matthew Barber --- pandas/core/interchange/from_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 5250da5e11f11..2bbb678516968 100644 --- 
a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -186,8 +186,8 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: categories = np.array(cat_column._col) # type: ignore[union-attr] else: raise NotImplementedError( - "`from_dataframe` when `cat_column` does not have `_col` " - "attribute is not yet implemented" + "Interchanging categorical columns isn't supported yet, and our " + "fallback of using the `col._col` attribute (a ndarray) failed." ) buffers = col.get_buffers()