From 09a91a384a013a61732a5c39d5818524faa20df4 Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:28:13 +0100 Subject: [PATCH 1/9] BUG: Fix bug with DataFrame.pivot and .set_index not compatible with pyarrow dictionary categoricals Relates to #53051 Code for fix taken and adapted from #59099 --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e5c5716165e2f..df1aa21e9203c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -452,7 +452,7 @@ def __init__( if isinstance(values, Index): arr = values._data._pa_array.combine_chunks() else: - arr = values._pa_array.combine_chunks() + arr = extract_array(values)._pa_array.combine_chunks() categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) From 218a86c134a143edb7022b00c9ae8231cc93d4bc Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Fri, 28 Mar 2025 09:33:56 +0100 Subject: [PATCH 2/9] TST: Add tests for faulty behavior relating to pyarrow categoricals --- pandas/tests/reshape/test_pivot.py | 40 ++++++++++++++++++++++++++++++ pandas/tests/test_multilevel.py | 21 ++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46eee13755b2d..a80814837a337 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +import io from itertools import product import re @@ -2851,3 +2852,42 @@ def test_pivot_margins_with_none_index(self): ), ) tm.assert_frame_equal(result, expected) + + def test_pivot_with_pyarrow_categorical(self): + # GH#53051 + + # Create dataframe with categorical colum + 
df = ( + pd.DataFrame([("A", 1), ("B", 2), ("C", 3)], columns=["string_column", "number_column"]) + .astype({"string_column": "string", "number_column": "float32"}) + .astype({"string_column": "category", "number_column": "float32"}) + ) + + # Convert dataframe to pyarrow backend + with io.BytesIO() as buffer: + df.to_parquet(buffer) + buffer.seek(0) # Reset buffer position + df = pd.read_parquet(buffer, dtype_backend="pyarrow") + + + # Check that pivot works + df = df.pivot(columns=["string_column"], values=["number_column"]) + + # Assert that values of result are correct to prevent silent failure + multi_index = pd.MultiIndex.from_arrays( + [ + ["number_column", "number_column", "number_column"], + ["A", "B", "C"] + ], + names=(None, "string_column") + ) + df_expected = pd.DataFrame( + [ + [1.0, np.nan, np.nan], + [np.nan, 2.0, np.nan], + [np.nan, np.nan, 3.0] + ], + columns=multi_index + ) + tm.assert_frame_equal(df, df_expected, check_dtype=False, check_column_type=False) + diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a23e6d9b3973a..af5f62d70f977 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1,4 +1,5 @@ import datetime +import io import numpy as np import pytest @@ -318,6 +319,26 @@ def test_multiindex_dt_with_nan(self): expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) tm.assert_series_equal(result, expected) + def test_multiindex_with_pyarrow_categorical(self): + # GH#53051 + + # Create dataframe with categorical colum + df = ( + pd.DataFrame([("A", 1), ("B", 2), ("C", 3)], columns=["string_column", "number_column"]) + .astype({"string_column": "string", "number_column": "float32"}) + .astype({"string_column": "category", "number_column": "float32"}) + ) + + # Convert dataframe to pyarrow backend + with io.BytesIO() as buffer: + df.to_parquet(buffer) + buffer.seek(0) # Reset buffer position + df = pd.read_parquet(buffer, dtype_backend="pyarrow") + + + # Check that 
index can be set + df.set_index(["string_column", "number_column"]) + class TestSorted: """everything you wanted to test about sorting""" From 5fd6817a885c264f0a3eab36b794a614bbb7ee7e Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Wed, 2 Apr 2025 09:18:02 +0200 Subject: [PATCH 3/9] CLN: Fix issues reported by pre-commit hooks --- pandas/tests/reshape/test_pivot.py | 34 ++++++++++++++---------------- pandas/tests/test_multilevel.py | 10 ++++++--- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a80814837a337..b081974cdd103 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2856,9 +2856,14 @@ def test_pivot_margins_with_none_index(self): def test_pivot_with_pyarrow_categorical(self): # GH#53051 - # Create dataframe with categorical colum + pytest.importorskip("pyarrow") + + # Create dataframe with categorical column df = ( - pd.DataFrame([("A", 1), ("B", 2), ("C", 3)], columns=["string_column", "number_column"]) + DataFrame( + [("A", 1), ("B", 2), ("C", 3)], + columns=["string_column", "number_column"], + ) .astype({"string_column": "string", "number_column": "float32"}) .astype({"string_column": "category", "number_column": "float32"}) ) @@ -2869,25 +2874,18 @@ def test_pivot_with_pyarrow_categorical(self): buffer.seek(0) # Reset buffer position df = pd.read_parquet(buffer, dtype_backend="pyarrow") - # Check that pivot works df = df.pivot(columns=["string_column"], values=["number_column"]) # Assert that values of result are correct to prevent silent failure - multi_index = pd.MultiIndex.from_arrays( - [ - ["number_column", "number_column", "number_column"], - ["A", "B", "C"] - ], - names=(None, "string_column") + multi_index = MultiIndex.from_arrays( + [["number_column", "number_column", "number_column"], ["A", "B", "C"]], + names=(None, "string_column"), ) - df_expected = 
pd.DataFrame( - [ - [1.0, np.nan, np.nan], - [np.nan, 2.0, np.nan], - [np.nan, np.nan, 3.0] - ], - columns=multi_index + df_expected = DataFrame( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]], + columns=multi_index, + ) + tm.assert_frame_equal( + df, df_expected, check_dtype=False, check_column_type=False ) - tm.assert_frame_equal(df, df_expected, check_dtype=False, check_column_type=False) - diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index af5f62d70f977..3f8421e505ca5 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -322,9 +322,14 @@ def test_multiindex_dt_with_nan(self): def test_multiindex_with_pyarrow_categorical(self): # GH#53051 - # Create dataframe with categorical colum + pytest.importorskip("pyarrow") + + # Create dataframe with categorical column df = ( - pd.DataFrame([("A", 1), ("B", 2), ("C", 3)], columns=["string_column", "number_column"]) + DataFrame( + [["A", 1], ["B", 2], ["C", 3]], + columns=["string_column", "number_column"], + ) .astype({"string_column": "string", "number_column": "float32"}) .astype({"string_column": "category", "number_column": "float32"}) ) @@ -335,7 +340,6 @@ def test_multiindex_with_pyarrow_categorical(self): buffer.seek(0) # Reset buffer position df = pd.read_parquet(buffer, dtype_backend="pyarrow") - # Check that index can be set df.set_index(["string_column", "number_column"]) From 858240492932ef562b31b4e6d5c45d9c701eb1b0 Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Fri, 4 Apr 2025 07:58:22 +0200 Subject: [PATCH 4/9] TST: Fix failing tests for minimum version by ignoring obsolete deprecation warning --- pandas/tests/reshape/test_pivot.py | 3 +++ pandas/tests/test_multilevel.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b081974cdd103..e497fc79910b1 100644 --- 
a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2853,6 +2853,9 @@ def test_pivot_margins_with_none_index(self): ) tm.assert_frame_equal(result, expected) + # Ignore deprecation raised by old versions of pyarrow. Already fixed in + # newer versions + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pivot_with_pyarrow_categorical(self): # GH#53051 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 3f8421e505ca5..70899bcef034d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -319,6 +319,9 @@ def test_multiindex_dt_with_nan(self): expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) tm.assert_series_equal(result, expected) + # Ignore deprecation raised by old versions of pyarrow. Already fixed in + # newer versions + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_multiindex_with_pyarrow_categorical(self): # GH#53051 From 04c979408eab921adb064953e40569743820a16b Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Fri, 4 Apr 2025 13:01:11 +0200 Subject: [PATCH 5/9] DOC: Add entry for bugfix to whatsnew v3.0.0 --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8873f7c1a8fe8..ae2862604f2aa 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -638,8 +638,8 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) +- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` 
raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) -- Datetimelike ^^^^^^^^^^^^ From 99b4824f4cb6aa9ac47dc5a3f66ab53c1df9ead8 Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Wed, 9 Apr 2025 09:31:57 +0200 Subject: [PATCH 6/9] CLN: Refactor code and clean up according to PR feedback --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/tests/reshape/test_pivot.py | 27 +++++++++++---------------- pandas/tests/test_multilevel.py | 25 +++++++++++-------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ae2862604f2aa..77bae7d66e25b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -640,6 +640,7 @@ Categorical - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) +- Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e497fc79910b1..c868b358c8594 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -3,7 +3,6 @@ datetime, timedelta, ) -import io from itertools import product import re @@ -16,6 +15,7 @@ import pandas as pd from pandas import ( + ArrowDtype, Categorical, DataFrame, Grouper, @@ -2853,29 +2853,24 @@ def test_pivot_margins_with_none_index(self): ) tm.assert_frame_equal(result, expected) - # Ignore deprecation raised by old versions of pyarrow. 
Already fixed in - # newer versions @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pivot_with_pyarrow_categorical(self): # GH#53051 - pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow") # Create dataframe with categorical column - df = ( - DataFrame( - [("A", 1), ("B", 2), ("C", 3)], - columns=["string_column", "number_column"], - ) - .astype({"string_column": "string", "number_column": "float32"}) - .astype({"string_column": "category", "number_column": "float32"}) - ) + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype({"string_column": "category", "number_column": "float32"}) # Convert dataframe to pyarrow backend - with io.BytesIO() as buffer: - df.to_parquet(buffer) - buffer.seek(0) # Reset buffer position - df = pd.read_parquet(buffer, dtype_backend="pyarrow") + df = df.astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) # Check that pivot works df = df.pivot(columns=["string_column"], values=["number_column"]) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 70899bcef034d..502728e178e43 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1,11 +1,11 @@ import datetime -import io import numpy as np import pytest import pandas as pd from pandas import ( + ArrowDtype, DataFrame, MultiIndex, Series, @@ -325,23 +325,20 @@ def test_multiindex_dt_with_nan(self): def test_multiindex_with_pyarrow_categorical(self): # GH#53051 - pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow") # Create dataframe with categorical column - df = ( - DataFrame( - [["A", 1], ["B", 2], ["C", 3]], - columns=["string_column", "number_column"], - ) - .astype({"string_column": "string", "number_column": "float32"}) - .astype({"string_column": "category", "number_column": "float32"}) - ) + df = DataFrame( + {"string_column": ["A", 
"B", "C"], "number_column": [1, 2, 3]} + ).astype({"string_column": "category", "number_column": "float32"}) # Convert dataframe to pyarrow backend - with io.BytesIO() as buffer: - df.to_parquet(buffer) - buffer.seek(0) # Reset buffer position - df = pd.read_parquet(buffer, dtype_backend="pyarrow") + df = df.astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) # Check that index can be set df.set_index(["string_column", "number_column"]) From 157d4a5b2afdcd9c8f56aba9d537ad1eb359b7e8 Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Thu, 10 Apr 2025 17:56:25 +0200 Subject: [PATCH 7/9] CLN: Refactor code and clean up according to PR feedback --- pandas/tests/reshape/test_pivot.py | 23 ++++++++------------ pandas/tests/test_multilevel.py | 35 ++++++++++++++++-------------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c868b358c8594..d8fc0285d20d5 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2856,26 +2856,21 @@ def test_pivot_margins_with_none_index(self): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pivot_with_pyarrow_categorical(self): # GH#53051 - pa = pytest.importorskip("pyarrow") - # Create dataframe with categorical column - df = DataFrame( - {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} - ).astype({"string_column": "category", "number_column": "float32"}) - - # Convert dataframe to pyarrow backend - df = df.astype( - { - "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), - "number_column": "float[pyarrow]", - } + df = ( + DataFrame({"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}) + .astype({"string_column": "category", "number_column": "float32"}) + .astype( + { + "string_column": 
ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) ) - # Check that pivot works df = df.pivot(columns=["string_column"], values=["number_column"]) - # Assert that values of result are correct to prevent silent failure multi_index = MultiIndex.from_arrays( [["number_column", "number_column", "number_column"], ["A", "B", "C"]], names=(None, "string_column"), diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 502728e178e43..936471d8ddcea 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -319,29 +319,32 @@ def test_multiindex_dt_with_nan(self): expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) tm.assert_series_equal(result, expected) - # Ignore deprecation raised by old versions of pyarrow. Already fixed in - # newer versions @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_multiindex_with_pyarrow_categorical(self): # GH#53051 - pa = pytest.importorskip("pyarrow") - # Create dataframe with categorical column - df = DataFrame( - {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} - ).astype({"string_column": "category", "number_column": "float32"}) - - # Convert dataframe to pyarrow backend - df = df.astype( - { - "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), - "number_column": "float[pyarrow]", - } + df = ( + DataFrame({"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}) + .astype({"string_column": "category", "number_column": "float32"}) + .astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) ) - # Check that index can be set - df.set_index(["string_column", "number_column"]) + df = df.set_index(["string_column", "number_column"]) + + df_expected = DataFrame( + index=MultiIndex.from_arrays( + [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"] + ) + ) + 
tm.assert_frame_equal( + df, df_expected, check_dtype=False, check_column_type=False + ) class TestSorted: From 080151cac35878faf554748cf13166c4cce43aff Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Fri, 11 Apr 2025 08:28:17 +0200 Subject: [PATCH 8/9] CLN: Refactor tests to address PR feedback --- pandas/tests/reshape/test_pivot.py | 12 ++++-------- pandas/tests/test_multilevel.py | 14 +++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d8fc0285d20d5..c12528385530b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2858,8 +2858,9 @@ def test_pivot_with_pyarrow_categorical(self): # GH#53051 pa = pytest.importorskip("pyarrow") + data = {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} df = ( - DataFrame({"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}) + DataFrame(data) .astype({"string_column": "category", "number_column": "float32"}) .astype( { @@ -2871,13 +2872,8 @@ def test_pivot_with_pyarrow_categorical(self): df = df.pivot(columns=["string_column"], values=["number_column"]) - multi_index = MultiIndex.from_arrays( - [["number_column", "number_column", "number_column"], ["A", "B", "C"]], - names=(None, "string_column"), - ) - df_expected = DataFrame( - [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]], - columns=multi_index, + df_expected = DataFrame(data).pivot( + columns=["string_column"], values=["number_column"] ) tm.assert_frame_equal( df, df_expected, check_dtype=False, check_column_type=False diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 936471d8ddcea..7b50fcd563296 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -324,8 +324,9 @@ def test_multiindex_with_pyarrow_categorical(self): # GH#53051 pa = pytest.importorskip("pyarrow") + data = 
{"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} df = ( - DataFrame({"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}) + DataFrame(data) .astype({"string_column": "category", "number_column": "float32"}) .astype( { @@ -337,13 +338,12 @@ def test_multiindex_with_pyarrow_categorical(self): df = df.set_index(["string_column", "number_column"]) - df_expected = DataFrame( - index=MultiIndex.from_arrays( - [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"] - ) - ) + df_expected = DataFrame(data).set_index(["string_column", "number_column"]) tm.assert_frame_equal( - df, df_expected, check_dtype=False, check_column_type=False + df, + df_expected, + check_dtype=False, + check_index_type=False, ) From 5e53fece52b0acc077d3d20ac5cf347d6e82044f Mon Sep 17 00:00:00 2001 From: Robin Mader <128372203+robin-mader-bis@users.noreply.github.com> Date: Sat, 12 Apr 2025 11:07:32 +0200 Subject: [PATCH 9/9] CLN: Refactor tests to address PR feedback --- pandas/tests/reshape/test_pivot.py | 26 ++++++++++++++------------ pandas/tests/test_multilevel.py | 25 +++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c12528385530b..614200ae5b7c2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2858,22 +2858,24 @@ def test_pivot_with_pyarrow_categorical(self): # GH#53051 pa = pytest.importorskip("pyarrow") - data = {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} - df = ( - DataFrame(data) - .astype({"string_column": "category", "number_column": "float32"}) - .astype( - { - "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), - "number_column": "float[pyarrow]", - } - ) + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": 
"float[pyarrow]", + } ) df = df.pivot(columns=["string_column"], values=["number_column"]) - df_expected = DataFrame(data).pivot( - columns=["string_column"], values=["number_column"] + multi_index = MultiIndex.from_arrays( + [["number_column", "number_column", "number_column"], ["A", "B", "C"]], + names=(None, "string_column"), + ) + df_expected = DataFrame( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]], + columns=multi_index, ) tm.assert_frame_equal( df, df_expected, check_dtype=False, check_column_type=False diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 7b50fcd563296..ff7ab22c197d8 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -324,26 +324,27 @@ def test_multiindex_with_pyarrow_categorical(self): # GH#53051 pa = pytest.importorskip("pyarrow") - data = {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} - df = ( - DataFrame(data) - .astype({"string_column": "category", "number_column": "float32"}) - .astype( - { - "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), - "number_column": "float[pyarrow]", - } - ) + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } ) df = df.set_index(["string_column", "number_column"]) - df_expected = DataFrame(data).set_index(["string_column", "number_column"]) + df_expected = DataFrame( + index=MultiIndex.from_arrays( + [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"] + ) + ) tm.assert_frame_equal( df, df_expected, - check_dtype=False, check_index_type=False, + check_column_type=False, )