From 83984130ebaf9d58b2807e68c662eb3b2dd326f4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:06:12 -0700 Subject: [PATCH 1/3] BUG: DataFrame construction with dictionary ArrowDtype columns --- doc/source/whatsnew/v2.0.3.rst | 2 +- pandas/core/indexes/base.py | 7 +++++++ pandas/tests/frame/test_constructors.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst index 3da469c2e1fe6..dc1213412a07e 100644 --- a/doc/source/whatsnew/v2.0.3.rst +++ b/doc/source/whatsnew/v2.0.3.rst @@ -24,7 +24,7 @@ Bug fixes - Bug in :func:`RangeIndex.union` when using ``sort=True`` with another :class:`RangeIndex` (:issue:`53490`) - Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`) - Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`) -- +- Bug when constructing a :class:`DataFrame` with columns of an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type that reindexes the data (:issue:`53617`) .. --------------------------------------------------------------------------- .. _whatsnew_203.other: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2c6c6dd0e6ed1..f747d62ec7c4b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -113,6 +113,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -7538,6 +7539,12 @@ def _unpack_nested_dtype(other: Index) -> Index: # If there is ever a SparseIndex, this could get dispatched # here too. return dtype.categories + elif isinstance(dtype, ArrowDtype): + # GH 53617 + import pyarrow as pa + + if pa.types.is_dictionary(dtype.pyarrow_dtype): + other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) return other diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 47e307f561cf4..06e244b93016c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2714,6 +2714,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self): with pytest.raises(TypeError, match=msg): DataFrame({"D": ser1, "A": ser2, "B": ser3}) + @pytest.mark.parametrize( + "key_val, col_vals, col_type", + [ + ["3", ["3", "4"], "utf8"], + [3, [3, 4], "int8"], + ], + ) + def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type): + # GH 53617 + pa = pytest.importorskip("pyarrow") + cols = pd.arrays.ArrowExtensionArray( + pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)())) + ) + result = DataFrame({key_val: [1, 2]}, columns=cols) + expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols) + expected.iloc[:, 1] = expected.iloc[:, 1].astype(object) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorWithDtypeCoercion: def test_floating_values_integer_dtype(self): From 6d416fed3a6dd8cdafde7f70268b1b45ef3ec100 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:46:23 -0700 Subject: [PATCH 2/3] Add tests for get_indexer --- pandas/tests/indexes/numeric/test_indexing.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index c99e912ce4c0f..9c9e9b35b2bb4 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -11,7 +11,10 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import FloatingArray +from pandas.core.arrays import ( + ArrowExtensionArray, + FloatingArray, +) @pytest.fixture @@ -389,6 +392,24 @@ def test_get_indexer_masked_na_boolean(self, dtype): result = idx.get_loc(NA) assert result == 2 + def test_get_indexer_arrow_dictionary_target(self): + pa = pytest.importorskip("pyarrow") + target = Index( + ArrowExtensionArray( + pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8())) + ) + ) + idx = Index([1]) + + result = idx.get_indexer(target) + expected = np.array([0, -1]) + tm.assert_numpy_array_equal(result, expected) + + result_1, result_2 = idx.get_indexer_non_unique(target) + expected_1, expected_2 = np.array([0, -1]), np.array([1]) + tm.assert_numpy_array_equal(result_1, expected_1) + tm.assert_numpy_array_equal(result_2, expected_2) + class TestWhere: @pytest.mark.parametrize( From 165219d9759828d7d94c3027055a1af788c9570a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Jun 2023 17:47:26 -0700 Subject: [PATCH 3/3] Windows --- pandas/tests/indexes/numeric/test_indexing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 9c9e9b35b2bb4..cd28d519313ed 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -402,11 +402,13 @@ def test_get_indexer_arrow_dictionary_target(self): idx = Index([1]) result = idx.get_indexer(target) - expected = np.array([0, -1]) + expected = np.array([0, -1], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) result_1, result_2 = idx.get_indexer_non_unique(target) - expected_1, expected_2 = np.array([0, -1]), np.array([1]) + expected_1, expected_2 = np.array([0, -1], dtype=np.int64), np.array( + [1], dtype=np.int64 + ) tm.assert_numpy_array_equal(result_1, expected_1) tm.assert_numpy_array_equal(result_2, expected_2)