pandas-dev · mroeschke · Jun 21, 2023 · Jun 21, 2023 · Jun 21, 2023 · Jun 21, 2023
diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst
@@ -27,6 +27,7 @@ Bug fixes
 - Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
 - Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
 - Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
+- Bug when constructing a :class:`DataFrame` with columns of an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type that reindexes the data (:issue:`53617`)
 - Bug when indexing a :class:`DataFrame` or :class:`Series` with an :class:`Index` with a timestamp :class:`ArrowDtype` would raise an ``AttributeError`` (:issue:`53644`)
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -7212,11 +7212,19 @@ def _unpack_nested_dtype(other: Index) -> Index:
     -------
     Index
     """
+    from pandas.core.arrays.arrow import ArrowDtype
+
     dtype = other.dtype
     if isinstance(dtype, CategoricalDtype):
         # If there is ever a SparseIndex, this could get dispatched
         #  here too.
         return dtype.categories
+    elif isinstance(dtype, ArrowDtype):
+        # GH 53617
+        import pyarrow as pa
+
+        if pa.types.is_dictionary(dtype.pyarrow_dtype):
+            other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
     return other
 
 

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2696,6 +2696,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self):
         with pytest.raises(TypeError, match=msg):
             DataFrame({"D": ser1, "A": ser2, "B": ser3})
 
+    @pytest.mark.parametrize(
+        "key_val, col_vals, col_type",
+        [
+            ["3", ["3", "4"], "utf8"],
+            [3, [3, 4], "int8"],
+        ],
+    )
+    def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
+        # GH 53617
+        pa = pytest.importorskip("pyarrow")
+        cols = pd.arrays.ArrowExtensionArray(
+            pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)()))
+        )
+        result = DataFrame({key_val: [1, 2]}, columns=cols)
+        expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
+        expected.iloc[:, 1] = expected.iloc[:, 1].astype(object)
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameConstructorWithDtypeCoercion:
     def test_floating_values_integer_dtype(self):

diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
@@ -11,7 +11,10 @@
     Timestamp,
 )
 import pandas._testing as tm
-from pandas.core.arrays import FloatingArray
+from pandas.core.arrays import (
+    ArrowExtensionArray,
+    FloatingArray,
+)
 
 
 @pytest.fixture
@@ -389,6 +392,26 @@ def test_get_indexer_masked_na_boolean(self, dtype):
         result = idx.get_loc(NA)
         assert result == 2
 
+    def test_get_indexer_arrow_dictionary_target(self):
+        pa = pytest.importorskip("pyarrow")
+        target = Index(
+            ArrowExtensionArray(
+                pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8()))
+            )
+        )
+        idx = Index([1])
+
+        result = idx.get_indexer(target)
+        expected = np.array([0, -1], dtype=np.int64)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result_1, result_2 = idx.get_indexer_non_unique(target)
+        expected_1, expected_2 = np.array([0, -1], dtype=np.int64), np.array(
+            [1], dtype=np.int64
+        )
+        tm.assert_numpy_array_equal(result_1, expected_1)
+        tm.assert_numpy_array_equal(result_2, expected_2)
+
 
 class TestWhere:
     @pytest.mark.parametrize(