Skip to content

Commit af7d5d8

Browse files
authored
Backport PR #53654: BUG: DataFrame construction with dictionary ArrowDtype columns (#53758)
* Backport PR #53654: BUG: DataFrame construction with dictionary ArrowDtype columns * change import
1 parent 0f06e4c commit af7d5d8

File tree

4 files changed

+51
-1
lines changed

4 files changed

+51
-1
lines changed

doc/source/whatsnew/v2.0.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Bug fixes
2727
- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
2828
- Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
2929
- Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
30+
- Bug when constructing a :class:`DataFrame` with columns of an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type that reindexes the data (:issue:`53617`)
3031
- Bug when indexing a :class:`DataFrame` or :class:`Series` with an :class:`Index` with a timestamp :class:`ArrowDtype` would raise an ``AttributeError`` (:issue:`53644`)
3132

3233
.. ---------------------------------------------------------------------------

pandas/core/indexes/base.py

+8
Original file line numberDiff line numberDiff line change
@@ -7212,11 +7212,19 @@ def _unpack_nested_dtype(other: Index) -> Index:
72127212
-------
72137213
Index
72147214
"""
7215+
from pandas.core.arrays.arrow import ArrowDtype
7216+
72157217
dtype = other.dtype
72167218
if isinstance(dtype, CategoricalDtype):
72177219
# If there is ever a SparseIndex, this could get dispatched
72187220
# here too.
72197221
return dtype.categories
7222+
elif isinstance(dtype, ArrowDtype):
7223+
# GH 53617
7224+
import pyarrow as pa
7225+
7226+
if pa.types.is_dictionary(dtype.pyarrow_dtype):
7227+
other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
72207228
return other
72217229

72227230

pandas/tests/frame/test_constructors.py

+18
Original file line numberDiff line numberDiff line change
@@ -2696,6 +2696,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self):
26962696
with pytest.raises(TypeError, match=msg):
26972697
DataFrame({"D": ser1, "A": ser2, "B": ser3})
26982698

2699+
@pytest.mark.parametrize(
2700+
"key_val, col_vals, col_type",
2701+
[
2702+
["3", ["3", "4"], "utf8"],
2703+
[3, [3, 4], "int8"],
2704+
],
2705+
)
2706+
def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
2707+
# GH 53617
2708+
pa = pytest.importorskip("pyarrow")
2709+
cols = pd.arrays.ArrowExtensionArray(
2710+
pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)()))
2711+
)
2712+
result = DataFrame({key_val: [1, 2]}, columns=cols)
2713+
expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
2714+
expected.iloc[:, 1] = expected.iloc[:, 1].astype(object)
2715+
tm.assert_frame_equal(result, expected)
2716+
26992717

27002718
class TestDataFrameConstructorWithDtypeCoercion:
27012719
def test_floating_values_integer_dtype(self):

pandas/tests/indexes/numeric/test_indexing.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
Timestamp,
1212
)
1313
import pandas._testing as tm
14-
from pandas.core.arrays import FloatingArray
14+
from pandas.core.arrays import (
15+
ArrowExtensionArray,
16+
FloatingArray,
17+
)
1518

1619

1720
@pytest.fixture
@@ -389,6 +392,26 @@ def test_get_indexer_masked_na_boolean(self, dtype):
389392
result = idx.get_loc(NA)
390393
assert result == 2
391394

395+
def test_get_indexer_arrow_dictionary_target(self):
396+
pa = pytest.importorskip("pyarrow")
397+
target = Index(
398+
ArrowExtensionArray(
399+
pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8()))
400+
)
401+
)
402+
idx = Index([1])
403+
404+
result = idx.get_indexer(target)
405+
expected = np.array([0, -1], dtype=np.int64)
406+
tm.assert_numpy_array_equal(result, expected)
407+
408+
result_1, result_2 = idx.get_indexer_non_unique(target)
409+
expected_1, expected_2 = np.array([0, -1], dtype=np.int64), np.array(
410+
[1], dtype=np.int64
411+
)
412+
tm.assert_numpy_array_equal(result_1, expected_1)
413+
tm.assert_numpy_array_equal(result_2, expected_2)
414+
392415

393416
class TestWhere:
394417
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)