Skip to content

Backport PR #53654: BUG: DataFrame construction with dictionary ArrowDtype columns #53758

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Bug fixes
- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
- Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
- Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
- Bug when constructing a :class:`DataFrame` whose ``columns`` have an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type, in cases where the construction reindexes the data (:issue:`53617`)
- Bug when indexing a :class:`DataFrame` or :class:`Series` with an :class:`Index` with a timestamp :class:`ArrowDtype` would raise an ``AttributeError`` (:issue:`53644`)

.. ---------------------------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7212,11 +7212,19 @@ def _unpack_nested_dtype(other: Index) -> Index:
-------
Index
"""
from pandas.core.arrays.arrow import ArrowDtype

dtype = other.dtype
if isinstance(dtype, CategoricalDtype):
# If there is ever a SparseIndex, this could get dispatched
# here too.
return dtype.categories
elif isinstance(dtype, ArrowDtype):
# GH 53617
import pyarrow as pa

if pa.types.is_dictionary(dtype.pyarrow_dtype):
other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
return other


Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2696,6 +2696,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self):
with pytest.raises(TypeError, match=msg):
DataFrame({"D": ser1, "A": ser2, "B": ser3})

@pytest.mark.parametrize(
    "key_val, col_vals, col_type",
    [
        ["3", ["3", "4"], "utf8"],
        [3, [3, 4], "int8"],
    ],
)
def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
    # GH 53617
    # Dict data keyed by one label, with ``columns`` given as a
    # dictionary-encoded Arrow array holding that label plus an extra one.
    pa = pytest.importorskip("pyarrow")

    # Build the dictionary-typed column labels step by step.
    index_type = pa.int8()
    value_type = getattr(pa, col_type)()
    arrow_labels = pa.array(col_vals, type=pa.dictionary(index_type, value_type))
    cols = pd.arrays.ArrowExtensionArray(arrow_labels)

    data = {key_val: [1, 2]}
    result = DataFrame(data, columns=cols)

    # The label absent from the dict is expected to be an all-NaN object column.
    expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
    missing_column = expected.iloc[:, 1]
    expected.iloc[:, 1] = missing_column.astype(object)

    tm.assert_frame_equal(result, expected)


class TestDataFrameConstructorWithDtypeCoercion:
def test_floating_values_integer_dtype(self):
Expand Down
25 changes: 24 additions & 1 deletion pandas/tests/indexes/numeric/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.core.arrays import (
ArrowExtensionArray,
FloatingArray,
)


@pytest.fixture
Expand Down Expand Up @@ -389,6 +392,26 @@ def test_get_indexer_masked_na_boolean(self, dtype):
result = idx.get_loc(NA)
assert result == 2

def test_get_indexer_arrow_dictionary_target(self):
    # GH 53617
    # Look up a dictionary-encoded Arrow target in a plain integer Index:
    # 1 is present at position 0, 2 is missing.
    pa = pytest.importorskip("pyarrow")
    target = Index(
        ArrowExtensionArray(
            pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8()))
        )
    )
    idx = Index([1])

    # Indexers are platform-sized integers (np.intp); hard-coding int64
    # would trip the strict dtype comparison on 32-bit builds.
    result = idx.get_indexer(target)
    expected = np.array([0, -1], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)

    # Non-unique variant returns (indexer, positions of missing targets).
    result_1, result_2 = idx.get_indexer_non_unique(target)
    expected_1 = np.array([0, -1], dtype=np.intp)
    expected_2 = np.array([1], dtype=np.intp)
    tm.assert_numpy_array_equal(result_1, expected_1)
    tm.assert_numpy_array_equal(result_2, expected_2)


class TestWhere:
@pytest.mark.parametrize(
Expand Down