Skip to content

Commit d36da2b

Browse files
authored
BUG: DataFrame construction with dictionary ArrowDtype columns (#53654)
* BUG: DataFrame construction with dictionary ArrowDtype columns
* Add tests for get_indexer
* Windows
1 parent 6f51d03 commit d36da2b

File tree

4 files changed

+50
-1
lines changed

4 files changed

+50
-1
lines changed

doc/source/whatsnew/v2.0.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Bug fixes
2828
- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
2929
- Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
3030
- Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
31+
- Bug when constructing a :class:`DataFrame` whose ``columns`` have an :class:`ArrowDtype` with a ``pyarrow.dictionary`` type and the construction reindexes the data (:issue:`53617`)
3132
- Bug when indexing a :class:`DataFrame` or :class:`Series` with an :class:`Index` with a timestamp :class:`ArrowDtype` would raise an ``AttributeError`` (:issue:`53644`)
3233

3334
.. ---------------------------------------------------------------------------

pandas/core/indexes/base.py

+7
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@
116116
)
117117
from pandas.core.dtypes.concat import concat_compat
118118
from pandas.core.dtypes.dtypes import (
119+
ArrowDtype,
119120
CategoricalDtype,
120121
DatetimeTZDtype,
121122
ExtensionDtype,
@@ -7549,6 +7550,12 @@ def _unpack_nested_dtype(other: Index) -> Index:
75497550
# If there is ever a SparseIndex, this could get dispatched
75507551
# here too.
75517552
return dtype.categories
7553+
elif isinstance(dtype, ArrowDtype):
7554+
# GH 53617
7555+
import pyarrow as pa
7556+
7557+
if pa.types.is_dictionary(dtype.pyarrow_dtype):
7558+
other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
75527559
return other
75537560

75547561

pandas/tests/frame/test_constructors.py

+18
Original file line numberDiff line numberDiff line change
@@ -2714,6 +2714,24 @@ def test_frame_from_dict_with_mixed_tzaware_indexes(self):
27142714
with pytest.raises(TypeError, match=msg):
27152715
DataFrame({"D": ser1, "A": ser2, "B": ser3})
27162716

2717+
@pytest.mark.parametrize(
    "key_val, col_vals, col_type",
    [
        ("3", ["3", "4"], "utf8"),
        (3, [3, 4], "int8"),
    ],
)
def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
    """
    Check dict data against dictionary-typed Arrow ``columns`` (GH 53617).

    The data dict only carries ``key_val`` (the first entry of ``col_vals``),
    so the frame is expected to keep that column's values and to hold NaN
    in the second column.
    """
    pa = pytest.importorskip("pyarrow")

    # Dictionary-encoded Arrow type: int8 indices over the parametrized
    # value type (``pa.utf8()`` or ``pa.int8()``).
    value_type = getattr(pa, col_type)()
    dictionary_type = pa.dictionary(pa.int8(), value_type)
    cols = pd.arrays.ArrowExtensionArray(pa.array(col_vals, type=dictionary_type))

    result = DataFrame({key_val: [1, 2]}, columns=cols)

    expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
    # The column absent from the data dict is compared as object dtype.
    missing_column = expected.iloc[:, 1].astype(object)
    expected.iloc[:, 1] = missing_column

    tm.assert_frame_equal(result, expected)
2734+
27172735

27182736
class TestDataFrameConstructorWithDtypeCoercion:
27192737
def test_floating_values_integer_dtype(self):

pandas/tests/indexes/numeric/test_indexing.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
Timestamp,
1212
)
1313
import pandas._testing as tm
14-
from pandas.core.arrays import FloatingArray
14+
from pandas.core.arrays import (
15+
ArrowExtensionArray,
16+
FloatingArray,
17+
)
1518

1619

1720
@pytest.fixture
@@ -389,6 +392,26 @@ def test_get_indexer_masked_na_boolean(self, dtype):
389392
result = idx.get_loc(NA)
390393
assert result == 2
391394

395+
def test_get_indexer_arrow_dictionary_target(self):
    """
    Look up a dictionary-typed Arrow target in a plain integer Index.

    The target holds ``[1, 2]`` and the index holds ``[1]``, so the first
    target element is found at position 0 and the second is missing (-1).
    """
    pa = pytest.importorskip("pyarrow")
    target = Index(
        ArrowExtensionArray(
            pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8()))
        )
    )
    idx = Index([1])

    # Indexers are documented as np.intp arrays; using np.intp (rather than
    # a hard-coded np.int64) keeps the dtype check platform-independent.
    result = idx.get_indexer(target)
    expected = np.array([0, -1], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)

    result_1, result_2 = idx.get_indexer_non_unique(target)
    expected_1 = np.array([0, -1], dtype=np.intp)
    # Positions in ``target`` that were not found in ``idx``.
    expected_2 = np.array([1], dtype=np.intp)
    tm.assert_numpy_array_equal(result_1, expected_1)
    tm.assert_numpy_array_equal(result_2, expected_2)
414+
392415

393416
class TestWhere:
394417
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)