Skip to content

Commit b02e41a

Browse files
authored
PERF: setting values via df.loc / df.iloc with pyarrow-backed columns (#50248)
* perf: ArrowExtensionArray.__setitem__(null_slice)
* gh refs
* fix test
* add test for setitem null slice paths
* add test
* remove version check
* fix text
1 parent 34b3222 commit b02e41a

File tree

4 files changed

+73
-1
lines changed

4 files changed

+73
-1
lines changed

asv_bench/benchmarks/array.py

+3
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ def time_setitem_list(self, multiple_chunks):
9090
def time_setitem_slice(self, multiple_chunks):
    """Time assigning the scalar ``"foo"`` to every tenth position of ``self.array``."""
    every_tenth = slice(None, None, 10)
    self.array[every_tenth] = "foo"
9292

93+
def time_setitem_null_slice(self, multiple_chunks):
    """Time assigning the scalar ``"foo"`` to all of ``self.array`` via a null slice."""
    everything = slice(None)
    self.array[everything] = "foo"
95+
9396
def time_tolist(self, multiple_chunks):
    """Time a call to ``self.array.tolist()``; the result is discarded."""
    arr = self.array
    arr.tolist()
9598

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,7 @@ Performance improvements
748748
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
749749
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
750750
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
751+
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`)
751752
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
752753
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
753754
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)

pandas/core/arrays/arrow/array.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040

4141
from pandas.core.arraylike import OpsMixin
4242
from pandas.core.arrays.base import ExtensionArray
43+
import pandas.core.common as com
4344
from pandas.core.indexers import (
4445
check_array_indexer,
4546
unpack_tuple_and_ellipses,
@@ -931,9 +932,31 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
931932
None
932933
"""
933934
key = check_array_indexer(self, key)
934-
indices = self._indexing_key_to_indices(key)
935935
value = self._maybe_convert_setitem_value(value)
936936

937+
# fast path (GH50248)
938+
if com.is_null_slice(key):
939+
if is_scalar(value):
940+
fill_value = pa.scalar(value, type=self._data.type, from_pandas=True)
941+
try:
942+
self._data = pc.if_else(True, fill_value, self._data)
943+
return
944+
except pa.ArrowNotImplementedError:
945+
# ArrowNotImplementedError: Function 'if_else' has no kernel
946+
# matching input types (bool, duration[ns], duration[ns])
947+
# TODO: remove try/except wrapper if/when pyarrow implements
948+
# a kernel for duration types.
949+
pass
950+
elif len(value) == len(self):
951+
if isinstance(value, type(self)) and value.dtype == self.dtype:
952+
self._data = value._data
953+
else:
954+
arr = pa.array(value, type=self._data.type, from_pandas=True)
955+
self._data = pa.chunked_array([arr])
956+
return
957+
958+
indices = self._indexing_key_to_indices(key)
959+
937960
argsort = np.argsort(indices)
938961
indices = indices[argsort]
939962

pandas/tests/extension/test_arrow.py

+45
Original file line numberDiff line numberDiff line change
@@ -1438,3 +1438,48 @@ def test_to_numpy_with_defaults(data):
14381438
expected[pd.isna(data)] = pd.NA
14391439

14401440
tm.assert_numpy_array_equal(result, expected)
1441+
1442+
1443+
def test_setitem_null_slice(data):
    """Check ``arr[:] = value`` for scalar, array and list values (GH50248)."""
    # GH50248
    pristine = data.copy()

    # Scalar value: every position should hold the first element of ``data``.
    target = pristine.copy()
    target[:] = data[0]
    repeated = pa.array([data[0]] * len(data), type=data._data.type)
    tm.assert_extension_array_equal(target, ArrowExtensionArray(repeated))

    # Array value: the reversed array should be stored.
    target = pristine.copy()
    target[:] = data[::-1]
    tm.assert_extension_array_equal(target, data[::-1])

    # List value: assigning the array's own elements should reproduce ``data``.
    target = pristine.copy()
    target[:] = data.tolist()
    tm.assert_extension_array_equal(target, data)
1463+
1464+
1465+
def test_setitem_invalid_dtype(data):
    """Check that ``data[:] = value`` raises for a scalar of an incompatible type (GH50248)."""
    # GH50248
    pa_type = data._data.type
    types = pa.types

    # Pick a scalar that does not fit the array's pyarrow type, together with
    # the exception class and message fragment expected for it.
    if types.is_string(pa_type) or types.is_binary(pa_type):
        fill_value, err, msg = 123, pa.ArrowTypeError, "Expected bytes"
    elif any(
        check(pa_type)
        for check in (types.is_integer, types.is_floating, types.is_boolean)
    ):
        fill_value, err, msg = "foo", pa.ArrowInvalid, "Could not convert"
    else:
        fill_value, err, msg = "foo", pa.ArrowTypeError, "cannot be converted"

    with pytest.raises(err, match=msg):
        data[:] = fill_value

0 commit comments

Comments
 (0)