|
19 | 19 | Iterator,
|
20 | 20 | NpDtype,
|
21 | 21 | PositionalIndexer,
|
| 22 | + Scalar, |
22 | 23 | SortKind,
|
23 | 24 | TakeIndexer,
|
24 | 25 | npt,
|
25 | 26 | )
|
26 | 27 | from pandas.compat import (
|
27 | 28 | pa_version_under6p0,
|
28 | 29 | pa_version_under7p0,
|
| 30 | + pa_version_under8p0, |
29 | 31 | pa_version_under9p0,
|
30 | 32 | )
|
31 | 33 | from pandas.util._decorators import doc
|
|
36 | 38 | is_bool_dtype,
|
37 | 39 | is_integer,
|
38 | 40 | is_integer_dtype,
|
| 41 | + is_list_like, |
39 | 42 | is_object_dtype,
|
40 | 43 | is_scalar,
|
41 | 44 | )
|
def __setitem__(self, key, value) -> None:
    """
    Set one or more values inplace.

    Parameters
    ----------
    key : int, slice, or array-like
        Positions to set; normalized via ``check_array_indexer``.
    value : scalar or array-like
        Value(s) to set; converted to pyarrow-compatible form via
        ``_maybe_convert_setitem_value``.
    """
    key = check_array_indexer(self, key)
    value = self._maybe_convert_setitem_value(value)

    if com.is_null_slice(key):
        # fast path (GH50248): replacing everything — broadcast value
        # over the whole array with a single if_else call
        data = self._if_else(True, value, self._data)

    elif is_integer(key):
        # fast path: single-position assignment rebuilds only the
        # chunk boundaries around ``key``
        key = cast(int, key)
        n = len(self)
        if key < 0:
            key += n
        if not 0 <= key < n:
            raise IndexError(
                f"index {key} is out of bounds for axis 0 with size {n}"
            )
        if is_list_like(value):
            raise ValueError("Length of indexer and values mismatch")
        elif isinstance(value, pa.Scalar):
            value = value.as_py()
        chunks = [
            *self._data[:key].chunks,
            pa.array([value], type=self._data.type, from_pandas=True),
            *self._data[key + 1 :].chunks,
        ]
        data = pa.chunked_array(chunks).combine_chunks()

    elif is_bool_dtype(key):
        # boolean mask: delegate to the mask-based replacement helper
        key = np.asarray(key, dtype=np.bool_)
        data = self._replace_with_mask(self._data, key, value)

    elif is_scalar(value) or isinstance(value, pa.Scalar):
        # scalar value, array-like key: build a positional mask and
        # broadcast the scalar into the selected slots
        mask = np.zeros(len(self), dtype=np.bool_)
        mask[key] = True
        data = self._if_else(mask, value, self._data)

    else:
        # array-like key with array-like value
        indices = np.arange(len(self))[key]
        if len(indices) != len(value):
            raise ValueError("Length of indexer and values mismatch")
        if len(indices) == 0:
            return
        # sort indices (and reorder value to match) so the mask-based
        # replacement consumes replacements in positional order
        argsort = np.argsort(indices)
        indices = indices[argsort]
        value = value.take(argsort)
        mask = np.zeros(len(self), dtype=np.bool_)
        mask[indices] = True
        data = self._replace_with_mask(self._data, mask, value)

    if isinstance(data, pa.Array):
        data = pa.chunked_array([data])
    self._data = data
|
1130 | 1113 | def _rank(
|
1131 | 1114 | self,
|
@@ -1241,95 +1224,110 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
|
1241 | 1224 |
|
def _maybe_convert_setitem_value(self, value):
    """
    Maybe convert value to be pyarrow compatible.

    Pass through ``None`` and values already boxed as pyarrow objects;
    otherwise box with ``pa.array`` (list-likes) or ``pa.scalar``
    (scalars) using this array's type.

    Raises
    ------
    TypeError
        If pyarrow cannot coerce ``value`` to this array's type.
    """
    if value is None:
        return value
    if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
        # already pyarrow-native; nothing to do
        return value
    if is_list_like(value):
        pa_box = pa.array
    else:
        pa_box = pa.scalar
    try:
        value = pa_box(value, type=self._data.type, from_pandas=True)
    except pa.ArrowTypeError as err:
        # surface a pandas-style error rather than the raw Arrow one
        msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
        raise TypeError(msg) from err
    return value
|
1246 | 1241 |
|
@classmethod
def _if_else(
    cls,
    cond: npt.NDArray[np.bool_] | bool,
    left: ArrayLike | Scalar,
    right: ArrayLike | Scalar,
):
    """
    Choose values based on a condition.

    Analogous to pyarrow.compute.if_else, with logic
    to fallback to numpy for unsupported types.

    Parameters
    ----------
    cond : npt.NDArray[np.bool_] or bool
    left : ArrayLike | Scalar
    right : ArrayLike | Scalar

    Returns
    -------
    pa.Array
    """
    try:
        return pc.if_else(cond, left, right)
    except pa.ArrowNotImplementedError:
        # e.g. no if_else kernel for some types (such as durations on
        # older pyarrow); fall back to the numpy path below
        pass

    def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
        # Unbox pyarrow containers/scalars to numpy object arrays,
        # remembering the Arrow type so the result can be re-boxed.
        if isinstance(value, (pa.Array, pa.ChunkedArray)):
            pa_type = value.type
        elif isinstance(value, pa.Scalar):
            pa_type = value.type
            value = value.as_py()
        else:
            pa_type = None
        return np.array(value, dtype=object), pa_type

    left, left_type = _to_numpy_and_type(left)
    right, right_type = _to_numpy_and_type(right)
    # prefer the left type; at least one side is expected to carry one
    pa_type = left_type or right_type
    result = np.where(cond, left, right)
    return pa.array(result, type=pa_type, from_pandas=True)
1280 | 1285 |
|
@classmethod
def _replace_with_mask(
    cls,
    values: pa.Array | pa.ChunkedArray,
    mask: npt.NDArray[np.bool_] | bool,
    replacements: ArrayLike | Scalar,
):
    """
    Replace items selected with a mask.

    Analogous to pyarrow.compute.replace_with_mask, with logic
    to fallback to numpy for unsupported types.

    Parameters
    ----------
    values : pa.Array or pa.ChunkedArray
    mask : npt.NDArray[np.bool_] or bool
    replacements : ArrayLike or Scalar
        Replacement value(s)

    Returns
    -------
    pa.Array or pa.ChunkedArray
    """
    if isinstance(replacements, pa.ChunkedArray):
        # replacements must be array or scalar, not ChunkedArray
        replacements = replacements.combine_chunks()
    if pa_version_under8p0:
        # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
        # version <= 7: segfaults with various types
        # version <= 6: fails to replace nulls
        if isinstance(replacements, pa.Array):
            # expand replacements to full length, with nulls where the
            # mask is False, so if_else can select per-position
            indices = np.full(len(values), None)
            indices[mask] = np.arange(len(replacements))
            indices = pa.array(indices, type=pa.int64())
            replacements = replacements.take(indices)
        return cls._if_else(mask, replacements, values)
    try:
        return pc.replace_with_mask(values, mask, replacements)
    except pa.ArrowNotImplementedError:
        # no kernel for this type; fall back to numpy assignment
        pass
    if isinstance(replacements, pa.Array):
        replacements = np.array(replacements, dtype=object)
    elif isinstance(replacements, pa.Scalar):
        replacements = replacements.as_py()
    result = np.array(values, dtype=object)
    result[mask] = replacements
    return pa.array(result, type=values.type, from_pandas=True)
0 commit comments