Skip to content

Commit bbb1cdf

Browse files
authored
PERF: ArrowExtensionArray.factorize (#49177)
* Iterate on simplification
* Complete refactor
* add whatsnew
* Add whatsnew note
* Rename
1 parent 8b503a8 commit bbb1cdf

File tree

2 files changed

+14
-13
lines changed

2 files changed

+14
-13
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ Performance improvements
159159
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
160160
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
161161
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
162+
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
162163
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)
163164
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
164165
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)

pandas/core/arrays/arrow/array.py

+13 -13
Original file line numberDiff line numberDiff line change
@@ -547,21 +547,21 @@ def factorize(
547547
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
548548
null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
549549
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
550-
indices = pa.chunked_array(
551-
[c.indices for c in encoded.chunks], type=encoded.type.index_type
552-
).to_pandas()
553-
if indices.dtype.kind == "f":
554-
indices[np.isnan(indices)] = (
555-
resolved_na_sentinel if resolved_na_sentinel is not None else -1
550+
if encoded.length() == 0:
551+
indices = np.array([], dtype=np.intp)
552+
uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
553+
else:
554+
pa_indices = encoded.combine_chunks().indices
555+
if pa_indices.null_count > 0:
556+
fill_value = (
557+
resolved_na_sentinel if resolved_na_sentinel is not None else -1
558+
)
559+
pa_indices = pc.fill_null(pa_indices, fill_value)
560+
indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
561+
np.intp, copy=False
556562
)
557-
indices = indices.astype(np.int64, copy=False)
558-
559-
if encoded.num_chunks:
560563
uniques = type(self)(encoded.chunk(0).dictionary)
561-
else:
562-
uniques = type(self)(pa.array([], type=encoded.type.value_type))
563-
564-
return indices.values, uniques
564+
return indices, uniques
565565

566566
def reshape(self, *args, **kwargs):
567567
raise NotImplementedError(

0 commit comments

Comments
 (0)