Skip to content

Commit 7be41ca

Browse files
ENH: Arrow backed string array - implement factorize() method without casting to objects (#38007)
1 parent 7c59101 commit 7be41ca

File tree

3 files changed

+70
-14
lines changed

3 files changed

+70
-14
lines changed

asv_bench/benchmarks/algorithms.py

+14-1
Original file line number | Diff line number | Diff line change
@@ -28,23 +28,36 @@ class Factorize:
2828
"datetime64[ns, tz]",
2929
"Int64",
3030
"boolean",
31+
"string_arrow",
3132
],
3233
]
3334
param_names = ["unique", "sort", "dtype"]
3435

3536
def setup(self, unique, sort, dtype):
3637
N = 10 ** 5
38+
string_index = tm.makeStringIndex(N)
39+
try:
40+
from pandas.core.arrays.string_arrow import ArrowStringDtype
41+
42+
string_arrow = pd.array(string_index, dtype=ArrowStringDtype())
43+
except ImportError:
44+
string_arrow = None
45+
46+
if dtype == "string_arrow" and not string_arrow:
47+
raise NotImplementedError
48+
3749
data = {
3850
"int": pd.Int64Index(np.arange(N)),
3951
"uint": pd.UInt64Index(np.arange(N)),
4052
"float": pd.Float64Index(np.random.randn(N)),
41-
"string": tm.makeStringIndex(N),
53+
"string": string_index,
4254
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
4355
"datetime64[ns, tz]": pd.date_range(
4456
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
4557
),
4658
"Int64": pd.array(np.arange(N), dtype="Int64"),
4759
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
60+
"string_arrow": string_arrow,
4861
}[dtype]
4962
if not unique:
5063
data = data.repeat(5)

pandas/core/arrays/string_arrow.py

+18-3
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
Any,
77
Optional,
88
Sequence,
9+
Tuple,
910
Type,
1011
Union,
1112
)
@@ -20,6 +21,7 @@
2021
Dtype,
2122
NpDtype,
2223
)
24+
from pandas.util._decorators import doc
2325
from pandas.util._validators import validate_fillna_kwargs
2426

2527
from pandas.core.dtypes.base import ExtensionDtype
@@ -273,9 +275,22 @@ def __len__(self) -> int:
273275
"""
274276
return len(self._data)
275277

276-
@classmethod
277-
def _from_factorized(cls, values, original):
278-
return cls._from_sequence(values)
278+
@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
    # NOTE(review): no docstring is written here because ``@doc`` presumably
    # supplies it from ``ExtensionArray.factorize`` -- confirm before adding one.
    dict_encoded = self._data.dictionary_encode()

    # Collect the dictionary indices of every chunk into a single chunked
    # array and convert that to a pandas object.
    index_chunks = [chunk.indices for chunk in dict_encoded.chunks]
    codes = pa.chunked_array(
        index_chunks, type=dict_encoded.type.index_type
    ).to_pandas()

    # A float result means NaN entries are present; overwrite them with the
    # sentinel before the cast to int64 below.
    if codes.dtype.kind == "f":
        codes[np.isnan(codes)] = na_sentinel
    codes = codes.astype(np.int64, copy=False)

    # The first chunk's dictionary is used for the uniques; with no chunks at
    # all, fall back to an empty array of the encoded value type.
    if dict_encoded.num_chunks:
        unique_values = dict_encoded.chunk(0).dictionary
    else:
        unique_values = pa.array([], type=dict_encoded.type.value_type)
    uniques = type(self)(unique_values)

    return codes.values, uniques
279294

280295
@classmethod
281296
def _concat_same_type(cls, to_concat) -> ArrowStringArray:

pandas/tests/extension/test_string.py

+38-10
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,29 @@
2626
from pandas.tests.extension import base
2727

2828

29+
def split_array(arr):
    """
    Return a copy of ``arr`` whose underlying data is split into two chunks.

    The current test is skipped when ``arr`` does not have an
    ``ArrowStringDtype`` dtype.
    """
    if not isinstance(arr.dtype, ArrowStringDtype):
        pytest.skip("chunked array n/a")

    # Imported only after the dtype check, so pyarrow is never imported on
    # the skip path.
    import pyarrow as pa

    arrow_data = arr._data
    midpoint = len(arrow_data) // 2
    first_half = arrow_data[:midpoint]
    second_half = arrow_data[midpoint:]

    rechunked = pa.chunked_array([*first_half.chunks, *second_half.chunks])
    assert rechunked.num_chunks == 2
    return type(arr)(rechunked)
45+
46+
47+
@pytest.fixture(params=[True, False])
def chunked(request):
    """Boolean fixture parametrized over True and False."""
    is_chunked = request.param
    return is_chunked
50+
51+
2952
@pytest.fixture(
3053
params=[
3154
StringDtype,
@@ -39,28 +62,32 @@ def dtype(request):
3962

4063

4164
@pytest.fixture
42-
def data(dtype):
65+
def data(dtype, chunked):
4366
strings = np.random.choice(list(string.ascii_letters), size=100)
4467
while strings[0] == strings[1]:
4568
strings = np.random.choice(list(string.ascii_letters), size=100)
4669

47-
return dtype.construct_array_type()._from_sequence(strings)
70+
arr = dtype.construct_array_type()._from_sequence(strings)
71+
return split_array(arr) if chunked else arr
4872

4973

5074
@pytest.fixture
51-
def data_missing(dtype):
75+
def data_missing(dtype, chunked):
5276
"""Length 2 array with [NA, Valid]"""
53-
return dtype.construct_array_type()._from_sequence([pd.NA, "A"])
77+
arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"])
78+
return split_array(arr) if chunked else arr
5479

5580

5681
@pytest.fixture
57-
def data_for_sorting(dtype):
58-
return dtype.construct_array_type()._from_sequence(["B", "C", "A"])
82+
def data_for_sorting(dtype, chunked):
83+
arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"])
84+
return split_array(arr) if chunked else arr
5985

6086

6187
@pytest.fixture
62-
def data_missing_for_sorting(dtype):
63-
return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"])
88+
def data_missing_for_sorting(dtype, chunked):
89+
arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"])
90+
return split_array(arr) if chunked else arr
6491

6592

6693
@pytest.fixture
@@ -69,10 +96,11 @@ def na_value():
6996

7097

7198
@pytest.fixture
72-
def data_for_grouping(dtype):
73-
return dtype.construct_array_type()._from_sequence(
99+
def data_for_grouping(dtype, chunked):
100+
arr = dtype.construct_array_type()._from_sequence(
74101
["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]
75102
)
103+
return split_array(arr) if chunked else arr
76104

77105

78106
class TestDtype(base.BaseDtypeTests):

0 commit comments

Comments
 (0)