Skip to content

Commit 3846040

Browse files
[ArrowStringArray] PERF: Series.str.get_dummies (#41455)
1 parent 35ce30a commit 3846040

File tree

3 files changed

+28
-9
lines changed

3 files changed

+28
-9
lines changed

asv_bench/benchmarks/strings.py

+11-3
Original file line number | Diff line number | Diff line change
@@ -249,10 +249,18 @@ def time_rsplit(self, dtype, expand):
249249

250250

251251
class Dummies:
252-
def setup(self):
253-
self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|")
252+
params = ["str", "string", "arrow_string"]
253+
param_names = ["dtype"]
254+
255+
def setup(self, dtype):
256+
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
257+
258+
try:
259+
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("|")
260+
except ImportError:
261+
raise NotImplementedError
254262

255-
def time_get_dummies(self):
263+
def time_get_dummies(self, dtype):
256264
self.s.str.get_dummies("|")
257265

258266

pandas/core/strings/accessor.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
is_categorical_dtype,
2525
is_integer,
2626
is_list_like,
27+
is_object_dtype,
2728
is_re,
2829
)
2930
from pandas.core.dtypes.generic import (
@@ -265,7 +266,11 @@ def _wrap_result(
265266
# infer from ndim if expand is not specified
266267
expand = result.ndim != 1
267268

268-
elif expand is True and not isinstance(self._orig, ABCIndex):
269+
elif (
270+
expand is True
271+
and is_object_dtype(result)
272+
and not isinstance(self._orig, ABCIndex)
273+
):
269274
# required when expand=True is explicitly specified
270275
# not needed when inferred
271276

pandas/tests/strings/test_strings.py

+11-5
Original file line number | Diff line number | Diff line change
@@ -301,17 +301,19 @@ def test_isnumeric(any_string_dtype):
301301
tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype))
302302

303303

304-
def test_get_dummies():
305-
s = Series(["a|b", "a|c", np.nan])
304+
def test_get_dummies(any_string_dtype):
305+
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
306306
result = s.str.get_dummies("|")
307307
expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
308308
tm.assert_frame_equal(result, expected)
309309

310-
s = Series(["a;b", "a", 7])
310+
s = Series(["a;b", "a", 7], dtype=any_string_dtype)
311311
result = s.str.get_dummies(";")
312312
expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
313313
tm.assert_frame_equal(result, expected)
314314

315+
316+
def test_get_dummies_index():
315317
# GH9980, GH8028
316318
idx = Index(["a|b", "a|c", "b|c"])
317319
result = idx.str.get_dummies("|")
@@ -322,14 +324,18 @@ def test_get_dummies():
322324
tm.assert_index_equal(result, expected)
323325

324326

325-
def test_get_dummies_with_name_dummy():
327+
def test_get_dummies_with_name_dummy(any_string_dtype):
326328
# GH 12180
327329
# Dummies named 'name' should work as expected
328-
s = Series(["a", "b,name", "b"])
330+
s = Series(["a", "b,name", "b"], dtype=any_string_dtype)
329331
result = s.str.get_dummies(",")
330332
expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"])
331333
tm.assert_frame_equal(result, expected)
332334

335+
336+
def test_get_dummies_with_name_dummy_index():
337+
# GH 12180
338+
# Dummies named 'name' should work as expected
333339
idx = Index(["a|b", "name|c", "b|name"])
334340
result = idx.str.get_dummies("|")
335341

0 commit comments

Comments
 (0)