Skip to content

Commit 6ee5564

Browse files
committed
ENH: Add Index.str.get_dummies
1 parent 083db2a commit 6ee5564

File tree

4 files changed

+42
-22
lines changed

4 files changed

+42
-22
lines changed

doc/source/text.rst

+14-3
Original file line numberDiff line numberDiff line change
@@ -354,16 +354,27 @@ Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
354354
s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
355355
s4.str.contains('A', na=False)
356356
357+
.. _text.indicator:
358+
357359
Creating Indicator Variables
358360
----------------------------
359361

360362
You can extract dummy variables from string columns.
361363
For example, if they are separated by a ``'|'``:
362364

363-
.. ipython:: python
365+
.. ipython:: python
366+
367+
s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
368+
s.str.get_dummies(sep='|')
369+
370+
String ``Index`` also supports ``get_dummies``, which returns a ``MultiIndex``.
371+
372+
.. versionadded:: 0.18.1
373+
374+
.. ipython:: python
364375
365-
s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
366-
s.str.get_dummies(sep='|')
376+
idx = pd.Index(['a', 'a|b', np.nan, 'a|c'])
377+
idx.str.get_dummies(sep='|')
367378
368379
See also :func:`~pandas.get_dummies`.
369380

doc/source/whatsnew/v0.18.1.txt

+7
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,13 @@ Other Enhancements
6666
idx.take([2, -1]) # default, allow_fill=True, fill_value=None
6767
idx.take([2, -1], fill_value=True)
6868

69+
- ``Index`` now supports ``.str.get_dummies()``, which returns a ``MultiIndex``; see :ref:`Creating Indicator Variables <text.indicator>` (:issue:`10008`, :issue:`10103`)
70+
71+
.. ipython:: python
72+
73+
idx = pd.Index(['a|b', 'a|c', 'b|c'])
74+
idx.str.get_dummies('|')
75+
6976

7077
.. _whatsnew_0181.sparse:
7178

pandas/core/strings.py

+5-14
Original file line numberDiff line numberDiff line change
@@ -741,15 +741,6 @@ def str_get_dummies(arr, sep='|'):
741741
--------
742742
pandas.get_dummies
743743
"""
744-
from pandas.core.frame import DataFrame
745-
from pandas.core.index import Index
746-
747-
# GH9980, Index.str does not support get_dummies() as it returns a frame
748-
if isinstance(arr, Index):
749-
raise TypeError("get_dummies is not supported for string methods on "
750-
"Index")
751-
752-
# TODO remove this hack?
753744
arr = arr.fillna('')
754745
try:
755746
arr = sep + arr + sep
@@ -766,7 +757,7 @@ def str_get_dummies(arr, sep='|'):
766757
for i, t in enumerate(tags):
767758
pat = sep + t + sep
768759
dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
769-
return DataFrame(dummies, arr.index, tags)
760+
return dummies, tags
770761

771762

772763
def str_join(arr, sep):
@@ -1356,9 +1347,9 @@ def cons_row(x):
13561347
index = self._orig.index
13571348
if expand:
13581349
cons = self._orig._constructor_expanddim
1359-
return cons(result, index=index)
1350+
return cons(result, columns=name, index=index)
13601351
else:
1361-
# Must a Series
1352+
# Must be a Series
13621353
cons = self._orig._constructor
13631354
return cons(result, name=name, index=index)
13641355

@@ -1589,9 +1580,9 @@ def get_dummies(self, sep='|'):
15891580
# we need to cast to Series of strings as only that has all
15901581
# methods available for making the dummies...
15911582
data = self._orig.astype(str) if self._is_categorical else self._data
1592-
result = str_get_dummies(data, sep)
1583+
result, name = str_get_dummies(data, sep)
15931584
return self._wrap_result(result, use_codes=(not self._is_categorical),
1594-
expand=True)
1585+
name=name, expand=True)
15951586

15961587
@copy(str_translate)
15971588
def translate(self, table, deletechars=None):

pandas/tests/test_strings.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -1237,12 +1237,15 @@ def test_get_dummies(self):
12371237
columns=list('7ab'))
12381238
tm.assert_frame_equal(result, expected)
12391239

1240-
# GH9980
1241-
# Index.str does not support get_dummies() as it returns a frame
1242-
with tm.assertRaisesRegexp(TypeError, "not supported"):
1243-
idx = Index(['a|b', 'a|c', 'b|c'])
1244-
idx.str.get_dummies('|')
1240+
# GH9980, GH8028
1241+
idx = Index(['a|b', 'a|c', 'b|c'])
1242+
result = idx.str.get_dummies('|')
1243+
1244+
expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1),
1245+
(0, 1, 1)], names=('a', 'b', 'c'))
1246+
tm.assert_index_equal(result, expected)
12451247

1248+
def test_get_dummies_with_name_dummy(self):
12461249
# GH 12180
12471250
# Dummies named 'name' should work as expected
12481251
s = Series(['a', 'b,name', 'b'])
@@ -1251,6 +1254,14 @@ def test_get_dummies(self):
12511254
columns=['a', 'b', 'name'])
12521255
tm.assert_frame_equal(result, expected)
12531256

1257+
idx = Index(['a|b', 'name|c', 'b|name'])
1258+
result = idx.str.get_dummies('|')
1259+
1260+
expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1),
1261+
(0, 1, 0, 1)],
1262+
names=('a', 'b', 'c', 'name'))
1263+
tm.assert_index_equal(result, expected)
1264+
12541265
def test_join(self):
12551266
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
12561267
result = values.str.split('_').str.join('_')

0 commit comments

Comments
 (0)