From 6ee556403b1ef00bdbe3f1c3d051df125a66efe9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 10 Apr 2016 11:18:51 +0900 Subject: [PATCH] ENH: Add Index.str.get_dummies --- doc/source/text.rst | 17 ++++++++++++++--- doc/source/whatsnew/v0.18.1.txt | 7 +++++++ pandas/core/strings.py | 19 +++++-------------- pandas/tests/test_strings.py | 21 ++++++++++++++++----- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 655df5c5e566c..d9f8d45c8aa75 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -354,16 +354,27 @@ Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) s4.str.contains('A', na=False) +.. _text.indicator: + Creating Indicator Variables ---------------------------- You can extract dummy variables from string columns. For example if they are separated by a ``'|'``: - .. ipython:: python +.. ipython:: python + + s = pd.Series(['a', 'a|b', np.nan, 'a|c']) + s.str.get_dummies(sep='|') + +String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. + +.. versionadded:: 0.18.1 + +.. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c']) - s.str.get_dummies(sep='|') + idx = pd.Index(['a', 'a|b', np.nan, 'a|c']) + idx.str.get_dummies(sep='|') See also :func:`~pandas.get_dummies`. diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index edbaeb65c45eb..48677f5105dda 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -66,6 +66,13 @@ Other Enhancements idx.take([2, -1]) # default, allow_fill=True, fill_value=None idx.take([2, -1], fill_value=True) +- ``Index`` now supports ``.str.get_dummies()`` which returns a ``MultiIndex``, see :ref:`Creating Indicator Variables <text.indicator>` (:issue:`10008`, :issue:`10103`) + +.. ipython:: python + + idx = pd.Index(['a|b', 'a|c', 'b|c']) + idx.str.get_dummies('|') + .. 
_whatsnew_0181.sparse: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e5d539821e3ca..66e4638a9e4b4 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -741,15 +741,6 @@ def str_get_dummies(arr, sep='|'): -------- pandas.get_dummies """ - from pandas.core.frame import DataFrame - from pandas.core.index import Index - - # GH9980, Index.str does not support get_dummies() as it returns a frame - if isinstance(arr, Index): - raise TypeError("get_dummies is not supported for string methods on " - "Index") - - # TODO remove this hack? arr = arr.fillna('') try: arr = sep + arr + sep @@ -766,7 +757,7 @@ def str_get_dummies(arr, sep='|'): for i, t in enumerate(tags): pat = sep + t + sep dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) - return DataFrame(dummies, arr.index, tags) + return dummies, tags def str_join(arr, sep): @@ -1356,9 +1347,9 @@ def cons_row(x): index = self._orig.index if expand: cons = self._orig._constructor_expanddim - return cons(result, index=index) + return cons(result, columns=name, index=index) else: - # Must a Series + # Must be a Series cons = self._orig._constructor return cons(result, name=name, index=index) @@ -1589,9 +1580,9 @@ def get_dummies(self, sep='|'): # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
data = self._orig.astype(str) if self._is_categorical else self._data - result = str_get_dummies(data, sep) + result, name = str_get_dummies(data, sep) return self._wrap_result(result, use_codes=(not self._is_categorical), - expand=True) + name=name, expand=True) @copy(str_translate) def translate(self, table, deletechars=None): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d61ae3681a880..1f9f7d43e8568 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1237,12 +1237,15 @@ def test_get_dummies(self): columns=list('7ab')) tm.assert_frame_equal(result, expected) - # GH9980 - # Index.str does not support get_dummies() as it returns a frame - with tm.assertRaisesRegexp(TypeError, "not supported"): - idx = Index(['a|b', 'a|c', 'b|c']) - idx.str.get_dummies('|') + # GH9980, GH8028 + idx = Index(['a|b', 'a|c', 'b|c']) + result = idx.str.get_dummies('|') + + expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1), + (0, 1, 1)], names=('a', 'b', 'c')) + tm.assert_index_equal(result, expected) + def test_get_dummies_with_name_dummy(self): # GH 12180 # Dummies named 'name' should work as expected s = Series(['a', 'b,name', 'b']) @@ -1251,6 +1254,14 @@ def test_get_dummies(self): columns=['a', 'b', 'name']) tm.assert_frame_equal(result, expected) + idx = Index(['a|b', 'name|c', 'b|name']) + result = idx.str.get_dummies('|') + + expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1), + (0, 1, 0, 1)], + names=('a', 'b', 'c', 'name')) + tm.assert_index_equal(result, expected) + def test_join(self): values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) result = values.str.split('_').str.join('_')