From 38f96dda1182c20e234aa6fcf9a6c4bfe5fc6da6 Mon Sep 17 00:00:00 2001 From: sreejata Date: Mon, 13 Apr 2015 12:34:21 -0300 Subject: [PATCH 1/4] Fix: #9847 by adding a "same" and "expand" param to the StringMethods.split() return value --- pandas/core/strings.py | 14 +++++++------- pandas/tests/test_index.py | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6d20907373014..b0e726d14cf3c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -632,9 +632,9 @@ def str_split(arr, pat=None, n=None, return_type='series'): pat : string, default None String or regular expression to split on. If None, splits on whitespace n : int, default None (all) - return_type : {'series', 'index', 'frame'}, default 'series' - If frame, returns a DataFrame (elements are strings) - If series or index, returns the same type as the original object + return_type : {'series', 'index', 'frame', 'same', 'expand'}, default 'series' + If frame or expand, returns a DataFrame (elements are strings) + If series, index or same, returns the same type as the original object (elements are lists of strings). Notes @@ -649,9 +649,9 @@ def str_split(arr, pat=None, n=None, return_type='series'): from pandas.core.frame import DataFrame from pandas.core.index import Index - if return_type not in ('series', 'index', 'frame'): - raise ValueError("return_type must be {'series', 'index', 'frame'}") - if return_type == 'frame' and isinstance(arr, Index): + if return_type not in ('series', 'index', 'frame', 'same', 'expand'): + raise ValueError("return_type must be {'series', 'index', 'frame', 'same', 'expand'}") + if return_type in ['frame', 'expand'] and isinstance(arr, Index): raise ValueError("return_type='frame' is not supported for string " "methods on Index") if pat is None: @@ -668,7 +668,7 @@ def str_split(arr, pat=None, n=None, return_type='series'): n = 0 regex = re.compile(pat) f = lambda x: regex.split(x, maxsplit=n) - if return_type == 'frame': + if return_type == 'frame' or return_type == 'expand': res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index) else: res = _na_map(f, arr) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index bb75b12754dca..7267924baf023 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1220,8 +1220,12 @@ def test_str_attribute(self): tm.assert_index_equal(idx.str.split(return_type='series'), expected) # return_type 'index' is an alias for 'series' tm.assert_index_equal(idx.str.split(return_type='index'), expected) + # return_type 'same' is an alias for 'series' and 'index' + tm.assert_index_equal(idx.str.split(return_type='same'), expected) with self.assertRaisesRegexp(ValueError, 'not supported'): idx.str.split(return_type='frame') + with self.assertRaisesRegexp(ValueError, 'not supported'): + idx.str.split(return_type='expand') # test boolean case, should return np.array instead of boolean Index idx = Index(['a1', 'a2', 'b1', 'b2']) From 4ede22a155ae196188c24e5d816e399b2894abf7 Mon Sep 17 00:00:00 2001 From: sreejata Date: Mon, 13 Apr 2015 12:51:57 -0300 Subject: [PATCH 2/4] Cleaning up --- pandas/core/strings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b0e726d14cf3c..604ee47c8fee7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -651,7 +651,7 @@ def str_split(arr, pat=None, n=None, return_type='series'): if return_type not in ('series', 'index', 'frame', 'same', 'expand'): raise ValueError("return_type must be {'series', 'index', 'frame', 'same', 'expand'}") - if return_type in ['frame', 'expand'] and isinstance(arr, Index): + if return_type in ('frame', 'expand') and isinstance(arr, Index): raise ValueError("return_type='frame' is not supported for string " "methods on Index") if pat is None: @@ -668,7 +668,7 @@ def str_split(arr, pat=None, n=None, return_type='series'): n = 0 regex = re.compile(pat) f = lambda x: regex.split(x, maxsplit=n) - if return_type == 'frame' or return_type == 'expand': + if return_type in ('frame', 'expand'): res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index) else: res = _na_map(f, arr) From c8f14dd23ce4b6338bb996a39721ba4babaf2c75 Mon Sep 17 00:00:00 2001 From: sreejata Date: Mon, 13 Apr 2015 13:49:58 -0300 Subject: [PATCH 3/4] Adding a future deprecation warning --- pandas/core/strings.py | 3 +++ pandas/tests/test_index.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 604ee47c8fee7..aa73d09194248 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -654,6 +654,9 @@ def str_split(arr, pat=None, n=None, return_type='series'): if return_type in ('frame', 'expand') and isinstance(arr, Index): raise ValueError("return_type='frame' is not supported for string " "methods on Index") + if return_type in ('series', 'index', 'frame'): + warnings.warn(("'series', 'index' and 'frame' are deprecated. Please use 'same' or 'expand' instead"), + FutureWarning) if pat is None: if n is None or n == 0: n = -1 diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 7267924baf023..159635da5d505 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1226,7 +1226,6 @@ def test_str_attribute(self): idx.str.split(return_type='frame') with self.assertRaisesRegexp(ValueError, 'not supported'): idx.str.split(return_type='expand') - # test boolean case, should return np.array instead of boolean Index idx = Index(['a1', 'a2', 'b1', 'b2']) expected = np.array([True, True, False, False]) From d2c57daac2e505df0e789b0c7095c99008d244ba Mon Sep 17 00:00:00 2001 From: sreejata Date: Mon, 13 Apr 2015 14:01:09 -0300 Subject: [PATCH 4/4] Adding doc string --- pandas/core/strings.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index aa73d09194248..87169a5314643 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -624,6 +624,7 @@ def str_pad(arr, width, side='left', fillchar=' '): def str_split(arr, pat=None, n=None, return_type='series'): """ + Deprecated: return_types 'series', 'index', 'frame' are now deprecated Split each string (a la re.split) in array by given pattern, propagating NA values @@ -632,8 +633,8 @@ def str_split(arr, pat=None, n=None, return_type='series'): pat : string, default None String or regular expression to split on. If None, splits on whitespace n : int, default None (all) - return_type : {'series', 'index', 'frame', 'same', 'expand'}, default 'series' - If frame or expand, returns a DataFrame (elements are strings) + return_type : {'same', 'expand'}, default 'series' + If expand, returns a DataFrame (elements are strings) If series, index or same, returns the same type as the original object (elements are lists of strings).