diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6a48abb6c6592..a692d1e60fd24 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -325,7 +325,7 @@ Indexing - Bug in indexing non-scalar value from ``Series`` having non-unique ``Index`` will return value flattened (:issue:`17610`) - Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) - Bug in ``__setitem__`` when indexing a :class:`DataFrame` with a 2-d boolean ndarray (:issue:`18582`) - +- Bug in ``str.extractall`` when there were no matches empty :class:`Index` was returned instead of appropriate :class:`MultiIndex` (:issue:`19034`) I/O ^^^ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fab4e77ce4467..3b7ec2ad8a508 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -794,12 +794,10 @@ def str_extractall(arr, pat, flags=0): result_key = tuple(subject_key + (match_i, )) index_list.append(result_key) - if 0 < len(index_list): - from pandas import MultiIndex - index = MultiIndex.from_tuples( - index_list, names=arr.index.names + ["match"]) - else: - index = None + from pandas import MultiIndex + index = MultiIndex.from_tuples( + index_list, names=arr.index.names + ["match"]) + result = arr._constructor_expanddim(match_list, index=index, columns=columns) return result diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 8aa69bcbfdf7f..973fe74429551 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1072,28 +1072,50 @@ def test_extractall_single_group_with_quantifier(self): e = DataFrame(['ab', 'abc', 'd', 'cd'], i) tm.assert_frame_equal(r, e) - def test_extractall_no_matches(self): - s = Series(['a3', 'b3', 'd4c2'], name='series_name') + @pytest.mark.parametrize('data, names', [ + ([], (None, )), + ([], ('i1', )), + ([], (None, 'i2')), + ([], ('i1', 'i2')), + (['a3', 'b3', 'd4c2'], (None, )), + (['a3', 'b3', 'd4c2'], ('i1', 'i2')), + (['a3', 'b3', 'd4c2'], (None, 'i2')), + (['a3', 'b3', 'd4c2'], ('i1', 'i2')), + ]) + def test_extractall_no_matches(self, data, names): + # GH19075 extractall with no matches should return a valid MultiIndex + n = len(data) + if len(names) == 1: + i = Index(range(n), name=names[0]) + else: + a = (tuple([i] * (n - 1)) for i in range(n)) + i = MultiIndex.from_tuples(a, names=names) + s = Series(data, name='series_name', index=i, dtype='object') + ei = MultiIndex.from_tuples([], names=(names + ('match',))) + # one un-named group. r = s.str.extractall('(z)') - e = DataFrame(columns=[0]) + e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) + # two un-named groups. r = s.str.extractall('(z)(z)') - e = DataFrame(columns=[0, 1]) + e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) + # one named group. r = s.str.extractall('(?Pz)') - e = DataFrame(columns=["first"]) + e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) + # two named groups. r = s.str.extractall('(?Pz)(?Pz)') - e = DataFrame(columns=["first", "second"]) + e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) + # one named, one un-named. r = s.str.extractall('(z)(?Pz)') - e = DataFrame(columns=[0, - "second"]) + e = DataFrame(columns=[0, "second"], index=ei) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self):