Merge branch 'str_extract_time_series_bug' of https://github.com/andrewkittredge/pandas into andrewkittredge-str_extract_time_series_bug

jreback · jreback · commit 8efa4ac0d978 · 2014-02-23T17:07:51.000-05:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -171,6 +171,7 @@ Bug Fixes
 - Perf issue in concatting with empty objects (:issue:`3259`)
 - Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:isssue:`6444`)
 - Regression in ``MultiIndex.from_product`` with a ``DatetimeIndex`` as input (:issue:`6439`)
+- Bug in ``str.extract`` discarding the index of a ``Series`` that has a non-default index (:issue:`6348`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -451,11 +451,15 @@ def f(x):
         else:
             return empty_row
     if regex.groups == 1:
-        result = Series([f(val)[0] for val in arr], name=regex.groupindex.get(1))
+        result = Series([f(val)[0] for val in arr], 
+                        name=regex.groupindex.get(1),
+                        index=arr.index)
     else:
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
-        result = DataFrame([f(val) for val in arr], columns=columns)
+        result = DataFrame([f(val) for val in arr], 
+                           columns=columns, 
+                           index=arr.index)
     return result
 
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -548,6 +548,25 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
 
+        # GH6348
+        # str.extract must label its result with the index of the calling Series
+        def check_index(index):
+            # trim the supplied index to len(data), then check both result shapes
+            data = ['A1', 'B2', 'C']
+            index = index[:len(data)]
+            result = Series(data, index=index).str.extract(r'(\d)')
+            exp = Series(['1', '2', NA], index=index)
+            tm.assert_series_equal(result, exp)
+
+            result = Series(data, index=index).str.extract(r'(?P<letter>\D)(?P<number>\d)?')
+            exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'], index=index)
+            tm.assert_frame_equal(result, exp)
+
+        for make_index in [tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
+                           tm.makeDateIndex, tm.makePeriodIndex]:
+            check_index(make_index())
+
+
     def test_get_dummies(self):
         s = Series(['a|b', 'a|c', np.nan])
         result = s.str.get_dummies('|')