pandas-dev · ghost · Apr 8, 2013 · Apr 11, 2013 · Apr 11, 2013 · Apr 11, 2013
diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
@@ -233,6 +233,12 @@ API changes
 
   - In ``HDFStore``, deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
 
+  - An experimental new ``df.dgrep()`` method, for selecting rows by applying a regex/predicate function
+    to *data* values, can be enabled via the ``sandbox.dgrep`` option. Note that it is not yet an official
+    part of the API, so it is subject to breaking changes. Feedback is welcome (GH2460_).
+
+.. _GH2460: https://github.com/pydata/pandas/issues/2460
+
 Enhancements
 ~~~~~~~~~~~~
 

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -241,3 +241,76 @@ def use_inf_as_null_cb(key):
 with cf.config_prefix('mode'):
     cf.register_option('use_inf_as_null', False, use_inf_as_null_doc,
                        cb=use_inf_as_null_cb)
+
+# Documentation string passed to register_option for the sandbox 'dgrep' option.
+sb_xp_dgrep_doc = """
+: boolean
+    Enables the experimental df.dgrep() method, for selecting series/DataFrame
+    rows by regex/predicate match against *data* values.
+
+    Features exposed via the sandbox are subject to change or removal, and are not
+    yet part of the official API.
+
+"""
+
+def sandbox(gh_issue_num,msg=None):
+    """Decorator factory for the option callbacks of sandbox features.
+
+    The decorated callback `cb` is still called with `key` and its result
+    returned. Before that, if the option `key` reads as true, a notice asking
+    for feedback on github issue `gh_issue_num` is printed, with `msg` as P.S.
+    """
+    def inner(cb):
+        def f(key):
+            notice = """
+This is an experimental feature being considered for inclusion in pandas core.
+We'd appreciate your feedback on it in the Github issue page:
+
+    http://github.com/pydata/pandas/issues/%d
+
+If you find this useful, lacking in major functionality or buggy please
+take a moment to let us know, so we can make pandas (even) better.
+
+Thank you,
+
+The Pandas dev team
+
+""" % gh_issue_num
+            if msg:
+                notice += "P.S.\n\n" + msg
+            # nothing is printed when the option is being turned off
+            if cf.get_option(key):
+                print(notice)
+            return cb(key)
+        return f
+    return inner
+
+@sandbox(3276,msg="""
+Series/DataFrame now have the `dgrep` and `neighbours` methods.
+See the docstrings for usage examples.
+""")
+def xp_dgrep_cb(key):
+    """Install or remove the sandbox methods on Series and DataFrame.
+
+    Callback of the `sandbox.dgrep` option: if the option reads as true,
+    `dgrep` and `neighbours` are attached to both classes, else detached.
+    """
+    import pandas
+    if cf.get_option(key):
+        from pandas.sandbox.dgrep import dgrep,neighbours
+        for klass in (pandas.DataFrame, pandas.Series):
+            klass.dgrep = dgrep
+            klass.neighbours = neighbours
+    else:
+        for klass in (pandas.DataFrame, pandas.Series):
+            for name in ('dgrep', 'neighbours'):
+                # each removal is tried on its own: the methods are absent
+                # if the option is turned off without ever being turned on
+                try:
+                    delattr(klass, name)
+                except AttributeError:
+                    pass
+
+with cf.config_prefix('sandbox'):
+    cf.register_option('dgrep', False, sb_xp_dgrep_doc,
+                       validator=is_bool, cb=xp_dgrep_cb)
diff --git a/pandas/sandbox/dgrep/__init__.py b/pandas/sandbox/dgrep/__init__.py
@@ -0,0 +1,141 @@
+import numpy as np
+# TODO, add axis argument
+def dgrep(self,pred,cols=None,C=0,B=0,A=0,split=False,keys=True):
+    """Select rows by regex match or predicate function, against *data*.
+
+    This is an unindexed operation, and is substantially slower than
+    index-based selection for large datasets.
+
+    cols : string or sequence of str
+             name or sequence of column names if running against a DataFrame
+             (all columns if omitted), ignored otherwise.
+    pred : string regex or f(val) -> bool or value to test equality against.
+
+             if the predicate function expects *args, the row values for the
+             specified columns will be passed in to the predicate function
+             as positional arguments, one call per row.
+
+    A/B,C : int, grep-like argument, context lines (A)fter/(B)efore or  (C)entered (C)ontext
+    split: bool , False returns a  slice of the current object, if context lines overlap
+                between matches, they will only appear once. a True value will return
+                a list of frames or (matched_index_label, self_sliced) pairs (default),
+                depending on the value of `keys`. Similar to the groupby API.
+    keys: bool, if split==True, keys=False will make the function return
+             a list of frames, rather than a list of (label, dataframe) pairs.
+
+    Usage examples:
+
+    from pandas.util.testing import makeCustomDataframe as mkdf
+    import re
+    df=mkdf(30,4,r_idx_nlevels=3)
+    df.index=range(30)
+    df.iloc[5,0] = "supercool"
+    df.iloc[6,0] = "supercool"
+    df.iloc[29,0] = "supercool"
+    df.iloc[15,1] = "supercool"
+    df.iloc[17,2] = "supercool"
+    # accepts colname and regex string
+    df.dgrep(".cool$","C_l0_g0")
+
+    df.dgrep(".cool$",["C_l0_g0","C_l0_g1"])
+    # specifying C=2 (or A/B=) does a grep context , providing
+    # context lines around the hit
+    # NB overlapping context lines do not cause line duplication (*)
+    df.dgrep(".cool$",["C_l0_g0"],C=2)
+    # also accepts lambda
+    # NB, last match is at end, so only previous line of context displayed
+    df.dgrep(lambda x: bool(re.search(".cool$",x)),["C_l0_g0"],C=3)
+    # split=True returns a series of (index_label_matched, dataframe)
+    # pairs, similar to groupby
+    # NB some lines appear in more than one group in this case (*)
+    df.dgrep(".cool$",["C_l0_g0"],split=True,C=3)
+
+    # works on series too
+    df.C_l0_g0.dgrep(".cool$",C=3)
+
+    # can also get the values "applied" onto the function
+    # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])
+
+    # which also works with *args
+    df.dgrep(lambda *args: "supercool" in args,df.columns[:3])
+    """
+    from pandas import DataFrame
+    from pandas.core.common import _is_sequence
+    import inspect
+
+    is_frame = isinstance(self,DataFrame)
+    if cols is None and is_frame:
+        cols = self.columns   # no columns named: search all of them
+    if _is_sequence(cols):
+        cols = list(cols)   # convert index to list, from slice such as df.columns[:3]
+    if not isinstance(cols,(list,tuple)):
+        cols = [cols]
+
+    # combine: call pred once per row with that row's values as arguments,
+    # rather than once per cell. Only *args predicates are handled this way.
+    # TODO?: also combine for predicates with several named arguments
+    combine=False
+    if callable(pred):
+        fargs=inspect.getargspec(pred)
+        if fargs.varargs:
+            combine=True
+
+    elif isinstance(pred,basestring):
+        import re
+        matcher = re.compile(pred)
+        def f1(x):
+            return bool(matcher.search(unicode(x)))
+        pred=f1
+    else: # can also match non-string values by equality
+        # keep the value under its own name: `pred` is rebound to f2 below
+        target = pred
+        def f2(x):
+            return x == target
+        pred=f2
+
+    # positions (not labels) of the matching rows
+    hits = set()
+    if is_frame and combine:
+        # axis=1 hands pred the values of `cols` for one row at a time
+        vals = self[cols].apply(lambda row: pred(*row),axis=1)
+        hits.update(np.where(vals)[0].tolist())
+    elif is_frame:
+        for col in cols:
+            # a row is a hit as soon as one of its columns matches
+            hits.update(np.where(self[col].apply(pred))[0].tolist())
+    else:
+        hits.update(np.where(self.apply(pred))[0].tolist())
+
+    return self.neighbours(self.index[sorted(hits)],C=C,B=B,A=A,split=split,keys=keys)
+
+def neighbours(self,labels,C=None,B=None,A=None,split=False,keys=True):
+    """Return the rows at `labels` together with their surrounding rows.
+
+    C, if given, overrides B/A with C//2 rows (B)efore and C-C//2-1 rows
+    (A)fter each hit; a B or A left as None counts as 0. Windows are clipped
+    to the bounds of self.index.
+
+    split=False returns a single self.iloc selection in which rows shared by
+    overlapping windows appear once. split=True returns one entry per label:
+    a (label, rows) pair if `keys` is true, otherwise just the rows.
+
+    see the `dgrep` docstring for more details about the identical arguments.
+    """
+    if C:
+        B = C//2
+        A = C-B-1
+    # the defaults are None, which cannot be used in the arithmetic below
+    B = B or 0
+    A = A or 0
+    nrows = len(self.index)
+    def window(pos):
+        # positions of the rows around the hit at position `pos`
+        return list(range(max(0,pos-B),min(pos+A+1,nrows)))
+    hits = [self.index.get_loc(label) for label in labels]
+    if split:
+        groups = [self.iloc[window(pos)] for pos in hits]
+        if keys:
+            return [(self.index[pos],grp) for pos,grp in zip(hits,groups)]
+        return groups
+    # one frame/series: pool every window and drop duplicated positions
+    return self.iloc[sorted(set(pos for hit in hits for pos in window(hit)))]
diff --git a/pandas/sandbox/dgrep/test_dgrep.py b/pandas/sandbox/dgrep/test_dgrep.py
@@ -0,0 +1,68 @@
+# pylint: disable-msg=W0612,E1101
+
+import unittest
+import nose
+
+from pandas.util.testing import assert_series_equal
+
+class TestDgrep(unittest.TestCase):
+    def setUp(self):
+        import pandas as pd
+        pd.options.sandbox.dgrep=True # turn it on
+
+    def tearDown(self):
+        import pandas as pd
+        pd.options.sandbox.dgrep=False # so it does not leak into other tests
+
+    def test_dgrep(self):
+        from pandas import Series as Series
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        import re
+        df=mkdf(30,4,r_idx_nlevels=3)
+        df.index=range(30)
+        df.iloc[5,0] = "supercool"
+        df.iloc[6,0] = "supercool"
+        df.iloc[29,0] = "supercool"
+        df.iloc[15,1] = "supercool"
+        df.iloc[17,2] = "supercool"
+        # accepts colname and regex string
+        rs = df.dgrep(".cool$","C_l0_g0")
+        assert_series_equal(rs.C_l0_g0,Series(["supercool"]*3,index=[5,6,29]))
+        # accepts lists of cols
+        rs = df.dgrep(".cool$",['C_l0_g0','C_l0_g1'])
+        xp = Series(["supercool","supercool","R15C0","supercool"],index=[5,6,15,29])
+        assert_series_equal(rs.C_l0_g0,xp)
+        self.assertEqual(rs.iloc[2,1],"supercool")
+
+        # accepts a single named series
+        rs = df.dgrep(".cool$",'C_l0_g1')
+        xp = Series(["supercool"],index=[15])
+        assert_series_equal(rs.C_l0_g1,xp)
+
+        # C=2 (or A/B=) adds grep-style context lines around each hit
+        # NB overlapping context lines do not cause line duplication (*)
+        rs = df.dgrep(".cool$",["C_l0_g0"],C=2)
+        xp = Series(['R4C0', 'supercool', 'supercool', 'R28C0', 'supercool'],index=[4,5,6,28,29])
+        assert_series_equal(rs.C_l0_g0,xp)
+
+        # also accepts lambda
+        # NB, last match is at end, so only previous line of context displayed
+        rs=df.dgrep(lambda x: bool(re.search(".cool$",x)),["C_l0_g0"],C=3)
+        xp = Series(['R4C0', 'supercool', 'supercool', 'R7C0', 'R28C0', 'supercool'],index=[4,5,6,7,28,29])
+        assert_series_equal(xp,rs.C_l0_g0)
+        # split=True returns a list of (index_label_matched, dataframe) pairs, similar to groupby
+        # NB some lines appear in more than one group in this case (*)
+        rs = df.dgrep(".cool$",["C_l0_g0"],split=True,C=3)
+        self.assertEqual(len(rs),3)
+        xp = Series(['R4C0', 'supercool', 'supercool'],index=[4,5,6])
+        assert_series_equal(xp,rs[0][1].C_l0_g0)
+
+        # works on series too
+        s = df.C_l0_g0.dgrep(".cool$",C=3)
+        xp = Series(['R4C0', 'supercool', 'supercool', 'R7C0', 'R28C0', 'supercool'],index=[4,5,6,7,28,29])
+        assert_series_equal(xp,s)
+
+        # can also get the values "applied" onto the function
+        # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])
+        # which also works with *args
+        df.dgrep(lambda *args: any(["supercool" in x for x in args]),df.columns[:3])