diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 9c0a6d5a421c7..5e272fcb1fa48 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -233,6 +233,12 @@ API changes
 
   - In ``HDFStore``, deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``
 
+  - an experimental new df.dgrep() for selecting rows by applying a regex/predicate function to *data* values
+    can be made available via the "sandbox.dgrep" option. Note that this is not an official part of the API yet,
+    so it is subject to breaking changes. Feedback is welcome (GH2460_).
+
+.. _GH2460: https://github.com/pydata/pandas/issues/2460
+
 Enhancements
 ~~~~~~~~~~~~
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 9f599ffe908ba..2cb6abe154f5c 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -241,3 +241,83 @@ def use_inf_as_null_cb(key):
 with cf.config_prefix('mode'):
     cf.register_option('use_inf_as_null', False, use_inf_as_null_doc,
                        cb=use_inf_as_null_cb)
+
+
+sb_xp_dgrep_doc = """
+: boolean
+    Enables the experimental df.dgrep() method, for selecting series/DataFrame
+    rows by regex/predicate match against *data* values.
+
+    Features exposed via the sandbox are subject to change or removal, and are not
+    yet part of the official API.
+
+"""
+
+def sandbox(gh_issue_num,msg=None):
+    """Decorate an option callback so enabling the option prints a notice.
+
+    gh_issue_num : int, GitHub issue number interpolated into the feedback URL
+    msg : str or None, optional extra text appended to the notice as a P.S.
+
+    Returns a decorator; the wrapped callback prints the notice only when
+    the option value is true, then delegates to the original callback.
+    """
+    def inner(cb):
+        def f(key):
+
+            s = """
+This is an experimental feature being considered for inclusion in pandas core.
+We'd appreciate your feedback on it in the Github issue page:
+
+    http://github.com/pydata/pandas/issues/%d
+
+If you find this useful, lacking in major functionality or buggy please
+take a moment to let us know, so we can make pandas (even) better.
+
+Thank you,
+
+The Pandas dev team
+
+""" % gh_issue_num
+
+            if msg:
+                s += "P.S.\n\n" + msg
+
+            # don't print the message when the option is being turned off
+            val = cf.get_option(key)
+            if val:
+                print(s)
+
+            return cb(key)
+
+        return f
+    return inner
+
+@sandbox(3276,msg="""
+Series/DataFrame now have the `dgrep` and `neighbours` methods.
+See the docstrings for usage examples.
+""")
+def xp_dgrep_cb(key):
+    """Install or remove the experimental methods when the option changes."""
+    import pandas
+    val = cf.get_option(key)
+    if val:
+        from pandas.sandbox.dgrep import dgrep,neighbours
+        pandas.DataFrame.dgrep = dgrep
+        pandas.Series.dgrep = dgrep
+        pandas.DataFrame.neighbours = neighbours
+        pandas.Series.neighbours = neighbours
+
+    else:
+        # undo exactly what the branch above installed; AttributeError only
+        # means the method was never attached, so there is nothing to remove
+        for klass in (pandas.DataFrame, pandas.Series):
+            for name in ('dgrep', 'neighbours'):
+                try:
+                    delattr(klass, name)
+                except AttributeError:
+                    pass
+
+with cf.config_prefix('sandbox'):
+    cf.register_option('dgrep', False, sb_xp_dgrep_doc,
+                       validator=is_bool, cb=xp_dgrep_cb)
diff --git a/pandas/sandbox/dgrep/__init__.py b/pandas/sandbox/dgrep/__init__.py
new file mode 100644
index 0000000000000..ea69c317426ad
--- /dev/null
+++ b/pandas/sandbox/dgrep/__init__.py
@@ -0,0 +1,158 @@
+import numpy as np
+# TODO, add axis argument
+def dgrep(self,pred,cols=None,C=0,B=0,A=0,split=False,keys=True):
+    """Select rows by regex match or predicate function, against *data*.
+
+    This is an unindexed operation, and is substantially slower than
+    index-based selection for large datasets.
+
+    cols : string or sequence of str
+        name or sequence of column names if running against a DataFrame
+        (defaults to all columns), ignored otherwise.
+    pred : string regex or f(val) -> bool or value to test equality against.
+
+        if the predicate function accepts *args, the row values for the
+        specified columns will be passed in to the predicate function as
+        positional arguments, one call per row.
+
+    A/B,C : int, grep-like argument, context lines (A)fter/(B)efore or (C)entered (C)ontext
+    split: bool , False returns a slice of the current object, if context lines overlap
+        between matches, they will only appear once. a True value will return
+        a list of frames or (matched_index_label, self_sliced) pairs (default),
+        depending on the value of `keys`. Similar to the groupby API.
+    keys: bool, if split==True, keys=False will make the function return
+        a list of frames, rather than a list of (label, dataframe) pairs.
+
+    Usage examples:
+
+    import re
+    from pandas.util.testing import makeCustomDataframe as mkdf
+
+    df=mkdf(30,4,r_idx_nlevels=3)
+    df.index=range(30)
+    df.iloc[5,0] = "supercool"
+    df.iloc[6,0] = "supercool"
+    df.iloc[29,0] = "supercool"
+    df.iloc[15,1] = "supercool"
+    df.iloc[17,2] = "supercool"
+    # accepts colname and regex string
+    df.dgrep(".cool$","C_l0_g0")
+
+    df.dgrep(".cool$",["C_l0_g0","C_l0_g1"])
+    # specifying C=2 (or A/B=) does a grep context , providing
+    # context lines around the hit
+    # NB overlapping context lines do not cause line duplication (*)
+    df.dgrep(".cool$",["C_l0_g0"],C=2)
+    # also accepts lambda
+    # NB, last match is at end, so only previous line of context displayed
+    df.dgrep(lambda x: bool(re.search(".cool$",x)),["C_l0_g0"],C=3)
+    # split=True returns a list of (index_label_matched, dataframe)
+    # pairs, similar to groupby
+    # NB some lines appear in more than one group in this case (*)
+    df.dgrep(".cool$",["C_l0_g0"],split=True,C=3)
+
+    # works on series too
+    df.C_l0_g0.dgrep(".cool$",C=3)
+
+    # can also get the values "applied" onto the function
+    # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])
+
+    # which also works with *args
+    df.dgrep(lambda *args: "supercool" in args,df.columns[:3])
+    """
+    from pandas import DataFrame
+    from pandas.core.common import _is_sequence
+    import inspect
+
+    if cols is None and isinstance(self,DataFrame):
+        cols = self.columns # no columns given: search all of them
+
+    if _is_sequence(cols):
+        cols = list(cols) # convert index to list, from slice such as df.columns[:3]
+
+    if not isinstance(cols,(list,tuple)):
+        cols = [cols]
+
+    combine=False
+    if callable(pred):
+        try:
+            fargs=inspect.getargspec(pred)
+        except TypeError: # builtins / callable objects can't be introspected
+            fargs=None
+        # TODO?: also unpack row values for predicates with several named args
+        if fargs is not None and fargs.varargs:
+            combine=True
+
+    elif isinstance(pred,basestring):
+        import re
+        matcher = re.compile(pred)
+        def f1(x):
+            return bool(matcher.search(unicode(x)))
+        pred=f1
+    else: # can also match non-string values by equality
+        # keep the target under its own name: `pred` is rebound to f2 below,
+        # so comparing against `pred` inside f2 would compare x to f2 itself
+        target = pred
+        def f2(x):
+            return x == target
+        pred=f2
+
+    indicies = set()
+    if isinstance(self,DataFrame):
+        if combine:
+            # one call per row, the selected columns' values unpacked as
+            # positional arguments
+            vals = self[cols].apply(lambda row: pred(*row),axis=1)
+            indicies.update(np.where(vals)[0].tolist())
+
+        else:
+            for col in cols:
+                vals = np.where(self[col].apply(pred))[0]
+                indicies.update(vals.tolist())
+    else:
+        indicies.update(np.where(self.apply(pred))[0].tolist())
+
+    return self.neighbours(self.index[sorted(indicies)],C=C,B=B,A=A,split=split,keys=keys)
+
+def neighbours(self,labels,C=None,B=None,A=None,split=False,keys=True):
+    """Takes a list of labels and returns one or more frame/series with the indicated
+    rows + surrounding rows as determined by the (A)fter/(B)efore or
+    (C)entered (C)ontext.
+
+    A true C takes precedence over A/B: B=C//2 rows before and A=C-B-1 rows
+    after each hit. A/B left as None mean no context rows on that side.
+
+    see the `dgrep` docstring for more details about the identical arguments.
+
+    """
+    if C:
+        B = C//2
+        A = C-B-1
+    else:
+        # None defaults would otherwise raise TypeError in the arithmetic below
+        B = B or 0
+        A = A or 0
+
+    nrows = len(self.index)
+    def window(pos):
+        # positions of the context rows around a hit, clipped to the object
+        return range(max(0,pos-B),min(pos+A+1,nrows))
+
+    hits = [self.index.get_loc(label) for label in labels]
+    if split:
+        # list of (hit_label,sliced frame), or of bare sliced frames
+        results = []
+        for pos in hits:
+            rows = self.iloc[list(window(pos))]
+            if keys:
+                results.append((self.index[pos],rows))
+            else:
+                results.append(rows)
+        return results
+    else:
+        # a set merges overlapping windows, so no row is returned twice
+        wanted = set()
+        for pos in hits:
+            wanted.update(window(pos))
+        # there's just one, and return just the sliced frame, not the hit label
+        return self.iloc[sorted(wanted)]
diff --git a/pandas/sandbox/dgrep/test_dgrep.py b/pandas/sandbox/dgrep/test_dgrep.py
new file mode 100644
index 0000000000000..d2944890f7fb0
--- /dev/null
+++ b/pandas/sandbox/dgrep/test_dgrep.py
@@ -0,0 +1,73 @@
+# pylint: disable-msg=W0612,E1101
+
+import unittest
+import nose
+
+from pandas.util.testing import assert_series_equal
+
+class TestDgrep(unittest.TestCase):
+    def test_dgrep(self):
+        import pandas as pd
+        from pandas import Series as Series
+        from pandas.util.testing import makeCustomDataframe as mkdf
+
+        import re
+        pd.options.sandbox.dgrep=True # turn it on
+        df=mkdf(30,4,r_idx_nlevels=3)
+        df.index=range(30)
+        df.iloc[5,0] = "supercool"
+        df.iloc[6,0] = "supercool"
+        df.iloc[29,0] = "supercool"
+        df.iloc[15,1] = "supercool"
+        df.iloc[17,2] = "supercool"
+        # accepts colname and regex string
+        rs = df.dgrep(".cool$","C_l0_g0")
+        assert_series_equal(rs.C_l0_g0,Series(["supercool"]*3,index=[5,6,29]))
+        # accepts lists of cols, can include a series such as df.series_name
+        # (convenient for tab completion on columns)
+        rs = df.dgrep(".cool$",['C_l0_g0','C_l0_g1'])
+        xp = Series(["supercool","supercool","R15C0","supercool"],index=[5,6,15,29])
+        assert_series_equal(rs.C_l0_g0,xp)
+        self.assertEqual(rs.iloc[2,1],"supercool")
+
+        # accepts a single named series
+        rs = df.dgrep(".cool$",'C_l0_g1')
+        xp = Series(["supercool"],index=[15])
+        assert_series_equal(rs.C_l0_g1,xp)
+
+
+        # specifying C=2 (or A/B=) does a grep context , providing
+        # context lines around the hit
+        # NB overlapping context lines do not cause line duplication (*)
+        rs = df.dgrep(".cool$",["C_l0_g0"],C=2)
+        xp = Series(['R4C0', 'supercool', 'supercool', 'R28C0', 'supercool'],index=[4,5,6,28,29])
+        assert_series_equal(rs.C_l0_g0,xp)
+
+        # also accepts lambda
+        # NB, last match is at end, so only previous line of context displayed
+        rs=df.dgrep(lambda x: bool(re.search(".cool$",x)),["C_l0_g0"],C=3)
+        xp = Series(['R4C0', 'supercool', 'supercool', 'R7C0', 'R28C0', 'supercool'],index=[4,5,6,7,28,29])
+        assert_series_equal(xp,rs.C_l0_g0)
+        # split=True returns a list of (index_label_matched, dataframe)
+        # pairs, similar to groupby
+        # NB some lines appear in more than one group in this case (*)
+        rs = df.dgrep(".cool$",["C_l0_g0"],split=True,C=3)
+        self.assertEqual(len(rs),3)
+        xp = Series(['R4C0', 'supercool', 'supercool'],index=[4,5,6])
+        assert_series_equal(xp,rs[0][1].C_l0_g0)
+
+        # works on series too
+        s = df.C_l0_g0.dgrep(".cool$",C=3)
+        xp = Series(['R4C0', 'supercool', 'supercool', 'R7C0', 'R28C0', 'supercool'],index=[4,5,6,7,28,29])
+        assert_series_equal(xp,s)
+
+        # can also get the values "applied" onto the function
+        # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])
+
+        # which also works with *args
+        rs = df.dgrep(lambda *args: any(["supercool" in x for x in args]),df.columns[:3])
+        self.assertEqual(list(rs.index),[5,6,15,17,29])
+
+        # a non-string, non-callable predicate is matched by equality
+        s = Series([1,2,3,2])
+        self.assertEqual(list(s.dgrep(2).index),[1,3])