Skip to content

WIP: add df.dgrep, df.neighbours #3276

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/source/v0.11.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,12 @@ API changes

- In ``HDFStore``, deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()``

- An experimental ``df.dgrep()`` method, which selects rows by applying a regex or predicate function to *data* values,
  can be enabled via the ``sandbox.dgrep`` option. Note that it is not yet an official part of the API,
  so it is subject to breaking changes. Feedback is welcome (GH2460_).

.. _GH2460: https://github.com/pydata/pandas/issues/2460

Enhancements
~~~~~~~~~~~~

Expand Down
73 changes: 73 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,76 @@ def use_inf_as_null_cb(key):
# Register the 'use_inf_as_null' option under the 'mode' prefix, defaulting to
# False, with use_inf_as_null_cb passed as the option's callback.
with cf.config_prefix('mode'):
    cf.register_option('use_inf_as_null', False, use_inf_as_null_doc,
                       cb=use_inf_as_null_cb)


# Help text passed to cf.register_option() for the 'dgrep' option that is
# registered under the 'sandbox' prefix.
# NOTE(review): the callback registered with that option also installs a
# `neighbours` method, which this text does not mention.
sb_xp_dgrep_doc = """
: boolean
Enables the experimental df.dgrep() method, for selecting series/DataFrame
rows by regex/predicate match against *data* values.

Features exposed via the sandbox are subject to change or removal, and are not
yet part of the official API.

"""

def sandbox(gh_issue_num, msg=None):
    """Build a decorator that announces an experimental feature when enabled.

    The returned decorator wraps an option callback ``cb(key)``. Every time
    the wrapped callback runs it reads the option with ``cf.get_option(key)``
    and, if the value is truthy, prints a notice pointing at the GitHub issue
    before delegating to ``cb``.

    Parameters
    ----------
    gh_issue_num : int
        Issue number interpolated (``%d``) into the feedback URL.
    msg : str, optional
        Extra text appended to the notice after a "P.S." header.

    Returns
    -------
    callable
        A decorator taking ``cb`` and returning the wrapping callback; the
        wrapper returns whatever ``cb(key)`` returns.
    """
    from functools import wraps

    # The notice depends only on the decorator arguments, so it is built once
    # here rather than on every option change.
    notice = """
This is an experimental feature being considered for inclusion in pandas core.
We'd appreciate your feedback on it in the Github issue page:

http://github.com/pydata/pandas/issues/%d

If you find this useful, lacking in major functionality or buggy please
take a moment to let us know, so we can make pandas (even) better.

Thank you,

The Pandas dev team

""" % gh_issue_num

    if msg:
        notice += "P.S.\n\n" + msg

    def inner(cb):
        # wraps() keeps the callback's name and docstring on the wrapper.
        @wraps(cb)
        def f(key):
            # Don't print the message when the option is being turned off.
            if cf.get_option(key):
                print(notice)

            return cb(key)

        return f
    return inner

@sandbox(3276,msg="""
Series/DataFrame now have the `dgrep` and `neighbours` methods.
See the docstrings for usage examples.
""")
def xp_dgrep_cb(key):
    """Install or remove the sandboxed ``dgrep``/``neighbours`` methods.

    Reads the option named by ``key``. When it is truthy, the two functions
    from ``pandas.sandbox.dgrep`` are attached to both ``DataFrame`` and
    ``Series``; otherwise they are removed from both classes again.

    Parameters
    ----------
    key : str
        Name of the option whose value is read with ``cf.get_option``.
    """
    import pandas

    targets = (pandas.DataFrame, pandas.Series)

    if cf.get_option(key):
        from pandas.sandbox.dgrep import dgrep, neighbours
        for klass in targets:
            klass.dgrep = dgrep
            klass.neighbours = neighbours
    else:
        # Remove exactly the attributes installed above. Each deletion is
        # attempted on its own so that one missing attribute (e.g. the option
        # is switched off without ever having been on) does not prevent the
        # others from being cleaned up.
        for klass in targets:
            for name in ("dgrep", "neighbours"):
                try:
                    delattr(klass, name)
                except AttributeError:
                    pass

# Register the 'dgrep' option under the 'sandbox' prefix, off by default and
# validated with is_bool; xp_dgrep_cb is passed as the option's callback.
with cf.config_prefix('sandbox'):
    cf.register_option('dgrep', False, sb_xp_dgrep_doc,
                       validator=is_bool, cb=xp_dgrep_cb)
141 changes: 141 additions & 0 deletions pandas/sandbox/dgrep/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import numpy as np
# TODO, add axis argument
def dgrep(self,pred,cols=None,C=0,B=0,A=0,split=False,keys=True):
    """Select rows by regex match, predicate function or equality, against *data*.

    This is an unindexed operation, and is substantially slower than
    index-based selection for large datasets.

    Parameters
    ----------
    pred : str, callable or any other value
        * str: compiled as a regular expression and searched for in the text
          form of every value.
        * callable ``f(val) -> bool``: called once per value of each selected
          column. If the callable declares ``*args`` and ``self`` is a
          DataFrame, it is instead called once per row with the row's values
          for the selected columns passed as positional arguments.
        * anything else: values are tested for equality against it.
    cols : column name or sequence of column names, optional
        Columns to search when running against a DataFrame (all columns when
        omitted); ignored otherwise.
    C, B, A : int
        grep-like context arguments: rows (A)fter / (B)efore each hit, or a
        (C)entered window. They are passed straight through to
        ``self.neighbours``.
    split : bool
        False returns a single slice of the current object; if context rows
        overlap between matches they only appear once. True returns one
        result per match, similar to the groupby API.
    keys : bool
        Only used when ``split`` is True: True (default) returns a list of
        ``(matched_index_label, sliced_object)`` pairs, False returns a list
        of the sliced objects only.

    Returns
    -------
    Whatever ``self.neighbours`` returns for the matching index labels.

    Usage examples:

    import re
    from pandas.util.testing import makeCustomDataframe as mkdf

    df=mkdf(30,4,r_idx_nlevels=3)
    df.index=range(30)
    df.iloc[5,0] = "supercool"
    df.iloc[6,0] = "supercool"
    df.iloc[29,0] = "supercool"
    df.iloc[15,1] = "supercool"
    df.iloc[17,2] = "supercool"
    # accepts colname and regex string
    df.dgrep(".cool$","C_l0_g0")

    df.dgrep(".cool$",["C_l0_g0","C_l0_g1"])
    # specifying C=2 (or A/B=) does a grep context , providing
    # context lines around the hit
    # NB overlapping context lines do not cause line duplication (*)
    df.dgrep(".cool$",["C_l0_g0"],C=2)
    # also accepts lambda
    # NB, last match is at end, so only previous line of context displayed
    df.dgrep(lambda x: bool(re.search(".cool$",x)),["C_l0_g0"],C=3)
    # split=True returns a series of (index_label_matched, dataframe)
    # pairs, similar to groupby
    # NB some lines appear in more than one group in this case (*)
    df.dgrep(".cool$",["C_l0_g0"],split=True,C=3)

    # works on series too
    df.C_l0_g0.dgrep(".cool$",C=3)

    # can also get the values "applied" onto the function
    # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])

    # which also works with *args
    df.dgrep(lambda *args: "supercool" in args,df.columns[:3])
    """
    from pandas import DataFrame
    from pandas.core.common import _is_sequence
    import inspect
    import re

    # Text types differ between Python 2 and 3; resolve them once.
    try:
        string_types, to_text = basestring, unicode
    except NameError:
        string_types, to_text = str, str

    is_frame = isinstance(self, DataFrame)

    if cols is None and is_frame:
        # No columns requested: search every column instead of looking up a
        # column literally named None.
        cols = list(self.columns)
    elif _is_sequence(cols):
        cols = list(cols)  # convert index to list, from slice such as df.columns[:3]
    elif not isinstance(cols, (list, tuple)):
        cols = [cols]

    # `combine` means: call the predicate once per row with all selected
    # column values, rather than once per individual value.
    combine = False
    if callable(pred):
        matches = pred
        getspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
        try:
            combine = bool(getspec(pred).varargs)
        except (TypeError, ValueError):
            # Signature not introspectable (e.g. some builtins): treat it as
            # a plain per-value predicate.
            combine = False
    elif isinstance(pred, string_types):
        search = re.compile(pred).search

        def matches(value):
            return bool(search(to_text(value)))
    else:
        # Match non-string values by equality. The target is bound to its own
        # name so the comparison is against the value the caller passed in.
        target = pred

        def matches(value):
            return value == target

    positions = set()
    if is_frame and combine:
        row_hits = self[cols].apply(lambda row: bool(matches(*row)), axis=1)
        positions.update(np.where(np.asarray(row_hits, dtype=bool))[0].tolist())
    elif is_frame:
        for col in cols:
            positions.update(np.where(self[col].apply(matches))[0].tolist())
    else:
        positions.update(np.where(self.apply(matches))[0].tolist())

    # An explicit integer array keeps the lookup valid when nothing matched.
    labels = self.index[np.asarray(sorted(positions), dtype=int)]
    return self.neighbours(labels, C=C, B=B, A=A, split=split, keys=keys)

def neighbours(self,labels,C=None,B=None,A=None,split=False,keys=True):
    """Return the rows for ``labels`` together with their surrounding rows.

    Takes a list of index labels and returns one or more frames/series with
    the indicated rows plus context rows, as determined by the
    (A)fter/(B)efore or (C)entered (C)ontext arguments.

    See the `dgrep` docstring for more details about the identical arguments.

    Parameters
    ----------
    labels : iterable of index labels
        Each label is resolved to a position with ``self.index.get_loc``.
    C : int, optional
        Centered window size. When truthy it overrides ``B`` and ``A`` with
        ``B = C // 2`` and ``A = C - B - 1``.
    B, A : int, optional
        Number of rows before / after each hit. Unset values count as 0.
    split : bool
        False returns one object containing every selected row once, in
        positional order. True returns one entry per label.
    keys : bool
        Only used when ``split`` is True: True yields
        ``(label, sliced_object)`` pairs, False yields the sliced objects.

    Returns
    -------
    A slice of ``self`` (``split=False``) or a list as described above.
    """
    if C:
        B = C // 2
        A = C - B - 1

    # Treat an unset bound as "no context on that side".
    before = B or 0
    after = A or 0

    size = len(self.index)
    hits = [self.index.get_loc(label) for label in labels]
    # One clipped positional window per hit.
    windows = [list(range(max(0, pos - before), min(pos + after + 1, size)))
               for pos in hits]

    if split:
        if keys:
            # list of (hit_label, sliced frame)
            return [(self.index[pos], self.iloc[window])
                    for pos, window in zip(hits, windows)]
        return [self.iloc[window] for window in windows]

    # There's just one result: return the sliced object only, with
    # overlapping context rows de-duplicated.
    merged = sorted(set(pos for window in windows for pos in window))
    return self.iloc[merged]
68 changes: 68 additions & 0 deletions pandas/sandbox/dgrep/test_dgrep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# pylint: disable-msg=W0612,E1101

import unittest
import nose

from pandas.util.testing import assert_series_equal

class TestDgrep(unittest.TestCase):
    """Exercise the sandboxed ``dgrep`` method on a small custom frame."""

    def setUp(self):
        import pandas as pd
        from pandas.util.testing import makeCustomDataframe as mkdf

        pd.options.sandbox.dgrep = True  # turn it on
        # Switch the option back off after each test so the global setting
        # does not leak into whatever runs next.
        self.addCleanup(setattr, pd.options.sandbox, "dgrep", False)

        df = mkdf(30, 4, r_idx_nlevels=3)
        df.index = range(30)
        df.iloc[5, 0] = "supercool"
        df.iloc[6, 0] = "supercool"
        df.iloc[29, 0] = "supercool"
        df.iloc[15, 1] = "supercool"
        df.iloc[17, 2] = "supercool"
        self.df = df

    def test_dgrep(self):
        from pandas import Series

        df = self.df

        # accepts colname and regex string
        rs = df.dgrep(".cool$", "C_l0_g0")
        assert_series_equal(rs.C_l0_g0,
                            Series(["supercool"] * 3, index=[5, 6, 29]))

        # accepts lists of cols, can include a series such as df.series_name
        # (convenient for tab completion on columns)
        rs = df.dgrep(".cool$", ['C_l0_g0', 'C_l0_g1'])
        xp = Series(["supercool", "supercool", "R15C0", "supercool"],
                    index=[5, 6, 15, 29])
        assert_series_equal(rs.C_l0_g0, xp)
        self.assertEqual(rs.iloc[2, 1], "supercool")

        # accepts a single named series
        rs = df.dgrep(".cool$", 'C_l0_g1')
        xp = Series(["supercool"], index=[15])
        assert_series_equal(rs.C_l0_g1, xp)

    def test_dgrep_context(self):
        import re
        from pandas import Series

        df = self.df

        # specifying C=2 (or A/B=) does a grep context , providing
        # context lines around the hit
        # NB overlapping context lines do not cause line duplication (*)
        rs = df.dgrep(".cool$", ["C_l0_g0"], C=2)
        xp = Series(['R4C0', 'supercool', 'supercool', 'R28C0', 'supercool'],
                    index=[4, 5, 6, 28, 29])
        assert_series_equal(rs.C_l0_g0, xp)

        # also accepts lambda
        # NB, last match is at end, so only previous line of context displayed
        rs = df.dgrep(lambda x: bool(re.search(".cool$", x)), ["C_l0_g0"], C=3)
        xp = Series(['R4C0', 'supercool', 'supercool', 'R7C0', 'R28C0',
                     'supercool'],
                    index=[4, 5, 6, 7, 28, 29])
        assert_series_equal(xp, rs.C_l0_g0)

    def test_dgrep_split(self):
        from pandas import Series

        # split=True returns a series of (index_label_matched, dataframe)
        # pairs, similar to groupby
        # NB some lines appear in more than one group in this case (*)
        rs = self.df.dgrep(".cool$", ["C_l0_g0"], split=True, C=3)
        self.assertEqual(len(rs), 3)
        xp = Series(['R4C0', 'supercool', 'supercool'], index=[4, 5, 6])
        assert_series_equal(xp, rs[0][1].C_l0_g0)

    def test_dgrep_series(self):
        from pandas import Series

        # works on series too
        s = self.df.C_l0_g0.dgrep(".cool$", C=3)
        xp = Series(['R4C0', 'supercool', 'supercool', 'R7C0', 'R28C0',
                     'supercool'],
                    index=[4, 5, 6, 7, 28, 29])
        assert_series_equal(xp, s)

    def test_dgrep_varargs_predicate(self):
        df = self.df

        # can also get the values "applied" onto the function
        # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])

        # which also works with *args
        # Smoke test only: checks that the call completes, not what it selects.
        df.dgrep(lambda *args: any(["supercool" in x for x in args]),
                 df.columns[:3])