-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
/
Copy path__init__.py
141 lines (117 loc) · 5.16 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import numpy as np
# TODO, add axis argument
def dgrep(self, pred, cols=None, C=0, B=0, A=0, split=False, keys=True):
    """Select rows by regex match, predicate function or equality, against *data*.

    This is an unindexed operation, and is substantially slower than
    index-based selection for large datasets.

    Parameters
    ----------
    pred : regex string, callable, or any other value
        * string: compiled as a regex and searched for in the text form of
          each value.
        * callable ``f(val) -> bool``: called once per value. If the callable
          declares ``*args`` and `self` is a DataFrame, it is instead called
          once per row, with the row's values for the selected columns
          unpacked as positional arguments.
        * anything else: values are tested for equality against it.
    cols : column name or sequence of column names, optional
        Columns to test when running against a DataFrame (default: all
        columns). Ignored otherwise.
    C, B, A : int
        grep-like context arguments: lines (A)fter / (B)efore a hit, or
        (C)entered (C)ontext. Forwarded unchanged to ``self.neighbours``.
    split : bool
        False returns a slice of the current object; if context lines overlap
        between matches, they will only appear once. True returns a list of
        frames or of (matched_index_label, self_sliced) pairs (default),
        depending on the value of `keys`. Similar to the groupby API.
    keys : bool
        If split is True, keys=False makes the function return a list of
        frames, rather than a list of (label, dataframe) pairs.

    Returns
    -------
    Whatever ``self.neighbours`` returns for the matching index labels.

    Usage examples::

        from pandas.util.testing import makeCustomDataframe as mkdf

        df=mkdf(30,4,r_idx_nlevels=3)
        df.index=range(30)
        df.iloc[5,0] = "supercool"
        df.iloc[6,0] = "supercool"
        df.iloc[29,0] = "supercool"
        df.iloc[15,1] = "supercool"
        df.iloc[17,2] = "supercool"

        # accepts colname and regex string
        df.dgrep(".cool$","C_l0_g0")
        df.dgrep(".cool$",["C_l0_g0","C_l0_g1"])

        # specifying C=2 (or A/B=) does a grep context, providing
        # context lines around the hit
        # NB overlapping context lines do not cause line duplication (*)
        df.dgrep(".cool$",["C_l0_g0"],C=2)

        # also accepts lambda
        # NB, last match is at end, so only previous line of context displayed
        df.dgrep(lambda x: bool(re.search(".cool$",x)),["C_l0_g0"],C=3)

        # split=True returns a series of (index_label_matched, dataframe)
        # pairs, similar to groupby
        # NB some lines appear in more than one group in this case (*)
        df.dgrep(".cool$",["C_l0_g0"],split=True,C=3)

        # works on series too
        df.C_l0_g0.dgrep(".cool$",C=3)

        # can also get the values "applied" onto the function
        # TODO?: df.dgrep(lambda c1,c2: "cool" in c1 or "cool" in c2,df.columns[:2])

        # which also works with *args
        df.dgrep(lambda *args: "supercool" in args,df.columns[:3])
    """
    import inspect
    import re

    from pandas import DataFrame

    # Text types differ between Python 2 and 3; resolve them once.
    try:
        string_types, text_type = basestring, unicode
    except NameError:
        string_types, text_type = str, str

    is_frame = isinstance(self, DataFrame)

    # Normalise `cols` to a plain list of column labels.
    if cols is None:
        cols = list(self.columns) if is_frame else []
    elif isinstance(cols, string_types) or not hasattr(cols, "__iter__"):
        cols = [cols]
    else:
        # e.g. an Index slice such as df.columns[:3]
        cols = list(cols)

    # Build the per-value test under its own name. `pred` is deliberately not
    # rebound: a closure that reads `pred` after it has been reassigned to the
    # closure itself would compare every value against a function.
    combine = False
    if callable(pred):
        test = pred
        getspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
        try:
            combine = bool(getspec(pred).varargs)
        except TypeError:
            # Signature cannot be introspected; treat as single-argument.
            combine = False
    elif isinstance(pred, string_types):
        search = re.compile(pred).search

        def test(value):
            return bool(search(text_type(value)))
    else:
        # can also match non-string values by equality
        def test(value):
            return value == pred

    hits = set()  # positional row numbers of matching rows
    if not is_frame:
        hits.update(np.where(self.apply(test))[0].tolist())
    elif combine:
        # One call per row, the selected columns' values as positional args.
        columns = [self[col] for col in cols]
        hits.update(
            pos for pos, row in enumerate(zip(*columns)) if test(*row)
        )
    else:
        for col in cols:
            hits.update(np.where(self[col].apply(test))[0].tolist())

    return self.neighbours(
        self.index[sorted(hits)], C=C, B=B, A=A, split=split, keys=keys
    )
def neighbours(self, labels, C=None, B=None, A=None, split=False, keys=True):
    """Return the rows for `labels` plus their surrounding context rows.

    Takes a list of index labels and returns one or more frame/series with
    the indicated rows + surrounding rows as determined by the
    (A)fter/(B)efore or (C)entered (C)ontext.

    Parameters
    ----------
    labels : iterable of index labels
        Each label is resolved to a position with ``self.index.get_loc``.
        NOTE(review): assumes labels are unique in the index so that
        ``get_loc`` yields a single integer position - confirm with callers.
    C : int, optional
        Centered context. When truthy it overrides `B` and `A` with
        ``B = C // 2`` and ``A = C - B - 1``.
    B, A : int, optional
        Number of rows to include before / after each hit. ``None`` is
        treated as 0.
    split : bool
        False returns a single slice of `self` in which overlapping context
        rows appear only once. True returns one entry per label.
    keys : bool
        Only used when `split` is true: True yields (hit_label, slice) pairs,
        False yields just the slices.

    See the `dgrep` docstring for more details about the identical arguments.
    """
    if C:
        B = C // 2
        A = C - B - 1
    # The signature defaults are None; arithmetic below needs integers.
    before = B or 0
    after = A or 0

    size = len(self.index)
    positions = [self.index.get_loc(label) for label in labels]

    def window(pos):
        # Context window around `pos`, clipped to the bounds of the index.
        return list(range(max(0, pos - before), min(pos + after + 1, size)))

    if split:
        # One slice per hit; context rows may repeat across groups.
        if keys:
            return [(self.index[pos], self.iloc[window(pos)]) for pos in positions]
        return [self.iloc[window(pos)] for pos in positions]

    # Single slice: merge all windows so overlapping rows appear only once.
    merged = set()
    for pos in positions:
        merged.update(window(pos))
    return self.iloc[sorted(merged)]