Skip to content

Commit 7b21d77

Browse files
authored
ruleguard/textmatch: an abstraction on top of regexp for performance (#281)
`textmatch.Compile()` takes a regexp pattern and tries to recognize it, returning a matcher that can match the input strings faster than a real `*regexp.Regexp` would. If it can't recognize the pattern, it returns a normal `*regexp.Regexp`. Right now we only optimize the simplest patterns, but it's a first step to prove that we can still use regexp in ruleguard rules and avoid big performance losses.
```
name old time/op new time/op delta
Match/^\p{Lu}_0-8 153ns ± 4% 11ns ± 1% -92.81% (p=0.008 n=5+5)
Match/^\p{Lu}_1-8 140ns ± 2% 11ns ± 0% -92.13% (p=0.008 n=5+5)
Match/^\p{Ll}_0-8 152ns ± 1% 11ns ± 1% -92.77% (p=0.008 n=5+5)
Match/^\p{Ll}_1-8 140ns ± 2% 11ns ± 3% -92.04% (p=0.008 n=5+5)
Match/foo$_0-8 174ns ± 1% 13ns ± 1% -92.26% (p=0.008 n=5+5)
Match/foo$_1-8 83.4ns ± 2% 13.4ns ± 6% -83.96% (p=0.008 n=5+5)
Match/^foo_0-8 135ns ± 0% 10ns ± 1% -92.33% (p=0.016 n=4+5)
Match/^foo_1-8 108ns ± 4% 11ns ± 4% -89.78% (p=0.008 n=5+5)
Match/simpleIdent_0-8 243ns ± 2% 18ns ± 1% -92.51% (p=0.008 n=5+5)
Match/simpleIdent_1-8 92.7ns ± 1% 26.5ns ± 1% -71.43% (p=0.008 n=5+5)
Match/.*simpleIdent.*_0-8 1.59µs ± 2% 0.02µs ± 1% -98.86% (p=0.008 n=5+5)
Match/.*simpleIdent.*_1-8 1.70µs ± 1% 0.03µs ± 1% -98.46% (p=0.008 n=5+5)
Match/simpleIdent_0#01-8 237ns ± 1% 14ns ± 1% -94.03% (p=0.008 n=5+5)
Match/simpleIdent_1#01-8 247ns ± 1% 24ns ± 3% -90.42% (p=0.008 n=5+5)
[Geo mean] 211ns 15ns -93.00%
```
1 parent 4b7bdbb commit 7b21d77

File tree

6 files changed

+356
-6
lines changed

6 files changed

+356
-6
lines changed

ruleguard/filters.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ import (
66
"go/token"
77
"go/types"
88
"path/filepath"
9-
"regexp"
109

1110
"github.com/quasilyte/go-ruleguard/internal/gogrep"
1211
"github.com/quasilyte/go-ruleguard/internal/xtypes"
1312
"github.com/quasilyte/go-ruleguard/nodetag"
1413
"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
14+
"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
1515
"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
1616
)
1717

@@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {
7676
}
7777
}
7878

79-
func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
79+
func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {
8080
return func(params *filterParams) matchFilterResult {
8181
pkgPath := params.ctx.Pkg.Path()
8282
if re.MatchString(pkgPath) {
@@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
8686
}
8787
}
8888

89-
func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {
89+
func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {
9090
return func(params *filterParams) matchFilterResult {
9191
if re.MatchString(filepath.Base(params.filename)) {
9292
return filterSuccess
@@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt
373373
}
374374
}
375375

376-
func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {
376+
func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {
377377
// TODO(quasilyte): add variadic support.
378378
return func(params *filterParams) matchFilterResult {
379379
if re.Match(params.nodeText(params.subNode(varname))) {

ruleguard/ir_loader.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/quasilyte/go-ruleguard/ruleguard/goutil"
1717
"github.com/quasilyte/go-ruleguard/ruleguard/ir"
1818
"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
19+
"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
1920
"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
2021
)
2122

@@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,
409410
return iface, nil
410411
}
411412

412-
func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {
413+
func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {
413414
patternString := l.unwrapStringExpr(filter)
414415
if patternString == "" {
415416
return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")
416417
}
417-
re, err := regexp.Compile(patternString)
418+
re, err := textmatch.Compile(patternString)
418419
if err != nil {
419420
return nil, l.errorf(filter.Line, err, "compile regexp")
420421
}

ruleguard/textmatch/compile.go

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
package textmatch
2+
3+
import (
4+
"regexp"
5+
"regexp/syntax"
6+
"unicode"
7+
)
8+
9+
func compile(s string) (Pattern, error) {
10+
reSyntax, err := syntax.Parse(s, syntax.Perl)
11+
if err == nil {
12+
if optimized := compileOptimized(s, reSyntax); optimized != nil {
13+
return optimized, nil
14+
}
15+
}
16+
return regexp.Compile(s)
17+
}
18+
19+
func compileOptimized(s string, re *syntax.Regexp) Pattern {
20+
// .*
21+
isAny := func(re *syntax.Regexp) bool {
22+
return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
23+
}
24+
// "literal"
25+
isLit := func(re *syntax.Regexp) bool {
26+
return re.Op == syntax.OpLiteral
27+
}
28+
// ^
29+
isBegin := func(re *syntax.Regexp) bool {
30+
return re.Op == syntax.OpBeginText
31+
}
32+
// $
33+
isEnd := func(re *syntax.Regexp) bool {
34+
return re.Op == syntax.OpEndText
35+
}
36+
37+
// TODO: analyze what kind of regexps people use in rules
38+
// more often and optimize those as well.
39+
40+
// lit => strings.Contains($input, lit)
41+
if re.Op == syntax.OpLiteral {
42+
return &containsLiteralMatcher{value: newInputValue(string(re.Rune))}
43+
}
44+
45+
// `.*` lit `.*` => strings.Contains($input, lit)
46+
if re.Op == syntax.OpConcat && len(re.Sub) == 3 {
47+
if isAny(re.Sub[0]) && isLit(re.Sub[1]) && isAny(re.Sub[2]) {
48+
return &containsLiteralMatcher{value: newInputValue(string(re.Sub[1].Rune))}
49+
}
50+
}
51+
52+
// `^` lit => strings.HasPrefix($input, lit)
53+
if re.Op == syntax.OpConcat && len(re.Sub) == 2 {
54+
if isBegin(re.Sub[0]) && isLit(re.Sub[1]) {
55+
return &prefixLiteralMatcher{value: newInputValue(string(re.Sub[1].Rune))}
56+
}
57+
}
58+
59+
// lit `$` => strings.HasSuffix($input, lit)
60+
if re.Op == syntax.OpConcat && len(re.Sub) == 2 {
61+
if isLit(re.Sub[0]) && isEnd(re.Sub[1]) {
62+
return &suffixLiteralMatcher{value: newInputValue(string(re.Sub[0].Rune))}
63+
}
64+
}
65+
66+
// `^` lit `$` => $input == lit
67+
if re.Op == syntax.OpConcat && len(re.Sub) == 3 {
68+
if isBegin(re.Sub[0]) && isLit(re.Sub[1]) && isEnd(re.Sub[2]) {
69+
return &eqLiteralMatcher{value: newInputValue(string(re.Sub[1].Rune))}
70+
}
71+
}
72+
73+
// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
74+
// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
75+
switch s {
76+
case `^\p{Lu}`:
77+
return &prefixRunePredMatcher{pred: unicode.IsUpper}
78+
case `^\p{Ll}`:
79+
return &prefixRunePredMatcher{pred: unicode.IsLower}
80+
}
81+
82+
// Can't optimize.
83+
return nil
84+
}

ruleguard/textmatch/matchers.go

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package textmatch
2+
3+
import (
4+
"bytes"
5+
"strings"
6+
"unicode/utf8"
7+
)
8+
9+
// inputValue stores one piece of text in both of its representations,
// string and []byte.
//
// Keeping the two forms side by side lets Match and MatchString each use
// the representation that fits their argument, without converting
// between string and []byte.
type inputValue struct {
	s string // the text as a string
	b []byte // the same text as bytes
}

// newInputValue builds an inputValue whose s and b fields both hold
// the text of s.
func newInputValue(s string) inputValue {
	var v inputValue
	v.s = s
	v.b = []byte(s)
	return v
}
21+
22+
type containsLiteralMatcher struct{ value inputValue }
23+
24+
func (m *containsLiteralMatcher) MatchString(s string) bool {
25+
return strings.Contains(s, m.value.s)
26+
}
27+
28+
func (m *containsLiteralMatcher) Match(b []byte) bool {
29+
return bytes.Contains(b, m.value.b)
30+
}
31+
32+
type prefixLiteralMatcher struct{ value inputValue }
33+
34+
func (m *prefixLiteralMatcher) MatchString(s string) bool {
35+
return strings.HasPrefix(s, m.value.s)
36+
}
37+
38+
func (m *prefixLiteralMatcher) Match(b []byte) bool {
39+
return bytes.HasPrefix(b, m.value.b)
40+
}
41+
42+
type suffixLiteralMatcher struct{ value inputValue }
43+
44+
func (m *suffixLiteralMatcher) MatchString(s string) bool {
45+
return strings.HasSuffix(s, m.value.s)
46+
}
47+
48+
func (m *suffixLiteralMatcher) Match(b []byte) bool {
49+
return bytes.HasSuffix(b, m.value.b)
50+
}
51+
52+
type eqLiteralMatcher struct{ value inputValue }
53+
54+
func (m *eqLiteralMatcher) MatchString(s string) bool {
55+
return m.value.s == s
56+
}
57+
58+
func (m *eqLiteralMatcher) Match(b []byte) bool {
59+
return bytes.Equal(m.value.b, b)
60+
}
61+
62+
// prefixRunePredMatcher matches an input when pred accepts its first rune.
//
// The first rune is obtained by UTF-8 decoding. For empty or invalid input
// the decoder yields utf8.RuneError, and that is the rune pred receives.
type prefixRunePredMatcher struct{ pred func(rune) bool }

// MatchString applies pred to the first rune of s.
func (p *prefixRunePredMatcher) MatchString(s string) bool {
	first, _ := utf8.DecodeRuneInString(s)
	accepted := p.pred(first)
	return accepted
}

// Match applies pred to the first rune of b.
func (p *prefixRunePredMatcher) Match(b []byte) bool {
	first, _ := utf8.DecodeRune(b)
	accepted := p.pred(first)
	return accepted
}

ruleguard/textmatch/textmatch.go

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package textmatch
2+
3+
import "regexp"
4+
5+
// Pattern is a compiled regular expression that can be matched
// against text given either as bytes or as a string.
type Pattern interface {
	// Match reports whether the byte slice b matches the pattern.
	Match(b []byte) bool

	// MatchString reports whether the string s matches the pattern.
	MatchString(s string) bool
}
10+
11+
// Compile parses a regular expression and returns a compiled
12+
// pattern that can match inputs descriped by the regexp.
13+
//
14+
// Semantically it's close to the regexp.Compile, but
15+
// it does recognize some common patterns and creates
16+
// a more optimized matcher for them.
17+
func Compile(re string) (Pattern, error) {
18+
return compile(re)
19+
}
20+
21+
// IsRegexp reports whether p is implemented using regexp.
22+
// False means that the underlying matcher is something optimized.
23+
func IsRegexp(p Pattern) bool {
24+
_, ok := p.(*regexp.Regexp)
25+
return ok
26+
}

0 commit comments

Comments
 (0)