ruleguard/textmatch: an abstraction on top of regexp for performance (#281)

quasilyte · web-flow · commit 7b21d7781baf · 2021-10-15T02:01:56.000+03:00
`textmatch.Compile()` takes a regexp pattern and tries to recognize
it, returning the matcher that can match the input strings faster
than real `*regexp.Regexp` would. If it can't recognize the pattern,
it returns a normal `*regexp.Regexp`.

Right now we only optimize the simplest patterns, but it's a
first step to prove that we can still use regexp in ruleguard
rules and avoid big performance losses.

```
name                       old time/op    new time/op    delta
Match/^\p{Lu}_0-8             153ns ± 4%      11ns ± 1%  -92.81%  (p=0.008 n=5+5)
Match/^\p{Lu}_1-8             140ns ± 2%      11ns ± 0%  -92.13%  (p=0.008 n=5+5)
Match/^\p{Ll}_0-8             152ns ± 1%      11ns ± 1%  -92.77%  (p=0.008 n=5+5)
Match/^\p{Ll}_1-8             140ns ± 2%      11ns ± 3%  -92.04%  (p=0.008 n=5+5)
Match/foo$_0-8                174ns ± 1%      13ns ± 1%  -92.26%  (p=0.008 n=5+5)
Match/foo$_1-8               83.4ns ± 2%    13.4ns ± 6%  -83.96%  (p=0.008 n=5+5)
Match/^foo_0-8                135ns ± 0%      10ns ± 1%  -92.33%  (p=0.016 n=4+5)
Match/^foo_1-8                108ns ± 4%      11ns ± 4%  -89.78%  (p=0.008 n=5+5)
Match/simpleIdent_0-8         243ns ± 2%      18ns ± 1%  -92.51%  (p=0.008 n=5+5)
Match/simpleIdent_1-8        92.7ns ± 1%    26.5ns ± 1%  -71.43%  (p=0.008 n=5+5)
Match/.*simpleIdent.*_0-8    1.59µs ± 2%    0.02µs ± 1%  -98.86%  (p=0.008 n=5+5)
Match/.*simpleIdent.*_1-8    1.70µs ± 1%    0.03µs ± 1%  -98.46%  (p=0.008 n=5+5)
Match/simpleIdent_0#01-8      237ns ± 1%      14ns ± 1%  -94.03%  (p=0.008 n=5+5)
Match/simpleIdent_1#01-8      247ns ± 1%      24ns ± 3%  -90.42%  (p=0.008 n=5+5)
[Geo mean]                    211ns           15ns       -93.00%
```
diff --git a/ruleguard/filters.go b/ruleguard/filters.go
@@ -6,12 +6,12 @@ import (
 	"go/token"
 	"go/types"
 	"path/filepath"
-	"regexp"
 
 	"github.com/quasilyte/go-ruleguard/internal/gogrep"
 	"github.com/quasilyte/go-ruleguard/internal/xtypes"
 	"github.com/quasilyte/go-ruleguard/nodetag"
 	"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
+	"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
 	"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
 )
 
@@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {
 	}
 }
 
-func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
+func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {
 	return func(params *filterParams) matchFilterResult {
 		pkgPath := params.ctx.Pkg.Path()
 		if re.MatchString(pkgPath) {
@@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
 	}
 }
 
-func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {
+func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {
 	return func(params *filterParams) matchFilterResult {
 		if re.MatchString(filepath.Base(params.filename)) {
 			return filterSuccess
@@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt
 	}
 }
 
-func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {
+func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {
 	// TODO(quasilyte): add variadic support.
 	return func(params *filterParams) matchFilterResult {
 		if re.Match(params.nodeText(params.subNode(varname))) {
diff --git a/ruleguard/ir_loader.go b/ruleguard/ir_loader.go
@@ -16,6 +16,7 @@ import (
 	"github.com/quasilyte/go-ruleguard/ruleguard/goutil"
 	"github.com/quasilyte/go-ruleguard/ruleguard/ir"
 	"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
+	"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
 	"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
 )
 
@@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,
 	return iface, nil
 }
 
-func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {
+func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {
 	patternString := l.unwrapStringExpr(filter)
 	if patternString == "" {
 		return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")
 	}
-	re, err := regexp.Compile(patternString)
+	re, err := textmatch.Compile(patternString)
 	if err != nil {
 		return nil, l.errorf(filter.Line, err, "compile regexp")
 	}
diff --git a/ruleguard/textmatch/compile.go b/ruleguard/textmatch/compile.go
@@ -0,0 +1,84 @@
+package textmatch
+
+import (
+	"regexp"
+	"regexp/syntax"
+	"unicode"
+)
+
+// compile builds a Pattern for the regexp source s.
+//
+// The source is first parsed with the Perl syntax flags and offered to
+// compileOptimized. If the parse fails, or no specialized matcher is
+// recognized, the pattern is compiled with regexp.Compile instead.
+//
+// On failure the returned Pattern is a plain nil interface value and the
+// error is the one reported by regexp.Compile.
+func compile(s string) (Pattern, error) {
+	if parsed, err := syntax.Parse(s, syntax.Perl); err == nil {
+		if optimized := compileOptimized(s, parsed); optimized != nil {
+			return optimized, nil
+		}
+	}
+
+	re, err := regexp.Compile(s)
+	if err != nil {
+		// Do not return re here: a nil *regexp.Regexp stored in the
+		// Pattern interface would compare as non-nil to the caller.
+		return nil, err
+	}
+	return re, nil
+}
+
+// compileOptimized tries to map the parsed regexp re, whose source text
+// is s, onto one of the specialized matchers of this package.
+//
+// It returns nil when the pattern is not one of the recognized shapes;
+// the caller is then expected to fall back to a real regexp.
+func compileOptimized(s string, re *syntax.Regexp) Pattern {
+	// .*
+	isAny := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
+	}
+	// "literal"
+	//
+	// A case-insensitive literal (as in `(?i)foo`) is also an OpLiteral
+	// node, but it carries the FoldCase flag. The literal matchers compare
+	// bytes exactly, so such nodes must be left to the regexp engine.
+	isLit := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpLiteral && re.Flags&syntax.FoldCase == 0
+	}
+	// ^
+	isBegin := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpBeginText
+	}
+	// $
+	isEnd := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpEndText
+	}
+
+	// TODO: analyze what kind of regexps people use in rules
+	// more often and optimize those as well.
+
+	// lit => strings.Contains($input, lit)
+	if isLit(re) {
+		return &containsLiteralMatcher{value: newInputValue(string(re.Rune))}
+	}
+
+	if re.Op == syntax.OpConcat {
+		sub := re.Sub
+		switch len(sub) {
+		case 2:
+			// `^` lit => strings.HasPrefix($input, lit)
+			if isBegin(sub[0]) && isLit(sub[1]) {
+				return &prefixLiteralMatcher{value: newInputValue(string(sub[1].Rune))}
+			}
+			// lit `$` => strings.HasSuffix($input, lit)
+			if isLit(sub[0]) && isEnd(sub[1]) {
+				return &suffixLiteralMatcher{value: newInputValue(string(sub[0].Rune))}
+			}
+		case 3:
+			// `.*` lit `.*` => strings.Contains($input, lit)
+			if isAny(sub[0]) && isLit(sub[1]) && isAny(sub[2]) {
+				return &containsLiteralMatcher{value: newInputValue(string(sub[1].Rune))}
+			}
+			// `^` lit `$` => $input == lit
+			if isBegin(sub[0]) && isLit(sub[1]) && isEnd(sub[2]) {
+				return &eqLiteralMatcher{value: newInputValue(string(sub[1].Rune))}
+			}
+		}
+	}
+
+	// These two are recognized by their exact source text rather than
+	// by the shape of the parsed tree.
+	//
+	// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
+	// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
+	switch s {
+	case `^\p{Lu}`:
+		return &prefixRunePredMatcher{pred: unicode.IsUpper}
+	case `^\p{Ll}`:
+		return &prefixRunePredMatcher{pred: unicode.IsLower}
+	}
+
+	// Can't optimize.
+	return nil
+}
diff --git a/ruleguard/textmatch/matchers.go b/ruleguard/textmatch/matchers.go
@@ -0,0 +1,72 @@
+package textmatch
+
+import (
+	"bytes"
+	"strings"
+	"unicode/utf8"
+)
+
+// inputValue is a wrapper for string|[]byte.
+//
+// We hold both values to avoid string->[]byte and vice versa
+// conversions when doing Match and MatchString.
+type inputValue struct {
+	// s is the value in string form.
+	s string
+	// b is the value in []byte form.
+	b []byte
+}
+
+func newInputValue(s string) inputValue {
+	return inputValue{s: s, b: []byte(s)}
+}
+
+// containsLiteralMatcher matches inputs that contain the stored
+// literal at any position.
+type containsLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s contains the literal.
+func (m *containsLiteralMatcher) MatchString(s string) bool {
+	needle := m.value.s
+	return strings.Contains(s, needle)
+}
+
+// Match reports whether b contains the literal.
+func (m *containsLiteralMatcher) Match(b []byte) bool {
+	needle := m.value.b
+	return bytes.Contains(b, needle)
+}
+
+// prefixLiteralMatcher matches inputs that start with the stored literal.
+type prefixLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s begins with the literal.
+func (m *prefixLiteralMatcher) MatchString(s string) bool {
+	prefix := m.value.s
+	return strings.HasPrefix(s, prefix)
+}
+
+// Match reports whether b begins with the literal.
+func (m *prefixLiteralMatcher) Match(b []byte) bool {
+	prefix := m.value.b
+	return bytes.HasPrefix(b, prefix)
+}
+
+// suffixLiteralMatcher matches inputs that end with the stored literal.
+type suffixLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s ends with the literal.
+func (m *suffixLiteralMatcher) MatchString(s string) bool {
+	suffix := m.value.s
+	return strings.HasSuffix(s, suffix)
+}
+
+// Match reports whether b ends with the literal.
+func (m *suffixLiteralMatcher) Match(b []byte) bool {
+	suffix := m.value.b
+	return bytes.HasSuffix(b, suffix)
+}
+
+// eqLiteralMatcher matches inputs that are exactly equal to the
+// stored literal.
+type eqLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s is equal to the literal.
+func (m *eqLiteralMatcher) MatchString(s string) bool {
+	want := m.value.s
+	return s == want
+}
+
+// Match reports whether b is equal to the literal.
+func (m *eqLiteralMatcher) Match(b []byte) bool {
+	want := m.value.b
+	return bytes.Equal(b, want)
+}
+
+// prefixRunePredMatcher matches inputs by applying pred to the
+// first rune decoded from the input.
+type prefixRunePredMatcher struct {
+	pred func(rune) bool
+}
+
+// MatchString decodes the leading rune of s and returns pred's
+// verdict for it. The decoded width is not used.
+func (m *prefixRunePredMatcher) MatchString(s string) bool {
+	first, _ := utf8.DecodeRuneInString(s)
+	return m.pred(first)
+}
+
+// Match decodes the leading rune of b and returns pred's
+// verdict for it. The decoded width is not used.
+func (m *prefixRunePredMatcher) Match(b []byte) bool {
+	first, _ := utf8.DecodeRune(b)
+	return m.pred(first)
+}
diff --git a/ruleguard/textmatch/textmatch.go b/ruleguard/textmatch/textmatch.go
@@ -0,0 +1,26 @@
+package textmatch
+
+import "regexp"
+
+// Pattern is a compiled regular expression.
+//
+// Both methods report whether the input matches the pattern;
+// they differ only in the type of input they accept.
+type Pattern interface {
+	// MatchString reports whether the string s matches the pattern.
+	MatchString(s string) bool
+	// Match reports whether the byte slice b matches the pattern.
+	Match(b []byte) bool
+}
+
+// Compile turns the regular expression source re into a Pattern
+// that matches the inputs the expression describes.
+//
+// It behaves much like regexp.Compile, except that some common
+// pattern shapes are recognized and served by a more optimized
+// matcher instead of a general-purpose regexp.
+func Compile(re string) (Pattern, error) {
+	pattern, err := compile(re)
+	return pattern, err
+}
+
+// IsRegexp reports whether p is implemented using regexp.
+// False means that the underlying matcher is something optimized.
+func IsRegexp(p Pattern) bool {
+	_, ok := p.(*regexp.Regexp)
+	return ok
+}
diff --git a/ruleguard/textmatch/textmatch_test.go b/ruleguard/textmatch/textmatch_test.go

Original file line number	Diff line number	Diff line change
`@@ -6,12 +6,12 @@ import (`
`6`	`6`	`"go/token"`
`7`	`7`	`"go/types"`
`8`	`8`	`"path/filepath"`
`9`		`- "regexp"`
`10`	`9`
`11`	`10`	`"github.com/quasilyte/go-ruleguard/internal/gogrep"`
`12`	`11`	`"github.com/quasilyte/go-ruleguard/internal/xtypes"`
`13`	`12`	`"github.com/quasilyte/go-ruleguard/nodetag"`
`14`	`13`	`"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"`
	`14`	`+ "github.com/quasilyte/go-ruleguard/ruleguard/textmatch"`
`15`	`15`	`"github.com/quasilyte/go-ruleguard/ruleguard/typematch"`
`16`	`16`	`)`
`17`	`17`
`@@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {`
`76`	`76`	`}`
`77`	`77`	`}`
`78`	`78`
`79`		`-func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {`
	`79`	`+func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {`
`80`	`80`	`return func(params *filterParams) matchFilterResult {`
`81`	`81`	`pkgPath := params.ctx.Pkg.Path()`
`82`	`82`	`if re.MatchString(pkgPath) {`
`@@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {`
`86`	`86`	`}`
`87`	`87`	`}`
`88`	`88`
`89`		`-func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {`
	`89`	`+func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {`
`90`	`90`	`return func(params *filterParams) matchFilterResult {`
`91`	`91`	`if re.MatchString(filepath.Base(params.filename)) {`
`92`	`92`	`return filterSuccess`
`@@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt`
`373`	`373`	`}`
`374`	`374`	`}`
`375`	`375`
`376`		`-func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {`
	`376`	`+func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {`
`377`	`377`	`// TODO(quasilyte): add variadic support.`
`378`	`378`	`return func(params *filterParams) matchFilterResult {`
`379`	`379`	`if re.Match(params.nodeText(params.subNode(varname))) {`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ import (`
`16`	`16`	`"github.com/quasilyte/go-ruleguard/ruleguard/goutil"`
`17`	`17`	`"github.com/quasilyte/go-ruleguard/ruleguard/ir"`
`18`	`18`	`"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"`
	`19`	`+ "github.com/quasilyte/go-ruleguard/ruleguard/textmatch"`
`19`	`20`	`"github.com/quasilyte/go-ruleguard/ruleguard/typematch"`
`20`	`21`	`)`
`21`	`22`
`@@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,`
`409`	`410`	`return iface, nil`
`410`	`411`	`}`
`411`	`412`
`412`		`-func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {`
	`413`	`+func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {`
`413`	`414`	`patternString := l.unwrapStringExpr(filter)`
`414`	`415`	`if patternString == "" {`
`415`	`416`	`return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")`
`416`	`417`	`}`
`417`		`- re, err := regexp.Compile(patternString)`
	`418`	`+ re, err := textmatch.Compile(patternString)`
`418`	`419`	`if err != nil {`
`419`	`420`	`return nil, l.errorf(filter.Line, err, "compile regexp")`
`420`	`421`	`}`