Skip to content

Commit f6f8b27

Browse files
committed
js: improve lexer identifier parsing performance
1 parent decf208 commit f6f8b27

File tree

2 files changed

+36
-8
lines changed

2 files changed

+36
-8
lines changed

internal/js_lexer/js_lexer.go

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1757,23 +1757,49 @@ func (lexer *Lexer) Next() {
17571757
lexer.addRangeError(lexer.Range(), "JSON strings must use double quotes")
17581758
}
17591759

1760+
// Note: This case is hot in profiles
17601761
case '_', '$',
17611762
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
17621763
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
17631764
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
17641765
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z':
1766+
// This is a fast path for long ASCII identifiers. Doing this in a loop
1767+
// first instead of doing "step()" and "IsIdentifierContinue()" like we
1768+
// do after this is noticeably faster in the common case of ASCII-only
1769+
// text. For example, doing this sped up end-to-end consuming of a large
1770+
// TypeScript type declaration file from 97ms to 79ms (around 20% faster).
1771+
contents := lexer.source.Contents
1772+
n := len(contents)
1773+
i := lexer.current
1774+
for i < n {
1775+
c := contents[i]
1776+
if (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9') && c != '_' && c != '$' {
1777+
break
1778+
}
1779+
i++
1780+
}
1781+
lexer.current = i
1782+
1783+
// Now do the slow path for any remaining non-ASCII identifier characters
17651784
lexer.step()
1766-
for IsIdentifierContinue(lexer.codePoint) {
1767-
lexer.step()
1785+
if lexer.codePoint >= 0x80 {
1786+
for IsIdentifierContinue(lexer.codePoint) {
1787+
lexer.step()
1788+
}
17681789
}
1790+
1791+
// If there's a backslash, then we're in the extra-slow (and extra-rare) case
1792+
// where the identifier has embedded escapes
17691793
if lexer.codePoint == '\\' {
17701794
lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier)
1771-
} else {
1772-
lexer.Identifier = lexer.rawIdentifier()
1773-
lexer.Token = Keywords[lexer.Raw()]
1774-
if lexer.Token == 0 {
1775-
lexer.Token = TIdentifier
1776-
}
1795+
break
1796+
}
1797+
1798+
// Otherwise (if there was no escape) we can slice the code verbatim
1799+
lexer.Identifier = lexer.rawIdentifier()
1800+
lexer.Token = Keywords[lexer.Raw()]
1801+
if lexer.Token == 0 {
1802+
lexer.Token = TIdentifier
17771803
}
17781804

17791805
case '\\':

internal/js_lexer/js_lexer_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ func TestIdentifier(t *testing.T) {
133133

134134
expectIdentifier(t, "a\u200C", "a\u200C")
135135
expectIdentifier(t, "a\u200D", "a\u200D")
136+
expectIdentifier(t, "a\u200Cb", "a\u200Cb")
137+
expectIdentifier(t, "a\u200Db", "a\u200Db")
136138
}
137139

138140
func expectNumber(t *testing.T, contents string, expected float64) {

0 commit comments

Comments
 (0)