Skip to content

Commit 41f69af

Browse files
committed
Complete regex literal lexing
The previous regex literal lexing logic didn't implement all the heuristics needed to parse all the regex literals that the old C++ parser handles. Update it with the heuristics it needs, and better handle diagnostics for invalid cases. This unfortunately requires a bit of lexical hackery, including a heuristic to classify previous token kinds to determine if we're in regex literal position, but it's needed to handle pathological cases such as `x /^ y/` where there's an ambiguity with a binary operator.
1 parent 21cae1c commit 41f69af

26 files changed: +2160 / -577 lines changed

CodeGeneration/Sources/SyntaxSupport/Classification.swift

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ public let SYNTAX_CLASSIFICATIONS: [SyntaxClassification] = [
5858
SyntaxClassification(name: "ObjectLiteral", description: "An image, color, etc. literal."),
5959
SyntaxClassification(name: "OperatorIdentifier", description: "An identifier referring to an operator."),
6060
SyntaxClassification(name: "PoundDirectiveKeyword", description: "A `#` keyword like `#warning`."),
61+
SyntaxClassification(name: "RegexLiteral", description: "A regex literal, including multiline regex literals."),
6162
SyntaxClassification(name: "StringInterpolationAnchor", description: "The opening and closing parenthesis of string interpolation."),
6263
SyntaxClassification(name: "StringLiteral", description: "A string literal including multiline string literals."),
6364
SyntaxClassification(name: "TypeIdentifier", description: "An identifier referring to a type."),

CodeGeneration/Sources/SyntaxSupport/ExprNodes.swift

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1199,9 +1199,27 @@ public let EXPR_NODES: [Node] = [
11991199
kind: "Expr",
12001200
children: [
12011201
Child(
1202-
name: "Regex",
1203-
kind: .token(choices: [.token(tokenKind: "RegexLiteralToken")])
1204-
)
1202+
name: "OpeningPounds",
1203+
kind: .token(choices: [.token(tokenKind: "ExtendedRegexDelimiterToken")]),
1204+
isOptional: true
1205+
),
1206+
Child(
1207+
name: "OpenSlash",
1208+
kind: .token(choices: [.token(tokenKind: "RegexSlashToken")])
1209+
),
1210+
Child(
1211+
name: "RegexPattern",
1212+
kind: .token(choices: [.token(tokenKind: "RegexLiteralPatternToken")])
1213+
),
1214+
Child(
1215+
name: "CloseSlash",
1216+
kind: .token(choices: [.token(tokenKind: "RegexSlashToken")])
1217+
),
1218+
Child(
1219+
name: "ClosingPounds",
1220+
kind: .token(choices: [.token(tokenKind: "ExtendedRegexDelimiterToken")]),
1221+
isOptional: true
1222+
),
12051223
]
12061224
),
12071225

CodeGeneration/Sources/SyntaxSupport/TokenSpec.swift

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,6 +178,7 @@ public let SYNTAX_TOKENS: [TokenSpec] = [
178178
PunctuatorSpec(name: "Ellipsis", kind: "ellipsis", text: "..."),
179179
PunctuatorSpec(name: "Equal", kind: "equal", text: "=", requiresLeadingSpace: true, requiresTrailingSpace: true),
180180
PunctuatorSpec(name: "ExclamationMark", kind: "exclaim_postfix", text: "!"),
181+
MiscSpec(name: "ExtendedRegexDelimiter", kind: "extended_regex_delimiter", nameForDiagnostics: "extended delimiter", classification: "RegexLiteral"),
181182
LiteralSpec(name: "FloatingLiteral", kind: "floating_literal", nameForDiagnostics: "floating literal", classification: "FloatingLiteral"),
182183
MiscSpec(name: "Identifier", kind: "identifier", nameForDiagnostics: "identifier", classification: "Identifier"),
183184
PunctuatorSpec(name: "InfixQuestionMark", kind: "question_infix", text: "?"),
@@ -202,7 +203,8 @@ public let SYNTAX_TOKENS: [TokenSpec] = [
202203
PunctuatorSpec(name: "PrefixAmpersand", kind: "amp_prefix", text: "&"),
203204
MiscSpec(name: "PrefixOperator", kind: "oper_prefix", nameForDiagnostics: "prefix operator", classification: "OperatorIdentifier"),
204205
MiscSpec(name: "RawStringDelimiter", kind: "raw_string_delimiter", nameForDiagnostics: "raw string delimiter"),
205-
LiteralSpec(name: "RegexLiteral", kind: "regex_literal", nameForDiagnostics: "regex literal"),
206+
MiscSpec(name: "RegexLiteralPattern", kind: "regex_literal_pattern", nameForDiagnostics: "regex pattern", classification: "RegexLiteral"),
207+
PunctuatorSpec(name: "RegexSlash", kind: "regex_slash", text: "/", classification: "RegexLiteral"),
206208
PunctuatorSpec(name: "RightAngle", kind: "r_angle", text: ">"),
207209
PunctuatorSpec(name: "RightBrace", kind: "r_brace", text: "}"),
208210
PunctuatorSpec(name: "RightParen", kind: "r_paren", text: ")"),

Sources/SwiftParser/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ add_swift_host_library(SwiftParser
4242
Lexer/Lexeme.swift
4343
Lexer/LexemeSequence.swift
4444
Lexer/Lexer.swift
45+
Lexer/RegexLiteralLexer.swift
4546
Lexer/UnicodeScalarExtensions.swift
4647
)
4748

Sources/SwiftParser/Declarations.swift

Lines changed: 1 addition & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1315,28 +1315,6 @@ extension Parser {
13151315
}
13161316

13171317
extension Parser {
1318-
/// Are we at a regular expression literal that could act as an operator?
1319-
private mutating func atRegexLiteralThatCouldBeAnOperator() -> Bool {
1320-
guard self.at(.regexLiteral) else {
1321-
return false
1322-
}
1323-
1324-
/// Try to re-lex at regex literal as an operator. If it succeeds and
1325-
/// consumes the entire regex literal, we're done.
1326-
return self.currentToken.tokenText.withBuffer {
1327-
(buffer: UnsafeBufferPointer<UInt8>) -> Bool in
1328-
var cursor = Lexer.Cursor(input: buffer, previous: 0)
1329-
guard buffer[0] == UInt8(ascii: "/") else { return false }
1330-
switch cursor.lexOperatorIdentifier(sourceBufferStart: cursor).tokenKind {
1331-
case .unknown:
1332-
return false
1333-
1334-
default:
1335-
return cursor.input.isEmpty
1336-
}
1337-
}
1338-
}
1339-
13401318
@_spi(RawSyntax)
13411319
public mutating func parseFuncDeclaration(
13421320
_ attrs: DeclAttributes,
@@ -1345,7 +1323,7 @@ extension Parser {
13451323
let (unexpectedBeforeFuncKeyword, funcKeyword) = self.eat(handle)
13461324
let unexpectedBeforeIdentifier: RawUnexpectedNodesSyntax?
13471325
let identifier: RawTokenSyntax
1348-
if self.at(anyIn: Operator.self) != nil || self.at(.exclamationMark, .prefixAmpersand) || self.atRegexLiteralThatCouldBeAnOperator() {
1326+
if self.at(anyIn: Operator.self) != nil || self.at(.exclamationMark, .prefixAmpersand) {
13491327
var name = self.currentToken.tokenText
13501328
if name.count > 1 && name.hasSuffix("<") && self.peek().rawTokenKind == .identifier {
13511329
name = SyntaxText(rebasing: name.dropLast())

Sources/SwiftParser/Expressions.swift

Lines changed: 24 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1170,7 +1170,7 @@ extension Parser {
11701170
)
11711171
case (.rawStringDelimiter, _)?, (.stringQuote, _)?, (.multilineStringQuote, _)?, (.singleQuote, _)?:
11721172
return RawExprSyntax(self.parseStringLiteral())
1173-
case (.regexLiteral, _)?:
1173+
case (.extendedRegexDelimiter, _)?, (.regexSlash, _)?:
11741174
return RawExprSyntax(self.parseRegexLiteral())
11751175
case (.nilKeyword, let handle)?:
11761176
let nilKeyword = self.eat(handle)
@@ -1433,13 +1433,32 @@ extension Parser {
14331433
/// Grammar
14341434
/// =======
14351435
///
1436-
/// regular-expression-literal → '\' `Any valid regular expression characters` '\'
1436+
/// regular-expression-literal → '#'* '/' `Any valid regular expression characters` '/' '#'*
14371437
@_spi(RawSyntax)
14381438
public mutating func parseRegexLiteral() -> RawRegexLiteralExprSyntax {
1439-
let (unexpectedBeforeLiteral, literal) = self.expect(.regexLiteral)
1439+
// See if we have an opening set of pounds.
1440+
let openPounds = self.consume(if: .extendedRegexDelimiter)
1441+
1442+
// Parse the opening slash.
1443+
let (unexpectedBeforeSlash, openSlash) = self.expect(.regexSlash)
1444+
1445+
// Parse the pattern and closing slash, avoiding recovery or leading trivia
1446+
// as the lexer should provide the tokens exactly in order without trivia,
1447+
// otherwise they should be treated as missing.
1448+
let pattern = self.expectWithoutRecoveryOrLeadingTrivia(.regexLiteralPattern)
1449+
let closeSlash = self.expectWithoutRecoveryOrLeadingTrivia(.regexSlash)
1450+
1451+
// Finally, parse a closing set of pounds.
1452+
let (unexpectedBeforeClosePounds, closePounds) = parsePoundDelimiter(.extendedRegexDelimiter, matching: openPounds)
1453+
14401454
return RawRegexLiteralExprSyntax(
1441-
unexpectedBeforeLiteral,
1442-
regex: literal,
1455+
openingPounds: openPounds,
1456+
unexpectedBeforeSlash,
1457+
openSlash: openSlash,
1458+
regexPattern: pattern,
1459+
closeSlash: closeSlash,
1460+
unexpectedBeforeClosePounds,
1461+
closingPounds: closePounds,
14431462
arena: self.arena
14441463
)
14451464
}

0 commit comments

Comments
 (0)