Skip to content

[Parser] Cleanup UnicodeScalar handling #2516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 48 additions & 53 deletions Sources/SwiftParser/Lexer/Cursor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ struct CharacterByte: ExpressibleByUnicodeScalarLiteral, ExpressibleByIntegerLit
let value: UInt8

init(unicodeScalarLiteral value: Unicode.Scalar) {
self.value = UInt8(ascii: Unicode.Scalar(unicodeScalarLiteral: value))
self.value = UInt8(ascii: value)
}

init(integerLiteral value: UInt8) {
Expand Down Expand Up @@ -964,11 +964,11 @@ extension Lexer.Cursor {
return Lexer.Result(.endOfFile)
default:
var tmp = self
if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierStartCodePoint }) {
if tmp.advance(if: { $0.isValidIdentifierStartCodePoint }) {
return self.lexIdentifier()
}

if tmp.advance(if: { Unicode.Scalar($0).isOperatorStartCodePoint }) {
if tmp.advance(if: { $0.isOperatorStartCodePoint }) {
return self.lexOperatorIdentifier(
sourceBufferStart: sourceBufferStart,
preferRegexOverBinaryOperator: preferRegexOverBinaryOperator
Expand Down Expand Up @@ -1009,7 +1009,7 @@ extension Lexer.Cursor {
private mutating func lexAfterClosingStringQuote() -> Lexer.Result {
switch self.peek() {
case "#":
self.advance(while: { $0 == Unicode.Scalar("#") })
self.advance(while: { $0 == "#" })
return Lexer.Result(.rawStringPoundDelimiter, stateTransition: .pop)
case nil:
return Lexer.Result(.endOfFile)
Expand All @@ -1028,7 +1028,7 @@ extension Lexer.Cursor {
/// number of '#' is correct because otherwise `isAtStringInterpolationAnchor`
/// would have returned false in `lexInStringLiteral` and we wouldn't have
/// transitioned to the `afterBackslashOfStringInterpolation` state.
self.advance(while: { $0 == Unicode.Scalar("#") })
self.advance(while: { $0 == "#" })
return Lexer.Result(.rawStringPoundDelimiter)
case "(":
_ = self.advance()
Expand Down Expand Up @@ -1248,9 +1248,7 @@ extension Lexer.Cursor {
)
}

self.advance(while: {
($0 >= Unicode.Scalar("0") && $0 <= Unicode.Scalar("7")) || $0 == Unicode.Scalar("_")
})
self.advance(while: { ($0 >= "0" && $0 <= "7") || $0 == "_" })

let tmp = self
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
Expand Down Expand Up @@ -1279,9 +1277,7 @@ extension Lexer.Cursor {
)
}

self.advance(while: {
$0 == Unicode.Scalar("0") || $0 == Unicode.Scalar("1") || $0 == Unicode.Scalar("_")
})
self.advance(while: { $0 == "0" || $0 == "1" || $0 == "_" })

let tmp = self
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
Expand All @@ -1298,7 +1294,7 @@ extension Lexer.Cursor {

// Handle a leading [0-9]+, lexing an integer or falling through if we have a
// floating point value.
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
self.advance(while: { $0.isDigit || $0 == "_" })

// TODO: This can probably be unified with lexHexNumber somehow

Expand Down Expand Up @@ -1333,7 +1329,7 @@ extension Lexer.Cursor {
// Lex decimal point.
if self.advance(matching: ".") {
// Lex any digits after the decimal point.
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
self.advance(while: { $0.isDigit || $0 == "_" })
}

// Lex exponent.
Expand Down Expand Up @@ -1364,7 +1360,7 @@ extension Lexer.Cursor {
)
}

self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
self.advance(while: { $0.isDigit || $0 == "_" })

let tmp = self
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
Expand Down Expand Up @@ -1401,7 +1397,7 @@ extension Lexer.Cursor {
}
}

self.advance(while: { $0.isHexDigit || $0 == Unicode.Scalar("_") })
self.advance(while: { $0.isHexDigit || $0 == "_" })

if self.isAtEndOfFile || self.is(notAt: ".", "p", "P") {
let tmp = self
Expand Down Expand Up @@ -1429,7 +1425,7 @@ extension Lexer.Cursor {
return Lexer.Result(.integerLiteral)
}

self.advance(while: { $0.isHexDigit || $0 == Unicode.Scalar("_") })
self.advance(while: { $0.isHexDigit || $0 == "_" })

if self.isAtEndOfFile || self.is(notAt: "p", "P") {
if let peeked = self.peek(at: 1), !Unicode.Scalar(peeked).isDigit {
Expand Down Expand Up @@ -1486,7 +1482,7 @@ extension Lexer.Cursor {
)
}

self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
self.advance(while: { $0.isDigit || $0 == "_" })

let tmp = self
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
Expand Down Expand Up @@ -1545,8 +1541,8 @@ extension Lexer.Cursor {
case success(Unicode.Scalar)

/// An escaped character, e.g. `\n` or `\u{1234}`. It has been validated that
/// this is a valid character
case validatedEscapeSequence(Character)
/// this is a valid unicode scalar.
case validatedEscapeSequence(Unicode.Scalar)

/// The end of a string literal has been reached.
case endOfString
Expand Down Expand Up @@ -1605,16 +1601,11 @@ extension Lexer.Cursor {
case "\\": // Escapes.
_ = self.advance()
if !self.advanceIfStringDelimiter(delimiterLength: delimiterLength) {
return .success(Unicode.Scalar("\\"))
return .success("\\")
}
switch self.lexEscapedCharacter(isMultilineString: stringLiteralKind == .multiLine) {
case .success(let escapedCharacterCode):
// Check to see if the encoding is valid.
if let validatedScalar = Unicode.Scalar(escapedCharacterCode) {
return .validatedEscapeSequence(Character(validatedScalar))
} else {
return .error(.invalidEscapeSequenceInStringLiteral)
}
case .success(let codePoint):
return .validatedEscapeSequence(codePoint)
case .error(let kind):
return .error(kind)
}
Expand All @@ -1635,7 +1626,7 @@ extension Lexer.Cursor {
enum EscapedCharacterLex {
// Successfully lexed an escape sequence that represents the Unicode character
// at the given code point.
case success(UInt32)
case success(Unicode.Scalar)
case error(TokenDiagnostic.Kind)
}

Expand All @@ -1649,13 +1640,13 @@ extension Lexer.Cursor {
// Escape processing. We already ate the "\".
switch self.peek() {
// Simple single-character escapes.
case "0": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\0")))
case "n": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\n")))
case "r": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\r")))
case "t": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\t")))
case #"""#: _ = self.advance(); return .success(UInt32(UInt8(ascii: #"""#)))
case "'": _ = self.advance(); return .success(UInt32(UInt8(ascii: "'")))
case "\\": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\\")))
case "0": _ = self.advance(); return .success("\0")
case "n": _ = self.advance(); return .success("\n")
case "r": _ = self.advance(); return .success("\r")
case "t": _ = self.advance(); return .success("\t")
case #"""#: _ = self.advance(); return .success(#"""#)
case "'": _ = self.advance(); return .success("'")
case "\\": _ = self.advance(); return .success("\\")

case "u": // e.g. \u{1234}
_ = self.advance()
Expand All @@ -1667,7 +1658,7 @@ extension Lexer.Cursor {
return self.lexUnicodeEscape()
case "\n", "\r":
if isMultilineString && self.maybeConsumeNewlineEscape() {
return .success(UInt32(UInt8(ascii: "\n")))
return .success("\n")
}
return .error(.invalidEscapeSequenceInStringLiteral)
case nil:
Expand All @@ -1692,24 +1683,30 @@ extension Lexer.Cursor {
precondition(quoteConsumed)

let digitStart = self
var numDigits = 0
while self.advance(if: { $0.isHexDigit }) {
numDigits += 1
}
self.advance(while: { $0.isHexDigit })

let digitText = SyntaxText(
baseAddress: digitStart.pointer,
count: digitStart.distance(to: self)
)

guard self.advance(matching: "}") else {
return .error(.expectedClosingBraceInUnicodeEscape)
}

if numDigits == 0 || numDigits > 8 {
guard 1 <= digitText.count && digitText.count <= 8 else {
return .error(.invalidNumberOfHexDigitsInUnicodeEscape)
}

if let codePoint = UInt32(String(decoding: digitStart.input[0..<numDigits], as: UTF8.self), radix: 16) {
return .success(codePoint)
} else {
guard
// FIXME: Implement 'UInt32(_: SyntaxText, radix:)'.
let codePoint = UInt32(String(syntaxText: digitText), radix: 16),
let scalar = Unicode.Scalar.init(codePoint)
else {
return .error(.invalidEscapeSequenceInStringLiteral)
}

return .success(scalar)
}

private mutating func maybeConsumeNewlineEscape() -> Bool {
Expand All @@ -1719,7 +1716,7 @@ extension Lexer.Cursor {
case " ", "\t":
continue
case "\r":
_ = tmp.advance(if: { $0 == Unicode.Scalar("\n") })
_ = tmp.advance(if: { $0 == "\n" })
fallthrough
case "\n":
self = tmp
Expand Down Expand Up @@ -1776,7 +1773,7 @@ extension Lexer.Cursor {
// Scan ahead until the end of the line. Every time we see a closing
// quote, check if it is followed by the correct number of closing delimiters.
while isSingleLineString.is(notAt: "\r", "\n") {
if isSingleLineString.advance(if: { $0 == Unicode.Scalar((#"""#)) }) {
if isSingleLineString.advance(if: { $0 == #"""# }) {
if isSingleLineString.advanceIfStringDelimiter(delimiterLength: leadingDelimiterLength) {
return Lexer.Result(.stringQuote, stateTransition: stateTransitionAfterLexingStringQuote(kind: .singleLine))
}
Expand Down Expand Up @@ -2238,7 +2235,7 @@ extension Lexer.Cursor {
case .error:
// If the character was incorrectly encoded, give up.
return nil
case .endOfString, .success(Unicode.Scalar(0x201D)):
case .endOfString, .success("\u{201D}"):
// If we found a closing quote, then we're done. Just return the spot
// to continue.
return body
Expand All @@ -2262,10 +2259,10 @@ extension Lexer.Cursor {
precondition(!(self.peekScalar()?.isValidIdentifierStartCodePoint ?? false) && !(self.peekScalar()?.isOperatorStartCodePoint ?? false))
let start = self
var tmp = self
if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint }) {
if tmp.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
// If this is a valid identifier continuation, but not a valid identifier
// start, attempt to recover by eating more continuation characters.
tmp.advance(while: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint })
tmp.advance(while: { $0.isValidIdentifierContinuationCodePoint })
self = tmp
return .lexemeContents(Lexer.Result(.identifier, error: LexingDiagnostic(.invalidIdentifierStartCharacter, position: start)))
}
Expand Down Expand Up @@ -2369,10 +2366,8 @@ extension Lexer.Cursor {
previous: curPtr.input[markerKind.introducer.utf8.count - 1]
)
while !restOfBuffer.isAtEndOfFile {
let terminatorStart = markerKind.terminator.utf8.first!
restOfBuffer.advance(while: { byte in
byte != Unicode.Scalar(terminatorStart)
})
let terminatorStart = markerKind.terminator.unicodeScalars.first!
restOfBuffer.advance(while: { byte in byte != terminatorStart })

guard restOfBuffer.starts(with: markerKind.terminator.utf8) else {
_ = restOfBuffer.advance()
Expand Down
15 changes: 10 additions & 5 deletions Sources/SwiftParser/StringLiteralRepresentedLiteralValue.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,14 @@ extension StringSegmentSyntax {
) {
precondition(!hasError, "appendUnescapedLiteralValue relies on properly parsed literals")

var text = content.text
text.withUTF8 { buffer in
let rawText = content.rawText
if !rawText.contains("\\") {
// Fast path. No escape sequence.
output.append(String(syntaxText: rawText))
return
}
Comment on lines -74 to +79
Copy link
Member Author

@rintaro rintaro Feb 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fast path is an addition. It's probably just faster than always lexing the UTF8 sequence.


rawText.withBuffer { buffer in
var cursor = Lexer.Cursor(input: buffer, previous: 0)

// Put the cursor in the string literal lexing state. This is just
Expand All @@ -88,10 +94,9 @@ extension StringSegmentSyntax {
)

switch lex {
case .success(let scalar):
case .success(let scalar),
.validatedEscapeSequence(let scalar):
output.append(Character(scalar))
case .validatedEscapeSequence(let character):
output.append(character)
case .endOfString, .error:
// We get an error at the end of the string because
// `lexCharacterInStringLiteral` expects the closing quote.
Expand Down