Skip to content

Commit cd9a97f

Browse files
authored
Merge pull request swiftlang#2516 from rintaro/parser-unicodescalar
[Parser] Cleanup UnicodeScalar handling
2 parents a2bc847 + e07ae2b commit cd9a97f

File tree

2 files changed

+58
-58
lines changed

2 files changed

+58
-58
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 48 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ struct CharacterByte: ExpressibleByUnicodeScalarLiteral, ExpressibleByIntegerLit
493493
let value: UInt8
494494

495495
init(unicodeScalarLiteral value: Unicode.Scalar) {
496-
self.value = UInt8(ascii: Unicode.Scalar(unicodeScalarLiteral: value))
496+
self.value = UInt8(ascii: value)
497497
}
498498

499499
init(integerLiteral value: UInt8) {
@@ -964,11 +964,11 @@ extension Lexer.Cursor {
964964
return Lexer.Result(.endOfFile)
965965
default:
966966
var tmp = self
967-
if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierStartCodePoint }) {
967+
if tmp.advance(if: { $0.isValidIdentifierStartCodePoint }) {
968968
return self.lexIdentifier()
969969
}
970970

971-
if tmp.advance(if: { Unicode.Scalar($0).isOperatorStartCodePoint }) {
971+
if tmp.advance(if: { $0.isOperatorStartCodePoint }) {
972972
return self.lexOperatorIdentifier(
973973
sourceBufferStart: sourceBufferStart,
974974
preferRegexOverBinaryOperator: preferRegexOverBinaryOperator
@@ -1009,7 +1009,7 @@ extension Lexer.Cursor {
10091009
private mutating func lexAfterClosingStringQuote() -> Lexer.Result {
10101010
switch self.peek() {
10111011
case "#":
1012-
self.advance(while: { $0 == Unicode.Scalar("#") })
1012+
self.advance(while: { $0 == "#" })
10131013
return Lexer.Result(.rawStringPoundDelimiter, stateTransition: .pop)
10141014
case nil:
10151015
return Lexer.Result(.endOfFile)
@@ -1028,7 +1028,7 @@ extension Lexer.Cursor {
10281028
/// number of '#' is correct because otherwise `isAtStringInterpolationAnchor`
10291029
/// would have returned false in `lexInStringLiteral` and we wouldn't have
10301030
/// transitioned to the `afterBackslashOfStringInterpolation` state.
1031-
self.advance(while: { $0 == Unicode.Scalar("#") })
1031+
self.advance(while: { $0 == "#" })
10321032
return Lexer.Result(.rawStringPoundDelimiter)
10331033
case "(":
10341034
_ = self.advance()
@@ -1248,9 +1248,7 @@ extension Lexer.Cursor {
12481248
)
12491249
}
12501250

1251-
self.advance(while: {
1252-
($0 >= Unicode.Scalar("0") && $0 <= Unicode.Scalar("7")) || $0 == Unicode.Scalar("_")
1253-
})
1251+
self.advance(while: { ($0 >= "0" && $0 <= "7") || $0 == "_" })
12541252

12551253
let tmp = self
12561254
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
@@ -1279,9 +1277,7 @@ extension Lexer.Cursor {
12791277
)
12801278
}
12811279

1282-
self.advance(while: {
1283-
$0 == Unicode.Scalar("0") || $0 == Unicode.Scalar("1") || $0 == Unicode.Scalar("_")
1284-
})
1280+
self.advance(while: { $0 == "0" || $0 == "1" || $0 == "_" })
12851281

12861282
let tmp = self
12871283
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
@@ -1298,7 +1294,7 @@ extension Lexer.Cursor {
12981294

12991295
// Handle a leading [0-9]+, lexing an integer or falling through if we have a
13001296
// floating point value.
1301-
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
1297+
self.advance(while: { $0.isDigit || $0 == "_" })
13021298

13031299
// TODO: This can probably be unified with lexHexNumber somehow
13041300

@@ -1333,7 +1329,7 @@ extension Lexer.Cursor {
13331329
// Lex decimal point.
13341330
if self.advance(matching: ".") {
13351331
// Lex any digits after the decimal point.
1336-
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
1332+
self.advance(while: { $0.isDigit || $0 == "_" })
13371333
}
13381334

13391335
// Lex exponent.
@@ -1364,7 +1360,7 @@ extension Lexer.Cursor {
13641360
)
13651361
}
13661362

1367-
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
1363+
self.advance(while: { $0.isDigit || $0 == "_" })
13681364

13691365
let tmp = self
13701366
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
@@ -1401,7 +1397,7 @@ extension Lexer.Cursor {
14011397
}
14021398
}
14031399

1404-
self.advance(while: { $0.isHexDigit || $0 == Unicode.Scalar("_") })
1400+
self.advance(while: { $0.isHexDigit || $0 == "_" })
14051401

14061402
if self.isAtEndOfFile || self.is(notAt: ".", "p", "P") {
14071403
let tmp = self
@@ -1429,7 +1425,7 @@ extension Lexer.Cursor {
14291425
return Lexer.Result(.integerLiteral)
14301426
}
14311427

1432-
self.advance(while: { $0.isHexDigit || $0 == Unicode.Scalar("_") })
1428+
self.advance(while: { $0.isHexDigit || $0 == "_" })
14331429

14341430
if self.isAtEndOfFile || self.is(notAt: "p", "P") {
14351431
if let peeked = self.peek(at: 1), !Unicode.Scalar(peeked).isDigit {
@@ -1486,7 +1482,7 @@ extension Lexer.Cursor {
14861482
)
14871483
}
14881484

1489-
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
1485+
self.advance(while: { $0.isDigit || $0 == "_" })
14901486

14911487
let tmp = self
14921488
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
@@ -1545,8 +1541,8 @@ extension Lexer.Cursor {
15451541
case success(Unicode.Scalar)
15461542

15471543
/// An escaped character, e.g. `\n` or `\u{1234}`. It has been validated that
1548-
/// this is a valid character
1549-
case validatedEscapeSequence(Character)
1544+
/// this is a valid Unicode scalar.
1545+
case validatedEscapeSequence(Unicode.Scalar)
15501546

15511547
/// The end of a string literal has been reached.
15521548
case endOfString
@@ -1605,16 +1601,11 @@ extension Lexer.Cursor {
16051601
case "\\": // Escapes.
16061602
_ = self.advance()
16071603
if !self.advanceIfStringDelimiter(delimiterLength: delimiterLength) {
1608-
return .success(Unicode.Scalar("\\"))
1604+
return .success("\\")
16091605
}
16101606
switch self.lexEscapedCharacter(isMultilineString: stringLiteralKind == .multiLine) {
1611-
case .success(let escapedCharacterCode):
1612-
// Check to see if the encoding is valid.
1613-
if let validatedScalar = Unicode.Scalar(escapedCharacterCode) {
1614-
return .validatedEscapeSequence(Character(validatedScalar))
1615-
} else {
1616-
return .error(.invalidEscapeSequenceInStringLiteral)
1617-
}
1607+
case .success(let codePoint):
1608+
return .validatedEscapeSequence(codePoint)
16181609
case .error(let kind):
16191610
return .error(kind)
16201611
}
@@ -1635,7 +1626,7 @@ extension Lexer.Cursor {
16351626
enum EscapedCharacterLex {
16361627
// Successfully lexed an escape sequence that represents the Unicode character
16371628
// at the given codepoint
1638-
case success(UInt32)
1629+
case success(Unicode.Scalar)
16391630
case error(TokenDiagnostic.Kind)
16401631
}
16411632

@@ -1649,13 +1640,13 @@ extension Lexer.Cursor {
16491640
// Escape processing. We already ate the "\".
16501641
switch self.peek() {
16511642
// Simple single-character escapes.
1652-
case "0": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\0")))
1653-
case "n": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\n")))
1654-
case "r": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\r")))
1655-
case "t": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\t")))
1656-
case #"""#: _ = self.advance(); return .success(UInt32(UInt8(ascii: #"""#)))
1657-
case "'": _ = self.advance(); return .success(UInt32(UInt8(ascii: "'")))
1658-
case "\\": _ = self.advance(); return .success(UInt32(UInt8(ascii: "\\")))
1643+
case "0": _ = self.advance(); return .success("\0")
1644+
case "n": _ = self.advance(); return .success("\n")
1645+
case "r": _ = self.advance(); return .success("\r")
1646+
case "t": _ = self.advance(); return .success("\t")
1647+
case #"""#: _ = self.advance(); return .success(#"""#)
1648+
case "'": _ = self.advance(); return .success("'")
1649+
case "\\": _ = self.advance(); return .success("\\")
16591650

16601651
case "u": // e.g. \u{1234}
16611652
_ = self.advance()
@@ -1667,7 +1658,7 @@ extension Lexer.Cursor {
16671658
return self.lexUnicodeEscape()
16681659
case "\n", "\r":
16691660
if isMultilineString && self.maybeConsumeNewlineEscape() {
1670-
return .success(UInt32(UInt8(ascii: "\n")))
1661+
return .success("\n")
16711662
}
16721663
return .error(.invalidEscapeSequenceInStringLiteral)
16731664
case nil:
@@ -1692,24 +1683,30 @@ extension Lexer.Cursor {
16921683
precondition(quoteConsumed)
16931684

16941685
let digitStart = self
1695-
var numDigits = 0
1696-
while self.advance(if: { $0.isHexDigit }) {
1697-
numDigits += 1
1698-
}
1686+
self.advance(while: { $0.isHexDigit })
1687+
1688+
let digitText = SyntaxText(
1689+
baseAddress: digitStart.pointer,
1690+
count: digitStart.distance(to: self)
1691+
)
16991692

17001693
guard self.advance(matching: "}") else {
17011694
return .error(.expectedClosingBraceInUnicodeEscape)
17021695
}
17031696

1704-
if numDigits == 0 || numDigits > 8 {
1697+
guard 1 <= digitText.count && digitText.count <= 8 else {
17051698
return .error(.invalidNumberOfHexDigitsInUnicodeEscape)
17061699
}
17071700

1708-
if let codePoint = UInt32(String(decoding: digitStart.input[0..<numDigits], as: UTF8.self), radix: 16) {
1709-
return .success(codePoint)
1710-
} else {
1701+
guard
1702+
// FIXME: Implement 'UInt32(_: SyntaxText, radix:)'.
1703+
let codePoint = UInt32(String(syntaxText: digitText), radix: 16),
1704+
let scalar = Unicode.Scalar.init(codePoint)
1705+
else {
17111706
return .error(.invalidEscapeSequenceInStringLiteral)
17121707
}
1708+
1709+
return .success(scalar)
17131710
}
17141711

17151712
private mutating func maybeConsumeNewlineEscape() -> Bool {
@@ -1719,7 +1716,7 @@ extension Lexer.Cursor {
17191716
case " ", "\t":
17201717
continue
17211718
case "\r":
1722-
_ = tmp.advance(if: { $0 == Unicode.Scalar("\n") })
1719+
_ = tmp.advance(if: { $0 == "\n" })
17231720
fallthrough
17241721
case "\n":
17251722
self = tmp
@@ -1776,7 +1773,7 @@ extension Lexer.Cursor {
17761773
// Scan ahead until the end of the line. Every time we see a closing
17771774
// quote, check if it is followed by the correct number of closing delimiters.
17781775
while isSingleLineString.is(notAt: "\r", "\n") {
1779-
if isSingleLineString.advance(if: { $0 == Unicode.Scalar((#"""#)) }) {
1776+
if isSingleLineString.advance(if: { $0 == #"""# }) {
17801777
if isSingleLineString.advanceIfStringDelimiter(delimiterLength: leadingDelimiterLength) {
17811778
return Lexer.Result(.stringQuote, stateTransition: stateTransitionAfterLexingStringQuote(kind: .singleLine))
17821779
}
@@ -2238,7 +2235,7 @@ extension Lexer.Cursor {
22382235
case .error:
22392236
// If the character was incorrectly encoded, give up.
22402237
return nil
2241-
case .endOfString, .success(Unicode.Scalar(0x201D)):
2238+
case .endOfString, .success("\u{201D}"):
22422239
// If we found a closing quote, then we're done. Just return the spot
22432240
// to continue.
22442241
return body
@@ -2262,10 +2259,10 @@ extension Lexer.Cursor {
22622259
precondition(!(self.peekScalar()?.isValidIdentifierStartCodePoint ?? false) && !(self.peekScalar()?.isOperatorStartCodePoint ?? false))
22632260
let start = self
22642261
var tmp = self
2265-
if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint }) {
2262+
if tmp.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
22662263
// If this is a valid identifier continuation, but not a valid identifier
22672264
// start, attempt to recover by eating more continuation characters.
2268-
tmp.advance(while: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint })
2265+
tmp.advance(while: { $0.isValidIdentifierContinuationCodePoint })
22692266
self = tmp
22702267
return .lexemeContents(Lexer.Result(.identifier, error: LexingDiagnostic(.invalidIdentifierStartCharacter, position: start)))
22712268
}
@@ -2369,10 +2366,8 @@ extension Lexer.Cursor {
23692366
previous: curPtr.input[markerKind.introducer.utf8.count - 1]
23702367
)
23712368
while !restOfBuffer.isAtEndOfFile {
2372-
let terminatorStart = markerKind.terminator.utf8.first!
2373-
restOfBuffer.advance(while: { byte in
2374-
byte != Unicode.Scalar(terminatorStart)
2375-
})
2369+
let terminatorStart = markerKind.terminator.unicodeScalars.first!
2370+
restOfBuffer.advance(while: { byte in byte != terminatorStart })
23762371

23772372
guard restOfBuffer.starts(with: markerKind.terminator.utf8) else {
23782373
_ = restOfBuffer.advance()

Sources/SwiftParser/StringLiteralRepresentedLiteralValue.swift

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,14 @@ extension StringSegmentSyntax {
7171
) {
7272
precondition(!hasError, "appendUnescapedLiteralValue relies on properly parsed literals")
7373

74-
var text = content.text
75-
text.withUTF8 { buffer in
74+
let rawText = content.rawText
75+
if !rawText.contains("\\") {
76+
// Fast path. No escape sequence.
77+
output.append(String(syntaxText: rawText))
78+
return
79+
}
80+
81+
rawText.withBuffer { buffer in
7682
var cursor = Lexer.Cursor(input: buffer, previous: 0)
7783

7884
// Put the cursor in the string literal lexing state. This is just
@@ -88,10 +94,9 @@ extension StringSegmentSyntax {
8894
)
8995

9096
switch lex {
91-
case .success(let scalar):
97+
case .success(let scalar),
98+
.validatedEscapeSequence(let scalar):
9299
output.append(Character(scalar))
93-
case .validatedEscapeSequence(let character):
94-
output.append(character)
95100
case .endOfString, .error:
96101
// We get an error at the end of the string because
97102
// `lexCharacterInStringLiteral` expects the closing quote.

0 commit comments

Comments
 (0)