@@ -696,6 +696,45 @@ object Scanners {
696
696
recur(lastOffset, false )
697
697
}
698
698
699
+ import Character .{isHighSurrogate , isLowSurrogate , isUnicodeIdentifierPart , isUnicodeIdentifierStart , isValidCodePoint , toCodePoint }
700
+
701
/** Renders `c` as a Scala unicode escape: a backslash, the letter `u`, and the
 *  char's value as exactly four lowercase hex digits (left-padded with zeros).
 *  Built by hand rather than with the `f` interpolator.
 */
private def toUnicode(c: Char): String =
  val hex = Integer.toHexString(c.toInt)
  // A Char is at most 0xffff, so `hex` has 1 to 4 digits; keep the last four after padding.
  "\\u" + ("0000" + hex).takeRight(4)
703
+
704
/** Tries to read a supplementary character whose first UTF-16 unit is `high`.
 *
 *  If `high` is not a high surrogate nothing happens and the result is false.
 *  Otherwise `nextChar()` is called and the following char is inspected:
 *   - a low surrogate is stepped over too; the pair is put to the buffer when the
 *     combined code point is valid and passes `test`, else an error is reported;
 *   - anything else is an error when `strict`, otherwise the lone `high` is put
 *     to the buffer.
 *
 *  @param high   the char to examine (the current char at the call sites)
 *  @param test   predicate the combined code point must satisfy
 *  @param strict require a low surrogate (callers pass false inside literals)
 *  @return       true iff chars were put to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  if !isHighSurrogate(high) then false
  else
    nextChar()
    val low = ch
    if isLowSurrogate(low) then
      nextChar()
      val cp = toCodePoint(high, low)
      val accepted = isValidCodePoint(cp) && test(cp)
      if accepted then
        putChar(high)
        putChar(low)
      else
        error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'")
      accepted
    else
      val lenient = !strict
      if lenient then putChar(high)
      else error(s"illegal character '${toUnicode(high)}' missing low surrogate")
      lenient
728
/** Pure lookahead test: is `ch` a high surrogate that, together with the char
 *  returned by `lookaheadChar()`, forms a valid code point accepted by `f`?
 *  Unlike `isSupplementary`, this puts nothing to the buffer and reports no error.
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val following = lookaheadChar()
    if isLowSurrogate(following) then
      val cp = toCodePoint(ch, following)
      isValidCodePoint(cp) && f(cp)
    else false
737
+
699
738
/** read next token, filling TokenData fields of Scanner.
700
739
*/
701
740
protected final def fetchToken (): Unit = {
@@ -822,11 +861,12 @@ object Scanners {
822
861
else ch match {
823
862
case '{' | '[' | ' ' | '\t ' if lookaheadChar() != '\' ' =>
824
863
token = QUOTE
825
- case _ if ! isAtEnd && ( ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
864
+ case _ if ! isAtEnd && ch != SU && ch != CR && ch != LF =>
826
865
val isEmptyCharLit = (ch == '\' ' )
827
866
getLitChar()
828
867
if ch == '\' ' then
829
868
if isEmptyCharLit then error(" empty character literal (use '\\ '' for single quote)" )
869
+ else if litBuf.length != 1 then error(" illegal codepoint in Char constant: " + litBuf.toString.map(toUnicode).mkString(" '" , " " , " '" ))
830
870
else finishCharLit()
831
871
else if isEmptyCharLit then error(" empty character literal" )
832
872
else error(" unclosed character literal" )
@@ -869,9 +909,11 @@ object Scanners {
869
909
def fetchOther () =
870
910
if (ch == '\u21D2 ' ) {
871
911
nextChar(); token = ARROW
912
+ report.deprecationWarning(" The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
872
913
}
873
914
else if (ch == '\u2190 ' ) {
874
915
nextChar(); token = LARROW
916
+ report.deprecationWarning(" The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
875
917
}
876
918
else if (Character .isUnicodeIdentifierStart(ch)) {
877
919
putChar(ch)
@@ -883,9 +925,10 @@ object Scanners {
883
925
nextChar()
884
926
getOperatorRest()
885
927
}
928
+ else if isSupplementary(ch, isUnicodeIdentifierStart) then
929
+ getIdentRest()
886
930
else {
887
- // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
888
- error(" illegal character '\\ u%04x'" .format(ch : Int ))
931
+ error(s " illegal character ' ${toUnicode(ch)}' " )
889
932
nextChar()
890
933
}
891
934
fetchOther()
@@ -1024,11 +1067,12 @@ object Scanners {
1024
1067
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
1025
1068
finishNamed()
1026
1069
case _ =>
1027
- if ( Character . isUnicodeIdentifierPart(ch)) {
1070
+ if isUnicodeIdentifierPart(ch) then
1028
1071
putChar(ch)
1029
1072
nextChar()
1030
1073
getIdentRest()
1031
- }
1074
+ else if isSupplementary(ch, isUnicodeIdentifierPart) then
1075
+ getIdentRest()
1032
1076
else
1033
1077
finishNamed()
1034
1078
}
@@ -1111,7 +1155,7 @@ object Scanners {
1111
1155
}
1112
1156
1113
1157
// for interpolated strings
1114
- @ annotation. tailrec private def getStringPart (multiLine : Boolean ): Unit =
1158
+ @ tailrec private def getStringPart (multiLine : Boolean ): Unit =
1115
1159
if (ch == '"' )
1116
1160
if (multiLine) {
1117
1161
nextRawChar()
@@ -1136,6 +1180,28 @@ object Scanners {
1136
1180
getStringPart(multiLine)
1137
1181
}
1138
1182
else if (ch == '$' ) {
1183
/** Scans the identifier of a `$ident` splice inside an interpolated string.
 *
 *  Closes the string part read so far (`setStrVal()`, token `STRINGPART`),
 *  records the identifier's start in `next`, copies the identifier's chars to
 *  the buffer and finishes it as an IDENTIFIER token targeting `next`.
 *
 *  @param hasSupplement true when the identifier starts with a surrogate pair,
 *                       so two chars are copied up front instead of one
 */
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // Copy the current char to the buffer, then step with nextRawChar().
  def take(): Unit =
    putChar(ch)
    nextRawChar()

  @tailrec def loopRest(): Unit =
    if ch != SU && isUnicodeIdentifierPart(ch) then
      take()
      loopRest()
    else if atSupplementary(ch, isUnicodeIdentifierPart) then
      // A supplementary identifier part occupies two chars.
      take()
      take()
      loopRest()
    else
      finishNamedToken(IDENTIFIER, target = next)
  end loopRest

  setStrVal()
  token = STRINGPART
  val identStart = charOffset - 1
  next.lastOffset = identStart
  next.offset = identStart
  take()
  if hasSupplement then take()
  loopRest()
end getInterpolatedIdentRest
1204
+
1139
1205
nextRawChar()
1140
1206
if (ch == '$' || ch == '"' ) {
1141
1207
putChar(ch)
@@ -1146,18 +1212,10 @@ object Scanners {
1146
1212
setStrVal()
1147
1213
token = STRINGPART
1148
1214
}
1149
- else if (Character .isUnicodeIdentifierStart(ch) || ch == '_' ) {
1150
- setStrVal()
1151
- token = STRINGPART
1152
- next.lastOffset = charOffset - 1
1153
- next.offset = charOffset - 1
1154
- while
1155
- putChar(ch)
1156
- nextRawChar()
1157
- ch != SU && Character .isUnicodeIdentifierPart(ch)
1158
- do ()
1159
- finishNamedToken(IDENTIFIER , target = next)
1160
- }
1215
+ else if isUnicodeIdentifierStart(ch) || ch == '_' then
1216
+ getInterpolatedIdentRest(hasSupplement = false )
1217
+ else if atSupplementary(ch, isUnicodeIdentifierStart) then
1218
+ getInterpolatedIdentRest(hasSupplement = true )
1161
1219
else
1162
1220
error(" invalid string interpolation: `$$`, `$\" `, `$`ident or `$`BlockExpr expected" , off = charOffset - 2 )
1163
1221
putChar('$' )
@@ -1205,76 +1263,73 @@ object Scanners {
1205
1263
false
1206
1264
}
1207
1265
1208
- /** copy current character into litBuf, interpreting any escape sequences,
1209
- * and advance to next character.
1266
+ /** Copy current character into litBuf, interpreting any escape sequences,
1267
+ * and advance to next character. Surrogate pairs are consumed (see check
1268
+ * at fetchSingleQuote), but orphan surrogate is allowed.
1210
1269
*/
1211
1270
protected def getLitChar (): Unit =
1212
- def invalidUnicodeEscape () = {
1213
- error(" invalid character in unicode escape sequence" , charOffset - 1 )
1214
- putChar(ch)
1215
- }
1216
- def putUnicode (): Unit = {
1217
- while ch == 'u' || ch == 'U' do nextChar()
1218
- var i = 0
1219
- var cp = 0
1220
- while (i < 4 ) {
1221
- val shift = (3 - i) * 4
1222
- val d = digit2int(ch, 16 )
1223
- if (d < 0 ) {
1224
- return invalidUnicodeEscape()
1225
- }
1226
- cp += (d << shift)
1227
- nextChar()
1228
- i += 1
1229
- }
1230
- putChar(cp.asInstanceOf [Char ])
1231
- }
1232
- if (ch == '\\ ' ) {
1271
+ if ch == '\\ ' then
1233
1272
nextChar()
1234
- if ('0' <= ch && ch <= '7' ) {
1235
- val start = charOffset - 2
1236
- val leadch : Char = ch
1237
- var oct : Int = digit2int(ch, 8 )
1238
- nextChar()
1239
- if ('0' <= ch && ch <= '7' ) {
1240
- oct = oct * 8 + digit2int(ch, 8 )
1241
- nextChar()
1242
- if (leadch <= '3' && '0' <= ch && ch <= '7' ) {
1243
- oct = oct * 8 + digit2int(ch, 8 )
1244
- nextChar()
1245
- }
1246
- }
1247
- val alt = if oct == LF then raw " \n " else f " ${" \\ " }u $oct%04x "
1248
- error(s " octal escape literals are unsupported: use $alt instead " , start)
1249
- putChar(oct.toChar)
1250
- }
1251
- else if (ch == 'u' || ch == 'U' ) {
1252
- putUnicode()
1253
- }
1254
- else {
1255
- ch match {
1256
- case 'b' => putChar('\b ' )
1257
- case 't' => putChar('\t ' )
1258
- case 'n' => putChar('\n ' )
1259
- case 'f' => putChar('\f ' )
1260
- case 'r' => putChar('\r ' )
1261
- case '\" ' => putChar('\" ' )
1262
- case '\' ' => putChar('\' ' )
1263
- case '\\ ' => putChar('\\ ' )
1264
- case _ => invalidEscape()
1265
- }
1266
- nextChar()
1267
- }
1268
- }
1269
- else {
1273
+ charEscape()
1274
+ else if ! isSupplementary(ch, _ => true , strict = false ) then
1270
1275
putChar(ch)
1271
1276
nextChar()
1272
- }
1273
1277
1274
- protected def invalidEscape (): Unit = {
1278
/** Handles the char after a backslash in a literal (the backslash itself has
 *  already been stepped over by the caller).
 *
 *  Single-letter escapes put the char they denote and then call `nextChar()`.
 *  Unicode (`u`/`U`) and octal (`0`-`7`) escapes are delegated to `uEscape` /
 *  `octalEscape`, which do their own advancing. Any other char goes through
 *  `invalidEscape()`, followed by `nextChar()`.
 */
private def charEscape(): Unit =
  // Put the char denoted by a one-letter escape, then step past the letter.
  def simple(denoted: Char): Unit =
    putChar(denoted)
    nextChar()

  ch match
    case 'b'  => simple('\b')
    case 't'  => simple('\t')
    case 'n'  => simple('\n')
    case 'f'  => simple('\f')
    case 'r'  => simple('\r')
    case '\"' => simple('\"')
    case '\'' => simple('\'')
    case '\\' => simple('\\')
    case 'u' | 'U' => uEscape()
    case d if d >= '0' && d <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      nextChar()
end charEscape
1295
+
1296
/** Reads a unicode escape, starting at the `u`/`U` after the backslash.
 *
 *  Skips any run of `u`/`U`, then expects four hex digits and puts the char
 *  they encode. At the first non-hex char an error is reported at
 *  `charOffset - 1`, that char is put to the buffer without being stepped
 *  over, and the digits read so far are dropped.
 */
private def uEscape(): Unit =
  while ch == 'u' || ch == 'U' do nextChar()

  // `acc` holds the digits read so far, most significant first.
  @tailrec def readHex(remaining: Int, acc: Int): Unit =
    if remaining == 0 then putChar(acc.toChar)
    else
      val d = digit2int(ch, 16)
      if d < 0 then
        error("invalid character in unicode escape sequence", charOffset - 1)
        putChar(ch)
      else
        nextChar()
        readHex(remaining - 1, (acc << 4) | d)

  readHex(4, 0)
end uEscape
1313
+
1314
/** Reads an octal escape (one to three octal digits; a third digit is taken
 *  only when the first is at most `3`), always reports it as unsupported, and
 *  still puts the denoted char to the buffer.
 *
 *  The error is positioned at `charOffset - 2` as measured on entry, and
 *  suggests `\n` for a line feed or the equivalent unicode escape otherwise.
 */
private def octalEscape(): Unit =
  def isOctalDigit(c: Char): Boolean = '0' <= c && c <= '7'

  // Fold the current digit into `acc` and step to the next char.
  def shiftIn(acc: Int): Int =
    val widened = acc * 8 + digit2int(ch, 8)
    nextChar()
    widened

  val start = charOffset - 2
  val lead: Char = ch
  val first = shiftIn(0)
  val value =
    if !isOctalDigit(ch) then first
    else
      val second = shiftIn(first)
      if lead <= '3' && isOctalDigit(ch) then shiftIn(second) else second

  val alt = if value == LF then raw"\n" else toUnicode(value.toChar)
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(value.toChar)
end octalEscape
1329
+
1330
+ protected def invalidEscape (): Unit =
1275
1331
error(" invalid escape character" , charOffset - 1 )
1276
1332
putChar(ch)
1277
- }
1278
1333
1279
1334
private def getLitChars (delimiter : Char ) =
1280
1335
while (ch != delimiter && ! isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
0 commit comments