@@ -5,7 +5,6 @@ package parsing
5
5
import core .Names ._ , core .Contexts ._ , core .Decorators ._ , util .Spans ._
6
6
import core .StdNames ._ , core .Comments ._
7
7
import util .SourceFile
8
- import java .lang .Character .isDigit
9
8
import util .Chars ._
10
9
import util .{SourcePosition , CharBuffer }
11
10
import util .Spans .Span
@@ -705,6 +704,44 @@ object Scanners {
705
704
recur(lastOffset, false )
706
705
}
707
706
707
+ import Character .{isHighSurrogate , isLowSurrogate , isUnicodeIdentifierPart , isUnicodeIdentifierStart , isValidCodePoint , toCodePoint }
708
+
709
+ // given char (ch) is high surrogate followed by low, codepoint passes predicate.
710
+ // true means supplementary chars were put to buffer.
711
+ // strict to require low surrogate (if not in string literal).
712
/** Tests whether `high` (expected to be the current character `ch`) begins a
 *  supplementary character whose code point satisfies `test`.
 *
 *  Input is consumed only when the result is `true`. On `false` the scanner is
 *  still positioned on `high`, so a caller that falls through to its own
 *  "illegal character" handling reports and skips `high` itself rather than
 *  whatever character happens to follow it.
 *
 *  @param high   the candidate high surrogate
 *  @param test   predicate applied to the combined code point
 *  @param strict if true, a high surrogate that is not followed by a low
 *                surrogate is reported as an error; if false, the lone
 *                surrogate is accepted and put to the buffer by itself
 *  @return true if the surrogate(s) were put to the buffer and consumed
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  isHighSurrogate(high) && {
    // Look at the following char without advancing; nothing is consumed until the pair is accepted.
    val low = lookaheadChar()
    if isLowSurrogate(low) then
      val codepoint = toCodePoint(high, low)
      if !isValidCodePoint(codepoint) then
        error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
        false
      else if test(codepoint) then
        putChar(high)
        putChar(low)
        nextChar()
        nextChar()
        true
      else
        // Well-formed pair that fails the predicate: leave it for the caller to handle.
        false
    else if !strict then
      putChar(high)
      nextChar()
      true
    else
      error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
      false
  }
735
/** Non-consuming check: is `ch` a high surrogate, immediately followed by a
 *  low surrogate, such that the combined code point is valid and satisfies `f`?
 *
 *  @param ch the candidate high surrogate
 *  @param f  predicate applied to the combined code point
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val following = lookaheadChar()
    if !isLowSurrogate(following) then false
    else
      val cp = toCodePoint(ch, following)
      isValidCodePoint(cp) && f(cp)
744
+
708
745
/** read next token, filling TokenData fields of Scanner.
709
746
*/
710
747
protected final def fetchToken (): Unit = {
@@ -831,11 +868,12 @@ object Scanners {
831
868
else ch match {
832
869
case '{' | '[' | ' ' | '\t ' if lookaheadChar() != '\' ' =>
833
870
token = QUOTE
834
- case _ if ! isAtEnd && ( ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
871
+ case _ if ! isAtEnd && ch != SU && ch != CR && ch != LF =>
835
872
val isEmptyCharLit = (ch == '\' ' )
836
873
getLitChar()
837
874
if ch == '\' ' then
838
875
if isEmptyCharLit then error(" empty character literal (use '\\ '' for single quote)" )
876
+ else if litBuf.length != 1 then error(" illegal codepoint in Char constant: " + litBuf.toString.map(c => f " \u ${c.toInt}%04x " ).mkString(" '" , " " , " '" )) // FIXME format
839
877
else finishCharLit()
840
878
else if isEmptyCharLit then error(" empty character literal" )
841
879
else error(" unclosed character literal" )
@@ -878,9 +916,11 @@ object Scanners {
878
916
def fetchOther () =
879
917
if (ch == '\u21D2 ' ) {
880
918
nextChar(); token = ARROW
919
+ report.deprecationWarning(" The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
881
920
}
882
921
else if (ch == '\u2190 ' ) {
883
922
nextChar(); token = LARROW
923
+ report.deprecationWarning(" The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
884
924
}
885
925
else if (Character .isUnicodeIdentifierStart(ch)) {
886
926
putChar(ch)
@@ -892,9 +932,12 @@ object Scanners {
892
932
nextChar()
893
933
getOperatorRest()
894
934
}
935
+ else if isSupplementary(ch, isUnicodeIdentifierStart) then
936
+ getIdentRest()
895
937
else {
896
- // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
897
- error(" illegal character '\\ u%04x'" .format(ch : Int ))
938
+ // FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
939
+ // error(f"illegal character '\\u$ch%04x'")
940
+ error(f " illegal character ' \u ${ch.toInt}%04x' " )
898
941
nextChar()
899
942
}
900
943
fetchOther()
@@ -1033,11 +1076,12 @@ object Scanners {
1033
1076
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
1034
1077
finishNamed()
1035
1078
case _ =>
1036
- if ( Character . isUnicodeIdentifierPart(ch)) {
1079
+ if isUnicodeIdentifierPart(ch) then
1037
1080
putChar(ch)
1038
1081
nextChar()
1039
1082
getIdentRest()
1040
- }
1083
+ else if isSupplementary(ch, isUnicodeIdentifierPart) then
1084
+ getIdentRest()
1041
1085
else
1042
1086
finishNamed()
1043
1087
}
@@ -1120,7 +1164,7 @@ object Scanners {
1120
1164
}
1121
1165
1122
1166
// for interpolated strings
1123
- @ annotation. tailrec private def getStringPart (multiLine : Boolean ): Unit =
1167
+ @ tailrec private def getStringPart (multiLine : Boolean ): Unit =
1124
1168
if (ch == '"' )
1125
1169
if (multiLine) {
1126
1170
nextRawChar()
@@ -1145,6 +1189,28 @@ object Scanners {
1145
1189
getStringPart(multiLine)
1146
1190
}
1147
1191
else if (ch == '$' ) {
1192
/** Emits the string part collected so far as a STRINGPART token, then scans
 *  the identifier following `$` into `next` as an IDENTIFIER token.
 *
 *  @param hasSupplement true if the identifier's first character is a
 *                       surrogate pair, i.e. occupies two chars
 */
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // Copy the current char to the buffer and step to the next raw char.
  def consume(): Unit =
    putChar(ch)
    nextRawChar()
  // Number of chars taken by the identifier part at the cursor: 1, 2 (surrogate pair) or 0 (none).
  def partWidth: Int =
    if ch != SU && isUnicodeIdentifierPart(ch) then 1
    else if atSupplementary(ch, isUnicodeIdentifierPart) then 2
    else 0

  setStrVal()
  token = STRINGPART
  val identStart = charOffset - 1
  next.lastOffset = identStart
  next.offset = identStart

  // First character of the identifier (already validated by the caller).
  consume()
  if hasSupplement then consume()

  var width = partWidth
  while width > 0 do
    consume()
    if width == 2 then consume()
    width = partWidth
  finishNamedToken(IDENTIFIER, target = next)
end getInterpolatedIdentRest
1213
+
1148
1214
nextRawChar()
1149
1215
if (ch == '$' || ch == '"' ) {
1150
1216
putChar(ch)
@@ -1155,18 +1221,10 @@ object Scanners {
1155
1221
setStrVal()
1156
1222
token = STRINGPART
1157
1223
}
1158
- else if (Character .isUnicodeIdentifierStart(ch) || ch == '_' ) {
1159
- setStrVal()
1160
- token = STRINGPART
1161
- next.lastOffset = charOffset - 1
1162
- next.offset = charOffset - 1
1163
- while
1164
- putChar(ch)
1165
- nextRawChar()
1166
- ch != SU && Character .isUnicodeIdentifierPart(ch)
1167
- do ()
1168
- finishNamedToken(IDENTIFIER , target = next)
1169
- }
1224
+ else if isUnicodeIdentifierStart(ch) || ch == '_' then
1225
+ getInterpolatedIdentRest(hasSupplement = false )
1226
+ else if atSupplementary(ch, isUnicodeIdentifierStart) then
1227
+ getInterpolatedIdentRest(hasSupplement = true )
1170
1228
else
1171
1229
error(" invalid string interpolation: `$$`, `$\" `, `$`ident or `$`BlockExpr expected" )
1172
1230
}
@@ -1212,76 +1270,76 @@ object Scanners {
1212
1270
false
1213
1271
}
1214
1272
1215
- /** copy current character into litBuf, interpreting any escape sequences,
1216
- * and advance to next character.
1273
+ /** Copy current character into litBuf, interpreting any escape sequences,
1274
+ * and advance to next character. Surrogate pairs are consumed (see check
1275
+ * at fetchSingleQuote), but orphan surrogate is allowed.
1217
1276
*/
1218
1277
protected def getLitChar (): Unit =
1219
- def invalidUnicodeEscape () = {
1220
- error(" invalid character in unicode escape sequence" , charOffset - 1 )
1221
- putChar(ch)
1222
- }
1223
- def putUnicode (): Unit = {
1224
- while ch == 'u' || ch == 'U' do nextChar()
1225
- var i = 0
1226
- var cp = 0
1227
- while (i < 4 ) {
1228
- val shift = (3 - i) * 4
1229
- val d = digit2int(ch, 16 )
1230
- if (d < 0 ) {
1231
- return invalidUnicodeEscape()
1232
- }
1233
- cp += (d << shift)
1234
- nextChar()
1235
- i += 1
1236
- }
1237
- putChar(cp.asInstanceOf [Char ])
1238
- }
1239
- if (ch == '\\ ' ) {
1278
+ if ch == '\\ ' then
1240
1279
nextChar()
1241
- if ('0' <= ch && ch <= '7' ) {
1242
- val start = charOffset - 2
1243
- val leadch : Char = ch
1244
- var oct : Int = digit2int(ch, 8 )
1245
- nextChar()
1246
- if ('0' <= ch && ch <= '7' ) {
1247
- oct = oct * 8 + digit2int(ch, 8 )
1248
- nextChar()
1249
- if (leadch <= '3' && '0' <= ch && ch <= '7' ) {
1250
- oct = oct * 8 + digit2int(ch, 8 )
1251
- nextChar()
1252
- }
1253
- }
1254
- val alt = if oct == LF then raw " \n " else f " \u $oct%04x "
1255
- error(s " octal escape literals are unsupported: use $alt instead " , start)
1256
- putChar(oct.toChar)
1257
- }
1258
- else if (ch == 'u' || ch == 'U' ) {
1259
- putUnicode()
1260
- }
1261
- else {
1262
- ch match {
1263
- case 'b' => putChar('\b ' )
1264
- case 't' => putChar('\t ' )
1265
- case 'n' => putChar('\n ' )
1266
- case 'f' => putChar('\f ' )
1267
- case 'r' => putChar('\r ' )
1268
- case '\" ' => putChar('\" ' )
1269
- case '\' ' => putChar('\' ' )
1270
- case '\\ ' => putChar('\\ ' )
1271
- case _ => invalidEscape()
1272
- }
1273
- nextChar()
1274
- }
1275
- }
1276
- else {
1280
+ charEscape()
1281
+ else if ! isSupplementary(ch, _ => true , strict = false ) then
1277
1282
putChar(ch)
1278
1283
nextChar()
1279
- }
1280
1284
1281
- protected def invalidEscape (): Unit = {
1285
/** Handles the character after a backslash: puts the character the escape
 *  denotes and leaves the scanner just past the escape sequence.
 *
 *  Single-character escapes (and the invalid-escape fallback) still have the
 *  escape letter as the current char, so it is skipped here; `uEscape` and
 *  `octalEscape` advance on their own and signal that by returning false.
 */
private def charEscape(): Unit =
  // Put the translated char; the escape letter itself still needs skipping.
  def simple(translated: Char): Boolean =
    putChar(translated)
    true
  val needsAdvance = ch match
    case 'b'  => simple('\b')
    case 't'  => simple('\t')
    case 'n'  => simple('\n')
    case 'f'  => simple('\f')
    case 'r'  => simple('\r')
    case '\"' => simple('\"')
    case '\'' => simple('\'')
    case '\\' => simple('\\')
    case 'u' | 'U' => uEscape()
    case d if d >= '0' && d <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      true
  if needsAdvance then nextChar()
end charEscape
1302
+
1303
/** Reads a unicode escape: any run of `u`/`U` followed by four hex digits,
 *  and puts the resulting char. If a non-hex character is met, an error is
 *  reported, that character is put as-is and scanning of the escape stops
 *  without consuming it.
 *
 *  @return always false: the caller must not advance again
 */
private def uEscape(): Boolean =
  while ch == 'u' || ch == 'U' do nextChar()
  // Accumulate the remaining hex digits, most significant first.
  @tailrec def readHex(remaining: Int, acc: Int): Unit =
    if remaining == 0 then putChar(acc.toChar)
    else
      val nibble = digit2int(ch, 16)
      if nibble < 0 then
        error("invalid character in unicode escape sequence", charOffset - 1)
        putChar(ch)
      else
        nextChar()
        readHex(remaining - 1, (acc << 4) | nibble)
  readHex(4, 0)
  false
end uEscape
1321
+
1322
/** Reads an octal escape of one to three digits (a third digit is taken only
 *  when the first is at most '3'), reports it as unsupported with a suggested
 *  replacement, and puts the char it denotes.
 *
 *  @return always false: the caller must not advance again
 */
private def octalEscape(): Boolean =
  val start = charOffset - 2
  def atOctalDigit: Boolean = '0' <= ch && ch <= '7'
  val first: Char = ch
  var code: Int = digit2int(ch, 8)
  nextChar()
  // Fold the current octal digit into the value and step past it.
  def absorb(): Unit =
    code = code * 8 + digit2int(ch, 8)
    nextChar()
  if atOctalDigit then
    absorb()
    if first <= '3' && atOctalDigit then absorb()
  val alt = if code == LF then raw"\n" else f"\u$code%04x"
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(code.toChar)
  false
end octalEscape
1339
+
1340
/** Reports an invalid escape at the position just before `charOffset` and
 *  puts the current character unchanged.
 */
protected def invalidEscape(): Unit =
  val escapeOffset = charOffset - 1
  error("invalid escape character", escapeOffset)
  putChar(ch)
1284
- }
1285
1343
1286
1344
private def getLitChars (delimiter : Char ) =
1287
1345
while (ch != delimiter && ! isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1364,25 +1422,22 @@ object Scanners {
1364
1422
setStrVal()
1365
1423
}
1366
1424
1367
- private def finishCharLit (): Unit = {
1425
/** Completes a character literal: steps past the current character (callers
 *  invoke this when `ch` is the closing quote), marks the token as CHARLIT
 *  and calls `setStrVal()`.
 */
private def finishCharLit(): Unit = {
  nextChar()
  token = CHARLIT
  setStrVal()
}
1371
- }
1372
1429
1373
1430
/** Parse character literal if current character is followed by \',
1374
1431
* or follow with given op and return a symbol literal token
1375
1432
*/
1376
- def charLitOr (op : => Token ): Unit = {
1433
+ def charLitOr (op : => Token ): Unit =
1377
1434
putChar(ch)
1378
1435
nextChar()
1379
- if ( ch == '\' ' ) finishCharLit()
1380
- else {
1436
+ if ch == '\' ' then finishCharLit()
1437
+ else
1381
1438
token = op
1382
1439
strVal = if (name != null ) name.toString else null
1383
1440
litBuf.clear()
1384
- }
1385
- }
1386
1441
1387
1442
override def toString : String =
1388
1443
showTokenDetailed(token) + {
0 commit comments