@@ -5,7 +5,6 @@ package parsing
5
5
import core .Names ._ , core .Contexts ._ , core .Decorators ._ , util .Spans ._
6
6
import core .StdNames ._ , core .Comments ._
7
7
import util .SourceFile
8
- import java .lang .Character .isDigit
9
8
import util .Chars ._
10
9
import util .{SourcePosition , CharBuffer }
11
10
import util .Spans .Span
@@ -706,6 +705,44 @@ object Scanners {
706
705
recur(lastOffset, false )
707
706
}
708
707
708
+ import Character .{isHighSurrogate , isLowSurrogate , isUnicodeIdentifierPart , isUnicodeIdentifierStart , isValidCodePoint , toCodePoint }
709
+
710
/** Tests whether `high` (the current character `ch`) is a high surrogate that is
 *  followed by a low surrogate forming a code point accepted by `test`.
 *
 *  On success the accepted characters are passed to `putChar` and the scanner is
 *  advanced past them. On failure nothing is consumed: the next character is only
 *  peeked via `lookaheadChar()`, so the caller's fallback branch still sees `high`
 *  as the current character rather than whatever follows it.
 *
 *  @param high   the candidate high surrogate, expected to be the current character
 *  @param test   predicate applied to the combined code point
 *  @param strict if true, a high surrogate without a low surrogate is reported as an
 *                error; if false (as used for literal contents) the orphan surrogate
 *                is accepted and copied on its own
 *  @return true if one or two characters were put to the buffer and consumed
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  isHighSurrogate(high) && {
    val low = lookaheadChar()
    if isLowSurrogate(low) then
      val codepoint = toCodePoint(high, low)
      if !isValidCodePoint(codepoint) then
        error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
        false
      else if test(codepoint) then
        putChar(high)
        putChar(low)
        // Skip both halves of the pair only once it has been accepted.
        nextChar()
        nextChar()
        true
      else
        // Well-formed pair that fails the predicate: leave it in place for the caller.
        false
    else if !strict then
      putChar(high)
      nextChar()
      true
    else
      error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
      false
  }
736
/** Non-consuming check: is `ch` a high surrogate whose successor (peeked with
 *  `lookaheadChar()`) is a low surrogate, such that the combined code point is
 *  valid and satisfies `f`?
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    // Only peek once we know a pair could start here.
    val following = lookaheadChar()
    if !isLowSurrogate(following) then false
    else
      val combined = toCodePoint(ch, following)
      isValidCodePoint(combined) && f(combined)
745
+
709
746
/** read next token, filling TokenData fields of Scanner.
710
747
*/
711
748
protected final def fetchToken (): Unit = {
@@ -832,11 +869,12 @@ object Scanners {
832
869
else ch match {
833
870
case '{' | '[' | ' ' | '\t ' if lookaheadChar() != '\' ' =>
834
871
token = QUOTE
835
- case _ if ! isAtEnd && ( ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
872
+ case _ if ! isAtEnd && ch != SU && ch != CR && ch != LF =>
836
873
val isEmptyCharLit = (ch == '\' ' )
837
874
getLitChar()
838
875
if ch == '\' ' then
839
876
if isEmptyCharLit then error(" empty character literal (use '\\ '' for single quote)" )
877
+ else if litBuf.length != 1 then error(" illegal codepoint in Char constant: " + litBuf.toString.map(c => f " \u ${c.toInt}%04x " ).mkString(" '" , " " , " '" )) // FIXME format
840
878
else finishCharLit()
841
879
else if isEmptyCharLit then error(" empty character literal" )
842
880
else error(" unclosed character literal" )
@@ -879,9 +917,11 @@ object Scanners {
879
917
def fetchOther () =
880
918
if (ch == '\u21D2 ' ) {
881
919
nextChar(); token = ARROW
920
+ report.deprecationWarning(" The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
882
921
}
883
922
else if (ch == '\u2190 ' ) {
884
923
nextChar(); token = LARROW
924
+ report.deprecationWarning(" The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
885
925
}
886
926
else if (Character .isUnicodeIdentifierStart(ch)) {
887
927
putChar(ch)
@@ -893,9 +933,12 @@ object Scanners {
893
933
nextChar()
894
934
getOperatorRest()
895
935
}
936
+ else if isSupplementary(ch, isUnicodeIdentifierStart) then
937
+ getIdentRest()
896
938
else {
897
- // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
898
- error(" illegal character '\\ u%04x'" .format(ch : Int ))
939
+ // FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
940
+ // error(f"illegal character '\\u$ch%04x'")
941
+ error(f " illegal character ' \u ${ch.toInt}%04x' " )
899
942
nextChar()
900
943
}
901
944
fetchOther()
@@ -1034,11 +1077,12 @@ object Scanners {
1034
1077
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
1035
1078
finishNamed()
1036
1079
case _ =>
1037
- if ( Character . isUnicodeIdentifierPart(ch)) {
1080
+ if isUnicodeIdentifierPart(ch) then
1038
1081
putChar(ch)
1039
1082
nextChar()
1040
1083
getIdentRest()
1041
- }
1084
+ else if isSupplementary(ch, isUnicodeIdentifierPart) then
1085
+ getIdentRest()
1042
1086
else
1043
1087
finishNamed()
1044
1088
}
@@ -1121,7 +1165,7 @@ object Scanners {
1121
1165
}
1122
1166
1123
1167
// for interpolated strings
1124
- @ annotation. tailrec private def getStringPart (multiLine : Boolean ): Unit =
1168
+ @ tailrec private def getStringPart (multiLine : Boolean ): Unit =
1125
1169
if (ch == '"' )
1126
1170
if (multiLine) {
1127
1171
nextRawChar()
@@ -1146,6 +1190,28 @@ object Scanners {
1146
1190
getStringPart(multiLine)
1147
1191
}
1148
1192
else if (ch == '$' ) {
1193
/** Closes the current string part and scans the identifier after `$` into `next`.
 *
 *  @param hasSupplement true when the identifier starts with a surrogate pair, in
 *                       which case two characters are copied for the first code point
 */
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // Copy the current character to the buffer and step to the next raw character.
  def take(): Unit =
    putChar(ch)
    nextRawChar()

  setStrVal()
  token = STRINGPART
  next.lastOffset = charOffset - 1
  next.offset = charOffset - 1

  // Identifier start: one char, or both halves of a surrogate pair.
  take()
  if hasSupplement then take()

  // Identifier rest: BMP identifier parts or supplementary identifier parts.
  var more = true
  while more do
    if ch != SU && isUnicodeIdentifierPart(ch) then
      take()
    else if atSupplementary(ch, isUnicodeIdentifierPart) then
      take()
      take()
    else
      more = false
  finishNamed(target = next)
end getInterpolatedIdentRest
1214
+
1149
1215
nextRawChar()
1150
1216
if (ch == '$' || ch == '"' ) {
1151
1217
putChar(ch)
@@ -1156,18 +1222,10 @@ object Scanners {
1156
1222
setStrVal()
1157
1223
token = STRINGPART
1158
1224
}
1159
- else if (Character .isUnicodeIdentifierStart(ch) || ch == '_' ) {
1160
- setStrVal()
1161
- token = STRINGPART
1162
- next.lastOffset = charOffset - 1
1163
- next.offset = charOffset - 1
1164
- while
1165
- putChar(ch)
1166
- nextRawChar()
1167
- ch != SU && Character .isUnicodeIdentifierPart(ch)
1168
- do ()
1169
- finishNamed(target = next)
1170
- }
1225
+ else if isUnicodeIdentifierStart(ch) || ch == '_' then
1226
+ getInterpolatedIdentRest(hasSupplement = false )
1227
+ else if atSupplementary(ch, isUnicodeIdentifierStart) then
1228
+ getInterpolatedIdentRest(hasSupplement = true )
1171
1229
else
1172
1230
error(" invalid string interpolation: `$$`, `$\" `, `$`ident or `$`BlockExpr expected" )
1173
1231
}
@@ -1213,76 +1271,76 @@ object Scanners {
1213
1271
false
1214
1272
}
1215
1273
1216
/** Copy the current character into the literal buffer (via `putChar`), interpreting
 *  any escape sequence, and advance to the next character.
 *
 *  A surrogate pair is consumed as a unit, and an orphan high surrogate is accepted
 *  as-is (`strict = false`). NOTE(review): the original comment says the
 *  character-literal scanner checks the resulting buffer length afterwards
 *  ("see check at fetchSingleQuote") -- confirm.
 */
protected def getLitChar(): Unit =
  ch match
    case '\\' =>
      nextChar()
      charEscape()
    case current =>
      val handledAsSurrogate = isSupplementary(current, _ => true, strict = false)
      if !handledAsSurrogate then
        putChar(ch)
        nextChar()
1281
1285
1282
- protected def invalidEscape (): Unit = {
1286
/** Handles the character following a backslash in a literal.
 *
 *  Simple escapes put the denoted character; `u`/`U` and octal digits are delegated
 *  to `uEscape` and `octalEscape`, whose result says whether the scanner still has
 *  to be advanced; anything else goes to `invalidEscape`.
 */
private def charEscape(): Unit =
  val stillNeedsAdvance: Boolean = ch match
    case 'u' | 'U' =>
      uEscape()
    case digit if '0' <= digit && digit <= '7' =>
      octalEscape()
    case simple =>
      simple match
        case 'b'  => putChar('\b')
        case 't'  => putChar('\t')
        case 'n'  => putChar('\n')
        case 'f'  => putChar('\f')
        case 'r'  => putChar('\r')
        case '\"' => putChar('\"')
        case '\'' => putChar('\'')
        case '\\' => putChar('\\')
        case _    => invalidEscape()
      true
  if stillNeedsAdvance then nextChar()
end charEscape
1303
+
1304
/** Reads a unicode escape: any run of `u`/`U` followed by four hex digits, and puts
 *  the resulting character. If a non-hex character is met, an error is reported at
 *  `charOffset - 1` and that character is put instead.
 *
 *  @return always false, i.e. the caller must not advance any further
 */
private def uEscape(): Boolean =
  while ch == 'u' || ch == 'U' do nextChar()
  var digitsRead = 0
  var codeUnit = 0
  var malformed = false
  while digitsRead < 4 && !malformed do
    val nibble = digit2int(ch, 16)
    if nibble < 0 then
      malformed = true
    else
      // Most significant hex digit first.
      codeUnit += nibble << ((3 - digitsRead) * 4)
      nextChar()
      digitsRead += 1
  if malformed then
    error("invalid character in unicode escape sequence", charOffset - 1)
    putChar(ch)
  else
    putChar(codeUnit.toChar)
  false
end uEscape
1322
+
1323
/** Reads an octal escape of up to three digits (a third digit only when the first
 *  is at most '3'), reports it as unsupported together with a suggested
 *  replacement, and still puts the denoted character.
 *
 *  @return always false, i.e. the caller must not advance any further
 */
private def octalEscape(): Boolean =
  def atOctalDigit: Boolean = '0' <= ch && ch <= '7'
  val escapeStart = charOffset - 2
  val firstDigit: Char = ch
  var value: Int = 0
  // Fold the current digit into `value` and step to the next character.
  def absorbDigit(): Unit =
    value = value * 8 + digit2int(ch, 8)
    nextChar()

  absorbDigit()
  if atOctalDigit then
    absorbDigit()
    if firstDigit <= '3' && atOctalDigit then absorbDigit()

  val suggestion = if value == LF then raw"\n" else f"\u$value%04x"
  error(s"octal escape literals are unsupported: use $suggestion instead", escapeStart)
  putChar(value.toChar)
  false
end octalEscape
1340
+
1341
/** Reports an invalid escape character at `charOffset - 1` and keeps the offending
 *  character by putting it to the buffer unchanged.
 */
protected def invalidEscape(): Unit =
  val reportAt = charOffset - 1
  error("invalid escape character", reportAt)
  putChar(ch)
1286
1344
1287
1345
private def getLitChars (delimiter : Char ) =
1288
1346
while (ch != delimiter && ! isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1365,25 +1423,22 @@ object Scanners {
1365
1423
setStrVal()
1366
1424
}
1367
1425
1368
/** Completes a character literal: steps past the current character, marks the token
 *  as `CHARLIT`, and then calls `setStrVal()`.
 */
private def finishCharLit(): Unit = {
  nextChar()
  token = CHARLIT
  setStrVal()
}
1373
1430
1374
1431
/** Puts the current character and advances; if a closing `'` follows, finishes a
 *  character literal, otherwise produces the token computed by `op`.
 *
 *  @param op the token to use when no closing quote follows; evaluated only in
 *            that case
 */
def charLitOr(op: => Token): Unit =
  putChar(ch)
  nextChar()
  ch match
    case '\'' =>
      finishCharLit()
    case _ =>
      token = op
      strVal = if name == null then null else name.toString
      litBuf.clear()
1387
1442
1388
1443
override def toString : String =
1389
1444
showTokenDetailed(token) + {
0 commit comments