Skip to content

Commit 7dd2a52

Browse files
committed
Accept supplementary characters
1 parent ec15557 commit 7dd2a52

File tree

8 files changed

+201
-104
lines changed

8 files changed

+201
-104
lines changed

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 143 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ package parsing
55
import core.Names._, core.Contexts._, core.Decorators._, util.Spans._
66
import core.StdNames._, core.Comments._
77
import util.SourceFile
8-
import java.lang.Character.isDigit
98
import util.Chars._
109
import util.{SourcePosition, CharBuffer}
1110
import util.Spans.Span
@@ -706,6 +705,44 @@ object Scanners {
706705
recur(lastOffset, false)
707706
}
708707

708+
import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
709+
710+
// Tries to consume a surrogate pair whose first half is `high`; returns true only if chars were put to the buffer.
711+
// If a low surrogate follows, both halves are buffered when the code point is valid and passes `test`; otherwise an "illegal character" error is reported.
712+
// If no low surrogate follows: with `strict` an error is reported, otherwise (string literal case) the lone high surrogate is buffered and true is returned.
713+
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
714+
isHighSurrogate(high) && {
715+
var res = false
716+
// Advance unconditionally once `high` is a high surrogate, so the input has moved even when false is returned.
nextChar()
717+
val low = ch
718+
if isLowSurrogate(low) then
719+
nextChar()
720+
val codepoint = toCodePoint(high, low)
721+
if isValidCodePoint(codepoint) && test(codepoint) then
722+
putChar(high)
723+
putChar(low)
724+
res = true
725+
else
726+
error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
727+
//error(f"illegal character '\\u$high%04x\\u$low%04x'")
728+
else if !strict then
729+
putChar(high)
730+
res = true
731+
else
732+
error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
733+
//error(f"illegal character '\\u$high%04x' missing low surrogate")
734+
res
735+
}
736+
// Test only: true if `ch` is a high surrogate, the lookahead char is a low surrogate,
// and the code point they form is valid and satisfies `f`.
// Puts nothing to the buffer and does not call nextChar(); presumably lookaheadChar() leaves the input position unchanged -- confirm.
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
737+
isHighSurrogate(ch) && {
738+
val hi = ch
739+
val lo = lookaheadChar()
740+
isLowSurrogate(lo) && {
741+
val codepoint = toCodePoint(hi, lo)
742+
isValidCodePoint(codepoint) && f(codepoint)
743+
}
744+
}
745+
709746
/** read next token, filling TokenData fields of Scanner.
710747
*/
711748
protected final def fetchToken(): Unit = {
@@ -832,11 +869,12 @@ object Scanners {
832869
else ch match {
833870
case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
834871
token = QUOTE
835-
case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
872+
case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
836873
val isEmptyCharLit = (ch == '\'')
837874
getLitChar()
838875
if ch == '\'' then
839876
if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
877+
else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(c => f"\u${c.toInt}%04x").mkString("'", "", "'")) // FIXME format
840878
else finishCharLit()
841879
else if isEmptyCharLit then error("empty character literal")
842880
else error("unclosed character literal")
@@ -879,9 +917,11 @@ object Scanners {
879917
def fetchOther() =
880918
if (ch == '\u21D2') {
881919
nextChar(); token = ARROW
920+
report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
882921
}
883922
else if (ch == '\u2190') {
884923
nextChar(); token = LARROW
924+
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
885925
}
886926
else if (Character.isUnicodeIdentifierStart(ch)) {
887927
putChar(ch)
@@ -893,9 +933,12 @@ object Scanners {
893933
nextChar()
894934
getOperatorRest()
895935
}
936+
else if isSupplementary(ch, isUnicodeIdentifierStart) then
937+
getIdentRest()
896938
else {
897-
// FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
898-
error("illegal character '\\u%04x'".format(ch: Int))
939+
// FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
940+
//error(f"illegal character '\\u$ch%04x'")
941+
error(f"illegal character '\u${ch.toInt}%04x'")
899942
nextChar()
900943
}
901944
fetchOther()
@@ -1034,11 +1077,12 @@ object Scanners {
10341077
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
10351078
finishNamed()
10361079
case _ =>
1037-
if (Character.isUnicodeIdentifierPart(ch)) {
1080+
if isUnicodeIdentifierPart(ch) then
10381081
putChar(ch)
10391082
nextChar()
10401083
getIdentRest()
1041-
}
1084+
else if isSupplementary(ch, isUnicodeIdentifierPart) then
1085+
getIdentRest()
10421086
else
10431087
finishNamed()
10441088
}
@@ -1121,7 +1165,7 @@ object Scanners {
11211165
}
11221166

11231167
// for interpolated strings
1124-
@annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
1168+
@tailrec private def getStringPart(multiLine: Boolean): Unit =
11251169
if (ch == '"')
11261170
if (multiLine) {
11271171
nextRawChar()
@@ -1146,6 +1190,28 @@ object Scanners {
11461190
getStringPart(multiLine)
11471191
}
11481192
else if (ch == '$') {
1193+
// Closes the string part read so far as a STRINGPART token, then scans the identifier after `$` into `next`.
// `hasSupplement` is true when the identifier starts with a surrogate pair, in which case two chars are copied before the rest.
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
1194+
// Copies identifier-part chars (a supplementary code point as two chars) until neither test matches, then finishes the name in `next`.
@tailrec def loopRest(): Unit =
1195+
if ch != SU && isUnicodeIdentifierPart(ch) then
1196+
putChar(ch) ; nextRawChar()
1197+
loopRest()
1198+
else if atSupplementary(ch, isUnicodeIdentifierPart) then
1199+
putChar(ch) ; nextRawChar()
1200+
putChar(ch) ; nextRawChar()
1201+
loopRest()
1202+
else
1203+
finishNamed(target = next)
1204+
end loopRest
1205+
setStrVal()
1206+
token = STRINGPART
1207+
next.lastOffset = charOffset - 1
1208+
next.offset = charOffset - 1
1209+
// The caller has already checked the identifier start, so the first char (or pair) is copied without re-testing.
putChar(ch) ; nextRawChar()
1210+
if hasSupplement then
1211+
putChar(ch) ; nextRawChar()
1212+
loopRest()
1213+
end getInterpolatedIdentRest
1214+
11491215
nextRawChar()
11501216
if (ch == '$' || ch == '"') {
11511217
putChar(ch)
@@ -1156,18 +1222,10 @@ object Scanners {
11561222
setStrVal()
11571223
token = STRINGPART
11581224
}
1159-
else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
1160-
setStrVal()
1161-
token = STRINGPART
1162-
next.lastOffset = charOffset - 1
1163-
next.offset = charOffset - 1
1164-
while
1165-
putChar(ch)
1166-
nextRawChar()
1167-
ch != SU && Character.isUnicodeIdentifierPart(ch)
1168-
do ()
1169-
finishNamed(target = next)
1170-
}
1225+
else if isUnicodeIdentifierStart(ch) || ch == '_' then
1226+
getInterpolatedIdentRest(hasSupplement = false)
1227+
else if atSupplementary(ch, isUnicodeIdentifierStart) then
1228+
getInterpolatedIdentRest(hasSupplement = true)
11711229
else
11721230
error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected")
11731231
}
@@ -1213,76 +1271,76 @@ object Scanners {
12131271
false
12141272
}
12151273

1216-
/** copy current character into litBuf, interpreting any escape sequences,
1217-
* and advance to next character.
1274+
/** Copy current character into cbuf, interpreting any escape sequences,
1275+
* and advance to next character. Surrogate pairs are consumed (see check
1276+
* at fetchSingleQuote), but orphan surrogate is allowed.
12181277
*/
12191278
protected def getLitChar(): Unit =
1220-
def invalidUnicodeEscape() = {
1221-
error("invalid character in unicode escape sequence", charOffset - 1)
1222-
putChar(ch)
1223-
}
1224-
def putUnicode(): Unit = {
1225-
while ch == 'u' || ch == 'U' do nextChar()
1226-
var i = 0
1227-
var cp = 0
1228-
while (i < 4) {
1229-
val shift = (3 - i) * 4
1230-
val d = digit2int(ch, 16)
1231-
if(d < 0) {
1232-
return invalidUnicodeEscape()
1233-
}
1234-
cp += (d << shift)
1235-
nextChar()
1236-
i += 1
1237-
}
1238-
putChar(cp.asInstanceOf[Char])
1239-
}
1240-
if (ch == '\\') {
1279+
if ch == '\\' then
12411280
nextChar()
1242-
if ('0' <= ch && ch <= '7') {
1243-
val start = charOffset - 2
1244-
val leadch: Char = ch
1245-
var oct: Int = digit2int(ch, 8)
1246-
nextChar()
1247-
if ('0' <= ch && ch <= '7') {
1248-
oct = oct * 8 + digit2int(ch, 8)
1249-
nextChar()
1250-
if (leadch <= '3' && '0' <= ch && ch <= '7') {
1251-
oct = oct * 8 + digit2int(ch, 8)
1252-
nextChar()
1253-
}
1254-
}
1255-
val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
1256-
error(s"octal escape literals are unsupported: use $alt instead", start)
1257-
putChar(oct.toChar)
1258-
}
1259-
else if (ch == 'u' || ch == 'U') {
1260-
putUnicode()
1261-
}
1262-
else {
1263-
ch match {
1264-
case 'b' => putChar('\b')
1265-
case 't' => putChar('\t')
1266-
case 'n' => putChar('\n')
1267-
case 'f' => putChar('\f')
1268-
case 'r' => putChar('\r')
1269-
case '\"' => putChar('\"')
1270-
case '\'' => putChar('\'')
1271-
case '\\' => putChar('\\')
1272-
case _ => invalidEscape()
1273-
}
1274-
nextChar()
1275-
}
1276-
}
1277-
else {
1281+
charEscape()
1282+
else if !isSupplementary(ch, _ => true, strict = false) then
12781283
putChar(ch)
12791284
nextChar()
1280-
}
12811285

1282-
protected def invalidEscape(): Unit = {
1286+
// Interprets the escape whose selector char is the current `ch` (getLitChar calls this after consuming the backslash)
// and puts the resulting char to the buffer.
private def charEscape(): Unit =
1287+
// Whether the selector char still has to be consumed here; uEscape and octalEscape advance on their own and return false.
var bump = true
1288+
ch match
1289+
case 'b' => putChar('\b')
1290+
case 't' => putChar('\t')
1291+
case 'n' => putChar('\n')
1292+
case 'f' => putChar('\f')
1293+
case 'r' => putChar('\r')
1294+
case '\"' => putChar('\"')
1295+
case '\'' => putChar('\'')
1296+
case '\\' => putChar('\\')
1297+
case 'u' |
1298+
'U' => bump = uEscape()
1299+
case x if '0' <= x && x <= '7' => bump = octalEscape()
1300+
case _ => invalidEscape()
1301+
if bump then nextChar()
1302+
end charEscape
1303+
1304+
// Reads a unicode escape: skips any run of 'u'/'U', then expects exactly four hex digits and puts the resulting Char to the buffer.
// On a non-hex char an error is reported, that char is put to the buffer without being consumed, and reading stops.
// Always returns false; charEscape uses the result to decide whether it should still call nextChar().
private def uEscape(): Boolean =
1305+
while ch == 'u' || ch == 'U' do nextChar()
1306+
var i = 0
1307+
var cp = 0
1308+
while i < 4 do
1309+
val digit = digit2int(ch, 16)
1310+
if digit < 0 then
1311+
error("invalid character in unicode escape sequence", charOffset - 1)
1312+
putChar(ch)
1313+
return false
1314+
// Most significant hex digit first: shifts are 12, 8, 4, 0.
val shift = (3 - i) * 4
1315+
cp += digit << shift
1316+
nextChar()
1317+
i += 1
1318+
end while
1319+
putChar(cp.asInstanceOf[Char])
1320+
false
1321+
end uEscape
1322+
1323+
// Reads an octal escape of up to three digits, with a third digit taken only when the first is <= '3' (so the value stays <= 255).
// Octal escapes are always rejected: an error suggesting a replacement is reported at `start`, but the decoded char is still put to the buffer.
// Always returns false; charEscape uses the result to decide whether it should still call nextChar().
private def octalEscape(): Boolean =
1324+
val start = charOffset - 2
1325+
val leadch: Char = ch
1326+
var oct: Int = digit2int(ch, 8)
1327+
nextChar()
1328+
if '0' <= ch && ch <= '7' then
1329+
oct = oct * 8 + digit2int(ch, 8)
1330+
nextChar()
1331+
if leadch <= '3' && '0' <= ch && ch <= '7' then
1332+
oct = oct * 8 + digit2int(ch, 8)
1333+
nextChar()
1334+
//val alt = if (oct == LF) "\\n" else f"\\u$oct%04x"
1335+
val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
1336+
error(s"octal escape literals are unsupported: use $alt instead", start)
1337+
putChar(oct.toChar)
1338+
false
1339+
end octalEscape
1340+
1341+
protected def invalidEscape(): Unit =
12831342
error("invalid escape character", charOffset - 1)
12841343
putChar(ch)
1285-
}
12861344

12871345
private def getLitChars(delimiter: Char) =
12881346
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1365,25 +1423,22 @@ object Scanners {
13651423
setStrVal()
13661424
}
13671425

1368-
private def finishCharLit(): Unit = {
1426+
private def finishCharLit(): Unit =
13691427
nextChar()
13701428
token = CHARLIT
13711429
setStrVal()
1372-
}
13731430

13741431
/** Parse character literal if current character is followed by \',
13751432
* or follow with given op and return a symbol literal token
13761433
*/
1377-
def charLitOr(op: => Token): Unit = {
1434+
def charLitOr(op: => Token): Unit =
13781435
putChar(ch)
13791436
nextChar()
1380-
if (ch == '\'') finishCharLit()
1381-
else {
1437+
if ch == '\'' then finishCharLit()
1438+
else
13821439
token = op
13831440
strVal = if (name != null) name.toString else null
13841441
litBuf.clear()
1385-
}
1386-
}
13871442

13881443
override def toString: String =
13891444
showTokenDetailed(token) + {

compiler/src/dotty/tools/dotc/transform/Pickler.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,14 @@ class Pickler extends Phase {
137137
}
138138

139139
private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
140-
if (previous != unpickled) {
140+
import java.nio.charset.StandardCharsets.UTF_8
141+
def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
142+
val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
143+
if unequal then
141144
output("before-pickling.txt", previous)
142145
output("after-pickling.txt", unpickled)
143146
report.error(s"""pickling difference for $cls in ${cls.source}, for details:
144147
|
145148
| diff before-pickling.txt after-pickling.txt""".stripMargin)
146-
}
149+
end testSame
147150
}

scaladoc/src/dotty/tools/scaladoc/util/JSON.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def jsonString(s: String): JSON =
3131

3232
sb.append('"')
3333
firstToBeEncoded() match
34-
case -1 ⇒ sb.append(s)
34+
case -1 => sb.append(s)
3535
case first =>
3636
// sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather then "ab" as in Java
3737
sb.append(s.substring(0, first))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
val f = (x: Int) ⇒ x + 1 // error
3+
4+
val list = for (n ← List(42)) yield n + 1 // error

tests/neg/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
class C {
3+
def `too wide for Char` = '𐐀' // error
4+
}

0 commit comments

Comments
 (0)