Skip to content

Commit 87e8373

Browse files
committed
Accept supplementary characters
1 parent 0857285 commit 87e8373

File tree

8 files changed

+201
-104
lines changed

8 files changed

+201
-104
lines changed

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 143 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ package parsing
55
import core.Names._, core.Contexts._, core.Decorators._, util.Spans._
66
import core.StdNames._, core.Comments._
77
import util.SourceFile
8-
import java.lang.Character.isDigit
98
import util.Chars._
109
import util.{SourcePosition, CharBuffer}
1110
import util.Spans.Span
@@ -705,6 +704,44 @@ object Scanners {
705704
recur(lastOffset, false)
706705
}
707706

707+
import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
708+
709+
// given char (ch) is high surrogate followed by low, codepoint passes predicate.
710+
// true means supplementary chars were put to buffer.
711+
// strict to require low surrogate (if not in string literal).
712+
/** Tries to consume a surrogate pair beginning at `high`.
 *
 *  Nothing is consumed when `high` is not a high surrogate. Otherwise the
 *  scanner advances past `high`, and past the following char as well when it
 *  is a low surrogate.
 *
 *  @param high   the char to examine; callers pass the current `ch`
 *  @param test   predicate applied to the combined code point
 *  @param strict when false, a high surrogate with no low surrogate after it
 *                is buffered on its own instead of being reported as an error
 *  @return true if one or two chars were appended to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  if !isHighSurrogate(high) then false
  else
    nextChar()
    val low = ch
    if isLowSurrogate(low) then
      nextChar()
      val codepoint = toCodePoint(high, low)
      val accepted = isValidCodePoint(codepoint) && test(codepoint)
      if accepted then
        putChar(high)
        putChar(low)
      else
        error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
      accepted
    else if strict then
      error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
      false
    else
      // lenient mode: keep the orphan high surrogate as-is
      putChar(high)
      true
735+
/** Checks, without consuming input, whether `ch` and the lookahead char
 *  form a surrogate pair whose code point is valid and satisfies `f`.
 *
 *  @param ch the candidate high surrogate
 *  @param f  predicate applied to the combined code point
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val trail = lookaheadChar()
    if !isLowSurrogate(trail) then false
    else
      val cp = toCodePoint(ch, trail)
      isValidCodePoint(cp) && f(cp)
744+
708745
/** read next token, filling TokenData fields of Scanner.
709746
*/
710747
protected final def fetchToken(): Unit = {
@@ -831,11 +868,12 @@ object Scanners {
831868
else ch match {
832869
case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
833870
token = QUOTE
834-
case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
871+
case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
835872
val isEmptyCharLit = (ch == '\'')
836873
getLitChar()
837874
if ch == '\'' then
838875
if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
876+
else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(c => f"\u${c.toInt}%04x").mkString("'", "", "'")) // FIXME format
839877
else finishCharLit()
840878
else if isEmptyCharLit then error("empty character literal")
841879
else error("unclosed character literal")
@@ -878,9 +916,11 @@ object Scanners {
878916
def fetchOther() =
879917
if (ch == '\u21D2') {
880918
nextChar(); token = ARROW
919+
report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
881920
}
882921
else if (ch == '\u2190') {
883922
nextChar(); token = LARROW
923+
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
884924
}
885925
else if (Character.isUnicodeIdentifierStart(ch)) {
886926
putChar(ch)
@@ -892,9 +932,12 @@ object Scanners {
892932
nextChar()
893933
getOperatorRest()
894934
}
935+
else if isSupplementary(ch, isUnicodeIdentifierStart) then
936+
getIdentRest()
895937
else {
896-
// FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
897-
error("illegal character '\\u%04x'".format(ch: Int))
938+
// FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
939+
//error(f"illegal character '\\u$ch%04x'")
940+
error(f"illegal character '\u${ch.toInt}%04x'")
898941
nextChar()
899942
}
900943
fetchOther()
@@ -1033,11 +1076,12 @@ object Scanners {
10331076
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
10341077
finishNamed()
10351078
case _ =>
1036-
if (Character.isUnicodeIdentifierPart(ch)) {
1079+
if isUnicodeIdentifierPart(ch) then
10371080
putChar(ch)
10381081
nextChar()
10391082
getIdentRest()
1040-
}
1083+
else if isSupplementary(ch, isUnicodeIdentifierPart) then
1084+
getIdentRest()
10411085
else
10421086
finishNamed()
10431087
}
@@ -1120,7 +1164,7 @@ object Scanners {
11201164
}
11211165

11221166
// for interpolated strings
1123-
@annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
1167+
@tailrec private def getStringPart(multiLine: Boolean): Unit =
11241168
if (ch == '"')
11251169
if (multiLine) {
11261170
nextRawChar()
@@ -1145,6 +1189,28 @@ object Scanners {
11451189
getStringPart(multiLine)
11461190
}
11471191
else if (ch == '$') {
1192+
/** Ends the current string part and scans the identifier after `$` into `next`.
 *
 *  @param hasSupplement true when the identifier's first character is a
 *                       surrogate pair, so two chars are taken for it
 */
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // Append `units` chars to the buffer, advancing with the raw reader.
  def consume(units: Int): Unit =
    var remaining = units
    while remaining > 0 do
      putChar(ch)
      nextRawChar()
      remaining -= 1

  // Number of chars forming an identifier part at the cursor: 1, 2 (surrogate pair), or 0.
  def identPartWidth: Int =
    if ch != SU && isUnicodeIdentifierPart(ch) then 1
    else if atSupplementary(ch, isUnicodeIdentifierPart) then 2
    else 0

  setStrVal()
  token = STRINGPART
  next.lastOffset = charOffset - 1
  next.offset = charOffset - 1
  consume(if hasSupplement then 2 else 1)
  var width = identPartWidth
  while width > 0 do
    consume(width)
    width = identPartWidth
  finishNamedToken(IDENTIFIER, target = next)
end getInterpolatedIdentRest
1213+
11481214
nextRawChar()
11491215
if (ch == '$' || ch == '"') {
11501216
putChar(ch)
@@ -1155,18 +1221,10 @@ object Scanners {
11551221
setStrVal()
11561222
token = STRINGPART
11571223
}
1158-
else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
1159-
setStrVal()
1160-
token = STRINGPART
1161-
next.lastOffset = charOffset - 1
1162-
next.offset = charOffset - 1
1163-
while
1164-
putChar(ch)
1165-
nextRawChar()
1166-
ch != SU && Character.isUnicodeIdentifierPart(ch)
1167-
do ()
1168-
finishNamedToken(IDENTIFIER, target = next)
1169-
}
1224+
else if isUnicodeIdentifierStart(ch) || ch == '_' then
1225+
getInterpolatedIdentRest(hasSupplement = false)
1226+
else if atSupplementary(ch, isUnicodeIdentifierStart) then
1227+
getInterpolatedIdentRest(hasSupplement = true)
11701228
else
11711229
error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected")
11721230
}
@@ -1212,76 +1270,76 @@ object Scanners {
12121270
false
12131271
}
12141272

1215-
/** copy current character into litBuf, interpreting any escape sequences,
1216-
* and advance to next character.
1273+
/** Copy current character into litBuf, interpreting any escape sequences,
1274+
* and advance to next character. Surrogate pairs are consumed (see check
1275+
* at fetchSingleQuote), but orphan surrogate is allowed.
12171276
*/
12181277
protected def getLitChar(): Unit =
1219-
def invalidUnicodeEscape() = {
1220-
error("invalid character in unicode escape sequence", charOffset - 1)
1221-
putChar(ch)
1222-
}
1223-
def putUnicode(): Unit = {
1224-
while ch == 'u' || ch == 'U' do nextChar()
1225-
var i = 0
1226-
var cp = 0
1227-
while (i < 4) {
1228-
val shift = (3 - i) * 4
1229-
val d = digit2int(ch, 16)
1230-
if(d < 0) {
1231-
return invalidUnicodeEscape()
1232-
}
1233-
cp += (d << shift)
1234-
nextChar()
1235-
i += 1
1236-
}
1237-
putChar(cp.asInstanceOf[Char])
1238-
}
1239-
if (ch == '\\') {
1278+
if ch == '\\' then
12401279
nextChar()
1241-
if ('0' <= ch && ch <= '7') {
1242-
val start = charOffset - 2
1243-
val leadch: Char = ch
1244-
var oct: Int = digit2int(ch, 8)
1245-
nextChar()
1246-
if ('0' <= ch && ch <= '7') {
1247-
oct = oct * 8 + digit2int(ch, 8)
1248-
nextChar()
1249-
if (leadch <= '3' && '0' <= ch && ch <= '7') {
1250-
oct = oct * 8 + digit2int(ch, 8)
1251-
nextChar()
1252-
}
1253-
}
1254-
val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
1255-
error(s"octal escape literals are unsupported: use $alt instead", start)
1256-
putChar(oct.toChar)
1257-
}
1258-
else if (ch == 'u' || ch == 'U') {
1259-
putUnicode()
1260-
}
1261-
else {
1262-
ch match {
1263-
case 'b' => putChar('\b')
1264-
case 't' => putChar('\t')
1265-
case 'n' => putChar('\n')
1266-
case 'f' => putChar('\f')
1267-
case 'r' => putChar('\r')
1268-
case '\"' => putChar('\"')
1269-
case '\'' => putChar('\'')
1270-
case '\\' => putChar('\\')
1271-
case _ => invalidEscape()
1272-
}
1273-
nextChar()
1274-
}
1275-
}
1276-
else {
1280+
charEscape()
1281+
else if !isSupplementary(ch, _ => true, strict = false) then
12771282
putChar(ch)
12781283
nextChar()
1279-
}
12801284

1281-
protected def invalidEscape(): Unit = {
1285+
/** Handles the character following a backslash in a literal.
 *
 *  Single-character escapes are buffered here and the scanner then steps
 *  past the escape letter. Unicode and octal escapes are handed to
 *  `uEscape` / `octalEscape`, whose result decides whether that extra step
 *  is taken.
 */
private def charEscape(): Unit =
  val advance = ch match
    case 'u' | 'U' => uEscape()
    case digit if '0' <= digit && digit <= '7' => octalEscape()
    case simple =>
      simple match
        case 'b'  => putChar('\b')
        case 't'  => putChar('\t')
        case 'n'  => putChar('\n')
        case 'f'  => putChar('\f')
        case 'r'  => putChar('\r')
        case '\"' => putChar('\"')
        case '\'' => putChar('\'')
        case '\\' => putChar('\\')
        case _    => invalidEscape()
      true
  if advance then nextChar()
end charEscape
1302+
1303+
/** Reads a unicode escape: any run of `u`/`U`, then four hex digits.
 *
 *  On a non-hex char an error is reported, that char is buffered, and
 *  scanning of the escape stops without advancing past it.
 *
 *  @return always false, so the caller does not advance again
 */
private def uEscape(): Boolean =
  while ch == 'u' || ch == 'U' do nextChar()

  @tailrec def readHex(remaining: Int, acc: Int): Unit =
    if remaining == 0 then putChar(acc.toChar)
    else
      val digit = digit2int(ch, 16)
      if digit < 0 then
        error("invalid character in unicode escape sequence", charOffset - 1)
        putChar(ch)
      else
        nextChar()
        readHex(remaining - 1, (acc << 4) | digit)

  readHex(remaining = 4, acc = 0)
  false
end uEscape
1321+
1322+
/** Consumes an octal escape of one to three digits, reports it as
 *  unsupported, and buffers the char it denotes.
 *
 *  A third digit is only taken when the leading digit is at most '3'.
 *
 *  @return always false, so the caller does not advance again
 */
private def octalEscape(): Boolean =
  def atOctalDigit: Boolean = '0' <= ch && ch <= '7'
  // Value of the octal digit at the cursor; steps past it.
  def takeDigit(): Int =
    val d = digit2int(ch, 8)
    nextChar()
    d

  val start = charOffset - 2
  val lead: Char = ch
  val first = takeDigit()
  val oct: Int =
    if !atOctalDigit then first
    else
      val firstTwo = first * 8 + takeDigit()
      if lead <= '3' && atOctalDigit then firstTwo * 8 + takeDigit()
      else firstTwo
  val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(oct.toChar)
  false
end octalEscape
1339+
1340+
protected def invalidEscape(): Unit =
12821341
error("invalid escape character", charOffset - 1)
12831342
putChar(ch)
1284-
}
12851343

12861344
private def getLitChars(delimiter: Char) =
12871345
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1364,25 +1422,22 @@ object Scanners {
13641422
setStrVal()
13651423
}
13661424

1367-
private def finishCharLit(): Unit = {
1425+
private def finishCharLit(): Unit =
13681426
nextChar()
13691427
token = CHARLIT
13701428
setStrVal()
1371-
}
13721429

13731430
/** Parse character literal if current character is followed by \',
13741431
* or follow with given op and return a symbol literal token
13751432
*/
1376-
def charLitOr(op: => Token): Unit = {
1433+
def charLitOr(op: => Token): Unit =
13771434
putChar(ch)
13781435
nextChar()
1379-
if (ch == '\'') finishCharLit()
1380-
else {
1436+
if ch == '\'' then finishCharLit()
1437+
else
13811438
token = op
13821439
strVal = if (name != null) name.toString else null
13831440
litBuf.clear()
1384-
}
1385-
}
13861441

13871442
override def toString: String =
13881443
showTokenDetailed(token) + {

compiler/src/dotty/tools/dotc/transform/Pickler.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,14 @@ class Pickler extends Phase {
137137
}
138138

139139
private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
140-
if (previous != unpickled) {
140+
import java.nio.charset.StandardCharsets.UTF_8
141+
def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
142+
val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
143+
if unequal then
141144
output("before-pickling.txt", previous)
142145
output("after-pickling.txt", unpickled)
143146
report.error(s"""pickling difference for $cls in ${cls.source}, for details:
144147
|
145148
| diff before-pickling.txt after-pickling.txt""".stripMargin)
146-
}
149+
end testSame
147150
}

scaladoc/src/dotty/tools/scaladoc/util/JSON.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def jsonString(s: String): JSON =
3131

3232
sb.append('"')
3333
firstToBeEncoded() match
34-
case -1 ⇒ sb.append(s)
34+
case -1 => sb.append(s)
3535
case first =>
3636
// sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather then "ab" as in Java
3737
sb.append(s.substring(0, first))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
val f = (x: Int) ⇒ x + 1 // error
3+
4+
val list = for (n ← List(42)) yield n + 1 // error

tests/neg/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
class C {
3+
def `too wide for Char` = '𐐀' // error
4+
}

0 commit comments

Comments
 (0)