Skip to content

Commit d2ebd75

Browse files
authored
Merge pull request #13136 from som-snytt/forward/supplementary
2 parents 5626f25 + cf29787 commit d2ebd75

File tree

8 files changed

+193
-96
lines changed

8 files changed

+193
-96
lines changed

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 135 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,45 @@ object Scanners {
696696
recur(lastOffset, false)
697697
}
698698

699+
import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
700+
701+
// f"\\u$c%04x" or f"${"\\"}u$c%04x"
702+
private def toUnicode(c: Char): String = { val s = c.toInt.toHexString; "\\u" + "0" * (4 - s.length) + s }
703+
704+
// given char (ch) is high surrogate followed by low, codepoint passes predicate.
705+
// true means supplementary chars were put to buffer.
706+
// strict to require low surrogate (if not in string literal).
707+
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
708+
isHighSurrogate(high) && {
709+
var res = false
710+
nextChar()
711+
val low = ch
712+
if isLowSurrogate(low) then
713+
nextChar()
714+
val codepoint = toCodePoint(high, low)
715+
if isValidCodePoint(codepoint) && test(codepoint) then
716+
putChar(high)
717+
putChar(low)
718+
res = true
719+
else
720+
error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'")
721+
else if !strict then
722+
putChar(high)
723+
res = true
724+
else
725+
error(s"illegal character '${toUnicode(high)}' missing low surrogate")
726+
res
727+
}
728+
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
729+
isHighSurrogate(ch) && {
730+
val hi = ch
731+
val lo = lookaheadChar()
732+
isLowSurrogate(lo) && {
733+
val codepoint = toCodePoint(hi, lo)
734+
isValidCodePoint(codepoint) && f(codepoint)
735+
}
736+
}
737+
699738
/** read next token, filling TokenData fields of Scanner.
700739
*/
701740
protected final def fetchToken(): Unit = {
@@ -822,11 +861,12 @@ object Scanners {
822861
else ch match {
823862
case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
824863
token = QUOTE
825-
case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
864+
case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
826865
val isEmptyCharLit = (ch == '\'')
827866
getLitChar()
828867
if ch == '\'' then
829868
if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
869+
else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(toUnicode).mkString("'", "", "'"))
830870
else finishCharLit()
831871
else if isEmptyCharLit then error("empty character literal")
832872
else error("unclosed character literal")
@@ -869,9 +909,11 @@ object Scanners {
869909
def fetchOther() =
870910
if (ch == '\u21D2') {
871911
nextChar(); token = ARROW
912+
report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
872913
}
873914
else if (ch == '\u2190') {
874915
nextChar(); token = LARROW
916+
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
875917
}
876918
else if (Character.isUnicodeIdentifierStart(ch)) {
877919
putChar(ch)
@@ -883,9 +925,10 @@ object Scanners {
883925
nextChar()
884926
getOperatorRest()
885927
}
928+
else if isSupplementary(ch, isUnicodeIdentifierStart) then
929+
getIdentRest()
886930
else {
887-
// FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
888-
error("illegal character '\\u%04x'".format(ch: Int))
931+
error(s"illegal character '${toUnicode(ch)}'")
889932
nextChar()
890933
}
891934
fetchOther()
@@ -1024,11 +1067,12 @@ object Scanners {
10241067
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
10251068
finishNamed()
10261069
case _ =>
1027-
if (Character.isUnicodeIdentifierPart(ch)) {
1070+
if isUnicodeIdentifierPart(ch) then
10281071
putChar(ch)
10291072
nextChar()
10301073
getIdentRest()
1031-
}
1074+
else if isSupplementary(ch, isUnicodeIdentifierPart) then
1075+
getIdentRest()
10321076
else
10331077
finishNamed()
10341078
}
@@ -1111,7 +1155,7 @@ object Scanners {
11111155
}
11121156

11131157
// for interpolated strings
1114-
@annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
1158+
@tailrec private def getStringPart(multiLine: Boolean): Unit =
11151159
if (ch == '"')
11161160
if (multiLine) {
11171161
nextRawChar()
@@ -1136,6 +1180,28 @@ object Scanners {
11361180
getStringPart(multiLine)
11371181
}
11381182
else if (ch == '$') {
1183+
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
1184+
@tailrec def loopRest(): Unit =
1185+
if ch != SU && isUnicodeIdentifierPart(ch) then
1186+
putChar(ch) ; nextRawChar()
1187+
loopRest()
1188+
else if atSupplementary(ch, isUnicodeIdentifierPart) then
1189+
putChar(ch) ; nextRawChar()
1190+
putChar(ch) ; nextRawChar()
1191+
loopRest()
1192+
else
1193+
finishNamedToken(IDENTIFIER, target = next)
1194+
end loopRest
1195+
setStrVal()
1196+
token = STRINGPART
1197+
next.lastOffset = charOffset - 1
1198+
next.offset = charOffset - 1
1199+
putChar(ch) ; nextRawChar()
1200+
if hasSupplement then
1201+
putChar(ch) ; nextRawChar()
1202+
loopRest()
1203+
end getInterpolatedIdentRest
1204+
11391205
nextRawChar()
11401206
if (ch == '$' || ch == '"') {
11411207
putChar(ch)
@@ -1146,18 +1212,10 @@ object Scanners {
11461212
setStrVal()
11471213
token = STRINGPART
11481214
}
1149-
else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
1150-
setStrVal()
1151-
token = STRINGPART
1152-
next.lastOffset = charOffset - 1
1153-
next.offset = charOffset - 1
1154-
while
1155-
putChar(ch)
1156-
nextRawChar()
1157-
ch != SU && Character.isUnicodeIdentifierPart(ch)
1158-
do ()
1159-
finishNamedToken(IDENTIFIER, target = next)
1160-
}
1215+
else if isUnicodeIdentifierStart(ch) || ch == '_' then
1216+
getInterpolatedIdentRest(hasSupplement = false)
1217+
else if atSupplementary(ch, isUnicodeIdentifierStart) then
1218+
getInterpolatedIdentRest(hasSupplement = true)
11611219
else
11621220
error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected", off = charOffset - 2)
11631221
putChar('$')
@@ -1205,76 +1263,73 @@ object Scanners {
12051263
false
12061264
}
12071265

1208-
/** copy current character into litBuf, interpreting any escape sequences,
1209-
* and advance to next character.
1266+
/** Copy current character into cbuf, interpreting any escape sequences,
1267+
* and advance to next character. Surrogate pairs are consumed (see check
1268+
* at fetchSingleQuote), but orphan surrogate is allowed.
12101269
*/
12111270
protected def getLitChar(): Unit =
1212-
def invalidUnicodeEscape() = {
1213-
error("invalid character in unicode escape sequence", charOffset - 1)
1214-
putChar(ch)
1215-
}
1216-
def putUnicode(): Unit = {
1217-
while ch == 'u' || ch == 'U' do nextChar()
1218-
var i = 0
1219-
var cp = 0
1220-
while (i < 4) {
1221-
val shift = (3 - i) * 4
1222-
val d = digit2int(ch, 16)
1223-
if(d < 0) {
1224-
return invalidUnicodeEscape()
1225-
}
1226-
cp += (d << shift)
1227-
nextChar()
1228-
i += 1
1229-
}
1230-
putChar(cp.asInstanceOf[Char])
1231-
}
1232-
if (ch == '\\') {
1271+
if ch == '\\' then
12331272
nextChar()
1234-
if ('0' <= ch && ch <= '7') {
1235-
val start = charOffset - 2
1236-
val leadch: Char = ch
1237-
var oct: Int = digit2int(ch, 8)
1238-
nextChar()
1239-
if ('0' <= ch && ch <= '7') {
1240-
oct = oct * 8 + digit2int(ch, 8)
1241-
nextChar()
1242-
if (leadch <= '3' && '0' <= ch && ch <= '7') {
1243-
oct = oct * 8 + digit2int(ch, 8)
1244-
nextChar()
1245-
}
1246-
}
1247-
val alt = if oct == LF then raw"\n" else f"${"\\"}u$oct%04x"
1248-
error(s"octal escape literals are unsupported: use $alt instead", start)
1249-
putChar(oct.toChar)
1250-
}
1251-
else if (ch == 'u' || ch == 'U') {
1252-
putUnicode()
1253-
}
1254-
else {
1255-
ch match {
1256-
case 'b' => putChar('\b')
1257-
case 't' => putChar('\t')
1258-
case 'n' => putChar('\n')
1259-
case 'f' => putChar('\f')
1260-
case 'r' => putChar('\r')
1261-
case '\"' => putChar('\"')
1262-
case '\'' => putChar('\'')
1263-
case '\\' => putChar('\\')
1264-
case _ => invalidEscape()
1265-
}
1266-
nextChar()
1267-
}
1268-
}
1269-
else {
1273+
charEscape()
1274+
else if !isSupplementary(ch, _ => true, strict = false) then
12701275
putChar(ch)
12711276
nextChar()
1272-
}
12731277

1274-
protected def invalidEscape(): Unit = {
1278+
private def charEscape(): Unit =
1279+
var bump = true
1280+
ch match
1281+
case 'b' => putChar('\b')
1282+
case 't' => putChar('\t')
1283+
case 'n' => putChar('\n')
1284+
case 'f' => putChar('\f')
1285+
case 'r' => putChar('\r')
1286+
case '\"' => putChar('\"')
1287+
case '\'' => putChar('\'')
1288+
case '\\' => putChar('\\')
1289+
case 'u' |
1290+
'U' => uEscape(); bump = false
1291+
case x if '0' <= x && x <= '7' => octalEscape(); bump = false
1292+
case _ => invalidEscape()
1293+
if bump then nextChar()
1294+
end charEscape
1295+
1296+
private def uEscape(): Unit =
1297+
while ch == 'u' || ch == 'U' do nextChar()
1298+
var i = 0
1299+
var cp = 0
1300+
while i < 4 do
1301+
val digit = digit2int(ch, 16)
1302+
if digit < 0 then
1303+
error("invalid character in unicode escape sequence", charOffset - 1)
1304+
putChar(ch)
1305+
return
1306+
val shift = (3 - i) * 4
1307+
cp += digit << shift
1308+
nextChar()
1309+
i += 1
1310+
end while
1311+
putChar(cp.asInstanceOf[Char])
1312+
end uEscape
1313+
1314+
private def octalEscape(): Unit =
1315+
val start = charOffset - 2
1316+
val leadch: Char = ch
1317+
var oct: Int = digit2int(ch, 8)
1318+
nextChar()
1319+
if '0' <= ch && ch <= '7' then
1320+
oct = oct * 8 + digit2int(ch, 8)
1321+
nextChar()
1322+
if leadch <= '3' && '0' <= ch && ch <= '7' then
1323+
oct = oct * 8 + digit2int(ch, 8)
1324+
nextChar()
1325+
val alt = if oct == LF then raw"\n" else toUnicode(oct.toChar)
1326+
error(s"octal escape literals are unsupported: use $alt instead", start)
1327+
putChar(oct.toChar)
1328+
end octalEscape
1329+
1330+
protected def invalidEscape(): Unit =
12751331
error("invalid escape character", charOffset - 1)
12761332
putChar(ch)
1277-
}
12781333

12791334
private def getLitChars(delimiter: Char) =
12801335
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))

compiler/src/dotty/tools/dotc/transform/Pickler.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,14 @@ class Pickler extends Phase {
140140
}
141141

142142
private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
143-
if (previous != unpickled) {
143+
import java.nio.charset.StandardCharsets.UTF_8
144+
def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
145+
val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
146+
if unequal then
144147
output("before-pickling.txt", previous)
145148
output("after-pickling.txt", unpickled)
146149
report.error(s"""pickling difference for $cls in ${cls.source}, for details:
147150
|
148151
| diff before-pickling.txt after-pickling.txt""".stripMargin)
149-
}
152+
end testSame
150153
}

scaladoc/src/dotty/tools/scaladoc/util/JSON.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def jsonString(s: String): JSON =
3131

3232
sb.append('"')
3333
firstToBeEncoded() match
34-
case -1 ⇒ sb.append(s)
34+
case -1 => sb.append(s)
3535
case first =>
3636
// sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather then "ab" as in Java
3737
sb.append(s.substring(0, first))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
val f = (x: Int) ⇒ x + 1 // error
3+
4+
val list = for (n ← List(42)) yield n + 1 // error

tests/neg/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
class C {
3+
def `too wide for Char` = '𐐀' // error
4+
}

tests/patmat/t11620.scala

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,20 @@ object B {
1919
}
2020

2121
def foo[T](b: B[T]) = b match {
22-
case B(A1(t)) ⇒ t
23-
case B(A2(t, _)) ⇒ t
22+
case B(A1(t)) => t
23+
case B(A2(t, _)) => t
2424
}
2525

2626
def foo2[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match {
27-
case B.Aux(a @ A1(_ )) ⇒ a.t
28-
case B.Aux(a @ A2(_, _)) ⇒ a.t1 // 👎 (false-positive): unreachable code
27+
case B.Aux(a @ A1(_ )) => a.t
28+
case B.Aux(a @ A2(_, _)) => a.t1 // 👎 (false-positive): unreachable code
2929
}
3030

3131
def foo3[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match {
32-
case B.Aux(a: A1[T]) ⇒ a.t
33-
case B.Aux(a: A2[T]) ⇒ a.t1 // 👎 (false-positive): unreachable code
32+
case B.Aux(a: A1[T]) => a.t
33+
case B.Aux(a: A2[T]) => a.t1 // 👎 (false-positive): unreachable code
3434
}
3535

3636
def foo4[T](b: B[T]) = b match {
37-
case B(A1(t)) ⇒ t // 👎 (false-negative): incomplete match
37+
case B(A1(t)) => t // 👎 (false-negative): incomplete match
3838
}

tests/pos/surrogates.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
2+
// allow supplementary chars in identifiers
3+
4+
class 𐐀 {
5+
def 𐐀 = 42
6+
7+
// regression check: anything goes in strings
8+
def x = "𐐀"
9+
def y = s"$𐐀"
10+
def w = s" 𐐀"
11+
}
12+
13+
case class 𐐀𐐀(n: Int) {
14+
def 𐐀𐐀 = n
15+
def `𐐀𐐀1` = n + n
16+
}
17+
18+
// uncontroversially, orphan surrogates may be introduced
19+
// via unicode escape.
20+
class Construction {
21+
def hi = '\ud801'
22+
def lo = '\udc00'
23+
def endhi = "abc\ud801"
24+
def startlo = "\udc00xyz"
25+
def reversed = "xyz\udc00\ud801abc"
26+
}
27+
28+
// was: error: illegal character '\ud801', '\udc00'

0 commit comments

Comments
 (0)