diff --git a/compiler/src/dotty/tools/dotc/parsing/Scanners.scala b/compiler/src/dotty/tools/dotc/parsing/Scanners.scala index f36176083792..e2e665c03362 100644 --- a/compiler/src/dotty/tools/dotc/parsing/Scanners.scala +++ b/compiler/src/dotty/tools/dotc/parsing/Scanners.scala @@ -696,6 +696,45 @@ object Scanners { recur(lastOffset, false) } + import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint} + + // f"\\u$c%04x" or f"${"\\"}u$c%04x" + private def toUnicode(c: Char): String = { val s = c.toInt.toHexString; "\\u" + "0" * (4 - s.length) + s } + + // given char (ch) is high surrogate followed by low, codepoint passes predicate. + // true means supplementary chars were put to buffer. + // strict to require low surrogate (if not in string literal). + private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean = + isHighSurrogate(high) && { + var res = false + nextChar() + val low = ch + if isLowSurrogate(low) then + nextChar() + val codepoint = toCodePoint(high, low) + if isValidCodePoint(codepoint) && test(codepoint) then + putChar(high) + putChar(low) + res = true + else + error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'") + else if !strict then + putChar(high) + res = true + else + error(s"illegal character '${toUnicode(high)}' missing low surrogate") + res + } + private def atSupplementary(ch: Char, f: Int => Boolean): Boolean = + isHighSurrogate(ch) && { + val hi = ch + val lo = lookaheadChar() + isLowSurrogate(lo) && { + val codepoint = toCodePoint(hi, lo) + isValidCodePoint(codepoint) && f(codepoint) + } + } + /** read next token, filling TokenData fields of Scanner. 
*/ protected final def fetchToken(): Unit = { @@ -822,11 +861,12 @@ object Scanners { else ch match { case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' => token = QUOTE - case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) => + case _ if !isAtEnd && ch != SU && ch != CR && ch != LF => val isEmptyCharLit = (ch == '\'') getLitChar() if ch == '\'' then if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)") + else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(toUnicode).mkString("'", "", "'")) else finishCharLit() else if isEmptyCharLit then error("empty character literal") else error("unclosed character literal") @@ -869,9 +909,11 @@ object Scanners { def fetchOther() = if (ch == '\u21D2') { nextChar(); token = ARROW + report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset)) } else if (ch == '\u2190') { nextChar(); token = LARROW + report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset)) } else if (Character.isUnicodeIdentifierStart(ch)) { putChar(ch) @@ -883,9 +925,10 @@ object Scanners { nextChar() getOperatorRest() } + else if isSupplementary(ch, isUnicodeIdentifierStart) then + getIdentRest() else { - // FIXME: Dotty deviation: f"" interpolator is not supported (#1814) - error("illegal character '\\u%04x'".format(ch: Int)) + error(s"illegal character '${toUnicode(ch)}'") nextChar() } fetchOther() @@ -1024,11 +1067,12 @@ object Scanners { case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true! 
finishNamed() case _ => - if (Character.isUnicodeIdentifierPart(ch)) { + if isUnicodeIdentifierPart(ch) then putChar(ch) nextChar() getIdentRest() - } + else if isSupplementary(ch, isUnicodeIdentifierPart) then + getIdentRest() else finishNamed() } @@ -1111,7 +1155,7 @@ object Scanners { } // for interpolated strings - @annotation.tailrec private def getStringPart(multiLine: Boolean): Unit = + @tailrec private def getStringPart(multiLine: Boolean): Unit = if (ch == '"') if (multiLine) { nextRawChar() @@ -1136,6 +1180,28 @@ object Scanners { getStringPart(multiLine) } else if (ch == '$') { + def getInterpolatedIdentRest(hasSupplement: Boolean): Unit = + @tailrec def loopRest(): Unit = + if ch != SU && isUnicodeIdentifierPart(ch) then + putChar(ch) ; nextRawChar() + loopRest() + else if atSupplementary(ch, isUnicodeIdentifierPart) then + putChar(ch) ; nextRawChar() + putChar(ch) ; nextRawChar() + loopRest() + else + finishNamedToken(IDENTIFIER, target = next) + end loopRest + setStrVal() + token = STRINGPART + next.lastOffset = charOffset - 1 + next.offset = charOffset - 1 + putChar(ch) ; nextRawChar() + if hasSupplement then + putChar(ch) ; nextRawChar() + loopRest() + end getInterpolatedIdentRest + nextRawChar() if (ch == '$' || ch == '"') { putChar(ch) @@ -1146,18 +1212,10 @@ object Scanners { setStrVal() token = STRINGPART } - else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') { - setStrVal() - token = STRINGPART - next.lastOffset = charOffset - 1 - next.offset = charOffset - 1 - while - putChar(ch) - nextRawChar() - ch != SU && Character.isUnicodeIdentifierPart(ch) - do () - finishNamedToken(IDENTIFIER, target = next) - } + else if isUnicodeIdentifierStart(ch) || ch == '_' then + getInterpolatedIdentRest(hasSupplement = false) + else if atSupplementary(ch, isUnicodeIdentifierStart) then + getInterpolatedIdentRest(hasSupplement = true) else error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected", off = charOffset - 2) 
putChar('$') @@ -1205,76 +1263,73 @@ object Scanners { false } - /** copy current character into litBuf, interpreting any escape sequences, - * and advance to next character. + /** Copy current character into litBuf, interpreting any escape sequences, + * and advance to next character. Surrogate pairs are consumed (see check + * at fetchSingleQuote), but orphan surrogate is allowed. */ protected def getLitChar(): Unit = - def invalidUnicodeEscape() = { - error("invalid character in unicode escape sequence", charOffset - 1) - putChar(ch) - } - def putUnicode(): Unit = { - while ch == 'u' || ch == 'U' do nextChar() - var i = 0 - var cp = 0 - while (i < 4) { - val shift = (3 - i) * 4 - val d = digit2int(ch, 16) - if(d < 0) { - return invalidUnicodeEscape() - } - cp += (d << shift) - nextChar() - i += 1 - } - putChar(cp.asInstanceOf[Char]) - } - if (ch == '\\') { + if ch == '\\' then nextChar() - if ('0' <= ch && ch <= '7') { - val start = charOffset - 2 - val leadch: Char = ch - var oct: Int = digit2int(ch, 8) - nextChar() - if ('0' <= ch && ch <= '7') { - oct = oct * 8 + digit2int(ch, 8) - nextChar() - if (leadch <= '3' && '0' <= ch && ch <= '7') { - oct = oct * 8 + digit2int(ch, 8) - nextChar() - } - } - val alt = if oct == LF then raw"\n" else f"${"\\"}u$oct%04x" - error(s"octal escape literals are unsupported: use $alt instead", start) - putChar(oct.toChar) - } - else if (ch == 'u' || ch == 'U') { - putUnicode() - } - else { - ch match { - case 'b' => putChar('\b') - case 't' => putChar('\t') - case 'n' => putChar('\n') - case 'f' => putChar('\f') - case 'r' => putChar('\r') - case '\"' => putChar('\"') - case '\'' => putChar('\'') - case '\\' => putChar('\\') - case _ => invalidEscape() - } - nextChar() - } - } - else { + charEscape() + else if !isSupplementary(ch, _ => true, strict = false) then putChar(ch) nextChar() - } - protected def invalidEscape(): Unit = { + private def charEscape(): Unit = + var bump = true + ch match + case 'b' => putChar('\b') + case 't' 
=> putChar('\t') + case 'n' => putChar('\n') + case 'f' => putChar('\f') + case 'r' => putChar('\r') + case '\"' => putChar('\"') + case '\'' => putChar('\'') + case '\\' => putChar('\\') + case 'u' | + 'U' => uEscape(); bump = false + case x if '0' <= x && x <= '7' => octalEscape(); bump = false + case _ => invalidEscape() + if bump then nextChar() + end charEscape + + private def uEscape(): Unit = + while ch == 'u' || ch == 'U' do nextChar() + var i = 0 + var cp = 0 + while i < 4 do + val digit = digit2int(ch, 16) + if digit < 0 then + error("invalid character in unicode escape sequence", charOffset - 1) + putChar(ch) + return + val shift = (3 - i) * 4 + cp += digit << shift + nextChar() + i += 1 + end while + putChar(cp.asInstanceOf[Char]) + end uEscape + + private def octalEscape(): Unit = + val start = charOffset - 2 + val leadch: Char = ch + var oct: Int = digit2int(ch, 8) + nextChar() + if '0' <= ch && ch <= '7' then + oct = oct * 8 + digit2int(ch, 8) + nextChar() + if leadch <= '3' && '0' <= ch && ch <= '7' then + oct = oct * 8 + digit2int(ch, 8) + nextChar() + val alt = if oct == LF then raw"\n" else toUnicode(oct.toChar) + error(s"octal escape literals are unsupported: use $alt instead", start) + putChar(oct.toChar) + end octalEscape + + protected def invalidEscape(): Unit = error("invalid escape character", charOffset - 1) putChar(ch) - } private def getLitChars(delimiter: Char) = while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) diff --git a/compiler/src/dotty/tools/dotc/transform/Pickler.scala b/compiler/src/dotty/tools/dotc/transform/Pickler.scala index 98b61d8b6b60..2faf450335dd 100644 --- a/compiler/src/dotty/tools/dotc/transform/Pickler.scala +++ b/compiler/src/dotty/tools/dotc/transform/Pickler.scala @@ -140,11 +140,14 @@ class Pickler extends Phase { } private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) = - if (previous != unpickled) { + import 
java.nio.charset.StandardCharsets.UTF_8 + def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8) + val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous) + if unequal then output("before-pickling.txt", previous) output("after-pickling.txt", unpickled) report.error(s"""pickling difference for $cls in ${cls.source}, for details: | | diff before-pickling.txt after-pickling.txt""".stripMargin) - } + end testSame } diff --git a/scaladoc/src/dotty/tools/scaladoc/util/JSON.scala b/scaladoc/src/dotty/tools/scaladoc/util/JSON.scala index 468300db6616..3de509721e2b 100644 --- a/scaladoc/src/dotty/tools/scaladoc/util/JSON.scala +++ b/scaladoc/src/dotty/tools/scaladoc/util/JSON.scala @@ -31,7 +31,7 @@ def jsonString(s: String): JSON = sb.append('"') firstToBeEncoded() match - case -1 ⇒ sb.append(s) + case -1 => sb.append(s) case first => // sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather then "ab" as in Java sb.append(s.substring(0, first)) diff --git a/tests/neg-custom-args/deprecation/old-syntax.scala b/tests/neg-custom-args/deprecation/old-syntax.scala new file mode 100644 index 000000000000..0ba7bbee7db0 --- /dev/null +++ b/tests/neg-custom-args/deprecation/old-syntax.scala @@ -0,0 +1,4 @@ + +val f = (x: Int) ⇒ x + 1 // error + +val list = for (n ← List(42)) yield n + 1 // error diff --git a/tests/neg/surrogates.scala b/tests/neg/surrogates.scala new file mode 100644 index 000000000000..6ac4ecb43f54 --- /dev/null +++ b/tests/neg/surrogates.scala @@ -0,0 +1,4 @@ + +class C { + def `too wide for Char` = '𐐀' // error +} diff --git a/tests/patmat/t11620.scala b/tests/patmat/t11620.scala index 05dfc1e0e437..01e6d1099a8f 100644 --- a/tests/patmat/t11620.scala +++ b/tests/patmat/t11620.scala @@ -19,20 +19,20 @@ object B { } def foo[T](b: B[T]) = b match { - case B(A1(t)) ⇒ t - case B(A2(t, _)) ⇒ t + case B(A1(t)) => t + case B(A2(t, _)) => t } def foo2[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match { - case B.Aux(a @ 
A1(_ )) ⇒ a.t - case B.Aux(a @ A2(_, _)) ⇒ a.t1 // 👎 (false-positive): unreachable code + case B.Aux(a @ A1(_ )) => a.t + case B.Aux(a @ A2(_, _)) => a.t1 // 👎 (false-positive): unreachable code } def foo3[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match { - case B.Aux(a: A1[T]) ⇒ a.t - case B.Aux(a: A2[T]) ⇒ a.t1 // 👎 (false-positive): unreachable code + case B.Aux(a: A1[T]) => a.t + case B.Aux(a: A2[T]) => a.t1 // 👎 (false-positive): unreachable code } def foo4[T](b: B[T]) = b match { - case B(A1(t)) ⇒ t // 👎 (false-negative): incomplete match + case B(A1(t)) => t // 👎 (false-negative): incomplete match } diff --git a/tests/pos/surrogates.scala b/tests/pos/surrogates.scala new file mode 100644 index 000000000000..1b710ad901ae --- /dev/null +++ b/tests/pos/surrogates.scala @@ -0,0 +1,28 @@ + +// allow supplementary chars in identifiers + +class 𐐀 { + def 𐐀 = 42 + + // regression check: anything goes in strings + def x = "𐐀" + def y = s"$𐐀" + def w = s" 𐐀" +} + +case class 𐐀𐐀(n: Int) { + def 𐐀𐐀 = n + def `𐐀𐐀1` = n + n +} + +// uncontroversially, orphan surrogates may be introduced +// via unicode escape. 
+class Construction { + def hi = '\ud801' + def lo = '\udc00' + def endhi = "abc\ud801" + def startlo = "\udc00xyz" + def reversed = "xyz\udc00\ud801abc" +} + +// was: error: illegal character '\ud801', '\udc00' diff --git a/tests/run/t9915/Test_2.scala b/tests/run/t9915/Test_2.scala index afed667cc6e5..164ee0b2307f 100644 --- a/tests/run/t9915/Test_2.scala +++ b/tests/run/t9915/Test_2.scala @@ -1,12 +1,15 @@ object Test extends App { + private def dump(s: String) = s.map(c => f"${c.toInt}%02X").mkString(" ") + def assertEqualStrings(expected: String)(actual: String) = + assert(expected == actual, s"Expected:\n${dump(expected)}\nActual:\n${dump(actual)}") val c = new C_1 - assert(c.nulled == "X\u0000ABC") // "X\000ABC" - assert(c.supped == "𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") + assert(C_1.NULLED.length == "XYABC".length) + assert(C_1.SUPPED.codePointCount(0, C_1.SUPPED.length) == 8) - assert(C_1.NULLED == "X\u0000ABC") // "X\000ABC" - assert(C_1.SUPPED == "𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") + assertEqualStrings(c.nulled)("X\u0000ABC") // "X\000ABC" in java source + assertEqualStrings(c.supped)("𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") - assert(C_1.NULLED.size == "XYABC".size) - assert(C_1.SUPPED.codePointCount(0, C_1.SUPPED.length) == 8) + assertEqualStrings(C_1.NULLED)("X\u0000ABC") // "X\000ABC" in java source + assertEqualStrings(C_1.SUPPED)("𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") }