Skip to content

Commit bde1057

Browse files
committed
More unicode support
1 parent d84007c commit bde1057

File tree

7 files changed

+123
-54
lines changed

7 files changed

+123
-54
lines changed

compiler/src/dotty/tools/dotc/core/NameOps.scala

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,17 @@ object NameOps {
8686
def isVarPattern: Boolean =
8787
testSimple { n =>
8888
n.length > 0 && {
89+
def isLowerLetterSupplementary: Boolean =
90+
import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
91+
isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1)) && {
92+
val codepoint = toCodePoint(n(0), n(1))
93+
isValidCodePoint(codepoint) && isLetter(codepoint) && isLowerCase(codepoint)
94+
}
8995
val first = n.head
90-
(((first.isLower && first.isLetter) || first == '_')
91-
&& (n != false_)
92-
&& (n != true_)
93-
&& (n != null_))
96+
((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
97+
&& n != false_
98+
&& n != true_
99+
&& n != null_)
94100
}
95101
} || name.is(PatMatGivenVarName)
96102

compiler/src/dotty/tools/dotc/core/Names.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ import scala.annotation.internal.sharable
1515
object Names {
1616
import NameKinds._
1717

18-
/** Things that can be turned into names with `totermName` and `toTypeName`
19-
* Decorators defines implements these as extension methods for strings.
18+
/** Things that can be turned into names with `toTermName` and `toTypeName`
19+
* Decorators implements these as extension methods for strings.
2020
*/
2121
type PreName = Name | String
2222

@@ -25,7 +25,7 @@ object Names {
2525
*/
2626
abstract class Designator
2727

28-
/** A name if either a term name or a type name. Term names can be simple
28+
/** A name is either a term name or a type name. Term names can be simple
2929
* or derived. A simple term name is essentially an interned string stored
3030
* in a name table. A derived term name adds a tag, and possibly a number
3131
* or a further simple name to some other name.

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import config.Feature.migrateTo3
2121
import config.SourceVersion.`3.0`
2222
import reporting.{NoProfile, Profile}
2323

24+
import java.util.Objects
25+
2426
object Scanners {
2527

2628
/** Offset into source character array */
@@ -792,19 +794,21 @@ object Scanners {
792794
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
793795
isHighSurrogate(high) && {
794796
var res = false
795-
nextChar()
796-
val low = ch
797+
val low = lookaheadChar()
797798
if isLowSurrogate(low) then
798-
nextChar()
799799
val codepoint = toCodePoint(high, low)
800-
if isValidCodePoint(codepoint) && test(codepoint) then
801-
putChar(high)
802-
putChar(low)
803-
res = true
800+
if isValidCodePoint(codepoint) then
801+
if test(codepoint) then
802+
putChar(high)
803+
putChar(low)
804+
nextChar()
805+
nextChar()
806+
res = true
804807
else
805808
error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'")
806809
else if !strict then
807810
putChar(high)
811+
nextChar()
808812
res = true
809813
else
810814
error(s"illegal character '${toUnicode(high)}' missing low surrogate")
@@ -904,7 +908,6 @@ object Scanners {
904908
if (ch == '\"') {
905909
if (lookaheadChar() == '\"') {
906910
nextRawChar()
907-
//offset += 3 // first part is positioned at the quote
908911
nextRawChar()
909912
stringPart(multiLine = true)
910913
}
@@ -915,7 +918,6 @@ object Scanners {
915918
}
916919
}
917920
else {
918-
//offset += 1 // first part is positioned at the quote
919921
stringPart(multiLine = false)
920922
}
921923
}
@@ -1000,10 +1002,11 @@ object Scanners {
10001002
nextChar(); token = LARROW
10011003
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
10021004
}
1003-
else if (Character.isUnicodeIdentifierStart(ch)) {
1005+
else if (isUnicodeIdentifierStart(ch)) {
10041006
putChar(ch)
10051007
nextChar()
10061008
getIdentRest()
1009+
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
10071010
}
10081011
else if (isSpecial(ch)) {
10091012
putChar(ch)
@@ -1012,6 +1015,9 @@ object Scanners {
10121015
}
10131016
else if isSupplementary(ch, isUnicodeIdentifierStart) then
10141017
getIdentRest()
1018+
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
1019+
else if isSupplementary(ch, isSpecial) then
1020+
getOperatorRest()
10151021
else {
10161022
error(s"illegal character '${toUnicode(ch)}'")
10171023
nextChar()
@@ -1130,7 +1136,7 @@ object Scanners {
11301136
else error("unclosed quoted identifier")
11311137
}
11321138

1133-
private def getIdentRest(): Unit = (ch: @switch) match {
1139+
@tailrec private def getIdentRest(): Unit = (ch: @switch) match {
11341140
case 'A' | 'B' | 'C' | 'D' | 'E' |
11351141
'F' | 'G' | 'H' | 'I' | 'J' |
11361142
'K' | 'L' | 'M' | 'N' | 'O' |
@@ -1165,7 +1171,7 @@ object Scanners {
11651171
finishNamed()
11661172
}
11671173

1168-
private def getOperatorRest(): Unit = (ch: @switch) match {
1174+
@tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
11691175
case '~' | '!' | '@' | '#' | '%' |
11701176
'^' | '*' | '+' | '-' | '<' |
11711177
'>' | '?' | ':' | '=' | '&' |
@@ -1177,22 +1183,12 @@ object Scanners {
11771183
else { putChar(ch); nextChar(); getOperatorRest() }
11781184
case _ =>
11791185
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
1186+
else if (isSupplementary(ch, isSpecial)) getOperatorRest()
11801187
else finishNamed()
11811188
}
11821189

11831190
private def getIdentOrOperatorRest(): Unit =
1184-
if (isIdentifierPart(ch))
1185-
getIdentRest()
1186-
else ch match {
1187-
case '~' | '!' | '@' | '#' | '%' |
1188-
'^' | '*' | '+' | '-' | '<' |
1189-
'>' | '?' | ':' | '=' | '&' |
1190-
'|' | '\\' | '/' =>
1191-
getOperatorRest()
1192-
case _ =>
1193-
if (isSpecial(ch)) getOperatorRest()
1194-
else finishNamed()
1195-
}
1191+
if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()
11961192

11971193
def isSoftModifier: Boolean =
11981194
token == IDENTIFIER
@@ -1515,7 +1511,7 @@ object Scanners {
15151511
if (ch == '\'') finishCharLit()
15161512
else {
15171513
token = op
1518-
strVal = if (name != null) name.toString else null
1514+
strVal = Objects.toString(name)
15191515
litBuf.clear()
15201516
}
15211517
}

compiler/src/dotty/tools/dotc/util/Chars.scala

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
package dotty.tools.dotc.util
22

33
import scala.annotation.switch
4-
import java.lang.{Character => JCharacter}
5-
import java.lang.Character.LETTER_NUMBER
6-
import java.lang.Character.LOWERCASE_LETTER
7-
import java.lang.Character.OTHER_LETTER
8-
import java.lang.Character.TITLECASE_LETTER
9-
import java.lang.Character.UPPERCASE_LETTER
4+
import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
5+
import Character.{MATH_SYMBOL, OTHER_SYMBOL}
6+
import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}
107

118
/** Contains constants and classifier methods for characters */
12-
object Chars {
9+
object Chars:
1310

1411
inline val LF = '\u000A'
1512
inline val FF = '\u000C'
1613
inline val CR = '\u000D'
1714
inline val SU = '\u001A'
1815

16+
type CodePoint = Int
17+
1918
/** Convert a character digit to an Int according to given base,
2019
* -1 if no success
2120
*/
@@ -59,17 +58,21 @@ object Chars {
5958
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
6059

6160
/** Can character start an alphanumeric Scala identifier? */
62-
def isIdentifierStart(c: Char): Boolean =
63-
(c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
61+
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
62+
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
6463

6564
/** Can character form part of an alphanumeric Scala identifier? */
66-
def isIdentifierPart(c: Char): Boolean =
67-
(c == '$') || JCharacter.isUnicodeIdentifierPart(c)
65+
def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
66+
def isIdentifierPart(c: CodePoint) = (c == '$') || isUnicodeIdentifierPart(c)
6867

6968
/** Is character a math or other symbol in Unicode? */
7069
def isSpecial(c: Char): Boolean = {
71-
val chtp = JCharacter.getType(c)
72-
chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
70+
val chtp = Character.getType(c)
71+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
72+
}
73+
def isSpecial(codePoint: CodePoint) = {
74+
val chtp = Character.getType(codePoint)
75+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
7376
}
7477

7578
def isValidJVMChar(c: Char): Boolean =
@@ -78,15 +81,19 @@ object Chars {
7881
def isValidJVMMethodChar(c: Char): Boolean =
7982
!(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')
8083

81-
private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
82-
private final val letterGroups = {
83-
import JCharacter._
84-
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
85-
}
86-
def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
84+
def isScalaLetter(c: Char): Boolean =
85+
Character.getType(c: @switch) match {
86+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
87+
case _ => c == '$' || c == '_'
88+
}
89+
def isScalaLetter(c: CodePoint): Boolean =
90+
Character.getType(c: @switch) match {
91+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
92+
case _ => c == '$' || c == '_'
93+
}
8794

8895
/** Can character form part of a Scala operator name? */
89-
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
96+
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
9097
case '~' | '!' | '@' | '#' | '%' |
9198
'^' | '*' | '+' | '-' | '<' |
9299
'>' | '?' | ':' | '=' | '&' |
@@ -95,5 +102,4 @@ object Chars {
95102
}
96103

97104
/** Would the character be encoded by `NameTransformer.encode`? */
98-
def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
99-
}
105+
def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)

tests/pos/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,8 @@ class Construction {
2525
def reversed = "xyz\udc00\ud801abc"
2626
}
2727

28+
class Demon {
29+
val 😈 = 42
30+
}
31+
2832
// was: error: illegal character '\ud801', '\udc00'

tests/pos/t1406.scala

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
2+
class Identifiers {
3+
4+
def f(x: Any): Boolean = x match {
5+
case 𐐨XYZ: String => true
6+
case 𐐨 => true
7+
}
8+
def g(x: Any) = x match {
9+
case 𐐨 @ _ => 𐐨
10+
}
11+
}
12+
class Ops {
13+
def 𝆗 = 42 // was error: illegal character
14+
def op_𝆗 = 42 // was error: illegal character
15+
def 🌀 = 42
16+
def op_🌀 = 42
17+
def 🚀 = 42
18+
def op_🚀 = 42
19+
def 🜀 = 42
20+
def op_🜀 = 42
21+
def 𝓅 = 42
22+
def op_𝓅 = 42
23+
}
24+
class Strings {
25+
implicit class Interps(sc: StringContext) {
26+
def 𝓅(parts: Any*) = "done"
27+
}
28+
/*
29+
def 𝓅 = 42
30+
def interpolated = s"$𝓅"
31+
def e = "a 𝓅 b"
32+
*/
33+
def f = 𝓅"one"
34+
}

tests/run/t1406b.scala

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
case class C(n: Int) {
3+
def 𐀀(c: C): C = C(n * c.n) // actually a letter but supplementary 0x10000
4+
def ☀(c: C): C = C(n * c.n) // just a symbol
5+
def ☀=(c: C): C = C(n * c.n) // just a symbol
6+
def 🌀(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
7+
def 🌀=(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
8+
def *(c: C): C = C(n * c.n)
9+
def +(c: C): C = C(n + c.n)
10+
}
11+
object Test extends App {
12+
val c, d = C(42)
13+
println(c + d)
14+
println(c * d)
15+
println(c ☀ d)
16+
println(c * d + d)
17+
println(c ☀ d + d)
18+
println(c ☀= d + d) // assignment op is low precedence
19+
println(c 𐀀 d + d) // the first one, letter should be low precedence
20+
println(c 🌀d + d) // the second one, cyclone should be high precedence
21+
println(c 🌀= d + d) // the second one, cyclone should be high precedence
22+
}
23+

0 commit comments

Comments
(0)