Skip to content

Commit dff6902

Browse files
committed
More unicode support
1 parent c715a47 commit dff6902

File tree

7 files changed

+121
-52
lines changed

7 files changed

+121
-52
lines changed

compiler/src/dotty/tools/dotc/core/NameOps.scala

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,17 @@ object NameOps {
8686
def isVarPattern: Boolean =
8787
testSimple { n =>
8888
n.length > 0 && {
89+
def isLowerLetterSupplementary: Boolean =
90+
import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
91+
isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1)) && {
92+
val codepoint = toCodePoint(n(0), n(1))
93+
isValidCodePoint(codepoint) && isLetter(codepoint) && isLowerCase(codepoint)
94+
}
8995
val first = n.head
90-
(((first.isLower && first.isLetter) || first == '_')
91-
&& (n != false_)
92-
&& (n != true_)
93-
&& (n != null_))
96+
((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
97+
&& n != false_
98+
&& n != true_
99+
&& n != null_)
94100
}
95101
} || name.is(PatMatGivenVarName)
96102

compiler/src/dotty/tools/dotc/core/Names.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ object Names {
2525
*/
2626
abstract class Designator
2727

28-
/** A name if either a term name or a type name. Term names can be simple
28+
/** A name is either a term name or a type name. Term names can be simple
2929
* or derived. A simple term name is essentially an interned string stored
3030
* in a name table. A derived term name adds a tag, and possibly a number
3131
* or a further simple name to some other name.

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import config.Feature.{migrateTo3, fewerBracesEnabled}
2121
import config.SourceVersion.`3.0`
2222
import reporting.{NoProfile, Profile, Message}
2323

24+
import java.util.Objects
25+
2426
object Scanners {
2527

2628
/** Offset into source character array */
@@ -777,19 +779,21 @@ object Scanners {
777779
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
778780
isHighSurrogate(high) && {
779781
var res = false
780-
nextChar()
781-
val low = ch
782+
val low = lookaheadChar()
782783
if isLowSurrogate(low) then
783-
nextChar()
784784
val codepoint = toCodePoint(high, low)
785-
if isValidCodePoint(codepoint) && test(codepoint) then
786-
putChar(high)
787-
putChar(low)
788-
res = true
785+
if isValidCodePoint(codepoint) then
786+
if test(codepoint) then
787+
putChar(high)
788+
putChar(low)
789+
nextChar()
790+
nextChar()
791+
res = true
789792
else
790793
error(em"illegal character '${toUnicode(high)}${toUnicode(low)}'")
791794
else if !strict then
792795
putChar(high)
796+
nextChar()
793797
res = true
794798
else
795799
error(em"illegal character '${toUnicode(high)}' missing low surrogate")
@@ -889,7 +893,6 @@ object Scanners {
889893
if (ch == '\"') {
890894
if (lookaheadChar() == '\"') {
891895
nextRawChar()
892-
//offset += 3 // first part is positioned at the quote
893896
nextRawChar()
894897
stringPart(multiLine = true)
895898
}
@@ -900,7 +903,6 @@ object Scanners {
900903
}
901904
}
902905
else {
903-
//offset += 1 // first part is positioned at the quote
904906
stringPart(multiLine = false)
905907
}
906908
}
@@ -985,10 +987,11 @@ object Scanners {
985987
nextChar(); token = LARROW
986988
report.deprecationWarning(em"The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
987989
}
988-
else if (Character.isUnicodeIdentifierStart(ch)) {
990+
else if (isUnicodeIdentifierStart(ch)) {
989991
putChar(ch)
990992
nextChar()
991993
getIdentRest()
994+
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
992995
}
993996
else if (isSpecial(ch)) {
994997
putChar(ch)
@@ -997,6 +1000,9 @@ object Scanners {
9971000
}
9981001
else if isSupplementary(ch, isUnicodeIdentifierStart) then
9991002
getIdentRest()
1003+
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
1004+
else if isSupplementary(ch, isSpecial) then
1005+
getOperatorRest()
10001006
else {
10011007
error(em"illegal character '${toUnicode(ch)}'")
10021008
nextChar()
@@ -1115,7 +1121,7 @@ object Scanners {
11151121
else error(em"unclosed quoted identifier")
11161122
}
11171123

1118-
private def getIdentRest(): Unit = (ch: @switch) match {
1124+
@tailrec private def getIdentRest(): Unit = (ch: @switch) match {
11191125
case 'A' | 'B' | 'C' | 'D' | 'E' |
11201126
'F' | 'G' | 'H' | 'I' | 'J' |
11211127
'K' | 'L' | 'M' | 'N' | 'O' |
@@ -1150,7 +1156,7 @@ object Scanners {
11501156
finishNamed()
11511157
}
11521158

1153-
private def getOperatorRest(): Unit = (ch: @switch) match {
1159+
@tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
11541160
case '~' | '!' | '@' | '#' | '%' |
11551161
'^' | '*' | '+' | '-' | '<' |
11561162
'>' | '?' | ':' | '=' | '&' |
@@ -1162,22 +1168,12 @@ object Scanners {
11621168
else { putChar(ch); nextChar(); getOperatorRest() }
11631169
case _ =>
11641170
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
1171+
else if (isSupplementary(ch, isSpecial)) getOperatorRest()
11651172
else finishNamed()
11661173
}
11671174

11681175
private def getIdentOrOperatorRest(): Unit =
1169-
if (isIdentifierPart(ch))
1170-
getIdentRest()
1171-
else ch match {
1172-
case '~' | '!' | '@' | '#' | '%' |
1173-
'^' | '*' | '+' | '-' | '<' |
1174-
'>' | '?' | ':' | '=' | '&' |
1175-
'|' | '\\' | '/' =>
1176-
getOperatorRest()
1177-
case _ =>
1178-
if (isSpecial(ch)) getOperatorRest()
1179-
else finishNamed()
1180-
}
1176+
if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()
11811177

11821178
def isSoftModifier: Boolean =
11831179
token == IDENTIFIER
@@ -1500,7 +1496,7 @@ object Scanners {
15001496
if (ch == '\'') finishCharLit()
15011497
else {
15021498
token = op
1503-
strVal = if (name != null) name.toString else null
1499+
strVal = Objects.toString(name)
15041500
litBuf.clear()
15051501
}
15061502
}

compiler/src/dotty/tools/dotc/util/Chars.scala

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
package dotty.tools.dotc.util
22

33
import scala.annotation.switch
4-
import java.lang.{Character => JCharacter}
5-
import java.lang.Character.LETTER_NUMBER
6-
import java.lang.Character.LOWERCASE_LETTER
7-
import java.lang.Character.OTHER_LETTER
8-
import java.lang.Character.TITLECASE_LETTER
9-
import java.lang.Character.UPPERCASE_LETTER
4+
import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
5+
import Character.{MATH_SYMBOL, OTHER_SYMBOL}
6+
import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}
107

118
/** Contains constants and classifier methods for characters */
12-
object Chars {
9+
object Chars:
1310

1411
inline val LF = '\u000A'
1512
inline val FF = '\u000C'
1613
inline val CR = '\u000D'
1714
inline val SU = '\u001A'
1815

16+
type CodePoint = Int
17+
1918
/** Convert a character digit to an Int according to given base,
2019
* -1 if no success
2120
*/
@@ -59,17 +58,21 @@ object Chars {
5958
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
6059

6160
/** Can character start an alphanumeric Scala identifier? */
62-
def isIdentifierStart(c: Char): Boolean =
63-
(c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
61+
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
62+
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
6463

6564
/** Can character form part of an alphanumeric Scala identifier? */
66-
def isIdentifierPart(c: Char): Boolean =
67-
(c == '$') || JCharacter.isUnicodeIdentifierPart(c)
65+
def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
66+
def isIdentifierPart(c: CodePoint) = (c == '$') || isUnicodeIdentifierPart(c)
6867

6968
/** Is character a math or other symbol in Unicode? */
7069
def isSpecial(c: Char): Boolean = {
71-
val chtp = JCharacter.getType(c)
72-
chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
70+
val chtp = Character.getType(c)
71+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
72+
}
73+
def isSpecial(codePoint: CodePoint) = {
74+
val chtp = Character.getType(codePoint)
75+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
7376
}
7477

7578
def isValidJVMChar(c: Char): Boolean =
@@ -78,15 +81,19 @@ object Chars {
7881
def isValidJVMMethodChar(c: Char): Boolean =
7982
!(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')
8083

81-
private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
82-
private final val letterGroups = {
83-
import JCharacter._
84-
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
85-
}
86-
def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
84+
def isScalaLetter(c: Char): Boolean =
85+
Character.getType(c: @switch) match {
86+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
87+
case _ => c == '$' || c == '_'
88+
}
89+
def isScalaLetter(c: CodePoint): Boolean =
90+
Character.getType(c: @switch) match {
91+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
92+
case _ => c == '$' || c == '_'
93+
}
8794

8895
/** Can character form part of a Scala operator name? */
89-
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
96+
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
9097
case '~' | '!' | '@' | '#' | '%' |
9198
'^' | '*' | '+' | '-' | '<' |
9299
'>' | '?' | ':' | '=' | '&' |
@@ -95,5 +102,4 @@ object Chars {
95102
}
96103

97104
/** Would the character be encoded by `NameTransformer.encode`? */
98-
def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
99-
}
105+
def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)

tests/pos/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,8 @@ class Construction {
2525
def reversed = "xyz\udc00\ud801abc"
2626
}
2727

28+
class Demon {
29+
val 😈 = 42
30+
}
31+
2832
// was: error: illegal character '\ud801', '\udc00'

tests/pos/t1406.scala

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
2+
class Identifiers {
3+
4+
def f(x: Any): Boolean = x match {
5+
case 𐐨XYZ: String => true
6+
case 𐐨 => true
7+
}
8+
def g(x: Any) = x match {
9+
case 𐐨 @ _ => 𐐨
10+
}
11+
}
12+
class Ops {
13+
def 𝆗 = 42 // was error: illegal character
14+
def op_𝆗 = 42 // was error: illegal character
15+
def 🌀 = 42
16+
def op_🌀 = 42
17+
def 🚀 = 42
18+
def op_🚀 = 42
19+
def 🜀 = 42
20+
def op_🜀 = 42
21+
def 𝓅 = 42
22+
def op_𝓅 = 42
23+
}
24+
class Strings {
25+
implicit class Interps(sc: StringContext) {
26+
def 𝓅(parts: Any*) = "done"
27+
}
28+
/*
29+
def 𝓅 = 42
30+
def interpolated = s"$𝓅"
31+
def e = "a 𝓅 b"
32+
*/
33+
def f = 𝓅"one"
34+
}

tests/run/t1406b.scala

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
case class C(n: Int) {
3+
def 𐀀(c: C): C = C(n * c.n) // actually a letter but supplementary 0x10000
4+
def ☀(c: C): C = C(n * c.n) // just a symbol
5+
def ☀=(c: C): C = C(n * c.n) // just a symbol
6+
def 🌀(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
7+
def 🌀=(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
8+
def *(c: C): C = C(n * c.n)
9+
def +(c: C): C = C(n + c.n)
10+
}
11+
object Test extends App {
12+
val c, d = C(42)
13+
println(c + d)
14+
println(c * d)
15+
println(c ☀ d)
16+
println(c * d + d)
17+
println(c ☀ d + d)
18+
println(c ☀= d + d) // assignment op is low precedence
19+
println(c 𐀀 d + d) // the first one, letter should be low precedence
20+
println(c 🌀d + d) // the second one, cyclone should be high precedence
21+
println(c 🌀= d + d) // the second one, cyclone should be high precedence
22+
}
23+

0 commit comments

Comments
(0)