Skip to content

Commit f3afab4

Browse files
committed
More unicode support
1 parent 794818b commit f3afab4

File tree

7 files changed

+121
-52
lines changed

7 files changed

+121
-52
lines changed

compiler/src/dotty/tools/dotc/core/NameOps.scala

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,17 @@ object NameOps {
8686
def isVarPattern: Boolean =
8787
testSimple { n =>
8888
n.length > 0 && {
89+
def isLowerLetterSupplementary: Boolean =
90+
import Character.{isHighSurrogate, isLowSurrogate, isLetter, isLowerCase, isValidCodePoint, toCodePoint}
91+
isHighSurrogate(n(0)) && n.length > 1 && isLowSurrogate(n(1)) && {
92+
val codepoint = toCodePoint(n(0), n(1))
93+
isValidCodePoint(codepoint) && isLetter(codepoint) && isLowerCase(codepoint)
94+
}
8995
val first = n.head
90-
(((first.isLower && first.isLetter) || first == '_')
91-
&& (n != false_)
92-
&& (n != true_)
93-
&& (n != null_))
96+
((first.isLower && first.isLetter || first == '_' || isLowerLetterSupplementary)
97+
&& n != false_
98+
&& n != true_
99+
&& n != null_)
94100
}
95101
} || name.is(PatMatGivenVarName)
96102

compiler/src/dotty/tools/dotc/core/Names.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ object Names {
2525
*/
2626
abstract class Designator
2727

28-
/** A name if either a term name or a type name. Term names can be simple
28+
/** A name is either a term name or a type name. Term names can be simple
2929
* or derived. A simple term name is essentially an interned string stored
3030
* in a name table. A derived term name adds a tag, and possibly a number
3131
* or a further simple name to some other name.

compiler/src/dotty/tools/dotc/parsing/Scanners.scala

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import config.Feature.{migrateTo3, fewerBracesEnabled}
2121
import config.SourceVersion.`3.0`
2222
import reporting.{NoProfile, Profile}
2323

24+
import java.util.Objects
25+
2426
object Scanners {
2527

2628
/** Offset into source character array */
@@ -773,19 +775,21 @@ object Scanners {
773775
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
774776
isHighSurrogate(high) && {
775777
var res = false
776-
nextChar()
777-
val low = ch
778+
val low = lookaheadChar()
778779
if isLowSurrogate(low) then
779-
nextChar()
780780
val codepoint = toCodePoint(high, low)
781-
if isValidCodePoint(codepoint) && test(codepoint) then
782-
putChar(high)
783-
putChar(low)
784-
res = true
781+
if isValidCodePoint(codepoint) then
782+
if test(codepoint) then
783+
putChar(high)
784+
putChar(low)
785+
nextChar()
786+
nextChar()
787+
res = true
785788
else
786789
error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'")
787790
else if !strict then
788791
putChar(high)
792+
nextChar()
789793
res = true
790794
else
791795
error(s"illegal character '${toUnicode(high)}' missing low surrogate")
@@ -885,7 +889,6 @@ object Scanners {
885889
if (ch == '\"') {
886890
if (lookaheadChar() == '\"') {
887891
nextRawChar()
888-
//offset += 3 // first part is positioned at the quote
889892
nextRawChar()
890893
stringPart(multiLine = true)
891894
}
@@ -896,7 +899,6 @@ object Scanners {
896899
}
897900
}
898901
else {
899-
//offset += 1 // first part is positioned at the quote
900902
stringPart(multiLine = false)
901903
}
902904
}
@@ -981,10 +983,11 @@ object Scanners {
981983
nextChar(); token = LARROW
982984
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
983985
}
984-
else if (Character.isUnicodeIdentifierStart(ch)) {
986+
else if (isUnicodeIdentifierStart(ch)) {
985987
putChar(ch)
986988
nextChar()
987989
getIdentRest()
990+
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
988991
}
989992
else if (isSpecial(ch)) {
990993
putChar(ch)
@@ -993,6 +996,9 @@ object Scanners {
993996
}
994997
else if isSupplementary(ch, isUnicodeIdentifierStart) then
995998
getIdentRest()
999+
if (ch == '"' && token == IDENTIFIER) token = INTERPOLATIONID
1000+
else if isSupplementary(ch, isSpecial) then
1001+
getOperatorRest()
9961002
else {
9971003
error(s"illegal character '${toUnicode(ch)}'")
9981004
nextChar()
@@ -1111,7 +1117,7 @@ object Scanners {
11111117
else error("unclosed quoted identifier")
11121118
}
11131119

1114-
private def getIdentRest(): Unit = (ch: @switch) match {
1120+
@tailrec private def getIdentRest(): Unit = (ch: @switch) match {
11151121
case 'A' | 'B' | 'C' | 'D' | 'E' |
11161122
'F' | 'G' | 'H' | 'I' | 'J' |
11171123
'K' | 'L' | 'M' | 'N' | 'O' |
@@ -1146,7 +1152,7 @@ object Scanners {
11461152
finishNamed()
11471153
}
11481154

1149-
private def getOperatorRest(): Unit = (ch: @switch) match {
1155+
@tailrec private def getOperatorRest(): Unit = (ch: @switch) match {
11501156
case '~' | '!' | '@' | '#' | '%' |
11511157
'^' | '*' | '+' | '-' | '<' |
11521158
'>' | '?' | ':' | '=' | '&' |
@@ -1158,22 +1164,12 @@ object Scanners {
11581164
else { putChar(ch); nextChar(); getOperatorRest() }
11591165
case _ =>
11601166
if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
1167+
else if (isSupplementary(ch, isSpecial)) getOperatorRest()
11611168
else finishNamed()
11621169
}
11631170

11641171
private def getIdentOrOperatorRest(): Unit =
1165-
if (isIdentifierPart(ch))
1166-
getIdentRest()
1167-
else ch match {
1168-
case '~' | '!' | '@' | '#' | '%' |
1169-
'^' | '*' | '+' | '-' | '<' |
1170-
'>' | '?' | ':' | '=' | '&' |
1171-
'|' | '\\' | '/' =>
1172-
getOperatorRest()
1173-
case _ =>
1174-
if (isSpecial(ch)) getOperatorRest()
1175-
else finishNamed()
1176-
}
1172+
if (isIdentifierPart(ch) || isSupplementary(ch, isIdentifierPart)) getIdentRest() else getOperatorRest()
11771173

11781174
def isSoftModifier: Boolean =
11791175
token == IDENTIFIER
@@ -1496,7 +1492,7 @@ object Scanners {
14961492
if (ch == '\'') finishCharLit()
14971493
else {
14981494
token = op
1499-
strVal = if (name != null) name.toString else null
1495+
strVal = Objects.toString(name)
15001496
litBuf.clear()
15011497
}
15021498
}

compiler/src/dotty/tools/dotc/util/Chars.scala

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
package dotty.tools.dotc.util
22

33
import scala.annotation.switch
4-
import java.lang.{Character => JCharacter}
5-
import java.lang.Character.LETTER_NUMBER
6-
import java.lang.Character.LOWERCASE_LETTER
7-
import java.lang.Character.OTHER_LETTER
8-
import java.lang.Character.TITLECASE_LETTER
9-
import java.lang.Character.UPPERCASE_LETTER
4+
import Character.{LETTER_NUMBER, LOWERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, UPPERCASE_LETTER}
5+
import Character.{MATH_SYMBOL, OTHER_SYMBOL}
6+
import Character.{isJavaIdentifierPart, isUnicodeIdentifierStart, isUnicodeIdentifierPart}
107

118
/** Contains constants and classifier methods for characters */
12-
object Chars {
9+
object Chars:
1310

1411
inline val LF = '\u000A'
1512
inline val FF = '\u000C'
1613
inline val CR = '\u000D'
1714
inline val SU = '\u001A'
1815

16+
type CodePoint = Int
17+
1918
/** Convert a character digit to an Int according to given base,
2019
* -1 if no success
2120
*/
@@ -59,17 +58,21 @@ object Chars {
5958
'0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
6059

6160
/** Can character start an alphanumeric Scala identifier? */
62-
def isIdentifierStart(c: Char): Boolean =
63-
(c == '_') || (c == '$') || JCharacter.isUnicodeIdentifierStart(c)
61+
def isIdentifierStart(c: Char): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
62+
def isIdentifierStart(c: CodePoint): Boolean = (c == '_') || (c == '$') || isUnicodeIdentifierStart(c)
6463

6564
/** Can character form part of an alphanumeric Scala identifier? */
66-
def isIdentifierPart(c: Char): Boolean =
67-
(c == '$') || JCharacter.isUnicodeIdentifierPart(c)
65+
def isIdentifierPart(c: Char): Boolean = (c == '$') || isUnicodeIdentifierPart(c)
66+
def isIdentifierPart(c: CodePoint) = (c == '$') || isUnicodeIdentifierPart(c)
6867

6968
/** Is character a math or other symbol in Unicode? */
7069
def isSpecial(c: Char): Boolean = {
71-
val chtp = JCharacter.getType(c)
72-
chtp == JCharacter.MATH_SYMBOL.toInt || chtp == JCharacter.OTHER_SYMBOL.toInt
70+
val chtp = Character.getType(c)
71+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
72+
}
73+
def isSpecial(codePoint: CodePoint) = {
74+
val chtp = Character.getType(codePoint)
75+
chtp == MATH_SYMBOL.toInt || chtp == OTHER_SYMBOL.toInt
7376
}
7477

7578
def isValidJVMChar(c: Char): Boolean =
@@ -78,15 +81,19 @@ object Chars {
7881
def isValidJVMMethodChar(c: Char): Boolean =
7982
!(c == '.' || c == ';' || c =='[' || c == '/' || c == '<' || c == '>')
8083

81-
private final val otherLetters = Set[Char]('\u0024', '\u005F') // '$' and '_'
82-
private final val letterGroups = {
83-
import JCharacter._
84-
Set[Byte](LOWERCASE_LETTER, UPPERCASE_LETTER, OTHER_LETTER, TITLECASE_LETTER, LETTER_NUMBER)
85-
}
86-
def isScalaLetter(ch: Char): Boolean = letterGroups(JCharacter.getType(ch).toByte) || otherLetters(ch)
84+
def isScalaLetter(c: Char): Boolean =
85+
Character.getType(c: @switch) match {
86+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
87+
case _ => c == '$' || c == '_'
88+
}
89+
def isScalaLetter(c: CodePoint): Boolean =
90+
Character.getType(c: @switch) match {
91+
case LOWERCASE_LETTER | UPPERCASE_LETTER | OTHER_LETTER | TITLECASE_LETTER | LETTER_NUMBER => true
92+
case _ => c == '$' || c == '_'
93+
}
8794

8895
/** Can character form part of a Scala operator name? */
89-
def isOperatorPart(c : Char) : Boolean = (c: @switch) match {
96+
def isOperatorPart(c: Char): Boolean = (c: @switch) match {
9097
case '~' | '!' | '@' | '#' | '%' |
9198
'^' | '*' | '+' | '-' | '<' |
9299
'>' | '?' | ':' | '=' | '&' |
@@ -95,5 +102,4 @@ object Chars {
95102
}
96103

97104
/** Would the character be encoded by `NameTransformer.encode`? */
98-
def willBeEncoded(c : Char) : Boolean = !JCharacter.isJavaIdentifierPart(c)
99-
}
105+
def willBeEncoded(c: Char): Boolean = !isJavaIdentifierPart(c)

tests/pos/surrogates.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,8 @@ class Construction {
2525
def reversed = "xyz\udc00\ud801abc"
2626
}
2727

28+
class Demon {
29+
val 😈 = 42
30+
}
31+
2832
// was: error: illegal character '\ud801', '\udc00'

tests/pos/t1406.scala

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
2+
class Identifiers {
3+
4+
def f(x: Any): Boolean = x match {
5+
case 𐐨XYZ: String => true
6+
case 𐐨 => true
7+
}
8+
def g(x: Any) = x match {
9+
case 𐐨 @ _ => 𐐨
10+
}
11+
}
12+
class Ops {
13+
def 𝆗 = 42 // was error: illegal character
14+
def op_𝆗 = 42 // was error: illegal character
15+
def 🌀 = 42
16+
def op_🌀 = 42
17+
def 🚀 = 42
18+
def op_🚀 = 42
19+
def 🜀 = 42
20+
def op_🜀 = 42
21+
def 𝓅 = 42
22+
def op_𝓅 = 42
23+
}
24+
class Strings {
25+
implicit class Interps(sc: StringContext) {
26+
def 𝓅(parts: Any*) = "done"
27+
}
28+
/*
29+
def 𝓅 = 42
30+
def interpolated = s"$𝓅"
31+
def e = "a 𝓅 b"
32+
*/
33+
def f = 𝓅"one"
34+
}

tests/run/t1406b.scala

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
case class C(n: Int) {
3+
def 𐀀(c: C): C = C(n * c.n) // actually a letter but supplementary 0x10000
4+
def ☀(c: C): C = C(n * c.n) // just a symbol
5+
def ☀=(c: C): C = C(n * c.n) // just a symbol
6+
def 🌀(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
7+
def 🌀=(c: C): C = C(n * c.n) // cyclone operator is symbol, supplementary
8+
def *(c: C): C = C(n * c.n)
9+
def +(c: C): C = C(n + c.n)
10+
}
11+
object Test extends App {
12+
val c, d = C(42)
13+
println(c + d)
14+
println(c * d)
15+
println(c ☀ d)
16+
println(c * d + d)
17+
println(c ☀ d + d)
18+
println(c ☀= d + d) // assignment op is low precedence
19+
println(c 𐀀 d + d) // the first one, letter should be low precedence
20+
println(c 🌀d + d) // the second one, cyclone should be high precedence
21+
println(c 🌀= d + d) // the second one, cyclone should be high precedence
22+
}
23+

0 commit comments

Comments
(0)