Skip to content

Commit 9924aff

Browse files
committed
Remove highly restrictive scriptset support
1 parent fcfa87f commit 9924aff

File tree

5 files changed

+25
-45
lines changed

5 files changed

+25
-45
lines changed

lib/elixir/pages/references/unicode-syntax.md

+3-5
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,11 @@ Elixir will not warn on confusability for identifiers made up exclusively of cha
136136

137137
### C3. Mixed Script Detection
138138

139-
Elixir will not allow tokenization of mixed-script identifiers unless it is via chunks separated by an underscore, like `http_сервер`, or unless the mixing within each of those chunks is one of the exceptions defined in UTS 39 5.2, 'Highly Restrictive'. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
139+
Elixir will not allow tokenization of mixed-script identifiers unless it is via chunks separated by an underscore, like `http_сервер`. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
140140

141-
Examples: Elixir allows identifiers like `幻ㄒㄧㄤ`, even though it includes characters from multiple 'scripts', because those scripts all 'resolve' to Japanese when applying the resolution rules from UTS 39 5.1. It also allows an atom like `:Tシャツ`, the Japanese word for 't-shirt', which incorporates a Latin capital T, because {Latn, Jpan} is one of the allowed script mixings in the definition of 'Highly Restrictive' in UTS 39 5.2, and it 'covers' the string.
141+
Examples: Elixir allows identifiers like `幻ㄒㄧㄤ`, even though it includes characters from multiple 'scripts' (Han and Bopomofo), because the script sets of those characters all 'resolve' to a single common script, Han with Bopomofo, when applying the resolution rules from UTS 39 5.1. When mixing Latin and Japanese scripts, underscores are necessary, as in `:T_シャツ` (the Japanese word for 't-shirt' with an additional underscore separating the letter T).
142142

143-
Elixir does allow an identifier like `http_сервер`, where the identifier chunks on each side of the `_` are individually single-script.
144-
145-
However, Elixir would prevent tokenization in code like `if аdmin, do: :ok, else: :err`, where the scriptset for the 'a' character is {Cyrillic} but all other characters have scriptsets of {Latin}. The scriptsets fail to resolve, and the scriptsets from the definition of 'Highly Restrictive' in UTS 39 5.2 do not cover the string either, so a descriptive error is shown.
143+
Elixir does not allow code like `if аdmin, do: :ok, else: :err`, where the scriptset for the 'a' character is {Cyrillic} but all other characters have scriptsets of {Latin}. The scriptsets fail to resolve and a descriptive error is shown.
146144

147145
### C4, C5 (inapplicable)
148146

lib/elixir/src/elixir_tokenizer.erl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1240,7 +1240,7 @@ tokenize_identifier(String, Line, Column, Scope, MaybeKeyword) ->
12401240
Error
12411241
end;
12421242

1243-
{error, {not_highly_restrictive, Wrong, {Prefix, Suffix}}} ->
1243+
{error, {mixed_script, Wrong, {Prefix, Suffix}}} ->
12441244
WrongColumn = Column + length(Wrong) - 1,
12451245
case suggest_simpler_unexpected_token_in_error(Wrong, Line, WrongColumn, Scope) of
12461246
no_suggestion ->

lib/elixir/test/elixir/kernel/string_tokenizer_test.exs

+6-6
Original file line numberDiff line numberDiff line change
@@ -120,21 +120,21 @@ defmodule Kernel.StringTokenizerTest do
120120
end
121121

122122
test "allows legitimate script mixing" do
123-
# writing systems that legitimately mix multiple scripts, and Common chars like _
123+
# Mixed script with supersets, numbers, and underscores
124124
assert Code.eval_string("幻ㄒㄧㄤ = 1") == {1, [幻ㄒㄧㄤ: 1]}
125125
assert Code.eval_string("幻ㄒㄧㄤ1 = 1") == {1, [幻ㄒㄧㄤ1: 1]}
126126
assert Code.eval_string("__सवव_1? = 1") == {1, [__सवव_1?: 1]}
127127

128-
# works with atoms too
129-
assert Code.eval_string(":Tシャツ") == {:Tシャツ, []}
130-
131-
# elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
128+
# Elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
132129
# ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
133130
assert Code.eval_string("μs = 1") == {1, [μs: 1]}
134131

135-
# allows mixed scripts if the chunks are all single-script or highly restrictive
132+
# Mixed scripts in variables
136133
assert Code.eval_string("http_сервер = 1") == {1, [http_сервер: 1]}
137134
assert Code.eval_string("сервер_http = 1") == {1, [сервер_http: 1]}
135+
136+
# Mixed scripts in atoms
137+
assert Code.eval_string(":T_シャツ") == {:T_シャツ, []}
138138
end
139139

140140
test "bidi" do

lib/elixir/test/elixir/kernel/warning_test.exs

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ defmodule Kernel.WarningTest do
6363

6464
test "warnings from macro" do
6565
assert_warn_eval(
66-
["demo:66\n", "key :dup will be overridden in map\n"],
66+
["demo:60\n", "key :dup will be overridden in map\n"],
6767
"""
6868
import Kernel.WarningTest
6969
will_warn()

lib/elixir/unicode/tokenizer.ex

+14-32
Original file line numberDiff line numberDiff line change
@@ -264,14 +264,6 @@ defmodule String.Tokenizer do
264264
@top top
265265
@indexed_scriptsets sorted_scriptsets |> Enum.with_index(&{&2, &1}) |> Map.new()
266266

267-
latin = Map.fetch!(scriptset_masks, "Latin")
268-
269-
@highly_restrictive [
270-
ScriptSet.union(latin, Map.fetch!(scriptset_masks, "Japanese")),
271-
ScriptSet.union(latin, Map.fetch!(scriptset_masks, "Han with Bopomofo")),
272-
ScriptSet.union(latin, Map.fetch!(scriptset_masks, "Korean"))
273-
]
274-
275267
# ScriptSet helpers. Inline instead of dispatching to ScriptSet for performance
276268

277269
@compile {:inline, ss_latin: 1, ss_intersect: 2}
@@ -481,7 +473,7 @@ defmodule String.Tokenizer do
481473
[:nfkc | List.delete(special, :nfkc)]
482474
end
483475

484-
if scriptset != @bottom or chunks_single_or_highly_restrictive?(acc) do
476+
if scriptset != @bottom or chunks_single?(acc) do
485477
{kind, acc, rest, length, false, special}
486478
else
487479
breakdown =
@@ -510,35 +502,25 @@ defmodule String.Tokenizer do
510502
Mixed-script identifiers are not supported for security reasons. \
511503
'#{acc}' is made of the following scripts:\n
512504
#{breakdown}
513-
All characters in identifier chunks should resolve to a single script, \
514-
or a highly restrictive set of scripts.
505+
Characters in identifiers from different scripts must be separated \
506+
by underscore (_).
515507
"""
516508

517-
{:error, {:not_highly_restrictive, acc, {prefix, suffix}}}
509+
{:error, {:mixed_script, acc, {prefix, suffix}}}
518510
end
519511
end
520512

521-
defp chunks_single_or_highly_restrictive?(acc) do
522-
# support script mixing via chunked identifiers (UTS 55-5's strong recco)
523-
# each chunk in an ident like foo_bar_baz should pass checks
524-
acc
525-
|> :string.tokens([?_])
526-
|> Enum.all?(&single_or_highly_restrictive?/1)
527-
end
513+
defp chunks_single?(acc),
514+
do: chunks_single?(acc, @top)
528515

529-
defp single_or_highly_restrictive?(acc) do
530-
scriptsets = Enum.map(acc, &codepoint_to_scriptset/1)
531-
is_single_script = @bottom != Enum.reduce(scriptsets, @top, &ss_intersect/2)
532-
533-
# 'A set of scripts is defined to cover a string if the intersection of
534-
# that set with the augmented script sets of all characters in the string
535-
# is nonempty; in other words, if every character in the string shares at
536-
# least one script with the cover set.'
537-
is_single_script or
538-
Enum.any?(@highly_restrictive, fn restrictive ->
539-
Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom))
540-
end)
541-
end
516+
defp chunks_single?([?_ | rest], acc),
517+
do: acc != @bottom and chunks_single?(rest, @top)
518+
519+
defp chunks_single?([head | rest], acc),
520+
do: chunks_single?(rest, ss_intersect(codepoint_to_scriptset(head), acc))
521+
522+
defp chunks_single?([], acc),
523+
do: acc != @bottom
542524

543525
defp codepoint_to_scriptset(head) do
544526
cond do

0 commit comments

Comments
 (0)