Remove highly restrictive scriptset support

josevalim · josevalim · commit 9924afff5db9 · 2024-07-02T14:41:57.000+02:00
diff --git a/lib/elixir/pages/references/unicode-syntax.md b/lib/elixir/pages/references/unicode-syntax.md
@@ -136,13 +136,11 @@ Elixir will not warn on confusability for identifiers made up exclusively of cha
 
 ### C3. Mixed Script Detection
 
-Elixir will not allow tokenization of mixed-script identifiers unless it is via chunks separated by an underscore, like `http_сервер`, or unless the mixing within each of those chunks is one of the exceptions defined in UTS 39 5.2, 'Highly Restrictive'. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
+Elixir will not allow tokenization of mixed-script identifiers unless the different scripts are confined to separate chunks delimited by underscores, as in `http_сервер`, where each chunk is individually single-script. We use the means described in UTS 39 Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
 
-Examples: Elixir allows an identifiers like `幻ㄒㄧㄤ`, even though it includes characters from multiple 'scripts', because those scripts all 'resolve' to Japanese when applying the resolution rules from UTS 39 5.1. It also allows an atom like `:Tシャツ`, the Japanese word for 't-shirt', which incorporates a Latin capital T, because {Latn, Jpan} is one of the allowed script mixing in the definition of 'Highly Restrictive' in UTS 39 5.2, and it 'covers' the string.
+Examples: Elixir allows identifiers like `幻ㄒㄧㄤ`, even though they include characters from multiple 'scripts' (Han and Bopomofo), because those scripts all 'resolve' to Han with Bopomofo when applying the resolution rules from UTS 39 5.1. When mixing Latin and Japanese scripts, underscores are necessary, as in `:T_シャツ` (the Japanese word for 't-shirt' with an additional underscore separating the letter T).
 
-Elixir does allow an identifier like `http_сервер`, where the identifier chunks on each side of the `_` are individually single-script.
-
-However, Elixir would prevent tokenization in code like `if аdmin, do: :ok, else: :err`, where the scriptset for the 'a' character is {Cyrillic} but all other characters have scriptsets of {Latin}. The scriptsets fail to resolve, and the scriptsets from the definition of 'Highly Restrictive' in UTS 39 5.2 do not cover the string either, so a descriptive error is shown.
+Elixir does not allow code like `if аdmin, do: :ok, else: :err`, where the scriptset for the 'a' character is {Cyrillic} but all other characters have scriptsets of {Latin}. The scriptsets fail to resolve and a descriptive error is shown.
 
 ### C4, C5 (inapplicable)
 
diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl
@@ -1240,7 +1240,7 @@ tokenize_identifier(String, Line, Column, Scope, MaybeKeyword) ->
           Error
       end;
 
-    {error, {not_highly_restrictive, Wrong, {Prefix, Suffix}}} ->
+    {error, {mixed_script, Wrong, {Prefix, Suffix}}} ->
       WrongColumn = Column + length(Wrong) - 1,
       case suggest_simpler_unexpected_token_in_error(Wrong, Line, WrongColumn, Scope) of
         no_suggestion ->
diff --git a/lib/elixir/test/elixir/kernel/string_tokenizer_test.exs b/lib/elixir/test/elixir/kernel/string_tokenizer_test.exs
@@ -120,21 +120,21 @@ defmodule Kernel.StringTokenizerTest do
     end
 
     test "allows legitimate script mixing" do
-      # writing systems that legitimately mix multiple scripts, and Common chars like _
+      # Mixed script with supersets, numbers, and underscores
       assert Code.eval_string("幻ㄒㄧㄤ = 1") == {1, [幻ㄒㄧㄤ: 1]}
       assert Code.eval_string("幻ㄒㄧㄤ1 = 1") == {1, [幻ㄒㄧㄤ1: 1]}
       assert Code.eval_string("__सवव_1? = 1") == {1, [__सवव_1?: 1]}
 
-      # works with atoms too
-      assert Code.eval_string(":Tシャツ") == {:Tシャツ, []}
-
-      # elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
+      # Elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
       # ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
       assert Code.eval_string("μs = 1") == {1, [μs: 1]}
 
-      # allows mixed scripts if the chunks are all single-script or highly restrictive
+      # Mixed scripts in variables
       assert Code.eval_string("http_сервер = 1") == {1, [http_сервер: 1]}
       assert Code.eval_string("сервер_http = 1") == {1, [сервер_http: 1]}
+
+      # Mixed scripts in atoms
+      assert Code.eval_string(":T_シャツ") == {:T_シャツ, []}
     end
 
     test "bidi" do
diff --git a/lib/elixir/test/elixir/kernel/warning_test.exs b/lib/elixir/test/elixir/kernel/warning_test.exs
@@ -63,7 +63,7 @@ defmodule Kernel.WarningTest do
 
   test "warnings from macro" do
     assert_warn_eval(
-      ["demo:66\n", "key :dup will be overridden in map\n"],
+      ["demo:60\n", "key :dup will be overridden in map\n"],
       """
       import Kernel.WarningTest
       will_warn()
diff --git a/lib/elixir/unicode/tokenizer.ex b/lib/elixir/unicode/tokenizer.ex
@@ -264,14 +264,6 @@ defmodule String.Tokenizer do
   @top top
   @indexed_scriptsets sorted_scriptsets |> Enum.with_index(&{&2, &1}) |> Map.new()
 
-  latin = Map.fetch!(scriptset_masks, "Latin")
-
-  @highly_restrictive [
-    ScriptSet.union(latin, Map.fetch!(scriptset_masks, "Japanese")),
-    ScriptSet.union(latin, Map.fetch!(scriptset_masks, "Han with Bopomofo")),
-    ScriptSet.union(latin, Map.fetch!(scriptset_masks, "Korean"))
-  ]
-
   # ScriptSet helpers. Inline instead of dispatching to ScriptSet for performance
 
   @compile {:inline, ss_latin: 1, ss_intersect: 2}
@@ -481,7 +473,7 @@ defmodule String.Tokenizer do
         [:nfkc | List.delete(special, :nfkc)]
       end
 
-    if scriptset != @bottom or chunks_single_or_highly_restrictive?(acc) do
+    if scriptset != @bottom or chunks_single?(acc) do
       {kind, acc, rest, length, false, special}
     else
       breakdown =
@@ -510,35 +502,25 @@ defmodule String.Tokenizer do
       Mixed-script identifiers are not supported for security reasons. \
       '#{acc}' is made of the following scripts:\n
       #{breakdown}
-      All characters in identifier chunks should resolve to a single script, \
-      or a highly restrictive set of scripts.
+      Characters in identifiers from different scripts must be separated \
+      by underscore (_).
       """
 
-      {:error, {:not_highly_restrictive, acc, {prefix, suffix}}}
+      {:error, {:mixed_script, acc, {prefix, suffix}}}
     end
   end
 
-  defp chunks_single_or_highly_restrictive?(acc) do
-    # support script mixing via chunked identifiers (UTS 55-5's strong recco)
-    # each chunk in an ident like foo_bar_baz should pass checks
-    acc
-    |> :string.tokens([?_])
-    |> Enum.all?(&single_or_highly_restrictive?/1)
-  end
+  defp chunks_single?(acc),
+    do: chunks_single?(acc, @top)
 
-  defp single_or_highly_restrictive?(acc) do
-    scriptsets = Enum.map(acc, &codepoint_to_scriptset/1)
-    is_single_script = @bottom != Enum.reduce(scriptsets, @top, &ss_intersect/2)
-
-    # 'A set of scripts is defined to cover a string if the intersection of
-    #  that set with the augmented script sets of all characters in the string
-    #  is nonempty; in other words, if every character in the string shares at
-    #  least one script with the cover set.'
-    is_single_script or
-      Enum.any?(@highly_restrictive, fn restrictive ->
-        Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom))
-      end)
-  end
+  defp chunks_single?([?_ | rest], acc),
+    do: acc != @bottom and chunks_single?(rest, @top)
+
+  defp chunks_single?([head | rest], acc),
+    do: chunks_single?(rest, ss_intersect(codepoint_to_scriptset(head), acc))
+
+  defp chunks_single?([], acc),
+    do: acc != @bottom
 
   defp codepoint_to_scriptset(head) do
     cond do