Directional confusability protection & allowing script mixing in identifiers when separated by underscores (#13693)

mrluc · web-flow · commit c83334e5f2f1 · 2024-07-02T14:07:20.000+02:00
* update support for uts39 from unicode 15 * follow uts39's recco that it's not necessary to require idents to be single-script (they call out proglang idents, reference the new uts55-5). We use a heuristic derived from the concept of identifier chunks from uts55-5, to allow idents like foo_bar_baz where each chunk around the _ can be single-or-highly-restrictive * provide directional confusability detection, by reversing spans of direction-changed chars in idents for bidi_skeleton, see issue #12929
diff --git a/Makefile b/Makefile
@@ -104,8 +104,8 @@ unicode: $(UNICODE)
 $(UNICODE): lib/elixir/unicode/*
 	@ echo "==> unicode (compile)";
 	$(Q) $(ELIXIRC) lib/elixir/unicode/unicode.ex -o lib/elixir/ebin;
-	$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;
 	$(Q) $(ELIXIRC) lib/elixir/unicode/tokenizer.ex -o lib/elixir/ebin;
+	$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;
 
 $(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
 $(eval $(call APP_TEMPLATE,logger,Logger))
diff --git a/lib/elixir/pages/references/unicode-syntax.md b/lib/elixir/pages/references/unicode-syntax.md
@@ -136,10 +136,12 @@ Elixir will not warn on confusability for identifiers made up exclusively of cha
 
 ### C3. Mixed Script Detection
 
-Elixir will not allow tokenization of mixed-script identifiers unless the mixing is one of the exceptions defined in UTS 39 5.2, 'Highly Restrictive'. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
+Elixir will not allow tokenization of mixed-script identifiers unless it is via chunks separated by an underscore, like `http_сервер`, or unless the mixing within each of those chunks is one of the exceptions defined in UTS 39 5.2, 'Highly Restrictive'. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
 
 Examples: Elixir allows an identifiers like `幻ㄒㄧㄤ`, even though it includes characters from multiple 'scripts', because those scripts all 'resolve' to Japanese when applying the resolution rules from UTS 39 5.1. It also allows an atom like `:Tシャツ`, the Japanese word for 't-shirt', which incorporates a Latin capital T, because {Latn, Jpan} is one of the allowed script mixing in the definition of 'Highly Restrictive' in UTS 39 5.2, and it 'covers' the string.
 
+Elixir does allow an identifier like `http_сервер`, where the identifier chunks on each side of the `_` are individually single-script.
+
 However, Elixir would prevent tokenization in code like `if аdmin, do: :ok, else: :err`, where the scriptset for the 'a' character is {Cyrillic} but all other characters have scriptsets of {Latin}. The scriptsets fail to resolve, and the scriptsets from the definition of 'Highly Restrictive' in UTS 39 5.2 do not cover the string either, so a descriptive error is shown.
 
 ### C4, C5 (inapplicable)
diff --git a/lib/elixir/test/elixir/kernel/warning_test.exs b/lib/elixir/test/elixir/kernel/warning_test.exs
@@ -139,6 +139,70 @@ defmodule Kernel.WarningTest do
                "confusable identifier: 'a' looks like 'а' on line 1"
     end
 
+    test "warns on LTR-confusables" do
+      # warning outputs in byte order (vs bidi algo display order, uax9), mentions presence of rtl
+      assert_warn_quoted(
+        ["nofile:1:9", "'_1א' looks like '_א1'", "right-to-left characters"],
+        "_א1 and _1א"
+      )
+
+      assert_warn_quoted(
+        [
+          "'a_1א' includes right-to-left characters",
+          "\\u0061 a ltr",
+          "\\u005F _ neutral",
+          "\\u0031 1 weak_number",
+          "\\u05D0 א rtl",
+          "'a_א1' includes right-to-left characters:",
+          "\\u0061 a ltr",
+          "\\u005F _ neutral",
+          "\\u05D0 א rtl",
+          "\\u0031 1 weak_number"
+        ],
+        "a_א1 or a_1א"
+      )
+
+      # test that the implementation of String.Tokenizer.Security.unbidify/1 agrees
+      # w/Unicode Bidi Algo (UAX9) for these (identifier-specific, no-bracket) examples
+      #
+      # you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
+      # inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
+      #
+      # there's a spurious ;A; after the identifier, because the semicolon is dir-neutral, and
+      # deleting it makes these examples hard to read in many/most editors!
+      """
+      foo;A;0066 006F 006F;0 1 2
+      _foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
+      __foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
+      لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
+      foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
+      foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
+      foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
+      foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
+      foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
+      foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
+      foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
+      """
+      |> String.split("\n", trim: true)
+      |> Enum.map(&String.split(&1, ";", trim: true))
+      |> Enum.each(fn
+        [ident, _, bytes, indices | _rest] ->
+          bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16))
+          indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1)
+          display_ordered = for i <- indices, do: Enum.at(bytes, i)
+          unbidified = String.Tokenizer.Security.unbidify(bytes)
+
+          assert(display_ordered == unbidified, """
+           Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
+            bytes        : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")}
+            byte order   : #{bytes |> Enum.intersperse(32)}
+            uax9 order   : #{display_ordered |> Enum.intersperse(32)}
+            uax9 indices : #{indices |> Enum.join(" ")}
+            unbidify/1   : #{unbidified |> Enum.intersperse(32)}
+          """)
+      end)
+    end
+
     test "prevents unsafe script mixing in identifiers" do
       exception =
         assert_raise SyntaxError, fn ->
@@ -172,7 +236,9 @@ defmodule Kernel.WarningTest do
       assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end
       assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end
       assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end
-      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("рос_api = 1") end
+
+      # c is Latin
+      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end
 
       # T is in cyrillic
       assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end
@@ -190,6 +256,10 @@ defmodule Kernel.WarningTest do
       # elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
       # ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
       assert capture_quoted("μs") == ""
+
+      # allows mixed scripts if the chunks are all single-script or highly restrictive
+      assert capture_eval("http_сервер = 1") == ""
+      assert capture_eval("сервер_http = 1") == ""
     end
   end
 
diff --git a/lib/elixir/unicode/security.ex b/lib/elixir/unicode/security.ex
@@ -53,7 +53,7 @@ defmodule String.Tokenizer.Security do
       {line, _, previous_name} when name != previous_name ->
         {:warn,
          "confusable identifier: '#{name}' looks like '#{previous_name}' on line #{line}, " <>
-           "but they are written using different characters"}
+           "but they are written using different characters" <> dir_compare(name, previous_name)}
 
       _ ->
         {:ok, Map.put(skeletons, skeleton, info)}
@@ -106,7 +106,86 @@ defmodule String.Tokenizer.Security do
     #    the specified data, producing a string of exemplar characters.
     #  - Reapply NFD." (UTS 39 section 4, skeleton definition)
     :unicode.characters_to_nfd_list(s)
-    |> Enum.map(&confusable_prototype/1)
+    |> bidi_skeleton()
     |> :unicode.characters_to_nfd_list()
   end
+
+  # unicode 15 adds bidiSkeleton because, w/RTL codepoints, idents that
+  # aren't confusable LTR *are* confusable in most places human review
+  # occurs (editors/browsers, thanks to bidi algo, UAX9).
+  #
+  # The solution is to detect spans with reversed visual direction,
+  # and reverse those, so that the input we check for confusability
+  # matches the perceived sequence instead of the byte sequence.
+  #
+  # (we need this regardless of script mixing, because direction-neutral
+  # chars like _ or 0..9 can mix w/RTL chars).
+  def bidi_skeleton(s) do
+    # UTS39-28 4:
+    #  'Bidirectional confusability is costlier to check than
+    #   confusability, as [unicode bidi algo] must be applied.
+    #   [...] a fast path can be used: [...] if X has no characters
+    #   w/bidi classes R or AL, bidiSkeleton(X) = skeleton(X)
+    #
+    if match?([_, _ | _], s) and any_rtl?(s) do
+      unbidify(s) |> Enum.map(&confusable_prototype/1)
+    else
+      Enum.map(s, &confusable_prototype/1)
+    end
+  end
+
+  import String.Tokenizer, only: [dir: 1]
+
+  defp any_rtl?(s), do: Enum.any?(s, &(:rtl == dir(&1)))
+
+  defp dir_compare(a, b) do
+    """
+    #{if any_rtl?(a), do: "\n\n" <> dir_breakdown(a)}
+    #{if any_rtl?(b), do: dir_breakdown(b)}
+    """
+  end
+
+  defp dir_breakdown(s) do
+    init = "'#{s}' includes right-to-left characters:\n"
+
+    for codepoint <- s, into: init do
+      hex = :io_lib.format(~c"~4.16.0B", [codepoint])
+      "  \\u#{hex} #{[codepoint]} #{dir(codepoint)}\n"
+    end
+  end
+
+  # make charlist match visual order by reversing spans of {rtl, neutral}
+  # and attaching neutral characters and weak number types according to uax9
+  #
+  #  UTS39-28 4: '[...] if the strings are known not to contain explicit
+  #   directional formatting characters[...], the algorithm can
+  #   be drastically simplified, [...], obviating the need for
+  #   the [...] stack of the [unicode bidi algo]'
+  def unbidify(chars) when is_list(chars) do
+    {neutrals, direction, last_part, acc} =
+      chars
+      |> Enum.map(&{&1, dir(&1)})
+      |> Enum.reduce({[], :ltr, [], []}, fn
+        # https://www.unicode.org/reports/tr9/#W2
+        {head, :weak_number}, {neutrals, part_dir, part, acc} ->
+          {[], part_dir, [head] ++ neutrals ++ part, acc}
+
+        {head, :neutral}, {neutrals, part_dir, part, acc} ->
+          {[head | neutrals], part_dir, part, acc}
+
+        {head, part_dir}, {neutrals, part_dir, part, acc} ->
+          {[], part_dir, [head | neutrals] ++ part, acc}
+
+        {head, :ltr = head_dir}, {neutrals, :rtl, part, acc} ->
+          {[], head_dir, [head | neutrals], maybe_reverse(:rtl, part) ++ acc}
+
+        {head, :rtl = head_dir}, {neutrals, :ltr, part, acc} ->
+          {[], head_dir, [head], maybe_reverse(:ltr, neutrals ++ part) ++ acc}
+      end)
+
+    Enum.reverse(maybe_reverse(direction, neutrals ++ last_part) ++ acc)
+  end
+
+  defp maybe_reverse(:rtl, part), do: Enum.reverse(part)
+  defp maybe_reverse(:ltr, part), do: part
 end
diff --git a/lib/elixir/unicode/tokenizer.ex b/lib/elixir/unicode/tokenizer.ex
@@ -28,15 +28,19 @@ defmodule String.Tokenizer do
     end
   end
 
-  {letter_uptitlecase, start, continue, _} =
+  {letter_uptitlecase, start, continue, dir_rtls, dir_neutrals, _} =
     Path.join(__DIR__, "UnicodeData.txt")
     |> File.read!()
     |> String.split(["\r\n", "\n"], trim: true)
-    |> Enum.reduce({[], [], [], nil}, fn line, acc ->
-      {letter_uptitlecase, start, continue, first} = acc
+    |> Enum.reduce({[], [], [], [], [], nil}, fn line, acc ->
+      {letter_uptitlecase, start, continue, rtls, neutrals, first} = acc
+
+      # https://www.unicode.org/reports/tr44/tr44-32.html#UnicodeData.txt
       [codepoint, line] = :binary.split(line, ";")
       [name, line] = :binary.split(line, ";")
-      [category, _] = :binary.split(line, ";")
+      [category, line] = :binary.split(line, ";")
+      [_canonical_combining, line] = :binary.split(line, ";")
+      [bidi, _] = :binary.split(line, ";")
 
       {codepoints, first} =
         case name do
@@ -52,18 +56,25 @@ defmodule String.Tokenizer do
             {[String.to_integer(codepoint, 16)], nil}
         end
 
+      {rtls, neutrals} =
+        cond do
+          bidi in ~w(R AL)s -> {codepoints ++ rtls, neutrals}
+          bidi in ~w(WS ON CS EN ES ET NSM)s -> {rtls, codepoints ++ neutrals}
+          true -> {rtls, neutrals}
+        end
+
       cond do
         category in ~w(Lu Lt) ->
-          {codepoints ++ letter_uptitlecase, start, continue, first}
+          {codepoints ++ letter_uptitlecase, start, continue, rtls, neutrals, first}
 
         category in ~w(Ll Lm Lo Nl) ->
-          {letter_uptitlecase, codepoints ++ start, continue, first}
+          {letter_uptitlecase, codepoints ++ start, continue, rtls, neutrals, first}
 
         category in ~w(Mn Mc Nd Pc) ->
-          {letter_uptitlecase, start, codepoints ++ continue, first}
+          {letter_uptitlecase, start, codepoints ++ continue, rtls, neutrals, first}
 
         true ->
-          {letter_uptitlecase, start, continue, first}
+          {letter_uptitlecase, start, continue, rtls, neutrals, first}
       end
     end)
 
@@ -327,6 +338,28 @@ defmodule String.Tokenizer do
 
   defp unicode_continue(_), do: @bottom
 
+  # subset of direction-changing/neutral characters valid in idents
+  id_all = id_upper ++ id_start ++ id_continue
+  dir_rtls = for c <- dir_rtls, c in id_all, do: {c, :rtl}
+  dir_neutrals = for c <- dir_neutrals, c not in 48..57, c in id_all, do: {c, :neutral}
+  dir_ranges = rangify.(dir_rtls) ++ rangify.(dir_neutrals)
+
+  # direction of a codepoint. (rtl, neutral, weak, ltr fallback)
+  # weaks are pulled towards previous directional spans,
+  # but the only weaks allowed in idents are numbers 0..9
+  def dir(i) when i in 48..57, do: :weak_number
+
+  for {first, last, direction} <- dir_ranges do
+    if first == last do
+      def dir(unquote(first)), do: unquote(direction)
+    else
+      def dir(i) when i in unquote(first)..unquote(last), do: unquote(direction)
+    end
+  end
+
+  def dir(i) when is_integer(i), do: :ltr
+  def dir(_), do: {:error, :codepoint_must_be_integer}
+
   # Hard-coded normalizations. Also split by upper, start, continue.
 
   for {from, to} <- start_normalizations do
@@ -448,7 +481,7 @@ defmodule String.Tokenizer do
         [:nfkc | List.delete(special, :nfkc)]
       end
 
-    if scriptset != @bottom or highly_restrictive?(acc) do
+    if scriptset != @bottom or chunks_single_or_highly_restrictive?(acc) do
       {kind, acc, rest, length, false, special}
     else
       breakdown =
@@ -477,24 +510,34 @@ defmodule String.Tokenizer do
       Mixed-script identifiers are not supported for security reasons. \
       '#{acc}' is made of the following scripts:\n
       #{breakdown}
-      All characters in the identifier should resolve to a single script, \
-      or use a highly restrictive set of scripts.
+      All characters in identifier chunks should resolve to a single script, \
+      or a highly restrictive set of scripts.
       """
 
       {:error, {:not_highly_restrictive, acc, {prefix, suffix}}}
     end
   end
 
-  defp highly_restrictive?(acc) do
+  defp chunks_single_or_highly_restrictive?(acc) do
+    # support script mixing via chunked identifiers (UTS 55-5's strong recco)
+    # each chunk in an ident like foo_bar_baz should pass checks
+    acc
+    |> :string.tokens([?_])
+    |> Enum.all?(&single_or_highly_restrictive?/1)
+  end
+
+  defp single_or_highly_restrictive?(acc) do
     scriptsets = Enum.map(acc, &codepoint_to_scriptset/1)
+    is_single_script = @bottom != Enum.reduce(scriptsets, @top, &ss_intersect/2)
 
     # 'A set of scripts is defined to cover a string if the intersection of
     #  that set with the augmented script sets of all characters in the string
     #  is nonempty; in other words, if every character in the string shares at
     #  least one script with the cover set.'
-    Enum.any?(@highly_restrictive, fn restrictive ->
-      Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom))
-    end)
+    is_single_script or
+      Enum.any?(@highly_restrictive, fn restrictive ->
+        Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom))
+      end)
   end
 
   defp codepoint_to_scriptset(head) do