
Draft of directional confusability protection & allowing script mixing in identifiers when separated by underscores #13693


Merged
2 changes: 1 addition & 1 deletion Makefile
@@ -104,8 +104,8 @@ unicode: $(UNICODE)
$(UNICODE): lib/elixir/unicode/*
@ echo "==> unicode (compile)";
$(Q) $(ELIXIRC) lib/elixir/unicode/unicode.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/tokenizer.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;

$(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
$(eval $(call APP_TEMPLATE,logger,Logger))
6 changes: 5 additions & 1 deletion lib/elixir/pages/references/unicode-syntax.md
@@ -136,12 +136,16 @@ Elixir will not warn on confusability for identifiers made up exclusively of cha

### C3. Mixed Script Detection

Elixir will not allow tokenization of mixed-script identifiers unless the mixing is one of the exceptions defined in UTS 39 5.2, 'Highly Restrictive'. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.
Elixir will not allow tokenization of mixed-script identifiers unless it is via chunks separated by an underscore, like `http_сервер`, or unless the mixing within each of those chunks is one of the exceptions defined in UTS 39 5.2, 'Highly Restrictive'. We use the means described in Section 5.1, Mixed-Script Detection, to determine if script mixing is occurring, with the modification documented in the section 'Additional Normalizations', below.

Examples: Elixir allows an identifier like `幻ㄒㄧㄤ`, even though it includes characters from multiple 'scripts', because those scripts all 'resolve' to Japanese when applying the resolution rules from UTS 39 5.1. It also allows an atom like `:Tシャツ`, the Japanese word for 't-shirt', which incorporates a Latin capital T, because {Latn, Jpan} is one of the allowed script combinations in the definition of 'Highly Restrictive' in UTS 39 5.2, and it 'covers' the string.

Elixir does allow an identifier like `http_сервер`, where the identifier chunks on each side of the `_` are individually single-script.

However, Elixir would prevent tokenization in code like `if аdmin, do: :ok, else: :err`, where the scriptset for the 'a' character is {Cyrillic} but all other characters have scriptsets of {Latin}. The scriptsets fail to resolve, and the scriptsets from the definition of 'Highly Restrictive' in UTS 39 5.2 do not cover the string either, so a descriptive error is shown.
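The resolution rule applied in these examples can be sketched in a few lines. This is a Python illustration with a toy script table; `SCRIPTS` and the helper name are hypothetical stand-ins, not Elixir internals or real Unicode data:

```python
# Illustrative sketch of UTS 39 5.1 script resolution: an identifier
# resolves to a single script when the intersection of every character's
# script set is non-empty. The script table is a tiny hand-made subset.
SCRIPTS = {
    **{c: {"Latin"} for c in "admin"},
    "\u0430": {"Cyrillic"},  # CYRILLIC SMALL LETTER A, confusable with 'a'
}

def resolves_single_script(ident):
    sets = [SCRIPTS.get(c, {"Common"}) for c in ident]
    # Common-script characters (digits, _) intersect with everything,
    # so drop them before intersecting the strong script sets
    strong = [s for s in sets if s != {"Common"}]
    return not strong or bool(set.intersection(*strong))

print(resolves_single_script("admin"))       # True: all Latin
print(resolves_single_script("\u0430dmin"))  # False: Cyrillic 'а' + Latin rest
```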

### C4, C5 (inapplicable)

'C4 - Restriction Level detection' conformance is not claimed and does not apply to identifiers in code; rather, it applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels.
20 changes: 19 additions & 1 deletion lib/elixir/test/elixir/kernel/warning_test.exs
@@ -126,6 +126,18 @@ defmodule Kernel.WarningTest do
"力=1; カ=1"
)

# bidirectional confusability, [97,95,49,1488] vs [97,95,1488,49];
# warning outputs in byte order (vs bidi algo display order, uax9), & mentions presence of rtl
assert_warn_quoted(
["nofile:1:9", "'a_1א' looks like 'a_א1'", "right-to-left characters"],
"a_א1 or a_1א"
)

assert_warn_quoted(
["nofile:1:9", "'_1א' looks like '_א1'", "right-to-left characters"],
"_א1 and _1א"
)

# by convention, doesn't warn on ascii-only confusables
assert capture_eval("x0 = xO = 1") == ""
assert capture_eval("l1 = ll = 1") == ""
@@ -172,7 +184,9 @@ defmodule Kernel.WarningTest do
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("рос_api = 1") end

# c is Latin
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end

# T is in cyrillic
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end
@@ -190,6 +204,10 @@
# elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
# ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
assert capture_quoted("μs") == ""

# allows mixed scripts if the chunks are all single-script or highly restrictive
assert capture_eval("http_сервер = 1") == ""
assert capture_eval("сервер_http = 1") == ""
end
end

130 changes: 128 additions & 2 deletions lib/elixir/unicode/security.ex
@@ -53,7 +53,7 @@ defmodule String.Tokenizer.Security do
{line, _, previous_name} when name != previous_name ->
{:warn,
"confusable identifier: '#{name}' looks like '#{previous_name}' on line #{line}, " <>
"but they are written using different characters"}
"but they are written using different characters" <> dir_compare(name, previous_name)}

_ ->
{:ok, Map.put(skeletons, skeleton, info)}
@@ -106,7 +106,133 @@
# the specified data, producing a string of exemplar characters.
# - Reapply NFD." (UTS 39 section 4, skeleton definition)
:unicode.characters_to_nfd_list(s)
|> Enum.map(&confusable_prototype/1)
|> bidi_skeleton()
|> :unicode.characters_to_nfd_list()
end

# unicode 15 adds bidiSkeleton because, w/RTL codepoints, idents that
# aren't confusable LTR *are* confusable in most places human review
# occurs (editors/browsers, thanks to bidi algo, UAX9).
#
# The solution is to detect spans with reversed visual direction,
# and reverse those, so that the input we check for confusability
# matches the perceived sequence instead of the byte sequence.
#
# (we need this regardless of script mixing, because direction-neutral
# chars like _ or 0..9 can mix w/RTL chars).
def bidi_skeleton(s) do
# UTS39-28 4:
# 'Bidirectional confusability is costlier to check than
# confusability, as [unicode bidi algo] must be applied.
# [...] a fast path can be used: [...] if X has no characters
# w/bidi classes R or AL, bidiSkeleton(X) = skeleton(X)'
#
if length(s) > 1 and any_rtl?(s) do
unbidify(s) |> Enum.map(&confusable_prototype/1)
else
Enum.map(s, &confusable_prototype/1)
end
end

# load direction-changing and neutral codepoints valid in idents
{rtls, neutrals} =
Path.join(__DIR__, "UnicodeData.txt")
|> File.read!()
|> String.split(["\r\n", "\n"], trim: true)
|> Enum.reduce({[], []}, fn line, {rtls, neutrals} = acc ->
with [point, _, _, _, bidi | _] <- String.split(line, ";"),
codepoint = String.to_integer(point, 16),
true <- String.Tokenizer.id_codepoint?(codepoint) do
# https://www.unicode.org/reports/tr44/tr44-32.html#Bidi_Class_Values
cond do
bidi in ~w(R AL)s -> {[codepoint | rtls], neutrals}
bidi in ~w(WS ON CS EN ES ET NSM)s -> {rtls, [codepoint | neutrals]}
true -> acc
end
else
_ -> acc
end
end)

rangify = fn [head | tail] ->
{first, last, acc} =
Enum.reduce(tail, {head, head, []}, fn
number, {first, last, acc} when number == first - 1 ->
{number, last, acc}

number, {first, last, acc} ->
{number, number, [{first, last} | acc]}
end)

[{first, last} | acc]
end

# direction of a codepoint. (rtl, neutral, ltr fallback)
for {first, last} <- rangify.(rtls) do
if first == last do
defp dir(unquote(first)), do: :rtl
else
defp dir(i) when i in unquote(first)..unquote(last), do: :rtl
end
end

for {first, last} <- rangify.(neutrals) do
if first == last do
defp dir(unquote(first)), do: :neutral
else
defp dir(i) when i in unquote(first)..unquote(last), do: :neutral
end
end

defp dir(i) when is_integer(i), do: :ltr
defp dir(_), do: {:error, :codepoint_must_be_integer}

defp any_rtl?(s), do: Enum.any?(s, &(:rtl == dir(&1)))

defp dir_compare(a, b) do
"""
#{if any_rtl?(a), do: "\n\n" <> dir_breakdown(a)}
#{if any_rtl?(b), do: dir_breakdown(b)}
"""
end

defp dir_breakdown(s) do
init = "'#{s}' includes right-to-left characters:\n"

for codepoint <- s, into: init do
hex = :io_lib.format(~c"~4.16.0B", [codepoint])
" \\u#{hex} #{[codepoint]} #{dir(codepoint)}\n"
end
end

# make charlist match visual order by reversing spans of {rtl, neutral}
# UTS39-28 4: '[...] if the strings are known not to contain explicit
# directional formatting characters[...], the algorithm can
# be drastically simplified, [...], obviating the need for
# the [...] stack of the [unicode bidi algo]'
def unbidify([head | tail]), do: unbidify({tail, dir(head), [head], []})

def unbidify({[head | tail], part_dir, part, acc}) do
{dir, part, acc} = unbidify_part({head, dir(head), part, part_dir, acc})
unbidify({tail, dir, part, acc})
end

def unbidify({[], dir, part, acc}) do
acc = [maybe_reverse_part(dir, part) | acc]
acc |> Enum.reverse() |> List.flatten()
end

defp unbidify_part({head, head_dir, part, part_dir, acc})
when head_dir == :neutral or head_dir == part_dir do
{part_dir, [head | part], acc}
end

defp unbidify_part({head, head_dir, part, part_dir, acc})
when head_dir != :neutral and head_dir != part_dir do
part = maybe_reverse_part(part_dir, part)
{head_dir, [head], [part | acc]}
end

defp maybe_reverse_part(dir, part) when dir in [:ltr, :neutral], do: Enum.reverse(part)
defp maybe_reverse_part(:rtl, part), do: part
end
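The span-reversal pass in `unbidify/1` above can be modeled compactly. This is a hedged Python sketch with a simplified direction classifier (Hebrew/Arabic blocks only, digits and `_` as neutrals); like the quoted UTS 39 fast path, it assumes no explicit directional formatting characters:

```python
# Toy model of the unbidify pass: group codepoints into runs, where a
# run's direction is fixed by its first character and neutrals join the
# current run; then reverse the RTL runs so the string matches perceived
# (visual) order rather than byte order.
def direction(ch):
    if "\u0590" <= ch <= "\u05FF" or "\u0600" <= ch <= "\u06FF":
        return "rtl"      # Hebrew / Arabic blocks (simplified subset)
    if ch.isdigit() or ch == "_":
        return "neutral"  # direction-neutral chars valid in identifiers
    return "ltr"

def unbidify(s):
    if not s:
        return s
    runs, cur_dir, cur = [], direction(s[0]), [s[0]]
    for ch in s[1:]:
        d = direction(ch)
        if d == "neutral" or d == cur_dir:
            cur.append(ch)
        else:
            runs.append((cur_dir, cur))
            cur_dir, cur = d, [ch]
    runs.append((cur_dir, cur))
    # reverse RTL runs to obtain the perceived sequence
    out = []
    for d, run in runs:
        out.extend(reversed(run) if d == "rtl" else run)
    return "".join(out)
```

With this model, `"a_\u05d01"` and `"a_1\u05d0"` (the pair from the warning test) map to the same visual-order string, which is why their skeletons collide.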
38 changes: 31 additions & 7 deletions lib/elixir/unicode/tokenizer.ex
@@ -448,7 +448,7 @@ defmodule String.Tokenizer do
[:nfkc | List.delete(special, :nfkc)]
end

if scriptset != @bottom or highly_restrictive?(acc) do
if scriptset != @bottom or chunks_single_or_highly_restrictive?(acc) do
{kind, acc, rest, length, false, special}
else
breakdown =
@@ -477,24 +477,36 @@
Mixed-script identifiers are not supported for security reasons. \
'#{acc}' is made of the following scripts:\n
#{breakdown}
All characters in the identifier should resolve to a single script, \
or use a highly restrictive set of scripts.
All characters in identifier chunks should resolve to a single script, \
or a highly restrictive set of scripts.
"""

{:error, {:not_highly_restrictive, acc, {prefix, suffix}}}
end
end

defp highly_restrictive?(acc) do
defp chunks_single_or_highly_restrictive?(acc) do
# Support script mixing via chunked identifiers (UTS 55-5, a strong
# recommendation) to be kinder to users of other scripts: the identifier
# is split into chunks around the _ character, and each chunk in an
# identifier like foo_bar_baz must pass the checks on its own
acc
|> :string.tokens([?_])
|> Enum.all?(&single_or_highly_restrictive?/1)
end

defp single_or_highly_restrictive?(acc) do
scriptsets = Enum.map(acc, &codepoint_to_scriptset/1)
is_single_script = @bottom != Enum.reduce(scriptsets, @top, &ss_intersect/2)

# 'A set of scripts is defined to cover a string if the intersection of
# that set with the augmented script sets of all characters in the string
# is nonempty; in other words, if every character in the string shares at
# least one script with the cover set.'
Enum.any?(@highly_restrictive, fn restrictive ->
Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom))
end)
is_single_script or
Enum.any?(@highly_restrictive, fn restrictive ->
Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom))
end)
end

defp codepoint_to_scriptset(head) do
@@ -512,4 +512,16 @@
do: @top
end
end

# used by security.ex to filter out non-identifier codepoints
def id_codepoint?(p) when is_integer(p) do
if p < 127 and (p == ?_ or ascii_continue?(p) or ascii_lower?(p) or ascii_upper?(p)) do
true
else
@bottom !=
unicode_start(p)
|> ScriptSet.union(unicode_continue(p))
|> ScriptSet.union(unicode_upper(p))
end
end
end
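The chunked check that `chunks_single_or_highly_restrictive?/1` performs can be mirrored with a small sketch. This is an illustrative Python model with a toy script table built from codepoint ranges; the highly-restrictive combinations are omitted for brevity, and none of these names are Elixir internals:

```python
# Sketch of the chunked mixed-script rule: split the identifier on "_",
# then require each chunk to resolve to a single script (non-empty
# intersection of the characters' script sets). Real data would come
# from Unicode's Scripts.txt.
TOY_SCRIPTS = {
    **{c: {"Latin"} for c in "abcdefghijklmnopqrstuvwxyz"},
    **{chr(cp): {"Cyrillic"} for cp in range(0x0430, 0x0450)},  # а..я
}

def chunk_ok(chunk):
    sets = [TOY_SCRIPTS.get(c, {"Common"}) for c in chunk]
    # Common intersects with everything, so drop it before intersecting
    strong = [s for s in sets if s != {"Common"}]
    return not strong or bool(set.intersection(*strong))

def chunks_single_script(ident):
    return all(chunk_ok(chunk) for chunk in ident.split("_") if chunk)

print(chunks_single_script("http_\u0441\u0435\u0440\u0432\u0435\u0440"))
# True: 'http' is Latin, 'сервер' is Cyrillic, each chunk single-script

print(chunks_single_script("http_c\u0435\u0440\u0432\u0435\u0440"))
# False: Latin 'c' inside an otherwise-Cyrillic chunk
```

This mirrors why `http_сервер = 1` tokenizes while `http_cервер = 1` raises a mixed-script error in the tests above.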