Simplify tests

josevalim · josevalim · commit fcfa87ff36f0 · 2024-07-02T14:22:51.000+02:00
diff --git a/lib/elixir/test/elixir/kernel/string_tokenizer_test.exs b/lib/elixir/test/elixir/kernel/string_tokenizer_test.exs
@@ -69,4 +69,116 @@ defmodule Kernel.StringTokenizerTest do
     assert {:error, _} = Code.string_to_quoted("Ola?")
     assert {:error, _} = Code.string_to_quoted("Ola!")
   end
+
+  describe "script mixing" do
+    test "prevents Restricted codepoints in identifiers" do
+      exception = assert_raise SyntaxError, fn -> Code.string_to_quoted!("_shibㅤ = 1") end
+
+      assert Exception.message(exception) =~
+               "unexpected token: \"ㅤ\" (column 6, code point U+3164)"
+    end
+
+    test "prevents unsafe mixing in identifiers" do
+      exception =
+        assert_raise SyntaxError, fn ->
+          Code.string_to_quoted!("if аdmin_, do: :ok, else: :err")
+        end
+
+      assert Exception.message(exception) =~ "nofile:1:9:"
+      assert Exception.message(exception) =~ "invalid mixed-script identifier found: аdmin"
+
+      for s <- [
+            "\\u0430 а {Cyrillic}",
+            "\\u0064 d {Latin}",
+            "\\u006D m {Latin}",
+            "\\u0069 i {Latin}",
+            "\\u006E n {Latin}",
+            "\\u005F _"
+          ] do
+        assert Exception.message(exception) =~ s
+      end
+
+      # the message also hints at a similar spelling that Elixir accepts
+      assert Exception.message(exception) =~ """
+             Hint: You could write the above in a similar way that is accepted by Elixir:
+             """
+
+      assert Exception.message(exception) =~ """
+             "admin_" (code points 0x00061 0x00064 0x0006D 0x00069 0x0006E 0x0005F)
+             """
+
+      # the leading "а" is Cyrillic here too, in keyword, atom and call positions
+      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end
+      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end
+      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end
+
+      # the "c" is Latin, so the chunk after the underscore mixes Latin with Cyrillic
+      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end
+
+      # the leading "Т" is Cyrillic, so it cannot be combined with the Japanese that follows
+      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end
+    end
+
+    test "allows legitimate script mixing" do
+      # writing systems that legitimately mix multiple scripts, plus Common characters like _
+      assert Code.eval_string("幻ㄒㄧㄤ = 1") == {1, [幻ㄒㄧㄤ: 1]}
+      assert Code.eval_string("幻ㄒㄧㄤ1 = 1") == {1, [幻ㄒㄧㄤ1: 1]}
+      assert Code.eval_string("__सवव_1? = 1") == {1, [__सवव_1?: 1]}
+
+      # atoms too: UTS 39 5.2 "highly restrictive" mixing is allowed, like 't-shirt' in Jpan
+      assert Code.eval_string(":Tシャツ") == {:Tシャツ, []}
+
+      # Elixir's normalizations combine the scriptsets of the 'from' and 'to' characters,
+      # e.g. {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects with all
+      assert Code.eval_string("μs = 1") == {1, [μs: 1]}
+
+      # mixed scripts are allowed when every chunk is single-script or highly restrictive
+      assert Code.eval_string("http_сервер = 1") == {1, [http_сервер: 1]}
+      assert Code.eval_string("сервер_http = 1") == {1, [сервер_http: 1]}
+    end
+
+    test "bidi" do
+      # checks that String.Tokenizer.Security.unbidify/1 agrees with the Unicode Bidirectional
+      # Algorithm (UAX #9) for these identifier-specific, bracket-free examples
+      #
+      # you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
+      # inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
+      #
+      # each row is: identifier;A;hex code points;display-order indices. The ;A; is spurious:
+      # the semicolon is dir-neutral, and deleting it makes the rows hard to read in most editors!
+      """
+      foo;A;0066 006F 006F;0 1 2
+      _foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
+      __foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
+      لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
+      foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
+      foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
+      foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
+      foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
+      foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
+      foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
+      foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
+      """
+      |> String.split("\n", trim: true)
+      |> Enum.map(&String.split(&1, ";", trim: true))
+      |> Enum.each(fn
+        [ident, _, bytes, indices | _rest] ->
+          bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16))
+          indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1)
+          display_ordered = for i <- indices, do: Enum.at(bytes, i)
+          unbidified = String.Tokenizer.Security.unbidify(bytes)
+
+          if display_ordered != unbidified do
+            raise """
+            Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
+              bytes        : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")}
+              byte order   : #{bytes |> Enum.intersperse(32)}
+              uax9 order   : #{display_ordered |> Enum.intersperse(32)}
+              uax9 indices : #{indices |> Enum.join(" ")}
+              unbidify/1   : #{unbidified |> Enum.intersperse(32)}
+            """
+          end
+      end)
+    end
+  end
 end
diff --git a/lib/elixir/test/elixir/kernel/warning_test.exs b/lib/elixir/test/elixir/kernel/warning_test.exs
@@ -48,12 +48,6 @@ defmodule Kernel.WarningTest do
     end)
   end
 
-  defp capture_quoted(source) do
-    capture_err(fn ->
-      Code.string_to_quoted!(source, columns: true)
-    end)
-  end
-
   defp capture_compile(source) do
     capture_err(fn ->
       quoted = Code.string_to_quoted!(source, columns: true)
@@ -93,13 +87,6 @@ defmodule Kernel.WarningTest do
   end
 
   describe "unicode identifier security" do
-    test "prevents Restricted codepoints in identifiers" do
-      exception = assert_raise SyntaxError, fn -> Code.string_to_quoted!("_shibㅤ = 1") end
-
-      assert Exception.message(exception) =~
-               "unexpected token: \"ㅤ\" (column 6, code point U+3164)"
-    end
-
     test "warns on confusables" do
       assert_warn_quoted(
         ["nofile:1:6", "confusable identifier: 'a' looks like 'а' on line 1"],
@@ -161,105 +148,6 @@ defmodule Kernel.WarningTest do
         ],
         "a_א1 or a_1א"
       )
-
-      # test that the implementation of String.Tokenizer.Security.unbidify/1 agrees
-      # w/Unicode Bidi Algo (UAX9) for these (identifier-specific, no-bracket) examples
-      #
-      # you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
-      # inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
-      #
-      # there's a spurious ;A; after the identifier, because the semicolon is dir-neutral, and
-      # deleting it makes these examples hard to read in many/most editors!
-      """
-      foo;A;0066 006F 006F;0 1 2
-      _foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
-      __foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
-      لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
-      foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
-      foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
-      foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
-      foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
-      foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
-      foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
-      foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
-      """
-      |> String.split("\n", trim: true)
-      |> Enum.map(&String.split(&1, ";", trim: true))
-      |> Enum.each(fn
-        [ident, _, bytes, indices | _rest] ->
-          bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16))
-          indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1)
-          display_ordered = for i <- indices, do: Enum.at(bytes, i)
-          unbidified = String.Tokenizer.Security.unbidify(bytes)
-
-          assert(display_ordered == unbidified, """
-           Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
-            bytes        : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")}
-            byte order   : #{bytes |> Enum.intersperse(32)}
-            uax9 order   : #{display_ordered |> Enum.intersperse(32)}
-            uax9 indices : #{indices |> Enum.join(" ")}
-            unbidify/1   : #{unbidified |> Enum.intersperse(32)}
-          """)
-      end)
-    end
-
-    test "prevents unsafe script mixing in identifiers" do
-      exception =
-        assert_raise SyntaxError, fn ->
-          Code.string_to_quoted!("if аdmin_, do: :ok, else: :err")
-        end
-
-      assert Exception.message(exception) =~ "nofile:1:9:"
-      assert Exception.message(exception) =~ "invalid mixed-script identifier found: аdmin"
-
-      for s <- [
-            "\\u0430 а {Cyrillic}",
-            "\\u0064 d {Latin}",
-            "\\u006D m {Latin}",
-            "\\u0069 i {Latin}",
-            "\\u006E n {Latin}",
-            "\\u005F _"
-          ] do
-        assert Exception.message(exception) =~ s
-      end
-
-      # includes suggestion about what to change
-      assert Exception.message(exception) =~ """
-             Hint: You could write the above in a similar way that is accepted by Elixir:
-             """
-
-      assert Exception.message(exception) =~ """
-             "admin_" (code points 0x00061 0x00064 0x0006D 0x00069 0x0006E 0x0005F)
-             """
-
-      # a is in cyrillic
-      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end
-      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end
-      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end
-
-      # c is Latin
-      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end
-
-      # T is in cyrillic
-      assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end
-    end
-
-    test "allows legitimate script mixing" do
-      # writing systems that legitimately mix multiple scripts, and Common chars like _
-      assert capture_eval("幻ㄒㄧㄤ = 1") == ""
-      assert capture_eval("幻ㄒㄧㄤ1 = 1") == ""
-      assert capture_eval("__सवव_1? = 1") == ""
-
-      # uts39 5.2 allowed 'highly restrictive' script mixing, like 't-shirt' in Jpan:
-      assert capture_quoted(":Tシャツ") == ""
-
-      # elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
-      # ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
-      assert capture_quoted("μs") == ""
-
-      # allows mixed scripts if the chunks are all single-script or highly restrictive
-      assert capture_eval("http_сервер = 1") == ""
-      assert capture_eval("сервер_http = 1") == ""
     end
   end