Skip to content

Commit fcfa87f

Browse files
committed
Simplify tests
1 parent c83334e commit fcfa87f

File tree

2 files changed

+112
-112
lines changed

2 files changed

+112
-112
lines changed

lib/elixir/test/elixir/kernel/string_tokenizer_test.exs

+112
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,116 @@ defmodule Kernel.StringTokenizerTest do
6969
assert {:error, _} = Code.string_to_quoted("Ola?")
7070
assert {:error, _} = Code.string_to_quoted("Ola!")
7171
end
72+
73+
describe "script mixing" do
74+
test "prevents Restricted codepoints in identifiers" do
75+
exception = assert_raise SyntaxError, fn -> Code.string_to_quoted!("_shibㅤ = 1") end
76+
77+
assert Exception.message(exception) =~
78+
"unexpected token: \"\" (column 6, code point U+3164)"
79+
end
80+
81+
test "prevents unsafe mixing in identifiers" do
82+
exception =
83+
assert_raise SyntaxError, fn ->
84+
Code.string_to_quoted!("if аdmin_, do: :ok, else: :err")
85+
end
86+
87+
assert Exception.message(exception) =~ "nofile:1:9:"
88+
assert Exception.message(exception) =~ "invalid mixed-script identifier found: аdmin"
89+
90+
for s <- [
91+
"\\u0430 а {Cyrillic}",
92+
"\\u0064 d {Latin}",
93+
"\\u006D m {Latin}",
94+
"\\u0069 i {Latin}",
95+
"\\u006E n {Latin}",
96+
"\\u005F _"
97+
] do
98+
assert Exception.message(exception) =~ s
99+
end
100+
101+
# includes suggestion about what to change
102+
assert Exception.message(exception) =~ """
103+
Hint: You could write the above in a similar way that is accepted by Elixir:
104+
"""
105+
106+
assert Exception.message(exception) =~ """
107+
"admin_" (code points 0x00061 0x00064 0x0006D 0x00069 0x0006E 0x0005F)
108+
"""
109+
110+
# the leading "а" in "аdmin" is Cyrillic (U+0430), not Latin "a"
111+
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end
112+
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end
113+
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end
114+
115+
# here the "c" in "cервер" is Latin while the remaining letters are Cyrillic
116+
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end
117+
118+
# the leading "Т" is Cyrillic, not Latin "T"
119+
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end
120+
end
121+
122+
test "allows legitimate script mixing" do
123+
# writing systems that legitimately mix multiple scripts, and Common chars like _
124+
assert Code.eval_string("幻ㄒㄧㄤ = 1") == {1, [幻ㄒㄧㄤ: 1]}
125+
assert Code.eval_string("幻ㄒㄧㄤ1 = 1") == {1, [幻ㄒㄧㄤ1: 1]}
126+
assert Code.eval_string("__सवव_1? = 1") == {1, [__सवव_1?: 1]}
127+
128+
# works with atoms too
129+
assert Code.eval_string(":Tシャツ") == {:Tシャツ, []}
130+
131+
# Elixir's normalization combines the script sets of the 'from' and 'to' characters,
132+
# e.g. {Common} MICRO => {Greek} MU gives {Common, Greek}; Common intersects with all scripts
133+
assert Code.eval_string("μs = 1") == {1, [μs: 1]}
134+
135+
# allows mixed scripts if the chunks are all single-script or highly restrictive
136+
assert Code.eval_string("http_сервер = 1") == {1, [http_сервер: 1]}
137+
assert Code.eval_string("сервер_http = 1") == {1, [сервер_http: 1]}
138+
end
139+
140+
test "bidi" do
141+
# check that String.Tokenizer.Security.unbidify/1 agrees with the Unicode
141+
# Bidirectional Algorithm (UAX #9) for these identifier-specific, bracket-free examples
143+
#
144+
# you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
145+
# inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
146+
#
147+
# each example carries an extra ";A;" field after the identifier (ignored when parsing):
148+
# the semicolon is direction-neutral, and without it these examples are hard to read in most editors
149+
"""
150+
foo;A;0066 006F 006F;0 1 2
151+
_foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
152+
__foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
153+
لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
154+
foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
155+
foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
156+
foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
157+
foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
158+
foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
159+
foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
160+
foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
161+
"""
162+
|> String.split("\n", trim: true)
163+
|> Enum.map(&String.split(&1, ";", trim: true))
164+
|> Enum.each(fn
165+
[ident, _, bytes, indices | _rest] ->
166+
bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16))
167+
indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1)
168+
display_ordered = for i <- indices, do: Enum.at(bytes, i)
169+
unbidified = String.Tokenizer.Security.unbidify(bytes)
170+
171+
if display_ordered != unbidified do
172+
raise """
173+
Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
174+
bytes : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")}
175+
byte order : #{bytes |> Enum.intersperse(32)}
176+
uax9 order : #{display_ordered |> Enum.intersperse(32)}
177+
uax9 indices : #{indices |> Enum.join(" ")}
178+
unbidify/1 : #{unbidified |> Enum.intersperse(32)}
179+
"""
180+
end
181+
end)
182+
end
183+
end
72184
end

lib/elixir/test/elixir/kernel/warning_test.exs

-112
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,6 @@ defmodule Kernel.WarningTest do
4848
end)
4949
end
5050

51-
defp capture_quoted(source) do
52-
capture_err(fn ->
53-
Code.string_to_quoted!(source, columns: true)
54-
end)
55-
end
56-
5751
defp capture_compile(source) do
5852
capture_err(fn ->
5953
quoted = Code.string_to_quoted!(source, columns: true)
@@ -93,13 +87,6 @@ defmodule Kernel.WarningTest do
9387
end
9488

9589
describe "unicode identifier security" do
96-
test "prevents Restricted codepoints in identifiers" do
97-
exception = assert_raise SyntaxError, fn -> Code.string_to_quoted!("_shibㅤ = 1") end
98-
99-
assert Exception.message(exception) =~
100-
"unexpected token: \"\" (column 6, code point U+3164)"
101-
end
102-
10390
test "warns on confusables" do
10491
assert_warn_quoted(
10592
["nofile:1:6", "confusable identifier: 'a' looks like 'а' on line 1"],
@@ -161,105 +148,6 @@ defmodule Kernel.WarningTest do
161148
],
162149
"a_א1 or a_1א"
163150
)
164-
165-
# test that the implementation of String.Tokenizer.Security.unbidify/1 agrees
166-
# w/Unicode Bidi Algo (UAX9) for these (identifier-specific, no-bracket) examples
167-
#
168-
# you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
169-
# inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
170-
#
171-
# there's a spurious ;A; after the identifier, because the semicolon is dir-neutral, and
172-
# deleting it makes these examples hard to read in many/most editors!
173-
"""
174-
foo;A;0066 006F 006F;0 1 2
175-
_foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
176-
__foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
177-
لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
178-
foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
179-
foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
180-
foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
181-
foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
182-
foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
183-
foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
184-
foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
185-
"""
186-
|> String.split("\n", trim: true)
187-
|> Enum.map(&String.split(&1, ";", trim: true))
188-
|> Enum.each(fn
189-
[ident, _, bytes, indices | _rest] ->
190-
bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16))
191-
indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1)
192-
display_ordered = for i <- indices, do: Enum.at(bytes, i)
193-
unbidified = String.Tokenizer.Security.unbidify(bytes)
194-
195-
assert(display_ordered == unbidified, """
196-
Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
197-
bytes : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")}
198-
byte order : #{bytes |> Enum.intersperse(32)}
199-
uax9 order : #{display_ordered |> Enum.intersperse(32)}
200-
uax9 indices : #{indices |> Enum.join(" ")}
201-
unbidify/1 : #{unbidified |> Enum.intersperse(32)}
202-
""")
203-
end)
204-
end
205-
206-
test "prevents unsafe script mixing in identifiers" do
207-
exception =
208-
assert_raise SyntaxError, fn ->
209-
Code.string_to_quoted!("if аdmin_, do: :ok, else: :err")
210-
end
211-
212-
assert Exception.message(exception) =~ "nofile:1:9:"
213-
assert Exception.message(exception) =~ "invalid mixed-script identifier found: аdmin"
214-
215-
for s <- [
216-
"\\u0430 а {Cyrillic}",
217-
"\\u0064 d {Latin}",
218-
"\\u006D m {Latin}",
219-
"\\u0069 i {Latin}",
220-
"\\u006E n {Latin}",
221-
"\\u005F _"
222-
] do
223-
assert Exception.message(exception) =~ s
224-
end
225-
226-
# includes suggestion about what to change
227-
assert Exception.message(exception) =~ """
228-
Hint: You could write the above in a similar way that is accepted by Elixir:
229-
"""
230-
231-
assert Exception.message(exception) =~ """
232-
"admin_" (code points 0x00061 0x00064 0x0006D 0x00069 0x0006E 0x0005F)
233-
"""
234-
235-
# a is in cyrillic
236-
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end
237-
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end
238-
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end
239-
240-
# c is Latin
241-
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end
242-
243-
# T is in cyrillic
244-
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end
245-
end
246-
247-
test "allows legitimate script mixing" do
248-
# writing systems that legitimately mix multiple scripts, and Common chars like _
249-
assert capture_eval("幻ㄒㄧㄤ = 1") == ""
250-
assert capture_eval("幻ㄒㄧㄤ1 = 1") == ""
251-
assert capture_eval("__सवव_1? = 1") == ""
252-
253-
# uts39 5.2 allowed 'highly restrictive' script mixing, like 't-shirt' in Jpan:
254-
assert capture_quoted(":Tシャツ") == ""
255-
256-
# elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
257-
# ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
258-
assert capture_quoted("μs") == ""
259-
260-
# allows mixed scripts if the chunks are all single-script or highly restrictive
261-
assert capture_eval("http_сервер = 1") == ""
262-
assert capture_eval("сервер_http = 1") == ""
263151
end
264152
end
265153

0 commit comments

Comments
 (0)