-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Draft of directional confusability protection & allowing script mixing in identifiers when separated by underscores #13693
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -139,6 +139,70 @@ defmodule Kernel.WarningTest do | |
"confusable identifier: 'a' looks like 'а' on line 1" | ||
end | ||
|
||
test "warns on LTR-confusables" do | ||
# warning outputs in byte order (vs bidi algo display order, uax9), mentions presence of rtl | ||
assert_warn_quoted( | ||
["nofile:1:9", "'_1א' looks like '_א1'", "right-to-left characters"], | ||
"_א1 and _1א" | ||
) | ||
|
||
assert_warn_quoted( | ||
[ | ||
"'a_1א' includes right-to-left characters", | ||
"\\u0061 a ltr", | ||
"\\u005F _ neutral", | ||
"\\u0031 1 weak_number", | ||
"\\u05D0 א rtl", | ||
"'a_א1' includes right-to-left characters:", | ||
"\\u0061 a ltr", | ||
"\\u005F _ neutral", | ||
"\\u05D0 א rtl", | ||
"\\u0031 1 weak_number" | ||
], | ||
"a_א1 or a_1א" | ||
) | ||
|
||
# test that the implementation of String.Tokenizer.Security.unbidify/1 agrees | ||
# w/Unicode Bidi Algo (UAX9) for these (identifier-specific, no-bracket) examples | ||
# | ||
# you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2 | ||
# inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt | ||
# | ||
# there's a spurious ;A; after the identifier, because the semicolon is dir-neutral, and | ||
# deleting it makes these examples hard to read in many/most editors! | ||
""" | ||
foo;A;0066 006F 006F;0 1 2 | ||
_foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4 | ||
__foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6 | ||
لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8 | ||
foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12 | ||
foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4 | ||
foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4 | ||
foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4 | ||
foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 | ||
foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 | ||
foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18 | ||
Comment on lines
+165
to
+184
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @josevalim I added these example-based tests, which fed back into the implementation by exercising things I hadn't thought through, like 'which span a neutral character or a number will attach to'. Unicode has a great Bidi Algo testing tool, which will spit out the list of rearranged indices. Let me know if we don't like this format for these tests, and if so what kind of format would be better; in the absence of anything else, I liked BidiCharacterTest.txt's approach of "each test case has a list of codepoints, and a list of indices that they should be reordered to". |
||
""" | ||
|> String.split("\n", trim: true) | ||
|> Enum.map(&String.split(&1, ";", trim: true)) | ||
|> Enum.each(fn | ||
[ident, _, bytes, indices | _rest] -> | ||
bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16)) | ||
indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1) | ||
display_ordered = for i <- indices, do: Enum.at(bytes, i) | ||
unbidified = String.Tokenizer.Security.unbidify(bytes) | ||
|
||
assert(display_ordered == unbidified, """ | ||
Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}' | ||
bytes : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")} | ||
byte order : #{bytes |> Enum.intersperse(32)} | ||
uax9 order : #{display_ordered |> Enum.intersperse(32)} | ||
uax9 indices : #{indices |> Enum.join(" ")} | ||
unbidify/1 : #{unbidified |> Enum.intersperse(32)} | ||
""") | ||
end) | ||
end | ||
|
||
test "prevents unsafe script mixing in identifiers" do | ||
exception = | ||
assert_raise SyntaxError, fn -> | ||
|
@@ -172,7 +236,9 @@ defmodule Kernel.WarningTest do | |
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[аdmin: 1]") end | ||
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[{:аdmin, 1}]") end | ||
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("quote do: аdmin(1)") end | ||
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("рос_api = 1") end | ||
|
||
# c is Latin | ||
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("http_cервер = 1") end | ||
|
||
# T is in cyrillic | ||
assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!("[Тシャツ: 1]") end | ||
|
@@ -190,6 +256,10 @@ defmodule Kernel.WarningTest do | |
# elixir's normalizations combine scriptsets of the 'from' and 'to' characters, | ||
# ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all | ||
assert capture_quoted("μs") == "" | ||
|
||
# allows mixed scripts if the chunks are all single-script or highly restrictive | ||
assert capture_eval("http_сервер = 1") == "" | ||
assert capture_eval("сервер_http = 1") == "" | ||
end | ||
end | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,15 +28,19 @@ defmodule String.Tokenizer do | |
end | ||
end | ||
|
||
{letter_uptitlecase, start, continue, _} = | ||
{letter_uptitlecase, start, continue, dir_rtls, dir_neutrals, _} = | ||
Path.join(__DIR__, "UnicodeData.txt") | ||
|> File.read!() | ||
|> String.split(["\r\n", "\n"], trim: true) | ||
|> Enum.reduce({[], [], [], nil}, fn line, acc -> | ||
{letter_uptitlecase, start, continue, first} = acc | ||
|> Enum.reduce({[], [], [], [], [], nil}, fn line, acc -> | ||
{letter_uptitlecase, start, continue, rtls, neutrals, first} = acc | ||
|
||
# https://www.unicode.org/reports/tr44/tr44-32.html#UnicodeData.txt | ||
[codepoint, line] = :binary.split(line, ";") | ||
[name, line] = :binary.split(line, ";") | ||
[category, _] = :binary.split(line, ";") | ||
[category, line] = :binary.split(line, ";") | ||
[_canonical_combining, line] = :binary.split(line, ";") | ||
[bidi, _] = :binary.split(line, ";") | ||
|
||
{codepoints, first} = | ||
case name do | ||
|
@@ -52,18 +56,25 @@ defmodule String.Tokenizer do | |
{[String.to_integer(codepoint, 16)], nil} | ||
end | ||
|
||
{rtls, neutrals} = | ||
cond do | ||
bidi in ~w(R AL)s -> {codepoints ++ rtls, neutrals} | ||
bidi in ~w(WS ON CS EN ES ET NSM)s -> {rtls, codepoints ++ neutrals} | ||
true -> {rtls, neutrals} | ||
end | ||
|
||
cond do | ||
category in ~w(Lu Lt) -> | ||
{codepoints ++ letter_uptitlecase, start, continue, first} | ||
{codepoints ++ letter_uptitlecase, start, continue, rtls, neutrals, first} | ||
|
||
category in ~w(Ll Lm Lo Nl) -> | ||
{letter_uptitlecase, codepoints ++ start, continue, first} | ||
{letter_uptitlecase, codepoints ++ start, continue, rtls, neutrals, first} | ||
|
||
category in ~w(Mn Mc Nd Pc) -> | ||
{letter_uptitlecase, start, codepoints ++ continue, first} | ||
{letter_uptitlecase, start, codepoints ++ continue, rtls, neutrals, first} | ||
|
||
true -> | ||
{letter_uptitlecase, start, continue, first} | ||
{letter_uptitlecase, start, continue, rtls, neutrals, first} | ||
end | ||
end) | ||
|
||
|
@@ -327,6 +338,28 @@ defmodule String.Tokenizer do | |
|
||
defp unicode_continue(_), do: @bottom | ||
|
||
# subset of direction-changing/neutral characters valid in idents | ||
id_all = id_upper ++ id_start ++ id_continue | ||
dir_rtls = for c <- dir_rtls, c in id_all, do: {c, :rtl} | ||
dir_neutrals = for c <- dir_neutrals, c not in 48..57, c in id_all, do: {c, :neutral} | ||
dir_ranges = rangify.(dir_rtls) ++ rangify.(dir_neutrals) | ||
|
||
# direction of a codepoint. (rtl, neutral, weak, ltr fallback) | ||
# weaks are pulled towards previous directional spans, | ||
# but the only weaks allowed in idents are numbers 0..9 | ||
def dir(i) when i in 48..57, do: :weak_number | ||
|
||
for {first, last, direction} <- dir_ranges do | ||
if first == last do | ||
def dir(unquote(first)), do: unquote(direction) | ||
else | ||
def dir(i) when i in unquote(first)..unquote(last), do: unquote(direction) | ||
end | ||
end | ||
|
||
def dir(i) when is_integer(i), do: :ltr | ||
def dir(_), do: {:error, :codepoint_must_be_integer} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a reason we don't want to raise here? This will always be a bug, right? |
||
|
||
# Hard-coded normalizations. Also split by upper, start, continue. | ||
|
||
for {from, to} <- start_normalizations do | ||
|
@@ -448,7 +481,7 @@ defmodule String.Tokenizer do | |
[:nfkc | List.delete(special, :nfkc)] | ||
end | ||
|
||
if scriptset != @bottom or highly_restrictive?(acc) do | ||
if scriptset != @bottom or chunks_single_or_highly_restrictive?(acc) do | ||
{kind, acc, rest, length, false, special} | ||
else | ||
breakdown = | ||
|
@@ -477,24 +510,34 @@ defmodule String.Tokenizer do | |
Mixed-script identifiers are not supported for security reasons. \ | ||
'#{acc}' is made of the following scripts:\n | ||
#{breakdown} | ||
All characters in the identifier should resolve to a single script, \ | ||
or use a highly restrictive set of scripts. | ||
All characters in identifier chunks should resolve to a single script, \ | ||
or a highly restrictive set of scripts. | ||
""" | ||
|
||
{:error, {:not_highly_restrictive, acc, {prefix, suffix}}} | ||
end | ||
end | ||
|
||
defp highly_restrictive?(acc) do | ||
defp chunks_single_or_highly_restrictive?(acc) do | ||
# support script mixing via chunked identifiers (UTS 55-5's strong recco) | ||
# each chunk in an ident like foo_bar_baz should pass checks | ||
acc | ||
|> :string.tokens([?_]) | ||
|> Enum.all?(&single_or_highly_restrictive?/1) | ||
end | ||
|
||
defp single_or_highly_restrictive?(acc) do | ||
scriptsets = Enum.map(acc, &codepoint_to_scriptset/1) | ||
is_single_script = @bottom != Enum.reduce(scriptsets, @top, &ss_intersect/2) | ||
|
||
# 'A set of scripts is defined to cover a string if the intersection of | ||
# that set with the augmented script sets of all characters in the string | ||
# is nonempty; in other words, if every character in the string shares at | ||
# least one script with the cover set.' | ||
Enum.any?(@highly_restrictive, fn restrictive -> | ||
Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom)) | ||
end) | ||
is_single_script or | ||
Enum.any?(@highly_restrictive, fn restrictive -> | ||
Enum.all?(scriptsets, &(ss_intersect(&1, restrictive) != @bottom)) | ||
end) | ||
end | ||
|
||
defp codepoint_to_scriptset(head) do | ||
|
Uh oh!
There was an error while loading. Please reload this page.