Skip to content

Commit b255f9e

Browse files
committed
Block unsafe Unicode characters in the local part
1 parent df852f7 commit b255f9e

File tree

3 files changed

+59
-8
lines changed

3 files changed

+59
-8
lines changed

README.md

+22-6
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Key features:
1616
to end users).
1717
* (optionally) Checks deliverability: Does the domain name resolve? And you can override the default DNS resolver.
1818
* Supports internationalized domain names and (optionally)
19-
internationalized local parts.
19+
internationalized local parts, but blocks unsafe characters.
2020
* Normalizes email addresses (super important for internationalized
2121
addresses! see below).
2222

@@ -172,12 +172,28 @@ The second sort of internationalization is internationalization in the
172172
*local* part of the address (before the @-sign). In non-internationalized
173173
email addresses, only English letters, numbers, and some punctuation
174174
(`._!#$%&'^``*+-=~/?{|}`) are allowed. In internationalized email address
175-
local parts, all Unicode characters are allowed by this library, although
176-
it's possible that not all characters will be allowed by all mail systems.
177-
178-
To deliver email to addresses with Unicode, non-English characters, your mail
175+
local parts, a wider range of Unicode characters are allowed.
176+
177+
A surprisingly large number of Unicode characters are not safe to display,
178+
especially when the email address is concatenated with other text, so this
179+
library tries to protect you by not permitting reserved, noncharacter, private use,
180+
formatting (which can be used to alter the display order of characters),
181+
whitespace, and control characters, and combining characters
182+
as the first character (so that they cannot combine with something outside
183+
of the email address string). See https://qntm.org/safe and https://trojansource.codes/
184+
for relevant prior work. (Other than whitespace, these are checks that
185+
you should be applying to nearly all user inputs in a security-sensitive
186+
context.)
187+
188+
These character checks are performed after Unicode normalization (see below),
189+
so you are only fully protected if you replace all user-provided email addresses
190+
with the normalized email address string returned by this library. This does not
191+
guard against the well-known problem that many Unicode characters look alike
192+
(or are identical), which can be used to fool humans reading displayed text.
193+
194+
Email addresses with these non-ASCII characters require that your mail
179195
submission library and the mail servers along the route to the destination,
180-
including your own outbound mail server, must all support the
196+
including your own outbound mail server, all support the
181197
[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension.
182198
Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter.
183199

email_validator/__init__.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -368,8 +368,35 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
368368
# so we'll return the normalized local part in the return value.
369369
local = unicodedata.normalize("NFC", local)
370370

371+
# Check for unsafe characters.
372+
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
373+
# by DOT_ATOM_TEXT_UTF8.
374+
for i, c in enumerate(local):
375+
category = unicodedata.category(c)
376+
if category[0] in ("L", "N", "P", "S"):
377+
# letters, numbers, punctuation, and symbols are permitted
378+
pass
379+
elif category[0] == "M":
380+
# combining characters in first position would combine with something
381+
# outside of the email address if concatenated to the right, but are
382+
# otherwise permitted
383+
if i == 0:
384+
raise EmailSyntaxError("The email address contains an initial invalid character (%s)."
385+
% unicodedata.name(c, repr(c)))
386+
elif category[0] in ("Z", "C"):
387+
# spaces and line/paragraph characters (Z) and
388+
# control, format, surrogate, private use, and unassigned code points (C)
389+
raise EmailSyntaxError("The email address contains an invalid character (%s)."
390+
% unicodedata.name(c, repr(c)))
391+
else:
392+
# All categories should be handled above, but reject anything else in case
393+
# a new category is introduced in the future.
394+
raise EmailSyntaxError("The email address contains a character (%s; category %s) that may not be safe."
395+
% (unicodedata.name(c, repr(c)), category))
396+
371397
# Try encoding to UTF-8. Failure is possible with some characters like
372-
# surrogate code points.
398+
# surrogate code points, but those are checked above. Still, we don't
399+
# want to have an unhandled exception later.
373400
try:
374401
local.encode("utf8")
375402
except ValueError:

tests/test_main.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -278,12 +278,19 @@ def test_email_invalid_reserved_domain(email_input):
278278
# print(f'({email_input!r}, {str(exc_info.value)!r}),')
279279
assert "is a special-use or reserved name" in str(exc_info.value)
280280

281+
281282
@pytest.mark.parametrize(
282283
'email_input',
283284
[
284285
('white space@test'),
285286
('\n@test'),
286-
('\uD800@test'), # surrogate (Cs)
287+
('\u2005@test'), # four-per-em space (Zs)
288+
('\u009C@test'), # string terminator (Cc)
289+
('\u200B@test'), # zero-width space (Cf)
290+
('\u202Dforward-\u202Ereversed@test'), # BIDI (Cf)
291+
('\uD800@test'), # surrogate (Cs)
292+
('\uE000@test'), # private use (Co)
293+
('\uFDEF@test'), # unassigned (Cn)
287294
],
288295
)
289296
def test_email_unsafe_character(email_input):
@@ -292,6 +299,7 @@ def test_email_unsafe_character(email_input):
292299
validate_email(email_input, test_environment=True)
293300
assert "invalid character" in str(exc_info.value)
294301

302+
295303
def test_email_test_domain_name_in_test_environment():
296304
validate_email("anything@test", test_environment=True)
297305
validate_email("anything@mycompany.test", test_environment=True)

0 commit comments

Comments
 (0)