Skip to content

Add --format-email to perform full validation on "email" and "idn-email" formats #460

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Unreleased
----------

.. vendor-insert-here
- Add ``--format-email`` option to allow full validation of email/idn-email formats

0.29.0
------
Expand Down
17 changes: 17 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,23 @@ follows:
* - python
- Require the regex to be valid in Python regex syntax.

``--format-email``
~~~~~~~~~~~~~~~~~~

Set a mode for handling of the ``"email"`` and ``"idn-email"`` values for ``"format"``. The modes are as
follows:

.. list-table:: Email Options
:widths: 15 30
:header-rows: 1

* - mode
- description
* - default
- Require the email address to pass a basic sanity check
* - full
- Require the email to match RFC5321 for ``"email"`` or RFC6531 for ``"idn-email"``

Other Options
--------------

Expand Down
19 changes: 18 additions & 1 deletion src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ..catalog import CUSTOM_SCHEMA_NAMES, SCHEMA_CATALOG
from ..checker import SchemaChecker
from ..formats import KNOWN_FORMATS, RegexVariantName
from ..formats import KNOWN_FORMATS, EmailVariantName, RegexVariantName
from ..instance_loader import InstanceLoader
from ..parsers import SUPPORTED_FILE_FORMATS
from ..reporter import REPORTER_BY_NAME, Reporter
Expand Down Expand Up @@ -74,6 +74,11 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
date, date-time, email, ipv4, ipv6, regex, uuid

\b
For the "email" and "idn-email" formats, there are multiple modes which can be specified
'--format-email':
default | only check that the string contains "@"
full | check the string against RFC 5321 (email) or RFC 6531 (idn-email)

For the "regex" format, there are multiple modes which can be specified with
'--format-regex':
default | check that the string is a valid ECMAScript regex
Expand Down Expand Up @@ -155,6 +160,16 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
default=RegexVariantName.default.value,
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
)
@click.option(
"--format-email",
help=(
"Set the mode of format validation for email addresses. "
"If `--disable-formats email` or `--disable-formats idn-email` is "
"used, this option has no effect on the disabled format."
),
default=EmailVariantName.default.value,
type=click.Choice([x.value for x in EmailVariantName], case_sensitive=False),
)
@click.option(
"--default-filetype",
help="A default filetype to assume when a file's type is not detected",
Expand Down Expand Up @@ -240,6 +255,7 @@ def main(
no_cache: bool,
cache_filename: str | None,
disable_formats: tuple[list[str], ...],
format_email: Literal["full", "default"],
format_regex: Literal["python", "default"],
default_filetype: Literal["json", "yaml", "toml", "json5"],
traceback_mode: Literal["full", "short"],
Expand Down Expand Up @@ -267,6 +283,7 @@ def main(
else:
args.disable_formats = normalized_disable_formats

args.format_email = EmailVariantName(format_email)
args.format_regex = RegexVariantName(format_regex)
args.disable_cache = no_cache
args.default_filetype = default_filetype
Expand Down
4 changes: 3 additions & 1 deletion src/check_jsonschema/cli/parse_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import click
import jsonschema

from ..formats import FormatOptions, RegexVariantName
from ..formats import EmailVariantName, FormatOptions, RegexVariantName
from ..transforms import Transform


Expand Down Expand Up @@ -36,6 +36,7 @@ def __init__(self) -> None:
# regex format options
self.disable_all_formats: bool = False
self.disable_formats: tuple[str, ...] = ()
self.format_email: EmailVariantName = EmailVariantName.default
self.format_regex: RegexVariantName = RegexVariantName.default
# error and output controls
self.verbosity: int = 1
Expand Down Expand Up @@ -83,6 +84,7 @@ def set_validator(
def format_opts(self) -> FormatOptions:
    """Collect the format-related settings of this result into FormatOptions."""
    # formats are enabled unless they were all switched off
    formats_enabled = not self.disable_all_formats
    return FormatOptions(
        enabled=formats_enabled,
        disabled_formats=self.disable_formats,
        email_variant=self.format_email,
        regex_variant=self.format_regex,
    )
38 changes: 37 additions & 1 deletion src/check_jsonschema/formats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
import jsonschema.validators
import regress

from .implementations import validate_rfc3339, validate_time
from .implementations import (
validate_rfc3339,
validate_rfc5321,
validate_rfc6531,
validate_time,
)

# all known format strings except for a selection from draft3 which have either
# been renamed or removed:
Expand Down Expand Up @@ -39,6 +44,32 @@
)


class EmailVariantName(enum.Enum):
    """Validation modes for the "email" and "idn-email" formats.

    The member values are the choices offered by the ``--format-email``
    command-line option.
    """

    # basic check: the string only has to contain "@"
    default = "default"
    # strict check: RFC 5321 for "email", RFC 6531 for "idn-email"
    full = "full"


class EmailImplementation:
    """Checker callables for the "email" and "idn-email" formats.

    The variant given at construction decides how strictly string
    instances are checked.
    """

    def __init__(self, variant: EmailVariantName) -> None:
        self.variant = variant

    def check_format_email(self, instance: t.Any) -> bool:
        """Check *instance* against the "email" format.

        Non-string instances always pass. In the default variant a string
        only has to contain "@"; otherwise it is passed to validate_rfc5321.
        """
        if isinstance(instance, str):
            basic_only = self.variant == EmailVariantName.default
            return "@" in instance if basic_only else validate_rfc5321(instance)
        return True

    def check_format_idn_email(self, instance: t.Any) -> bool:
        """Check *instance* against the "idn-email" format.

        Non-string instances always pass. In the default variant a string
        only has to contain "@"; otherwise it is passed to validate_rfc6531.
        """
        if isinstance(instance, str):
            basic_only = self.variant == EmailVariantName.default
            return "@" in instance if basic_only else validate_rfc6531(instance)
        return True


class RegexVariantName(enum.Enum):
default = "default"
python = "python"
Expand Down Expand Up @@ -70,10 +101,12 @@ def __init__(
self,
*,
enabled: bool = True,
email_variant: EmailVariantName = EmailVariantName.default,
regex_variant: RegexVariantName = RegexVariantName.default,
disabled_formats: tuple[str, ...] = (),
) -> None:
self.enabled = enabled
self.email_variant = email_variant
self.regex_variant = regex_variant
self.disabled_formats = disabled_formats

Expand Down Expand Up @@ -101,7 +134,10 @@ def make_format_checker(

# replace the regex check
del checker.checkers["regex"]
email_impl = EmailImplementation(opts.email_variant)
regex_impl = RegexImplementation(opts.regex_variant)
checker.checks("email")(email_impl.check_format_email)
checker.checks("idn-email")(email_impl.check_format_idn_email)
checker.checks("regex")(regex_impl.check_format)
checker.checks("date-time")(validate_rfc3339)
checker.checks("time")(validate_time)
Expand Down
4 changes: 3 additions & 1 deletion src/check_jsonschema/formats/implementations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .iso8601_time import validate as validate_time
from .rfc3339 import validate as validate_rfc3339
from .rfc5321 import validate as validate_rfc5321
from .rfc6531 import validate as validate_rfc6531

__all__ = ("validate_rfc3339", "validate_time")
__all__ = ("validate_rfc3339", "validate_rfc5321", "validate_rfc6531", "validate_time")
41 changes: 41 additions & 0 deletions src/check_jsonschema/formats/implementations/rfc5321.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import re

RFC5321_REGEX = re.compile(
r"""
^
(
[!#-'*+/-9=?A-Z^-~-]+(\.[!#-'*+/-9=?A-Z^-~-]+)*
|
"([]!#-[^-~ \t]|(\\[\t -~]))+"
)
@
(
[!#-'*+/-9=?A-Z^-~-]+(\.[!#-'*+/-9=?A-Z^-~-]+)*
|
\[[\t -Z^-~]*]
)
$
""",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm finding this regex a bit difficult to read. In particular, I'm seeing some unconventional range expressions in the character classes, like /-9, -Z, and ^-~.
These are valid, but they aren't the way character classes are typically written. Perhaps other people have an expert and intuitive knowledge of what chr(ord(" ")+1) will be, but I definitely don't.

Can these be rewritten such that the suite of characters matched is more obvious to a reader? For example, rather than [/-9], I would much rather see [/0-9]. The fact that the lowercase letters are captured with ^-~ caught me particularly off-guard.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair. I'm not the original author of these regexes either (links in PR description). I'll see what I can do about cleaning up those character classes though.

Copy link
Author

@trzejos trzejos Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sirosen

I had some time this week to revisit this, and reworked the simpler RFC5321 validation. I also added in some length checks as well and tried validating it against the examples in this wikipedia page: https://en.wikipedia.org/wiki/Email_address#Examples

It validated/invalidated as expected except for a couple cases:

  • I❤️[email protected] was incorrectly found invalid, likely because of UTF-8
    • This is actually the correct behavior. UTF-8 email addresses should only be allowed in the idn-email format
  • i.like.underscores@but_they_are_not_allowed_in_this_part was incorrectly found valid, because the regex allows underscores in the domain part.

Let me know if the regex is easier to understand and if you think we need to handle non-ASCII strings like that UTF-8 one.

I have not revisited the idn-email validator yet

re.VERBOSE | re.ASCII,
)


def validate(email_str: object) -> bool:
    """Validate a string as a RFC5321 email address.

    Returns ``False`` for any value that is not a ``str``; otherwise returns
    whether the whole string matches ``RFC5321_REGEX``.
    """
    if not isinstance(email_str, str):
        return False
    # fullmatch() requires the pattern to consume the entire string. With
    # match(), the pattern's trailing "$" also succeeds just before a final
    # newline, so an address followed by a line break was accepted.
    return RFC5321_REGEX.fullmatch(email_str) is not None


if __name__ == "__main__":
    # Ad-hoc micro-benchmark; runs only when this module is executed directly.
    import timeit

    N = 100_000
    # (label, input) pairs. The sample must be a real address so that the
    # timing covers a successful match rather than an early rejection.
    tests = (("basic", "user@example.com"),)

    print("benchmarking")
    for name, val in tests:
        # best of 3 runs of N calls each, reported as nanoseconds per call
        all_times = timeit.repeat(
            f"validate({val!r})", globals=globals(), repeat=3, number=N
        )
        print(f"{name} (valid={validate(val)}): {int(min(all_times) / N * 10**9)}ns")
61 changes: 61 additions & 0 deletions src/check_jsonschema/formats/implementations/rfc6531.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re

# Pattern for an internationalized mailbox (local-part "@" domain or address
# literal). Compiled case-insensitively because the character classes below
# only spell out lowercase ASCII letters and hex digits; without IGNORECASE
# any uppercase letter in the address caused a rejection.
RFC6531_REGEX = re.compile(
    r"""
    ^
    # local part
    (
      # dot-string: atoms joined by single dots. The "-" is escaped so it is
      # a literal; unescaped it formed a range from "+" to "/" that also
      # admitted "," and "." as atom characters.
      ([0-9a-z!#$%&'*+\-/=?^_`{|}~\u0080-\U0010FFFF]+(\.[0-9a-z!#$%&'*+\-/=?^_`{|}~\u0080-\U0010FFFF]+)*)
      |
      # quoted string
      "([\x20-\x21\x23-\x5B\x5D-\x7E\u0080-\U0010FFFF]|\\[\x20-\x7E])*"
    )
    @
    # Domain/address
    (
      # Address literal
      (\[(
        # IPv4 (explicit ASCII digit ranges: the digit shorthand would also
        # match non-ASCII decimal digits in a str pattern)
        ([0-9]{1,3}(\.[0-9]{1,3}){3})
        |
        # IPv6
        (IPv6:[0-9a-f]{1,4}(:[0-9a-f]{1,4}){7})
        |
        (IPv6:([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,5})?::([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,5})?)
        |
        (IPv6:[0-9a-f]{1,4}(:[0-9a-f]{1,4}){5}:[0-9]{1,3}(\.[0-9]{1,3}){3})
        |
        (IPv6:([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,3})?::([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,3}:)?[0-9]{1,3}(\.[0-9]{1,3}){3})
        |
        # General address
        ([a-z0-9-]*[a-z0-9]:[\x21-\x5A\x5E-\x7E]+)
      )\])
      |
      # Domain
      ((?!.{256,})(([0-9a-z\u0080-\U0010FFFF]([0-9a-z-\u0080-\U0010FFFF]*[0-9a-z\u0080-\U0010FFFF])?))(\.([0-9a-z\u0080-\U0010FFFF]([0-9a-z-\u0080-\U0010FFFF]*[0-9a-z\u0080-\U0010FFFF])?))*)
    )
    # absolute end of string: unlike "$", this does not match before a
    # trailing newline
    \Z
    """,
    re.VERBOSE | re.UNICODE | re.IGNORECASE,
)


def validate(email_str: object) -> bool:
    """Validate a string as a RFC6531 email address.

    Returns ``False`` for any value that is not a ``str``; otherwise returns
    whether the whole string matches ``RFC6531_REGEX``.
    """
    if not isinstance(email_str, str):
        return False
    # fullmatch() requires the pattern to consume the entire string, so a
    # "$"-terminated pattern can no longer accept an address that is
    # followed by a trailing newline (which match() allowed).
    return RFC6531_REGEX.fullmatch(email_str) is not None


if __name__ == "__main__":
    # Ad-hoc micro-benchmark; runs only when this module is executed directly.
    import timeit

    N = 100_000
    # (label, input) pairs. The sample must be a real address so that the
    # timing covers a successful match rather than an early rejection.
    tests = (("basic", "user@example.com"),)

    print("benchmarking")
    for name, val in tests:
        # best of 3 runs of N calls each, reported as nanoseconds per call
        all_times = timeit.repeat(
            f"validate({val!r})", globals=globals(), repeat=3, number=N
        )
        print(f"{name} (valid={validate(val)}): {int(min(all_times) / N * 10**9)}ns")
Loading