HypothesisWorks
diff --git a/‎hypothesis-python/RELEASE.rst
+10 b/‎hypothesis-python/RELEASE.rst
+10
diff --git a/‎hypothesis-python/src/_hypothesis_pytestplugin.py
+7 b/‎hypothesis-python/src/_hypothesis_pytestplugin.py
+7
diff --git a/‎hypothesis-python/src/hypothesis/core.py
+69-79 b/‎hypothesis-python/src/hypothesis/core.py
+69-79
diff --git a/‎hypothesis-python/src/hypothesis/errors.py
+12-3 b/‎hypothesis-python/src/hypothesis/errors.py
+12-3
diff --git a/‎hypothesis-python/src/hypothesis/internal/escalation.py
+1-1 b/‎hypothesis-python/src/hypothesis/internal/escalation.py
+1-1
diff --git a/‎hypothesis-python/src/hypothesis/reporting.py
-4 b/‎hypothesis-python/src/hypothesis/reporting.py
-4
diff --git a/‎hypothesis-python/tests/cover/test_arbitrary_data.py
+13-24 b/‎hypothesis-python/tests/cover/test_arbitrary_data.py
+13-24
@@ -0,0 +1,10 @@
+RELEASE_TYPE: minor
+
+Reporting of :obj:`multiple failing examples <hypothesis.settings.report_multiple_bugs>`
+now uses the :pep:`654` `ExceptionGroup <https://docs.python.org/3.11/library/exceptions.html#ExceptionGroup>`__ type, which is provided by the
+:pypi:`exceptiongroup` backport on Python 3.10 and earlier (:issue:`3175`).
+``hypothesis.errors.MultipleFailures`` is therefore deprecated.
+
+Failing examples and other reports are now stored as :pep:`678` exception notes, which
+ensures that they will always appear together with the traceback and other information
+about their respective error.
@@ -178,6 +178,13 @@ def pytest_configure(config):
                 pass
             core.global_force_seed = seed
 
+        core.pytest_shows_exceptiongroups = (
+            sys.version_info[:2] >= (3, 11)
+            ## See https://github.com/pytest-dev/pytest/issues/9159
+            # or pytest_version >= (7, 2)  # TODO: fill in correct version here
+            or config.getoption("tbstyle", "auto") == "native"
+        )
+
     @pytest.hookimpl(hookwrapper=True)
     def pytest_runtest_call(item):
         __tracebackhide__ = True
 
@@ -60,7 +60,6 @@
     HypothesisDeprecationWarning,
     HypothesisWarning,
     InvalidArgument,
-    MultipleFailures,
     NoSuchExample,
     StopTest,
     Unsatisfiable,
@@ -69,6 +68,7 @@
 from hypothesis.executors import default_new_style_executor, new_style_executor
 from hypothesis.internal.compat import (
     PYPY,
+    BaseExceptionGroup,
     bad_django_TestCase,
     get_type_hints,
     int_from_bytes,
@@ -126,6 +126,7 @@
 
 
 running_under_pytest = False
+pytest_shows_exceptiongroups = True
 global_force_seed = None
 _hypothesis_global_random = None
 
@@ -436,7 +437,7 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s
                     err = new
 
                 yield (fragments_reported, err)
-                if state.settings.report_multiple_bugs:
+                if state.settings.report_multiple_bugs and pytest_shows_exceptiongroups:
                     continue
                 break
             finally:
@@ -575,7 +576,6 @@ def __init__(
         self.settings = settings
         self.last_exception = None
         self.falsifying_examples = ()
-        self.__was_flaky = False
         self.random = random
         self.__test_runtime = None
         self.ever_executed = False
@@ -710,11 +710,10 @@ def run(data):
                 )
             else:
                 report("Failed to reproduce exception. Expected: \n" + traceback)
-            self.__flaky(
-                f"Hypothesis {text_repr} produces unreliable results: Falsified"
-                " on the first call but did not on a subsequent one",
-                cause=exception,
-            )
+            raise Flaky(
+                f"Hypothesis {text_repr} produces unreliable results: "
+                "Falsified on the first call but did not on a subsequent one"
+            ) from exception
         return result
 
     def _execute_once_for_engine(self, data):
@@ -842,64 +841,57 @@ def run_engine(self):
 
         if not self.falsifying_examples:
             return
-        elif not self.settings.report_multiple_bugs:
+        elif not (self.settings.report_multiple_bugs and pytest_shows_exceptiongroups):
             # Pretend that we only found one failure, by discarding the others.
             del self.falsifying_examples[:-1]
 
         # The engine found one or more failures, so we need to reproduce and
         # report them.
 
-        flaky = 0
+        errors_to_report = []
 
-        if runner.best_observed_targets:
-            for line in describe_targets(runner.best_observed_targets):
-                report(line)
-            report("")
+        report_lines = describe_targets(runner.best_observed_targets)
+        if report_lines:
+            report_lines.append("")
 
         explanations = explanatory_lines(self.explain_traces, self.settings)
         for falsifying_example in self.falsifying_examples:
             info = falsifying_example.extra_information
+            fragments = []
 
             ran_example = ConjectureData.for_buffer(falsifying_example.buffer)
-            self.__was_flaky = False
             assert info.__expected_exception is not None
             try:
-                self.execute_once(
-                    ran_example,
-                    print_example=not self.is_find,
-                    is_final=True,
-                    expected_failure=(
-                        info.__expected_exception,
-                        info.__expected_traceback,
-                    ),
-                )
+                with with_reporter(fragments.append):
+                    self.execute_once(
+                        ran_example,
+                        print_example=not self.is_find,
+                        is_final=True,
+                        expected_failure=(
+                            info.__expected_exception,
+                            info.__expected_traceback,
+                        ),
+                    )
             except (UnsatisfiedAssumption, StopTest) as e:
-                report(format_exception(e, e.__traceback__))
-                self.__flaky(
+                err = Flaky(
                     "Unreliable assumption: An example which satisfied "
                     "assumptions on the first run now fails it.",
-                    cause=e,
                 )
+                err.__cause__ = err.__context__ = e
+                errors_to_report.append((fragments, err))
             except BaseException as e:
                 # If we have anything for explain-mode, this is the time to report.
                 for line in explanations[falsifying_example.interesting_origin]:
-                    report(line)
-
-                if len(self.falsifying_examples) <= 1:
-                    # There is only one failure, so we can report it by raising
-                    # it directly.
-                    raise
-
-                # We are reporting multiple failures, so we need to manually
-                # print each exception's stack trace and information.
-                tb = get_trimmed_traceback()
-                report(format_exception(e, tb))
+                    fragments.append(line)
+                errors_to_report.append(
+                    (fragments, e.with_traceback(get_trimmed_traceback()))
+                )
 
             finally:
                 # Whether or not replay actually raised the exception again, we want
                 # to print the reproduce_failure decorator for the failing example.
                 if self.settings.print_blob:
-                    report(
+                    fragments.append(
                         "\nYou can reproduce this example by temporarily adding "
                         "@reproduce_failure(%r, %r) as a decorator on your test case"
                         % (__version__, encode_failure(falsifying_example.buffer))
@@ -908,30 +900,38 @@ def run_engine(self):
                 # hold on to a reference to ``data`` know that it's now been
                 # finished and they can't draw more data from it.
                 ran_example.freeze()
+        _raise_to_user(errors_to_report, self.settings, report_lines)
 
-            if self.__was_flaky:
-                flaky += 1
-
-        # If we only have one example then we should have raised an error or
-        # flaky prior to this point.
-        assert len(self.falsifying_examples) > 1
 
-        if flaky > 0:
-            raise Flaky(
-                f"Hypothesis found {len(self.falsifying_examples)} distinct failures, "
-                f"but {flaky} of them exhibited some sort of flaky behaviour."
-            )
-        else:
-            raise MultipleFailures(
-                f"Hypothesis found {len(self.falsifying_examples)} distinct failures."
-            )
+def add_note(exc, note):
+    """Attach ``note`` to ``exc`` as a :pep:`678` exception note.
+
+    Calls ``exc.add_note(note)`` when that works; if it raises
+    ``AttributeError``, falls back to appending to ``exc.__notes__``,
+    creating the list first when it is missing.
+
+    This is a best-effort helper: if ``exc`` rejects attribute assignment
+    (for example because its class defines ``__slots__``), the note is
+    dropped rather than letting an ``AttributeError`` replace the error
+    that is being reported.
+    """
+    try:
+        exc.add_note(note)
+    except AttributeError:
+        if not hasattr(exc, "__notes__"):
+            try:
+                exc.__notes__ = []
+            except AttributeError:
+                # Notes cannot be stored on this object at all, so give up.
+                return
+        exc.__notes__.append(note)
+
+
+def _raise_to_user(errors_to_report, settings, target_lines, trailer=""):
+    """Helper function for attaching notes and grouping multiple errors."""
+    # ``errors_to_report`` is a non-empty sequence of ``(fragments, err)``
+    # pairs; ``target_lines`` is an iterable of extra note lines for the
+    # exception that is finally raised; ``trailer`` is spliced into the
+    # group message.  This function never returns normally: it always raises.
+
+    # Notes are only attached at normal verbosity or above.
+    if settings.verbosity >= Verbosity.normal:
+        for fragments, err in errors_to_report:
+            for note in fragments:
+                add_note(err, note)
+
+    # A single error is raised as-is; several are wrapped in one group.
+    if len(errors_to_report) == 1:
+        _, the_error_hypothesis_found = errors_to_report[0]
+    else:
+        # Reaching this branch with an empty list is a caller bug.
+        assert errors_to_report
+        the_error_hypothesis_found = BaseExceptionGroup(
+            f"Hypothesis found {len(errors_to_report)} distinct failures{trailer}.",
+            [e for _, e in errors_to_report],
+        )
 
-    def __flaky(self, message, *, cause):
-        if len(self.falsifying_examples) <= 1:
-            raise Flaky(message) from cause
-        else:
-            self.__was_flaky = True
-            report("Flaky example! " + message)
+    # The target lines go on whichever exception is raised: the single error
+    # itself, or the enclosing group.
+    if settings.verbosity >= Verbosity.normal:
+        for line in target_lines:
+            add_note(the_error_hypothesis_found, line)
+    raise the_error_hypothesis_found
 
 
 @contextlib.contextmanager
@@ -1189,23 +1189,11 @@ def wrapped_test(*arguments, **kwargs):
                     state, wrapped_test, arguments, kwargs, original_sig
                 )
             )
-            with local_settings(state.settings):
-                if len(errors) > 1:
-                    # If we're not going to report multiple bugs, we would have
-                    # stopped running explicit examples at the first failure.
-                    assert state.settings.report_multiple_bugs
-                    for fragments, err in errors:
-                        for f in fragments:
-                            report(f)
-                        report(format_exception(err, err.__traceback__))
-                    raise MultipleFailures(
-                        f"Hypothesis found {len(errors)} failures in explicit examples."
-                    )
-                elif errors:
-                    fragments, the_error_hypothesis_found = errors[0]
-                    for f in fragments:
-                        report(f)
-                    raise the_error_hypothesis_found
+            if errors:
+                # If we're not going to report multiple bugs, we would have
+                # stopped running explicit examples at the first failure.
+                assert len(errors) == 1 or state.settings.report_multiple_bugs
+                _raise_to_user(errors, state.settings, [], " in explicit examples")
 
             # If there were any explicit examples, they all ran successfully.
             # The next step is to use the Conjecture engine to run the test on
@@ -1236,7 +1224,7 @@ def wrapped_test(*arguments, **kwargs):
                     state.run_engine()
             except BaseException as e:
                 # The exception caught here should either be an actual test
-                # failure (or MultipleFailures), or some kind of fatal error
+                # failure (or BaseExceptionGroup), or some kind of fatal error
                 # that caused the engine to stop.
 
                 generated_seed = wrapped_test._hypothesis_internal_use_generated_seed
@@ -1262,7 +1250,9 @@ def wrapped_test(*arguments, **kwargs):
                     # which will actually appear in tracebacks is as clear as
                     # possible - "raise the_error_hypothesis_found".
                     the_error_hypothesis_found = e.with_traceback(
-                        get_trimmed_traceback()
+                        None
+                        if isinstance(e, BaseExceptionGroup)
+                        else get_trimmed_traceback()
                     )
                     raise the_error_hypothesis_found
 
 
@@ -124,9 +124,18 @@ class Frozen(HypothesisException):
     after freeze() has been called."""
 
 
-class MultipleFailures(_Trimmable):
-    """Indicates that Hypothesis found more than one distinct bug when testing
-    your code."""
+def __getattr__(name):
+    """Module-level attribute hook (:pep:`562`) for deprecated names.
+
+    Accessing ``MultipleFailures`` emits a deprecation note and returns
+    ``BaseExceptionGroup``.  Any other name raises ``AttributeError``, so
+    that ``hasattr()`` and ``getattr()`` on this module behave normally
+    instead of silently yielding ``None``.
+    """
+    if name == "MultipleFailures":
+        # Imported only when the deprecated name is actually requested.
+        from hypothesis._settings import note_deprecation
+        from hypothesis.internal.compat import BaseExceptionGroup
+
+        note_deprecation(
+            "MultipleFailures is deprecated; use the builtin `BaseExceptionGroup` type "
+            "instead, or `exceptiongroup.BaseExceptionGroup` before Python 3.11",
+            since="RELEASEDAY",
+            has_codemod=False,  # This would be a great PR though!
+        )
+        return BaseExceptionGroup
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
 class DeadlineExceeded(_Trimmable):
 
@@ -86,7 +86,7 @@ def get_trimmed_traceback(exception=None):
     else:
         tb = exception.__traceback__
     # Avoid trimming the traceback if we're in verbose mode, or the error
-    # was raised inside Hypothesis (and is not a MultipleFailures)
+    # was raised inside Hypothesis
     if hypothesis.settings.default.verbosity >= hypothesis.Verbosity.debug or (
         is_hypothesis_file(traceback.extract_tb(tb)[-1][0])
         and not isinstance(exception, _Trimmable)
 
@@ -15,10 +15,6 @@
 from hypothesis.utils.dynamicvariables import DynamicVariable
 
 
-def silent(value):
-    pass
-
-
 def default(value):
     try:
         print(value)
 
@@ -11,11 +11,9 @@
 import pytest
 from pytest import raises
 
-from hypothesis import find, given, reporting, strategies as st
+from hypothesis import find, given, strategies as st
 from hypothesis.errors import InvalidArgument
 
-from tests.common.utils import capture_out
-
 
 @given(st.integers(), st.data())
 def test_conditional_draw(x, data):
@@ -32,13 +30,10 @@ def test(data):
         if y in x:
             raise ValueError()
 
-    with raises(ValueError):
-        with capture_out() as out:
-            with reporting.with_reporter(reporting.default):
-                test()
-    result = out.getvalue()
-    assert "Draw 1: [0, 0]" in result
-    assert "Draw 2: 0" in result
+    with raises(ValueError) as err:
+        test()
+    assert "Draw 1: [0, 0]" in err.value.__notes__
+    assert "Draw 2: 0" in err.value.__notes__
 
 
 def test_prints_labels_if_given_on_failure():
@@ -50,13 +45,10 @@ def test(data):
         x.remove(y)
         assert y not in x
 
-    with raises(AssertionError):
-        with capture_out() as out:
-            with reporting.with_reporter(reporting.default):
-                test()
-    result = out.getvalue()
-    assert "Draw 1 (Some numbers): [0, 0]" in result
-    assert "Draw 2 (A number): 0" in result
+    with raises(AssertionError) as err:
+        test()
+    assert "Draw 1 (Some numbers): [0, 0]" in err.value.__notes__
+    assert "Draw 2 (A number): 0" in err.value.__notes__
 
 
 def test_given_twice_is_same():
@@ -66,13 +58,10 @@ def test(data1, data2):
         data2.draw(st.integers())
         raise ValueError()
 
-    with raises(ValueError):
-        with capture_out() as out:
-            with reporting.with_reporter(reporting.default):
-                test()
-    result = out.getvalue()
-    assert "Draw 1: 0" in result
-    assert "Draw 2: 0" in result
+    with raises(ValueError) as err:
+        test()
+    assert "Draw 1: 0" in err.value.__notes__
+    assert "Draw 2: 0" in err.value.__notes__
 
 
 def test_errors_when_used_in_find():