Merge pull request #3979 from jobh/gc-accounting

Zac-HD · web-flow · commit 7b1d10693ba6 · 2024-06-05T13:36:07.000-07:00
Account for time spent in garbage collection
diff --git a/hypothesis-python/RELEASE-sample.rst b/hypothesis-python/RELEASE-sample.rst
@@ -21,13 +21,17 @@ which should:
   an internal invariant." (the complete changelog for version 6.99.11)
 - use ``double backticks`` for verbatim code,
 - use Sphinx cross-references to any functions or classes mentioned:
-  - :pypi:`package` for links to external packages,
+  - :pypi:`package` for links to external packages.
   - :func:`package.function` for link to functions, where the link text will
-    be ``package.function``, or :func:`~package.function` to show ``function``,
-  - :class:`package.class` for link to classes (abbreviated as above),
-  - :issue:`issue-number` for referencing issues,
-  - :doc:`link text <chapter#anchor>` for documentation references
-    (``https://hypothesis.readthedocs.io/en/latest/<chapter>.html#<anchor>``)
+    be ``package.function``, or :func:`~package.function` to show ``function``.
+  - :class:`package.class` for link to classes (abbreviated as above).
+  - :issue:`issue-number` for referencing issues.
+  - Similarly, :pull:`pr-number` can be used for PRs, but it's usually
+    preferred to refer to version numbers such as :ref:`version 6.98.9 <v6.98.9>`,
+    as they are meaningful to end users.
+  - :doc:`link text <chapter#anchor>` for documentation references.
+  - `link text <https://hypothesis.readthedocs.io/en/latest/chapter.html#anchor>`__
+    is the same link, for general web addresses.
 - finish with a note of thanks from the maintainers. If this is your first
   contribution, don't forget to add yourself to AUTHORS.rst!
 
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,8 @@
+RELEASE_TYPE: patch
+
+Account for time spent in garbage collection during tests, to avoid
+flaky ``DeadlineExceeded`` errors as seen in :issue:`3975`.
+
+Also fixes overcounting of stateful run times,
+a minor observability bug dating to :ref:`version 6.98.9 <v6.98.9>`
+(:pull:`3890`).
diff --git a/hypothesis-python/docs/schema_observations.json b/hypothesis-python/docs/schema_observations.json
@@ -46,7 +46,7 @@
                 },
                 "timing": {
                     "type": "object",
-                    "description": "The time in seconds taken by non-overlapping parts of this test case.  Hypothesis reports execute:test, and generate:{argname} for each argument.",
+                    "description": "The time in seconds taken by non-overlapping parts of this test case.  Hypothesis reports execute:test, overall:gc, and generate:{argname} for each argument.",
                     "additionalProperties": {
                         "type": "number",
                         "minimum": 0
@@ -98,4 +98,4 @@
             "additionalProperties": false
         }
     ]
-}
+}
diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py
@@ -77,7 +77,10 @@
 )
 from hypothesis.internal.conjecture.data import ConjectureData, Status
 from hypothesis.internal.conjecture.engine import BUFFER_SIZE, ConjectureRunner
-from hypothesis.internal.conjecture.junkdrawer import ensure_free_stackframes
+from hypothesis.internal.conjecture.junkdrawer import (
+    ensure_free_stackframes,
+    gc_cumulative_time,
+)
 from hypothesis.internal.conjecture.shrinker import sort_key
 from hypothesis.internal.entropy import deterministic_PRNG
 from hypothesis.internal.escalation import (
@@ -820,30 +823,45 @@ def execute_once(
         self._string_repr = ""
         text_repr = None
         if self.settings.deadline is None and not TESTCASE_CALLBACKS:
-            test = self.test
+
+            @proxies(self.test)
+            def test(*args, **kwargs):
+                with ensure_free_stackframes():
+                    return self.test(*args, **kwargs)
+
         else:
 
             @proxies(self.test)
             def test(*args, **kwargs):
                 arg_drawtime = math.fsum(data.draw_times.values())
+                arg_stateful = math.fsum(data._stateful_run_times.values())
+                arg_gctime = gc_cumulative_time()
                 start = time.perf_counter()
                 try:
-                    result = self.test(*args, **kwargs)
+                    with ensure_free_stackframes():
+                        result = self.test(*args, **kwargs)
                 finally:
                     finish = time.perf_counter()
                     in_drawtime = math.fsum(data.draw_times.values()) - arg_drawtime
-                    runtime = datetime.timedelta(seconds=finish - start - in_drawtime)
+                    in_stateful = (
+                        math.fsum(data._stateful_run_times.values()) - arg_stateful
+                    )
+                    in_gctime = gc_cumulative_time() - arg_gctime
+                    runtime = finish - start - in_drawtime - in_stateful - in_gctime
                     self._timing_features = {
-                        "execute:test": finish - start - in_drawtime,
+                        "execute:test": runtime,
+                        "overall:gc": in_gctime,
                         **data.draw_times,
                         **data._stateful_run_times,
                     }
 
                 if (current_deadline := self.settings.deadline) is not None:
                     if not is_final:
                         current_deadline = (current_deadline // 4) * 5
-                    if runtime >= current_deadline:
-                        raise DeadlineExceeded(runtime, self.settings.deadline)
+                    if runtime >= current_deadline.total_seconds():
+                        raise DeadlineExceeded(
+                            datetime.timedelta(seconds=runtime), self.settings.deadline
+                        )
                 return result
 
         def run(data):
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/data.py b/hypothesis-python/src/hypothesis/internal/conjecture/data.py
@@ -44,7 +44,11 @@
 from hypothesis.internal.cache import LRUReusedCache
 from hypothesis.internal.compat import add_note, floor, int_from_bytes, int_to_bytes
 from hypothesis.internal.conjecture.floats import float_to_lex, lex_to_float
-from hypothesis.internal.conjecture.junkdrawer import IntList, uniform
+from hypothesis.internal.conjecture.junkdrawer import (
+    IntList,
+    gc_cumulative_time,
+    uniform,
+)
 from hypothesis.internal.conjecture.utils import (
     INT_SIZES,
     INT_SIZES_SAMPLER,
@@ -1980,6 +1984,7 @@ def __init__(
         self.testcounter = global_test_counter
         global_test_counter += 1
         self.start_time = time.perf_counter()
+        self.gc_start_time = gc_cumulative_time()
         self.events: Dict[str, Union[str, int, float]] = {}
         self.forced_indices: "Set[int]" = set()
         self.interesting_origin: Optional[InterestingOrigin] = None
@@ -2420,6 +2425,7 @@ def draw(
             # where we cache something expensive, this led to Flaky deadline errors!
             # See https://github.com/HypothesisWorks/hypothesis/issues/2108
             start_time = time.perf_counter()
+            gc_start_time = gc_cumulative_time()
 
         strategy.validate()
 
@@ -2443,7 +2449,10 @@ def draw(
                 try:
                     return strategy.do_draw(self)
                 finally:
-                    self.draw_times[key] = time.perf_counter() - start_time
+                    # Subtract the time spent in GC to avoid overcounting, as it is
+                    # accounted for at the overall example level.
+                    in_gctime = gc_cumulative_time() - gc_start_time
+                    self.draw_times[key] = time.perf_counter() - start_time - in_gctime
             except Exception as err:
                 add_note(err, f"while generating {key[9:]!r} from {strategy!r}")
                 raise
@@ -2520,6 +2529,7 @@ def freeze(self) -> None:
             assert isinstance(self.buffer, bytes)
             return
         self.finish_time = time.perf_counter()
+        self.gc_finish_time = gc_cumulative_time()
         assert len(self.buffer) == self.index
 
         # Always finish by closing all remaining examples so that we have a
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/engine.py b/hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -168,6 +168,7 @@ class CallStats(TypedDict):
     status: str
     runtime: float
     drawtime: float
+    gctime: float
     events: List[str]
 
 
@@ -298,7 +299,9 @@ def __stoppable_test_function(self, data: ConjectureData) -> None:
         """
         # We ensure that the test has this much stack space remaining, no
         # matter the size of the stack when called, to de-flake RecursionErrors
-        # (#2494, #3671).
+        # (#2494, #3671). Note, this covers the data generation part of the test;
+        # the actual test execution is additionally protected at the call site
+        # in hypothesis.core.execute_once.
         with ensure_free_stackframes():
             try:
                 self._test_function(data)
@@ -430,6 +433,7 @@ def test_function(self, data: ConjectureData) -> None:
                     "status": data.status.name.lower(),
                     "runtime": data.finish_time - data.start_time,
                     "drawtime": math.fsum(data.draw_times.values()),
+                    "gctime": data.gc_finish_time - data.gc_start_time,
                     "events": sorted(
                         k if v == "" else f"{k}: {v}" for k, v in data.events.items()
                     ),
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/junkdrawer.py b/hypothesis-python/src/hypothesis/internal/conjecture/junkdrawer.py
@@ -13,7 +13,9 @@
 anything that lives here, please move it."""
 
 import array
+import gc
 import sys
+import time
 import warnings
 from random import Random
 from typing import (
@@ -413,3 +415,52 @@ def find(self, condition: Callable[[T], bool]) -> T:
                 self.__values.append(value)
                 return value
         raise NotFound("No values satisfying condition")
+
+
+_gc_initialized = False
+_gc_start = 0
+_gc_cumulative_time = 0
+
+
+def gc_cumulative_time() -> float:
+    global _gc_initialized
+    if not _gc_initialized:
+        if hasattr(gc, "callbacks"):
+            # CPython
+            def gc_callback(phase, info):
+                global _gc_start, _gc_cumulative_time
+                try:
+                    now = time.perf_counter()
+                    if phase == "start":
+                        _gc_start = now
+                    elif phase == "stop" and _gc_start > 0:
+                        _gc_cumulative_time += now - _gc_start  # pragma: no cover # ??
+                except RecursionError:  # pragma: no cover
+                    # Avoid flakiness via UnraisableException, which is caught and
+                    # warned by pytest. The actual callback (this function) is
+                    # validated to never trigger a RecursionError itself when
+                    # called by gc.collect.
+                    # Anyway, we should hit the same error on "start"
+                    # and "stop", but to ensure we don't get out of sync we just
+                    # signal that there is no matching start.
+                    _gc_start = 0
+                    return
+
+            gc.callbacks.insert(0, gc_callback)
+        elif hasattr(gc, "hooks"):  # pragma: no cover  # pypy only
+            # PyPy
+            def hook(stats):
+                global _gc_cumulative_time
+                try:
+                    _gc_cumulative_time += stats.duration
+                except RecursionError:
+                    pass
+
+            if gc.hooks.on_gc_minor is None:
+                gc.hooks.on_gc_minor = hook
+            if gc.hooks.on_gc_collect_step is None:
+                gc.hooks.on_gc_collect_step = hook
+
+        _gc_initialized = True
+
+    return _gc_cumulative_time
diff --git a/hypothesis-python/src/hypothesis/internal/escalation.py b/hypothesis-python/src/hypothesis/internal/escalation.py
@@ -87,9 +87,12 @@ def get_trimmed_traceback(exception=None):
     else:
         tb = exception.__traceback__
     # Avoid trimming the traceback if we're in verbose mode, or the error
-    # was raised inside Hypothesis
+    # was raised inside Hypothesis. Additionally, the environment variable
+    # HYPOTHESIS_NO_TRACEBACK_TRIM is respected if nonempty, because verbose
+    # mode is prohibitively slow when debugging strategy recursion errors.
     if (
         tb is None
+        or os.environ.get("HYPOTHESIS_NO_TRACEBACK_TRIM", None)
         or hypothesis.settings.default.verbosity >= hypothesis.Verbosity.debug
         or is_hypothesis_file(traceback.extract_tb(tb)[-1][0])
         and not isinstance(exception, _Trimmable)
diff --git a/hypothesis-python/src/hypothesis/internal/scrutineer.py b/hypothesis-python/src/hypothesis/internal/scrutineer.py
@@ -10,6 +10,7 @@
 
 import functools
 import os
+import re
 import subprocess
 import sys
 import types
@@ -58,15 +59,18 @@ def __init__(self):
         self._previous_location = None
 
     def trace(self, frame, event, arg):
-        if event == "call":
-            return self.trace
-        elif event == "line":
-            # manual inlining of self.trace_line for performance.
-            fname = frame.f_code.co_filename
-            if should_trace_file(fname):
-                current_location = (fname, frame.f_lineno)
-                self.branches.add((self._previous_location, current_location))
-                self._previous_location = current_location
+        try:
+            if event == "call":
+                return self.trace
+            elif event == "line":
+                # manual inlining of self.trace_line for performance.
+                fname = frame.f_code.co_filename
+                if should_trace_file(fname):
+                    current_location = (fname, frame.f_lineno)
+                    self.branches.add((self._previous_location, current_location))
+                    self._previous_location = current_location
+        except RecursionError:
+            pass
 
     def trace_line(self, code: types.CodeType, line_number: int) -> None:
         fname = code.co_filename
@@ -104,19 +108,38 @@ def __exit__(self, *args, **kwargs):
     # a contextmanager; this is probably after the fault has been triggered.
     # Similar reasoning applies to a few other standard-library modules: even
     # if the fault was later, these still aren't useful locations to report!
-    f"{sep}contextlib.py",
-    f"{sep}inspect.py",
-    f"{sep}re.py",
-    f"{sep}re{sep}__init__.py",  # refactored in Python 3.11
-    f"{sep}warnings.py",
+    # Note: The list is post-processed, so use plain "/" for separator here.
+    "/contextlib.py",
+    "/inspect.py",
+    "/re.py",
+    "/re/__init__.py",  # refactored in Python 3.11
+    "/warnings.py",
     # Quite rarely, the first AFNP line is in Pytest's internals.
-    f"{sep}_pytest{sep}assertion{sep}__init__.py",
-    f"{sep}_pytest{sep}assertion{sep}rewrite.py",
-    f"{sep}_pytest{sep}_io{sep}saferepr.py",
-    f"{sep}pluggy{sep}_result.py",
+    "/_pytest/_io/saferepr.py",
+    "/_pytest/assertion/*.py",
+    "/_pytest/config/__init__.py",
+    "/_pytest/pytester.py",
+    "/pluggy/_*.py",
+    "/reprlib.py",
+    "/typing.py",
+    "/conftest.py",
 )
 
 
+def _glob_to_re(locs):
+    """Translate a list of glob patterns to a combined regular expression.
+    Only the * wildcard is supported, and patterns including special
+    characters will only work by chance."""
+    # fnmatch.translate is not an option since its "*" consumes path sep
+    return "|".join(
+        loc.replace("*", r"[^/]+")
+        .replace(".", re.escape("."))
+        .replace("/", re.escape(sep))
+        + r"\Z"  # right anchored
+        for loc in locs
+    )
+
+
 def get_explaining_locations(traces):
     # Traces is a dict[interesting_origin | None, set[frozenset[tuple[str, int]]]]
     # Each trace in the set might later become a Counter instead of frozenset.
@@ -159,8 +182,9 @@ def get_explaining_locations(traces):
     # The last step is to filter out explanations that we know would be uninformative.
     # When this is the first AFNP location, we conclude that Scrutineer missed the
     # real divergence (earlier in the trace) and drop that unhelpful explanation.
+    filter_regex = re.compile(_glob_to_re(UNHELPFUL_LOCATIONS))
     return {
-        origin: {loc for loc in afnp_locs if not loc[0].endswith(UNHELPFUL_LOCATIONS)}
+        origin: {loc for loc in afnp_locs if not filter_regex.search(loc[0])}
         for origin, afnp_locs in explanations.items()
     }
 
diff --git a/hypothesis-python/src/hypothesis/stateful.py b/hypothesis-python/src/hypothesis/stateful.py
@@ -50,6 +50,7 @@
 from hypothesis.internal.compat import add_note
 from hypothesis.internal.conjecture import utils as cu
 from hypothesis.internal.conjecture.engine import BUFFER_SIZE
+from hypothesis.internal.conjecture.junkdrawer import gc_cumulative_time
 from hypothesis.internal.healthcheck import fail_health_check
 from hypothesis.internal.observability import TESTCASE_CALLBACKS
 from hypothesis.internal.reflection import (
@@ -158,6 +159,7 @@ def output(s):
                     must_stop = True
 
                 start_draw = perf_counter()
+                start_gc = gc_cumulative_time()
                 if cd.draw_boolean(p=2**-16, forced=must_stop):
                     break
                 steps_run += 1
@@ -175,7 +177,8 @@ def output(s):
                     rule, data = cd.draw(machine._rules_strategy)
                 draw_label = f"generate:rule:{rule.function.__name__}"
                 cd.draw_times.setdefault(draw_label, 0.0)
-                cd.draw_times[draw_label] += perf_counter() - start_draw
+                in_gctime = gc_cumulative_time() - start_gc
+                cd.draw_times[draw_label] += perf_counter() - start_draw - in_gctime
 
                 # Pretty-print the values this rule was called with *before* calling
                 # _add_result_to_targets, to avoid printing arguments which are also
@@ -196,8 +199,10 @@ def output(s):
 
                     label = f"execute:rule:{rule.function.__name__}"
                     start = perf_counter()
+                    start_gc = gc_cumulative_time()
                     result = rule.function(machine, **data)
-                    cd._stateful_run_times[label] += perf_counter() - start
+                    in_gctime = gc_cumulative_time() - start_gc
+                    cd._stateful_run_times[label] += perf_counter() - start - in_gctime
 
                     if rule.targets:
                         if isinstance(result, MultipleResults):
diff --git a/hypothesis-python/src/hypothesis/statistics.py b/hypothesis-python/src/hypothesis/statistics.py
diff --git a/hypothesis-python/tests/conftest.py b/hypothesis-python/tests/conftest.py
diff --git a/hypothesis-python/tests/conjecture/test_float_encoding.py b/hypothesis-python/tests/conjecture/test_float_encoding.py
diff --git a/hypothesis-python/tests/cover/test_deadline.py b/hypothesis-python/tests/cover/test_deadline.py
diff --git a/hypothesis-python/tests/nocover/test_recursive.py b/hypothesis-python/tests/nocover/test_recursive.py

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@`
`46`	`46`	`},`
`47`	`47`	`"timing": {`
`48`	`48`	`"type": "object",`
`49`		`- "description": "The time in seconds taken by non-overlapping parts of this test case. Hypothesis reports execute:test, and generate:{argname} for each argument.",`
	`49`	`+ "description": "The time in seconds taken by non-overlapping parts of this test case. Hypothesis reports execute:test, overall:gc, and generate:{argname} for each argument.",`
`50`	`50`	`"additionalProperties": {`
`51`	`51`	`"type": "number",`
`52`	`52`	`"minimum": 0`
`@@ -98,4 +98,4 @@`
`98`	`98`	`"additionalProperties": false`
`99`	`99`	`}`
`100`	`100`	`]`
`101`		`-}`
	`101`	`+}`