Mark failures for crosshair to fix?

Zac-HD · Zac-HD · commit 755486ab9ea6 · 2025-03-23T19:43:21.000-07:00
diff --git a/hypothesis-python/tests/common/utils.py b/hypothesis-python/tests/common/utils.py
@@ -257,6 +257,7 @@ class Why(enum.Enum):
     # Some are crosshair issues, some hypothesis issues, others truly ok-to-xfail tests.
     symbolic_outside_context = "CrosshairInternal error (using value outside context)"
     nested_given = "nested @given decorators don't work with crosshair"
+    undiscovered = "crosshair may not find the failing input"
     other = "reasons not elsewhere categorized"
 
 
@@ -269,7 +270,7 @@ def xfail_on_crosshair(why: Why, /, *, strict=True, as_marks=False):
 
     current_backend = settings.get_profile(settings._current_profile).backend
     kw = {
-        "strict": strict,
+        "strict": strict and why != Why.undiscovered,
         "reason": f"Expected failure due to: {why.value}",
         "condition": current_backend == "crosshair",
     }
diff --git a/hypothesis-python/tests/cover/test_datetimes.py b/hypothesis-python/tests/cover/test_datetimes.py
@@ -16,6 +16,7 @@
 from hypothesis.strategies import dates, datetimes, timedeltas, times
 
 from tests.common.debug import assert_simple_property, find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 
 def test_can_find_positive_delta():
@@ -104,6 +105,7 @@ def test_single_date(val):
     assert find_any(dates(val, val)) is val
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_can_find_midnight():
     find_any(times(), lambda x: x.hour == x.minute == x.second == 0)
 
diff --git a/hypothesis-python/tests/cover/test_direct_strategies.py b/hypothesis-python/tests/cover/test_direct_strategies.py
@@ -23,6 +23,7 @@
 from hypothesis.vendor.pretty import pretty
 
 from tests.common.debug import check_can_generate_examples, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 # Use `pretty` instead of `repr` for building test names, so that set and dict
 # parameters print consistently across multiple worker processes with different
@@ -437,6 +438,7 @@ def test_decimals():
     assert minimal(st.decimals(), lambda f: f.is_finite() and f >= 1) == 1
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_non_float_decimal():
     minimal(st.decimals(), lambda d: d.is_finite() and decimal.Decimal(float(d)) != d)
 
diff --git a/hypothesis-python/tests/cover/test_filter_rewriting.py b/hypothesis-python/tests/cover/test_filter_rewriting.py
@@ -182,6 +182,7 @@ def test_rewrite_unsatisfiable_filter(s, pred):
     assert s.filter(pred).is_empty
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize(
     "pred",
     [
diff --git a/hypothesis-python/tests/cover/test_lookup.py b/hypothesis-python/tests/cover/test_lookup.py
@@ -876,6 +876,7 @@ def test_supportsop_types_support_protocol(protocol, data):
     assert issubclass(type(value), protocol)
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize("restrict_custom_strategy", [True, False])
 def test_generic_aliases_can_be_conditionally_resolved_by_registered_function(
     restrict_custom_strategy,
diff --git a/hypothesis-python/tests/cover/test_reproduce_failure.py b/hypothesis-python/tests/cover/test_reproduce_failure.py
@@ -175,6 +175,7 @@ def test(data):
     assert "@reproduce_failure" not in o.getvalue()
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_does_not_print_reproduction_for_large_data_examples_by_default():
     @settings(phases=no_shrink, print_blob=False)
     @given(st.data())
diff --git a/hypothesis-python/tests/cover/test_sampled_from.py b/hypothesis-python/tests/cover/test_sampled_from.py
@@ -141,6 +141,7 @@ def stupid_sampled_sets(draw):
     return result
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @given(stupid_sampled_sets())
 def test_efficient_sets_of_samples_with_chained_transformations_slow_path(x):
     # This deliberately exercises the standard filtering logic without going
diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py
@@ -1285,6 +1285,7 @@ def fail_fast(self, a1, a2, a3, b1, b2, b3):
     )
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_multiple_common_targets():
     class Machine(RuleBasedStateMachine):
         a = Bundle("a")
diff --git a/hypothesis-python/tests/cover/test_targeting.py b/hypothesis-python/tests/cover/test_targeting.py
@@ -16,6 +16,8 @@
 from hypothesis.control import current_build_context
 from hypothesis.errors import InvalidArgument
 
+from tests.common.utils import Why, xfail_on_crosshair
+
 
 @example(0.0, "this covers the branch where context.data is None")
 @given(
@@ -100,6 +102,7 @@ def test_cannot_target_same_label_twice(_):
         target(1.0, label="label")
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @given(st.none())
 def test_cannot_target_default_label_twice(_):
     target(0.0)
diff --git a/hypothesis-python/tests/cover/test_testdecorators.py b/hypothesis-python/tests/cover/test_testdecorators.py
@@ -149,6 +149,7 @@ def test_can_be_given_keyword_args(x, name):
     assert len(name) < x
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(one_of(floats(), booleans()), one_of(floats(), booleans()))
 def test_one_of_produces_different_values(x, y):
@@ -196,6 +197,7 @@ def test_removing_an_element_from_a_unique_list(xs, y):
     assert y not in xs
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(lists(integers(), min_size=2), data())
 def test_removing_an_element_from_a_non_unique_list(xs, data):
@@ -219,6 +221,7 @@ def test_can_mix_sampling_with_generating(x, y):
     assert type(x) == type(y)
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(frozensets(integers()))
 def test_can_find_large_sum_frozenset(xs):
diff --git a/hypothesis-python/tests/datetime/test_dateutil_timezones.py b/hypothesis-python/tests/datetime/test_dateutil_timezones.py
@@ -109,6 +109,7 @@ def test_dateutil_exists_our_not_exists_are_inverse(value):
     assert datetime_does_not_exist(value) == (not tz.datetime_exists(value))
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_datetimes_can_exclude_imaginary():
     find_any(
         datetimes(**DAY_WITH_IMAGINARY_HOUR_KWARGS, allow_imaginary=True),
@@ -120,6 +121,7 @@ def test_datetimes_can_exclude_imaginary():
     )
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails_with(FailedHealthCheck)
 @given(
     datetimes(
diff --git a/hypothesis-python/tests/datetime/test_pytz_timezones.py b/hypothesis-python/tests/datetime/test_pytz_timezones.py
@@ -104,6 +104,7 @@ def test_time_bounds_must_be_naive(name, val):
         times(**{name: val}).validate()
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize(
     "bound",
     [
diff --git a/hypothesis-python/tests/datetime/test_zoneinfo_timezones.py b/hypothesis-python/tests/datetime/test_zoneinfo_timezones.py
@@ -17,12 +17,14 @@
 from hypothesis.errors import InvalidArgument
 
 from tests.common.debug import assert_no_examples, find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 
 def test_utc_is_minimal():
     assert minimal(st.timezones()) is zoneinfo.ZoneInfo("UTC")
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_can_generate_non_utc():
     find_any(
         st.datetimes(timezones=st.timezones()).filter(lambda d: d.tzinfo.key != "UTC")
diff --git a/hypothesis-python/tests/nocover/test_characters.py b/hypothesis-python/tests/nocover/test_characters.py
@@ -15,6 +15,8 @@
 
 from hypothesis import given, settings, strategies as st
 
+from tests.common.utils import Why, xfail_on_crosshair
+
 IDENTIFIER_CHARS = string.ascii_letters + string.digits + "_"
 
 
@@ -23,6 +25,7 @@ def test_large_blacklist(c):
     assert c not in IDENTIFIER_CHARS
 
 
+@xfail_on_crosshair(Why.symbolic_outside_context)  # seems like a crosshair bug here
 @given(st.data())
 def test_arbitrary_blacklist(data):
     blacklist = data.draw(st.text(st.characters(max_codepoint=1000), min_size=1))
diff --git a/hypothesis-python/tests/nocover/test_database_usage.py b/hypothesis-python/tests/nocover/test_database_usage.py
@@ -32,6 +32,7 @@ def has_a_non_zero_byte(x):
     return any(bytes(x))
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_saves_incremental_steps_in_database():
     key = b"a database key"
     database = InMemoryExampleDatabase()
diff --git a/hypothesis-python/tests/nocover/test_duplication.py b/hypothesis-python/tests/nocover/test_duplication.py
@@ -15,6 +15,8 @@
 from hypothesis import given, settings
 from hypothesis.strategies._internal import SearchStrategy
 
+from tests.common.utils import Why, xfail_on_crosshair
+
 
 class Blocks(SearchStrategy):
     def __init__(self, n):
@@ -37,6 +39,7 @@ def test(b):
     assert set(counts.values()) == {1}
 
 
+@xfail_on_crosshair(Why.other, strict=False)  # CrosshairInternal for n>0
 @pytest.mark.parametrize("n", range(1, 5))
 def test_mostly_does_not_duplicate_blocks_even_when_failing(n):
     counts = Counter()
diff --git a/hypothesis-python/tests/nocover/test_flatmap.py b/hypothesis-python/tests/nocover/test_flatmap.py
@@ -26,6 +26,7 @@
 )
 
 from tests.common.debug import find_any, minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 ConstantLists = integers().flatmap(lambda i: lists(just(i)))
 
@@ -47,15 +48,13 @@ def test_in_order(x):
     assert x[0] < x[1]
 
 
+# crosshair just generates increasingly-long lists of [0.0]
+@xfail_on_crosshair(Why.undiscovered)
 def test_flatmap_retrieve_from_db():
-    constant_float_lists = floats(0, 1).flatmap(lambda x: lists(just(x)))
-
     track = []
 
-    db = ExampleDatabase()
-
-    @given(constant_float_lists)
-    @settings(database=db)
+    @given(floats(0, 1).flatmap(lambda x: lists(just(x))))
+    @settings(database=ExampleDatabase())
     def record_and_test_size(xs):
         if sum(xs) >= 1:
             track.append(xs)
@@ -98,6 +97,7 @@ def criterion(ls):
     assert set(result) == {False, ""}
 
 
+@xfail_on_crosshair(Why.undiscovered)  # for n >= 8 at least
 @pytest.mark.parametrize("n", range(1, 10))
 def test_can_shrink_through_a_binding(n):
     bool_lists = integers(0, 100).flatmap(
@@ -106,6 +106,7 @@ def test_can_shrink_through_a_binding(n):
     assert minimal(bool_lists, lambda x: x.count(True) >= n) == [True] * n
 
 
+@xfail_on_crosshair(Why.undiscovered)  # for n >= 8 at least
 @pytest.mark.parametrize("n", range(1, 10))
 def test_can_delete_in_middle_of_a_binding(n):
     bool_lists = integers(1, 100).flatmap(
diff --git a/hypothesis-python/tests/nocover/test_floating.py b/hypothesis-python/tests/nocover/test_floating.py
@@ -20,7 +20,7 @@
 from hypothesis.strategies import data, floats, lists
 
 from tests.common.debug import find_any
-from tests.common.utils import fails
+from tests.common.utils import Why, fails, xfail_on_crosshair
 
 TRY_HARDER = settings(
     max_examples=1000, suppress_health_check=[HealthCheck.filter_too_much]
@@ -93,6 +93,7 @@ def test_is_not_int(x):
     assert x != int(x)
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @fails
 @given(floats())
 @TRY_HARDER
@@ -128,6 +129,7 @@ def test_floats_are_in_range(x, y, data):
     assert x <= t <= y
 
 
+@xfail_on_crosshair(Why.undiscovered)
 @pytest.mark.parametrize("neg", [False, True])
 @pytest.mark.parametrize("snan", [False, True])
 def test_can_find_negative_and_signaling_nans(neg, snan):
diff --git a/hypothesis-python/tests/nocover/test_recursive.py b/hypothesis-python/tests/nocover/test_recursive.py
@@ -17,7 +17,7 @@
 from hypothesis import HealthCheck, given, settings, strategies as st
 
 from tests.common.debug import find_any, minimal
-from tests.common.utils import flaky
+from tests.common.utils import Why, flaky, xfail_on_crosshair
 
 
 def test_can_generate_with_large_branching():
@@ -79,6 +79,7 @@ def test_drawing_many_near_boundary():
     assert len(ls) == size
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_can_use_recursive_data_in_sets():
     nested_sets = st.recursive(st.booleans(), st.frozensets, max_leaves=3)
     find_any(nested_sets, settings=settings(deadline=None))
diff --git a/hypothesis-python/tests/nocover/test_regressions.py b/hypothesis-python/tests/nocover/test_regressions.py
@@ -14,7 +14,10 @@
 from hypothesis._settings import note_deprecation
 from hypothesis.errors import HypothesisDeprecationWarning
 
+from tests.common.utils import Why, xfail_on_crosshair
 
+
+@xfail_on_crosshair(Why.other)
 def test_note_deprecation_blames_right_code_issue_652():
     msg = "this is an arbitrary deprecation warning message"
 
@@ -58,6 +61,8 @@ def test_unique_floats_with_nan_is_not_flaky_3926(ls):
 
 # this will take a while to find the regression, but will eventually trigger it.
 # min_value=0 is critical to trigger the probing behavior which exhausts our buffer.
+# https://github.com/pschanely/CrossHair/issues/285 for an upstream fix.
+@xfail_on_crosshair(Why.other, strict=False)
 @given(st.integers(min_value=0, max_value=1 << 25_000))
 def test_overrun_during_datatree_simulation_3874(n):
     pass
diff --git a/hypothesis-python/tests/nocover/test_sampled_from.py b/hypothesis-python/tests/nocover/test_sampled_from.py
@@ -21,7 +21,7 @@
 from hypothesis.strategies._internal.strategies import SampledFromStrategy
 
 from tests.common.debug import find_any, minimal
-from tests.common.utils import fails_with
+from tests.common.utils import Why, fails_with, xfail_on_crosshair
 
 
 @pytest.mark.parametrize("size", [100, 10**5, 10**6, 2**25])
@@ -101,6 +101,7 @@ def test_flag_enum_repr_uses_class_not_a_list():
     assert lazy_repr == "sampled_from(tests.nocover.test_sampled_from.AFlag)"
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_exhaustive_flags():
     # Generate powerset of flag combinations. There are only 2^3 of them, so
     # we can reasonably expect that they are all found.
diff --git a/hypothesis-python/tests/nocover/test_simple_numbers.py b/hypothesis-python/tests/nocover/test_simple_numbers.py
@@ -17,6 +17,7 @@
 from hypothesis.strategies import floats, integers, lists
 
 from tests.common.debug import minimal
+from tests.common.utils import Why, xfail_on_crosshair
 
 
 def test_minimize_negative_int():
@@ -116,6 +117,7 @@ def test_can_minimal_infinite_negative_float():
     assert minimal(floats(), lambda x: x < -sys.float_info.max)
 
 
+@xfail_on_crosshair(Why.undiscovered)  # sometimes
 def test_can_minimal_float_on_boundary_of_representable():
     minimal(floats(), lambda x: x + 1 == x and not math.isinf(x))
 
@@ -153,6 +155,7 @@ def test_minimal_fractional_float():
     assert minimal(floats(), lambda x: x >= 1.5) == 2
 
 
+@xfail_on_crosshair(Why.undiscovered)
 def test_minimizes_lists_of_negative_ints_up_to_boundary():
     result = minimal(
         lists(integers(), min_size=10),

Original file line number	Diff line number	Diff line change
`@@ -182,6 +182,7 @@ def test_rewrite_unsatisfiable_filter(s, pred):`
`182`	`182`	`assert s.filter(pred).is_empty`
`183`	`183`
`184`	`184`
	`185`	`+@xfail_on_crosshair(Why.undiscovered)`
`185`	`186`	`@pytest.mark.parametrize(`
`186`	`187`	`"pred",`
`187`	`188`	`[`
Original file line number	Diff line number	Diff line change
`@@ -1285,6 +1285,7 @@ def fail_fast(self, a1, a2, a3, b1, b2, b3):`
`1285`	`1285`	`)`
`1286`	`1286`
`1287`	`1287`
	`1288`	`+@xfail_on_crosshair(Why.undiscovered)`
`1288`	`1289`	`def test_multiple_common_targets():`
`1289`	`1290`	`class Machine(RuleBasedStateMachine):`
`1290`	`1291`	`a = Bundle("a")`
Original file line number	Diff line number	Diff line change
`@@ -104,6 +104,7 @@ def test_time_bounds_must_be_naive(name, val):`
`104`	`104`	`times(**{name: val}).validate()`
`105`	`105`
`106`	`106`
	`107`	`+@xfail_on_crosshair(Why.undiscovered)`
`107`	`108`	`@pytest.mark.parametrize(`
`108`	`109`	`"bound",`
`109`	`110`	`[`