Skip to content

Commit 6c155a4

Browse files
authored
Merge pull request #4007 from tybug/remove-sub-ir-examples
Remove sub-ir examples
2 parents 395649a + d8c97fc commit 6c155a4

File tree

14 files changed

+151
-113
lines changed

14 files changed

+151
-113
lines changed

hypothesis-python/RELEASE.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
RELEASE_TYPE: patch
2+
3+
This patch improves our deduplication tracking across all strategies (:pull:`4007`). Hypothesis is now less likely to generate the same input twice.

hypothesis-python/src/hypothesis/internal/conjecture/data.py

Lines changed: 18 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -85,20 +85,7 @@ def wrapper(tp):
8585
return wrapper
8686

8787

88-
ONE_BOUND_INTEGERS_LABEL = calc_label_from_name("trying a one-bound int allowing 0")
89-
INTEGER_RANGE_DRAW_LABEL = calc_label_from_name("another draw in integer_range()")
90-
BIASED_COIN_LABEL = calc_label_from_name("biased_coin()")
91-
9288
TOP_LABEL = calc_label_from_name("top")
93-
DRAW_BYTES_LABEL = calc_label_from_name("draw_bytes() in ConjectureData")
94-
DRAW_FLOAT_LABEL = calc_label_from_name("drawing a float")
95-
FLOAT_STRATEGY_DO_DRAW_LABEL = calc_label_from_name(
96-
"getting another float in FloatStrategy"
97-
)
98-
INTEGER_WEIGHTED_DISTRIBUTION = calc_label_from_name(
99-
"drawing from a weighted distribution in integers"
100-
)
101-
10289
InterestingOrigin = Tuple[
10390
Type[BaseException], str, int, Tuple[Any, ...], Tuple[Tuple[Any, ...], ...]
10491
]
@@ -370,11 +357,9 @@ def run(self) -> Any:
370357
blocks = self.examples.blocks
371358
for record in self.examples.trail:
372359
if record == DRAW_BITS_RECORD:
373-
self.__push(0)
374360
self.bytes_read = blocks.endpoints[self.block_count]
375361
self.block(self.block_count)
376362
self.block_count += 1
377-
self.__pop(discarded=False)
378363
elif record == IR_NODE_RECORD:
379364
data = self.examples.ir_nodes[self.ir_node_count]
380365
self.ir_node(data)
@@ -469,8 +454,8 @@ class ExampleRecord:
469454
"""
470455

471456
def __init__(self) -> None:
472-
self.labels = [DRAW_BYTES_LABEL]
473-
self.__index_of_labels: "Optional[Dict[int, int]]" = {DRAW_BYTES_LABEL: 0}
457+
self.labels: List[int] = []
458+
self.__index_of_labels: "Optional[Dict[int, int]]" = {}
474459
self.trail = IntList()
475460
self.ir_nodes: List[IRNode] = []
476461

@@ -522,11 +507,9 @@ def __init__(self, record: ExampleRecord, blocks: "Blocks") -> None:
522507
self.trail = record.trail
523508
self.ir_nodes = record.ir_nodes
524509
self.labels = record.labels
525-
self.__length = (
526-
self.trail.count(STOP_EXAMPLE_DISCARD_RECORD)
527-
+ record.trail.count(STOP_EXAMPLE_NO_DISCARD_RECORD)
528-
+ record.trail.count(DRAW_BITS_RECORD)
529-
)
510+
self.__length = self.trail.count(
511+
STOP_EXAMPLE_DISCARD_RECORD
512+
) + record.trail.count(STOP_EXAMPLE_NO_DISCARD_RECORD)
530513
self.blocks = blocks
531514
self.__children: "Optional[List[Sequence[int]]]" = None
532515

@@ -649,18 +632,23 @@ def start_example(self, i: int, label_index: int) -> None:
649632

650633
class _mutator_groups(ExampleProperty):
651634
def begin(self) -> None:
652-
self.groups: "Dict[Tuple[int, int], List[int]]" = defaultdict(list)
635+
self.groups: "Dict[int, Set[Tuple[int, int]]]" = defaultdict(set)
653636

654637
def start_example(self, i: int, label_index: int) -> None:
655-
depth = len(self.example_stack)
656-
self.groups[label_index, depth].append(i)
638+
# TODO: should we discard start == end cases? These occur for e.g. st.data(),
639+
# which may be drawn from only conditionally, or never. Arguably, swapping
640+
# nodes with the empty list is a useful mutation enabled by start == end.
641+
key = (self.examples[i].ir_start, self.examples[i].ir_end)
642+
self.groups[label_index].add(key)
657643

658-
def finish(self) -> Iterable[Iterable[int]]:
644+
def finish(self) -> Iterable[Set[Tuple[int, int]]]:
659645
# Discard groups with only one example, since the mutator can't
660646
# do anything useful with them.
661647
return [g for g in self.groups.values() if len(g) >= 2]
662648

663-
mutator_groups: List[List[int]] = calculated_example_property(_mutator_groups)
649+
mutator_groups: List[Set[Tuple[int, int]]] = calculated_example_property(
650+
_mutator_groups
651+
)
664652

665653
@property
666654
def children(self) -> List[Sequence[int]]:
@@ -1338,7 +1326,6 @@ def draw_boolean(
13381326

13391327
size = 2**bits
13401328

1341-
self._cd.start_example(BIASED_COIN_LABEL)
13421329
while True:
13431330
# The logic here is a bit complicated and special cased to make it
13441331
# play better with the shrinker.
@@ -1409,7 +1396,6 @@ def draw_boolean(
14091396
result = i > falsey
14101397

14111398
break
1412-
self._cd.stop_example()
14131399
return result
14141400

14151401
def draw_integer(
@@ -1460,24 +1446,20 @@ def draw_integer(
14601446
assert max_value is not None # make mypy happy
14611447
probe = max_value + 1
14621448
while max_value < probe:
1463-
self._cd.start_example(ONE_BOUND_INTEGERS_LABEL)
14641449
probe = shrink_towards + self._draw_unbounded_integer(
14651450
forced=None if forced is None else forced - shrink_towards,
14661451
fake_forced=fake_forced,
14671452
)
1468-
self._cd.stop_example()
14691453
return probe
14701454

14711455
if max_value is None:
14721456
assert min_value is not None
14731457
probe = min_value - 1
14741458
while probe < min_value:
1475-
self._cd.start_example(ONE_BOUND_INTEGERS_LABEL)
14761459
probe = shrink_towards + self._draw_unbounded_integer(
14771460
forced=None if forced is None else forced - shrink_towards,
14781461
fake_forced=fake_forced,
14791462
)
1480-
self._cd.stop_example()
14811463
return probe
14821464

14831465
return self._draw_bounded_integer(
@@ -1518,7 +1500,6 @@ def draw_float(
15181500
assert self._cd is not None
15191501

15201502
while True:
1521-
self._cd.start_example(FLOAT_STRATEGY_DO_DRAW_LABEL)
15221503
# If `forced in nasty_floats`, then `forced` was *probably*
15231504
# generated by drawing a nonzero index from the sampler. However, we
15241505
# have no obligation to generate it that way when forcing. In particular,
@@ -1530,7 +1511,6 @@ def draw_float(
15301511
if sampler
15311512
else 0
15321513
)
1533-
self._cd.start_example(DRAW_FLOAT_LABEL)
15341514
if i == 0:
15351515
result = self._draw_float(
15361516
forced_sign_bit=forced_sign_bit,
@@ -1546,8 +1526,6 @@ def draw_float(
15461526
assert pos_clamper is not None
15471527
clamped = pos_clamper(result)
15481528
if clamped != result and not (math.isnan(result) and allow_nan):
1549-
self._cd.stop_example()
1550-
self._cd.start_example(DRAW_FLOAT_LABEL)
15511529
self._draw_float(forced=clamped, fake_forced=fake_forced)
15521530
result = clamped
15531531
else:
@@ -1576,8 +1554,6 @@ def draw_float(
15761554

15771555
self._draw_float(forced=result, fake_forced=fake_forced)
15781556

1579-
self._cd.stop_example() # (DRAW_FLOAT_LABEL)
1580-
self._cd.stop_example() # (FLOAT_STRATEGY_DO_DRAW_LABEL)
15811557
return result
15821558

15831559
def draw_string(
@@ -1771,7 +1747,6 @@ def _draw_bounded_integer(
17711747
7 / 8, forced=None if forced is None else False, fake_forced=fake_forced
17721748
)
17731749
):
1774-
self._cd.start_example(INTEGER_WEIGHTED_DISTRIBUTION)
17751750
# For large ranges, we combine the uniform random distribution from draw_bits
17761751
# with a weighting scheme with moderate chance. Cutoff at 2 ** 24 so that our
17771752
# choice of unicode characters is uniform but the 32bit distribution is not.
@@ -1782,18 +1757,15 @@ def _draw_bounded_integer(
17821757
upper=center if not above else min(upper, center + 2**force_bits - 1),
17831758
_vary_effective_size=False,
17841759
)
1785-
self._cd.stop_example()
17861760

17871761
assert lower <= forced <= upper
17881762

17891763
while probe > gap:
1790-
self._cd.start_example(INTEGER_RANGE_DRAW_LABEL)
17911764
probe = self._cd.draw_bits(
17921765
bits,
17931766
forced=None if forced is None else abs(forced - center),
17941767
fake_forced=fake_forced,
17951768
)
1796-
self._cd.stop_example()
17971769

17981770
if above:
17991771
result = center + probe
@@ -1938,12 +1910,13 @@ def for_ir_tree(
19381910
*,
19391911
observer: Optional[DataObserver] = None,
19401912
provider: Union[type, PrimitiveProvider] = HypothesisProvider,
1913+
max_length: Optional[int] = None,
19411914
) -> "ConjectureData":
19421915
from hypothesis.internal.conjecture.engine import BUFFER_SIZE
19431916

19441917
return cls(
1945-
BUFFER_SIZE,
1946-
b"",
1918+
max_length=BUFFER_SIZE if max_length is None else max_length,
1919+
prefix=b"",
19471920
random=None,
19481921
ir_tree_prefix=ir_tree_prefix,
19491922
observer=observer,

hypothesis-python/src/hypothesis/internal/conjecture/engine.py

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -381,16 +381,30 @@ def _cache(self, data: ConjectureData) -> None:
381381
self.__data_cache_ir[key] = result
382382

383383
def cached_test_function_ir(
384-
self, nodes: List[IRNode]
384+
self, nodes: List[IRNode], *, error_on_discard: bool = False
385385
) -> Union[ConjectureResult, _Overrun]:
386386
key = self._cache_key_ir(nodes=nodes)
387387
try:
388388
return self.__data_cache_ir[key]
389389
except KeyError:
390390
pass
391391

392+
# explicitly use a no-op DataObserver here instead of a TreeRecordingObserver.
393+
# The reason is we don't expect simulate_test_function to explore new choices
394+
# and write back to the tree, so we don't want the overhead of the
395+
# TreeRecordingObserver tracking those calls.
396+
trial_observer: Optional[DataObserver] = DataObserver()
397+
if error_on_discard:
398+
399+
class DiscardObserver(DataObserver):
400+
@override
401+
def kill_branch(self) -> NoReturn:
402+
raise ContainsDiscard
403+
404+
trial_observer = DiscardObserver()
405+
392406
try:
393-
trial_data = self.new_conjecture_data_ir(nodes)
407+
trial_data = self.new_conjecture_data_ir(nodes, observer=trial_observer)
394408
self.tree.simulate_test_function(trial_data)
395409
except PreviouslyUnseenBehaviour:
396410
pass
@@ -1063,13 +1077,24 @@ def generate_mutations_from(
10631077

10641078
group = self.random.choice(groups)
10651079

1066-
ex1, ex2 = (
1067-
data.examples[i] for i in sorted(self.random.sample(group, 2))
1068-
)
1069-
assert ex1.end <= ex2.start
1080+
(start1, end1), (start2, end2) = self.random.sample(sorted(group), 2)
1081+
if (start1 <= start2 <= end2 <= end1) or (
1082+
start2 <= start1 <= end1 <= end2
1083+
):
1084+
# One example entirely contains the other, so give up on this attempt.
1085+
# TODO: use a more intelligent mutation for containment, like
1086+
# replacing the child with the parent or vice versa. This would allow for
1087+
# recursive / subtree mutation.
1088+
failed_mutations += 1
1089+
continue
10701090

1071-
e = self.random.choice([ex1, ex2])
1072-
replacement = data.buffer[e.start : e.end]
1091+
if start1 > start2:
1092+
(start1, end1), (start2, end2) = (start2, end2), (start1, end1)
1093+
assert end1 <= start2
1094+
1095+
nodes = data.examples.ir_tree_nodes
1096+
(start, end) = self.random.choice([(start1, end1), (start2, end2)])
1097+
replacement = nodes[start:end]
10731098

10741099
try:
10751100
# We attempt to replace both the examples with
@@ -1080,17 +1105,16 @@ def generate_mutations_from(
10801105
# really matter. It may not achieve the desired result,
10811106
# but it's still a perfectly acceptable choice sequence
10821107
# to try.
1083-
new_data = self.cached_test_function(
1084-
data.buffer[: ex1.start]
1108+
new_data = self.cached_test_function_ir(
1109+
nodes[:start1]
10851110
+ replacement
1086-
+ data.buffer[ex1.end : ex2.start]
1111+
+ nodes[end1:start2]
10871112
+ replacement
1088-
+ data.buffer[ex2.end :],
1113+
+ nodes[end2:],
10891114
# We set error_on_discard so that we don't end up
10901115
# entering parts of the tree we consider redundant
10911116
# and not worth exploring.
10921117
error_on_discard=True,
1093-
extend=BUFFER_SIZE,
10941118
)
10951119
except ContainsDiscard:
10961120
failed_mutations += 1
@@ -1184,6 +1208,7 @@ def new_conjecture_data_ir(
11841208
ir_tree_prefix: List[IRNode],
11851209
*,
11861210
observer: Optional[DataObserver] = None,
1211+
max_length: Optional[int] = None,
11871212
) -> ConjectureData:
11881213
provider = (
11891214
HypothesisProvider if self._switch_to_hypothesis_provider else self.provider
@@ -1193,7 +1218,7 @@ def new_conjecture_data_ir(
11931218
observer = DataObserver()
11941219

11951220
return ConjectureData.for_ir_tree(
1196-
ir_tree_prefix, observer=observer, provider=provider
1221+
ir_tree_prefix, observer=observer, provider=provider, max_length=max_length
11971222
)
11981223

11991224
def new_conjecture_data(
@@ -1331,7 +1356,6 @@ def cached_test_function(
13311356
self,
13321357
buffer: Union[bytes, bytearray],
13331358
*,
1334-
error_on_discard: bool = False,
13351359
extend: int = 0,
13361360
) -> Union[ConjectureResult, _Overrun]:
13371361
"""Checks the tree to see if we've tested this buffer, and returns the
@@ -1370,18 +1394,7 @@ def check_result(
13701394
except KeyError:
13711395
pass
13721396

1373-
observer: DataObserver
1374-
if error_on_discard:
1375-
1376-
class DiscardObserver(DataObserver):
1377-
@override
1378-
def kill_branch(self) -> NoReturn:
1379-
raise ContainsDiscard
1380-
1381-
observer = DiscardObserver()
1382-
else:
1383-
observer = DataObserver()
1384-
1397+
observer = DataObserver()
13851398
dummy_data = self.new_conjecture_data(
13861399
prefix=buffer, max_length=max_length, observer=observer
13871400
)

hypothesis-python/src/hypothesis/internal/conjecture/optimiser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,12 @@ def attempt_replace(v):
136136

137137
for i, ex in enumerate(self.current_data.examples):
138138
if ex.start >= block.end:
139-
break
139+
break # pragma: no cover
140140
if ex.end <= block.start:
141141
continue
142142
ex_attempt = attempt.examples[i]
143143
if ex.length == ex_attempt.length:
144-
continue
144+
continue # pragma: no cover
145145
replacement = attempt.buffer[ex_attempt.start : ex_attempt.end]
146146
if self.consider_new_test_data(
147147
self.engine.cached_test_function(

0 commit comments

Comments
 (0)