Commit ec02c47

Merge pull request #1752 from HypothesisWorks/DRMacIver/factor-out-tree
Factor out tree logic from ConjectureRunner
2 parents 4f3490e + b92592b commit ec02c47

File tree

5 files changed: +322 -250 lines changed


hypothesis-python/RELEASE.rst

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
RELEASE_TYPE: patch

This is a pure refactoring release that extracts some logic from the core Hypothesis engine
into its own class and file. It should have no user-visible impact.
Lines changed: 297 additions & 0 deletions
@@ -0,0 +1,297 @@
# coding=utf-8
#
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2019 David R. MacIver
# ([email protected]), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

from __future__ import absolute_import, division, print_function

from hypothesis.internal.compat import hbytes, hrange
from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status


class DataTree(object):
    """Tracks the tree structure of a collection of ConjectureData
    objects, for use in ConjectureRunner."""

    def __init__(self, cap):
        self.cap = cap

        # Previously-tested byte streams are recorded in a prefix tree, so that
        # we can:
        # - Avoid testing the same stream twice (in some cases).
        # - Avoid testing a prefix of a past stream (in some cases),
        #   since that should only result in overrun.
        # - Generate stream prefixes that we haven't tried before.

        # Tree nodes are stored in an array to prevent heavy nesting of data
        # structures. Branches are dicts mapping bytes to child nodes (which
        # will in general only be partially populated). Leaves are
        # ConjectureData objects that have been previously seen as the result
        # of following that path.
        self.nodes = [{}]
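        # Illustrative sketch (not a literal repr): after adding the two
        # one-byte buffers b"\x00" and b"\x01", each of which produced a
        # complete test result, this would look like
        # [{0: 1, 1: 2}, <data for b"\x00">, <data for b"\x01">].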

        # A node is dead if there is nothing left to explore past that point.
        # Recursively, a node is dead if either it is a leaf or every byte
        # leads to a dead node when starting from here.
        self.dead = set()

        # We rewrite the byte stream at various points during parsing, to one
        # that will produce an equivalent result but is in some sense more
        # canonical. We keep track of these so that when walking the tree we
        # can identify nodes where the exact byte value doesn't matter and
        # treat all bytes there as equivalent. This significantly reduces the
        # size of the search space and removes a lot of redundant examples.

        # Maps tree indices to the unique byte that is valid at that
        # point. Corresponds to data.write() calls.
        self.forced = {}

        # Maps tree indices to a mask that restricts bytes at that point.
        # Currently this is only updated by draw_bits, but it potentially
        # could get used elsewhere.
        self.masks = {}
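        # Illustrative example: masks[n] == 0x0F would mean that only the
        # low four bits are significant at node n, so e.g. the bytes 0x12
        # and 0x02 are treated as the same choice there.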

        # Where a tree node consists of the beginning of a block we track the
        # size of said block. This allows us to tell when an example is too
        # short even if it goes off the unexplored region of the tree - if it
        # is at the beginning of a block of size 4 but only has 3 bytes left,
        # it's going to overrun the end of the buffer regardless of the
        # buffer contents.
        self.block_sizes = {}

    @property
    def is_exhausted(self):
        """Returns True if every possible node is dead and thus the language
        described must have been fully explored."""
        return 0 in self.dead

    def add(self, data):
        """Add a ConjectureData object to the current collection."""

        # First, iterate through the result's buffer, to create the node that
        # will hold this result. Also note any forced or masked bytes.
        tree_node = self.nodes[0]
        indices = []
        node_index = 0
        for i, b in enumerate(data.buffer):
            # We build a list of all the node indices visited on our path
            # through the tree, since we'll need to refer to them later.
            indices.append(node_index)

            # If this buffer position was forced or masked, then mark its
            # corresponding node as forced/masked.
            if i in data.forced_indices:
                self.forced[node_index] = b
            try:
                self.masks[node_index] = data.masked_indices[i]
            except KeyError:
                pass

            try:
                # Use the current byte to find the next node on our path.
                node_index = tree_node[b]
            except KeyError:
                # That node doesn't exist yet, so create it.
                node_index = len(self.nodes)
                # Create a new branch node. If this should actually be a leaf
                # node, it will be overwritten when we store the result.
                self.nodes.append({})
                tree_node[b] = node_index

            tree_node = self.nodes[node_index]

            if node_index in self.dead:
                # This part of the tree has already been marked as dead, so
                # there's no need to traverse any deeper.
                break

        # At each node that begins a block, record the size of that block.
        for u, v in data.all_block_bounds():
            # This can happen if we hit a dead node when walking the buffer.
            # In that case we already have this section of the tree mapped.
            if u >= len(indices):
                break
            self.block_sizes[indices[u]] = v - u

        # Forcibly mark all nodes beyond the zero-bound point as dead,
        # because we don't intend to try any other values there.
        self.dead.update(indices[self.cap :])

        # Now store this result in the tree (if appropriate), and check if
        # any nodes need to be marked as dead.
        if data.status != Status.OVERRUN and node_index not in self.dead:
            # Mark this node as dead, because it produced a result.
            # Trying to explore suffixes of it would not be helpful.
            self.dead.add(node_index)
            # Store the result in the tree as a leaf. This will overwrite the
            # branch node that was created during traversal.
            self.nodes[node_index] = data

            # Review the traversed nodes, to see if any should be marked
            # as dead. We check them in reverse order, because as soon as we
            # find a live node, all nodes before it must still be live too.
            for j in reversed(indices):
                mask = self.masks.get(j, 0xFF)
                assert _is_simple_mask(mask)
                max_size = mask + 1
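                # E.g. a (simple) mask of 0x0F admits 16 distinct byte values
                # at this node, so the node can only become exhausted once all
                # 16 children exist.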

                if len(self.nodes[j]) < max_size and j not in self.forced:
                    # There are still byte values to explore at this node,
                    # so it isn't dead yet.
                    break
                if set(self.nodes[j].values()).issubset(self.dead):
                    # Everything beyond this node is known to be dead,
                    # and there are no more values to explore here (see above),
                    # so this node must be dead too.
                    self.dead.add(j)
                else:
                    # Even though all of this node's possible values have been
                    # tried, there are still some deeper nodes that remain
                    # alive, so this node isn't dead yet.
                    break

    def generate_novel_prefix(self, random):
        """Generate a short random string that (after rewriting) is not
        a prefix of any buffer previously added to the tree."""
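        # Sketch of the walk below: at each position we draw a byte. A byte
        # that leads off the known tree ends the prefix; a byte whose subtree
        # is dead is re-drawn from the still-live choices; a known, live byte
        # means we keep walking.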
        assert not self.is_exhausted
        prefix = bytearray()
        node = 0
        while True:
            assert len(prefix) < self.cap
            assert node not in self.dead

            # Figure out the range of byte values we should be trying.
            # Normally this will be 0-255, unless the current position has a
            # mask.
            mask = self.masks.get(node, 0xFF)
            assert _is_simple_mask(mask)
            upper_bound = mask + 1

            try:
                c = self.forced[node]
                # This position has a forced byte value, so trying a different
                # value wouldn't be helpful. Just add the forced byte, and
                # move on to the next position.
                prefix.append(c)
                node = self.nodes[node][c]
                continue
            except KeyError:
                pass

            # Provisionally choose the next byte value.
            # This will change later if we find that it was a bad choice.
            c = random.randrange(0, upper_bound)

            try:
                next_node = self.nodes[node][c]
                if next_node in self.dead:
                    # Whoops, the byte value we chose for this position has
                    # already been fully explored. Let's pick a new value, and
                    # this time choose a value that's definitely still alive.
                    choices = [
                        b
                        for b in hrange(upper_bound)
                        if self.nodes[node].get(b) not in self.dead
                    ]
                    assert choices
                    c = random.choice(choices)
                    node = self.nodes[node][c]
                else:
                    # The byte value we chose is in the tree, but it still has
                    # some unexplored descendants, so it's a valid choice.
                    node = next_node
                prefix.append(c)
            except KeyError:
                # The byte value we chose isn't in the tree at this position,
                # which means we've successfully found a novel prefix.
                prefix.append(c)
                break
        assert node not in self.dead
        return hbytes(prefix)

    def lookup(self, buffer):
        """Look up the result of running buffer for a test function that
        produced the stored data objects. Returns either:

        * a stored ConjectureData object of status >= INVALID which
          could have resulted from running buffer.
        * Overrun if running a test function which produced one of the
          stored data objects must necessarily result in a status of
          OVERRUN.
        * None if we cannot infer what the result of running buffer would
          be from the previously seen ConjectureData values.
        """
        rewritten = bytearray()
        would_overrun = False

        node_index = 0
        for i, c in enumerate(buffer):
            # If there's a forced value or a mask at this position, then
            # pretend that the buffer already contains a matching value,
            # because the test function is going to do the same.
            try:
                c = self.forced[node_index]
            except KeyError:
                pass
            try:
                c = c & self.masks[node_index]
            except KeyError:
                pass

            try:
                # If we know how many bytes are read at this point and
                # there aren't enough, then it doesn't actually matter
                # what the values are, we're definitely going to overrun.
                if i + self.block_sizes[node_index] > len(buffer):
                    would_overrun = True
                    break
            except KeyError:
                pass

            rewritten.append(c)

            try:
                node_index = self.nodes[node_index][c]
            except KeyError:
                # The byte at this position isn't in the tree, which means
                # we haven't tested this buffer. Break out of the tree
                # traversal, and run the test function normally.
                break
            node = self.nodes[node_index]
            if isinstance(node, ConjectureData):
                # This buffer (or a prefix of it) has already been tested.
                # Return the stored result instead of trying it again.
                assert node.status != Status.OVERRUN
                return node
        else:
            # Falling off the end of this loop means that we're about to test
            # a prefix of a previously-tested byte stream, so the test would
            # overrun.
            would_overrun = True

        if would_overrun:
            return Overrun
        else:
            return None


def _is_simple_mask(mask):
    """A simple mask is ``(2 ** n - 1)`` for some ``n``, so it has the effect
    of keeping the lowest ``n`` bits and discarding the rest.

    A mask in this form can produce any integer between 0 and the mask itself
    (inclusive), and the total number of these values is ``(mask + 1)``.
    """
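    # For example, 0x0F (0b1111) is simple, since 0x0F & 0x10 == 0, while
    # 0x0A (0b1010) is not: 0x0A & 0x0B == 0b1010 != 0.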
    return (mask & (mask + 1)) == 0
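
A minimal usage sketch of the new class (not part of the commit): with DataTree as defined above, the cap value, buffers, and seed below are purely illustrative, and it assumes the ConjectureData API of this era (for_buffer, draw_bytes, freeze).

    import random

    from hypothesis.internal.conjecture.data import ConjectureData

    tree = DataTree(cap=8 * 1024)  # cap value chosen only for illustration

    # Record the result of one run of a test function that draws two bytes.
    data = ConjectureData.for_buffer(b"\x00\x01")
    data.draw_bytes(2)
    data.freeze()
    tree.add(data)

    # Replaying the same buffer now returns the stored result...
    assert tree.lookup(b"\x00\x01") is data
    # ...and generation can ask for a prefix that avoids known-dead paths.
    prefix = tree.generate_novel_prefix(random.Random(0))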

0 commit comments
