|
| 1 | +# coding=utf-8 |
| 2 | +# |
| 3 | +# This file is part of Hypothesis, which may be found at |
| 4 | +# https://github.com/HypothesisWorks/hypothesis/ |
| 5 | +# |
| 6 | +# Most of this work is copyright (C) 2013-2019 David R. MacIver |
| 7 | +# ([email protected]), but it contains contributions by others. See |
| 8 | +# CONTRIBUTING.rst for a full list of people who may hold copyright, and |
| 9 | +# consult the git log if you need to determine who owns an individual |
| 10 | +# contribution. |
| 11 | +# |
| 12 | +# This Source Code Form is subject to the terms of the Mozilla Public License, |
| 13 | +# v. 2.0. If a copy of the MPL was not distributed with this file, You can |
| 14 | +# obtain one at https://mozilla.org/MPL/2.0/. |
| 15 | +# |
| 16 | +# END HEADER |
| 17 | + |
| 18 | +from __future__ import absolute_import, division, print_function |
| 19 | + |
| 20 | +from hypothesis.internal.compat import hbytes, hrange |
| 21 | +from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status |
| 22 | + |
| 23 | + |
class DataTree(object):
    """Tracks the tree structure of a collection of ConjectureData
    objects, for use in ConjectureRunner.

    Each tested byte stream is a path from the root of a prefix tree;
    the tree is used both to recognise buffers whose outcome is already
    known (``lookup``) and to generate byte-stream prefixes that have
    never been tried before (``generate_novel_prefix``).
    """

    def __init__(self, cap):
        # Tree indices at or past ``cap`` are force-killed in ``add``;
        # presumably this corresponds to the runner's buffer-size cap —
        # TODO confirm against the caller.
        self.cap = cap

        # Previously-tested byte streams are recorded in a prefix tree, so that
        # we can:
        # - Avoid testing the same stream twice (in some cases).
        # - Avoid testing a prefix of a past stream (in some cases),
        #   since that should only result in overrun.
        # - Generate stream prefixes that we haven't tried before.

        # Tree nodes are stored in an array to prevent heavy nesting of data
        # structures. Branches are dicts mapping bytes to child nodes (which
        # will in general only be partially populated). Leaves are
        # ConjectureData objects that have been previously seen as the result
        # of following that path. Node 0 is the root.
        self.nodes = [{}]

        # A node is dead if there is nothing left to explore past that point.
        # Recursively, a node is dead if either it is a leaf or every byte
        # leads to a dead node when starting from here. Stored as a set of
        # indices into ``self.nodes``.
        self.dead = set()

        # We rewrite the byte stream at various points during parsing, to one
        # that will produce an equivalent result but is in some sense more
        # canonical. We keep track of these so that when walking the tree we
        # can identify nodes where the exact byte value doesn't matter and
        # treat all bytes there as equivalent. This significantly reduces the
        # size of the search space and removes a lot of redundant examples.

        # Maps tree indices to the unique byte that is valid at that
        # point. Corresponds to data.write() calls.
        self.forced = {}

        # Maps tree indices to a mask that restricts bytes at that point.
        # Currently this is only updated by draw_bits, but it potentially
        # could get used elsewhere. Masks are assumed to be of the simple
        # ``2 ** n - 1`` form (asserted via _is_simple_mask below).
        self.masks = {}

        # Where a tree node consists of the beginning of a block we track the
        # size of said block. This allows us to tell when an example is too
        # short even if it goes off the unexplored region of the tree - if it
        # is at the beginning of a block of size 4 but only has 3 bytes left,
        # it's going to overrun the end of the buffer regardless of the
        # buffer contents.
        self.block_sizes = {}

    @property
    def is_exhausted(self):
        """Returns True if every possible node is dead and thus the language
        described must have been fully explored."""
        # Node 0 is the root, so the whole tree is dead iff the root is.
        return 0 in self.dead

    def add(self, data):
        """Add a ConjectureData object to the current collection."""

        # First, iterate through the result's buffer, to create the node that
        # will hold this result. Also note any forced or masked bytes.
        tree_node = self.nodes[0]
        indices = []
        node_index = 0
        for i, b in enumerate(data.buffer):
            # We build a list of all the node indices visited on our path
            # through the tree, since we'll need to refer to them later.
            indices.append(node_index)

            # If this buffer position was forced or masked, then mark its
            # corresponding node as forced/masked.
            if i in data.forced_indices:
                self.forced[node_index] = b
            try:
                self.masks[node_index] = data.masked_indices[i]
            except KeyError:
                # No mask was recorded at this position; that's the common
                # case, so EAFP is used rather than a membership test.
                pass

            try:
                # Use the current byte to find the next node on our path.
                node_index = tree_node[b]
            except KeyError:
                # That node doesn't exist yet, so create it.
                node_index = len(self.nodes)
                # Create a new branch node. If this should actually be a leaf
                # node, it will be overwritten when we store the result.
                self.nodes.append({})
                tree_node[b] = node_index

            tree_node = self.nodes[node_index]

            if node_index in self.dead:
                # This part of the tree has already been marked as dead, so
                # there's no need to traverse any deeper.
                break

        # At each node that begins a block, record the size of that block.
        for u, v in data.all_block_bounds():
            # This can happen if we hit a dead node when walking the buffer.
            # In that case we already have this section of the tree mapped.
            if u >= len(indices):
                break
            self.block_sizes[indices[u]] = v - u

        # Forcibly mark all nodes at buffer positions >= self.cap as dead,
        # because we don't intend to try any other values there.
        self.dead.update(indices[self.cap :])

        # Now store this result in the tree (if appropriate), and check if
        # any nodes need to be marked as dead.
        if data.status != Status.OVERRUN and node_index not in self.dead:
            # Mark this node as dead, because it produced a result.
            # Trying to explore suffixes of it would not be helpful.
            self.dead.add(node_index)
            # Store the result in the tree as a leaf. This will overwrite the
            # branch node that was created during traversal.
            self.nodes[node_index] = data

            # Review the traversed nodes, to see if any should be marked
            # as dead. We check them in reverse order, because as soon as we
            # find a live node, all nodes before it must still be live too.
            for j in reversed(indices):
                # A mask of 0xFF means "any byte value"; otherwise only
                # values 0..mask are reachable at this node.
                mask = self.masks.get(j, 0xFF)
                assert _is_simple_mask(mask)
                max_size = mask + 1

                if len(self.nodes[j]) < max_size and j not in self.forced:
                    # There are still byte values to explore at this node,
                    # so it isn't dead yet.
                    break
                if set(self.nodes[j].values()).issubset(self.dead):
                    # Everything beyond this node is known to be dead,
                    # and there are no more values to explore here (see above),
                    # so this node must be dead too.
                    self.dead.add(j)
                else:
                    # Even though all of this node's possible values have been
                    # tried, there are still some deeper nodes that remain
                    # alive, so this node isn't dead yet.
                    break

    def generate_novel_prefix(self, random):
        """Generate a short random string that (after rewriting) is not
        a prefix of any buffer previously added to the tree.

        ``random`` is a random.Random-like object providing randrange()
        and choice(). Returns an hbytes value.
        """
        # If the tree were exhausted there would be no novel prefix to find.
        assert not self.is_exhausted
        prefix = bytearray()
        node = 0
        while True:
            assert len(prefix) < self.cap
            assert node not in self.dead

            # Figure out the range of byte values we should be trying.
            # Normally this will be 0-255, unless the current position has a
            # mask.
            mask = self.masks.get(node, 0xFF)
            assert _is_simple_mask(mask)
            upper_bound = mask + 1

            try:
                c = self.forced[node]
                # This position has a forced byte value, so trying a different
                # value wouldn't be helpful. Just add the forced byte, and
                # move on to the next position.
                prefix.append(c)
                node = self.nodes[node][c]
                continue
            except KeyError:
                pass

            # Provisionally choose the next byte value.
            # This will change later if we find that it was a bad choice.
            c = random.randrange(0, upper_bound)

            try:
                next_node = self.nodes[node][c]
                if next_node in self.dead:
                    # Whoops, the byte value we chose for this position has
                    # already been fully explored. Let's pick a new value, and
                    # this time choose a value that's definitely still alive.
                    # (A byte absent from the dict maps to None via .get(),
                    # which is never in self.dead, so unexplored bytes count
                    # as alive.)
                    choices = [
                        b
                        for b in hrange(upper_bound)
                        if self.nodes[node].get(b) not in self.dead
                    ]
                    assert choices
                    c = random.choice(choices)
                    node = self.nodes[node][c]
                else:
                    # The byte value we chose is in the tree, but it still has
                    # some unexplored descendants, so it's a valid choice.
                    node = next_node
                prefix.append(c)
            except KeyError:
                # The byte value we chose isn't in the tree at this position,
                # which means we've successfully found a novel prefix.
                prefix.append(c)
                break
        assert node not in self.dead
        return hbytes(prefix)

    def lookup(self, buffer):
        """Look up the result of running buffer for a test function that
        produced the stored data objects. Returns either:

        * a stored ConjectureData object of status >= INVALID which
          could have resulted from running buffer.
        * Overrun if running a test function which produced one of the
          stored data objects must necessarily result in a status of
          OVERRUN.
        * None if we cannot infer what the result of running buffer would
          be from the previously seen ConjectureData values.
        """
        rewritten = bytearray()
        would_overrun = False

        node_index = 0
        for i, c in enumerate(buffer):
            # If there's a forced value or a mask at this position, then
            # pretend that the buffer already contains a matching value,
            # because the test function is going to do the same.
            try:
                c = self.forced[node_index]
            except KeyError:
                pass
            try:
                c = c & self.masks[node_index]
            except KeyError:
                pass

            try:
                # If we know how many bytes are read at this point and
                # there aren't enough, then it doesn't actually matter
                # what the values are, we're definitely going to overrun.
                if i + self.block_sizes[node_index] > len(buffer):
                    would_overrun = True
                    break
            except KeyError:
                pass

            rewritten.append(c)

            try:
                node_index = self.nodes[node_index][c]
            except KeyError:
                # The byte at this position isn't in the tree, which means
                # we haven't tested this buffer. Break out of the tree
                # traversal, and run the test function normally.
                break
            node = self.nodes[node_index]
            if isinstance(node, ConjectureData):
                # This buffer (or a prefix of it) has already been tested.
                # Return the stored result instead of trying it again.
                assert node.status != Status.OVERRUN
                return node
        else:
            # Falling off the end of this loop means that we're about to test
            # a prefix of a previously-tested byte stream, so the test would
            # overrun.
            would_overrun = True

        if would_overrun:
            return Overrun
        else:
            return None
| 288 | + |
| 289 | + |
| 290 | +def _is_simple_mask(mask): |
| 291 | + """A simple mask is ``(2 ** n - 1)`` for some ``n``, so it has the effect |
| 292 | + of keeping the lowest ``n`` bits and discarding the rest. |
| 293 | +
|
| 294 | + A mask in this form can produce any integer between 0 and the mask itself |
| 295 | + (inclusive), and the total number of these values is ``(mask + 1)``. |
| 296 | + """ |
| 297 | + return (mask & (mask + 1)) == 0 |
0 commit comments