Skip to content

Commit 2f0981b

Browse files
committed
use typed choice sequence in the database
1 parent 6383100 commit 2f0981b

File tree

10 files changed

+182
-84
lines changed

10 files changed

+182
-84
lines changed

hypothesis-python/RELEASE.rst

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
RELEASE_TYPE: minor
2+
3+
The :doc:`Hypothesis example database <database>` now uses a new internal format to store examples. The new format is not compatible with the previous format, so any old stored counterexamples will be silently discarded.
4+
5+
If you are replaying counterexamples using an external database such as :class:`~hypothesis.database.GitHubArtifactDatabase`, this means a counterexample must have been stored by a Hypothesis version at or after this release in order to replay successfully on a local installation at or after this release. In short, the Hypothesis versions writing to the local and remote databases should both be before, or both be after, this release.

hypothesis-python/src/hypothesis/core.py

+7-8
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
ensure_free_stackframes,
8888
gc_cumulative_time,
8989
)
90-
from hypothesis.internal.conjecture.shrinker import sort_key, sort_key_ir
90+
from hypothesis.internal.conjecture.shrinker import sort_key_ir
9191
from hypothesis.internal.entropy import deterministic_PRNG
9292
from hypothesis.internal.escalation import (
9393
InterestingOrigin,
@@ -352,9 +352,8 @@ def decode_failure(blob: bytes) -> Sequence[ChoiceT]:
352352
f"Could not decode blob {blob!r}: Invalid start byte {prefix!r}"
353353
)
354354

355-
try:
356-
choices = ir_from_bytes(decoded)
357-
except Exception:
355+
choices = ir_from_bytes(decoded)
356+
if choices is None:
358357
raise InvalidArgument(f"Invalid serialized choice sequence for blob {blob!r}")
359358

360359
return choices
@@ -1873,13 +1872,13 @@ def fuzz_one_input(
18731872
except (StopTest, UnsatisfiedAssumption):
18741873
return None
18751874
except BaseException:
1876-
buffer = bytes(data.buffer)
18771875
known = minimal_failures.get(data.interesting_origin)
18781876
if settings.database is not None and (
1879-
known is None or sort_key(buffer) <= sort_key(known)
1877+
known is None
1878+
or sort_key_ir(data.ir_nodes) <= sort_key_ir(known)
18801879
):
1881-
settings.database.save(database_key, buffer)
1882-
minimal_failures[data.interesting_origin] = buffer
1880+
settings.database.save(database_key, ir_to_bytes(data.choices))
1881+
minimal_failures[data.interesting_origin] = data.ir_nodes
18831882
raise
18841883
return bytes(data.buffer)
18851884

hypothesis-python/src/hypothesis/database.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -768,8 +768,7 @@ def ir_to_bytes(ir: Iterable[ChoiceT], /) -> bytes:
768768
return b"".join(parts)
769769

770770

771-
def ir_from_bytes(buffer: bytes, /) -> list[ChoiceT]:
772-
"""Deserialize a bytestring to a list of IR elements. Inverts ir_to_bytes."""
771+
def _ir_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
773772
# See above for an explanation of the format.
774773
parts: list[ChoiceT] = []
775774
idx = 0
@@ -797,4 +796,19 @@ def ir_from_bytes(buffer: bytes, /) -> list[ChoiceT]:
797796
else:
798797
assert tag == 4
799798
parts.append(chunk.decode(errors="surrogatepass"))
800-
return parts
799+
return tuple(parts)
800+
801+
802+
def ir_from_bytes(buffer: bytes, /) -> Optional[tuple[ChoiceT, ...]]:
803+
"""
804+
Deserialize a bytestring to a tuple of choices. Inverts ir_to_bytes.
805+
806+
Returns None if the given bytestring is not a valid serialization of choice
807+
sequences.
808+
"""
809+
try:
810+
return _ir_from_bytes(buffer)
811+
except Exception:
812+
# deserialization error, eg because our format changed or someone put junk
813+
# data in the db.
814+
return None

hypothesis-python/src/hypothesis/internal/conjecture/choice.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,8 @@ def choices_key(choices: Sequence[ChoiceT]) -> tuple[ChoiceKeyT, ...]:
465465

466466
def choice_key(choice: ChoiceT) -> ChoiceKeyT:
467467
if isinstance(choice, float):
468-
# distinguish -0.0/0.0, signaling/nonsignaling nans, etc.
468+
# float_to_int to distinguish -0.0/0.0, signaling/nonsignaling nans, etc,
469+
# and then add a "float" key to avoid colliding with actual integers.
469470
return ("float", float_to_int(choice))
470471
if isinstance(choice, bool):
471472
# avoid choice_key(0) == choice_key(False)

hypothesis-python/src/hypothesis/internal/conjecture/engine.py

+48-25
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
from hypothesis import HealthCheck, Phase, Verbosity, settings as Settings
3636
from hypothesis._settings import local_settings
37-
from hypothesis.database import ExampleDatabase
37+
from hypothesis.database import ExampleDatabase, ir_from_bytes, ir_to_bytes
3838
from hypothesis.errors import (
3939
BackendCannotProceed,
4040
FlakyReplay,
@@ -44,7 +44,12 @@
4444
)
4545
from hypothesis.internal.cache import LRUReusedCache
4646
from hypothesis.internal.compat import NotRequired, TypeAlias, TypedDict, ceil, override
47-
from hypothesis.internal.conjecture.choice import ChoiceKwargsT, ChoiceT, choices_key
47+
from hypothesis.internal.conjecture.choice import (
48+
ChoiceKeyT,
49+
ChoiceKwargsT,
50+
ChoiceT,
51+
choices_key,
52+
)
4853
from hypothesis.internal.conjecture.data import (
4954
AVAILABLE_PROVIDERS,
5055
ConjectureData,
@@ -69,7 +74,7 @@
6974
startswith,
7075
)
7176
from hypothesis.internal.conjecture.pareto import NO_SCORE, ParetoFront, ParetoOptimiser
72-
from hypothesis.internal.conjecture.shrinker import Shrinker, sort_key, sort_key_ir
77+
from hypothesis.internal.conjecture.shrinker import Shrinker, sort_key_ir
7378
from hypothesis.internal.escalation import InterestingOrigin
7479
from hypothesis.internal.healthcheck import fail_health_check
7580
from hypothesis.reporting import base_report, report
@@ -91,6 +96,10 @@
9196
Ls: TypeAlias = list["Ls | int"]
9297

9398

99+
def shortlex(s):
100+
return (len(s), s)
101+
102+
94103
@attr.s
95104
class HealthCheckState:
96105
valid_examples: int = attr.ib(default=0)
@@ -467,7 +476,7 @@ def test_function(self, data: ConjectureData) -> None:
467476
data.freeze()
468477
return
469478
except BaseException:
470-
self.save_buffer(data.buffer)
479+
self.save_choices(data.choices)
471480
raise
472481
finally:
473482
# No branch, because if we're interrupted we always raise
@@ -522,7 +531,7 @@ def test_function(self, data: ConjectureData) -> None:
522531
and self.pareto_front is not None
523532
and self.pareto_front.add(data.as_result())
524533
):
525-
self.save_buffer(data.buffer, sub_key=b"pareto")
534+
self.save_choices(data.choices, sub_key=b"pareto")
526535

527536
assert len(data.buffer) <= BUFFER_SIZE
528537

@@ -601,12 +610,12 @@ def test_function(self, data: ConjectureData) -> None:
601610
else:
602611
if sort_key_ir(data.ir_nodes) < sort_key_ir(existing.ir_nodes):
603612
self.shrinks += 1
604-
self.downgrade_buffer(existing.buffer)
613+
self.downgrade_buffer(ir_to_bytes(existing.choices))
605614
self.__data_cache.unpin(existing.buffer)
606615
changed = True
607616

608617
if changed:
609-
self.save_buffer(data.buffer)
618+
self.save_choices(data.choices)
610619
self.interesting_examples[key] = data.as_result() # type: ignore
611620
self.__data_cache.pin(data.buffer, data.as_result())
612621
self.shrunk_examples.discard(key)
@@ -651,7 +660,7 @@ def test_function(self, data: ConjectureData) -> None:
651660
self.record_for_health_check(data)
652661

653662
def on_pareto_evict(self, data: ConjectureData) -> None:
654-
self.settings.database.delete(self.pareto_key, data.buffer)
663+
self.settings.database.delete(self.pareto_key, ir_to_bytes(data.choices))
655664

656665
def generate_novel_prefix(self) -> tuple[ChoiceT, ...]:
657666
"""Uses the tree to proactively generate a starting sequence of bytes
@@ -735,14 +744,14 @@ def record_for_health_check(self, data: ConjectureData) -> None:
735744
HealthCheck.too_slow,
736745
)
737746

738-
def save_buffer(
739-
self, buffer: Union[bytes, bytearray], sub_key: Optional[bytes] = None
747+
def save_choices(
748+
self, choices: Sequence[ChoiceT], sub_key: Optional[bytes] = None
740749
) -> None:
741750
if self.settings.database is not None:
742751
key = self.sub_key(sub_key)
743752
if key is None:
744753
return
745-
self.settings.database.save(key, bytes(buffer))
754+
self.settings.database.save(key, ir_to_bytes(choices))
746755

747756
def downgrade_buffer(self, buffer: Union[bytes, bytearray]) -> None:
748757
if self.settings.database is not None and self.database_key is not None:
@@ -832,7 +841,7 @@ def reuse_existing_examples(self) -> None:
832841
# sample the secondary corpus to a more manageable size.
833842

834843
corpus = sorted(
835-
self.settings.database.fetch(self.database_key), key=sort_key
844+
self.settings.database.fetch(self.database_key), key=shortlex
836845
)
837846
factor = 0.1 if (Phase.generate in self.settings.phases) else 1
838847
desired_size = max(2, ceil(factor * self.settings.max_examples))
@@ -847,7 +856,7 @@ def reuse_existing_examples(self) -> None:
847856
extra = extra_corpus
848857
else:
849858
extra = self.random.sample(extra_corpus, shortfall)
850-
extra.sort(key=sort_key)
859+
extra.sort(key=shortlex)
851860
corpus.extend(extra)
852861

853862
# We want a fast path where every primary entry in the database was
@@ -858,15 +867,20 @@ def reuse_existing_examples(self) -> None:
858867
for i, existing in enumerate(corpus):
859868
if i >= primary_corpus_size and found_interesting_in_primary:
860869
break
861-
data = self.cached_test_function(existing, extend=BUFFER_SIZE)
870+
choices = ir_from_bytes(existing)
871+
if choices is None:
872+
# clear out any keys which fail deserialization
873+
self.settings.database.delete(self.database_key, existing)
874+
continue
875+
data = self.cached_test_function_ir(choices, extend=BUFFER_SIZE)
862876
if data.status != Status.INTERESTING:
863877
self.settings.database.delete(self.database_key, existing)
864878
self.settings.database.delete(self.secondary_key, existing)
865879
else:
866880
if i < primary_corpus_size:
867881
found_interesting_in_primary = True
868882
assert not isinstance(data, _Overrun)
869-
if existing != data.buffer:
883+
if choices_key(choices) != choices_key(data.choices):
870884
all_interesting_in_primary_were_exact = False
871885
if not self.settings.report_multiple_bugs:
872886
break
@@ -886,10 +900,14 @@ def reuse_existing_examples(self) -> None:
886900
pareto_corpus = list(self.settings.database.fetch(self.pareto_key))
887901
if len(pareto_corpus) > desired_extra:
888902
pareto_corpus = self.random.sample(pareto_corpus, desired_extra)
889-
pareto_corpus.sort(key=sort_key)
903+
pareto_corpus.sort(key=shortlex)
890904

891905
for existing in pareto_corpus:
892-
data = self.cached_test_function(existing, extend=BUFFER_SIZE)
906+
choices = ir_from_bytes(existing)
907+
if choices is None:
908+
self.settings.database.delete(self.pareto_key, existing)
909+
continue
910+
data = self.cached_test_function_ir(choices, extend=BUFFER_SIZE)
893911
if data not in self.pareto_front:
894912
self.settings.database.delete(self.pareto_key, existing)
895913
if data.status == Status.INTERESTING:
@@ -1371,9 +1389,9 @@ def shrink_interesting_examples(self) -> None:
13711389
for k, v in self.interesting_examples.items()
13721390
if k not in self.shrunk_examples
13731391
),
1374-
key=lambda kv: (sort_key_ir(kv[1].ir_nodes), sort_key(repr(kv[0]))),
1392+
key=lambda kv: (sort_key_ir(kv[1].ir_nodes), shortlex(repr(kv[0]))),
13751393
)
1376-
self.debug(f"Shrinking {target!r}: {data.choices}")
1394+
self.debug(f"Shrinking {target!r}: {example.choices}")
13771395

13781396
if not self.settings.report_multiple_bugs:
13791397
# If multi-bug reporting is disabled, we shrink our currently-minimal
@@ -1400,17 +1418,22 @@ def clear_secondary_key(self) -> None:
14001418
# It's not worth trying the primary corpus because we already
14011419
# tried all of those in the initial phase.
14021420
corpus = sorted(
1403-
self.settings.database.fetch(self.secondary_key), key=sort_key
1421+
self.settings.database.fetch(self.secondary_key), key=shortlex
14041422
)
14051423
for c in corpus:
1406-
primary = {v.buffer for v in self.interesting_examples.values()}
1407-
1408-
cap = max(map(sort_key, primary))
1424+
choices = ir_from_bytes(c)
1425+
if choices is None:
1426+
self.settings.database.delete(self.secondary_key, c)
1427+
continue
1428+
primary = {
1429+
ir_to_bytes(v.choices) for v in self.interesting_examples.values()
1430+
}
1431+
cap = max(map(shortlex, primary))
14091432

1410-
if sort_key(c) > cap:
1433+
if shortlex(c) > cap:
14111434
break
14121435
else:
1413-
self.cached_test_function(c)
1436+
self.cached_test_function_ir(choices)
14141437
# We unconditionally remove c from the secondary key as it
14151438
# is either now primary or worse than our primary example
14161439
# of this reason for interestingness.

hypothesis-python/src/hypothesis/internal/conjecture/shrinker.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def sort_key(buffer: SortKeyT) -> tuple[int, SortKeyT]:
8181
result, so it makes sense to prioritise reducing earlier values over
8282
later ones. This makes the lexicographic order the more natural choice.
8383
"""
84-
return (len(buffer), buffer)
84+
return (len(buffer), buffer) # pragma: no cover # removing soon
8585

8686

8787
def sort_key_ir(nodes: Sequence[IRNode]) -> tuple[int, tuple[int, ...]]:

0 commit comments

Comments
 (0)