Merge pull request #3987 from tybug/lazy-sequence-copy-pop

tybug · web-flow · commit d603bd41b839 · 2024-05-14T22:18:18.000-04:00
Implement `LazySequenceCopy.pop(i)`
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,4 @@
+RELEASE_TYPE: patch
+
+This patch improves our shrinking of unique collections, such as  :func:`~hypothesis.strategies.dictionaries`,
+:func:`~hypothesis.strategies.sets`, and :func:`~hypothesis.strategies.lists` with ``unique=True``.
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/choicetree.py b/hypothesis-python/src/hypothesis/internal/conjecture/choicetree.py
@@ -12,7 +12,7 @@
 from random import Random
 from typing import Callable, Dict, Iterable, List, Optional, Sequence
 
-from hypothesis.internal.conjecture.junkdrawer import LazySequenceCopy, pop_random
+from hypothesis.internal.conjecture.junkdrawer import LazySequenceCopy
 
 
 def prefix_selection_order(
@@ -41,7 +41,8 @@ def random_selection_order(random: Random) -> Callable[[int, int], Iterable[int]
     def selection_order(depth: int, n: int) -> Iterable[int]:
         pending = LazySequenceCopy(range(n))
         while pending:
-            yield pop_random(random, pending)
+            i = random.randrange(0, len(pending))
+            yield pending.pop(i)
 
     return selection_order
 
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/junkdrawer.py b/hypothesis-python/src/hypothesis/internal/conjecture/junkdrawer.py
@@ -31,6 +31,8 @@
     overload,
 )
 
+from sortedcontainers import SortedList
+
 from hypothesis.errors import HypothesisWarning
 
 ARRAY_CODES = ["B", "H", "I", "L", "Q", "O"]
@@ -199,46 +201,71 @@ class LazySequenceCopy:
     in O(1) time. The full list API is not supported yet but there's no reason
     in principle it couldn't be."""
 
-    __mask: Optional[Dict[int, int]]
-
     def __init__(self, values: Sequence[int]):
         self.__values = values
         self.__len = len(values)
-        self.__mask = None
+        self.__mask: Optional[Dict[int, int]] = None
+        self.__popped_indices: Optional[SortedList] = None
 
     def __len__(self) -> int:
-        return self.__len
+        if self.__popped_indices is None:
+            return self.__len
+        return self.__len - len(self.__popped_indices)
 
-    def pop(self) -> int:
+    def pop(self, i: int = -1) -> int:
         if len(self) == 0:
             raise IndexError("Cannot pop from empty list")
-        result = self[-1]
-        self.__len -= 1
+        i = self.__underlying_index(i)
+
+        v = None
         if self.__mask is not None:
-            self.__mask.pop(self.__len, None)
-        return result
+            v = self.__mask.pop(i, None)
+        if v is None:
+            v = self.__values[i]
+
+        if self.__popped_indices is None:
+            self.__popped_indices = SortedList()
+        self.__popped_indices.add(i)
+        return v
 
     def __getitem__(self, i: int) -> int:
-        i = self.__check_index(i)
+        i = self.__underlying_index(i)
+
         default = self.__values[i]
         if self.__mask is None:
             return default
         else:
             return self.__mask.get(i, default)
 
     def __setitem__(self, i: int, v: int) -> None:
-        i = self.__check_index(i)
+        i = self.__underlying_index(i)
         if self.__mask is None:
             self.__mask = {}
         self.__mask[i] = v
 
-    def __check_index(self, i: int) -> int:
+    def __underlying_index(self, i: int) -> int:
         n = len(self)
         if i < -n or i >= n:
             raise IndexError(f"Index {i} out of range [0, {n})")
         if i < 0:
             i += n
         assert 0 <= i < n
+
+        if self.__popped_indices is not None:
+            # given an index i in the popped representation of the list, compute
+            # its corresponding index in the underlying list. given
+            #   l = [1, 4, 2, 10, 188]
+            #   l.pop(3)
+            #   l.pop(1)
+            #   assert l == [1, 2, 188]
+            #
+            # we want l[i] == self.__values[f(i)], where f is this function.
+            assert len(self.__popped_indices) <= len(self.__values)
+
+            for idx in self.__popped_indices:
+                if idx > i:
+                    break
+                i += 1
         return i
 
 
@@ -345,14 +372,6 @@ def find_integer(f: Callable[[int], bool]) -> int:
     return lo
 
 
-def pop_random(random: Random, seq: LazySequenceCopy) -> int:
-    """Remove and return a random element of seq. This runs in O(1) but leaves
-    the sequence in an arbitrary order."""
-    i = random.randrange(0, len(seq))
-    swap(seq, i, len(seq) - 1)
-    return seq.pop()
-
-
 class NotFound(Exception):
     pass
 
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/pareto.py b/hypothesis-python/src/hypothesis/internal/conjecture/pareto.py
@@ -179,8 +179,7 @@ def add(self, data):
             failures = 0
             while i + 1 < len(front) and failures < 10:
                 j = self.__random.randrange(i + 1, len(front))
-                swap(front, j, len(front) - 1)
-                candidate = front.pop()
+                candidate = front.pop(j)
                 dom = dominance(data, candidate)
                 assert dom != DominanceRelation.RIGHT_DOMINATES
                 if dom == DominanceRelation.LEFT_DOMINATES:
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/collections.py b/hypothesis-python/src/hypothesis/strategies/_internal/collections.py
@@ -287,12 +287,8 @@ def do_draw(self, data):
         remaining = LazySequenceCopy(self.element_strategy.elements)
 
         while remaining and should_draw.more():
-            i = len(remaining) - 1
-            j = data.draw_integer(0, i)
-            if j != i:
-                remaining[i], remaining[j] = remaining[j], remaining[i]
-            value = self.element_strategy._transform(remaining.pop())
-
+            j = data.draw_integer(0, len(remaining) - 1)
+            value = self.element_strategy._transform(remaining.pop(j))
             if value is not filter_not_satisfied and all(
                 key(value) not in seen for key, seen in zip(self.keys, seen_sets)
             ):
diff --git a/hypothesis-python/tests/conjecture/test_junkdrawer.py b/hypothesis-python/tests/conjecture/test_junkdrawer.py
@@ -8,6 +8,7 @@
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
 # obtain one at https://mozilla.org/MPL/2.0/.
 
+import copy
 import inspect
 
 import pytest
@@ -79,22 +80,31 @@ def test_clamp(lower, value, upper):
         assert clamped == upper
 
 
-def test_pop_without_mask():
-    y = [1, 2, 3]
-    x = LazySequenceCopy(y)
-    x.pop()
-    assert list(x) == [1, 2]
-    assert y == [1, 2, 3]
-
-
-def test_pop_with_mask():
-    y = [1, 2, 3]
-    x = LazySequenceCopy(y)
-    x[-1] = 5
-    t = x.pop()
-    assert t == 5
-    assert list(x) == [1, 2]
-    assert y == [1, 2, 3]
+# this would be more robust as a stateful test, where each rule is a list operation
+# on (1) the canonical python list and (2) its LazySequenceCopy. We would assert
+# that the return values and lists match after each rule, and the original list
+# is unmodified.
+@pytest.mark.parametrize("should_mask", [True, False])
+@given(lst=st.lists(st.integers(), min_size=1), data=st.data())
+def test_pop_sequence_copy(lst, data, should_mask):
+    original = copy.copy(lst)
+    pop_i = data.draw(st.integers(0, len(lst) - 1))
+    if should_mask:
+        mask_i = data.draw(st.integers(0, len(lst) - 1))
+        mask_value = data.draw(st.integers())
+
+    def pop(l):
+        if should_mask:
+            l[mask_i] = mask_value
+        return l.pop(pop_i)
+
+    expected = copy.copy(lst)
+    l = LazySequenceCopy(lst)
+
+    assert pop(expected) == pop(l)
+    assert list(l) == expected
+    # modifications to the LazySequenceCopy should not modify the original list
+    assert original == lst
 
 
 def test_assignment():
diff --git a/hypothesis-python/tests/nocover/test_sampled_from.py b/hypothesis-python/tests/nocover/test_sampled_from.py
@@ -124,10 +124,10 @@ def test_flags_minimize_to_first_named_flag():
 
 
 def test_flags_minimizes_bit_count():
-    shrunk = minimal(st.sampled_from(LargeFlag), lambda f: bit_count(f.value) > 1)
-    # Ideal would be (bit0 | bit1), but:
-    # minimal(st.sets(st.sampled_from(range(10)), min_size=3)) == {0, 8, 9}  # not {0, 1, 2}
-    assert shrunk == LargeFlag.bit0 | LargeFlag.bit63  # documents actual behaviour
+    assert (
+        minimal(st.sampled_from(LargeFlag), lambda f: bit_count(f.value) > 1)
+        == LargeFlag.bit0 | LargeFlag.bit1
+    )
 
 
 def test_flags_finds_all_bits_set():
diff --git a/hypothesis-python/tests/quality/test_shrink_quality.py b/hypothesis-python/tests/quality/test_shrink_quality.py
@@ -107,6 +107,10 @@ def test_minimize_sets_of_sets():
             assert any(s != t and t.issubset(s) for t in set_of_sets)
 
 
+def test_minimize_sets_sampled_from():
+    assert minimal(st.sets(st.sampled_from(range(10)), min_size=3)) == {0, 1, 2}
+
+
 def test_can_simplify_flatmap_with_bounded_left_hand_size():
     assert (
         minimal(booleans().flatmap(lambda x: lists(just(x))), lambda x: len(x) >= 10)