Skip to content

Commit 9436f60

Browse files
authored
Merge pull request #4291 from tybug/nasty-strings
Add possibility to generate nasty strings
2 parents 8781447 + 51e3256 commit 9436f60

File tree

4 files changed

+140
-0
lines changed

4 files changed

+140
-0
lines changed

hypothesis-python/RELEASE.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
RELEASE_TYPE: minor
2+
3+
:func:`~hypothesis.strategies.text` now occasionally generates from a preselected list of strings which are likely to find bugs. These include ligatures, right-to-left and top-to-bottom text, emojis, emoji modifiers, strings like ``"Infinity"``, ``"None"``, and ``"FALSE"``, and other interesting things. This is especially useful when testing the full Unicode range, where the search space is too large for uniform sampling to be very effective.
4+
5+
Of course, examples generated this way shrink just like they normally would. It was always possible for Hypothesis to generate these strings; it is just more likely after this change. From the outside, it is as if Hypothesis generated the example completely randomly.
6+
7+
Many thanks to the `Big List of Naughty Strings <https://github.com/minimaxir/big-list-of-naughty-strings>`_, `Text Rendering Hates You <https://faultlore.com/blah/text-hates-you/>`_, and `Text Editing Hates You Too <https://lord.io/text-editing-hates-you-too/>`_ for forming the basis of this list.

hypothesis-python/src/hypothesis/internal/conjecture/providers.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626

2727
from hypothesis.internal.cache import LRUCache
2828
from hypothesis.internal.compat import int_from_bytes
29+
from hypothesis.internal.conjecture.choice import (
30+
StringKWargs,
31+
choice_kwargs_key,
32+
choice_permitted,
33+
)
2934
from hypothesis.internal.conjecture.floats import float_to_lex, lex_to_float
3035
from hypothesis.internal.conjecture.junkdrawer import bits_to_bytes
3136
from hypothesis.internal.conjecture.utils import (
@@ -65,6 +70,7 @@
6570
"hypothesis": "hypothesis.internal.conjecture.providers.HypothesisProvider",
6671
}
6772
FLOAT_INIT_LOGIC_CACHE = LRUCache(4096)
73+
STRING_SAMPLER_CACHE = LRUCache(64)
6874

6975
NASTY_FLOATS = sorted(
7076
[
@@ -96,6 +102,85 @@
96102
NASTY_FLOATS = list(map(float, NASTY_FLOATS))
97103
NASTY_FLOATS.extend([-x for x in NASTY_FLOATS])
98104

105+
# Preselected strings which are likely to find bugs in code under test.
# The list is sorted by length (shortest first).
#
# NOTE: every entry must be followed by its own comma *before* any trailing
# comment. Python silently concatenates adjacent string literals, so a missing
# (or commented-out) comma merges two entries into one.
NASTY_STRINGS = sorted(
    [
        # strings which can be interpreted as code / logic
        "undefined",
        "null",
        "NULL",
        "nil",
        "NIL",
        "true",
        "false",
        "True",
        "False",
        "TRUE",
        "FALSE",
        "None",
        "none",
        "if",
        "then",
        "else",
        # strings which can be interpreted as a number
        "0",
        "1e100",
        "0..0",
        "0/0",
        "1/0",
        "+0.0",
        "Infinity",
        "-Infinity",
        "Inf",
        "INF",
        "NaN",
        "9" * 30,
        # common ascii characters
        ",./;'[]\\-=<>?:\"{}|_+!@#$%^&*()`~",
        # common unicode characters
        "Ω≈ç√∫˜µ≤≥÷åß∂ƒ©˙∆˚¬…æœ∑´®†¥¨ˆøπ“‘¡™£¢∞§¶•ªº–≠¸˛Ç◊ı˜Â¯˘¿ÅÍÎÏ˝ÓÔÒÚÆ☃Œ„´‰ˇÁ¨ˆØ∏”’`⁄€‹›fifl‡°·‚—±",
        # characters which increase in length when lowercased
        "Ⱥ",
        "Ⱦ",
        # ligatures
        "æœÆŒffʤʨß",
        # emoticons
        "(╯°□°)╯︵ ┻━┻)",
        # emojis
        "😍",
        "🇺🇸",
        # emoji modifiers
        "🏻",  # U+1F3FB Light Skin Tone
        "👍🏻",  # 👍 followed by U+1F3FB
        # RTL text
        "الكل في المجمو عة",
        # Ogham text, which contains the only character in the Space Separators
        # unicode category (Zs) that isn't visually blank:  . # noqa: RUF003
        "᚛ᚄᚓᚐᚋᚒᚄ ᚑᚄᚂᚑᚏᚅ᚜",
        # readable variations on text (bold/italic/script)
        "𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
        "𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌",
        "𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈",
        "𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰",
        "𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
        # upside-down text
        "ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
        # reserved strings in windows
        "NUL",
        "COM1",
        "LPT1",
        # scunthorpe problem
        "Scunthorpe",
        # zalgo text
        "Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
        #
        # examples from https://faultlore.com/blah/text-hates-you/
        "मनीष منش",
        "पन्ह पन्ह त्र र्च कृकृ ड्ड न्हृे إلا بسم الله",
        "lorem لا بسم الله ipsum 你好1234你好",
    ],
    key=len,
)
183+
99184
# Masks for masking off the first byte of an n-bit buffer.
100185
# The appropriate mask is stored at position n % 8.
101186
BYTE_MASKS = [(1 << n) - 1 for n in range(8)]
@@ -391,6 +476,15 @@ def draw_string(
391476
if len(intervals) == 0:
392477
return ""
393478

479+
sampler, nasty_strings = self._draw_string_sampler(
480+
intervals=intervals,
481+
min_size=min_size,
482+
max_size=max_size,
483+
)
484+
485+
if sampler is not None and self.draw_boolean(p=0.05):
486+
return nasty_strings[sampler.sample(self._cd)]
487+
394488
average_size = min(
395489
max(min_size * 2, min_size + 5),
396490
0.5 * (min_size + max_size),
@@ -575,6 +669,33 @@ def permitted(f: float) -> bool:
575669
)
576670
return (sampler, clamper, nasty_floats)
577671

672+
@classmethod
def _draw_string_sampler(
    cls,
    *,
    intervals: IntervalSet,
    min_size: int,
    max_size: int,
) -> tuple[Optional[Sampler], list[str]]:
    """Return a ``(sampler, nasty_strings)`` pair for the given constraints.

    ``nasty_strings`` holds the entries of ``NASTY_STRINGS`` accepted by
    ``choice_permitted`` for these ``intervals`` / ``min_size`` /
    ``max_size``. ``sampler`` is a ``Sampler`` with equal weight for each
    of those entries, or ``None`` when no entry is permitted.

    Results are memoised in ``STRING_SAMPLER_CACHE`` under the key produced
    by ``choice_kwargs_key`` for the constraints.
    """
    constraints: StringKWargs = {
        "intervals": intervals,
        "min_size": min_size,
        "max_size": max_size,
    }
    cache_key = choice_kwargs_key("string", constraints)
    if cache_key in STRING_SAMPLER_CACHE:
        return STRING_SAMPLER_CACHE[cache_key]

    # Keep only the candidates that satisfy the requested constraints.
    nasty_strings = []
    for candidate in NASTY_STRINGS:
        if choice_permitted(candidate, constraints):
            nasty_strings.append(candidate)

    sampler: Optional[Sampler] = None
    if nasty_strings:
        count = len(nasty_strings)
        # Equal weight for every permitted string.
        sampler = Sampler([1 / count] * count, observe=False)

    entry = (sampler, nasty_strings)
    STRING_SAMPLER_CACHE[cache_key] = entry
    return entry
698+
578699

579700
class BytestringProvider(PrimitiveProvider):
580701
lifetime = "test_case"

hypothesis-python/tests/quality/test_discovery_ability.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,3 +372,7 @@ def double(x):
372372
test_long_duplicates_strings = define_test(
373373
tuples(text(), text()), lambda s: len(s[0]) >= 5 and s[0] == s[1]
374374
)
375+
376+
# text() should produce at least one of these well-known nasty strings.
test_can_produce_nasty_strings = define_test(
    text(),
    lambda s: s in {"NaN", "Inf", "undefined"},
    p=0.01,
)

hypothesis-python/tests/quality/test_shrink_quality.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,3 +526,11 @@ def test_minimize_duplicated_characters_within_a_choice():
526526
)
527527
== "0001"
528528
)
529+
530+
531+
def test_nasty_string_shrinks():
    """Failures found via NASTY_STRINGS should shrink like normal."""
    shrunk = minimal(
        st.text(),
        lambda s: "𝕿𝖍𝖊" in s,
        settings=settings(max_examples=10000),
    )
    assert shrunk == "𝕿𝖍𝖊"

0 commit comments

Comments
 (0)