15
15
import sys
16
16
import tempfile
17
17
import unicodedata
18
+ from collections .abc import Iterable
18
19
from functools import lru_cache
20
+ from pathlib import Path
21
+ from typing import TYPE_CHECKING , Literal , Optional
19
22
20
23
from hypothesis .configuration import storage_directory
21
24
from hypothesis .control import _current_build_context
22
25
from hypothesis .errors import InvalidArgument
23
- from hypothesis .internal .intervalsets import IntervalSet
24
-
25
- intervals = tuple [tuple [int , int ], ...]
26
- cache_type = dict [tuple [tuple [str , ...], int , int , intervals ], IntervalSet ]
27
-
28
-
29
- def charmap_file (fname = "charmap" ):
26
+ from hypothesis .internal .intervalsets import IntervalSet , IntervalsT
27
+
28
+ if TYPE_CHECKING :
29
+ from typing import TypeAlias
30
+
31
# Names of the Unicode general categories, as listed at
# https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
# Each row starts with the one-letter major class, followed by its subclasses
# in the order given in the row's comment.
# fmt: off
CategoryName: "TypeAlias" = Literal[
    # Letter: uppercase, lowercase, titlecase, modifier, other
    "L", "Lu", "Ll", "Lt", "Lm", "Lo",
    # Mark: nonspacing, spacing combining, enclosing
    "M", "Mn", "Mc", "Me",
    # Number: decimal digit, letter, other
    "N", "Nd", "Nl", "No",
    # Punctuation: connector, dash, open, close, initial quote, final quote,
    # other
    "P", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    # Symbol: math, currency, modifier, other
    "S", "Sm", "Sc", "Sk", "So",
    # Separator: space, line, paragraph
    "Z", "Zs", "Zl", "Zp",
    # Other: control, format, surrogate, private use, not assigned
    "C", "Cc", "Cf", "Cs", "Co", "Cn",
]
# fmt: on

# Any iterable of category names.
Categories: "TypeAlias" = Iterable[CategoryName]
# A tuple of category names, of any length.
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]
73
+
74
+
75
def charmap_file(fname: str = "charmap") -> Path:
    """Return the path of the gzipped JSON data file called ``fname``.

    The location is obtained from ``storage_directory``, using the
    ``unicode_data`` directory name, the ``unicodedata.unidata_version`` of
    the running interpreter, and the file name ``<fname>.json.gz``.
    """
    # Keep the f-string free of whitespace: anything inside the literal ends
    # up verbatim in the file name.
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, f"{fname}.json.gz"
    )
@@ -35,7 +81,7 @@ def charmap_file(fname="charmap"):
35
81
_charmap = None
36
82
37
83
38
- def charmap ():
84
+ def charmap () -> dict [ CategoryName , IntervalsT ] :
39
85
"""Return a dict that maps a Unicode category, to a tuple of 2-tuples
40
86
covering the codepoint intervals for characters in that category.
41
87
@@ -49,8 +95,8 @@ def charmap():
49
95
if _charmap is None :
50
96
f = charmap_file ()
51
97
try :
52
- with gzip .GzipFile (f , "rb" ) as i :
53
- tmp_charmap = dict (json .load (i ))
98
+ with gzip .GzipFile (f , "rb" ) as d :
99
+ tmp_charmap = dict (json .load (d ))
54
100
55
101
except Exception :
56
102
# This loop is reduced to using only local variables for performance;
@@ -63,9 +109,9 @@ def charmap():
63
109
for i in range (1 , sys .maxunicode + 1 ):
64
110
cat = category (chr (i ))
65
111
if cat != last_cat :
66
- tmp_charmap .setdefault (last_cat , []).append ([ last_start , i - 1 ] )
112
+ tmp_charmap .setdefault (last_cat , []).append (( last_start , i - 1 ) )
67
113
last_cat , last_start = cat , i
68
- tmp_charmap .setdefault (last_cat , []).append ([ last_start , sys .maxunicode ] )
114
+ tmp_charmap .setdefault (last_cat , []).append (( last_start , sys .maxunicode ) )
69
115
70
116
try :
71
117
# Write the Unicode table atomically
@@ -135,10 +181,10 @@ def intervals_from_codec(codec_name: str) -> IntervalSet: # pragma: no cover
135
181
return res
136
182
137
183
138
# Memoised result of categories(); stays None until the first call.
_categories: Optional[CategoriesTuple] = None


def categories() -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    The categories are the keys of ``charmap()``, sorted by the number of
    codepoint intervals each one has (fewest first), except that ``Cc``
    (Other, Control) and ``Cs`` (Other, Surrogate) are always moved to the
    end, in that order.  The tuple is computed on the first call and cached
    in the module-level ``_categories`` afterwards.

    >>> categories()[-2:]
    ('Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        # Use a local name that does not shadow this function.  sorted() is
        # stable, so categories with equally many intervals keep the
        # charmap's own key order.
        ordered = sorted(cm, key=lambda c: len(cm[c]))
        ordered.remove("Cc")  # Other, Control
        ordered.remove("Cs")  # Other, Surrogate
        ordered.append("Cc")
        ordered.append("Cs")
        _categories = tuple(ordered)
    return _categories
156
203
157
204
158
- def as_general_categories (cats , name = "cats" ):
205
+ def as_general_categories (cats : Categories , name : str = "cats" ) -> CategoriesTuple :
159
206
"""Return a tuple of Unicode categories in a normalised order.
160
207
161
208
This function expands one-letter designations of a major class to include
@@ -170,8 +217,6 @@ def as_general_categories(cats, name="cats"):
170
217
If the collection ``cats`` includes any elements that do not represent a
171
218
major class or a class with subclass, a deprecation warning is raised.
172
219
"""
173
- if cats is None :
174
- return None
175
220
major_classes = ("L" , "M" , "N" , "P" , "S" , "Z" , "C" )
176
221
cs = categories ()
177
222
out = set (cats )
@@ -186,10 +231,10 @@ def as_general_categories(cats, name="cats"):
186
231
return tuple (c for c in cs if c in out )
187
232
188
233
189
- category_index_cache = {(): ()}
234
# Cache used by _query_for_key: maps a frozenset of category names to the
# codepoint intervals computed for those categories.  A frozenset is used as
# the key so that the order of the categories does not matter.  The cache is
# seeded with the empty set of categories mapping to no intervals.
+ category_index_cache : dict [ frozenset [ CategoryName ], IntervalsT ] = {frozenset (): ()}
190
235
191
236
192
- def _category_key (cats ) :
237
+ def _category_key (cats : Optional [ Iterable [ str ]]) -> CategoriesTuple :
193
238
"""Return a normalised tuple of all Unicode categories that are in
194
239
`include`, but not in `exclude`.
195
240
@@ -205,7 +250,7 @@ def _category_key(cats):
205
250
return tuple (c for c in cs if c in cats )
206
251
207
252
208
- def _query_for_key (key ) :
253
+ def _query_for_key (key : Categories ) -> IntervalsT :
209
254
"""Return a tuple of codepoint intervals covering characters that match one
210
255
or more categories in the tuple of categories `key`.
211
256
@@ -214,10 +259,13 @@ def _query_for_key(key):
214
259
>>> _query_for_key(('Zl', 'Zp', 'Co'))
215
260
((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
216
261
"""
262
+ key = tuple (key )
263
+ # ignore ordering on the cache key to increase potential cache hits.
264
+ cache_key = frozenset (key )
217
265
context = _current_build_context .value
218
266
if context is None or not context .data .provider .avoid_realization :
219
267
try :
220
- return category_index_cache [key ]
268
+ return category_index_cache [cache_key ]
221
269
except KeyError :
222
270
pass
223
271
elif not key : # pragma: no cover # only on alternative backends
@@ -231,21 +279,23 @@ def _query_for_key(key):
231
279
)
232
280
assert isinstance (result , IntervalSet )
233
281
if context is None or not context .data .provider .avoid_realization :
234
- category_index_cache [key ] = result .intervals
282
+ category_index_cache [cache_key ] = result .intervals
235
283
return result .intervals
236
284
237
285
238
- limited_category_index_cache : cache_type = {}
286
# Cache of IntervalSet results, starting out empty.
# NOTE(review): the code that fills this cache is not visible here; presumably
# query() populates it, keyed on (category key, min_codepoint, max_codepoint,
# include intervals, exclude intervals) -- confirm against query().
+ limited_category_index_cache : dict [
287
+ tuple [CategoriesTuple , int , int , IntervalsT , IntervalsT ], IntervalSet
288
+ ] = {}
239
289
240
290
241
291
def query (
242
292
* ,
243
- categories = None ,
244
- min_codepoint = None ,
245
- max_codepoint = None ,
246
- include_characters = "" ,
247
- exclude_characters = "" ,
248
- ):
293
+ categories : Optional [ Categories ] = None ,
294
+ min_codepoint : Optional [ int ] = None ,
295
+ max_codepoint : Optional [ int ] = None ,
296
+ include_characters : str = "" ,
297
+ exclude_characters : str = "" ,
298
+ ) -> IntervalSet :
249
299
"""Return a tuple of intervals covering the codepoints for all characters
250
300
that meet the criteria.
251
301
0 commit comments