@@ -34,18 +34,18 @@
 from StringIO import StringIO
 
 try:
-    # completely optional type hinting
+    # Completely optional type hinting
     # (Python 2 compatible using comments,
     # see: https://mypy.readthedocs.io/en/latest/python2.html)
    # This is very helpful in typing-aware IDE like PyCharm.
     from typing import Dict, Iterator, List, Optional, Set, Tuple
 except ImportError:
     pass
 
 
-# we don't use enum.Enum because of Python 2.7 compatibility
+# We don't use enum.Enum because of Python 2.7 compatibility.
 class UnicodeFiles(object):
-    # ReadMe does not contain any unicode data, we
+    # ReadMe does not contain any Unicode data, we
     # only use it to extract versions.
     README = "ReadMe.txt"
 
@@ -57,11 +57,15 @@ class UnicodeFiles(object):
     UNICODE_DATA = "UnicodeData.txt"
 
 
-UnicodeFiles.ALL_FILES = tuple(
-    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+# The order doesn't really matter (Python < 3.6 won't preserve it),
+# we only want to aggregate all the file names.
+ALL_UNICODE_FILES = tuple(
+    value for name, value in UnicodeFiles.__dict__.items()
     if not name.startswith("_")
 )
 
+assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
+
 # The directory this file is located in.
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 
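# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): iterating over
# `UnicodeFiles.__dict__` only sees attributes defined directly on the class,
# whereas the old `dir(UnicodeFiles)` would also report inherited dunder names
# and required a `getattr` call per name.
class Example(object):
    A = "a.txt"
    B = "b.txt"

files = tuple(
    value for name, value in Example.__dict__.items()
    if not name.startswith("_")
)
assert sorted(files) == ["a.txt", "b.txt"]
# ---------------------------------------------------------------------------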
@@ -97,18 +101,17 @@ class UnicodeFiles(object):
 
 # This is the (inclusive) range of surrogate codepoints.
 # These are not valid Rust characters.
-# - they are not valid Rust characters
 SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
 
 UnicodeData = namedtuple(
     "UnicodeData", (
-        # conversions:
+        # Conversions:
         "to_upper", "to_lower", "to_title",
 
-        # decompositions: canonical decompositions, compatibility decomp
+        # Decompositions: canonical decompositions, compatibility decomp
         "canon_decomp", "compat_decomp",
 
-        # grouped: general categories and combining characters
+        # Grouped: general categories and combining characters
         "general_categories", "combines",
     )
 )
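# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): a range check in
# the spirit of the script's `is_surrogate` helper (defined further down),
# using the inclusive bounds from SURROGATE_CODEPOINTS_RANGE.
def is_surrogate_sketch(n):
    lo, hi = (0xd800, 0xdfff)
    return lo <= n <= hi

assert is_surrogate_sketch(0xd800)
assert not is_surrogate_sketch(0xe000)
# ---------------------------------------------------------------------------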
@@ -136,10 +139,10 @@ def fetch_files(version=None):
         return have_version
 
     if version:
-        # check if the desired version exists on the server
+        # Check if the desired version exists on the server.
         get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
     else:
-        # extract the latest version
+        # Extract the latest version.
         get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
 
     readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):
 
     download_dir = get_unicode_dir(unicode_version)
     if not os.path.exists(download_dir):
-        # for 2.7 compat, we don't use exist_ok=True
+        # For 2.7 compat, we don't use `exist_ok=True`.
         os.makedirs(download_dir)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = get_unicode_file_path(unicode_version, filename)
 
         if os.path.exists(file_path):
-            # assume file on the server didn't change if it's been saved before
+            # Assume file on the server didn't change if it's been saved before.
             continue
 
         if filename == UnicodeFiles.README:
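# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): another common
# Python 2 compatible stand-in for `os.makedirs(path, exist_ok=True)` is to
# swallow the "already exists" error instead of checking up front, which also
# avoids a race between the check and the creation.
import errno
import os

def makedirs_compat(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
# ---------------------------------------------------------------------------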
@@ -178,15 +181,16 @@ def check_stored_version(version):
     # type: (Optional[str]) -> Optional[UnicodeVersion]
     """
     Given desired Unicode version, return the version
-    if stored files are all present, and None otherwise.
+    if stored files are all present, and `None` otherwise.
     """
     if not version:
-        # should always check latest version
+        # If no desired version is specified, we should check what the latest
+        # version is, skipping stored-version checks.
         return None
 
     fetch_dir = os.path.join(FETCH_DIR, version)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = os.path.join(fetch_dir, filename)
 
         if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
 def parse_readme_unicode_version(readme_content):
     # type: (str) -> UnicodeVersion
     """
-    Parse the Unicode version contained in their ReadMe.txt file.
+    Parse the Unicode version contained in their `ReadMe.txt` file.
     """
-    # "raw string" is necessary for \d not being treated as escape char
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # A "raw string" is necessary for `\d` not to be treated as an escape char
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
     groups = re.search(pattern, readme_content).groups()
 
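# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): the pattern above
# applied to a plausible ReadMe.txt line; the three captured groups map onto
# a (major, minor, micro) version triple.
import re

sample = "standard, for Version 12.0.0 of the Unicode Standard."
groups = re.search(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode", sample).groups()
assert groups == ("12", "0", "0")
# ---------------------------------------------------------------------------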
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
 def get_unicode_dir(unicode_version):
     # type: (UnicodeVersion) -> str
     """
-    Indicate where the unicode data files should be stored.
+    Indicate in which parent dir the Unicode data files should be stored.
 
     This returns a full, absolute path.
     """
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
 def get_unicode_file_path(unicode_version, filename):
     # type: (UnicodeVersion, str) -> str
     """
-    Indicate where the unicode data file should be stored.
+    Indicate where the Unicode data file should be stored.
     """
     return os.path.join(get_unicode_dir(unicode_version), filename)
 
@@ -239,22 +243,22 @@ def is_surrogate(n):
 def load_unicode_data(file_path):
     # type: (str) -> UnicodeData
     """
-    Load main unicode data.
+    Load main Unicode data.
     """
-    # conversions
+    # Conversions
     to_lower = {}  # type: Dict[int, Tuple[int, int, int]]
     to_upper = {}  # type: Dict[int, Tuple[int, int, int]]
     to_title = {}  # type: Dict[int, Tuple[int, int, int]]
 
-    # decompositions
+    # Decompositions
     compat_decomp = {}  # type: Dict[int, List[int]]
     canon_decomp = {}  # type: Dict[int, List[int]]
 
-    # combining characters
+    # Combining characters
     # FIXME: combines are not used
     combines = defaultdict(set)  # type: Dict[str, Set[int]]
 
-    # categories
+    # Categories
     general_categories = defaultdict(set)  # type: Dict[str, Set[int]]
     category_assigned_codepoints = set()  # type: Set[int]
 
@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
          decomp, deci, digit, num, mirror,
          old, iso, upcase, lowcase, titlecase) = data
 
-        # generate char to char direct common and simple conversions
-        # uppercase to lowercase
+        # Generate char to char direct common and simple conversions:
+
+        # Uppercase to lowercase
         if lowcase != "" and code_org != lowcase:
             to_lower[code] = (int(lowcase, 16), 0, 0)
 
-        # lowercase to uppercase
+        # Lowercase to uppercase
         if upcase != "" and code_org != upcase:
             to_upper[code] = (int(upcase, 16), 0, 0)
 
-        # title case
+        # Title case
         if titlecase.strip() != "" and code_org != titlecase:
             to_title[code] = (int(titlecase, 16), 0, 0)
 
-        # store decomposition, if given
+        # Store decomposition, if given
         if decomp:
             decompositions = decomp.split()[1:]
             decomp_code_points = [int(i, 16) for i in decompositions]
 
             if decomp.startswith("<"):
-                # compatibility decomposition
+                # Compatibility decomposition
                 compat_decomp[code] = decomp_code_points
             else:
-                # canonical decomposition
+                # Canonical decomposition
                 canon_decomp[code] = decomp_code_points
 
-        # place letter in categories as appropriate
+        # Place letter in categories as appropriate.
         for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
             general_categories[cat].add(code)
             category_assigned_codepoints.add(code)
 
-        # record combining class, if any
+        # Record combining class, if any.
         if combine != "0":
             combines[combine].add(code)
 
-    # generate Not_Assigned from Assigned
+    # Generate Not_Assigned from Assigned.
     general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
 
     # Other contains Not_Assigned
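# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): the semicolon
# separated UnicodeData.txt record this loop unpacks, here for U+0041:
#
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
#
# Field 13 is the simple lowercase mapping, so to_lower ends up mapping
# 0x41 to (0x61, 0, 0).
line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
fields = line.split(";")
assert len(fields) == 15
assert int(fields[13], 16) == 0x61
# ---------------------------------------------------------------------------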
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
 def load_special_casing(file_path, unicode_data):
     # type: (str, UnicodeData) -> None
     """
-    Load special casing data and enrich given unicode data.
+    Load special casing data and enrich given Unicode data.
     """
     for line in fileinput.input(file_path):
         data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
     Load properties data and return in grouped form.
     """
     props = defaultdict(list)  # type: Dict[str, List[Tuple[int, int]]]
-    # "raw string" is necessary for \. and \w not to be treated as escape chars
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # A "raw string" is necessary for `\.` and `\w` not to be treated as escape
+    # chars (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
         groups = match.groups()
 
         if len(groups) == 2:
-            # re1 matched
+            # `re1` matched (2 groups).
             d_lo, prop = groups
             d_hi = d_lo
         else:
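# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): the two patterns
# above run against rows in the PropList.txt / DerivedCoreProperties.txt
# style; single codepoints match re1 (2 groups), ranges match re2 (3 groups).
import re

re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

assert re1.match("1E943         ; Lowercase").groups() == ("1E943", "Lowercase")
assert re2.match("0061..007A    ; Lowercase").groups() == ("0061", "007A", "Lowercase")
# ---------------------------------------------------------------------------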
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):
 
         props[prop].append((lo_value, hi_value))
 
-    # optimize if possible
+    # Optimize if possible.
     for prop in props:
         props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
 
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
     for i in range(len(raw_data) // chunk_size):
         data = raw_data[i * chunk_size : (i + 1) * chunk_size]
 
-        # postfix compression of child nodes (data chunks)
-        # (identical child nodes are shared)
+        # Postfix compression of child nodes (data chunks)
+        # (identical child nodes are shared).
 
-        # make a tuple out of the list so it's hashable
+        # Make a tuple out of the list so it's hashable.
         child = tuple(data)
         if child not in childmap:
             childmap[child] = len(childmap)
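# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): deduplicating
# chunks through a dict; each distinct chunk is stored once and referenced by
# index, which is what keeps the trie's child table small.
childmap = {}
child_data = []
for chunk in [(1, 2), (3, 4), (1, 2)]:
    if chunk not in childmap:
        childmap[chunk] = len(childmap)
        child_data.append(chunk)
assert childmap == {(1, 2): 0, (3, 4): 1}
# ---------------------------------------------------------------------------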
@@ -609,15 +614,15 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
     This yields string fragments that should be joined to produce
     the final string.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     chunk_size = 64
     rawdata = [False] * 0x110000
     for (lo, hi) in codepoint_ranges:
         for cp in range(lo, hi + 1):
             rawdata[cp] = True
 
-    # convert to bitmap chunks of chunk_size bits each
+    # Convert to bitmap chunks of `chunk_size` bits each.
     chunks = []
     for i in range(0x110000 // chunk_size):
         chunk = 0
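# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): packing 64
# booleans into one integer, with bit j set iff rawdata[j] is True.
chunk_size = 64
rawdata = [False] * chunk_size
rawdata[0] = rawdata[63] = True
chunk = 0
for j in range(chunk_size):
    if rawdata[j]:
        chunk |= 1 << j
assert chunk == (1 << 63) | 1
# ---------------------------------------------------------------------------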
@@ -679,9 +684,9 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
 def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
     # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
     """
-    Generate Rust code for SmallBoolTrie struct.
+    Generate Rust code for `SmallBoolTrie` struct.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
     n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
     unicode_version = fetch_files(args.version)
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
-    # all the writing happens entirely in memory, we only write to file
-    # once we have generated the file content (it's not very large, <1 MB)
+    # All the writing happens entirely in memory; we only write to a file
+    # once we have generated the file content (it's not very large, <1 MB).
     buf = StringIO()
     buf.write(PREAMBLE)
@@ -844,7 +849,7 @@ def main():
         {"White_Space", "Join_Control", "Noncharacter_Code_Point",
          "Pattern_White_Space"})
 
-    # category tables
+    # Category tables
     for (name, categories, category_subset) in (
         ("general_category", unicode_data.general_categories, ["N", "Cc"]),
         ("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():
 
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
 
-    # will overwrite the file if it exists
+    # Actually write out the file content.
+    # Will overwrite the file if it exists.
     with open(tables_rs_path, "w") as fd:
         fd.write(buf.getvalue())
 