Commit 2b47a08

Address review remarks in unicode.py
1 parent 60ccf89 commit 2b47a08

1 file changed: +61, -55 lines

src/libcore/unicode/unicode.py

@@ -34,7 +34,7 @@
     from StringIO import StringIO
 
 try:
-    # completely optional type hinting
+    # Completely optional type hinting
     # (Python 2 compatible using comments,
     # see: https://mypy.readthedocs.io/en/latest/python2.html)
     # This is very helpful in typing-aware IDE like PyCharm.
@@ -43,9 +43,9 @@
     pass
 
 
-# we don't use enum.Enum because of Python 2.7 compatibility
+# We don't use enum.Enum because of Python 2.7 compatibility.
 class UnicodeFiles(object):
-    # ReadMe does not contain any unicode data, we
+    # ReadMe does not contain any Unicode data, we
     # only use it to extract versions.
     README = "ReadMe.txt"
 
@@ -57,11 +57,15 @@ class UnicodeFiles(object):
     UNICODE_DATA = "UnicodeData.txt"
 
 
-UnicodeFiles.ALL_FILES = tuple(
-    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+# The order doesn't really matter (Python < 3.6 won't preserve it),
+# we only want to aggregate all the file names.
+ALL_UNICODE_FILES = tuple(
+    value for name, value in UnicodeFiles.__dict__.items()
     if not name.startswith("_")
 )
 
+assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
+
 # The directory this file is located in.
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 
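An aside on the aggregation introduced above: iterating `UnicodeFiles.__dict__.items()` sees only attributes defined directly on the class (unlike `dir()`, which also walks base classes), and the underscore filter drops implicit entries such as `__module__` and `__doc__`. A minimal sketch of the same pattern, using a made-up `Files` class rather than the real `UnicodeFiles`:

    # Sketch of the class-attribute aggregation pattern used above.
    class Files(object):
        README = "ReadMe.txt"
        DATA = "Data.txt"

    ALL_FILES = tuple(
        value for name, value in Files.__dict__.items()
        if not name.startswith("_")  # skip __module__, __doc__, etc.
    )

    # Order is arbitrary on Python < 3.6, so compare as a set.
    assert set(ALL_FILES) == {"Data.txt", "ReadMe.txt"}
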
@@ -97,18 +101,17 @@ class UnicodeFiles(object):
 
 # This is the (inclusive) range of surrogate codepoints.
 # These are not valid Rust characters.
-# - they are not valid Rust characters
 SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
 
 UnicodeData = namedtuple(
     "UnicodeData", (
-        # conversions:
+        # Conversions:
         "to_upper", "to_lower", "to_title",
 
-        # decompositions: canonical decompositions, compatibility decomp
+        # Decompositions: canonical decompositions, compatibility decomp
         "canon_decomp", "compat_decomp",
 
-        # grouped: general categories and combining characters
+        # Grouped: general categories and combining characters
         "general_categories", "combines",
     )
 )
@@ -136,10 +139,10 @@ def fetch_files(version=None):
         return have_version
 
     if version:
-        # check if the desired version exists on the server
+        # Check if the desired version exists on the server.
         get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
     else:
-        # extract the latest version
+        # Extract the latest version.
         get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
 
     readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):
 
     download_dir = get_unicode_dir(unicode_version)
     if not os.path.exists(download_dir):
-        # for 2.7 compat, we don't use exist_ok=True
+        # For 2.7 compat, we don't use `exist_ok=True`.
         os.makedirs(download_dir)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = get_unicode_file_path(unicode_version, filename)
 
         if os.path.exists(file_path):
-            # assume file on the server didn't change if it's been saved before
+            # Assume file on the server didn't change if it's been saved before.
             continue
 
         if filename == UnicodeFiles.README:
@@ -178,15 +181,16 @@ def check_stored_version(version):
     # type: (Optional[str]) -> Optional[UnicodeVersion]
     """
     Given desired Unicode version, return the version
-    if stored files are all present, and None otherwise.
+    if stored files are all present, and `None` otherwise.
     """
     if not version:
-        # should always check latest version
+        # If no desired version specified, we should check what's the latest
+        # version, skipping stored version checks.
         return None
 
     fetch_dir = os.path.join(FETCH_DIR, version)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = os.path.join(fetch_dir, filename)
 
         if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
 def parse_readme_unicode_version(readme_content):
     # type: (str) -> UnicodeVersion
     """
-    Parse the Unicode version contained in their ReadMe.txt file.
+    Parse the Unicode version contained in their `ReadMe.txt` file.
     """
-    # "raw string" is necessary for \d not being treated as escape char
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # "Raw string" is necessary for \d not being treated as escape char
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
     groups = re.search(pattern, readme_content).groups()
 
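To illustrate the parse above: the raw string keeps `\d` from being treated as a string escape, and the three capture groups yield the version triple. A small sketch, where `sample` is a hypothetical excerpt in the style of the UCD `ReadMe.txt`, not its actual content:

    import re

    pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"

    # Hypothetical excerpt; not the actual ReadMe.txt content.
    sample = "This directory contains final data files for Version 12.1.0 of the Unicode Standard."

    major, minor, micro = map(int, re.search(pattern, sample).groups())
    assert (major, minor, micro) == (12, 1, 0)
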
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
 def get_unicode_dir(unicode_version):
     # type: (UnicodeVersion) -> str
     """
-    Indicate where the unicode data files should be stored.
+    Indicate in which parent dir the Unicode data files should be stored.
 
     This returns a full, absolute path.
     """
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
 def get_unicode_file_path(unicode_version, filename):
     # type: (UnicodeVersion, str) -> str
     """
-    Indicate where the unicode data file should be stored.
+    Indicate where the Unicode data file should be stored.
     """
     return os.path.join(get_unicode_dir(unicode_version), filename)
 
@@ -239,22 +243,22 @@ def is_surrogate(n):
 def load_unicode_data(file_path):
     # type: (str) -> UnicodeData
     """
-    Load main unicode data.
+    Load main Unicode data.
     """
-    # conversions
+    # Conversions
     to_lower = {}  # type: Dict[int, Tuple[int, int, int]]
     to_upper = {}  # type: Dict[int, Tuple[int, int, int]]
     to_title = {}  # type: Dict[int, Tuple[int, int, int]]
 
-    # decompositions
+    # Decompositions
     compat_decomp = {}  # type: Dict[int, List[int]]
     canon_decomp = {}  # type: Dict[int, List[int]]
 
-    # combining characters
+    # Combining characters
     # FIXME: combines are not used
     combines = defaultdict(set)  # type: Dict[str, Set[int]]
 
-    # categories
+    # Categories
     general_categories = defaultdict(set)  # type: Dict[str, Set[int]]
     category_assigned_codepoints = set()  # type: Set[int]
 
@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
          decomp, deci, digit, num, mirror,
          old, iso, upcase, lowcase, titlecase) = data
 
-        # generate char to char direct common and simple conversions
-        # uppercase to lowercase
+        # Generate char to char direct common and simple conversions:
+
+        # Uppercase to lowercase
         if lowcase != "" and code_org != lowcase:
             to_lower[code] = (int(lowcase, 16), 0, 0)
 
-        # lowercase to uppercase
+        # Lowercase to uppercase
         if upcase != "" and code_org != upcase:
             to_upper[code] = (int(upcase, 16), 0, 0)
 
-        # title case
+        # Title case
         if titlecase.strip() != "" and code_org != titlecase:
             to_title[code] = (int(titlecase, 16), 0, 0)
 
-        # store decomposition, if given
+        # Store decomposition, if given
         if decomp:
             decompositions = decomp.split()[1:]
             decomp_code_points = [int(i, 16) for i in decompositions]
 
             if decomp.startswith("<"):
-                # compatibility decomposition
+                # Compatibility decomposition
                 compat_decomp[code] = decomp_code_points
             else:
-                # canonical decomposition
+                # Canonical decomposition
                 canon_decomp[code] = decomp_code_points
 
-        # place letter in categories as appropriate
+        # Place letter in categories as appropriate.
         for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
             general_categories[cat].add(code)
             category_assigned_codepoints.add(code)
 
-        # record combining class, if any
+        # Record combining class, if any.
         if combine != "0":
             combines[combine].add(code)
 
-    # generate Not_Assigned from Assigned
+    # Generate Not_Assigned from Assigned.
     general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
 
     # Other contains Not_Assigned
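For context on the unpacking above, a sketch of how one `UnicodeData.txt` record feeds the conversion tables. The record is the standard 15-field, semicolon-separated format (shown here for U+0041; field 13 is the simple lowercase mapping):

    # One real-format UnicodeData.txt record: U+0041, LATIN CAPITAL LETTER A.
    line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
    data = line.split(";")
    assert len(data) == 15

    code_org, lowcase = data[0], data[13]
    code = int(code_org, 16)

    to_lower = {}
    if lowcase != "" and code_org != lowcase:
        to_lower[code] = (int(lowcase, 16), 0, 0)

    assert to_lower == {0x41: (0x61, 0, 0)}
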
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
 def load_special_casing(file_path, unicode_data):
     # type: (str, UnicodeData) -> None
     """
-    Load special casing data and enrich given unicode data.
+    Load special casing data and enrich given Unicode data.
     """
     for line in fileinput.input(file_path):
         data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
     Load properties data and return in grouped form.
     """
     props = defaultdict(list)  # type: Dict[str, List[Tuple[int, int]]]
-    # "raw string" is necessary for \. and \w not to be treated as escape chars
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
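As a quick illustration of what the two patterns accept (`re1` matches single codepoints, `re2` matches `lo..hi` ranges), here is a sketch with sample lines in the `PropList.txt` style; the exact lines are illustrative, not quoted from the file:

    import re

    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    # Hypothetical property lines in the UCD format.
    single = "1680          ; White_Space # Zs       OGHAM SPACE MARK"
    ranged = "0009..000D    ; White_Space # Cc   [5] <control-0009>.."

    assert re1.match(single).groups() == ("1680", "White_Space")
    assert re2.match(ranged).groups() == ("0009", "000D", "White_Space")
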
@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
         groups = match.groups()
 
         if len(groups) == 2:
-            # re1 matched
+            # `re1` matched (2 groups).
             d_lo, prop = groups
             d_hi = d_lo
         else:
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):
 
         props[prop].append((lo_value, hi_value))
 
-    # optimize if possible
+    # Optimize if possible.
     for prop in props:
         props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
 
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
     for i in range(len(raw_data) // chunk_size):
         data = raw_data[i * chunk_size : (i + 1) * chunk_size]
 
-        # postfix compression of child nodes (data chunks)
-        # (identical child nodes are shared)
+        # Postfix compression of child nodes (data chunks)
+        # (identical child nodes are shared).
 
-        # make a tuple out of the list so it's hashable
+        # Make a tuple out of the list so it's hashable.
         child = tuple(data)
         if child not in childmap:
             childmap[child] = len(childmap)
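The sharing described in the comments works like interning: `childmap` maps each distinct chunk to the index it was first assigned, so identical chunks are stored once and the parent level only repeats an index. A scaled-down sketch with toy data (not the real trie layout):

    # Toy data: chunks 0 and 1 are identical and should be shared.
    raw_data = [1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8]
    chunk_size = 4

    childmap = {}  # chunk contents -> index in the child table
    root = []      # one child index per chunk position
    for i in range(len(raw_data) // chunk_size):
        child = tuple(raw_data[i * chunk_size : (i + 1) * chunk_size])
        if child not in childmap:
            childmap[child] = len(childmap)
        root.append(childmap[child])

    assert root == [0, 0, 1]   # first two positions share child node 0
    assert len(childmap) == 2  # only two distinct chunks are stored
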
@@ -609,15 +614,15 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
     This yields string fragments that should be joined to produce
     the final string.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     chunk_size = 64
     rawdata = [False] * 0x110000
     for (lo, hi) in codepoint_ranges:
         for cp in range(lo, hi + 1):
             rawdata[cp] = True
 
-    # convert to bitmap chunks of chunk_size bits each
+    # Convert to bitmap chunks of `chunk_size` bits each.
     chunks = []
     for i in range(0x110000 // chunk_size):
         chunk = 0
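For intuition about the bitmap step, a reduced sketch: 8-bit chunks over a 16-entry table stand in for the real 64-bit chunks over 0x110000 codepoints. Bit `j` of chunk `i` records whether entry `i * chunk_size + j` is set:

    chunk_size = 8            # the real script uses 64
    table_size = 16           # the real script uses 0x110000
    rawdata = [False] * table_size
    for cp in (1, 3, 8):      # made-up "codepoints" in the property
        rawdata[cp] = True

    chunks = []
    for i in range(table_size // chunk_size):
        chunk = 0
        for j in range(chunk_size):
            if rawdata[i * chunk_size + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    assert chunks == [0b00001010, 0b00000001]
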
@@ -679,9 +684,9 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
 def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
     # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
     """
-    Generate Rust code for SmallBoolTrie struct.
+    Generate Rust code for `SmallBoolTrie` struct.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
     n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
     unicode_version = fetch_files(args.version)
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
-    # all the writing happens entirely in memory, we only write to file
-    # once we have generated the file content (it's not very large, <1 MB)
+    # All the writing happens entirely in memory, we only write to file
+    # once we have generated the file content (it's not very large, <1 MB).
     buf = StringIO()
     buf.write(PREAMBLE)
 
@@ -844,7 +849,7 @@ def main():
         {"White_Space", "Join_Control", "Noncharacter_Code_Point",
          "Pattern_White_Space"})
 
-    # category tables
+    # Category tables
     for (name, categories, category_subset) in (
         ("general_category", unicode_data.general_categories, ["N", "Cc"]),
         ("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():
 
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
 
-    # will overwrite the file if it exists
+    # Actually write out the file content.
+    # Will overwrite the file if it exists.
     with open(tables_rs_path, "w") as fd:
         fd.write(buf.getvalue())
 