@@ -34,18 +34,18 @@
 from StringIO import StringIO
 
 try:
-    # completely optional type hinting
+    # Completely optional type hinting
     # (Python 2 compatible using comments,
     # see: https://mypy.readthedocs.io/en/latest/python2.html)
    # This is very helpful in typing-aware IDE like PyCharm.
     from typing import Dict, Iterator, List, Optional, Set, Tuple
 except ImportError:
     pass
 
 
-# we don't use enum.Enum because of Python 2.7 compatibility
+# We don't use enum.Enum because of Python 2.7 compatibility.
 class UnicodeFiles(object):
-    # ReadMe does not contain any unicode data, we
+    # ReadMe does not contain any Unicode data, we
     # only use it to extract versions.
     README = "ReadMe.txt"
 
@@ -57,11 +57,15 @@ class UnicodeFiles(object):
     UNICODE_DATA = "UnicodeData.txt"
 
 
-UnicodeFiles.ALL_FILES = tuple(
-    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+# The order doesn't really matter (Python < 3.6 won't preserve it),
+# we only want to aggregate all the file names.
+ALL_UNICODE_FILES = tuple(
+    value for name, value in UnicodeFiles.__dict__.items()
     if not name.startswith("_")
 )
 
+assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
+
 # The directory this file is located in.
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 
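# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): iterating over
# `UnicodeFiles.__dict__` only sees attributes defined directly on the class,
# whereas the old `dir(UnicodeFiles)` would also report inherited dunder names
# and required a `getattr` call per name.
class Example(object):
    A = "a.txt"
    B = "b.txt"

files = tuple(
    value for name, value in Example.__dict__.items()
    if not name.startswith("_")
)
assert sorted(files) == ["a.txt", "b.txt"]
# ---------------------------------------------------------------------------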
@@ -97,18 +101,17 @@ class UnicodeFiles(object):
 
 # This is the (inclusive) range of surrogate codepoints.
 # These are not valid Rust characters.
-# - they are not valid Rust characters
 SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
 
 UnicodeData = namedtuple(
     "UnicodeData", (
-        # conversions:
+        # Conversions:
         "to_upper", "to_lower", "to_title",
 
-        # decompositions: canonical decompositions, compatibility decomp
+        # Decompositions: canonical decompositions, compatibility decomp
         "canon_decomp", "compat_decomp",
 
-        # grouped: general categories and combining characters
+        # Grouped: general categories and combining characters
         "general_categories", "combines",
     )
 )
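# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): a range check in
# the spirit of the script's `is_surrogate` helper (defined further down),
# using the inclusive bounds from SURROGATE_CODEPOINTS_RANGE.
def is_surrogate_sketch(n):
    lo, hi = (0xd800, 0xdfff)
    return lo <= n <= hi

assert is_surrogate_sketch(0xd800)
assert not is_surrogate_sketch(0xe000)
# ---------------------------------------------------------------------------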
@@ -136,10 +139,10 @@ def fetch_files(version=None):
         return have_version
 
     if version:
-        # check if the desired version exists on the server
+        # Check if the desired version exists on the server.
         get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
     else:
-        # extract the latest version
+        # Extract the latest version.
         get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
 
     readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):
 
     download_dir = get_unicode_dir(unicode_version)
     if not os.path.exists(download_dir):
-        # for 2.7 compat, we don't use exist_ok=True
+        # For 2.7 compat, we don't use `exist_ok=True`.
         os.makedirs(download_dir)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = get_unicode_file_path(unicode_version, filename)
 
         if os.path.exists(file_path):
-            # assume file on the server didn't change if it's been saved before
+            # Assume file on the server didn't change if it's been saved before.
             continue
 
         if filename == UnicodeFiles.README:
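# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): another common
# Python 2 compatible stand-in for `os.makedirs(path, exist_ok=True)` is to
# swallow the "already exists" error instead of checking up front, which also
# avoids a race between the check and the creation.
import errno
import os

def makedirs_compat(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
# ---------------------------------------------------------------------------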
@@ -178,15 +181,16 @@ def check_stored_version(version):
     # type: (Optional[str]) -> Optional[UnicodeVersion]
     """
     Given desired Unicode version, return the version
-    if stored files are all present, and None otherwise.
+    if stored files are all present, and `None` otherwise.
     """
     if not version:
-        # should always check latest version
+        # If no desired version is specified, we should check what the latest
+        # version is, skipping stored-version checks.
         return None
 
     fetch_dir = os.path.join(FETCH_DIR, version)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = os.path.join(fetch_dir, filename)
 
         if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
 def parse_readme_unicode_version(readme_content):
     # type: (str) -> UnicodeVersion
     """
-    Parse the Unicode version contained in their ReadMe.txt file.
+    Parse the Unicode version contained in their `ReadMe.txt` file.
     """
-    # "raw string" is necessary for \d not being treated as escape char
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # A "raw string" is necessary for `\d` not to be treated as an escape char
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
     groups = re.search(pattern, readme_content).groups()
 
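# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): the pattern above
# applied to a plausible ReadMe.txt line; the three captured groups map onto
# a (major, minor, micro) version triple.
import re

sample = "standard, for Version 12.0.0 of the Unicode Standard."
groups = re.search(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode", sample).groups()
assert groups == ("12", "0", "0")
# ---------------------------------------------------------------------------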
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
 def get_unicode_dir(unicode_version):
     # type: (UnicodeVersion) -> str
     """
-    Indicate where the unicode data files should be stored.
+    Indicate in which parent dir the Unicode data files should be stored.
 
     This returns a full, absolute path.
     """
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
 def get_unicode_file_path(unicode_version, filename):
     # type: (UnicodeVersion, str) -> str
     """
-    Indicate where the unicode data file should be stored.
+    Indicate where the Unicode data file should be stored.
     """
     return os.path.join(get_unicode_dir(unicode_version), filename)
 
@@ -239,22 +243,22 @@ def is_surrogate(n):
 def load_unicode_data(file_path):
     # type: (str) -> UnicodeData
     """
-    Load main unicode data.
+    Load main Unicode data.
     """
-    # conversions
+    # Conversions
     to_lower = {}  # type: Dict[int, Tuple[int, int, int]]
     to_upper = {}  # type: Dict[int, Tuple[int, int, int]]
     to_title = {}  # type: Dict[int, Tuple[int, int, int]]
 
-    # decompositions
+    # Decompositions
     compat_decomp = {}  # type: Dict[int, List[int]]
     canon_decomp = {}  # type: Dict[int, List[int]]
 
-    # combining characters
+    # Combining characters
     # FIXME: combines are not used
     combines = defaultdict(set)  # type: Dict[str, Set[int]]
 
-    # categories
+    # Categories
     general_categories = defaultdict(set)  # type: Dict[str, Set[int]]
     category_assigned_codepoints = set()  # type: Set[int]
 
@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
          decomp, deci, digit, num, mirror,
          old, iso, upcase, lowcase, titlecase) = data
 
-        # generate char to char direct common and simple conversions
-        # uppercase to lowercase
+        # Generate char to char direct common and simple conversions:
+
+        # Uppercase to lowercase
         if lowcase != "" and code_org != lowcase:
             to_lower[code] = (int(lowcase, 16), 0, 0)
 
-        # lowercase to uppercase
+        # Lowercase to uppercase
         if upcase != "" and code_org != upcase:
             to_upper[code] = (int(upcase, 16), 0, 0)
 
-        # title case
+        # Title case
         if titlecase.strip() != "" and code_org != titlecase:
             to_title[code] = (int(titlecase, 16), 0, 0)
 
-        # store decomposition, if given
+        # Store decomposition, if given
         if decomp:
             decompositions = decomp.split()[1:]
             decomp_code_points = [int(i, 16) for i in decompositions]
 
             if decomp.startswith("<"):
-                # compatibility decomposition
+                # Compatibility decomposition
                 compat_decomp[code] = decomp_code_points
             else:
-                # canonical decomposition
+                # Canonical decomposition
                 canon_decomp[code] = decomp_code_points
 
-        # place letter in categories as appropriate
+        # Place letter in categories as appropriate.
         for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
             general_categories[cat].add(code)
             category_assigned_codepoints.add(code)
 
-        # record combining class, if any
+        # Record combining class, if any.
         if combine != "0":
             combines[combine].add(code)
 
-    # generate Not_Assigned from Assigned
+    # Generate Not_Assigned from Assigned.
     general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
 
     # Other contains Not_Assigned
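# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): the semicolon
# separated UnicodeData.txt record this loop unpacks, here for U+0041:
#
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
#
# Field 13 is the simple lowercase mapping, so to_lower ends up mapping
# 0x41 to (0x61, 0, 0).
line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
fields = line.split(";")
assert len(fields) == 15
assert int(fields[13], 16) == 0x61
# ---------------------------------------------------------------------------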
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
 def load_special_casing(file_path, unicode_data):
     # type: (str, UnicodeData) -> None
     """
-    Load special casing data and enrich given unicode data.
+    Load special casing data and enrich given Unicode data.
     """
     for line in fileinput.input(file_path):
         data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
     Load properties data and return in grouped form.
     """
     props = defaultdict(list)  # type: Dict[str, List[Tuple[int, int]]]
-    # "raw string" is necessary for \. and \w not to be treated as escape chars
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # A "raw string" is necessary for `\.` and `\w` not to be treated as escape
+    # chars (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
         groups = match.groups()
 
         if len(groups) == 2:
-            # re1 matched
+            # `re1` matched (2 groups).
             d_lo, prop = groups
             d_hi = d_lo
         else:
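# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): the two patterns
# above run against rows in the PropList.txt / DerivedCoreProperties.txt
# style; single codepoints match re1 (2 groups), ranges match re2 (3 groups).
import re

re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

assert re1.match("1E943         ; Lowercase").groups() == ("1E943", "Lowercase")
assert re2.match("0061..007A    ; Lowercase").groups() == ("0061", "007A", "Lowercase")
# ---------------------------------------------------------------------------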
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):
 
         props[prop].append((lo_value, hi_value))
 
-    # optimize if possible
+    # Optimize if possible.
     for prop in props:
         props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
 
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
     for i in range(len(raw_data) // chunk_size):
         data = raw_data[i * chunk_size : (i + 1) * chunk_size]
 
-        # postfix compression of child nodes (data chunks)
-        # (identical child nodes are shared)
+        # Postfix compression of child nodes (data chunks)
+        # (identical child nodes are shared).
 
-        # make a tuple out of the list so it's hashable
+        # Make a tuple out of the list so it's hashable.
         child = tuple(data)
         if child not in childmap:
             childmap[child] = len(childmap)
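# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): deduplicating
# chunks through a dict; each distinct chunk is stored once and referenced by
# index, which is what keeps the trie's child table small.
childmap = {}
child_data = []
for chunk in [(1, 2), (3, 4), (1, 2)]:
    if chunk not in childmap:
        childmap[chunk] = len(childmap)
        child_data.append(chunk)
assert childmap == {(1, 2): 0, (3, 4): 1}
# ---------------------------------------------------------------------------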
@@ -609,15 +614,15 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
     This yields string fragments that should be joined to produce
     the final string.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     chunk_size = 64
     rawdata = [False] * 0x110000
     for (lo, hi) in codepoint_ranges:
         for cp in range(lo, hi + 1):
             rawdata[cp] = True
 
-    # convert to bitmap chunks of chunk_size bits each
+    # Convert to bitmap chunks of `chunk_size` bits each.
     chunks = []
     for i in range(0x110000 // chunk_size):
         chunk = 0
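# ---------------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the diff): packing 64
# booleans into one integer, with bit j set iff rawdata[j] is True.
chunk_size = 64
rawdata = [False] * chunk_size
rawdata[0] = rawdata[63] = True
chunk = 0
for j in range(chunk_size):
    if rawdata[j]:
        chunk |= 1 << j
assert chunk == (1 << 63) | 1
# ---------------------------------------------------------------------------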
@@ -679,9 +684,9 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
 def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
     # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
     """
-    Generate Rust code for SmallBoolTrie struct.
+    Generate Rust code for `SmallBoolTrie` struct.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
     n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
     unicode_version = fetch_files(args.version)
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
-    # all the writing happens entirely in memory, we only write to file
-    # once we have generated the file content (it's not very large, <1 MB)
+    # All the writing happens entirely in memory; we only write to a file
+    # once we have generated the file content (it's not very large, <1 MB).
     buf = StringIO()
     buf.write(PREAMBLE)
@@ -844,7 +849,7 @@ def main():
         {"White_Space", "Join_Control", "Noncharacter_Code_Point",
          "Pattern_White_Space"})
 
-    # category tables
+    # Category tables
     for (name, categories, category_subset) in (
         ("general_category", unicode_data.general_categories, ["N", "Cc"]),
         ("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():
 
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
 
-    # will overwrite the file if it exists
+    # Actually write out the file content.
+    # Will overwrite the file if it exists.
     with open(tables_rs_path, "w") as fd:
         fd.write(buf.getvalue())
 