Created method to correct title capitalization (pandas-dev#26941)

awu42 · awu42 · commit 56bfc44a27bd · 2020-01-21T18:25:01.000-05:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -344,4 +344,5 @@ if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 fi
 
+
 exit $RET
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
@@ -1,28 +1,21 @@
 #!/usr/bin/env python
-
 """
-GH #29641
-
-Collect the titles in the rst files and validate if they follow the proper
-capitalization convention.
+Validate that the titles in the rst files follow the proper capitalization convention.
 
-Prints the titles that do not follow the convention.
+Print the titles that do not follow the convention.
 
 Usage::
 ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
 ./scripts/validate_rst_title_capitalization.py doc/source/
 
 """
-
 import argparse
 import sys
 import re
 import os
-from os import walk
-from typing import Generator, List
+from typing import Tuple, Generator, List
 
 
-# Keynames that would not follow capitalization convention
 CAPITALIZATION_EXCEPTIONS = {
     "pandas",
     "Python",
@@ -54,23 +47,48 @@
     "Docker",
 }
 
-# Lowercase representation of CAPITALIZATION_EXCEPTIONS
-CAPITALIZATION_EXCEPTIONS_LOWER = {word.lower() for word in CAPITALIZATION_EXCEPTIONS}
+CAP_EXCEPTIONS_DICT = {
+    word.lower(): word for word in CAPITALIZATION_EXCEPTIONS
+}
 
-# Dictionary of bad titles that will be printed later along with line numbers
-# Key: Document Directory, Value: Pair(Bad Title, Line Number)
 bad_title_dict = {}
 
-# Error Message:
 err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize"
 
+def correct_title_capitalization(title: str) -> str:
+    """
+    Return the given title rewritten with the correct capitalization
+
+    Parameters
+    ----------
+    title : str
+        Heading string to correct
+
+    Returns
+    -------
+    correct_title : str
+        Correctly capitalized title
 
-def is_following_capitalization_convention(title: str) -> bool:
     """
-    Algorithm to determine if a heading follows the capitalization convention
 
-    This method returns true if the title follows the convention
-    and false if it does not
+    correct_title : str = title.capitalize()
+
+    removed_https_title = re.sub(r"<https?:\/\/.*[\r\n]*>", "", correct_title)
+
+    word_list = re.split(r"\W", removed_https_title)
+
+    for word in word_list:
+        if word.lower() in CAP_EXCEPTIONS_DICT:
+            correct_title = re.sub(
+                r'\b' + word + r'\b', CAP_EXCEPTIONS_DICT[word.lower()], correct_title
+            )
+
+    return correct_title
+
+
+def is_following_capitalization_convention(title: str) -> bool:
+    """
+    Algorithm to determine if a title is capitalized correctly
 
     Parameters
     ----------
@@ -80,49 +98,19 @@ def is_following_capitalization_convention(title: str) -> bool:
     Returns
     -------
     bool
-        True if capitalization is correct, False if not
+        True if the title is capitalized correctly, False if not
 
     """
 
-    # Remove https link if present in heading
-    title = re.sub(r"<https?:\/\/.*[\r\n]*>", "", title)
+    correct_title = correct_title_capitalization(title)
 
-    # Split with delimiters comma, semicolon and space, parentheses, colon, slashes
-    word_list = re.split(r"[;,-/():\s]\s*", title)
-
-    # Edge Case: First word is an empty string
-    if len(word_list[0]) == 0:
+    if (title != correct_title):
         return False
+    else:
+        return True
+
 
-    # Dealing with the first word of the title
-    if word_list[0] not in CAPITALIZATION_EXCEPTIONS:
-        # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
-        if word_list[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
-            return False
-        # First letter of first word must be uppercase
-        if not word_list[0][0].isupper():
-            return False
-        # Remaining letters of first word must not be uppercase
-        for j in range(1, len(word_list[0])):
-            if word_list[0][j].isupper():
-                return False
-
-    # Remaining letters must not be uppercase letters
-    for i in range(1, len(word_list)):
-        if word_list[i] not in CAPITALIZATION_EXCEPTIONS:
-            # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
-            if word_list[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
-                return False
-            # Remaining letters must not be uppercase
-            for j in range(len(word_list[i])):
-                if word_list[i][j].isupper():
-                    return False
-
-    # Returning True if the heading follows the capitalization convention
-    return True
-
-
-def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
+def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]:
     """
     Algorithm to identify particular text that should be considered headings in an
     RST file
@@ -135,27 +123,19 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
     rst_file : str
         RST file to scan through for headings
 
-    Returns
+    Yields
     -------
-    title_list : List[str]
-        A list of heading strings found in the document tree
+    title : str
+        A heading found in the rst file
 
-    line_number_list : List[int]
-        The corresponding line numbers of the headings in title_list
+    line_number : int
+        The corresponding line number of the heading
 
     """
 
-    # title_list is the list of headings that is encountered in the doctree
-    title_list: List[str] = []
-
-    # List of line numbers that corresponding headings in title_list can be found at
-    line_number_list: List[int] = []
-
-    # Open and read the .rst file and store the string of data into lines
     with open(rst_file, "r") as file_obj:
         lines = file_obj.read().split("\n")
 
-    # Regular expressions that denote a title beforehand
     regex = {
         "*": r"^(?:\*{1})*$",
         "=": r"^(?:={1})*$",
@@ -166,26 +146,20 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
         '"': r'^(?:"{1})*$',
     }
 
-    # '*`_' markers are removed from original string text.
     table = str.maketrans("", "", "*`_")
 
-    # Loop through lines lines, appending if they are considered headings
-    for lineno in range(1, len(lines)):
-        if len(lines[lineno]) != 0 and len(lines[lineno - 1]) != 0:
+    for line_no in range(1, len(lines)):
+        if len(lines[line_no]) != 0 and len(lines[line_no - 1]) != 0:
             for key in regex:
-                match = re.search(regex[key], lines[lineno])
+                match = re.search(regex[key], lines[line_no])
                 if match is not None:
-                    if lineno >= 2:
-                        if lines[lineno] == lines[lineno - 2]:
-                            if len(lines[lineno]) == len(lines[lineno - 1]):
-                                title_list.append(lines[lineno - 1].translate(table))
-                                line_number_list.append(lineno)
+                    if line_no >= 2:
+                        if lines[line_no] == lines[line_no - 2]:
+                            if len(lines[line_no]) == len(lines[line_no - 1]):
+                                yield lines[line_no - 1].translate(table), line_no
                             break
-                    if len(lines[lineno]) >= len(lines[lineno - 1]):
-                        title_list.append(lines[lineno - 1].translate(table))
-                        line_number_list.append(lineno)
-
-    return title_list, line_number_list
+                    if len(lines[line_no]) >= len(lines[line_no - 1]):
+                        yield lines[line_no - 1].translate(table), line_no
 
 
 def fill_bad_title_dict(rst_file: str) -> None:
@@ -199,20 +173,15 @@ def fill_bad_title_dict(rst_file: str) -> None:
 
     """
 
-    # Ensure this file doesn't already have a bad_title_dict slot
     if rst_file in bad_title_dict:
         return
 
-    # Make a list of headings along with their line numbers
-    title_list, line_number_list = findTitles(rst_file)
-
-    # Append the bad_title_dict if the capitalization convention not followed
-    for i in range(len(title_list)):
-        if not is_following_capitalization_convention(title_list[i]):
+    for title, line_number in find_titles(rst_file):
+        if not is_following_capitalization_convention(title):
             if rst_file not in bad_title_dict:
-                bad_title_dict[rst_file] = [(title_list[i], line_number_list[i])]
+                bad_title_dict[rst_file] = [(title, line_number)]
             else:
-                bad_title_dict[rst_file].append((title_list[i], line_number_list[i]))
+                bad_title_dict[rst_file].append((title, line_number))
 
 
 def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
@@ -232,7 +201,6 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
 
     """
 
-    # Loop through source_paths, recursively looking for .rst files
     for directory_address in source_paths:
         if not os.path.exists(directory_address):
             raise ValueError(
@@ -241,7 +209,7 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
         elif directory_address.endswith(".rst"):
             yield directory_address
         else:
-            for (dirpath, dirnames, filenames) in walk(directory_address):
+            for (dirpath, _, filenames) in os.walk(directory_address):
                 for file in filenames:
                     if file.endswith(".rst"):
                         yield os.path.join(dirpath, file)
@@ -260,32 +228,28 @@ def main(source_paths: List[str], output_format: str) -> bool:
 
     Returns
     -------
-    is_failed : bool
+    number_of_errors : int
-        True if there are headings that are printed, False if not
+        Number of incorrectly capitalized headings that were printed
 
     """
 
-    is_failed: bool = False
+    number_of_errors: int = 0
 
-    # Make a list of all RST files from command line directory list
     directory_list = find_rst_files(source_paths)
 
-    # Fill the bad_title_dict, which contains all incorrectly capitalized headings
     for filename in directory_list:
         fill_bad_title_dict(filename)
 
-    # Return an exit status of 0 if there are no bad titles in the dictionary
-    if len(bad_title_dict) == 0:
-        return is_failed
+    if (len(bad_title_dict) == 0):
+        return number_of_errors
 
-    # Print bad_title_dict Results
-    is_failed = True
     for key in bad_title_dict:
         for line in bad_title_dict[key]:
-            print(key + ":" + str(line[1]) + ": " + err_msg + ' "' + line[0] + '"')
+            correct_title = correct_title_capitalization(line[0])
+            print(f'{key}:{line[1]}:{err_msg} "{line[0]}" to "{correct_title}"')
+            number_of_errors += 1
 
-    # Exit status of 0
-    return is_failed
+    return number_of_errors
 
 
 if __name__ == "__main__":
@@ -298,7 +262,7 @@ def main(source_paths: List[str], output_format: str) -> bool:
     parser.add_argument(
         "--format",
         "-f",
-        default="{source_path}:{line_number}:{heading}:{msg}",
+        default="{source_path}:{line_number}:{msg}:{heading}",
         help="Output format of incorrectly capitalized titles",
     )