Added comments

awu42 · awu42 · commit 4ceea5e4002f · 2020-01-17T16:42:02.000-05:00
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
@@ -9,8 +9,10 @@
 
 NOTE: Run from the root directory of pandas repository
 
-Example:
+Examples:
 ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
+./scripts/validate_rst_title_capitalization.py doc/source/index.rst doc/source/ecosystem.rst
+./scripts/validate_rst_title_capitalization.py doc/source/
 
 Files that cannot be validated: (code crashes when validating for some reason)
 doc/source/user_guide/io.rst
@@ -90,27 +92,27 @@ def __exit__(self, *_):
 cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst']
 
 # Error Message:
-errMessage = "Title capitalization formatted incorrectly. Manually format correctly"
+errMessage = "Heading capitalization formatted incorrectly. Please correctly capitalize"
 
 def followCapitalizationConvention(title: str) -> bool:
     '''
-    tonywu1999's algorithm to determine if a heading follows the capitalization convention
+    Algorithm to determine if a heading follows the capitalization convention
 
     This method returns true if the title follows the convention
     and false if it does not
 
     '''
 
-    # split with delimiters comma, semicolon and space, parentheses, colon
-    wordList = re.split(r'[;,():\s]\s*', title) # followed by any amount of extra whitespace.
+    # split with delimiters comma, semicolon and space, parentheses, colon, slashes
+    wordList = re.split(r'[;,/():\s]\s*', title) # followed by any amount of extra whitespace.
 
     # Edge Case: First word is an empty string
     if (len(wordList[0]) == 0):
         return False
 
     # Dealing with the first word of the title
     if wordList[0] not in CAPITALIZATION_EXCEPTIONS:
-        # word is not in keyNames but has different capitalization
+        # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
         if wordList[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
             return False
         # First letter of first word must be uppercase
@@ -124,21 +126,26 @@ def followCapitalizationConvention(title: str) -> bool:
     # Remaining letters must not be uppercase letters
     for i in range(1, len(wordList)):
         if wordList[i] not in CAPITALIZATION_EXCEPTIONS:
-            # word is not in keyNames but has different capitalization
+            # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
             if wordList[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
                 return False
             # Remaining letters must not be uppercase
             for j in range(len(wordList[i])):
                 if wordList[i][j].isupper():
                     return False
 
+    # Returning True if the heading follows the capitalization convention
     return True
 
 def findLineNumber(node: docutils.nodes) -> int:
     '''
     Recursive method that finds the line number in a document for a particular node
     in the doctree
 
+    Text nodes usually don't have any value for their "line" instance variable,
+    so instead, we recursively look through the parent nodes to eventually find the
+    correct line number, which I determined would be node.line - 1
+
     '''
     if (node.tagname == 'document'):
         return 1
@@ -152,32 +159,92 @@ def parseRST(rstFile: str) -> docutils.nodes.document:
     Method to parse through an rstFile and return a document tree
 
     '''
-    # Parse through rstFile
+    # Create rst Parser object
     parser = docutils.parsers.rst.Parser()
+
+    # Open and read the .rst file and store the string of data into input
     f = open(rstFile, "r")
     input = f.read()
+
+    # Set up default settings for the document tree
     settings = docutils.frontend.OptionParser(
         components=(docutils.parsers.rst.Parser,)
         ).get_default_values()
+
+    # Initialize an empty document tree with the default settings from above
     document = docutils.utils.new_document('Document', settings)
 
+    # Parse the input string into an RST document tree, suppressing any stdout from the parse method
     with suppress_stdout_stderr():
         parser.parse(input, document)
 
+    # Return the root node of the document tree
     return document
 
 def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[str], List[int], None]:
     '''
-    tonywu1999's algorithm to identify particular text nodes as headings
-    along with the text node's line number
+    Algorithm to identify particular text nodes as headings
+    along with the text node's line number.
+
+    The idea is that when we traverse through the text nodes, nodes whose
+    parents have a tagname of 'title' are definitely considered to be part
+    of headings.
+
+    However, the problem occurs when we encounter text that has been either
+    italicized, bolded, referenced, etc.  In these situations, the tagname of
+    the parent node could be one of the following: 'emphasis', 'strong', 'reference', 'literal',
+    stored in the 'listOfMarkers' set variable.  In this situation, the node's
+    grandparent would have the 'title' tagname instead.
+
+    Let's see an example that can cause a problem.  The heading provided will be
+    'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example,
+    the stars around pandas in the first string italicize the word.
+    However, the doctrees for the two headings would be represented as follows:
+
+          'Looking at *pandas* docs'                 'Looking at pandas docs'
+                    title                                     title
+                /     |      \                                  |
+            #text   emphasis  #text          VS               #text
+              |       |        |                                |
+     'Looking at'   #text    'docs'                    'Looking at pandas docs'
+                      |
+                    'pandas'
+
+    When iterating through the nodes, we first encounter the node: 'Looking at'.
+    However, this isn't the full line of the heading (Looking at pandas docs).
+    We're still missing 'pandas docs'. Hence, we must store this first word into
+    a variable (myText in my function) and append this string variable with more
+    words in case we encounter text that has a parent with tagname in listOfMarkers.
+    In this example, we have to go through two more nodes to get the full heading.
+
+    Meanwhile, when nothing has a parent with tagname in listOfMarkers, we only need to
+    access one node to find the 'Looking at pandas docs' text.
+
+    My algorithm adjusts for this pattern, iterating through nodes and
+    identifying when headings are complete.
 
     '''
-    myText = ""
-    markerGrandparent = ""
-    beforeMarker = False
-    titleList = []
-    lineNumberList = []
+
+    # Initialize an empty string.  myText will be used to construct headings and append into titleList
+    myText: str = ""
+
+    # A docutils.nodes object that stores a listOfMarkers text's grandparent node,
+    # which should have a tagname of title
+    markerGrandparent: docutils.nodes.Title
+
+    # True if the most recent node encountered had a parent with a listOfMarkers tagname
+    # and a grandparent with a tagname of title
+    beforeMarker: bool = False
+
+    # titleList is the list of headings that is encountered in the doctree
+    titleList: List[str] = []
+
+    # A list of line numbers that the corresponding headings in titleList can be found at
+    lineNumberList: List[int] = []
+
+    # Traverse through the nodes.Text in the document tree to construct headings
     for node in document.traverse(nodes.Text):
+        # Case 1: Encounter a node with a parent tagname of title
         if (node.parent.tagname == 'title'):
             if (beforeMarker and markerGrandparent == node.parent):
                 myText = myText + node.astext()
@@ -189,12 +256,14 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[
                 lineno = findLineNumber(node)
                 myText = node.astext()
                 beforeMarker = False
+        # Case 2: Encounter a node with parent tagname in listOfMarkers
         elif (node.parent.parent.tagname == 'title' and
             node.parent.tagname in listOfMarkers):
             lineno = findLineNumber(node)
             myText = myText + node.astext()
             beforeMarker = True
             markerGrandparent = node.parent.parent
+        # Case 3: Encounter a node with parent tagname from none of the above (Ex. 'paragraph' tagname)
         else:
             beforeMarker = False
             if (myText != ""):
@@ -203,10 +272,12 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[
                 myText = ""
                 lineno = 0
 
+    # Sometimes, there is a leftover string that hasn't been appended yet due to how the for loop works
     if (myText != ""):
         titleList.append(myText)
         lineNumberList.append(lineno)
 
+    # Return a list of the headings and a list of their corresponding line numbers
     return titleList, lineNumberList
 
 def fillBadTitleDictionary(rstFile: str) -> None:
@@ -245,16 +316,24 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]:
     creates a list of all of the .rst file directories that these paths contain
 
     '''
+    # List of .rst file paths
     f = []
+
+    # Loop through source_paths.  If address is a folder, recursively look through the folder for .rst files
     for directoryAddress in source_paths:
-        if (directoryAddress.endswith(".rst")):
+        if not os.path.exists(directoryAddress):
+            raise ValueError(
+                "Please enter a valid path, pointing to a valid file/directory."
+            )
+        elif (directoryAddress.endswith(".rst")):
             f.append(directoryAddress)
         else:
             for (dirpath, dirnames, filenames) in walk(directoryAddress):
                 for file in filenames:
                     if file.endswith(".rst"):
                         f.append(os.path.join(dirpath, file))
 
+    # Return the filled up list of .rst file paths
     return f
 
 def main(source_paths: List[str], output_format: str) -> bool:
@@ -275,11 +354,10 @@ def main(source_paths: List[str], output_format: str) -> bool:
         return False
 
     # Print badTitleDictionary Results
+    print()
     for key in badTitleDictionary:
-        print()
-        print(key)
         for titles in badTitleDictionary[key]:
-            print(titles)
+            print(key + ":" + str(titles[1]) + ": " + errMessage + " \"" + titles[0] + "\"")
 
     # Exit status of 1
     return True
@@ -294,6 +372,7 @@ def main(source_paths: List[str], output_format: str) -> bool:
 
     parser.add_argument(
         "--format",
+        "-f",
         default="{source_path}:{line_number}:{heading}:{msg}",
         help="Output format of incorrectly capitalized titles",
     )