Skip to content

Commit 4ceea5e

Browse files
author
awu42
committed
Added comments
1 parent 2d3cfe7 commit 4ceea5e

File tree

1 file changed

+98
-19
lines changed

1 file changed

+98
-19
lines changed

scripts/validate_rst_title_capitalization.py

+98-19
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99
1010
NOTE: Run from the root directory of pandas repository
1111
12-
Example:
12+
Examples:
1313
./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
14+
./scripts/validate_rst_title_capitalization.py doc/source/index.rst doc/source/ecosystem.rst
15+
./scripts/validate_rst_title_capitalization.py doc/source/
1416
1517
Files that cannot be validated (the script crashes while validating them; cause not yet investigated):
1618
doc/source/user_guide/io.rst
@@ -90,27 +92,27 @@ def __exit__(self, *_):
9092
cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst']
9193

9294
# Error Message:
93-
errMessage = "Title capitalization formatted incorrectly. Manually format correctly"
95+
errMessage = "Heading capitalization formatted incorrectly. Please correctly capitalize"
9496

9597
def followCapitalizationConvention(title: str) -> bool:
9698
'''
97-
tonywu1999's algorithm to determine if a heading follows the capitalization convention
99+
Algorithm to determine if a heading follows the capitalization convention
98100
99101
This method returns true if the title follows the convention
100102
and false if it does not
101103
102104
'''
103105

104-
# split with delimiters comma, semicolon and space, parentheses, colon
105-
wordList = re.split(r'[;,():\s]\s*', title) # followed by any amount of extra whitespace.
106+
# split with delimiters comma, semicolon and space, parentheses, colon, slashes
107+
wordList = re.split(r'[;,/():\s]\s*', title) # followed by any amount of extra whitespace.
106108

107109
# Edge Case: First word is an empty string
108110
if (len(wordList[0]) == 0):
109111
return False
110112

111113
# Dealing with the first word of the title
112114
if wordList[0] not in CAPITALIZATION_EXCEPTIONS:
113-
# word is not in keyNames but has different capitalization
115+
# word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
114116
if wordList[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
115117
return False
116118
# First letter of first word must be uppercase
@@ -124,21 +126,26 @@ def followCapitalizationConvention(title: str) -> bool:
124126
# Remaining words must not contain uppercase letters, unless the word is a listed exception
125127
for i in range(1, len(wordList)):
126128
if wordList[i] not in CAPITALIZATION_EXCEPTIONS:
127-
# word is not in keyNames but has different capitalization
129+
# word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
128130
if wordList[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
129131
return False
130132
# Remaining letters must not be uppercase
131133
for j in range(len(wordList[i])):
132134
if wordList[i][j].isupper():
133135
return False
134136

137+
# Returning True if the heading follows the capitalization convention
135138
return True
136139

137140
def findLineNumber(node: docutils.nodes) -> int:
138141
'''
139142
Recursive method that finds the line number in a document for a particular node
140143
in the doctree
141144
145+
Text nodes usually don't have any value for their "line" instance variable,
146+
so instead, we recursively look through the parent nodes to eventually find the
147+
correct line number, which I determined would be node.line - 1
148+
142149
'''
143150
if (node.tagname == 'document'):
144151
return 1
@@ -152,32 +159,92 @@ def parseRST(rstFile: str) -> docutils.nodes.document:
152159
Method to parse through an rstFile and return a document tree
153160
154161
'''
155-
# Parse through rstFile
162+
# Create rst Parser object
156163
parser = docutils.parsers.rst.Parser()
164+
165+
# Open and read the .rst file and store the string of data into input
157166
f = open(rstFile, "r")
158167
input = f.read()
168+
169+
# Set up default settings for the document tree
159170
settings = docutils.frontend.OptionParser(
160171
components=(docutils.parsers.rst.Parser,)
161172
).get_default_values()
173+
174+
# Initialize an empty document tree with the default settings from above
162175
document = docutils.utils.new_document('Document', settings)
163176

177+
# Parse the input string into an RST document tree, suppressing any stdout from the parse method
164178
with suppress_stdout_stderr():
165179
parser.parse(input, document)
166180

181+
# Return the root node of the document tree
167182
return document
168183

169184
def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[str], List[int], None]:
170185
'''
171-
tonywu1999's algorithm to identify particular text nodes as headings
172-
along with the text node's line number
186+
Algorithm to identify particular text nodes as headings
187+
along with the text node's line number.
188+
189+
The idea is that when we traverse through the text nodes, nodes whose
190+
parents have a tagname of 'title' are definitely considered to be part
191+
of headings.
192+
193+
However, the problem occurs when we encounter text that has been either
194+
italicized, bolded, referenced, etc. In these situations, the tagname of
195+
the parent node could be one of the following: 'emphasis', 'strong', 'reference', 'literal',
196+
stored in the 'listOfMarkers' set variable. In this situation, the node's
197+
grandparent would have the 'title' tagname instead.
198+
199+
Let's see an example that can cause a problem. The heading provided will be
200+
'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example,
201+
the stars around pandas in the first string italicize the word.
202+
However, the doctree would represent the two headings as follows:
203+
204+
'Looking at *pandas* docs'                'Looking at pandas docs'
205+
               title                                title
206+
          /      |     \                              |
207+
    #text    emphasis    #text     VS               #text
208+
      |          |         |                          |
209+
'Looking at'   #text    'docs'            'Looking at pandas docs'
210+
                 |
211+
             'pandas'
212+
213+
When iterating through the nodes, we first encounter the node: 'Looking at'.
214+
However, this isn't the full line of the heading (Looking at pandas docs).
215+
We're still missing 'pandas docs'. Hence, we must store this first fragment into
216+
a variable (myText in my function) and append this string variable with more
217+
words in case we encounter text that has a parent with tagname in listOfMarkers.
218+
In this example, we have to go through two more nodes to get the full heading.
219+
220+
Meanwhile, when nothing has a parent with tagname in listOfMarkers, we only need to
221+
access one node to find the 'Looking at pandas docs' text.
222+
223+
My algorithm adjusts for this pattern, iterating through nodes and
224+
identifying when headings are complete.
173225
174226
'''
175-
myText = ""
176-
markerGrandparent = ""
177-
beforeMarker = False
178-
titleList = []
179-
lineNumberList = []
227+
228+
# Initialize an empty string. myText accumulates the text of the heading currently being built before it is appended to titleList
229+
myText: str = ""
230+
231+
# The grandparent node of the most recent text node whose parent tagname is in listOfMarkers,
232+
# which should have a tagname of title
233+
markerGrandparent: docutils.nodes.Title
234+
235+
# True if the most recent node encountered had a parent with a listOfMarkers tagname
236+
# and a grandparent with a tagname of title
237+
beforeMarker: bool = False
238+
239+
# titleList is the list of headings that is encountered in the doctree
240+
titleList: List[str] = []
241+
242+
# A list of line numbers that the corresponding headings in titleList can be found at
243+
lineNumberList: List[int] = []
244+
245+
# Traverse through the nodes.Text in the document tree to construct headings
180246
for node in document.traverse(nodes.Text):
247+
# Case 1: Encounter a node with a parent tagname of title
181248
if (node.parent.tagname == 'title'):
182249
if (beforeMarker and markerGrandparent == node.parent):
183250
myText = myText + node.astext()
@@ -189,12 +256,14 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[
189256
lineno = findLineNumber(node)
190257
myText = node.astext()
191258
beforeMarker = False
259+
# Case 2: Encounter a node with parent tagname in listOfMarkers
192260
elif (node.parent.parent.tagname == 'title' and
193261
node.parent.tagname in listOfMarkers):
194262
lineno = findLineNumber(node)
195263
myText = myText + node.astext()
196264
beforeMarker = True
197265
markerGrandparent = node.parent.parent
266+
# Case 3: Encounter a node with parent tagname from none of the above (Ex. 'paragraph' tagname)
198267
else:
199268
beforeMarker = False
200269
if (myText != ""):
@@ -203,10 +272,12 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[
203272
myText = ""
204273
lineno = 0
205274

275+
# If the loop ends while a heading is still being built, that heading has not been appended yet, so append it here
206276
if (myText != ""):
207277
titleList.append(myText)
208278
lineNumberList.append(lineno)
209279

280+
# Return a list of the headings and a list of their corresponding line numbers
210281
return titleList, lineNumberList
211282

212283
def fillBadTitleDictionary(rstFile: str) -> None:
@@ -245,16 +316,24 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]:
245316
creates a list of all of the .rst file directories that these paths contain
246317
247318
'''
319+
# List of .rst file paths
248320
f = []
321+
322+
# Loop through source_paths. If address is a folder, recursively look through the folder for .rst files
249323
for directoryAddress in source_paths:
250-
if (directoryAddress.endswith(".rst")):
324+
if not os.path.exists(directoryAddress):
325+
raise ValueError(
326+
"Please enter a valid path, pointing to a valid file/directory."
327+
)
328+
elif (directoryAddress.endswith(".rst")):
251329
f.append(directoryAddress)
252330
else:
253331
for (dirpath, dirnames, filenames) in walk(directoryAddress):
254332
for file in filenames:
255333
if file.endswith(".rst"):
256334
f.append(os.path.join(dirpath, file))
257335

336+
# Return the filled up list of .rst file paths
258337
return f
259338

260339
def main(source_paths: List[str], output_format: str) -> bool:
@@ -275,11 +354,10 @@ def main(source_paths: List[str], output_format: str) -> bool:
275354
return False
276355

277356
# Print badTitleDictionary Results
357+
print()
278358
for key in badTitleDictionary:
279-
print()
280-
print(key)
281359
for titles in badTitleDictionary[key]:
282-
print(titles)
360+
print(key + ":" + str(titles[1]) + ": " + errMessage + " \"" + titles[0] + "\"")
283361

284362
# Exit status of 1
285363
return True
@@ -294,6 +372,7 @@ def main(source_paths: List[str], output_format: str) -> bool:
294372

295373
parser.add_argument(
296374
"--format",
375+
"-f",
297376
default="{source_path}:{line_number}:{heading}:{msg}",
298377
help="Output format of incorrectly capitalized titles",
299378
)

0 commit comments

Comments
 (0)