9
9
10
10
NOTE: Run from the root directory of the pandas repository
11
11
12
- Example :
12
+ Examples :
13
13
./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
14
+ ./scripts/validate_rst_title_capitalization.py doc/source/index.rst doc/source/ecosystem.rst
15
+ ./scripts/validate_rst_title_capitalization.py doc/source/
14
16
15
17
Files that cannot be validated: (code crashes when validating for some reason)
16
18
doc/source/user_guide/io.rst
@@ -90,27 +92,27 @@ def __exit__(self, *_):
90
92
cannotValidate = ['doc/source/user_guide/io.rst' , 'doc/source/whatsnew/v0.17.1.rst' ]
91
93
92
94
# Error Message:
93
- errMessage = "Title capitalization formatted incorrectly. Manually format correctly"
95
+ errMessage = "Heading capitalization formatted incorrectly. Please correctly capitalize "
94
96
95
97
def followCapitalizationConvention (title : str ) -> bool :
96
98
'''
97
- tonywu1999's algorithm to determine if a heading follows the capitalization convention
99
+ Algorithm to determine if a heading follows the capitalization convention
98
100
99
101
This method returns true if the title follows the convention
100
102
and false if it does not
101
103
102
104
'''
103
105
104
- # split with delimiters comma, semicolon and space, parentheses, colon
105
- wordList = re .split (r'[;,():\s]\s*' , title ) # followed by any amount of extra whitespace.
106
+ # split with delimiters comma, semicolon and space, parentheses, colon, slashes
107
+ wordList = re .split (r'[;,/ ():\s]\s*' , title ) # followed by any amount of extra whitespace.
106
108
107
109
# Edge Case: First word is an empty string
108
110
if (len (wordList [0 ]) == 0 ):
109
111
return False
110
112
111
113
# Dealing with the first word of the title
112
114
if wordList [0 ] not in CAPITALIZATION_EXCEPTIONS :
113
- # word is not in keyNames but has different capitalization
115
+ # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
114
116
if wordList [0 ].lower () in CAPITALIZATION_EXCEPTIONS_LOWER :
115
117
return False
116
118
# First letter of first word must be uppercase
@@ -124,21 +126,26 @@ def followCapitalizationConvention(title: str) -> bool:
124
126
# Remaining words must not contain uppercase letters
125
127
for i in range (1 , len (wordList )):
126
128
if wordList [i ] not in CAPITALIZATION_EXCEPTIONS :
127
- # word is not in keyNames but has different capitalization
129
+ # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
128
130
if wordList [i ].lower () in CAPITALIZATION_EXCEPTIONS_LOWER :
129
131
return False
130
132
# Remaining letters must not be uppercase
131
133
for j in range (len (wordList [i ])):
132
134
if wordList [i ][j ].isupper ():
133
135
return False
134
136
137
+ # Returning True if the heading follows the capitalization convention
135
138
return True
136
139
137
140
def findLineNumber (node : docutils .nodes ) -> int :
138
141
'''
139
142
Recursive method that finds the line number in a document for a particular node
140
143
in the doctree
141
144
145
+ Text nodes usually don't have any value for their "line" instance variable,
146
+ so instead, we recursively look through the parent nodes to eventually find the
147
+ correct line number, which I determined would be node.line - 1
148
+
142
149
'''
143
150
if (node .tagname == 'document' ):
144
151
return 1
@@ -152,32 +159,92 @@ def parseRST(rstFile: str) -> docutils.nodes.document:
152
159
Method to parse through an rstFile and return a document tree
153
160
154
161
'''
155
- # Parse through rstFile
162
+ # Create rst Parser object
156
163
parser = docutils .parsers .rst .Parser ()
164
+
165
+ # Open and read the .rst file and store the string of data into input
157
166
f = open (rstFile , "r" )
158
167
input = f .read ()
168
+
169
+ # Set up default settings for the document tree
159
170
settings = docutils .frontend .OptionParser (
160
171
components = (docutils .parsers .rst .Parser ,)
161
172
).get_default_values ()
173
+
174
+ # Initialize an empty document tree with the default settings from above
162
175
document = docutils .utils .new_document ('Document' , settings )
163
176
177
+ # Parse the input string into an RST document tree, suppressing any stdout from the parse method
164
178
with suppress_stdout_stderr ():
165
179
parser .parse (input , document )
166
180
181
+ # Return the root node of the document tree
167
182
return document
168
183
169
184
def findBadTitlesInDoctree (document : docutils .nodes .document ) -> Generator [List [str ], List [int ], None ]:
170
185
'''
171
- tonywu1999's algorithm to identify particular text nodes as headings
172
- along with the text node's line number
186
+ Algorithm to identify particular text nodes as headings
187
+ along with the text node's line number.
188
+
189
+ The idea is that when we traverse through the text nodes, nodes whose
190
+ parents have a tagname of 'title' are definitely considered to be part
191
+ of headings.
192
+
193
+ However, the problem occurs when we encounter text that has been either
194
+ italicized, bolded, referenced, etc. In these situations, the tagname of
195
+ the parent node could be one of the following: 'emphasis', 'strong', 'reference', 'literal',
196
+ stored in the 'listOfMarkers' set variable. In this situation, the node's
197
+ grandparent would have the 'title' tagname instead.
198
+
199
+ Let's see an example that can cause a problem. The heading provided will be
200
+ 'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example,
201
+ the stars around pandas in the first string italicize the word.
202
+ However, the two headings would be represented by the following doctrees:
203
+
204
+ 'Looking at *pandas* docs' 'Looking at pandas docs'
205
+ title title
206
+ / | \ |
207
+ #text emphasis #text VS #text
208
+ | | | |
209
+ 'Looking at' #text 'docs' 'Looking at pandas docs'
210
+ |
211
+ 'pandas'
212
+
213
+ When iterating through the nodes, we first encounter the node: 'Looking at'.
214
+ However, this isn't the full line of the heading (Looking at pandas docs).
215
+ We're still missing 'pandas docs'. Hence, we must store this first piece of text into
216
+ a variable (myText in my function) and append this string variable with more
217
+ words in case we encounter text that has a parent with tagname in listOfMarkers.
218
+ In this example, we have to go through two more nodes to get the full heading.
219
+
220
+ Meanwhile, when nothing has a parent with tagname in listOfMarkers, we only need to
221
+ access one node to find the 'Looking at pandas docs' text.
222
+
223
+ My algorithm adjusts for this pattern, iterating through nodes and
224
+ identifying when headings are complete.
173
225
174
226
'''
175
- myText = ""
176
- markerGrandparent = ""
177
- beforeMarker = False
178
- titleList = []
179
- lineNumberList = []
227
+
228
+ # Initialize an empty string. myText will be used to construct headings and append into titleList
229
+ myText : str = ""
230
+
231
+ # A docutils.nodes object that stores a listOfMarkers text's grandparent node,
232
+ # which should have a tagname of title
233
+ markerGrandparent : docutils .nodes .Title
234
+
235
+ # True if the most recent node encountered had a parent with a listOfMarkers tagname
236
+ # and a grandparent with a tagname of title
237
+ beforeMarker : bool = False
238
+
239
+ # titleList is the list of headings that is encountered in the doctree
240
+ titleList : List [str ] = []
241
+
242
+ # A list of line numbers that the corresponding headings in titleList can be found at
243
+ lineNumberList : List [int ] = []
244
+
245
+ # Traverse through the nodes.Text in the document tree to construct headings
180
246
for node in document .traverse (nodes .Text ):
247
+ # Case 1: Encounter a node with a parent tagname of title
181
248
if (node .parent .tagname == 'title' ):
182
249
if (beforeMarker and markerGrandparent == node .parent ):
183
250
myText = myText + node .astext ()
@@ -189,12 +256,14 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[
189
256
lineno = findLineNumber (node )
190
257
myText = node .astext ()
191
258
beforeMarker = False
259
+ # Case 2: Encounter a node with parent tagname in listOfMarkers
192
260
elif (node .parent .parent .tagname == 'title' and
193
261
node .parent .tagname in listOfMarkers ):
194
262
lineno = findLineNumber (node )
195
263
myText = myText + node .astext ()
196
264
beforeMarker = True
197
265
markerGrandparent = node .parent .parent
266
+ # Case 3: Encounter a node with parent tagname from none of the above (Ex. 'paragraph' tagname)
198
267
else :
199
268
beforeMarker = False
200
269
if (myText != "" ):
@@ -203,10 +272,12 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[
203
272
myText = ""
204
273
lineno = 0
205
274
275
+ # Sometimes, there is a leftover string that hasn't been appended yet due to how the for loop works
206
276
if (myText != "" ):
207
277
titleList .append (myText )
208
278
lineNumberList .append (lineno )
209
279
280
+ # Return a list of the headings and a list of their corresponding line numbers
210
281
return titleList , lineNumberList
211
282
212
283
def fillBadTitleDictionary (rstFile : str ) -> None :
@@ -245,16 +316,24 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]:
245
316
creates a list of all of the .rst file directories that these paths contain
246
317
247
318
'''
319
+ # List of .rst file paths
248
320
f = []
321
+
322
+ # Loop through source_paths. If address is a folder, recursively look through the folder for .rst files
249
323
for directoryAddress in source_paths :
250
- if (directoryAddress .endswith (".rst" )):
324
+ if not os .path .exists (directoryAddress ):
325
+ raise ValueError (
326
+ "Please enter a valid path, pointing to a valid file/directory."
327
+ )
328
+ elif (directoryAddress .endswith (".rst" )):
251
329
f .append (directoryAddress )
252
330
else :
253
331
for (dirpath , dirnames , filenames ) in walk (directoryAddress ):
254
332
for file in filenames :
255
333
if file .endswith (".rst" ):
256
334
f .append (os .path .join (dirpath , file ))
257
335
336
+ # Return the filled up list of .rst file paths
258
337
return f
259
338
260
339
def main (source_paths : List [str ], output_format : str ) -> bool :
@@ -275,11 +354,10 @@ def main(source_paths: List[str], output_format: str) -> bool:
275
354
return False
276
355
277
356
# Print badTitleDictionary Results
357
+ print ()
278
358
for key in badTitleDictionary :
279
- print ()
280
- print (key )
281
359
for titles in badTitleDictionary [key ]:
282
- print (titles )
360
+ print (key + ":" + str ( titles [ 1 ]) + ": " + errMessage + " \" " + titles [ 0 ] + " \" " )
283
361
284
362
# Exit status of 1
285
363
return True
@@ -294,6 +372,7 @@ def main(source_paths: List[str], output_format: str) -> bool:
294
372
295
373
parser .add_argument (
296
374
"--format" ,
375
+ "-f" ,
297
376
default = "{source_path}:{line_number}:{heading}:{msg}" ,
298
377
help = "Output format of incorrectly capitalized titles" ,
299
378
)
0 commit comments