Skip to content

Commit 56bfc44

Browse files
author
awu42
committed
Created method to correct title capitalization (pandas-dev#26941)
1 parent 3d95777 commit 56bfc44

File tree

2 files changed

+73
-108
lines changed

2 files changed

+73
-108
lines changed

ci/code_checks.sh

+1
Original file line numberDiff line numberDiff line change
@@ -344,4 +344,5 @@ if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then
344344
RET=$(($RET + $?)) ; echo $MSG "DONE"
345345
fi
346346

347+
347348
exit $RET

scripts/validate_rst_title_capitalization.py

+72-108
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,21 @@
11
#!/usr/bin/env python
2-
32
"""
4-
GH #29641
5-
6-
Collect the titles in the rst files and validate if they follow the proper
7-
capitalization convention.
3+
Validate that the titles in the rst files follow the proper capitalization convention.
84
9-
Prints the titles that do not follow the convention.
5+
Print the titles that do not follow the convention.
106
117
Usage::
128
./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
139
./scripts/validate_rst_title_capitalization.py doc/source/
1410
1511
"""
16-
1712
import argparse
1813
import sys
1914
import re
2015
import os
21-
from os import walk
22-
from typing import Generator, List
16+
from typing import Tuple, Generator, List
2317

2418

25-
# Keynames that would not follow capitalization convention
2619
CAPITALIZATION_EXCEPTIONS = {
2720
"pandas",
2821
"Python",
@@ -54,23 +47,48 @@
5447
"Docker",
5548
}
5649

57-
# Lowercase representation of CAPITALIZATION_EXCEPTIONS
58-
CAPITALIZATION_EXCEPTIONS_LOWER = {word.lower() for word in CAPITALIZATION_EXCEPTIONS}
50+
CAP_EXCEPTIONS_DICT = {
51+
word.lower(): word for word in CAPITALIZATION_EXCEPTIONS
52+
}
5953

60-
# Dictionary of bad titles that will be printed later along with line numbers
61-
# Key: Document Directory, Value: Pair(Bad Title, Line Number)
6254
bad_title_dict = {}
6355

64-
# Error Message:
6556
err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize"
6657

58+
def correct_title_capitalization(title: str) -> str:
    """
    Build the correctly capitalized version of a title.

    The first character of the title is upper-cased and the rest of the text
    is lower-cased. Every word that has an entry in ``CAP_EXCEPTIONS_DICT``
    (looked up by its lower-case form) is then rewritten to the spelling
    stored there. Inline link targets of the form ``<http://...>`` or
    ``<https://...>`` are copied through untouched, so a URL is never
    lower-cased or rewritten in the returned title.

    Parameters
    ----------
    title : str
        Heading string to correct

    Returns
    -------
    correct_title : str
        Correctly capitalized title
    """

    def restore_exception(match) -> str:
        # Words without an entry keep the case given by capitalize()/lower().
        word = match.group(0)
        return CAP_EXCEPTIONS_DICT.get(word.lower(), word)

    link_regex = r"<https?:\/\/.*[\r\n]*>"

    # The pattern has no capture groups, so re.split returns only the text
    # around the links: exactly one more text part than there are links.
    links = re.findall(link_regex, title)
    text_parts = re.split(link_regex, title)

    corrected_parts = []
    for position, text in enumerate(text_parts):
        # Only the very start of the title receives an upper-case letter.
        text = text.capitalize() if position == 0 else text.lower()
        # One pass over every run of word characters; the exception spelling
        # wins even over the capitalized first word (e.g. "Pandas" -> "pandas").
        corrected_parts.append(re.sub(r"\w+", restore_exception, text))

    # Re-assemble the title, putting each link back verbatim.
    correct_title = corrected_parts[0]
    for link, text in zip(links, corrected_parts[1:]):
        correct_title += link + text

    return correct_title
87+
88+
89+
def is_following_capitalization_convention(title: str) -> bool:
90+
"""
91+
Algorithm to determine if a title is capitalized correctly
7492
7593
Parameters
7694
----------
@@ -80,49 +98,19 @@ def is_following_capitalization_convention(title: str) -> bool:
8098
Returns
8199
-------
82100
bool
83-
True if capitalization is correct, False if not
101+
True if title capitalized correctly, False if not
84102
85103
"""
86104

87-
# Remove https link if present in heading
88-
title = re.sub(r"<https?:\/\/.*[\r\n]*>", "", title)
105+
correct_title = correct_title_capitalization(title)
89106

90-
# Split with delimiters comma, semicolon and space, parentheses, colon, slashes
91-
word_list = re.split(r"[;,-/():\s]\s*", title)
92-
93-
# Edge Case: First word is an empty string
94-
if len(word_list[0]) == 0:
107+
if (title != correct_title):
95108
return False
109+
else:
110+
return True
111+
96112

97-
# Dealing with the first word of the title
98-
if word_list[0] not in CAPITALIZATION_EXCEPTIONS:
99-
# word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
100-
if word_list[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
101-
return False
102-
# First letter of first word must be uppercase
103-
if not word_list[0][0].isupper():
104-
return False
105-
# Remaining letters of first word must not be uppercase
106-
for j in range(1, len(word_list[0])):
107-
if word_list[0][j].isupper():
108-
return False
109-
110-
# Remaining letters must not be uppercase letters
111-
for i in range(1, len(word_list)):
112-
if word_list[i] not in CAPITALIZATION_EXCEPTIONS:
113-
# word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization
114-
if word_list[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER:
115-
return False
116-
# Remaining letters must not be uppercase
117-
for j in range(len(word_list[i])):
118-
if word_list[i][j].isupper():
119-
return False
120-
121-
# Returning True if the heading follows the capitalization convention
122-
return True
123-
124-
125-
def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
113+
def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]:
126114
"""
127115
Algorithm to identify particular text that should be considered headings in an
128116
RST file
@@ -135,27 +123,19 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
135123
rst_file : str
136124
RST file to scan through for headings
137125
138-
Returns
126+
Yields
139127
-------
140-
title_list : List[str]
141-
A list of heading strings found in the document tree
128+
title : str
129+
A heading found in the rst file
142130
143-
line_number_list : List[int]
144-
The corresponding line numbers of the headings in title_list
131+
line_number : int
132+
The corresponding line number of the heading
145133
146134
"""
147135

148-
# title_list is the list of headings that is encountered in the doctree
149-
title_list: List[str] = []
150-
151-
# List of line numbers that corresponding headings in title_list can be found at
152-
line_number_list: List[int] = []
153-
154-
# Open and read the .rst file and store the string of data into lines
155136
with open(rst_file, "r") as file_obj:
156137
lines = file_obj.read().split("\n")
157138

158-
# Regular expressions that denote a title beforehand
159139
regex = {
160140
"*": r"^(?:\*{1})*$",
161141
"=": r"^(?:={1})*$",
@@ -166,26 +146,20 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]:
166146
'"': r'^(?:"{1})*$',
167147
}
168148

169-
# '*`_' markers are removed from original string text.
170149
table = str.maketrans("", "", "*`_")
171150

172-
# Loop through lines lines, appending if they are considered headings
173-
for lineno in range(1, len(lines)):
174-
if len(lines[lineno]) != 0 and len(lines[lineno - 1]) != 0:
151+
for line_no in range(1, len(lines)):
152+
if len(lines[line_no]) != 0 and len(lines[line_no - 1]) != 0:
175153
for key in regex:
176-
match = re.search(regex[key], lines[lineno])
154+
match = re.search(regex[key], lines[line_no])
177155
if match is not None:
178-
if lineno >= 2:
179-
if lines[lineno] == lines[lineno - 2]:
180-
if len(lines[lineno]) == len(lines[lineno - 1]):
181-
title_list.append(lines[lineno - 1].translate(table))
182-
line_number_list.append(lineno)
156+
if line_no >= 2:
157+
if lines[line_no] == lines[line_no - 2]:
158+
if len(lines[line_no]) == len(lines[line_no - 1]):
159+
yield lines[line_no - 1].translate(table), line_no
183160
break
184-
if len(lines[lineno]) >= len(lines[lineno - 1]):
185-
title_list.append(lines[lineno - 1].translate(table))
186-
line_number_list.append(lineno)
187-
188-
return title_list, line_number_list
161+
if len(lines[line_no]) >= len(lines[line_no - 1]):
162+
yield lines[line_no - 1].translate(table), line_no
189163

190164

191165
def fill_bad_title_dict(rst_file: str) -> None:
@@ -199,20 +173,15 @@ def fill_bad_title_dict(rst_file: str) -> None:
199173
200174
"""
201175

202-
# Ensure this file doesn't already have a bad_title_dict slot
203176
if rst_file in bad_title_dict:
204177
return
205178

206-
# Make a list of headings along with their line numbers
207-
title_list, line_number_list = findTitles(rst_file)
208-
209-
# Append the bad_title_dict if the capitalization convention not followed
210-
for i in range(len(title_list)):
211-
if not is_following_capitalization_convention(title_list[i]):
179+
for title, line_number in find_titles(rst_file):
180+
if not is_following_capitalization_convention(title):
212181
if rst_file not in bad_title_dict:
213-
bad_title_dict[rst_file] = [(title_list[i], line_number_list[i])]
182+
bad_title_dict[rst_file] = [(title, line_number)]
214183
else:
215-
bad_title_dict[rst_file].append((title_list[i], line_number_list[i]))
184+
bad_title_dict[rst_file].append((title, line_number))
216185

217186

218187
def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
@@ -232,7 +201,6 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
232201
233202
"""
234203

235-
# Loop through source_paths, recursively looking for .rst files
236204
for directory_address in source_paths:
237205
if not os.path.exists(directory_address):
238206
raise ValueError(
@@ -241,7 +209,7 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
241209
elif directory_address.endswith(".rst"):
242210
yield directory_address
243211
else:
244-
for (dirpath, dirnames, filenames) in walk(directory_address):
212+
for (dirpath, _, filenames) in os.walk(directory_address):
245213
for file in filenames:
246214
if file.endswith(".rst"):
247215
yield os.path.join(dirpath, file)
@@ -260,32 +228,28 @@ def main(source_paths: List[str], output_format: str) -> bool:
260228
261229
Returns
262230
-------
263-
is_failed : bool
231+
number_of_errors : int
264232
Number of capitalization errors that were printed, 0 if there are none
265233
266234
"""
267235

268-
is_failed: bool = False
236+
number_of_errors: int = 0
269237

270-
# Make a list of all RST files from command line directory list
271238
directory_list = find_rst_files(source_paths)
272239

273-
# Fill the bad_title_dict, which contains all incorrectly capitalized headings
274240
for filename in directory_list:
275241
fill_bad_title_dict(filename)
276242

277-
# Return an exit status of 0 if there are no bad titles in the dictionary
278-
if len(bad_title_dict) == 0:
279-
return is_failed
243+
if (len(bad_title_dict) == 0):
244+
return number_of_errors
280245

281-
# Print bad_title_dict Results
282-
is_failed = True
283246
for key in bad_title_dict:
284247
for line in bad_title_dict[key]:
285-
print(key + ":" + str(line[1]) + ": " + err_msg + ' "' + line[0] + '"')
248+
correct_title = correct_title_capitalization(line[0])
249+
print(f'{key}:{line[1]}:{err_msg} "{line[0]}" to "{correct_title}"')
250+
number_of_errors += 1
286251

287-
# Exit status of 0
288-
return is_failed
252+
return number_of_errors
289253

290254

291255
if __name__ == "__main__":
@@ -298,7 +262,7 @@ def main(source_paths: List[str], output_format: str) -> bool:
298262
parser.add_argument(
299263
"--format",
300264
"-f",
301-
default="{source_path}:{line_number}:{heading}:{msg}",
265+
default="{source_path}:{line_number}:{msg}:{heading}",
302266
help="Output format of incorrectly capitalized titles",
303267
)
304268

0 commit comments

Comments
 (0)