Skip to content

Commit 3651a61

Browse files
rscgopherbot
authored andcommitted
go/doc/comment: add heuristics for common badly formatted comments
In a set of 55M Go doc comments drawn from the latest version of all public Go modules known to the module proxy in spring 2020, the current Go 1.19 gofmt reformats about 1.57M of them. Out of those 1.57M comments, inspection of random samples shows that around 5% of the changed comments contain unindented code snippets, multiline shell commands, or lists. For example: // Here is a greeting: // // func main() { // fmt.Println("hello, world") // } // Run this command: // // path/to/your/program -flag1=longargument1 \ // -flag2=longargument2 \ // -flag3 // There are three possibilities: // // - Unindented code snippets (or JSON objects) // in which the first and last line are unindented // but end in { and start with }, respectively. // - Unindented multiline shell commands // in which the lines end in \ // - Unindented lists, in which wrapped lines are indented. All three of these cases involve unindented lines next to indented lines that would according to the usual rules begin a pre block. Before this CL, they'd be reformatted to: // Here is a greeting: // // func main() { // // fmt.Println("hello, world") // // } // Run this command: // // path/to/your/program -flag1=longargument1 \ // // -flag2=longargument2 \ // -flag3 // There are three possibilities: // // - Unindented code snippets (or JSON objects) // // in which the first and last line are unindented // but end in { and start with }, respectively. // // - Unindented multiline shell commands // // in which the lines end in \ // // - Unindented lists, in which wrapped lines are indented. The fact that they are not already in canonical format gives us a signal that they might not mean what the usual rules would say. This CL takes advantage of that opening to apply a few heuristics to better handle these cases: 1. If an indented code block immediately follows (without a blank line) an unindented line ending in { or \, include the unindented line in the code block. 2. If an indented code block immediately precedes (without a blank line) an unindented line beginning with }, include the unindented line in the code block. 3. If an indented line immediately follows (without a blank line) an unindented line that starts with a list marker, assume this is an unindented list with a wrapped indented line, and treat all adjacent unindented lines starting with list markers as part of the list, stopping at any surrounding blank lines. This raises the fraction of “correctly” reformatted doc comments in the corpus from approximately 87% to approximately 93%. Change-Id: I7ac542eb085032d607a7caf3ba9020787b2978b5 Reviewed-on: https://go-review.googlesource.com/c/go/+/410360 Auto-Submit: Russ Cox <[email protected]> TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Roland Shoemaker <[email protected]> Reviewed-by: Alan Donovan <[email protected]> Run-TryBot: Russ Cox <[email protected]> Reviewed-by: Ian Lance Taylor <[email protected]>
1 parent 4c08260 commit 3651a61

File tree

7 files changed

+322
-99
lines changed

7 files changed

+322
-99
lines changed

src/go/doc/comment/parse.go

+196-98
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ func (d *parseDoc) lookupPkg(pkg string) (importPath string, ok bool) {
260260
}
261261

262262
func isStdPkg(path string) bool {
263-
// TODO(rsc): Use sort.Find.
263+
// TODO(rsc): Use sort.Find once we don't have to worry about
264+
// copying this code into older Go environments.
264265
i := sort.Search(len(stdPkgs), func(i int) bool { return stdPkgs[i] >= path })
265266
return i < len(stdPkgs) && stdPkgs[i] == path
266267
}
@@ -297,44 +298,27 @@ func (p *Parser) Parse(text string) *Doc {
297298

298299
// First pass: break into block structure and collect known links.
299300
// The text is all recorded as Plain for now.
300-
// TODO: Break into actual block structure.
301-
didHeading := false
302-
all := lines
303-
for len(lines) > 0 {
304-
line := lines[0]
305-
n := len(lines)
301+
var prev span
302+
for _, s := range parseSpans(lines) {
306303
var b Block
307-
308-
switch {
309-
case line == "":
310-
// emit nothing
311-
312-
case isList(line):
313-
prevWasBlank := len(lines) < len(all) && all[len(all)-len(lines)-1] == ""
314-
b, lines = d.list(lines, prevWasBlank)
315-
316-
case isIndented(line):
317-
b, lines = d.code(lines)
318-
319-
case (len(lines) == 1 || lines[1] == "") && !didHeading && isOldHeading(line, all, len(all)-n):
320-
b = d.oldHeading(line)
321-
didHeading = true
322-
323-
case (len(lines) == 1 || lines[1] == "") && isHeading(line):
324-
b = d.heading(line)
325-
didHeading = true
326-
304+
switch s.kind {
327305
default:
328-
b, lines = d.paragraph(lines)
329-
didHeading = false
306+
panic("go/doc/comment: internal error: unknown span kind")
307+
case spanList:
308+
b = d.list(lines[s.start:s.end], prev.end < s.start)
309+
case spanCode:
310+
b = d.code(lines[s.start:s.end])
311+
case spanOldHeading:
312+
b = d.oldHeading(lines[s.start])
313+
case spanHeading:
314+
b = d.heading(lines[s.start])
315+
case spanPara:
316+
b = d.paragraph(lines[s.start:s.end])
330317
}
331-
332318
if b != nil {
333319
d.Content = append(d.Content, b)
334320
}
335-
if len(lines) == n {
336-
lines = lines[1:]
337-
}
321+
prev = s
338322
}
339323

340324
// Second pass: interpret all the Plain text now that we know the links.
@@ -348,9 +332,172 @@ func (p *Parser) Parse(text string) *Doc {
348332
return d.Doc
349333
}
350334

335+
// A span represents a single span of comment lines (lines[start:end])
336+
// of an identified kind (code, heading, paragraph, and so on).
337+
type span struct {
338+
start int
339+
end int
340+
kind spanKind
341+
}
342+
343+
// A spanKind describes the kind of span.
344+
type spanKind int
345+
346+
const (
347+
_ spanKind = iota
348+
spanCode
349+
spanHeading
350+
spanList
351+
spanOldHeading
352+
spanPara
353+
)
354+
355+
func parseSpans(lines []string) []span {
356+
var spans []span
357+
358+
// The loop may process a line twice: once as unindented
359+
// and again forced indented. So the maximum expected
360+
// number of iterations is 2*len(lines). The repeating logic
361+
// can be subtle, though, and to protect against introduction
362+
// of infinite loops in future changes, we watch to see that
363+
// we are not looping too much. A panic is better than a
364+
// quiet infinite loop.
365+
watchdog := 2 * len(lines)
366+
367+
i := 0
368+
forceIndent := 0
369+
Spans:
370+
for {
371+
// Skip blank lines.
372+
for i < len(lines) && lines[i] == "" {
373+
i++
374+
}
375+
if i >= len(lines) {
376+
break
377+
}
378+
if watchdog--; watchdog < 0 {
379+
panic("go/doc/comment: internal error: not making progress")
380+
}
381+
382+
var kind spanKind
383+
start := i
384+
end := i
385+
if i < forceIndent || indented(lines[i]) {
386+
// Indented (or force indented).
387+
// Ends before next unindented. (Blank lines are OK.)
388+
// If this is an unindented list that we are heuristically treating as indented,
389+
// then accept unindented list item lines up to the first blank lines.
390+
// The heuristic is disabled at blank lines to contain its effect
391+
// to non-gofmt'ed sections of the comment.
392+
unindentedListOK := isList(lines[i]) && i < forceIndent
393+
i++
394+
for i < len(lines) && (lines[i] == "" || i < forceIndent || indented(lines[i]) || (unindentedListOK && isList(lines[i]))) {
395+
if lines[i] == "" {
396+
unindentedListOK = false
397+
}
398+
i++
399+
}
400+
401+
// Drop trailing blank lines.
402+
end = i
403+
for end > start && lines[end-1] == "" {
404+
end--
405+
}
406+
407+
// If indented lines are followed (without a blank line)
408+
// by an unindented line ending in a brace,
409+
// take that one line too. This fixes the common mistake
410+
// of pasting in something like
411+
//
412+
// func main() {
413+
// fmt.Println("hello, world")
414+
// }
415+
//
416+
// and forgetting to indent it.
417+
// The heuristic will never trigger on a gofmt'ed comment,
418+
// because any gofmt'ed code block or list would be
419+
// followed by a blank line or end of comment.
420+
if end < len(lines) && strings.HasPrefix(lines[end], "}") {
421+
end++
422+
}
423+
424+
if isList(lines[start]) {
425+
kind = spanList
426+
} else {
427+
kind = spanCode
428+
}
429+
} else {
430+
// Unindented. Ends at next blank or indented line.
431+
i++
432+
for i < len(lines) && lines[i] != "" && !indented(lines[i]) {
433+
i++
434+
}
435+
end = i
436+
437+
// If unindented lines are followed (without a blank line)
438+
// by an indented line that would start a code block,
439+
// check whether the final unindented lines
440+
// should be left for the indented section.
441+
// This can happen for the common mistakes of
442+
// unindented code or unindented lists.
443+
// The heuristic will never trigger on a gofmt'ed comment,
444+
// because any gofmt'ed code block would have a blank line
445+
// preceding it after the unindented lines.
446+
if i < len(lines) && lines[i] != "" && !isList(lines[i]) {
447+
switch {
448+
case isList(lines[i-1]):
449+
// If the final unindented line looks like a list item,
450+
// this may be the first indented line wrap of
451+
// a mistakenly unindented list.
452+
// Leave all the unindented list items.
453+
forceIndent = end
454+
end--
455+
for end > start && isList(lines[end-1]) {
456+
end--
457+
}
458+
459+
case strings.HasSuffix(lines[i-1], "{") || strings.HasSuffix(lines[i-1], `\`):
460+
// If the final unindented line ended in { or \
461+
// it is probably the start of a misindented code block.
462+
// Give the user a single line fix.
463+
// Often that's enough; if not, the user can fix the others themselves.
464+
forceIndent = end
465+
end--
466+
}
467+
468+
if start == end && forceIndent > start {
469+
i = start
470+
continue Spans
471+
}
472+
}
473+
474+
// Span is either paragraph or heading.
475+
if end-start == 1 && isHeading(lines[start]) {
476+
kind = spanHeading
477+
} else if end-start == 1 && isOldHeading(lines[start], lines, start) {
478+
kind = spanOldHeading
479+
} else {
480+
kind = spanPara
481+
}
482+
}
483+
484+
spans = append(spans, span{start, end, kind})
485+
i = end
486+
}
487+
488+
return spans
489+
}
490+
491+
// indented reports whether line is indented
492+
// (starts with a leading space or tab).
493+
func indented(line string) bool {
494+
return line != "" && (line[0] == ' ' || line[0] == '\t')
495+
}
496+
351497
// unindent removes any common space/tab prefix
352498
// from each line in lines, returning a copy of lines in which
353499
// those prefixes have been trimmed from each line.
500+
// It also replaces any lines containing only spaces with blank lines (empty strings).
354501
func unindent(lines []string) []string {
355502
// Trim leading and trailing blank lines.
356503
for len(lines) > 0 && isBlank(lines[0]) {
@@ -480,58 +627,16 @@ func (d *parseDoc) heading(line string) Block {
480627
return &Heading{Text: []Text{Plain(strings.TrimSpace(line[1:]))}}
481628
}
482629

483-
// code returns a code block built from the indented text
484-
// at the start of lines, along with the remainder of the lines.
485-
// If there is no indented text at the start, or if the indented
486-
// text consists only of empty lines, code returns a nil Block.
487-
func (d *parseDoc) code(lines []string) (b Block, rest []string) {
488-
lines, rest = indented(lines)
630+
// code returns a code block built from the lines.
631+
func (d *parseDoc) code(lines []string) *Code {
489632
body := unindent(lines)
490-
if len(body) == 0 {
491-
return nil, rest
492-
}
493633
body = append(body, "") // to get final \n from Join
494-
return &Code{Text: strings.Join(body, "\n")}, rest
495-
}
496-
497-
// isIndented reports whether the line is indented,
498-
// meaning it starts with a space or tab.
499-
func isIndented(line string) bool {
500-
return line != "" && (line[0] == ' ' || line[0] == '\t')
634+
return &Code{Text: strings.Join(body, "\n")}
501635
}
502636

503-
// indented splits lines into an initial indented section
504-
// and the remaining lines, returning the two halves.
505-
func indented(lines []string) (indented, rest []string) {
506-
// Blank lines mid-run are OK, but not at the end.
507-
i := 0
508-
for i < len(lines) && (isIndented(lines[i]) || lines[i] == "") {
509-
i++
510-
}
511-
for i > 0 && lines[i-1] == "" {
512-
i--
513-
}
514-
return lines[:i], lines[i:]
515-
}
516-
517-
// paragraph returns a paragraph block built from the
518-
// unindented text at the start of lines, along with the remainder of the lines.
519-
// If there is no unindented text at the start of lines,
520-
// then paragraph returns a nil Block.
521-
func (d *parseDoc) paragraph(lines []string) (b Block, rest []string) {
522-
// Paragraph is interrupted by any indented line,
523-
// which is either a list or a code block,
524-
// and of course by a blank line.
525-
// It is not interrupted by a # line - headings must stand alone.
526-
i := 0
527-
for i < len(lines) && lines[i] != "" && !isIndented(lines[i]) {
528-
i++
529-
}
530-
lines, rest = lines[:i], lines[i:]
531-
if len(lines) == 0 {
532-
return nil, rest
533-
}
534-
637+
// paragraph returns a paragraph block built from the lines.
638+
// If the lines are link definitions, paragraph adds them to d and returns nil.
639+
func (d *parseDoc) paragraph(lines []string) Block {
535640
// Is this a block of known links? Handle.
536641
var defs []*LinkDef
537642
for _, line := range lines {
@@ -547,10 +652,10 @@ func (d *parseDoc) paragraph(lines []string) (b Block, rest []string) {
547652
d.links[def.Text] = def
548653
}
549654
}
550-
return nil, rest
655+
return nil
551656
NoDefs:
552657

553-
return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}}, rest
658+
return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}}
554659
}
555660

556661
// parseLink parses a single link definition line:
@@ -581,14 +686,9 @@ func parseLink(line string) (*LinkDef, bool) {
581686
return &LinkDef{Text: text, URL: url}, true
582687
}
583688

584-
// list returns a list built from the indented text at the start of lines,
689+
// list returns a list built from the indented lines,
585690
// using forceBlankBefore as the value of the List's ForceBlankBefore field.
586-
// The caller is responsible for ensuring that the first line of lines
587-
// satisfies isList.
588-
// list returns the *List as a Block along with the remaining lines.
589-
func (d *parseDoc) list(lines []string, forceBlankBefore bool) (b Block, rest []string) {
590-
lines, rest = indented(lines)
591-
691+
func (d *parseDoc) list(lines []string, forceBlankBefore bool) *List {
592692
num, _, _ := listMarker(lines[0])
593693
var (
594694
list *List = &List{ForceBlankBefore: forceBlankBefore}
@@ -597,7 +697,7 @@ func (d *parseDoc) list(lines []string, forceBlankBefore bool) (b Block, rest []
597697
)
598698
flush := func() {
599699
if item != nil {
600-
if para, _ := d.paragraph(text); para != nil {
700+
if para := d.paragraph(text); para != nil {
601701
item.Content = append(item.Content, para)
602702
}
603703
}
@@ -622,17 +722,14 @@ func (d *parseDoc) list(lines []string, forceBlankBefore bool) (b Block, rest []
622722
text = append(text, strings.TrimSpace(line))
623723
}
624724
flush()
625-
return list, rest
725+
return list
626726
}
627727

628-
// listMarker parses the line as an indented line beginning with a list marker.
728+
// listMarker parses the line as beginning with a list marker.
629729
// If it can do that, it returns the numeric marker ("" for a bullet list),
630730
// the rest of the line, and ok == true.
631731
// Otherwise, it returns "", "", false.
632732
func listMarker(line string) (num, rest string, ok bool) {
633-
if !isIndented(line) {
634-
return "", "", false
635-
}
636733
line = strings.TrimSpace(line)
637734
if line == "" {
638735
return "", "", false
@@ -654,15 +751,16 @@ func listMarker(line string) (num, rest string, ok bool) {
654751
return "", "", false
655752
}
656753

657-
if !isIndented(rest) || strings.TrimSpace(rest) == "" {
754+
if !indented(rest) || strings.TrimSpace(rest) == "" {
658755
return "", "", false
659756
}
660757

661758
return num, rest, true
662759
}
663760

664761
// isList reports whether the line is the first line of a list,
665-
// meaning is indented and starts with a list marker.
762+
// meaning starts with a list marker after any indentation.
763+
// (The caller is responsible for checking the line is indented, as appropriate.)
666764
func isList(line string) bool {
667765
_, _, ok := listMarker(line)
668766
return ok

0 commit comments

Comments
 (0)