Skip to content

Commit 35b24a7

Browse files
authored
Merge pull request sourcegraph#61 from kwi-dk/master
Add support for complicated filenames and file copies
2 parents a768196 + 28094f6 commit 35b24a7

File tree

4 files changed

+317
-14
lines changed

4 files changed

+317
-14
lines changed

diff/diff_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,69 @@ func TestParseMultiFileDiffHeaders(t *testing.T) {
704704
},
705705
},
706706
},
707+
{
708+
filename: "complicated_filenames.diff",
709+
wantDiffs: []*FileDiff{
710+
{
711+
OrigName: "/dev/null",
712+
NewName: "b/new empty file with spaces",
713+
Extended: []string{
714+
"diff --git a/new empty file with spaces b/new empty file with spaces",
715+
"new file mode 100644",
716+
"index 0000000..e69de29",
717+
},
718+
},
719+
{
720+
OrigName: "/dev/null",
721+
NewName: "b/new file with text",
722+
Extended: []string{
723+
"diff --git a/new file with text b/new file with text",
724+
"new file mode 100644",
725+
"index 0000000..c3ed4be",
726+
},
727+
},
728+
{
729+
OrigName: "a/existing file with spaces",
730+
NewName: "b/new file with spaces",
731+
Extended: []string{
732+
"diff --git a/existing file with spaces b/new file with spaces",
733+
"similarity index 100%",
734+
"copy from existing file with spaces",
735+
"copy to new file with spaces",
736+
},
737+
},
738+
{
739+
OrigName: "a/existing file with spaces",
740+
NewName: "b/new, complicated\nfilenøme",
741+
Extended: []string{
742+
`diff --git a/existing file with spaces "b/new, complicated\nfilen\303\270me"`,
743+
"similarity index 100%",
744+
"copy from existing file with spaces",
745+
`copy to "new, complicated\nfilen\303\270me"`,
746+
},
747+
},
748+
{
749+
OrigName: "a/existing file with spaces",
750+
NewName: `b/new "complicated" filename`,
751+
Extended: []string{
752+
`diff --git a/existing file with spaces "b/new \"complicated\" filename"`,
753+
"similarity index 100%",
754+
"copy from existing file with spaces",
755+
`copy to "new \"complicated\" filename"`,
756+
},
757+
},
758+
{
759+
OrigName: `a/existing "complicated" filename`,
760+
NewName: "b/new, simpler filename",
761+
Extended: []string{
762+
`diff --git "a/existing \"complicated\" filename" b/new, simpler filename`,
763+
"similarity index 100%",
764+
`copy from "existing \"complicated\" filename"`,
765+
"copy to new, simpler filename",
766+
},
767+
},
768+
},
769+
},
707770
}
708771
for _, test := range tests {
709772
t.Run(test.filename, func(t *testing.T) {

diff/parse.go

Lines changed: 135 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,102 @@ func (r *FileDiffReader) ReadExtendedHeaders() ([]string, error) {
372372
}
373373
}
374374

375+
// readQuotedFilename extracts a quoted filename from the beginning of a string,
376+
// returning the unquoted filename and any remaining text after the filename.
377+
func readQuotedFilename(text string) (value string, remainder string, err error) {
378+
if text == "" || text[0] != '"' {
379+
return "", "", fmt.Errorf(`string must start with a '"': %s`, text)
380+
}
381+
382+
// The end quote is the first quote NOT preceeded by an uneven number of backslashes.
383+
numberOfBackslashes := 0
384+
for i, c := range text {
385+
if c == '"' && i > 0 && numberOfBackslashes%2 == 0 {
386+
value, err = strconv.Unquote(text[:i+1])
387+
remainder = text[i+1:]
388+
return
389+
} else if c == '\\' {
390+
numberOfBackslashes++
391+
} else {
392+
numberOfBackslashes = 0
393+
}
394+
}
395+
return "", "", fmt.Errorf(`end of string found while searching for '"': %s`, text)
396+
}
397+
398+
// parseDiffGitArgs extracts the two filenames from a 'diff --git' line.
399+
// Returns false on syntax error, true if syntax is valid. Even with a
400+
// valid syntax, it may be impossible to extract filenames; if so, the
401+
// function returns ("", "", true).
402+
func parseDiffGitArgs(diffArgs string) (string, string, bool) {
403+
length := len(diffArgs)
404+
if length < 3 {
405+
return "", "", false
406+
}
407+
408+
if diffArgs[0] != '"' && diffArgs[length-1] != '"' {
409+
// Both filenames are unquoted.
410+
firstSpace := strings.IndexByte(diffArgs, ' ')
411+
if firstSpace <= 0 || firstSpace == length-1 {
412+
return "", "", false
413+
}
414+
415+
secondSpace := strings.IndexByte(diffArgs[firstSpace+1:], ' ')
416+
if secondSpace == -1 {
417+
if diffArgs[firstSpace+1] == '"' {
418+
// The second filename begins with '"', but doesn't end with one.
419+
return "", "", false
420+
}
421+
return diffArgs[:firstSpace], diffArgs[firstSpace+1:], true
422+
}
423+
424+
// One or both filenames contain a space, but the names are
425+
// unquoted. Here, the 'diff --git' syntax is ambiguous, and
426+
// we have to obtain the filenames elsewhere (e.g. from the
427+
// hunk headers or extended headers). HOWEVER, if the file
428+
// is newly created and empty, there IS no other place to
429+
// find the filename. In this case, the two filenames are
430+
// identical (except for the leading 'a/' prefix), and we have
431+
// to handle that case here.
432+
first := diffArgs[:length/2]
433+
second := diffArgs[length/2+1:]
434+
if len(first) >= 3 && length%2 == 1 && first[1] == '/' && first[1:] == second[1:] {
435+
return first, second, true
436+
}
437+
438+
// The syntax is (unfortunately) valid, but we could not extract
439+
// the filenames.
440+
return "", "", true
441+
}
442+
443+
if diffArgs[0] == '"' {
444+
first, remainder, err := readQuotedFilename(diffArgs)
445+
if err != nil || len(remainder) < 2 || remainder[0] != ' ' {
446+
return "", "", false
447+
}
448+
if remainder[1] == '"' {
449+
second, remainder, err := readQuotedFilename(remainder[1:])
450+
if remainder != "" || err != nil {
451+
return "", "", false
452+
}
453+
return first, second, true
454+
}
455+
return first, remainder[1:], true
456+
}
457+
458+
// In this case, second argument MUST be quoted (or it's a syntax error)
459+
i := strings.IndexByte(diffArgs, '"')
460+
if i == -1 || i+2 >= length || diffArgs[i-1] != ' ' {
461+
return "", "", false
462+
}
463+
464+
second, remainder, err := readQuotedFilename(diffArgs[i:])
465+
if remainder != "" || err != nil {
466+
return "", "", false
467+
}
468+
return diffArgs[:i-1], second, true
469+
}
470+
375471
// handleEmpty detects when FileDiff was an empty diff and will not have any hunks
376472
// that follow. It updates fd fields from the parsed extended headers.
377473
func handleEmpty(fd *FileDiff) (wasEmpty bool) {
@@ -388,6 +484,10 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) {
388484
return lineHasPrefix(idx1, prefix1) && lineHasPrefix(idx2, prefix2)
389485
}
390486

487+
isCopy := (lineCount == 4 && linesHavePrefixes(2, "copy from ", 3, "copy to ")) ||
488+
(lineCount == 6 && linesHavePrefixes(2, "copy from ", 3, "copy to ") && lineHasPrefix(5, "Binary files ")) ||
489+
(lineCount == 6 && linesHavePrefixes(1, "old mode ", 2, "new mode ") && linesHavePrefixes(4, "copy from ", 5, "copy to "))
490+
391491
isRename := (lineCount == 4 && linesHavePrefixes(2, "rename from ", 3, "rename to ")) ||
392492
(lineCount == 6 && linesHavePrefixes(2, "rename from ", 3, "rename to ") && lineHasPrefix(5, "Binary files ")) ||
393493
(lineCount == 6 && linesHavePrefixes(1, "old mode ", 2, "new mode ") && linesHavePrefixes(4, "rename from ", 5, "rename to "))
@@ -402,22 +502,12 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) {
402502

403503
isBinaryPatch := lineCount == 3 && lineHasPrefix(2, "Binary files ") || lineCount > 3 && lineHasPrefix(2, "GIT binary patch")
404504

405-
if !isModeChange && !isRename && !isBinaryPatch && !isNewFile && !isDeletedFile {
505+
if !isModeChange && !isCopy && !isRename && !isBinaryPatch && !isNewFile && !isDeletedFile {
406506
return false
407507
}
408508

409-
names := strings.SplitN(fd.Extended[0][len("diff --git "):], " ", 2)
410-
411-
var err error
412-
fd.OrigName, err = strconv.Unquote(names[0])
413-
if err != nil {
414-
fd.OrigName = names[0]
415-
}
416-
fd.NewName, err = strconv.Unquote(names[1])
417-
if err != nil {
418-
fd.NewName = names[1]
419-
}
420-
509+
var success bool
510+
fd.OrigName, fd.NewName, success = parseDiffGitArgs(fd.Extended[0][len("diff --git "):])
421511
if isNewFile {
422512
fd.OrigName = "/dev/null"
423513
}
@@ -426,7 +516,38 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) {
426516
fd.NewName = "/dev/null"
427517
}
428518

429-
return true
519+
// For ambiguous 'diff --git' lines, try to reconstruct filenames using extended headers.
520+
if success && (isCopy || isRename) && fd.OrigName == "" && fd.NewName == "" {
521+
diffArgs := fd.Extended[0][len("diff --git "):]
522+
523+
tryReconstruct := func(header string, prefix string, whichFile int, result *string) {
524+
if !strings.HasPrefix(header, prefix) {
525+
return
526+
}
527+
rawFilename := header[len(prefix):]
528+
529+
// extract the filename prefix (e.g. "a/") from the 'diff --git' line.
530+
var prefixLetterIndex int
531+
if whichFile == 1 {
532+
prefixLetterIndex = 0
533+
} else if whichFile == 2 {
534+
prefixLetterIndex = len(diffArgs) - len(rawFilename) - 2
535+
}
536+
if prefixLetterIndex < 0 || diffArgs[prefixLetterIndex+1] != '/' {
537+
return
538+
}
539+
540+
*result = diffArgs[prefixLetterIndex:prefixLetterIndex+2] + rawFilename
541+
}
542+
543+
for _, header := range fd.Extended {
544+
tryReconstruct(header, "copy from ", 1, &fd.OrigName)
545+
tryReconstruct(header, "copy to ", 2, &fd.NewName)
546+
tryReconstruct(header, "rename from ", 1, &fd.OrigName)
547+
tryReconstruct(header, "rename to ", 2, &fd.NewName)
548+
}
549+
}
550+
return success
430551
}
431552

432553
var (

diff/parse_test.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package diff
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestReadQuotedFilename_Success(t *testing.T) {
8+
tests := []struct {
9+
input, value, remainder string
10+
}{
11+
{input: `""`, value: "", remainder: ""},
12+
{input: `"aaa"`, value: "aaa", remainder: ""},
13+
{input: `"aaa" bbb`, value: "aaa", remainder: " bbb"},
14+
{input: `"aaa" "bbb" ccc`, value: "aaa", remainder: ` "bbb" ccc`},
15+
{input: `"\""`, value: "\"", remainder: ""},
16+
{input: `"uh \"oh\""`, value: "uh \"oh\"", remainder: ""},
17+
{input: `"uh \\"oh\\""`, value: "uh \\", remainder: `oh\\""`},
18+
{input: `"uh \\\"oh\\\""`, value: "uh \\\"oh\\\"", remainder: ""},
19+
}
20+
for _, tc := range tests {
21+
value, remainder, err := readQuotedFilename(tc.input)
22+
if err != nil {
23+
t.Errorf("readQuotedFilename(`%s`): expected success, got '%s'", tc.input, err)
24+
} else if value != tc.value || remainder != tc.remainder {
25+
t.Errorf("readQuotedFilename(`%s`): expected `%s` and `%s`, got `%s` and `%s`", tc.input, tc.value, tc.remainder, value, remainder)
26+
}
27+
}
28+
}
29+
30+
func TestReadQuotedFilename_Error(t *testing.T) {
31+
tests := []string{
32+
// Doesn't start with a quote
33+
``,
34+
`foo`,
35+
` "foo"`,
36+
// Missing end quote
37+
`"`,
38+
`"\"`,
39+
// "\x" is not a valid Go string literal escape
40+
`"\xxx"`,
41+
}
42+
for _, input := range tests {
43+
_, _, err := readQuotedFilename(input)
44+
if err == nil {
45+
t.Errorf("readQuotedFilename(`%s`): expected error", input)
46+
}
47+
}
48+
}
49+
50+
func TestParseDiffGitArgs_Success(t *testing.T) {
51+
tests := []struct {
52+
input, first, second string
53+
}{
54+
{input: `aaa bbb`, first: "aaa", second: "bbb"},
55+
{input: `"aaa" bbb`, first: "aaa", second: "bbb"},
56+
{input: `aaa "bbb"`, first: "aaa", second: "bbb"},
57+
{input: `"aaa" "bbb"`, first: "aaa", second: "bbb"},
58+
{input: `1/a 2/z`, first: "1/a", second: "2/z"},
59+
{input: `1/hello world 2/hello world`, first: "1/hello world", second: "2/hello world"},
60+
{input: `"new\nline" and spaces`, first: "new\nline", second: "and spaces"},
61+
{input: `a/existing file with spaces "b/new, complicated\nfilen\303\270me"`, first: "a/existing file with spaces", second: "b/new, complicated\nfilen\303\270me"},
62+
}
63+
for _, tc := range tests {
64+
first, second, success := parseDiffGitArgs(tc.input)
65+
if !success {
66+
t.Errorf("`diff --git %s`: expected success", tc.input)
67+
} else if first != tc.first || second != tc.second {
68+
t.Errorf("`diff --git %s`: expected `%s` and `%s`, got `%s` and `%s`", tc.input, tc.first, tc.second, first, second)
69+
}
70+
}
71+
}
72+
73+
func TestParseDiffGitArgs_Unsuccessful(t *testing.T) {
74+
tests := []string{
75+
``,
76+
`hello_world.txt`,
77+
`word `,
78+
` word`,
79+
`"a/bad_quoting b/bad_quoting`,
80+
`a/bad_quoting "b/bad_quoting`,
81+
`a/bad_quoting b/bad_quoting"`,
82+
`"a/bad_quoting b/bad_quoting"`,
83+
`"a/bad""b/bad"`,
84+
`"a/bad" "b/bad" "c/bad"`,
85+
`a/bad "b/bad" "c/bad"`,
86+
}
87+
for _, input := range tests {
88+
first, second, success := parseDiffGitArgs(input)
89+
if success {
90+
t.Errorf("`diff --git %s`: expected unsuccessful; got `%s` and `%s`", input, first, second)
91+
}
92+
}
93+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
diff --git a/new empty file with spaces b/new empty file with spaces
2+
new file mode 100644
3+
index 0000000..e69de29
4+
diff --git a/new file with text b/new file with text
5+
new file mode 100644
6+
index 0000000..c3ed4be
7+
--- /dev/null
8+
+++ b/new file with text
9+
@@ -0,0 +1 @@
10+
+new file with text
11+
diff --git a/existing file with spaces b/new file with spaces
12+
similarity index 100%
13+
copy from existing file with spaces
14+
copy to new file with spaces
15+
diff --git a/existing file with spaces "b/new, complicated\nfilen\303\270me"
16+
similarity index 100%
17+
copy from existing file with spaces
18+
copy to "new, complicated\nfilen\303\270me"
19+
diff --git a/existing file with spaces "b/new \"complicated\" filename"
20+
similarity index 100%
21+
copy from existing file with spaces
22+
copy to "new \"complicated\" filename"
23+
diff --git "a/existing \"complicated\" filename" b/new, simpler filename
24+
similarity index 100%
25+
copy from "existing \"complicated\" filename"
26+
copy to new, simpler filename

0 commit comments

Comments
 (0)