Skip to content

Commit 9721ba5

Browse files
committed
Add support for complicated filenames and file copies
When filenames contain special characters, they are quoted, but not if they only contain spaces, meaning the 'git --diff' argument line becomes AMBIGUOUS. It is, however, possible to reconstruct the filenames using the extended headers. This fixes parsing of unquoted filenames with multiple spaces in them, quoted filenames with spaces in them, and diffs containing file copies.
1 parent a768196 commit 9721ba5

File tree

3 files changed

+267
-14
lines changed

3 files changed

+267
-14
lines changed

diff/diff_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,49 @@ func init() {
2929
time.Local = time.UTC
3030
}
3131

32+
func TestReadQuotedFilename(t *testing.T) {
33+
tests := []string{
34+
`""`, "", "",
35+
`"aaa"`, "aaa", "",
36+
`"aaa" bbb`, "aaa", " bbb",
37+
`"\""`, "\"", "",
38+
`"uh \"oh\""`, "uh \"oh\"", "",
39+
`"uh \\"oh\\""`, "uh \\", `oh\\""`,
40+
`"uh \\\"oh\\\""`, "uh \\\"oh\\\"", "",
41+
}
42+
for i := 0; i < len(tests); i += 3 {
43+
input := tests[i]
44+
value, remainder, err := readQuotedFilename(input)
45+
if err != nil {
46+
t.Errorf("readQuotedFilename(`%s`): expected success, got '%s'", input, err)
47+
} else if value != tests[i+1] || remainder != tests[i+2] {
48+
t.Errorf("readQuotedFilename(`%s`): expected `%s` and `%s`, got `%s` and `%s`", input, tests[i+1], tests[i+2], value, remainder)
49+
}
50+
}
51+
}
52+
53+
func TestParseDiffGitArgs(t *testing.T) {
54+
tests := []string{
55+
`aaa bbb`, "aaa", "bbb",
56+
`"aaa" bbb`, "aaa", "bbb",
57+
`aaa "bbb"`, "aaa", "bbb",
58+
`"aaa" "bbb"`, "aaa", "bbb",
59+
`1/a 2/z`, "1/a", "2/z",
60+
`1/hello world 2/hello world`, "1/hello world", "2/hello world",
61+
`"new\nline" and spaces`, "new\nline", "and spaces",
62+
`a/existing file with spaces "b/new, complicated\nfilen\303\270me"`, "a/existing file with spaces", "b/new, complicated\nfilen\303\270me",
63+
}
64+
for i := 0; i < len(tests); i += 3 {
65+
input := tests[i]
66+
success, first, second := parseDiffGitArgs(input)
67+
if !success {
68+
t.Errorf("`diff --git %s`: expected success", input)
69+
} else if first != tests[i+1] || second != tests[i+2] {
70+
t.Errorf("`diff --git %s`: expected `%s` and `%s`, got `%s` and `%s`", input, tests[i+1], tests[i+2], first, second)
71+
}
72+
}
73+
}
74+
3275
func TestParseHunkNoChunksize(t *testing.T) {
3376
filename := "sample_no_chunksize.diff"
3477
diffData, err := ioutil.ReadFile(filepath.Join("testdata", filename))
@@ -704,6 +747,69 @@ func TestParseMultiFileDiffHeaders(t *testing.T) {
704747
},
705748
},
706749
},
750+
{
751+
filename: "complicated_filenames.diff",
752+
wantDiffs: []*FileDiff{
753+
{
754+
OrigName: "/dev/null",
755+
NewName: "b/new empty file with spaces",
756+
Extended: []string{
757+
"diff --git a/new empty file with spaces b/new empty file with spaces",
758+
"new file mode 100644",
759+
"index 0000000..e69de29",
760+
},
761+
},
762+
{
763+
OrigName: "/dev/null",
764+
NewName: "b/new file with text",
765+
Extended: []string{
766+
"diff --git a/new file with text b/new file with text",
767+
"new file mode 100644",
768+
"index 0000000..c3ed4be",
769+
},
770+
},
771+
{
772+
OrigName: "a/existing file with spaces",
773+
NewName: "b/new file with spaces",
774+
Extended: []string{
775+
"diff --git a/existing file with spaces b/new file with spaces",
776+
"similarity index 100%",
777+
"copy from existing file with spaces",
778+
"copy to new file with spaces",
779+
},
780+
},
781+
{
782+
OrigName: "a/existing file with spaces",
783+
NewName: "b/new, complicated\nfilenøme",
784+
Extended: []string{
785+
`diff --git a/existing file with spaces "b/new, complicated\nfilen\303\270me"`,
786+
"similarity index 100%",
787+
"copy from existing file with spaces",
788+
`copy to "new, complicated\nfilen\303\270me"`,
789+
},
790+
},
791+
{
792+
OrigName: "a/existing file with spaces",
793+
NewName: `b/new "complicated" filename`,
794+
Extended: []string{
795+
`diff --git a/existing file with spaces "b/new \"complicated\" filename"`,
796+
"similarity index 100%",
797+
"copy from existing file with spaces",
798+
`copy to "new \"complicated\" filename"`,
799+
},
800+
},
801+
{
802+
OrigName: `a/existing "complicated" filename`,
803+
NewName: "b/new, simpler filename",
804+
Extended: []string{
805+
`diff --git "a/existing \"complicated\" filename" b/new, simpler filename`,
806+
"similarity index 100%",
807+
`copy from "existing \"complicated\" filename"`,
808+
"copy to new, simpler filename",
809+
},
810+
},
811+
},
812+
},
707813
}
708814
for _, test := range tests {
709815
t.Run(test.filename, func(t *testing.T) {

diff/parse.go

Lines changed: 135 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,101 @@ func (r *FileDiffReader) ReadExtendedHeaders() ([]string, error) {
372372
}
373373
}
374374

375+
// readQuotedFilename extracts a quoted filename from the beginning of a string,
376+
// returning the unquoted filename and any remaining text after the filename.
377+
func readQuotedFilename(text string) (value string, remainder string, err error) {
378+
if text[0] != '"' {
379+
panic("caller must ensure filename is quoted! " + text)
380+
}
381+
382+
// The end quote is the first quote NOT preceeded by an uneven number of backslashes.
383+
var i, j int
384+
for i = 1; i < len(text); i++ {
385+
if text[i] == '"' {
386+
// walk backwards and find first non-backslash
387+
for j = i - 1; j > 0 && text[j] == '\\'; j-- {
388+
}
389+
numberOfBackslashes := i - j - 1
390+
if numberOfBackslashes%2 == 0 {
391+
break
392+
}
393+
}
394+
}
395+
if i == len(text) {
396+
return "", "", fmt.Errorf(`end of string found while searching for '"': %s`, text)
397+
}
398+
399+
value, err = strconv.Unquote(text[:i+1])
400+
remainder = text[i+1:]
401+
return
402+
}
403+
404+
// parseDiffGitArgs extracts the two filenames from a 'diff --git' line.
405+
func parseDiffGitArgs(diffArgs string) (bool, string, string) {
406+
length := len(diffArgs)
407+
if length < 3 {
408+
return false, "", ""
409+
}
410+
411+
if diffArgs[0] != '"' && diffArgs[length-1] != '"' {
412+
// Both filenames are unquoted.
413+
firstSpace := strings.IndexByte(diffArgs, ' ')
414+
if firstSpace <= 0 || firstSpace == length-1 {
415+
return false, "", ""
416+
}
417+
418+
secondSpace := strings.IndexByte(diffArgs[firstSpace+1:], ' ')
419+
if secondSpace == -1 {
420+
return true, diffArgs[:firstSpace], diffArgs[firstSpace+1:]
421+
}
422+
423+
// One or both filenames contain a space, but the names are
424+
// unquoted. Here, the 'diff --git' syntax is ambiguous, and
425+
// we have to obtain the filenames elsewhere (e.g. from the
426+
// chunk headers or extended headers). HOWEVER, if the file
427+
// is newly created and empty, there IS no other place to
428+
// find the filename. In this case, the two filenames are
429+
// identical (except for the leading 'a/' prefix), and we have
430+
// to handle that case here.
431+
first := diffArgs[:length/2]
432+
second := diffArgs[length/2+1:]
433+
if len(first) >= 3 && length%2 == 1 && first[1] == '/' && first[1:] == second[1:] {
434+
return true, first, second
435+
}
436+
437+
// The syntax is (unfortunately) valid, but we could not extract
438+
// the filenames.
439+
return true, "", ""
440+
}
441+
442+
if diffArgs[0] == '"' {
443+
first, remainder, err := readQuotedFilename(diffArgs)
444+
if err != nil || len(remainder) < 2 || remainder[0] != ' ' {
445+
return false, "", ""
446+
}
447+
if remainder[1] == '"' {
448+
second, remainder, err := readQuotedFilename(remainder[1:])
449+
if remainder != "" || err != nil {
450+
return false, "", ""
451+
}
452+
return true, first, second
453+
}
454+
return true, first, remainder[1:]
455+
}
456+
457+
// In this case, second argument MUST be quoted (or it's a syntax error)
458+
i := strings.IndexByte(diffArgs, '"')
459+
if i == -1 || i+2 >= length || diffArgs[i-1] != ' ' {
460+
return false, "", ""
461+
}
462+
463+
second, remainder, err := readQuotedFilename(diffArgs[i:])
464+
if remainder != "" || err != nil {
465+
return false, "", ""
466+
}
467+
return true, diffArgs[:i-1], second
468+
}
469+
375470
// handleEmpty detects when FileDiff was an empty diff and will not have any hunks
376471
// that follow. It updates fd fields from the parsed extended headers.
377472
func handleEmpty(fd *FileDiff) (wasEmpty bool) {
@@ -388,6 +483,10 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) {
388483
return lineHasPrefix(idx1, prefix1) && lineHasPrefix(idx2, prefix2)
389484
}
390485

486+
isCopy := (lineCount == 4 && linesHavePrefixes(2, "copy from ", 3, "copy to ")) ||
487+
(lineCount == 6 && linesHavePrefixes(2, "copy from ", 3, "copy to ") && lineHasPrefix(5, "Binary files ")) ||
488+
(lineCount == 6 && linesHavePrefixes(1, "old mode ", 2, "new mode ") && linesHavePrefixes(4, "copy from ", 5, "copy to "))
489+
391490
isRename := (lineCount == 4 && linesHavePrefixes(2, "rename from ", 3, "rename to ")) ||
392491
(lineCount == 6 && linesHavePrefixes(2, "rename from ", 3, "rename to ") && lineHasPrefix(5, "Binary files ")) ||
393492
(lineCount == 6 && linesHavePrefixes(1, "old mode ", 2, "new mode ") && linesHavePrefixes(4, "rename from ", 5, "rename to "))
@@ -402,22 +501,12 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) {
402501

403502
isBinaryPatch := lineCount == 3 && lineHasPrefix(2, "Binary files ") || lineCount > 3 && lineHasPrefix(2, "GIT binary patch")
404503

405-
if !isModeChange && !isRename && !isBinaryPatch && !isNewFile && !isDeletedFile {
504+
if !isModeChange && !isCopy && !isRename && !isBinaryPatch && !isNewFile && !isDeletedFile {
406505
return false
407506
}
408507

409-
names := strings.SplitN(fd.Extended[0][len("diff --git "):], " ", 2)
410-
411-
var err error
412-
fd.OrigName, err = strconv.Unquote(names[0])
413-
if err != nil {
414-
fd.OrigName = names[0]
415-
}
416-
fd.NewName, err = strconv.Unquote(names[1])
417-
if err != nil {
418-
fd.NewName = names[1]
419-
}
420-
508+
var success bool
509+
success, fd.OrigName, fd.NewName = parseDiffGitArgs(fd.Extended[0][len("diff --git "):])
421510
if isNewFile {
422511
fd.OrigName = "/dev/null"
423512
}
@@ -426,7 +515,39 @@ func handleEmpty(fd *FileDiff) (wasEmpty bool) {
426515
fd.NewName = "/dev/null"
427516
}
428517

429-
return true
518+
// For ambiguous 'diff --git' lines, try to reconstruct filenames using extended headers.
519+
if success && (isCopy || isRename) && fd.OrigName == "" && fd.NewName == "" {
520+
diffArgs := fd.Extended[0][len("diff --git "):]
521+
522+
reconstruct := func(header string, prefix string, whichFile int, result *string) bool {
523+
if !strings.HasPrefix(header, prefix) {
524+
return false
525+
}
526+
rawFilename := header[len(prefix):]
527+
528+
// extract the filename prefix (e.g. "a/") from the 'diff --git' line.
529+
var prefixLetterIndex int
530+
if whichFile == 1 {
531+
prefixLetterIndex = 0
532+
} else if whichFile == 2 {
533+
prefixLetterIndex = len(diffArgs) - len(rawFilename) - 2
534+
}
535+
if prefixLetterIndex < 0 || diffArgs[prefixLetterIndex+1] != '/' {
536+
return false
537+
}
538+
539+
*result = diffArgs[prefixLetterIndex:prefixLetterIndex+2] + rawFilename
540+
return true
541+
}
542+
543+
for _, header := range fd.Extended {
544+
_ = reconstruct(header, "copy from ", 1, &fd.OrigName) ||
545+
reconstruct(header, "copy to ", 2, &fd.NewName) ||
546+
reconstruct(header, "rename from ", 1, &fd.OrigName) ||
547+
reconstruct(header, "rename to ", 2, &fd.NewName)
548+
}
549+
}
550+
return success
430551
}
431552

432553
var (
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
diff --git a/new empty file with spaces b/new empty file with spaces
2+
new file mode 100644
3+
index 0000000..e69de29
4+
diff --git a/new file with text b/new file with text
5+
new file mode 100644
6+
index 0000000..c3ed4be
7+
--- /dev/null
8+
+++ b/new file with text
9+
@@ -0,0 +1 @@
10+
+new file with text
11+
diff --git a/existing file with spaces b/new file with spaces
12+
similarity index 100%
13+
copy from existing file with spaces
14+
copy to new file with spaces
15+
diff --git a/existing file with spaces "b/new, complicated\nfilen\303\270me"
16+
similarity index 100%
17+
copy from existing file with spaces
18+
copy to "new, complicated\nfilen\303\270me"
19+
diff --git a/existing file with spaces "b/new \"complicated\" filename"
20+
similarity index 100%
21+
copy from existing file with spaces
22+
copy to "new \"complicated\" filename"
23+
diff --git "a/existing \"complicated\" filename" b/new, simpler filename
24+
similarity index 100%
25+
copy from "existing \"complicated\" filename"
26+
copy to new, simpler filename

0 commit comments

Comments
 (0)