From e77128e5344ce7d84302facc08d17c3151037ec3 Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Thu, 14 Apr 2016 21:27:39 +0200 Subject: [PATCH 1/5] Make diff patch parsing more reliable The a_path and b_path cannot reliably be read from the first diff line as it's ambiguous. From the git-diff manpage: > The a/ and b/ filenames are the same unless rename/copy is involved. > Especially, **even for a creation or a deletion**, /dev/null is not > used in place of the a/ or b/ filenames. This patch changes the a_path and b_path detection to read it from the more reliable locations further down the diff headers. Two use cases are fixed by this: - As the man page snippet above states, for new/deleted files the a or b path will now be properly None. - File names with spaces in it are now properly parsed. Working on this patch, I realized the --- and +++ lines really belong to the diff header, not the diff contents. This means that when parsing the patch format, the --- and +++ will now be swallowed, and not end up anymore as part of the diff contents. The diff contents now always start with an @@ line. This may be a breaking change for some users that rely on this behaviour. However, those users could now access that information more reliably via the normal Diff properties a_path and b_path now. --- git/diff.py | 32 +++++++++++++++++-------- git/test/fixtures/diff_file_with_spaces | 7 ++++++ git/test/fixtures/diff_initial | 2 -- git/test/test_diff.py | 11 +++++++-- 4 files changed, 38 insertions(+), 14 deletions(-) create mode 100644 git/test/fixtures/diff_file_with_spaces diff --git a/git/diff.py b/git/diff.py index de3aa1e80..c4fd4b279 100644 --- a/git/diff.py +++ b/git/diff.py @@ -198,16 +198,18 @@ class Diff(object): # precompiled regex re_header = re.compile(r""" ^diff[ ]--git - [ ](?:a/)?(?P.+?)[ ](?:b/)?(?P.+?)\n - (?:^similarity[ ]index[ ](?P\d+)%\n - ^rename[ ]from[ ](?P\S+)\n - ^rename[ ]to[ ](?P\S+)(?:\n|$))? + [ ](?:a/)?(?P.+?)[ ](?:b/)?(?P.+?)\n + (?:^similarity[ ]index[ ]\d+%\n + ^rename[ ]from[ ](?P.*)\n + ^rename[ ]to[ ](?P.*)(?:\n|$))? (?:^old[ ]mode[ ](?P\d+)\n ^new[ ]mode[ ](?P\d+)(?:\n|$))? (?:^new[ ]file[ ]mode[ ](?P.+)(?:\n|$))? (?:^deleted[ ]file[ ]mode[ ](?P.+)(?:\n|$))? (?:^index[ ](?P[0-9A-Fa-f]+) \.\.(?P[0-9A-Fa-f]+)[ ]?(?P.+)?(?:\n|$))? + (?:^---[ ](?:a/)?(?P.*)(?:\n|$))? + (?:^\+\+\+[ ](?:b/)?(?P.*)(?:\n|$))? """.encode('ascii'), re.VERBOSE | re.MULTILINE) # can be used for comparisons NULL_HEX_SHA = "0" * 40 @@ -231,15 +233,14 @@ def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode, if self.b_mode: self.b_mode = mode_str_to_int(self.b_mode) - if a_blob_id is None: + if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA: self.a_blob = None else: - assert self.a_mode is not None self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=a_path) - if b_blob_id is None: + + if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA: self.b_blob = None else: - assert self.b_mode is not None self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=b_path) self.new_file = new_file @@ -329,11 +330,22 @@ def _index_from_patch_format(cls, repo, stream): index = DiffIndex() previous_header = None for header in cls.re_header.finditer(text): - a_path, b_path, similarity_index, rename_from, rename_to, \ + a_path_fallback, b_path_fallback, \ + rename_from, rename_to, \ old_mode, new_mode, new_file_mode, deleted_file_mode, \ - a_blob_id, b_blob_id, b_mode = header.groups() + a_blob_id, b_blob_id, b_mode, \ + a_path, b_path = header.groups() new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode) + a_path = a_path or rename_from or a_path_fallback + b_path = b_path or rename_to or b_path_fallback + + if a_path == b'/dev/null': + a_path = None + + if b_path == b'/dev/null': + b_path = None + # Our only means to find the actual text is to see what has not been matched by our regex, # and then retro-actively assin it to our index if previous_header is not None: diff --git a/git/test/fixtures/diff_file_with_spaces b/git/test/fixtures/diff_file_with_spaces new file mode 100644 index 000000000..a9f0b06c4 --- /dev/null +++ b/git/test/fixtures/diff_file_with_spaces @@ -0,0 +1,7 @@ +diff --git a/file with spaces b/file with spaces +new file mode 100644 +index 0000000000000000000000000000000000000000..75c620d7b0d3b0100415421a97f553c979d75174 +--- /dev/null ++++ b/file with spaces +@@ -0,0 +1 @@ ++ohai diff --git a/git/test/fixtures/diff_initial b/git/test/fixtures/diff_initial index 6037c6774..648d70437 100644 --- a/git/test/fixtures/diff_initial +++ b/git/test/fixtures/diff_initial @@ -1,5 +1,3 @@ ---- /dev/null -+++ b/CHANGES @@ -0,0 +1,7 @@ +======= +CHANGES diff --git a/git/test/test_diff.py b/git/test/test_diff.py index 56e395fda..05f8f3ae5 100644 --- a/git/test/test_diff.py +++ b/git/test/test_diff.py @@ -76,7 +76,7 @@ def test_list_from_string_new_mode(self): self._assert_diff_format(diffs) assert_equal(1, len(diffs)) - assert_equal(10, len(diffs[0].diff.splitlines())) + assert_equal(8, len(diffs[0].diff.splitlines())) def test_diff_with_rename(self): output = StringProcessAdapter(fixture('diff_rename')) @@ -140,7 +140,8 @@ def test_diff_initial_commit(self): # ...and with creating a patch diff_index = initial_commit.diff(NULL_TREE, create_patch=True) - assert diff_index[0].b_path == 'CHANGES' + assert diff_index[0].a_path is None, repr(diff_index[0].a_path) + assert diff_index[0].b_path == 'CHANGES', repr(diff_index[0].b_path) assert diff_index[0].new_file assert diff_index[0].diff == fixture('diff_initial') @@ -156,6 +157,12 @@ def test_diff_patch_format(self): Diff._index_from_patch_format(self.rorepo, diff_proc.stdout) # END for each fixture + def test_diff_with_spaces(self): + data = StringProcessAdapter(fixture('diff_file_with_spaces')) + diff_index = Diff._index_from_patch_format(self.rorepo, data.stdout) + assert diff_index[0].a_path is None, repr(diff_index[0].a_path) + assert diff_index[0].b_path == u'file with spaces', repr(diff_index[0].b_path) + def test_diff_interface(self): # test a few variations of the main diff routine assertion_map = dict() From 55d40df99085036ed265fbc6d24d90fbb1a24f95 Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Fri, 15 Apr 2016 00:58:18 +0200 Subject: [PATCH 2/5] Update changelog --- doc/source/changes.rst | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/changes.rst b/doc/source/changes.rst index 4fdf39caa..734c479c6 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -5,9 +5,23 @@ Changelog 1.0.3 - Fixes ============= -* `Commit.diff()` now supports diffing the root commit via `Commit.diff(NULL_TREE)`. -* `Repo.blame()` now respects `incremental=True`, supporting incremental blames. Incremental blames are slightly faster since they don't include the file's contents in them. -* IMPORTANT: This release drops support for python 2.6, which is officially deprecated by the python maintainers. +* `Commit.diff()` now supports diffing the root commit via + `Commit.diff(NULL_TREE)`. +* `Repo.blame()` now respects `incremental=True`, supporting incremental + blames. Incremental blames are slightly faster since they don't include + the file's contents in them. +* Fix: `Diff` objects created with patch output will now have their + `a_path` and `b_path` properties parsed out correctly. Previously, some + values may have been populated incorrectly when a file was added or + deleted. +* IMPORTANT: This release drops support for python 2.6, which is + officially deprecated by the python maintainers. +* CRITICAL: `Diff` objects created with patch output will now not carry + the --- and +++ header lines anymore. All diffs now start with the + @@ header line directly. Users that rely on the old behaviour can now + (reliably) read this information from the a_path and b_path properties + without having to parse these lines manually. + 1.0.2 - Fixes ============= From cdf7c5aca2201cf9dfc3cd301264da4ea352b737 Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Fri, 15 Apr 2016 01:39:58 +0200 Subject: [PATCH 3/5] Fix regex This makes sure we're not matching a \n here by accident. It's now almost the same as the original that used \S+, except that spaces are not eaten at the end of the string (for files that end in a space). --- git/diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/git/diff.py b/git/diff.py index c4fd4b279..fd13f988c 100644 --- a/git/diff.py +++ b/git/diff.py @@ -208,8 +208,8 @@ class Diff(object): (?:^deleted[ ]file[ ]mode[ ](?P.+)(?:\n|$))? (?:^index[ ](?P[0-9A-Fa-f]+) \.\.(?P[0-9A-Fa-f]+)[ ]?(?P.+)?(?:\n|$))? - (?:^---[ ](?:a/)?(?P.*)(?:\n|$))? - (?:^\+\+\+[ ](?:b/)?(?P.*)(?:\n|$))? + (?:^---[ ](?:a/)?(?P[^\t\n\r\f\v]*)[\t\r\f\v]*(?:\n|$))? + (?:^\+\+\+[ ](?:b/)?(?P[^\t\n\r\f\v]*)[\t\r\f\v]*(?:\n|$))? """.encode('ascii'), re.VERBOSE | re.MULTILINE) # can be used for comparisons NULL_HEX_SHA = "0" * 40 From 2090b5487e69688be61cfbb97c346c452ab45ba2 Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Fri, 15 Apr 2016 02:18:12 +0200 Subject: [PATCH 4/5] Make test stricter --- git/test/test_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/git/test/test_diff.py b/git/test/test_diff.py index 05f8f3ae5..0c670f0b3 100644 --- a/git/test/test_diff.py +++ b/git/test/test_diff.py @@ -116,7 +116,7 @@ def test_diff_index(self): res = Diff._index_from_patch_format(None, output.stdout) assert len(res) == 6 for dr in res: - assert dr.diff + assert dr.diff.startswith(b'@@') assert str(dr), "Diff to string conversion should be possible" # end for each diff From 1445b59bb41c4b1a94b7cb0ec6864c98de63814b Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Fri, 15 Apr 2016 08:32:45 +0200 Subject: [PATCH 5/5] Fix order of regex parts When both old/new mode and rename from/to lines are found, they will appear in different order. --- git/diff.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/git/diff.py b/git/diff.py index fd13f988c..d4affd302 100644 --- a/git/diff.py +++ b/git/diff.py @@ -199,11 +199,11 @@ class Diff(object): re_header = re.compile(r""" ^diff[ ]--git [ ](?:a/)?(?P.+?)[ ](?:b/)?(?P.+?)\n + (?:^old[ ]mode[ ](?P\d+)\n + ^new[ ]mode[ ](?P\d+)(?:\n|$))? (?:^similarity[ ]index[ ]\d+%\n ^rename[ ]from[ ](?P.*)\n ^rename[ ]to[ ](?P.*)(?:\n|$))? - (?:^old[ ]mode[ ](?P\d+)\n - ^new[ ]mode[ ](?P\d+)(?:\n|$))? (?:^new[ ]file[ ]mode[ ](?P.+)(?:\n|$))? (?:^deleted[ ]file[ ]mode[ ](?P.+)(?:\n|$))? (?:^index[ ](?P[0-9A-Fa-f]+) @@ -331,8 +331,9 @@ def _index_from_patch_format(cls, repo, stream): previous_header = None for header in cls.re_header.finditer(text): a_path_fallback, b_path_fallback, \ + old_mode, new_mode, \ rename_from, rename_to, \ - old_mode, new_mode, new_file_mode, deleted_file_mode, \ + new_file_mode, deleted_file_mode, \ a_blob_id, b_blob_id, b_mode, \ a_path, b_path = header.groups() new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)