Diff regexes are now precompiled at class level; renamed a|b_blob to a|b_blob_id, as it better reflects the actual value

Byron · Byron · commit 52ab307935bd · 2009-10-09T15:13:05.000+02:00
Actor regexes are now precompiled at class level
Blob regexes are now precompiled at class level; made the blame method more readable and faster, although it can still be improved by making assumptions about the blame format and by reading the git command stream directly (which is a general issue right now)
diff --git a/lib/git/actor.py b/lib/git/actor.py
@@ -10,6 +10,10 @@ class Actor(object):
     """Actors hold information about a person acting on the repository. They 
     can be committers and authors or anything with a name and an email as 
     mentioned in the git log entries."""
+    # precompiled regex
+    name_only_regex = re.compile( r'<.+>' )
+    name_email_regex = re.compile( r'(.*) <(.+?)>' ) 
+    
     def __init__(self, name, email):
         self.name = name
         self.email = email
@@ -34,8 +38,8 @@ def from_string(cls, string):
         Returns
             Actor
         """
-        if re.search(r'<.+>', string):
-            m = re.search(r'(.*) <(.+?)>', string)
+        if cls.name_only_regex.search(string):
+            m = cls.name_email_regex.search(string)
             name, email = m.groups()
             return Actor(name, email)
         else:
diff --git a/lib/git/blob.py b/lib/git/blob.py
@@ -15,6 +15,12 @@ class Blob(object):
     """A Blob encapsulates a git blob object"""
     DEFAULT_MIME_TYPE = "text/plain"
 
+    # precompiled regex
+    re_whitespace = re.compile(r'\s+')
+    re_hexsha_only = re.compile('^[0-9A-Fa-f]{40}$')
+    re_author_committer_start = re.compile(r'^(author|committer)')
+    re_tab_full_line = re.compile(r'^\t(.*)$')
+    
     def __init__(self, repo, id, mode=None, path=None):
         """
         Create an unbaked Blob containing just the specified attributes
@@ -112,49 +118,68 @@ def blame(cls, repo, commit, file):
         info = None
 
         for line in data.splitlines():
-            parts = re.split(r'\s+', line, 1)
-            if re.search(r'^[0-9A-Fa-f]{40}$', parts[0]):
-                if re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+) (\d+)$', line):
-                    m = re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+) (\d+)$', line)
-                    id, origin_line, final_line, group_lines = m.groups()
-                    info = {'id': id}
-                    blames.append([None, []])
-                elif re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+)$', line):
-                    m = re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+)$', line)
-                    id, origin_line, final_line = m.groups()
-                    info = {'id': id}
-            elif re.search(r'^(author|committer)', parts[0]):
-                if re.search(r'^(.+)-mail$', parts[0]):
-                    m = re.search(r'^(.+)-mail$', parts[0])
-                    info["%s_email" % m.groups()[0]] = parts[-1]
-                elif re.search(r'^(.+)-time$', parts[0]):
-                    m = re.search(r'^(.+)-time$', parts[0])
-                    info["%s_date" % m.groups()[0]] = time.gmtime(int(parts[-1]))
-                elif re.search(r'^(author|committer)$', parts[0]):
-                    m = re.search(r'^(author|committer)$', parts[0])
-                    info[m.groups()[0]] = parts[-1]
-            elif re.search(r'^filename', parts[0]):
-                info['filename'] = parts[-1]
-            elif re.search(r'^summary', parts[0]):
-                info['summary'] = parts[-1]
-            elif parts[0] == '':
-                if info:
-                    c = commits.has_key(info['id']) and commits[info['id']]
-                    if not c:
-                        c = Commit(repo, id=info['id'],
-                                         author=Actor.from_string(info['author'] + ' ' + info['author_email']),
-                                         authored_date=info['author_date'],
-                                         committer=Actor.from_string(info['committer'] + ' ' + info['committer_email']),
-                                         committed_date=info['committer_date'],
-                                         message=info['summary'])
-                        commits[info['id']] = c
-
-                    m = re.search(r'^\t(.*)$', line)
-                    text,  = m.groups()
-                    blames[-1][0] = c
-                    blames[-1][1].append( text )
-                    info = None
-
+            parts = cls.re_whitespace.split(line, 1)
+            firstpart = parts[0]
+            if cls.re_hexsha_only.search(firstpart):
+                # handles 
+                # 634396b2f541a9f2d58b00be1a07f0c358b999b3 1 1 7		- indicates blame-data start
+                # 634396b2f541a9f2d58b00be1a07f0c358b999b3 2 2
+                digits = parts[-1].split(" ")
+                if len(digits) == 3:
+					info = {'id': firstpart}
+					blames.append([None, []])
+				# END blame data initialization
+            else:
+                m = cls.re_author_committer_start.search(firstpart)
+                if m:
+                    # handles: 
+                    # author Tom Preston-Werner
+                    # author-mail <tom@mojombo.com>
+                    # author-time 1192271832
+                    # author-tz -0700
+                    # committer Tom Preston-Werner
+                    # committer-mail <tom@mojombo.com>
+                    # committer-time 1192271832
+                    # committer-tz -0700  - IGNORED BY US
+                    role = m.group(0)
+                    if firstpart.endswith('-mail'):
+                        info["%s_email" % role] = parts[-1]
+                    elif firstpart.endswith('-time'):
+                        info["%s_date" % role] = time.gmtime(int(parts[-1]))
+                    elif role == firstpart:
+                        info[role] = parts[-1]
+                    # END distinguish mail,time,name
+                else:
+                    # handle
+                    # filename lib/grit.rb
+                    # summary add Blob
+                    # <and rest>
+                    if firstpart.startswith('filename'):
+                        info['filename'] = parts[-1]
+                    elif firstpart.startswith('summary'):
+                        info['summary'] = parts[-1]
+                    elif firstpart == '':
+                        if info:
+                            sha = info['id']
+                            c = commits.get(sha)
+                            if c is None:
+                                c = Commit(  repo, id=sha,
+                                             author=Actor.from_string(info['author'] + ' ' + info['author_email']),
+                                             authored_date=info['author_date'],
+                                             committer=Actor.from_string(info['committer'] + ' ' + info['committer_email']),
+                                             committed_date=info['committer_date'],
+                                             message=info['summary'])
+                                commits[sha] = c
+                            # END if commit objects needs initial creation
+                            m = cls.re_tab_full_line.search(line)
+                            text,  = m.groups()
+                            blames[-1][0] = c
+                            blames[-1][1].append( text )
+                            info = None
+                        # END if we collected commit info
+                    # END distinguish filename,summary,rest
+                # END distinguish author|committer vs filename,summary,rest
+            # END distinguish hexsha vs other information
         return blames
 
     def __repr__(self):
diff --git a/lib/git/diff.py b/lib/git/diff.py
@@ -29,20 +29,36 @@ class Diff(object):
         b_mode is None
         b_blob is NOne
     """
+    
+    # precompiled regex
+    re_header = re.compile(r"""
+								#^diff[ ]--git
+									[ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n
+								(?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
+								   ^rename[ ]from[ ](?P<rename_from>\S+)\n
+								   ^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))?
+								(?:^old[ ]mode[ ](?P<old_mode>\d+)\n
+								   ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
+								(?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
+								(?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
+								(?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
+									\.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
+							""", re.VERBOSE | re.MULTILINE)
+    re_is_null_hexsha = re.compile( r'^0{40}$' )
 
-    def __init__(self, repo, a_path, b_path, a_blob, b_blob, a_mode,
+    def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode,
                  b_mode, new_file, deleted_file, rename_from,
                  rename_to, diff):
         self.repo = repo
 
-        if not a_blob or re.search(r'^0{40}$', a_blob):
+        if not a_blob_id or self.re_is_null_hexsha.search(a_blob_id):
             self.a_blob = None
         else:
-            self.a_blob = blob.Blob(repo, id=a_blob, mode=a_mode, path=a_path)
-        if not b_blob or re.search(r'^0{40}$', b_blob):
+            self.a_blob = blob.Blob(repo, id=a_blob_id, mode=a_mode, path=a_path)
+        if not b_blob_id or self.re_is_null_hexsha.search(b_blob_id):
             self.b_blob = None
         else:
-            self.b_blob = blob.Blob(repo, id=b_blob, mode=b_mode, path=b_path)
+            self.b_blob = blob.Blob(repo, id=b_blob_id, mode=b_mode, path=b_path)
 
         self.a_mode = a_mode
         self.b_mode = b_mode
@@ -68,29 +84,16 @@ def list_from_string(cls, repo, text):
         """
         diffs = []
 
-        diff_header = re.compile(r"""
-            #^diff[ ]--git
-                [ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n
-            (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
-               ^rename[ ]from[ ](?P<rename_from>\S+)\n
-               ^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))?
-            (?:^old[ ]mode[ ](?P<old_mode>\d+)\n
-               ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
-            (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
-            (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
-            (?:^index[ ](?P<a_blob>[0-9A-Fa-f]+)
-                \.\.(?P<b_blob>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
-        """, re.VERBOSE | re.MULTILINE).match
-
+        diff_header = cls.re_header.match
         for diff in ('\n' + text).split('\ndiff --git')[1:]:
             header = diff_header(diff)
 
             a_path, b_path, similarity_index, rename_from, rename_to, \
                 old_mode, new_mode, new_file_mode, deleted_file_mode, \
-                a_blob, b_blob, b_mode = header.groups()
+                a_blob_id, b_blob_id, b_mode = header.groups()
             new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)
 
-            diffs.append(Diff(repo, a_path, b_path, a_blob, b_blob,
+            diffs.append(Diff(repo, a_path, b_path, a_blob_id, b_blob_id,
                 old_mode or deleted_file_mode, new_mode or new_file_mode or b_mode,
                 new_file, deleted_file, rename_from, rename_to, diff[header.end():]))