Skip to content

Commit 52ab307

Browse files
committed
Diff regexes are now precompiled at class level; renamed a|b_blob to a|b_blob_id, as it better reflects the actual value
Actor regexes are now precompiled at class level. Blob regexes are now precompiled at class level; made the blame method more readable and faster, although it can still be improved by making assumptions about the blame format and by reading the git command stream directly (which is a general issue right now).
1 parent 07c20b4 commit 52ab307

File tree

3 files changed

+98
-66
lines changed

3 files changed

+98
-66
lines changed

lib/git/actor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ class Actor(object):
1010
"""Actors hold information about a person acting on the repository. They
1111
can be committers and authors or anything with a name and an email as
1212
mentioned in the git log entries."""
13+
# precompiled regex
14+
name_only_regex = re.compile( r'<.+>' )
15+
name_email_regex = re.compile( r'(.*) <(.+?)>' )
16+
1317
def __init__(self, name, email):
1418
self.name = name
1519
self.email = email
@@ -34,8 +38,8 @@ def from_string(cls, string):
3438
Returns
3539
Actor
3640
"""
37-
if re.search(r'<.+>', string):
38-
m = re.search(r'(.*) <(.+?)>', string)
41+
if cls.name_only_regex.search(string):
42+
m = cls.name_email_regex.search(string)
3943
name, email = m.groups()
4044
return Actor(name, email)
4145
else:

lib/git/blob.py

Lines changed: 68 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ class Blob(object):
1515
"""A Blob encapsulates a git blob object"""
1616
DEFAULT_MIME_TYPE = "text/plain"
1717

18+
# precompiled regex
19+
re_whitespace = re.compile(r'\s+')
20+
re_hexsha_only = re.compile('^[0-9A-Fa-f]{40}$')
21+
re_author_committer_start = re.compile(r'^(author|committer)')
22+
re_tab_full_line = re.compile(r'^\t(.*)$')
23+
1824
def __init__(self, repo, id, mode=None, path=None):
1925
"""
2026
Create an unbaked Blob containing just the specified attributes
@@ -112,49 +118,68 @@ def blame(cls, repo, commit, file):
112118
info = None
113119

114120
for line in data.splitlines():
115-
parts = re.split(r'\s+', line, 1)
116-
if re.search(r'^[0-9A-Fa-f]{40}$', parts[0]):
117-
if re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+) (\d+)$', line):
118-
m = re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+) (\d+)$', line)
119-
id, origin_line, final_line, group_lines = m.groups()
120-
info = {'id': id}
121-
blames.append([None, []])
122-
elif re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+)$', line):
123-
m = re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+)$', line)
124-
id, origin_line, final_line = m.groups()
125-
info = {'id': id}
126-
elif re.search(r'^(author|committer)', parts[0]):
127-
if re.search(r'^(.+)-mail$', parts[0]):
128-
m = re.search(r'^(.+)-mail$', parts[0])
129-
info["%s_email" % m.groups()[0]] = parts[-1]
130-
elif re.search(r'^(.+)-time$', parts[0]):
131-
m = re.search(r'^(.+)-time$', parts[0])
132-
info["%s_date" % m.groups()[0]] = time.gmtime(int(parts[-1]))
133-
elif re.search(r'^(author|committer)$', parts[0]):
134-
m = re.search(r'^(author|committer)$', parts[0])
135-
info[m.groups()[0]] = parts[-1]
136-
elif re.search(r'^filename', parts[0]):
137-
info['filename'] = parts[-1]
138-
elif re.search(r'^summary', parts[0]):
139-
info['summary'] = parts[-1]
140-
elif parts[0] == '':
141-
if info:
142-
c = commits.has_key(info['id']) and commits[info['id']]
143-
if not c:
144-
c = Commit(repo, id=info['id'],
145-
author=Actor.from_string(info['author'] + ' ' + info['author_email']),
146-
authored_date=info['author_date'],
147-
committer=Actor.from_string(info['committer'] + ' ' + info['committer_email']),
148-
committed_date=info['committer_date'],
149-
message=info['summary'])
150-
commits[info['id']] = c
151-
152-
m = re.search(r'^\t(.*)$', line)
153-
text, = m.groups()
154-
blames[-1][0] = c
155-
blames[-1][1].append( text )
156-
info = None
157-
121+
parts = cls.re_whitespace.split(line, 1)
122+
firstpart = parts[0]
123+
if cls.re_hexsha_only.search(firstpart):
124+
# handles
125+
# 634396b2f541a9f2d58b00be1a07f0c358b999b3 1 1 7 - indicates blame-data start
126+
# 634396b2f541a9f2d58b00be1a07f0c358b999b3 2 2
127+
digits = parts[-1].split(" ")
128+
if len(digits) == 3:
129+
info = {'id': firstpart}
130+
blames.append([None, []])
131+
# END blame data initialization
132+
else:
133+
m = cls.re_author_committer_start.search(firstpart)
134+
if m:
135+
# handles:
136+
# author Tom Preston-Werner
137+
# author-mail <[email protected]>
138+
# author-time 1192271832
139+
# author-tz -0700
140+
# committer Tom Preston-Werner
141+
# committer-mail <[email protected]>
142+
# committer-time 1192271832
143+
# committer-tz -0700 - IGNORED BY US
144+
role = m.group(0)
145+
if firstpart.endswith('-mail'):
146+
info["%s_email" % role] = parts[-1]
147+
elif firstpart.endswith('-time'):
148+
info["%s_date" % role] = time.gmtime(int(parts[-1]))
149+
elif role == firstpart:
150+
info[role] = parts[-1]
151+
# END distinguish mail,time,name
152+
else:
153+
# handle
154+
# filename lib/grit.rb
155+
# summary add Blob
156+
# <and rest>
157+
if firstpart.startswith('filename'):
158+
info['filename'] = parts[-1]
159+
elif firstpart.startswith('summary'):
160+
info['summary'] = parts[-1]
161+
elif firstpart == '':
162+
if info:
163+
sha = info['id']
164+
c = commits.get(sha)
165+
if c is None:
166+
c = Commit( repo, id=sha,
167+
author=Actor.from_string(info['author'] + ' ' + info['author_email']),
168+
authored_date=info['author_date'],
169+
committer=Actor.from_string(info['committer'] + ' ' + info['committer_email']),
170+
committed_date=info['committer_date'],
171+
message=info['summary'])
172+
commits[sha] = c
173+
# END if commit objects needs initial creation
174+
m = cls.re_tab_full_line.search(line)
175+
text, = m.groups()
176+
blames[-1][0] = c
177+
blames[-1][1].append( text )
178+
info = None
179+
# END if we collected commit info
180+
# END distinguish filename,summary,rest
181+
# END distinguish author|committer vs filename,summary,rest
182+
# END distinguish hexsha vs other information
158183
return blames
159184

160185
def __repr__(self):

lib/git/diff.py

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,36 @@ class Diff(object):
2929
b_mode is None
3030
b_blob is None
3131
"""
32+
33+
# precompiled regex
34+
re_header = re.compile(r"""
35+
#^diff[ ]--git
36+
[ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n
37+
(?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
38+
^rename[ ]from[ ](?P<rename_from>\S+)\n
39+
^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))?
40+
(?:^old[ ]mode[ ](?P<old_mode>\d+)\n
41+
^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
42+
(?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
43+
(?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
44+
(?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
45+
\.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
46+
""", re.VERBOSE | re.MULTILINE)
47+
re_is_null_hexsha = re.compile( r'^0{40}$' )
3248

33-
def __init__(self, repo, a_path, b_path, a_blob, b_blob, a_mode,
49+
def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode,
3450
b_mode, new_file, deleted_file, rename_from,
3551
rename_to, diff):
3652
self.repo = repo
3753

38-
if not a_blob or re.search(r'^0{40}$', a_blob):
54+
if not a_blob_id or self.re_is_null_hexsha.search(a_blob_id):
3955
self.a_blob = None
4056
else:
41-
self.a_blob = blob.Blob(repo, id=a_blob, mode=a_mode, path=a_path)
42-
if not b_blob or re.search(r'^0{40}$', b_blob):
57+
self.a_blob = blob.Blob(repo, id=a_blob_id, mode=a_mode, path=a_path)
58+
if not b_blob_id or self.re_is_null_hexsha.search(b_blob_id):
4359
self.b_blob = None
4460
else:
45-
self.b_blob = blob.Blob(repo, id=b_blob, mode=b_mode, path=b_path)
61+
self.b_blob = blob.Blob(repo, id=b_blob_id, mode=b_mode, path=b_path)
4662

4763
self.a_mode = a_mode
4864
self.b_mode = b_mode
@@ -68,29 +84,16 @@ def list_from_string(cls, repo, text):
6884
"""
6985
diffs = []
7086

71-
diff_header = re.compile(r"""
72-
#^diff[ ]--git
73-
[ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n
74-
(?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
75-
^rename[ ]from[ ](?P<rename_from>\S+)\n
76-
^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))?
77-
(?:^old[ ]mode[ ](?P<old_mode>\d+)\n
78-
^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
79-
(?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
80-
(?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
81-
(?:^index[ ](?P<a_blob>[0-9A-Fa-f]+)
82-
\.\.(?P<b_blob>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
83-
""", re.VERBOSE | re.MULTILINE).match
84-
87+
diff_header = cls.re_header.match
8588
for diff in ('\n' + text).split('\ndiff --git')[1:]:
8689
header = diff_header(diff)
8790

8891
a_path, b_path, similarity_index, rename_from, rename_to, \
8992
old_mode, new_mode, new_file_mode, deleted_file_mode, \
90-
a_blob, b_blob, b_mode = header.groups()
93+
a_blob_id, b_blob_id, b_mode = header.groups()
9194
new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)
9295

93-
diffs.append(Diff(repo, a_path, b_path, a_blob, b_blob,
96+
diffs.append(Diff(repo, a_path, b_path, a_blob_id, b_blob_id,
9497
old_mode or deleted_file_mode, new_mode or new_file_mode or b_mode,
9598
new_file, deleted_file, rename_from, rename_to, diff[header.end():]))
9699

0 commit comments

Comments
 (0)