Skip to content

Commit aa05ef0

Browse files
authored
Merge pull request #1743 from cruessler/skip-uninteresting-commits-for-blame
Skip uninteresting commits for blame
2 parents 18e163e + 4428838 commit aa05ef0

File tree

9 files changed

+212
-107
lines changed

9 files changed

+212
-107
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

gitoxide-core/src/repository/blame.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,17 @@ pub fn blame_file(
3535
.next()
3636
.expect("exactly one pattern");
3737

38-
let suspect = repo.head()?.peel_to_commit_in_place()?;
39-
let traverse =
40-
gix::traverse::commit::topo::Builder::from_iters(&repo.objects, [suspect.id], None::<Vec<gix::ObjectId>>)
41-
.with_commit_graph(repo.commit_graph_if_enabled()?)
42-
.build()?;
38+
let suspect: gix::ObjectId = repo.head()?.into_peeled_id()?.into();
39+
let cache: Option<gix::commitgraph::Graph> = repo.commit_graph_if_enabled()?;
4340
let mut resource_cache = repo.diff_resource_cache_for_tree_diff()?;
44-
let outcome = gix::blame::file(&repo.objects, traverse, &mut resource_cache, file.as_bstr(), range)?;
41+
let outcome = gix::blame::file(
42+
&repo.objects,
43+
suspect,
44+
cache,
45+
&mut resource_cache,
46+
file.as_bstr(),
47+
range,
48+
)?;
4549
let statistics = outcome.statistics;
4650
write_blame_entries(out, outcome)?;
4751

gix-blame/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ rust-version = "1.70"
1414
doctest = false
1515

1616
[dependencies]
17+
gix-commitgraph = { version = "^0.26.0", path = "../gix-commitgraph" }
18+
gix-revwalk = { version = "^0.18.0", path = "../gix-revwalk" }
1719
gix-trace = { version = "^0.1.12", path = "../gix-trace" }
1820
gix-diff = { version = "^0.50.0", path = "../gix-diff", default-features = false, features = ["blob"] }
1921
gix-object = { version = "^0.47.0", path = "../gix-object" }
2022
gix-hash = { version = "^0.16.0", path = "../gix-hash" }
2123
gix-worktree = { version = "^0.39.0", path = "../gix-worktree", default-features = false, features = ["attributes"] }
2224
gix-traverse = { version = "^0.44.0", path = "../gix-traverse" }
2325

26+
smallvec = "1.10.0"
2427
thiserror = "2.0.0"
2528

2629
[dev-dependencies]

gix-blame/src/error.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,8 @@ pub enum Error {
2929
DiffTree(#[from] gix_diff::tree::Error),
3030
#[error("Invalid line range was given, line range is expected to be a 1-based inclusive range in the format '<start>,<end>'")]
3131
InvalidLineRange,
32+
#[error("Failure to decode commit during traversal")]
33+
DecodeCommit(#[from] gix_object::decode::Error),
34+
#[error("Failed to get parent from commitgraph during traversal")]
35+
GetParentFromCommitGraph(#[from] gix_commitgraph::file::commit::Error),
3236
}

gix-blame/src/file/function.rs

Lines changed: 118 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,23 @@ use gix_object::{
77
bstr::{BStr, BString},
88
FindExt,
99
};
10+
use gix_traverse::commit::find as find_commit;
11+
use smallvec::SmallVec;
1012
use std::num::NonZeroU32;
1113
use std::ops::Range;
1214

1315
/// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file
14-
/// at `traverse[0]:<file_path>` originated in.
16+
/// at `suspect:<file_path>` originated in.
1517
///
1618
/// ## Paramters
1719
///
1820
/// * `odb`
1921
/// - Access to database objects, also for used for diffing.
2022
/// - Should have an object cache for good diff performance.
21-
/// * `traverse`
22-
/// - The list of commits from the most recent to prior ones, following all parents sorted
23-
/// by time.
24-
/// - It's paramount that older commits are returned after newer ones.
25-
/// - The first commit returned here is the first eligible commit to be responsible for parts of `file_path`.
23+
/// * `suspect`
24+
/// - The first commit to be responsible for parts of `file_path`.
25+
/// * `cache`
26+
/// - Optionally, the commitgraph cache.
2627
/// * `file_path`
2728
/// - A *slash-separated* worktree-relative path to the file to blame.
2829
/// * `range`
@@ -60,20 +61,14 @@ use std::ops::Range;
6061
// <---><----------><-------><-----><------->
6162
// <---><---><-----><-------><-----><------->
6263
// <---><---><-----><-------><-----><-><-><->
63-
pub fn file<E>(
64+
pub fn file(
6465
odb: impl gix_object::Find + gix_object::FindHeader,
65-
traverse: impl IntoIterator<Item = Result<gix_traverse::commit::Info, E>>,
66+
suspect: ObjectId,
67+
cache: Option<gix_commitgraph::Graph>,
6668
resource_cache: &mut gix_diff::blob::Platform,
6769
file_path: &BStr,
6870
range: Option<Range<u32>>,
69-
) -> Result<Outcome, Error>
70-
where
71-
E: Into<Box<dyn std::error::Error + Send + Sync + 'static>>,
72-
{
73-
let mut traverse = traverse.into_iter().peekable();
74-
let Some(Ok(suspect)) = traverse.peek().map(|res| res.as_ref().map(|item| item.id)) else {
75-
return Err(Error::EmptyTraversal);
76-
};
71+
) -> Result<Outcome, Error> {
7772
let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect);
7873

7974
let mut stats = Statistics::default();
@@ -84,13 +79,7 @@ where
8479
commit_id: suspect,
8580
})?;
8681
let blamed_file_blob = odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec();
87-
let num_lines_in_blamed = {
88-
let mut interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100);
89-
tokens_for_diffing(&blamed_file_blob)
90-
.tokenize()
91-
.map(|token| interner.intern(token))
92-
.count()
93-
} as u32;
82+
let num_lines_in_blamed = tokens_for_diffing(&blamed_file_blob).tokenize().count() as u32;
9483

9584
// Binary or otherwise empty?
9685
if num_lines_in_blamed == 0 {
@@ -103,30 +92,40 @@ where
10392
suspects: [(suspect, range_in_blamed_file)].into(),
10493
}];
10594

95+
let (mut buf, mut buf2) = (Vec::new(), Vec::new());
96+
let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?;
97+
let mut queue: gix_revwalk::PriorityQueue<CommitTime, ObjectId> = gix_revwalk::PriorityQueue::new();
98+
queue.insert(commit_time(commit)?, suspect);
99+
106100
let mut out = Vec::new();
107101
let mut diff_state = gix_diff::tree::State::default();
108102
let mut previous_entry: Option<(ObjectId, ObjectId)> = None;
109-
'outer: while let Some(item) = traverse.next() {
103+
'outer: while let Some(suspect) = queue.pop_value() {
104+
stats.commits_traversed += 1;
110105
if hunks_to_blame.is_empty() {
111106
break;
112107
}
113-
let commit = item.map_err(|err| Error::Traverse(err.into()))?;
114-
let suspect = commit.id;
115-
stats.commits_traversed += 1;
116108

117-
let parent_ids = commit.parent_ids;
109+
let is_still_suspect = hunks_to_blame.iter().any(|hunk| hunk.suspects.contains_key(&suspect));
110+
if !is_still_suspect {
111+
// There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with
112+
// the next one.
113+
continue 'outer;
114+
}
115+
116+
let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?;
117+
let parent_ids: ParentIds = collect_parents(commit, &odb, cache.as_ref(), &mut buf2)?;
118118
if parent_ids.is_empty() {
119-
if traverse.peek().is_none() {
120-
// I’m not entirely sure if this is correct yet. `suspect`, at this point, is the `id` of
121-
// the last `item` that was yielded by `traverse`, so it makes sense to assign the
122-
// remaining lines to it, even though we don’t explicitly check whether that is true
123-
// here. We could perhaps use diff-tree-to-tree to compare `suspect`
124-
// against an empty tree to validate this assumption.
119+
if queue.is_empty() {
120+
// I’m not entirely sure if this is correct yet. `suspect`, at this point, is the
121+
// `id` of the last `item` that was yielded by `queue`, so it makes sense to assign
122+
// the remaining lines to it, even though we don’t explicitly check whether that is
123+
// true here. We could perhaps use diff-tree-to-tree to compare `suspect` against
124+
// an empty tree to validate this assumption.
125125
if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
126126
break 'outer;
127127
}
128128
}
129-
130129
// There is more, keep looking.
131130
continue;
132131
}
@@ -143,7 +142,41 @@ where
143142
continue;
144143
};
145144

146-
for (pid, parent_id) in parent_ids.iter().enumerate() {
145+
// This block asserts that, for every `UnblamedHunk`, all lines in the *Blamed File* are
146+
// identical to the corresponding lines in the *Source File*.
147+
#[cfg(debug_assertions)]
148+
{
149+
let source_blob = odb.find_blob(&entry_id, &mut buf)?.data.to_vec();
150+
let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100);
151+
let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob)
152+
.tokenize()
153+
.map(|token| source_interner.intern(token))
154+
.collect();
155+
156+
let mut blamed_interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100);
157+
let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&blamed_file_blob)
158+
.tokenize()
159+
.map(|token| blamed_interner.intern(token))
160+
.collect();
161+
162+
for hunk in hunks_to_blame.iter() {
163+
if let Some(range_in_suspect) = hunk.suspects.get(&suspect) {
164+
let range_in_blamed_file = hunk.range_in_blamed_file.clone();
165+
166+
for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) {
167+
let source_token = source_lines_as_tokens[source_line_number as usize];
168+
let blame_token = blamed_lines_as_tokens[blamed_line_number as usize];
169+
170+
let source_line = BString::new(source_interner[source_token].into());
171+
let blamed_line = BString::new(blamed_interner[blame_token].into());
172+
173+
assert_eq!(source_line, blamed_line);
174+
}
175+
}
176+
}
177+
}
178+
179+
for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() {
147180
if let Some(parent_entry_id) =
148181
find_path_entry_in_commit(&odb, parent_id, file_path, &mut buf, &mut buf2, &mut stats)?
149182
{
@@ -153,17 +186,19 @@ where
153186
}
154187
if no_change_in_entry {
155188
pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame);
189+
queue.insert(*parent_commit_time, *parent_id);
156190
continue 'outer;
157191
}
158192
}
159193
}
160194

161195
let more_than_one_parent = parent_ids.len() > 1;
162-
for parent_id in parent_ids {
196+
for (parent_id, parent_commit_time) in parent_ids {
197+
queue.insert(parent_commit_time, parent_id);
163198
let changes_for_file_path = tree_diff_at_file_path(
164199
&odb,
165200
file_path,
166-
commit.id,
201+
suspect,
167202
parent_id,
168203
&mut stats,
169204
&mut diff_state,
@@ -588,8 +623,51 @@ fn find_path_entry_in_commit(
588623
Ok(res.map(|e| e.oid))
589624
}
590625

591-
/// Return an iterator over tokens for use in diffing. These usually lines, but iit's important to unify them
592-
/// so the later access shows the right thing.
626+
type CommitTime = i64;
627+
628+
fn commit_time(commit: gix_traverse::commit::Either<'_, '_>) -> Result<CommitTime, gix_object::decode::Error> {
629+
match commit {
630+
gix_traverse::commit::Either::CommitRefIter(commit_ref_iter) => {
631+
commit_ref_iter.committer().map(|c| c.time.seconds)
632+
}
633+
gix_traverse::commit::Either::CachedCommit(commit) => Ok(commit.committer_timestamp() as i64),
634+
}
635+
}
636+
637+
type ParentIds = SmallVec<[(gix_hash::ObjectId, i64); 2]>;
638+
639+
fn collect_parents(
640+
commit: gix_traverse::commit::Either<'_, '_>,
641+
odb: &impl gix_object::Find,
642+
cache: Option<&gix_commitgraph::Graph>,
643+
buf: &mut Vec<u8>,
644+
) -> Result<ParentIds, Error> {
645+
let mut parent_ids: ParentIds = Default::default();
646+
match commit {
647+
gix_traverse::commit::Either::CachedCommit(commit) => {
648+
let cache = cache
649+
.as_ref()
650+
.expect("find returned a cached commit, so we expect cache to be present");
651+
for parent_pos in commit.iter_parents() {
652+
let parent = cache.commit_at(parent_pos?);
653+
parent_ids.push((parent.id().to_owned(), parent.committer_timestamp() as i64));
654+
}
655+
}
656+
gix_traverse::commit::Either::CommitRefIter(commit_ref_iter) => {
657+
for id in commit_ref_iter.parent_ids() {
658+
let parent = odb.find_commit_iter(id.as_ref(), buf).ok();
659+
let parent_commit_time = parent
660+
.and_then(|parent| parent.committer().ok().map(|committer| committer.time.seconds))
661+
.unwrap_or_default();
662+
parent_ids.push((id, parent_commit_time));
663+
}
664+
}
665+
};
666+
Ok(parent_ids)
667+
}
668+
669+
/// Return an iterator over tokens for use in diffing. These are usually lines, but it's important
670+
/// to unify them so the later access shows the right thing.
593671
pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource<Token = &[u8]> {
594672
gix_diff::blob::sources::byte_lines_with_terminator(data)
595673
}

0 commit comments

Comments
 (0)