Skip to content

Commit fd49eee

Browse files
authored
Merge pull request GitoxideLabs#2011 from blinxen/main
Add Sink that implements git's diffing improvement heuristics
2 parents 7ae3797 + 5505646 commit fd49eee

File tree

7 files changed

+497
-3
lines changed

7 files changed

+497
-3
lines changed

gix-diff/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ gix-trace = { version = "^0.1.12", path = "../gix-trace", optional = true }
4242
gix-traverse = { version = "^0.46.2", path = "../gix-traverse", optional = true }
4343

4444
thiserror = "2.0.0"
45-
imara-diff = { version = "0.1.7", optional = true }
45+
imara-diff = { version = "0.1.8", optional = true }
4646
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
4747
getrandom = { version = "0.2.8", optional = true, default-features = false, features = ["js"] }
4848
bstr = { version = "1.12.0", default-features = false }

gix-diff/src/blob/git_diff.rs

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
//! Facilities to produce git-formatted diffs.
2+
3+
use crate::blob::GitDiff;
4+
use bstr::ByteSlice;
5+
use imara_diff::intern::{InternedInput, Interner, Token};
6+
use imara_diff::Sink;
7+
use std::cmp::Ordering;
8+
use std::ops::Range;
9+
10+
// The numbers below are taken from git's implementation of the indent heuristic, see
// https://github.com/git/git/blob/324fbaab88126196bd42e7fa383ee94e165d61b5/xdiff/xdiffi.c#L535

/// Largest indentation `get_indent()` reports; wider indentation is clamped to this value.
const MAX_INDENT: u8 = 200;
/// Number of blank lines after which scanning around a split gives up.
const MAX_BLANKS: i16 = 20;
/// Weight of an indentation difference when two [`Score`]s are compared.
const INDENT_WEIGHT: i16 = 60;
/// Candidate positions further than this many lines above the bottom-most position are not scored.
const INDENT_HEURISTIC_MAX_SLIDING: usize = 100;

/// Added when nothing at all precedes the split.
const START_OF_FILE_PENALTY: i16 = 1;
/// Added when the split is located at the end of the file.
const END_OF_FILE_PENALTY: i16 = 21;
/// Multiplied with the number of blank lines counted around the split.
const TOTAL_BLANK_WEIGHT: i16 = -30;
/// Multiplied with the number of blank lines counted at and below the split.
const POST_BLANK_WEIGHT: i16 = 6;
/// Added when the split line is indented more than the preceding non-blank line and no blank lines are involved.
const RELATIVE_INDENT_PENALTY: i16 = -4;
/// Like [`RELATIVE_INDENT_PENALTY`], but used when blank lines are involved.
const RELATIVE_INDENT_WITH_BLANK_PENALTY: i16 = 10;
/// Added when the split line is indented less than the preceding non-blank line, but the following
/// non-blank line is indented more than the split line, and no blank lines are involved.
const RELATIVE_OUTDENT_PENALTY: i16 = 24;
/// Like [`RELATIVE_OUTDENT_PENALTY`], but used when blank lines are involved.
const RELATIVE_OUTDENT_WITH_BLANK_PENALTY: i16 = 17;
/// Added when the split line is indented less than the preceding non-blank line and the following
/// non-blank line is not indented more than the split line, and no blank lines are involved.
const RELATIVE_DEDENT_PENALTY: i16 = 23;
/// Like [`RELATIVE_DEDENT_PENALTY`], but used when blank lines are involved.
const RELATIVE_DEDENT_WITH_BLANK_PENALTY: i16 = 17;
27+
28+
pub(super) mod types {
29+
use crate::blob::git_diff::ChangeGroup;
30+
31+
/// A [`Sink`](imara_diff::Sink) that creates a diff like git would.
32+
///
33+
/// See the [diff slider repository](https://github.com/mhagger/diff-slider-tools) for more information.
34+
pub struct GitDiff<'a, T>
35+
where
36+
T: AsRef<[u8]>,
37+
{
38+
pub(crate) after: &'a [imara_diff::intern::Token],
39+
pub(crate) interner: &'a imara_diff::intern::Interner<T>,
40+
pub(crate) changes: Vec<ChangeGroup>,
41+
}
42+
}
43+
44+
/// The kind of change a [`ChangeGroup`] stands for.
///
/// The kind is decided when a hunk is recorded, based on which of its two line ranges is empty.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
pub enum ChangeKind {
    /// Lines were inserted and nothing was removed, i.e. the `before` range is empty.
    Added,
    /// Lines were removed and nothing was inserted, with the removal located at the very top of
    /// the new version, i.e. the empty `after` range starts at line 0.
    RemovedAbove,
    /// Lines were removed and nothing was inserted, with the removal located below at least one
    /// line of the new version, i.e. the empty `after` range starts past line 0.
    RemovedBelow,
    /// Every other hunk, typically lines that were replaced by other lines so that both the
    /// `before` and the `after` range are non-empty.
    Modified,
}
56+
57+
#[derive(PartialEq)]
58+
struct Score {
59+
effective_indent: i16,
60+
penalty: i16,
61+
}
62+
63+
impl PartialOrd for Score {
64+
// A score is considered "Greater" if it is equal or less than 0
65+
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
66+
let indent_penalty = match self.effective_indent.cmp(&other.effective_indent) {
67+
Ordering::Greater => INDENT_WEIGHT,
68+
Ordering::Less => -INDENT_WEIGHT,
69+
Ordering::Equal => 0,
70+
};
71+
72+
Some((indent_penalty + (self.penalty - other.penalty)).cmp(&0).reverse())
73+
}
74+
}
75+
76+
/// A [`ChangeGroup`] represents a block of changed lines.
77+
#[derive(PartialEq, Debug)]
78+
pub struct ChangeGroup {
79+
/// Range indicating the lines of the previous block.
80+
/// To actually see what the previous block looked like, you need to combine this range with
81+
/// the [`InternedInput`].
82+
pub before: Range<usize>,
83+
/// Range indicating the lines of the new block
84+
/// To actually see how the current block looks like, you need to combine this range with
85+
/// the [`InternedInput`].
86+
pub after: Range<usize>,
87+
/// Further specify what kind of change is denoted by the ranges above.
88+
pub change_kind: ChangeKind,
89+
}
90+
91+
impl ChangeGroup {
92+
/// Return [before](Self::before) and [after](Self::after) as `u32` ranges for use in [Sink::process_change()].
93+
///
94+
/// This is useful for creating [unified diffs](crate::blob::UnifiedDiff), for example.
95+
pub fn as_u32_ranges(&self) -> (Range<u32>, Range<u32>) {
96+
(
97+
self.before.start as u32..self.before.end as u32,
98+
self.after.start as u32..self.after.end as u32,
99+
)
100+
}
101+
}
102+
103+
// Calculate the indentation of a single line as number of tabs.
104+
fn get_indent(s: &[u8]) -> Option<u8> {
105+
let mut indent = 0;
106+
107+
for char in s.bytes() {
108+
if !char.is_ascii_whitespace() {
109+
return Some(indent);
110+
} else if char == b' ' {
111+
indent += 1;
112+
} else if char == b'\t' {
113+
indent += 8 - indent % 8;
114+
}
115+
116+
if indent >= MAX_INDENT {
117+
return Some(MAX_INDENT);
118+
}
119+
}
120+
121+
None
122+
}
123+
124+
fn measure_and_score_change<T: AsRef<[u8]>>(lines: &[Token], split: usize, interner: &Interner<T>, score: &mut Score) {
125+
// Gather information about the surroundings of the change
126+
let end_of_file = split >= lines.len();
127+
let mut indent: Option<u8> = if split >= lines.len() {
128+
None
129+
} else {
130+
get_indent(interner[lines[split]].as_ref())
131+
};
132+
let mut pre_blank = 0;
133+
let mut pre_indent: Option<u8> = None;
134+
let mut post_blank = 0;
135+
let mut post_indent: Option<u8> = None;
136+
137+
for line in (0..=split.saturating_sub(1)).rev() {
138+
pre_indent = get_indent(interner[lines[line]].as_ref());
139+
if pre_indent.is_none() {
140+
pre_blank += 1;
141+
if pre_blank == MAX_BLANKS {
142+
pre_indent = Some(0);
143+
break;
144+
}
145+
}
146+
}
147+
for line in split + 1..lines.len() {
148+
post_indent = get_indent(interner[lines[line]].as_ref());
149+
if post_indent.is_none() {
150+
post_blank += 1;
151+
if post_blank == MAX_BLANKS {
152+
post_indent = Some(0);
153+
break;
154+
}
155+
}
156+
}
157+
158+
// Calculate score of the currently applied split
159+
post_blank = if indent.is_none() { 1 + post_blank } else { 0 };
160+
let total_blank = pre_blank + post_blank;
161+
if indent.is_none() {
162+
indent = post_indent;
163+
}
164+
let any_blanks = total_blank != 0;
165+
166+
if pre_indent.is_none() && pre_blank == 0 {
167+
score.penalty += START_OF_FILE_PENALTY;
168+
}
169+
170+
if end_of_file {
171+
score.penalty += END_OF_FILE_PENALTY;
172+
}
173+
174+
score.penalty += TOTAL_BLANK_WEIGHT * total_blank;
175+
score.penalty += POST_BLANK_WEIGHT * post_blank;
176+
177+
score.effective_indent += if let Some(indent) = indent { indent as i16 } else { -1 };
178+
179+
if indent.is_none() || pre_indent.is_none() || indent == pre_indent {
180+
} else if indent > pre_indent {
181+
score.penalty += if any_blanks {
182+
RELATIVE_INDENT_WITH_BLANK_PENALTY
183+
} else {
184+
RELATIVE_INDENT_PENALTY
185+
};
186+
} else if post_indent.is_some() && post_indent > indent {
187+
score.penalty += if any_blanks {
188+
RELATIVE_OUTDENT_WITH_BLANK_PENALTY
189+
} else {
190+
RELATIVE_OUTDENT_PENALTY
191+
};
192+
} else {
193+
score.penalty += if any_blanks {
194+
RELATIVE_DEDENT_WITH_BLANK_PENALTY
195+
} else {
196+
RELATIVE_DEDENT_PENALTY
197+
};
198+
}
199+
}
200+
201+
impl<'a, T> GitDiff<'a, T>
202+
where
203+
T: AsRef<[u8]>,
204+
{
205+
/// Create a new instance of [`GitDiff`] that can then be passed to [`imara_diff::diff`]
206+
/// and generate a more human-readable diff.
207+
pub fn new(input: &'a InternedInput<T>) -> Self {
208+
Self {
209+
after: &input.after,
210+
interner: &input.interner,
211+
changes: Vec::new(),
212+
}
213+
}
214+
}
215+
216+
impl<T> Sink for GitDiff<'_, T>
217+
where
218+
T: AsRef<[u8]>,
219+
{
220+
type Out = Vec<ChangeGroup>;
221+
222+
fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
223+
let change_kind = match (before.is_empty(), after.is_empty()) {
224+
(true, false) => ChangeKind::Added,
225+
(false, true) => {
226+
if after.start == 0 {
227+
ChangeKind::RemovedAbove
228+
} else {
229+
ChangeKind::RemovedBelow
230+
}
231+
}
232+
_ => ChangeKind::Modified,
233+
};
234+
self.changes.push(ChangeGroup {
235+
before: before.start as usize..before.end as usize,
236+
after: after.start as usize..after.end as usize,
237+
change_kind,
238+
});
239+
}
240+
241+
fn finish(mut self) -> Self::Out {
242+
if self.changes.is_empty() {
243+
return self.changes;
244+
}
245+
246+
let mut shift: usize;
247+
for change in &mut self.changes {
248+
// Skip one-liner changes
249+
if change.after.is_empty() {
250+
continue;
251+
}
252+
253+
// Move this change up by one line if the line before the change and the last line in
254+
// the change are equal
255+
loop {
256+
if change.after.start > 0 && self.after[change.after.start - 1] == self.after[change.after.end - 1] {
257+
change.after.start -= 1;
258+
change.after.end -= 1;
259+
} else {
260+
break;
261+
}
262+
}
263+
264+
shift = change.after.end;
265+
266+
// Move this change down by one line if the first line in the change the line after the
267+
// change are equal
268+
loop {
269+
if change.after.end < self.after.len() && self.after[change.after.start] == self.after[change.after.end]
270+
{
271+
change.after.start += 1;
272+
change.after.end += 1;
273+
} else {
274+
break;
275+
}
276+
}
277+
278+
let mut best_shift: Option<usize> = None;
279+
let mut best_score = Score {
280+
effective_indent: 0,
281+
penalty: 0,
282+
};
283+
284+
if change.after.end.saturating_sub(change.after.len()) > shift {
285+
shift = change.after.end - change.after.len();
286+
}
287+
288+
if change.after.end.saturating_sub(INDENT_HEURISTIC_MAX_SLIDING) > shift {
289+
shift = change.after.end - INDENT_HEURISTIC_MAX_SLIDING;
290+
}
291+
292+
while shift <= change.after.end {
293+
let mut score = Score {
294+
effective_indent: 0,
295+
penalty: 0,
296+
};
297+
298+
measure_and_score_change(self.after, shift, self.interner, &mut score);
299+
measure_and_score_change(self.after, shift - change.after.len(), self.interner, &mut score);
300+
301+
if best_shift.is_none() || score > best_score {
302+
best_score = score;
303+
best_shift = Some(shift);
304+
}
305+
shift += 1;
306+
}
307+
308+
if let Some(best_shift) = best_shift {
309+
while change.after.end > best_shift {
310+
loop {
311+
if change.after.start > 0
312+
&& self.after[change.after.start - 1] == self.after[change.after.end - 1]
313+
{
314+
change.after.start -= 1;
315+
change.after.end -= 1;
316+
} else {
317+
break;
318+
}
319+
}
320+
}
321+
}
322+
}
323+
324+
self.changes
325+
}
326+
}

gix-diff/src/blob/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ pub mod platform;
1414
pub mod unified_diff;
1515
pub use unified_diff::_impl::UnifiedDiff;
1616

17+
pub mod git_diff;
18+
pub use git_diff::types::GitDiff;
19+
1720
/// Information about the diff performed to detect similarity.
1821
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)]
1922
pub struct DiffLineStats {

0 commit comments

Comments
 (0)