Skip to content
This repository was archived by the owner on May 28, 2025. It is now read-only.

Commit 7c3744e

Browse files
committed
internal: Speedup line index calculation via NEON for aarch64
1 parent 9d8889c commit 7c3744e

File tree

1 file changed

+111
-1
lines changed

1 file changed

+111
-1
lines changed

lib/line-index/src/lib.rs

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,22 @@ fn analyze_source_file_dispatch(
227227
}
228228
}
229229

230+
#[cfg(target_arch = "aarch64")]
231+
fn analyze_source_file_dispatch(
232+
src: &str,
233+
lines: &mut Vec<TextSize>,
234+
multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
235+
) {
236+
if std::arch::is_aarch64_feature_detected!("neon") {
237+
// SAFETY: NEON support was checked
238+
unsafe {
239+
analyze_source_file_neon(src, lines, multi_byte_chars);
240+
}
241+
} else {
242+
analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
243+
}
244+
}
245+
230246
/// Checks 16 byte chunks of text at a time. If the chunk contains
231247
/// something other than printable ASCII characters and newlines, the
232248
/// function falls back to the generic implementation. Otherwise it uses
@@ -322,7 +338,101 @@ unsafe fn analyze_source_file_sse2(
322338
}
323339
}
324340

325-
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
341+
#[target_feature(enable = "neon")]
342+
#[cfg(any(target_arch = "aarch64"))]
343+
// See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
344+
//
345+
// The mask is a 64-bit integer, where each 4-bit corresponds to a u8 in the
346+
// input vector. The least significant 4 bits correspond to the first byte in
347+
// the vector.
348+
unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
349+
use std::arch::aarch64::*;
350+
351+
let nibble_mask = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
352+
vget_lane_u64(vreinterpret_u64_u8(nibble_mask), 0)
353+
}
354+
355+
#[target_feature(enable = "neon")]
356+
#[cfg(any(target_arch = "aarch64"))]
357+
unsafe fn analyze_source_file_neon(
358+
src: &str,
359+
lines: &mut Vec<TextSize>,
360+
multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
361+
) {
362+
use std::arch::aarch64::*;
363+
364+
const CHUNK_SIZE: usize = 16;
365+
366+
let src_bytes = src.as_bytes();
367+
368+
let chunk_count = src.len() / CHUNK_SIZE;
369+
370+
let newline = vdupq_n_s8(b'\n' as i8);
371+
372+
// This variable keeps track of where we should start decoding a
373+
// chunk. If a multi-byte character spans across chunk boundaries,
374+
// we need to skip that part in the next chunk because we already
375+
// handled it.
376+
let mut intra_chunk_offset = 0;
377+
378+
for chunk_index in 0..chunk_count {
379+
let ptr = src_bytes.as_ptr() as *const i8;
380+
let chunk = vld1q_s8(ptr.add(chunk_index * CHUNK_SIZE));
381+
382+
// For character in the chunk, see if its byte value is < 0, which
383+
// indicates that it's part of a UTF-8 char.
384+
let multibyte_test = vcltzq_s8(chunk);
385+
// Create a bit mask from the comparison results.
386+
let multibyte_mask = move_mask(multibyte_test);
387+
388+
// If the bit mask is all zero, we only have ASCII chars here:
389+
if multibyte_mask == 0 {
390+
assert!(intra_chunk_offset == 0);
391+
392+
// Check for newlines in the chunk
393+
let newlines_test = vceqq_s8(chunk, newline);
394+
let mut newlines_mask = move_mask(newlines_test);
395+
396+
// If the bit mask is all zero, there are no newlines in this chunk.
397+
if newlines_mask != 0 {
398+
let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);
399+
400+
while newlines_mask != 0 {
401+
let trailing_zeros = newlines_mask.trailing_zeros();
402+
let index = trailing_zeros / 4;
403+
404+
lines.push(TextSize::from(index) + output_offset);
405+
406+
// Clear the current 4-bit, so we can find the next one.
407+
newlines_mask &= (!0xF) << trailing_zeros;
408+
}
409+
}
410+
continue;
411+
}
412+
413+
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
414+
intra_chunk_offset = analyze_source_file_generic(
415+
&src[scan_start..],
416+
CHUNK_SIZE - intra_chunk_offset,
417+
TextSize::from(scan_start as u32),
418+
lines,
419+
multi_byte_chars,
420+
);
421+
}
422+
423+
let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
424+
if tail_start < src.len() {
425+
analyze_source_file_generic(
426+
&src[tail_start..],
427+
src.len() - tail_start,
428+
TextSize::from(tail_start as u32),
429+
lines,
430+
multi_byte_chars,
431+
);
432+
}
433+
}
434+
435+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
326436
// The target (or compiler version) does not support SSE2 ...
327437
fn analyze_source_file_dispatch(
328438
src: &str,

0 commit comments

Comments
 (0)