Skip to content

optimize popcount implementation #348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 18, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 39 additions & 61 deletions src/intrinsic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ mod simd;
#[cfg(feature="master")]
use std::iter;

use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
use gccjit::{BinaryOp, ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
use rustc_codegen_ssa::MemFlags;
use rustc_codegen_ssa::base::wants_msvc_seh;
use rustc_codegen_ssa::common::IntPredicate;
Expand Down Expand Up @@ -820,74 +820,52 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
};

if value_type.is_u128(&self.cx) {
// TODO(antoyo): implement in the normal algorithm below to have a more efficient
// implementation (that does not require a call to __popcountdi2).
let popcount = self.context.get_builtin_function("__builtin_popcountll");
let sixty_four = self.gcc_int(value_type, 64);
let right_shift = self.gcc_lshr(value, sixty_four);
let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
let high = self.context.new_call(None, popcount, &[high]);
let high = self.pop_count(high);
let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
let low = self.context.new_call(None, popcount, &[low]);
let low = self.pop_count(low);
let res = high + low;
return self.gcc_int_cast(res, result_type);
}

// First step.
let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 1);
let right = shifted & mask;
let value = left + right;

// Second step.
let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 2);
let right = shifted & mask;
let value = left + right;

// Third step.
let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 4);
let right = shifted & mask;
let value = left + right;

if value_type.is_u8(&self.cx) {
return self.context.new_cast(None, value, result_type);
}

// Fourth step.
let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 8);
let right = shifted & mask;
let value = left + right;

if value_type.is_u16(&self.cx) {
return self.context.new_cast(None, value, result_type);
}

// Fifth step.
let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 16);
let right = shifted & mask;
let value = left + right;

if value_type.is_u32(&self.cx) {
return self.context.new_cast(None, value, result_type);
}

// Sixth step.
let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF);
let left = value & mask;
let shifted = value >> self.context.new_rvalue_from_int(value_type, 32);
let right = shifted & mask;
let value = left + right;

self.context.new_cast(None, value, result_type)
// Use Wenger's algorithm for population count, gcc's seems to play better with it
// for (int counter = 0; value != 0; counter++) {
// value &= value - 1;
// }
let func = self.current_func.borrow().expect("func");
let loop_head = func.new_block("head");
let loop_body = func.new_block("body");
let loop_tail = func.new_block("tail");

let counter_type = self.int_type;
let counter = self.current_func().new_local(None, counter_type, "popcount_counter");
let val = self.current_func().new_local(None, value_type, "popcount_value");
let zero = self.context.new_rvalue_zero(counter_type);
self.llbb().add_assignment(None, counter, zero);
self.llbb().add_assignment(None, val, value);
self.br(loop_head);

// check if value isn't zero
self.switch_to_block(loop_head);
let zero = self.context.new_rvalue_zero(value_type);
let cond = self.context.new_comparison(None, ComparisonOp::NotEquals, val.to_rvalue(), zero);
self.cond_br(cond, loop_body, loop_tail);

// val &= val - 1;
self.switch_to_block(loop_body);
let sub = val.to_rvalue() - self.context.new_rvalue_one(value_type);
loop_body.add_assignment_op(None, val, BinaryOp::BitwiseAnd, sub);

// counter += 1
let one = self.context.new_rvalue_one(counter_type);
loop_body.add_assignment_op(None, counter, BinaryOp::Plus, one);
self.br(loop_head);

// end of loop
self.switch_to_block(loop_tail);
self.context.new_cast(None, counter.to_rvalue(), result_type)
}

// Algorithm from: https://blog.regehr.org/archives/1063
Expand Down