Skip to content

Commit 144fdba

Browse files
committed
optimize popcount implementation
The gcc backend of rustc currently emits the following for a function that implements popcount for a u32 (x86_64 targeting AVX2, using the standard Unix calling convention):

    popcount:
        mov eax, edi
        and edi, 1431655765
        shr eax
        and eax, 1431655765
        add edi, eax
        mov edx, edi
        and edi, 858993459
        shr edx, 2
        and edx, 858993459
        add edx, edi
        mov eax, edx
        and edx, 252645135
        shr eax, 4
        and eax, 252645135
        add eax, edx
        mov edx, eax
        and eax, 16711935
        shr edx, 8
        and edx, 16711935
        add edx, eax
        movzx eax, dx
        shr edx, 16
        add eax, edx
        ret

Rather than using this implementation, gcc could be told to use Wegner's algorithm. This would give the same function the following implementation:

    popcount:
        xor eax, eax
        xor edx, edx
        popcnt eax, edi
        test edi, edi
        cmove eax, edx
        ret

This patch implements the popcount operation in terms of Wegner's algorithm in all cases.

Signed-off-by: Andy Sadler <[email protected]>
1 parent c6447be commit 144fdba

File tree

1 file changed

+38
-70
lines changed

1 file changed

+38
-70
lines changed

src/intrinsic/mod.rs

+38-70
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ mod simd;
44
#[cfg(feature="master")]
55
use std::iter;
66

7-
use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
7+
use gccjit::{BinaryOp, ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
88
use rustc_codegen_ssa::MemFlags;
99
use rustc_codegen_ssa::base::wants_msvc_seh;
1010
use rustc_codegen_ssa::common::IntPredicate;
@@ -819,75 +819,43 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
819819
value
820820
};
821821

822-
if value_type.is_u128(&self.cx) {
823-
// TODO(antoyo): implement in the normal algorithm below to have a more efficient
824-
// implementation (that does not require a call to __popcountdi2).
825-
let popcount = self.context.get_builtin_function("__builtin_popcountll");
826-
let sixty_four = self.gcc_int(value_type, 64);
827-
let right_shift = self.gcc_lshr(value, sixty_four);
828-
let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
829-
let high = self.context.new_call(None, popcount, &[high]);
830-
let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
831-
let low = self.context.new_call(None, popcount, &[low]);
832-
let res = high + low;
833-
return self.gcc_int_cast(res, result_type);
834-
}
835-
836-
// First step.
837-
let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555);
838-
let left = value & mask;
839-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 1);
840-
let right = shifted & mask;
841-
let value = left + right;
842-
843-
// Second step.
844-
let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333);
845-
let left = value & mask;
846-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 2);
847-
let right = shifted & mask;
848-
let value = left + right;
849-
850-
// Third step.
851-
let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F);
852-
let left = value & mask;
853-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 4);
854-
let right = shifted & mask;
855-
let value = left + right;
856-
857-
if value_type.is_u8(&self.cx) {
858-
return self.context.new_cast(None, value, result_type);
859-
}
860-
861-
// Fourth step.
862-
let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF);
863-
let left = value & mask;
864-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 8);
865-
let right = shifted & mask;
866-
let value = left + right;
867-
868-
if value_type.is_u16(&self.cx) {
869-
return self.context.new_cast(None, value, result_type);
870-
}
871-
872-
// Fifth step.
873-
let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF);
874-
let left = value & mask;
875-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 16);
876-
let right = shifted & mask;
877-
let value = left + right;
878-
879-
if value_type.is_u32(&self.cx) {
880-
return self.context.new_cast(None, value, result_type);
881-
}
882-
883-
// Sixth step.
884-
let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF);
885-
let left = value & mask;
886-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 32);
887-
let right = shifted & mask;
888-
let value = left + right;
889-
890-
self.context.new_cast(None, value, result_type)
822+
// Use Wegner's algorithm for population count; gcc seems to play better with it
823+
// for (int counter = 0; value != 0; counter++) {
824+
// value &= value - 1;
825+
// }
826+
let func = self.current_func.borrow().expect("func");
827+
let loop_head = func.new_block("head");
828+
let loop_body = func.new_block("body");
829+
let loop_tail = func.new_block("tail");
830+
831+
// gcc seems to optimize better if we use an int here
832+
let counter_type = self.cx.int_type;
833+
let counter = self.current_func().new_local(None, counter_type, "popcount_counter");
834+
let val = self.current_func().new_local(None, value_type, "popcount_value");
835+
let zero = self.cx.gcc_zero(counter_type);
836+
self.llbb().add_assignment(None, counter, zero);
837+
self.llbb().add_assignment(None, val, value);
838+
self.br(loop_head);
839+
840+
// check if value isn't zero
841+
self.switch_to_block(loop_head);
842+
let zero = self.cx.gcc_zero(value_type);
843+
let cond = self.gcc_icmp(IntPredicate::IntNE, val.to_rvalue(), zero);
844+
self.cond_br(cond, loop_body, loop_tail);
845+
846+
// val &= val - 1;
847+
self.switch_to_block(loop_body);
848+
let sub = val.to_rvalue() - self.context.new_rvalue_one(value_type);
849+
loop_body.add_assignment_op(None, val, BinaryOp::BitwiseAnd, sub);
850+
851+
// counter += 1
852+
let one = self.context.new_rvalue_one(counter_type);
853+
loop_body.add_assignment_op(None, counter, BinaryOp::Plus, one);
854+
self.br(loop_head);
855+
856+
// end of loop
857+
self.switch_to_block(loop_tail);
858+
self.gcc_int_cast(counter.to_rvalue(), result_type)
891859
}
892860

893861
// Algorithm from: https://blog.regehr.org/archives/1063

0 commit comments

Comments
 (0)