optimize popcount implementation

sadlerap · sadlerap · commit 144fdba0c571 · 2023-10-09T17:18:54.000-05:00
In the current implementation, the gcc backend of rustc currently emits the
following for a function that implements popcount for a u32 (x86_64 targeting
AVX2, using standard unix calling convention):

    popcount:
        mov     eax, edi
        and     edi, 1431655765
        shr     eax
        and     eax, 1431655765
        add     edi, eax
        mov     edx, edi
        and     edi, 858993459
        shr     edx, 2
        and     edx, 858993459
        add     edx, edi
        mov     eax, edx
        and     edx, 252645135
        shr     eax, 4
        and     eax, 252645135
        add     eax, edx
        mov     edx, eax
        and     eax, 16711935
        shr     edx, 8
        and     edx, 16711935
        add     edx, eax
        movzx   eax, dx
        shr     edx, 16
        add     eax, edx
        ret

Rather than using this implementation, gcc could be told to use Wenger's
algorithm.  This would give the same function the following implementation:

    popcount:
        xor eax, eax
        xor edx, edx
        popcnt eax, edi
        test edi, edi
        cmove eax, edx
        ret

This patch implements the popcount operation in terms of Wenger's algorithm in
all cases.

Signed-off-by: Andy Sadler &lt;andrewsadler122@gmail.com&gt;
diff --git a/src/intrinsic/mod.rs b/src/intrinsic/mod.rs
@@ -4,7 +4,7 @@ mod simd;
 #[cfg(feature="master")]
 use std::iter;
 
-use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
+use gccjit::{BinaryOp, ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
 use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::base::wants_msvc_seh;
 use rustc_codegen_ssa::common::IntPredicate;
@@ -819,75 +819,43 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                 value
             };
 
-        if value_type.is_u128(&self.cx) {
-            // TODO(antoyo): implement in the normal algorithm below to have a more efficient
-            // implementation (that does not require a call to __popcountdi2).
-            let popcount = self.context.get_builtin_function("__builtin_popcountll");
-            let sixty_four = self.gcc_int(value_type, 64);
-            let right_shift = self.gcc_lshr(value, sixty_four);
-            let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
-            let high = self.context.new_call(None, popcount, &[high]);
-            let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
-            let low = self.context.new_call(None, popcount, &[low]);
-            let res = high + low;
-            return self.gcc_int_cast(res, result_type);
-        }
-
-        // First step.
-        let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555);
-        let left = value & mask;
-        let shifted = value >> self.context.new_rvalue_from_int(value_type, 1);
-        let right = shifted & mask;
-        let value = left + right;
-
-        // Second step.
-        let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333);
-        let left = value & mask;
-        let shifted = value >> self.context.new_rvalue_from_int(value_type, 2);
-        let right = shifted & mask;
-        let value = left + right;
-
-        // Third step.
-        let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F);
-        let left = value & mask;
-        let shifted = value >> self.context.new_rvalue_from_int(value_type, 4);
-        let right = shifted & mask;
-        let value = left + right;
-
-        if value_type.is_u8(&self.cx) {
-            return self.context.new_cast(None, value, result_type);
-        }
-
-        // Fourth step.
-        let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF);
-        let left = value & mask;
-        let shifted = value >> self.context.new_rvalue_from_int(value_type, 8);
-        let right = shifted & mask;
-        let value = left + right;
-
-        if value_type.is_u16(&self.cx) {
-            return self.context.new_cast(None, value, result_type);
-        }
-
-        // Fifth step.
-        let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF);
-        let left = value & mask;
-        let shifted = value >> self.context.new_rvalue_from_int(value_type, 16);
-        let right = shifted & mask;
-        let value = left + right;
-
-        if value_type.is_u32(&self.cx) {
-            return self.context.new_cast(None, value, result_type);
-        }
-
-        // Sixth step.
-        let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF);
-        let left = value & mask;
-        let shifted = value >> self.context.new_rvalue_from_int(value_type, 32);
-        let right = shifted & mask;
-        let value = left + right;
-
-        self.context.new_cast(None, value, result_type)
+        // Use Wenger's algorithm for population count, gcc's seems to play better with it
+        // for (int counter = 0; value != 0; counter++) {
+        //     value &= value - 1;
+        // }
+        let func = self.current_func.borrow().expect("func");
+        let loop_head = func.new_block("head");
+        let loop_body = func.new_block("body");
+        let loop_tail = func.new_block("tail");
+
+        // gcc seems to optimize better if we use an int here
+        let counter_type = self.cx.int_type;
+        let counter = self.current_func().new_local(None, counter_type, "popcount_counter");
+        let val = self.current_func().new_local(None, value_type, "popcount_value");
+        let zero = self.cx.gcc_zero(counter_type);
+        self.llbb().add_assignment(None, counter, zero);
+        self.llbb().add_assignment(None, val, value);
+        self.br(loop_head);
+
+        // check if value isn't zero
+        self.switch_to_block(loop_head);
+        let zero = self.cx.gcc_zero(value_type);
+        let cond = self.gcc_icmp(IntPredicate::IntNE, val.to_rvalue(), zero);
+        self.cond_br(cond, loop_body, loop_tail);
+
+        // val &= val - 1;
+        self.switch_to_block(loop_body);
+        let sub = val.to_rvalue() - self.context.new_rvalue_one(value_type);
+        loop_body.add_assignment_op(None, val, BinaryOp::BitwiseAnd, sub);
+
+        // counter += 1
+        let one = self.context.new_rvalue_one(counter_type);
+        loop_body.add_assignment_op(None, counter, BinaryOp::Plus, one);
+        self.br(loop_head);
+
+        // end of loop
+        self.switch_to_block(loop_tail);
+        self.gcc_int_cast(counter.to_rvalue(), result_type)
     }
 
     // Algorithm from: https://blog.regehr.org/archives/1063