Skip to content

Commit 5ae3d85

Browse files
committed
optimize popcount implementation
The gcc backend of rustc currently emits the following for a function that implements popcount for a u32 (x86_64 targeting AVX2, using the standard Unix calling convention):

popcount:
    mov eax, edi
    and edi, 1431655765
    shr eax
    and eax, 1431655765
    add edi, eax
    mov edx, edi
    and edi, 858993459
    shr edx, 2
    and edx, 858993459
    add edx, edi
    mov eax, edx
    and edx, 252645135
    shr eax, 4
    and eax, 252645135
    add eax, edx
    mov edx, eax
    and eax, 16711935
    shr edx, 8
    and edx, 16711935
    add edx, eax
    movzx eax, dx
    shr edx, 16
    add eax, edx
    ret

Rather than using this implementation, gcc could be told to use Wegner's algorithm. This would give the same function the following implementation:

popcount:
    xor eax, eax
    xor edx, edx
    popcnt eax, edi
    test edi, edi
    cmove eax, edx
    ret

This patch implements the popcount operation in terms of Wegner's algorithm in all cases. Signed-off-by: Andy Sadler <[email protected]>
1 parent c6447be commit 5ae3d85

File tree

1 file changed

+35
-60
lines changed

1 file changed

+35
-60
lines changed

src/intrinsic/mod.rs

+35-60
Original file line numberDiff line numberDiff line change
@@ -820,74 +820,49 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
820820
};
821821

822822
if value_type.is_u128(&self.cx) {
823-
// TODO(antoyo): implement in the normal algorithm below to have a more efficient
824-
// implementation (that does not require a call to __popcountdi2).
825-
let popcount = self.context.get_builtin_function("__builtin_popcountll");
826823
let sixty_four = self.gcc_int(value_type, 64);
827824
let right_shift = self.gcc_lshr(value, sixty_four);
828825
let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
829-
let high = self.context.new_call(None, popcount, &[high]);
826+
let high = self.pop_count(high);
830827
let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
831-
let low = self.context.new_call(None, popcount, &[low]);
828+
let low = self.pop_count(low);
832829
let res = high + low;
833830
return self.gcc_int_cast(res, result_type);
834831
}
835832

836-
// First step.
837-
let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555);
838-
let left = value & mask;
839-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 1);
840-
let right = shifted & mask;
841-
let value = left + right;
842-
843-
// Second step.
844-
let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333);
845-
let left = value & mask;
846-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 2);
847-
let right = shifted & mask;
848-
let value = left + right;
849-
850-
// Third step.
851-
let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F);
852-
let left = value & mask;
853-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 4);
854-
let right = shifted & mask;
855-
let value = left + right;
856-
857-
if value_type.is_u8(&self.cx) {
858-
return self.context.new_cast(None, value, result_type);
859-
}
860-
861-
// Fourth step.
862-
let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF);
863-
let left = value & mask;
864-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 8);
865-
let right = shifted & mask;
866-
let value = left + right;
867-
868-
if value_type.is_u16(&self.cx) {
869-
return self.context.new_cast(None, value, result_type);
870-
}
871-
872-
// Fifth step.
873-
let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF);
874-
let left = value & mask;
875-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 16);
876-
let right = shifted & mask;
877-
let value = left + right;
878-
879-
if value_type.is_u32(&self.cx) {
880-
return self.context.new_cast(None, value, result_type);
881-
}
882-
883-
// Sixth step.
884-
let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF);
885-
let left = value & mask;
886-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 32);
887-
let right = shifted & mask;
888-
let value = left + right;
889-
890-
self.context.new_cast(None, value, result_type)
833+
// Use Wegner's algorithm for population count; gcc seems to play better with it:
834+
// for (int counter = 0; value != 0; counter++) {
835+
// value &= value - 1;
836+
// }
837+
let func = self.current_func.borrow().expect("func");
838+
let loop_head = func.new_block("head");
839+
let loop_body = func.new_block("body");
840+
let loop_tail = func.new_block("tail");
841+
842+
let counter = self.current_func().new_local(None, self.cx.uint_type, "popcount_result");
843+
let val = self.current_func().new_local(None, value.get_type(), "popcount_value");
844+
let zero = self.cx.gcc_zero(counter.to_rvalue().get_type());
845+
self.llbb().add_assignment(None, counter, zero);
846+
self.llbb().add_assignment(None, val, value);
847+
self.llbb().end_with_jump(None, loop_head);
848+
849+
// check if value isn't zero
850+
self.switch_to_block(loop_head);
851+
let zero = self.cx.gcc_zero(value.get_type());
852+
let cond = self.gcc_icmp(IntPredicate::IntNE, val.to_rvalue(), zero);
853+
loop_head.end_with_conditional(None, cond, loop_body, loop_tail);
854+
855+
// val &= val - 1;
856+
let new_val = val.to_rvalue() & (val.to_rvalue() - self.context.new_rvalue_one(value.get_type()));
857+
loop_body.add_assignment(None, val, new_val);
858+
859+
// counter += 1
860+
let new_counter = counter.to_rvalue() + self.context.new_rvalue_one(self.cx.uint_type);
861+
loop_body.add_assignment(None, counter, new_counter);
862+
loop_body.end_with_jump(None, loop_head);
863+
864+
self.switch_to_block(loop_tail);
865+
counter.to_rvalue()
891866
}
892867

893868
// Algorithm from: https://blog.regehr.org/archives/1063

0 commit comments

Comments
 (0)