Skip to content

Commit 82a5f1c

Browse files
committed
[AArch64] use CNT for ISD::popcnt and ISD::parity if available
These are the two places where we explicitly want to use cnt in SelectionDAG when feature CSSC is available: ISD::popcnt and ISD::parity. For both, we need to make sure we're emitting optimized code for i32 (and lower), i64 and i128. The optimal way is of course using the GPR CNT instruction. If we don't have CSSC, but we do have neon, we'll use floating point CNT. If all else fails, we'll fall back on the general GPR popcnt and parity implementations. spec: https://developer.arm.com/documentation/ddi0602/2022-09/Base-Instructions/CNT--Count-bits- Reviewed By: lenary Differential Revision: https://reviews.llvm.org/D138808
1 parent c4edeb8 commit 82a5f1c

File tree

5 files changed

+148
-11
lines changed

5 files changed

+148
-11
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -541,12 +541,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
541541
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
542542
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
543543

544-
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
545-
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
546-
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
544+
if (Subtarget->hasCSSC()) {
545+
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
546+
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
547+
setOperationAction(ISD::CTPOP, MVT::i128, Expand);
548+
setOperationAction(ISD::PARITY, MVT::i128, Expand);
549+
} else {
550+
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
551+
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
552+
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
547553

548-
setOperationAction(ISD::PARITY, MVT::i64, Custom);
549-
setOperationAction(ISD::PARITY, MVT::i128, Custom);
554+
setOperationAction(ISD::PARITY, MVT::i64, Custom);
555+
setOperationAction(ISD::PARITY, MVT::i128, Custom);
556+
}
550557

551558
setOperationAction(ISD::ABS, MVT::i32, Custom);
552559
setOperationAction(ISD::ABS, MVT::i64, Custom);
@@ -8413,19 +8420,23 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
84138420
return SDValue();
84148421

84158422
bool IsParity = Op.getOpcode() == ISD::PARITY;
8423+
SDValue Val = Op.getOperand(0);
8424+
SDLoc DL(Op);
8425+
EVT VT = Op.getValueType();
84168426

8417-
// While there is no integer popcount instruction, it can
8427+
// For i32, the general parity function using EORs is more efficient than
8428+
// using floating point.
8429+
if (VT == MVT::i32 && IsParity)
8430+
return SDValue();
8431+
8432+
// If there is no CNT instruction available, GPR popcount can
84188433
// be more efficiently lowered to the following sequence that uses
84198434
// AdvSIMD registers/instructions as long as the copies to/from
84208435
// the AdvSIMD registers are cheap.
84218436
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
84228437
// CNT V0.8B, V0.8B // 8xbyte pop-counts
84238438
// ADDV B0, V0.8B // sum 8xbyte pop-counts
84248439
// UMOV X0, V0.B[0] // copy byte result back to integer reg
8425-
SDValue Val = Op.getOperand(0);
8426-
SDLoc DL(Op);
8427-
EVT VT = Op.getValueType();
8428-
84298440
if (VT == MVT::i32 || VT == MVT::i64) {
84308441
if (VT == MVT::i32)
84318442
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8529,7 +8529,7 @@ defm RCWSWP : ReadCheckWriteOperation<0b010, "swp">;
85298529
// General Data-Processing Instructions (FEAT_V94_DP)
85308530
//===----------------------------------------------------------------------===//
85318531
defm ABS : OneOperandData<0b001000, "abs">, Requires<[HasCSSC]>;
8532-
defm CNT : OneOperandData<0b000111, "cnt">, Requires<[HasCSSC]>;
8532+
defm CNT : OneOperandData<0b000111, "cnt", ctpop>, Requires<[HasCSSC]>;
85338533
defm CTZ : OneOperandData<0b000110, "ctz">, Requires<[HasCSSC]>;
85348534

85358535
defm SMAX : ComparisonOp<0, 0, "smax">, Requires<[HasCSSC]>;

llvm/test/CodeGen/AArch64/arm64-popcnt.ll

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
33
; RUN: llc < %s -mtriple=aarch64-eabi -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
4+
; RUN: llc < %s -mtriple=aarch64-eabi -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
45

56
define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
67
; CHECK-LABEL: cnt32_advsimd:
@@ -27,6 +28,11 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
2728
; CHECK-NONEON-NEXT: mul w8, w9, w8
2829
; CHECK-NONEON-NEXT: lsr w0, w8, #24
2930
; CHECK-NONEON-NEXT: ret
31+
;
32+
; CHECK-CSSC-LABEL: cnt32_advsimd:
33+
; CHECK-CSSC: // %bb.0:
34+
; CHECK-CSSC-NEXT: cnt w0, w0
35+
; CHECK-CSSC-NEXT: ret
3036
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
3137
ret i32 %cnt
3238
}
@@ -57,6 +63,13 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
5763
; CHECK-NONEON-NEXT: mul w8, w9, w8
5864
; CHECK-NONEON-NEXT: lsr w0, w8, #24
5965
; CHECK-NONEON-NEXT: ret
66+
;
67+
; CHECK-CSSC-LABEL: cnt32_advsimd_2:
68+
; CHECK-CSSC: // %bb.0:
69+
; CHECK-CSSC-NEXT: // kill: def $d0 killed $d0 def $q0
70+
; CHECK-CSSC-NEXT: fmov w8, s0
71+
; CHECK-CSSC-NEXT: cnt w0, w8
72+
; CHECK-CSSC-NEXT: ret
6073
%1 = extractelement <2 x i32> %x, i64 0
6174
%2 = tail call i32 @llvm.ctpop.i32(i32 %1)
6275
ret i32 %2
@@ -86,6 +99,11 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
8699
; CHECK-NONEON-NEXT: mul x8, x9, x8
87100
; CHECK-NONEON-NEXT: lsr x0, x8, #56
88101
; CHECK-NONEON-NEXT: ret
102+
;
103+
; CHECK-CSSC-LABEL: cnt64_advsimd:
104+
; CHECK-CSSC: // %bb.0:
105+
; CHECK-CSSC-NEXT: cnt x0, x0
106+
; CHECK-CSSC-NEXT: ret
89107
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
90108
ret i64 %cnt
91109
}
@@ -125,6 +143,11 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
125143
; CHECK-NONEON-NEXT: mul w8, w9, w8
126144
; CHECK-NONEON-NEXT: lsr w0, w8, #24
127145
; CHECK-NONEON-NEXT: ret
146+
;
147+
; CHECK-CSSC-LABEL: cnt32:
148+
; CHECK-CSSC: // %bb.0:
149+
; CHECK-CSSC-NEXT: cnt w0, w0
150+
; CHECK-CSSC-NEXT: ret
128151
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
129152
ret i32 %cnt
130153
}
@@ -161,6 +184,11 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
161184
; CHECK-NONEON-NEXT: mul x8, x9, x8
162185
; CHECK-NONEON-NEXT: lsr x0, x8, #56
163186
; CHECK-NONEON-NEXT: ret
187+
;
188+
; CHECK-CSSC-LABEL: cnt64:
189+
; CHECK-CSSC: // %bb.0:
190+
; CHECK-CSSC-NEXT: cnt x0, x0
191+
; CHECK-CSSC-NEXT: ret
164192
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
165193
ret i64 %cnt
166194
}
@@ -181,6 +209,13 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
181209
; CHECK-NONEON-NEXT: ccmp x0, #0, #4, eq
182210
; CHECK-NONEON-NEXT: cset w0, ne
183211
; CHECK-NONEON-NEXT: ret
212+
;
213+
; CHECK-CSSC-LABEL: ctpop_eq_one:
214+
; CHECK-CSSC: // %bb.0:
215+
; CHECK-CSSC-NEXT: cnt x8, x0
216+
; CHECK-CSSC-NEXT: cmp x8, #1
217+
; CHECK-CSSC-NEXT: cset w0, eq
218+
; CHECK-CSSC-NEXT: ret
184219
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
185220
%cmp = icmp eq i64 %count, 1
186221
%conv = zext i1 %cmp to i32
@@ -203,6 +238,13 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
203238
; CHECK-NONEON-NEXT: ccmp x0, #0, #4, eq
204239
; CHECK-NONEON-NEXT: cset w0, eq
205240
; CHECK-NONEON-NEXT: ret
241+
;
242+
; CHECK-CSSC-LABEL: ctpop_ne_one:
243+
; CHECK-CSSC: // %bb.0:
244+
; CHECK-CSSC-NEXT: cnt x8, x0
245+
; CHECK-CSSC-NEXT: cmp x8, #1
246+
; CHECK-CSSC-NEXT: cset w0, ne
247+
; CHECK-CSSC-NEXT: ret
206248
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
207249
%cmp = icmp ne i64 %count, 1
208250
%conv = zext i1 %cmp to i32

llvm/test/CodeGen/AArch64/ctpop-nonean.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon -mattr=+cssc < %s | FileCheck %s -check-prefix=CHECK-CSSC
34

45
declare i128 @llvm.ctpop.i128(i128)
56

@@ -31,6 +32,14 @@ define i128 @ctpop_i128(i128 %i) {
3132
; CHECK-NEXT: lsr x9, x9, #56
3233
; CHECK-NEXT: add x0, x9, x8, lsr #56
3334
; CHECK-NEXT: ret
35+
;
36+
; CHECK-CSSC-LABEL: ctpop_i128:
37+
; CHECK-CSSC: // %bb.0:
38+
; CHECK-CSSC-NEXT: cnt x8, x1
39+
; CHECK-CSSC-NEXT: cnt x9, x0
40+
; CHECK-CSSC-NEXT: add x0, x9, x8
41+
; CHECK-CSSC-NEXT: mov x1, xzr
42+
; CHECK-CSSC-NEXT: ret
3443
%c = call i128 @llvm.ctpop.i128(i128 %i)
3544
ret i128 %c
3645
}

llvm/test/CodeGen/AArch64/parity.ll

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
3+
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mattr=+cssc | FileCheck %s -check-prefix=CHECK-CSSC
34

45
define i4 @parity_4(i4 %x) {
56
; CHECK-LABEL: parity_4:
@@ -9,6 +10,13 @@ define i4 @parity_4(i4 %x) {
910
; CHECK-NEXT: eor w8, w8, w8, lsr #1
1011
; CHECK-NEXT: and w0, w8, #0x1
1112
; CHECK-NEXT: ret
13+
;
14+
; CHECK-CSSC-LABEL: parity_4:
15+
; CHECK-CSSC: // %bb.0:
16+
; CHECK-CSSC-NEXT: and w8, w0, #0xf
17+
; CHECK-CSSC-NEXT: cnt w8, w8
18+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
19+
; CHECK-CSSC-NEXT: ret
1220
%1 = tail call i4 @llvm.ctpop.i4(i4 %x)
1321
%2 = and i4 %1, 1
1422
ret i4 %2
@@ -23,6 +31,13 @@ define i8 @parity_8(i8 %x) {
2331
; CHECK-NEXT: eor w8, w8, w8, lsr #1
2432
; CHECK-NEXT: and w0, w8, #0x1
2533
; CHECK-NEXT: ret
34+
;
35+
; CHECK-CSSC-LABEL: parity_8:
36+
; CHECK-CSSC: // %bb.0:
37+
; CHECK-CSSC-NEXT: and w8, w0, #0xff
38+
; CHECK-CSSC-NEXT: cnt w8, w8
39+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
40+
; CHECK-CSSC-NEXT: ret
2641
%1 = tail call i8 @llvm.ctpop.i8(i8 %x)
2742
%2 = and i8 %1, 1
2843
ret i8 %2
@@ -38,6 +53,13 @@ define i16 @parity_16(i16 %x) {
3853
; CHECK-NEXT: eor w8, w8, w8, lsr #1
3954
; CHECK-NEXT: and w0, w8, #0x1
4055
; CHECK-NEXT: ret
56+
;
57+
; CHECK-CSSC-LABEL: parity_16:
58+
; CHECK-CSSC: // %bb.0:
59+
; CHECK-CSSC-NEXT: and w8, w0, #0xffff
60+
; CHECK-CSSC-NEXT: cnt w8, w8
61+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
62+
; CHECK-CSSC-NEXT: ret
4163
%1 = tail call i16 @llvm.ctpop.i16(i16 %x)
4264
%2 = and i16 %1, 1
4365
ret i16 %2
@@ -54,6 +76,13 @@ define i17 @parity_17(i17 %x) {
5476
; CHECK-NEXT: eor w8, w8, w8, lsr #1
5577
; CHECK-NEXT: and w0, w8, #0x1
5678
; CHECK-NEXT: ret
79+
;
80+
; CHECK-CSSC-LABEL: parity_17:
81+
; CHECK-CSSC: // %bb.0:
82+
; CHECK-CSSC-NEXT: and w8, w0, #0x1ffff
83+
; CHECK-CSSC-NEXT: cnt w8, w8
84+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
85+
; CHECK-CSSC-NEXT: ret
5786
%1 = tail call i17 @llvm.ctpop.i17(i17 %x)
5887
%2 = and i17 %1, 1
5988
ret i17 %2
@@ -69,6 +98,12 @@ define i32 @parity_32(i32 %x) {
6998
; CHECK-NEXT: eor w8, w8, w8, lsr #1
7099
; CHECK-NEXT: and w0, w8, #0x1
71100
; CHECK-NEXT: ret
101+
;
102+
; CHECK-CSSC-LABEL: parity_32:
103+
; CHECK-CSSC: // %bb.0:
104+
; CHECK-CSSC-NEXT: cnt w8, w0
105+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
106+
; CHECK-CSSC-NEXT: ret
72107
%1 = tail call i32 @llvm.ctpop.i32(i32 %x)
73108
%2 = and i32 %1, 1
74109
ret i32 %2
@@ -83,6 +118,12 @@ define i64 @parity_64(i64 %x) {
83118
; CHECK-NEXT: fmov w8, s0
84119
; CHECK-NEXT: and w0, w8, #0x1
85120
; CHECK-NEXT: ret
121+
;
122+
; CHECK-CSSC-LABEL: parity_64:
123+
; CHECK-CSSC: // %bb.0:
124+
; CHECK-CSSC-NEXT: cnt x8, x0
125+
; CHECK-CSSC-NEXT: and x0, x8, #0x1
126+
; CHECK-CSSC-NEXT: ret
86127
%1 = tail call i64 @llvm.ctpop.i64(i64 %x)
87128
%2 = and i64 %1, 1
88129
ret i64 %2
@@ -99,6 +140,14 @@ define i128 @parity_128(i128 %x) {
99140
; CHECK-NEXT: fmov w8, s0
100141
; CHECK-NEXT: and w0, w8, #0x1
101142
; CHECK-NEXT: ret
143+
;
144+
; CHECK-CSSC-LABEL: parity_128:
145+
; CHECK-CSSC: // %bb.0:
146+
; CHECK-CSSC-NEXT: eor x8, x0, x1
147+
; CHECK-CSSC-NEXT: mov x1, xzr
148+
; CHECK-CSSC-NEXT: cnt x8, x8
149+
; CHECK-CSSC-NEXT: and x0, x8, #0x1
150+
; CHECK-CSSC-NEXT: ret
102151
%1 = tail call i128 @llvm.ctpop.i128(i128 %x)
103152
%2 = and i128 %1, 1
104153
ret i128 %2
@@ -113,6 +162,12 @@ define i32 @parity_64_trunc(i64 %x) {
113162
; CHECK-NEXT: fmov w8, s0
114163
; CHECK-NEXT: and w0, w8, #0x1
115164
; CHECK-NEXT: ret
165+
;
166+
; CHECK-CSSC-LABEL: parity_64_trunc:
167+
; CHECK-CSSC: // %bb.0:
168+
; CHECK-CSSC-NEXT: cnt x8, x0
169+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
170+
; CHECK-CSSC-NEXT: ret
116171
%1 = tail call i64 @llvm.ctpop.i64(i64 %x)
117172
%2 = trunc i64 %1 to i32
118173
%3 = and i32 %2, 1
@@ -129,6 +184,12 @@ define i8 @parity_32_trunc(i32 %x) {
129184
; CHECK-NEXT: eor w8, w8, w8, lsr #1
130185
; CHECK-NEXT: and w0, w8, #0x1
131186
; CHECK-NEXT: ret
187+
;
188+
; CHECK-CSSC-LABEL: parity_32_trunc:
189+
; CHECK-CSSC: // %bb.0:
190+
; CHECK-CSSC-NEXT: cnt w8, w0
191+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
192+
; CHECK-CSSC-NEXT: ret
132193
%1 = tail call i32 @llvm.ctpop.i32(i32 %x)
133194
%2 = trunc i32 %1 to i8
134195
%3 = and i8 %2, 1
@@ -144,6 +205,13 @@ define i32 @parity_8_zext(i8 %x) {
144205
; CHECK-NEXT: eor w8, w8, w8, lsr #1
145206
; CHECK-NEXT: and w0, w8, #0x1
146207
; CHECK-NEXT: ret
208+
;
209+
; CHECK-CSSC-LABEL: parity_8_zext:
210+
; CHECK-CSSC: // %bb.0:
211+
; CHECK-CSSC-NEXT: and w8, w0, #0xff
212+
; CHECK-CSSC-NEXT: cnt w8, w8
213+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
214+
; CHECK-CSSC-NEXT: ret
147215
%a = zext i8 %x to i32
148216
%b = tail call i32 @llvm.ctpop.i32(i32 %a)
149217
%c = and i32 %b, 1
@@ -159,6 +227,13 @@ define i32 @parity_8_mask(i32 %x) {
159227
; CHECK-NEXT: eor w8, w8, w8, lsr #1
160228
; CHECK-NEXT: and w0, w8, #0x1
161229
; CHECK-NEXT: ret
230+
;
231+
; CHECK-CSSC-LABEL: parity_8_mask:
232+
; CHECK-CSSC: // %bb.0:
233+
; CHECK-CSSC-NEXT: and w8, w0, #0xff
234+
; CHECK-CSSC-NEXT: cnt w8, w8
235+
; CHECK-CSSC-NEXT: and w0, w8, #0x1
236+
; CHECK-CSSC-NEXT: ret
162237
%a = and i32 %x, 255
163238
%b = tail call i32 @llvm.ctpop.i32(i32 %a)
164239
%c = and i32 %b, 1

0 commit comments

Comments
 (0)