Commit 3434881
[AArch64] Explicitly use v1i64 type for llvm.aarch64.neon.pmull64
Without this, the intrinsic's i64 operands are legalized as plain integers, so an explicit copy (from GPR to SIMD register) is generated. Using v1i64 matches the general convention of using "v1" types to represent scalar integer operations in vector registers. A similar approach was taken in D56616, and the pattern likely applies to other intrinsics that accept scalar integers (e.g., int_aarch64_neon_sqdmulls_scalar).

Differential Revision: https://reviews.llvm.org/D130548
1 parent 653b214 commit 3434881
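
To make the motivation concrete, the pattern this change improves looks roughly like the following (a reduced sketch adapted from test1 in the new test file below; the value names are illustrative):

  %h = load i64, ptr %p, align 8
  %r = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %h, i64 %x)

Before this change, the load was selected into a GPR and the value then copied into a SIMD & FP register with an fmov; with the operands bitcast to v1i64, ISel can load straight into a d register (ldr d0, ...) and feed pmull with no extra copy, as the new test below checks.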

4 files changed: +110 −6

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+41)
@@ -4186,6 +4186,24 @@ static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
+static bool isOperandOfHigherHalf(SDValue &Op) {
+  SDNode *OpNode = Op.getNode();
+  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return false;
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
+  if (!C || C->getZExtValue() != 1)
+    return false;
+
+  EVT VT = OpNode->getOperand(0).getValueType();
+
+  return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
+}
+
+static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
+  return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
+}
+
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
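
The helper above matches an extract of lane 1 from a two-element fixed-length vector, i.e. the DAG form of IR such as (an illustrative one-liner, not taken from the commit):

  %hi = extractelement <2 x i64> %v, i64 1

When both operands of a pmull64 call have this shape, existing tablegen patterns can select PMULL2 directly on the 128-bit source registers; the lowering added in the next hunk uses areOperandsOfHigherHalf to leave that case untouched.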
@@ -4526,6 +4544,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       report_fatal_error("Unexpected type for AArch64 NEON intrinic");
     }
   }
+  case Intrinsic::aarch64_neon_pmull64: {
+    SDValue Op1 = Op.getOperand(1);
+    SDValue Op2 = Op.getOperand(2);
+
+    // If both operands are higher half of two source SIMD & FP registers,
+    // ISel could make use of tablegen patterns to emit PMULL2. So do not
+    // legalize i64 to v1i64.
+    if (areOperandsOfHigherHalf(Op1, Op2))
+      return SDValue();
+
+    // As a general convention, use "v1" types to represent scalar integer
+    // operations in vector registers. This helps ISel to make use of
+    // tablegen patterns and generate a load into SIMD & FP registers directly.
+    if (Op1.getValueType() == MVT::i64)
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
+    if (Op2.getValueType() == MVT::i64)
+      Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
+
+    return DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+        DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
+        Op2);
+  }
   case Intrinsic::aarch64_neon_smax:
     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
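
For completeness, the higher-half fast path preserved by the early return corresponds to IR roughly like this (a hedged sketch; this case is not exercised by the tests in this commit):

  %a1 = extractelement <2 x i64> %a, i64 1
  %b1 = extractelement <2 x i64> %b, i64 1
  %r  = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a1, i64 %b1)

Because the i64 operands are left alone rather than bitcast to v1i64, the tablegen patterns can match the whole expression and emit a single pmull2 on the source v2i64 registers instead of materializing the high lanes individually.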

llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll (+3 −3)
@@ -2507,9 +2507,9 @@ entry:
 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
 ; CHECK-LABEL: test_vmull_p64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    fmov d1, x1
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    fmov d0, x1
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
 ; CHECK-NEXT:    mov x1, v0.d[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
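
Only the operand order changes in this test: both i64 arguments arrive in GPRs (x0 and x1), so one GPR-to-SIMD fmov per operand is still required, and the pmull result is the same.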

llvm/test/CodeGen/AArch64/arm64-vmul.ll (+3 −3)
@@ -2925,9 +2925,9 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
 ; CHECK-LABEL: test_pmull_64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    fmov d1, x1
-; CHECK-NEXT:    pmull.1q v0, v0, v1
+; CHECK-NEXT:    fmov d0, x1
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    pmull.1q v0, v1, v0
 ; CHECK-NEXT:    ret
   %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   ret <16 x i8> %val
(new test file, +63)
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
+
+; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
+; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
+
+define void @test1(ptr %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x2, lsl #4
+; CHECK-NEXT:    add x9, x0, x1, lsl #4
+; CHECK-NEXT:    ldr d0, [x8, #8]
+; CHECK-NEXT:    ldr d1, [x9, #8]
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x9]
+; CHECK-NEXT:    ret
+  %4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %6 = load i64, ptr %5, align 8
+  %7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
+  %8 = load i64, ptr %7, align 8
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
+  store <16 x i8> %9, ptr %4, align 16
+  ret void
+}
+
+define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    ldr d0, [x8, #8]
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = extractelement <2 x i64> %3, i64 1
+  %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
+  store <16 x i8> %9, ptr %5, align 16
+  ret void
+}
+
+define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    fmov d0, x3
+; CHECK-NEXT:    ldr d1, [x8, #8]
+; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    ret
+  %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
+  %6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %3)
+  store <16 x i8> %8, ptr %5, align 16
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
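
Together the three tests cover the interesting operand sources: test1 feeds both operands from memory (both become ldr d loads), test2 pairs a loaded operand with the high lane of a vector argument, and test3 pairs a loaded operand with a plain GPR argument (which still needs an fmov). As the NOTE line records, the CHECK lines were generated with utils/update_llc_test_checks.py.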
