Skip to content

Commit f6859fb

Browse files
committed
Add tables for SSE fp conversion costs
As discussed, I will proceed with adding costs for common SSE operations which are currently globbed into the addss cost, so we do not need to set it incorrectly for znver5. Looking through the stats, there are quite a few missing cases, so I am starting with those that I think are more common. I plan to do it in smaller steps so individual changes get benchmarked by LNT and can also be bisected to. This patch adds costs for various SSE and AVX FP->FP conversions (extensions and truncations). Looking through Agner Fog's tables, these are a bit asymmetric, so I added a cost for CVTSS2SD which is also used for CVTSD2SS, CVTPS2PD and CVTPD2PS, a cost for 256bit VCVTPS2PD (also used for the opposite direction) and a cost for the 512bit one. I plan to add int->int conversions next and then int->fp & fp->int, which are more tricky since they may bundle an inter-unit move. I also noticed that the size tables are wrong for all SSE instructions, so I updated them. With some love I think vectorization can work as a size optimization, too, but we need more work on that. Those values I can find in Agner Fog's tables are taken from there; the others are guesses (especially for yongfeng_cost and shijidadao_cost). gcc/ChangeLog: * config/i386/i386.cc (vec_fp_conversion_cost): New function. (ix86_rtx_costs): Use it for SSE/AVX FP conversions. (ix86_builtin_vectorization_cost): Fix indentation; and use vec_fp_conversion_cost in vec_promote_demote. (fp_conversion_stmt_cost): New function. (ix86_vector_costs::add_stmt_cost): Use it to cost NOP_EXPR and vec_promote_demote. * config/i386/i386.h (struct processor_costs): * config/i386/x86-tune-costs.h (struct processor_costs):
1 parent 52d7676 commit f6859fb

File tree

3 files changed

+178
-13
lines changed

3 files changed

+178
-13
lines changed

gcc/config/i386/i386.cc

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3. If not see
100100
#include "i386-features.h"
101101
#include "function-abi.h"
102102
#include "rtl-error.h"
103+
#include "gimple-pretty-print.h"
103104

104105
/* This file should be included last. */
105106
#include "target-def.h"
@@ -21816,6 +21817,25 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
2181621817
return insn_cost + pattern_cost (PATTERN (insn), speed);
2181721818
}
2181821819

21820+
/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates). */
21821+
21822+
static int
21823+
vec_fp_conversion_cost (const struct processor_costs *cost, int size)
21824+
{
21825+
if (size < 128)
21826+
return cost->cvtss2sd;
21827+
else if (size < 256)
21828+
{
21829+
if (TARGET_SSE_SPLIT_REGS)
21830+
return cost->cvtss2sd * size / 64;
21831+
return cost->cvtss2sd;
21832+
}
21833+
if (size < 512)
21834+
return cost->vcvtps2pd256;
21835+
else
21836+
return cost->vcvtps2pd512;
21837+
}
21838+
2181921839
/* Compute a (partial) cost for rtx X. Return true if the complete
2182021840
cost has been computed, and false if subexpressions should be
2182121841
scanned. In either case, *TOTAL contains the cost result. */
@@ -22479,17 +22499,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
2247922499
return false;
2248022500

2248122501
case FLOAT_EXTEND:
22502+
/* x87 represents all values extended to 80bit. */
2248222503
if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
2248322504
*total = 0;
2248422505
else
22485-
*total = ix86_vec_cost (mode, cost->addss);
22506+
*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
2248622507
return false;
2248722508

2248822509
case FLOAT_TRUNCATE:
2248922510
if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
2249022511
*total = cost->fadd;
2249122512
else
22492-
*total = ix86_vec_cost (mode, cost->addss);
22513+
*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
2249322514
return false;
2249422515

2249522516
case ABS:
@@ -24683,7 +24704,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
2468324704
switch (type_of_cost)
2468424705
{
2468524706
case scalar_stmt:
24686-
return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
24707+
return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
2468724708

2468824709
case scalar_load:
2468924710
/* load/store costs are relative to register move which is 2. Recompute
@@ -24754,7 +24775,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
2475424775
return ix86_cost->cond_not_taken_branch_cost;
2475524776

2475624777
case vec_perm:
24778+
return ix86_vec_cost (mode, ix86_cost->sse_op);
24779+
2475724780
case vec_promote_demote:
24781+
if (fp)
24782+
return vec_fp_conversion_cost (ix86_tune_cost, mode);
2475824783
return ix86_vec_cost (mode, ix86_cost->sse_op);
2475924784

2476024785
case vec_construct:
@@ -25232,6 +25257,32 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
2523225257
return new ix86_vector_costs (vinfo, costing_for_scalar);
2523325258
}
2523425259

25260+
/* Return cost of statement doing FP conversion. */
25261+
25262+
static unsigned
25263+
fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p)
25264+
{
25265+
int outer_size
25266+
= tree_to_uhwi
25267+
(TYPE_SIZE
25268+
(TREE_TYPE (gimple_assign_lhs (stmt))));
25269+
int inner_size
25270+
= tree_to_uhwi
25271+
(TYPE_SIZE
25272+
(TREE_TYPE (gimple_assign_rhs1 (stmt))));
25273+
int stmt_cost = vec_fp_conversion_cost
25274+
(ix86_tune_cost, GET_MODE_BITSIZE (mode));
25275+
/* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
25276+
up doing two conversions and packing them. */
25277+
if (!scalar_p && inner_size > outer_size)
25278+
{
25279+
int n = inner_size / outer_size;
25280+
stmt_cost = stmt_cost * n
25281+
+ (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
25282+
}
25283+
return stmt_cost;
25284+
}
25285+
2523525286
unsigned
2523625287
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
2523725288
stmt_vec_info stmt_info, slp_tree node,
@@ -25342,6 +25393,9 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
2534225393
(TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
2534325394
TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
2534425395
stmt_cost = 0;
25396+
else if (fp)
25397+
stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt,
25398+
scalar_p);
2534525399
break;
2534625400

2534725401
case BIT_IOR_EXPR:
@@ -25383,6 +25437,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
2538325437
break;
2538425438
}
2538525439

25440+
if (kind == vec_promote_demote
25441+
&& fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
25442+
stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p);
25443+
2538625444
/* If we do elementwise loads into a vector then we are bound by
2538725445
latency and execution resources for the many scalar loads
2538825446
(AGU and load ports). Try to account for this by scaling the

gcc/config/i386/i386.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,12 @@ struct processor_costs {
207207
const int divsd; /* cost of DIVSD instructions. */
208208
const int sqrtss; /* cost of SQRTSS instructions. */
209209
const int sqrtsd; /* cost of SQRTSD instructions. */
210+
const int cvtss2sd; /* cost SSE FP conversions,
211+
such as CVTSS2SD. */
212+
const int vcvtps2pd256; /* cost 256bit packed FP conversions,
213+
such as VCVTPD2PS with larger reg in ymm. */
214+
const int vcvtps2pd512; /* cost 512bit packed FP conversions,
215+
such as VCVTPD2PS with larger reg in zmm. */
210216
const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
211217
/* Specify reassociation width for integer,
212218
fp, vector integer and vector fp

0 commit comments

Comments
 (0)