355 lines
12 KiB
Diff
355 lines
12 KiB
Diff
From 472890b43d2848a46fa13945279308f0a21c55d9 Mon Sep 17 00:00:00 2001
|
|
From: Jiahao Xu <xujiahao@loongson.cn>
|
|
Date: Wed, 18 Oct 2023 17:43:39 +0800
|
|
Subject: [PATCH 013/188] LoongArch:Implement the new vector cost model
|
|
framework.
|
|
|
|
This patch makes LoongArch use the new vector hooks and implements the costing
|
|
function determine_suggested_unroll_factor, so that it can suggest the
|
|
unroll factor for a given loop being vectorized based on vec_ops analysis during
|
|
vector costing and the available issue information. It follows the aarch64 and
|
|
rs6000 ports.
|
|
|
|
The patch also reduces the cost of unaligned stores, making it equal to the
|
|
cost of aligned ones in order to avoid odd alignment peeling.
|
|
|
|
gcc/ChangeLog:
|
|
|
|
* config/loongarch/loongarch.cc (loongarch_vector_costs): Inherit from
|
|
vector_costs. Add a constructor.
|
|
(loongarch_vector_costs::add_stmt_cost): Use adjust_cost_for_freq to
|
|
adjust the cost for inner loops.
|
|
(loongarch_vector_costs::count_operations): New function.
|
|
(loongarch_vector_costs::determine_suggested_unroll_factor): Ditto.
|
|
(loongarch_vector_costs::finish_cost): Ditto.
|
|
(loongarch_builtin_vectorization_cost): Adjust.
|
|
* config/loongarch/loongarch.opt (loongarch-vect-unroll-limit): New parameter.
|
|
(loongarch-vect-issue-info): Ditto.
|
|
(mmemvec-cost): Delete.
|
|
* config/loongarch/genopts/loongarch.opt.in
|
|
(loongarch-vect-unroll-limit): Ditto.
|
|
(loongarch-vect-issue-info): Ditto.
|
|
(mmemvec-cost): Delete.
|
|
* doc/invoke.texi (loongarch-vect-unroll-limit): Document new option.
|
|
---
|
|
gcc/config/loongarch/genopts/loongarch.opt.in | 15 +-
|
|
gcc/config/loongarch/loongarch.cc | 173 ++++++++++++++++--
|
|
gcc/config/loongarch/loongarch.opt | 15 +-
|
|
gcc/doc/invoke.texi | 7 +
|
|
4 files changed, 188 insertions(+), 22 deletions(-)
|
|
|
|
diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in
|
|
index f18733c24..74cf4a7f7 100644
|
|
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
|
|
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
|
|
@@ -152,10 +152,6 @@ mbranch-cost=
|
|
Target RejectNegative Joined UInteger Var(loongarch_branch_cost)
|
|
-mbranch-cost=COST Set the cost of branches to roughly COST instructions.
|
|
|
|
-mmemvec-cost=
|
|
-Target RejectNegative Joined UInteger Var(loongarch_vector_access_cost) IntegerRange(1, 5)
|
|
-mmemvec-cost=COST Set the cost of vector memory access instructions.
|
|
-
|
|
mcheck-zero-division
|
|
Target Mask(CHECK_ZERO_DIV)
|
|
Trap on integer divide by zero.
|
|
@@ -219,3 +215,14 @@ mrelax
|
|
Target Var(loongarch_mrelax) Init(HAVE_AS_MRELAX_OPTION)
|
|
Take advantage of linker relaxations to reduce the number of instructions
|
|
required to materialize symbol addresses.
|
|
+
|
|
+-param=loongarch-vect-unroll-limit=
|
|
+Target Joined UInteger Var(loongarch_vect_unroll_limit) Init(6) IntegerRange(1, 64) Param
|
|
+Used to limit unroll factor which indicates how much the autovectorizer may
|
|
+unroll a loop. The default value is 6.
|
|
+
|
|
+-param=loongarch-vect-issue-info=
|
|
+Target Undocumented Joined UInteger Var(loongarch_vect_issue_info) Init(4) IntegerRange(1, 64) Param
|
|
+Indicate how many non memory access vector instructions can be issued per
|
|
+cycle, it's used in unroll factor determination for autovectorizer. The
|
|
+default value is 4.
|
|
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
|
|
index c0f58f9a9..e22a64600 100644
|
|
--- a/gcc/config/loongarch/loongarch.cc
|
|
+++ b/gcc/config/loongarch/loongarch.cc
|
|
@@ -65,6 +65,8 @@ along with GCC; see the file COPYING3. If not see
|
|
#include "rtl-iter.h"
|
|
#include "opts.h"
|
|
#include "function-abi.h"
|
|
+#include "cfgloop.h"
|
|
+#include "tree-vectorizer.h"
|
|
|
|
/* This file should be included last. */
|
|
#include "target-def.h"
|
|
@@ -3841,8 +3843,6 @@ loongarch_rtx_costs (rtx x, machine_mode mode, int outer_code,
|
|
}
|
|
}
|
|
|
|
-/* Vectorizer cost model implementation. */
|
|
-
|
|
/* Implement targetm.vectorize.builtin_vectorization_cost. */
|
|
|
|
static int
|
|
@@ -3861,36 +3861,182 @@ loongarch_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
|
|
case vector_load:
|
|
case vec_to_scalar:
|
|
case scalar_to_vec:
|
|
- case cond_branch_not_taken:
|
|
- case vec_promote_demote:
|
|
case scalar_store:
|
|
case vector_store:
|
|
return 1;
|
|
|
|
+ case vec_promote_demote:
|
|
case vec_perm:
|
|
return LASX_SUPPORTED_MODE_P (mode)
|
|
&& !LSX_SUPPORTED_MODE_P (mode) ? 2 : 1;
|
|
|
|
case unaligned_load:
|
|
- case vector_gather_load:
|
|
- return 2;
|
|
-
|
|
case unaligned_store:
|
|
- case vector_scatter_store:
|
|
- return 10;
|
|
+ return 2;
|
|
|
|
case cond_branch_taken:
|
|
- return 3;
|
|
+ return 4;
|
|
+
|
|
+ case cond_branch_not_taken:
|
|
+ return 2;
|
|
|
|
case vec_construct:
|
|
elements = TYPE_VECTOR_SUBPARTS (vectype);
|
|
- return elements / 2 + 1;
|
|
+ if (ISA_HAS_LASX)
|
|
+ return elements + 1;
|
|
+ else
|
|
+ return elements;
|
|
|
|
default:
|
|
gcc_unreachable ();
|
|
}
|
|
}
|
|
|
|
+class loongarch_vector_costs : public vector_costs
|
|
+{
|
|
+public:
|
|
+ using vector_costs::vector_costs;
|
|
+
|
|
+ unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
|
|
+ stmt_vec_info stmt_info, slp_tree, tree vectype,
|
|
+ int misalign,
|
|
+ vect_cost_model_location where) override;
|
|
+ void finish_cost (const vector_costs *) override;
|
|
+
|
|
+protected:
|
|
+ void count_operations (vect_cost_for_stmt, stmt_vec_info,
|
|
+ vect_cost_model_location, unsigned int);
|
|
+ unsigned int determine_suggested_unroll_factor (loop_vec_info);
|
|
+ /* The number of vectorized stmts in loop. */
|
|
+ unsigned m_stmts = 0;
|
|
+ /* The number of load and store operations in loop. */
|
|
+ unsigned m_loads = 0;
|
|
+ unsigned m_stores = 0;
|
|
+ /* Reduction factor for suggesting unroll factor. */
|
|
+ unsigned m_reduc_factor = 0;
|
|
+ /* True if the loop contains an average operation. */
|
|
+ bool m_has_avg =false;
|
|
+};
|
|
+
|
|
+/* Implement TARGET_VECTORIZE_CREATE_COSTS. */
|
|
+static vector_costs *
|
|
+loongarch_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
|
|
+{
|
|
+ return new loongarch_vector_costs (vinfo, costing_for_scalar);
|
|
+}
|
|
+
|
|
+void
|
|
+loongarch_vector_costs::count_operations (vect_cost_for_stmt kind,
|
|
+ stmt_vec_info stmt_info,
|
|
+ vect_cost_model_location where,
|
|
+ unsigned int count)
|
|
+{
|
|
+ if (!m_costing_for_scalar
|
|
+ && is_a<loop_vec_info> (m_vinfo)
|
|
+ && where == vect_body)
|
|
+ {
|
|
+ m_stmts += count;
|
|
+
|
|
+ if (kind == scalar_load
|
|
+ || kind == vector_load
|
|
+ || kind == unaligned_load)
|
|
+ m_loads += count;
|
|
+ else if (kind == scalar_store
|
|
+ || kind == vector_store
|
|
+ || kind == unaligned_store)
|
|
+ m_stores += count;
|
|
+ else if ((kind == scalar_stmt
|
|
+ || kind == vector_stmt
|
|
+ || kind == vec_to_scalar)
|
|
+ && stmt_info && vect_is_reduction (stmt_info))
|
|
+ {
|
|
+ tree lhs = gimple_get_lhs (stmt_info->stmt);
|
|
+ unsigned int base = FLOAT_TYPE_P (TREE_TYPE (lhs)) ? 2 : 1;
|
|
+ m_reduc_factor = MAX (base * count, m_reduc_factor);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+unsigned int
|
|
+loongarch_vector_costs::determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
|
|
+{
|
|
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
|
|
+
|
|
+ if (m_has_avg)
|
|
+ return 1;
|
|
+
|
|
+ /* Don't unroll if it's specified explicitly not to be unrolled. */
|
|
+ if (loop->unroll == 1
|
|
+ || (OPTION_SET_P (flag_unroll_loops) && !flag_unroll_loops)
|
|
+ || (OPTION_SET_P (flag_unroll_all_loops) && !flag_unroll_all_loops))
|
|
+ return 1;
|
|
+
|
|
+ unsigned int nstmts_nonldst = m_stmts - m_loads - m_stores;
|
|
+ /* Don't unroll if no vector instructions excepting for memory access. */
|
|
+ if (nstmts_nonldst == 0)
|
|
+ return 1;
|
|
+
|
|
+ /* Use this simple hardware resource model that how many non vld/vst
|
|
+ vector instructions can be issued per cycle. */
|
|
+ unsigned int issue_info = loongarch_vect_issue_info;
|
|
+ unsigned int reduc_factor = m_reduc_factor > 1 ? m_reduc_factor : 1;
|
|
+ unsigned int uf = CEIL (reduc_factor * issue_info, nstmts_nonldst);
|
|
+ uf = MIN ((unsigned int) loongarch_vect_unroll_limit, uf);
|
|
+
|
|
+ return 1 << ceil_log2 (uf);
|
|
+}
|
|
+
|
|
+unsigned
|
|
+loongarch_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
|
|
+ stmt_vec_info stmt_info, slp_tree,
|
|
+ tree vectype, int misalign,
|
|
+ vect_cost_model_location where)
|
|
+{
|
|
+ unsigned retval = 0;
|
|
+
|
|
+ if (flag_vect_cost_model)
|
|
+ {
|
|
+ int stmt_cost = loongarch_builtin_vectorization_cost (kind, vectype,
|
|
+ misalign);
|
|
+ retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost);
|
|
+ m_costs[where] += retval;
|
|
+
|
|
+ count_operations (kind, stmt_info, where, count);
|
|
+ }
|
|
+
|
|
+ if (stmt_info)
|
|
+ {
|
|
+ /* Detect the use of an averaging operation. */
|
|
+ gimple *stmt = stmt_info->stmt;
|
|
+ if (is_gimple_call (stmt)
|
|
+ && gimple_call_internal_p (stmt))
|
|
+ {
|
|
+ switch (gimple_call_internal_fn (stmt))
|
|
+ {
|
|
+ case IFN_AVG_FLOOR:
|
|
+ case IFN_AVG_CEIL:
|
|
+ m_has_avg = true;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+void
|
|
+loongarch_vector_costs::finish_cost (const vector_costs *scalar_costs)
|
|
+{
|
|
+ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
|
|
+ if (loop_vinfo)
|
|
+ {
|
|
+ m_suggested_unroll_factor = determine_suggested_unroll_factor (loop_vinfo);
|
|
+ }
|
|
+
|
|
+ vector_costs::finish_cost (scalar_costs);
|
|
+}
|
|
+
|
|
/* Implement TARGET_ADDRESS_COST. */
|
|
|
|
static int
|
|
@@ -7261,9 +7407,6 @@ loongarch_option_override_internal (struct gcc_options *opts,
|
|
if (TARGET_DIRECT_EXTERN_ACCESS && flag_shlib)
|
|
error ("%qs cannot be used for compiling a shared library",
|
|
"-mdirect-extern-access");
|
|
- if (loongarch_vector_access_cost == 0)
|
|
- loongarch_vector_access_cost = 5;
|
|
-
|
|
|
|
switch (la_target.cmodel)
|
|
{
|
|
@@ -11275,6 +11418,8 @@ loongarch_builtin_support_vector_misalignment (machine_mode mode,
|
|
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
|
|
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
|
|
loongarch_builtin_vectorization_cost
|
|
+#undef TARGET_VECTORIZE_CREATE_COSTS
|
|
+#define TARGET_VECTORIZE_CREATE_COSTS loongarch_vectorize_create_costs
|
|
|
|
|
|
#undef TARGET_IN_SMALL_DATA_P
|
|
diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt
|
|
index 78f2baf3a..34bd832bd 100644
|
|
--- a/gcc/config/loongarch/loongarch.opt
|
|
+++ b/gcc/config/loongarch/loongarch.opt
|
|
@@ -159,10 +159,6 @@ mbranch-cost=
|
|
Target RejectNegative Joined UInteger Var(loongarch_branch_cost)
|
|
-mbranch-cost=COST Set the cost of branches to roughly COST instructions.
|
|
|
|
-mmemvec-cost=
|
|
-Target RejectNegative Joined UInteger Var(loongarch_vector_access_cost) IntegerRange(1, 5)
|
|
-mmemvec-cost=COST Set the cost of vector memory access instructions.
|
|
-
|
|
mcheck-zero-division
|
|
Target Mask(CHECK_ZERO_DIV)
|
|
Trap on integer divide by zero.
|
|
@@ -226,3 +222,14 @@ mrelax
|
|
Target Var(loongarch_mrelax) Init(HAVE_AS_MRELAX_OPTION)
|
|
Take advantage of linker relaxations to reduce the number of instructions
|
|
required to materialize symbol addresses.
|
|
+
|
|
+-param=loongarch-vect-unroll-limit=
|
|
+Target Joined UInteger Var(loongarch_vect_unroll_limit) Init(6) IntegerRange(1, 64) Param
|
|
+Used to limit unroll factor which indicates how much the autovectorizer may
|
|
+unroll a loop. The default value is 6.
|
|
+
|
|
+-param=loongarch-vect-issue-info=
|
|
+Target Undocumented Joined UInteger Var(loongarch_vect_issue_info) Init(4) IntegerRange(1, 64) Param
|
|
+Indicate how many non memory access vector instructions can be issued per
|
|
+cycle, it's used in unroll factor determination for autovectorizer. The
|
|
+default value is 4.
|
|
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
|
|
index 7eed77836..168f3d0db 100644
|
|
--- a/gcc/doc/invoke.texi
|
|
+++ b/gcc/doc/invoke.texi
|
|
@@ -24632,6 +24632,13 @@ environments where no dynamic link is performed, like firmwares, OS
|
|
kernels, executables linked with @option{-static} or @option{-static-pie}.
|
|
@option{-mdirect-extern-access} is not compatible with @option{-fPIC} or
|
|
@option{-fpic}.
|
|
+
|
|
+@item loongarch-vect-unroll-limit
|
|
+The vectorizer will use available tuning information to determine whether it
|
|
+would be beneficial to unroll the main vectorized loop and by how much. This
|
|
+parameter sets the upper bound of how much the vectorizer will unroll the main
|
|
+loop. The default value is six.
|
|
+
|
|
@end table
|
|
|
|
@node M32C Options
|
|
--
|
|
2.43.0
|
|
|