including: Add late slp vectorization pass with additional checks; Add tracer transformation for static probabilities; Modify the hip09 tune flags
321 lines
12 KiB
Diff
321 lines
12 KiB
Diff
From 9df4a0bd76299734ae47f2f4e236b10f6c156994 Mon Sep 17 00:00:00 2001
|
|
From: d84370931 <dementiev.daniil@h-partners.com>
|
|
Date: Thu, 14 Nov 2024 17:08:40 +0800
|
|
Subject: [PATCH 3/8] Add late slp vectorization pass with additional checks.
|
|
|
|
Add expansion of data reference offset using affine trees to check
|
|
if data references may alias.
|
|
|
|
Add check if a group of interleaving data references is smaller than
|
|
max vector register size.
|
|
|
|
Add operands swap for commutative operations.
|
|
Swapping operands is necessary for better vector construction.
|
|
For example for operations
|
|
_1 = a * b;
|
|
_2 = b * c;
|
|
Constructing vectors (a, c) * (b, b) is more profitable
|
|
than (a, b) * (b, c).
|
|
|
|
Add tests and special param flags for each check:
|
|
--param=addr-expand-for-alias-check={0,1}
|
|
--param=vect-swap-operands={0,1}
|
|
--param=vect-register-size-check={0,1}
|
|
|
|
Add enabling flag for late slp pass:
|
|
-ftree-slp-late
|
|
---
|
|
gcc/common.opt | 4 ++
|
|
gcc/params.opt | 12 ++++++
|
|
gcc/passes.def | 4 ++
|
|
gcc/testsuite/gcc.dg/vect/vect-alias-expand.c | 12 ++++++
|
|
gcc/testsuite/gcc.dg/vect/vect-op-swap.c | 10 +++++
|
|
gcc/testsuite/gcc.dg/vect/vect-regsize.c | 18 +++++++++
|
|
gcc/timevar.def | 1 +
|
|
gcc/tree-data-ref.cc | 12 ++++++
|
|
gcc/tree-pass.h | 1 +
|
|
gcc/tree-vect-data-refs.cc | 15 +++++++
|
|
gcc/tree-vect-slp.cc | 28 +++++++++++++
|
|
gcc/tree-vectorizer.cc | 39 +++++++++++++++++++
|
|
12 files changed, 156 insertions(+)
|
|
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
|
|
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-op-swap.c
|
|
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-regsize.c
|
|
|
|
diff --git a/gcc/common.opt b/gcc/common.opt
|
|
index 78cfc333a..c3c64ceaf 100644
|
|
--- a/gcc/common.opt
|
|
+++ b/gcc/common.opt
|
|
@@ -3268,6 +3268,10 @@ ftree-slp-transpose-vectorize
|
|
Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
|
|
Enable basic block vectorization (SLP) for transposed stores and loads on trees.
|
|
|
|
+ftree-slp-late
|
|
+Common Var(flag_slp_late) Init(0) Optimization
|
|
+Enable additional SLP vectorization pass after reassociation.
|
|
+
|
|
fvect-cost-model=
|
|
Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
|
|
-fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization.
|
|
diff --git a/gcc/params.opt b/gcc/params.opt
|
|
index 3ddfaf5b2..bb4dc1825 100644
|
|
--- a/gcc/params.opt
|
|
+++ b/gcc/params.opt
|
|
@@ -1213,6 +1213,18 @@ The maximum factor which the loop vectorizer applies to the cost of statements i
|
|
Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization
|
|
Enable loop vectorization of floating point inductions.
|
|
|
|
+-param=vect-swap-operands=
|
|
+Common Joined UInteger Var(param_vect_swap_operands) Init(0) IntegerRange(0, 1) Param Optimization
|
|
+Enable swapping operands for commutative operations in vectorization analysis.
|
|
+
|
|
+-param=addr-expand-for-alias-check=
|
|
+Common Joined UInteger Var(param_addr_expand_for_alias_check) Init(0) IntegerRange(0, 1) Param Optimization
|
|
+Enable data reference address expansion for alias check.
|
|
+
|
|
+-param=vect-register-size-check=
|
|
+Common Joined UInteger Var(param_vect_register_size_check) Init(0) IntegerRange(0, 1) Param Optimization
|
|
+Enable checking if a group of interleaving data references may not fit in vector register.
|
|
+
|
|
-param=vrp1-mode=
|
|
Common Joined Var(param_vrp1_mode) Enum(vrp_mode) Init(VRP_MODE_VRP) Param Optimization
|
|
--param=vrp1-mode=[vrp|ranger] Specifies the mode VRP1 should operate in.
|
|
diff --git a/gcc/passes.def b/gcc/passes.def
|
|
index e945af96a..529cc5093 100644
|
|
--- a/gcc/passes.def
|
|
+++ b/gcc/passes.def
|
|
@@ -337,6 +337,10 @@ along with GCC; see the file COPYING3. If not see
|
|
NEXT_PASS (pass_lower_switch);
|
|
NEXT_PASS (pass_cse_reciprocals);
|
|
NEXT_PASS (pass_reassoc, false /* early_p */);
|
|
+ NEXT_PASS (pass_slp_vectorize_late);
|
|
+ PUSH_INSERT_PASSES_WITHIN (pass_slp_vectorize_late)
|
|
+ NEXT_PASS (pass_slp_vectorize);
|
|
+ POP_INSERT_PASSES ()
|
|
NEXT_PASS (pass_strength_reduction);
|
|
NEXT_PASS (pass_split_paths);
|
|
NEXT_PASS (pass_tracer);
|
|
diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
|
|
new file mode 100644
|
|
index 000000000..a68f4baf8
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
|
|
@@ -0,0 +1,12 @@
|
|
+/* { dg-do compile } */
|
|
+/* { dg-options "-O3 -ftree-vectorize --param=addr-expand-for-alias-check=1 -fdump-tree-slp-details" } */
|
|
+
|
|
+extern float arr[2][2];
|
|
+
|
|
+void foo (int i, int j, float a, float b)
|
|
+{
|
|
+ arr[i][j] *= a;
|
|
+ arr[i][j+1] *= b;
|
|
+}
|
|
+
|
|
+/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */
|
|
diff --git a/gcc/testsuite/gcc.dg/vect/vect-op-swap.c b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c
|
|
new file mode 100644
|
|
index 000000000..4872dc414
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c
|
|
@@ -0,0 +1,10 @@
|
|
+/* { dg-do compile } */
|
|
+/* { dg-options "-O3 -ftree-vectorize --param=vect-swap-operands=1 -fdump-tree-slp-details" } */
|
|
+
|
|
+void foo (float *res, float a, float b, float c)
|
|
+{
|
|
+ res[0] = a * b;
|
|
+ res[1] = b * c;
|
|
+}
|
|
+
|
|
+/* { dg-final { scan-tree-dump "Swapped operands for" "slp2" } } */
|
|
diff --git a/gcc/testsuite/gcc.dg/vect/vect-regsize.c b/gcc/testsuite/gcc.dg/vect/vect-regsize.c
|
|
new file mode 100644
|
|
index 000000000..bcd81e6df
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.dg/vect/vect-regsize.c
|
|
@@ -0,0 +1,18 @@
|
|
+/* { dg-do compile } */
|
|
+/* { dg-options "-O3 -ftree-vectorize --param=vect-register-size-check=1 -fdump-tree-slp-details" } */
|
|
+
|
|
+extern float arr[256][256][1024];
|
|
+
|
|
+void foo (int i, int j, float a, float b)
|
|
+{
|
|
+ arr[i][j][0] += a;
|
|
+ arr[i][j][1] += b;
|
|
+ arr[i][j+1][0] += a;
|
|
+ arr[i][j+1][1] += b;
|
|
+ arr[i+1][j][0] += a;
|
|
+ arr[i+1][j][1] += b;
|
|
+ arr[i+1][j+1][0] += a;
|
|
+ arr[i+1][j+1][1] += b;
|
|
+}
|
|
+
|
|
+/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */
|
|
diff --git a/gcc/timevar.def b/gcc/timevar.def
|
|
index fc2b1e1e7..7560e930a 100644
|
|
--- a/gcc/timevar.def
|
|
+++ b/gcc/timevar.def
|
|
@@ -205,6 +205,7 @@ DEFTIMEVAR (TV_SCALAR_CLEANUP , "scalar cleanup")
|
|
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
|
|
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
|
|
DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
|
|
+DEFTIMEVAR (TV_TREE_LATE_SLP , "late slp vectorization")
|
|
DEFTIMEVAR (TV_GRAPHITE , "Graphite")
|
|
DEFTIMEVAR (TV_GRAPHITE_TRANSFORMS , "Graphite loop transforms")
|
|
DEFTIMEVAR (TV_GRAPHITE_DATA_DEPS , "Graphite data dep analysis")
|
|
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
|
|
index a05073c51..5eb4ac102 100644
|
|
--- a/gcc/tree-data-ref.cc
|
|
+++ b/gcc/tree-data-ref.cc
|
|
@@ -3021,6 +3021,18 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b,
|
|
get_inner_reference_aff (DR_REF (b), &off2, &size2);
|
|
aff_combination_scale (&off1, -1);
|
|
aff_combination_add (&off2, &off1);
|
|
+
|
|
+ if (param_addr_expand_for_alias_check)
|
|
+ {
|
|
+ using tree_expand_map_t = hash_map<tree, name_expansion *>;
|
|
+ /* Cache used by aff_combination_expand. */
|
|
+ tree_expand_map_t *cache = NULL;
|
|
+
|
|
+ if (off2.n)
|
|
+ aff_combination_expand (&off2, &cache);
|
|
+ free_affine_expand_cache (&cache);
|
|
+ }
|
|
+
|
|
if (aff_comb_cannot_overlap_p (&off2, size1, size2))
|
|
return false;
|
|
}
|
|
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
|
|
index 18b0f8022..2ed79f353 100644
|
|
--- a/gcc/tree-pass.h
|
|
+++ b/gcc/tree-pass.h
|
|
@@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt);
|
|
+extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
|
|
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
|
|
index aae7f62f3..ee58c8f6c 100644
|
|
--- a/gcc/tree-vect-data-refs.cc
|
|
+++ b/gcc/tree-vect-data-refs.cc
|
|
@@ -3234,6 +3234,21 @@ vect_analyze_data_ref_accesses (vec_info *vinfo,
|
|
!= type_size_a))
|
|
break;
|
|
|
|
+ if (param_vect_register_size_check)
|
|
+ {
|
|
+ tree scalar_type = TREE_TYPE (DR_REF (dra));
|
|
+ tree vec_type = get_related_vectype_for_scalar_type (
|
|
+ vinfo->vector_mode, scalar_type);
|
|
+ poly_uint64 vec_size = TYPE_VECTOR_SUBPARTS (vec_type);
|
|
+
|
|
+ /* If we have a large interleaving group (especially a group
|
|
+ of loads with gaps) that does not fit in vector register,
|
|
+ we should split this group into chunks we support. */
|
|
+ if (maybe_ge (((unsigned HOST_WIDE_INT)init_b - init_prev)
|
|
+ / type_size_a, vec_size))
|
|
+ break;
|
|
+ }
|
|
+
|
|
/* If the step (if not zero or non-constant) is smaller than the
|
|
difference between data-refs' inits this splits groups into
|
|
suitable sizes. */
|
|
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
|
|
index fbd638333..79026fb5b 100644
|
|
--- a/gcc/tree-vect-slp.cc
|
|
+++ b/gcc/tree-vect-slp.cc
|
|
@@ -687,6 +687,34 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
|
|
if (first)
|
|
return 0;
|
|
|
|
+ /* If different statements in the group of commutative operations
|
|
+ have the same arguments but in different places, swap them to
|
|
+ group the same operands in one vector.
|
|
+
|
|
+ Check if swapping is enabled, operation is commutative and has
|
|
+ two operands of the same type.
|
|
+ If one of the operands in the current statement matches the operand
|
|
+ in the other position of the first statement in the group, we
|
|
+ swap the operands in the current statement. */
|
|
+ if (param_vect_swap_operands && commutative_op == 0 && !first
|
|
+ && is_a <bb_vec_info> (vinfo) && number_of_oprnds == 2
|
|
+ && vect_def_types_match (dts[0], dts[1]))
|
|
+ {
|
|
+ slp_oprnd_info oprnd_info0 = (*oprnds_info)[0];
|
|
+ slp_oprnd_info oprnd_info1 = (*oprnds_info)[1];
|
|
+ if (oprnd_info1->ops[stmt_num] == oprnd_info0->ops[0]
|
|
+ || oprnd_info0->ops[stmt_num] == oprnd_info1->ops[0])
|
|
+ {
|
|
+ std::swap (oprnd_info0->def_stmts[stmt_num],
|
|
+ oprnd_info1->def_stmts[stmt_num]);
|
|
+ std::swap (oprnd_info0->ops[stmt_num],
|
|
+ oprnd_info1->ops[stmt_num]);
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_NOTE, vect_location,
|
|
+ "Swapped operands for %G", stmt_info->stmt);
|
|
+ }
|
|
+ }
|
|
+
|
|
/* Now match the operand definition types to that of the first stmt. */
|
|
for (i = 0; i < number_of_oprnds;)
|
|
{
|
|
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
|
|
index a63fa3912..c363ce490 100644
|
|
--- a/gcc/tree-vectorizer.cc
|
|
+++ b/gcc/tree-vectorizer.cc
|
|
@@ -1524,6 +1524,45 @@ make_pass_slp_vectorize (gcc::context *ctxt)
|
|
return new pass_slp_vectorize (ctxt);
|
|
}
|
|
|
|
+/* The late SLP vectorization pass. */
|
|
+
|
|
+namespace {
|
|
+
|
|
+const pass_data pass_data_slp_vectorize_late =
|
|
+{
|
|
+ GIMPLE_PASS, /* type. */
|
|
+ "slp_late", /* name. */
|
|
+ OPTGROUP_NONE, /* optinfo_flags. */
|
|
+ TV_TREE_LATE_SLP, /* tv_id. */
|
|
+ PROP_cfg, /* properties_required. */
|
|
+ 0, /* properties_provided. */
|
|
+ 0, /* properties_destroyed. */
|
|
+ 0, /* todo_flags_start. */
|
|
+ 0, /* todo_flags_finish. */
|
|
+};
|
|
+
|
|
+class pass_slp_vectorize_late : public gimple_opt_pass
|
|
+{
|
|
+public:
|
|
+ pass_slp_vectorize_late (gcc::context *ctxt)
|
|
+ : gimple_opt_pass (pass_data_slp_vectorize_late, ctxt)
|
|
+ {}
|
|
+
|
|
+ /* opt_pass methods: */
|
|
+ virtual bool gate (function *)
|
|
+ {
|
|
+ return flag_slp_late != 0;
|
|
+ }
|
|
+
|
|
+}; // class pass_slp_vectorize_late
|
|
+
|
|
+} // anon namespace
|
|
+
|
|
+gimple_opt_pass *
|
|
+make_pass_slp_vectorize_late (gcc::context *ctxt)
|
|
+{
|
|
+ return new pass_slp_vectorize_late (ctxt);
|
|
+}
|
|
|
|
/* Increase alignment of global arrays to improve vectorization potential.
|
|
TODO:
|
|
--
|
|
2.33.0
|
|
|