234 lines
8.8 KiB
Diff
234 lines
8.8 KiB
Diff
|
|
From 417d51e1ecf41b3ba3ddf24eaf1e07db5c1ded9e Mon Sep 17 00:00:00 2001
|
||
|
|
From: Richard Sandiford <richard.sandiford@arm.com>
|
||
|
|
Date: Tue, 5 Dec 2023 09:28:46 +0000
|
||
|
|
Subject: [PATCH 049/157] [Backport][SME] Allow prologues and epilogues to be
|
||
|
|
inserted later
|
||
|
|
|
||
|
|
Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=e9d2ae6b9816e61a6148040149c63faa83f54702
|
||
|
|
|
||
|
|
Arm's SME adds a new processor mode called streaming mode.
|
||
|
|
This mode enables some new (matrix-oriented) instructions and
|
||
|
|
disables several existing groups of instructions, such as most
|
||
|
|
Advanced SIMD vector instructions and a much smaller set of SVE
|
||
|
|
instructions. It can also change the current vector length.
|
||
|
|
|
||
|
|
There are instructions to switch in and out of streaming mode.
|
||
|
|
However, their effect on the ISA and vector length can't be represented
|
||
|
|
directly in RTL, so they need to be emitted late in the pass pipeline,
|
||
|
|
close to md_reorg (the machine-dependent reorg pass).
|
||
|
|
|
||
|
|
It's sometimes the responsibility of the prologue and epilogue to
|
||
|
|
switch modes, which means we need to emit the prologue and epilogue
|
||
|
|
sequences late as well. (This loses shrink-wrapping and scheduling
|
||
|
|
opportunities, but that's a price worth paying.)
|
||
|
|
|
||
|
|
This patch therefore adds a target hook for forcing prologue
|
||
|
|
and epilogue insertion to happen later in the pipeline.
|
||
|
|
|
||
|
|
gcc/
|
||
|
|
* target.def (use_late_prologue_epilogue): New hook.
|
||
|
|
* doc/tm.texi.in: Add TARGET_USE_LATE_PROLOGUE_EPILOGUE.
|
||
|
|
* doc/tm.texi: Regenerate.
|
||
|
|
* passes.def (pass_late_thread_prologue_and_epilogue): New pass.
|
||
|
|
* tree-pass.h (make_pass_late_thread_prologue_and_epilogue): Declare.
|
||
|
|
* function.cc (pass_thread_prologue_and_epilogue::gate): New function.
|
||
|
|
(pass_data_late_thread_prologue_and_epilogue): New pass variable.
|
||
|
|
(pass_late_thread_prologue_and_epilogue): New pass class.
|
||
|
|
(make_pass_late_thread_prologue_and_epilogue): New function.
|
||
|
|
---
|
||
|
|
gcc/doc/tm.texi | 19 ++++++++++++++++++
|
||
|
|
gcc/doc/tm.texi.in | 2 ++
|
||
|
|
gcc/function.cc | 50 ++++++++++++++++++++++++++++++++++++++++++++++
|
||
|
|
gcc/passes.def | 3 +++
|
||
|
|
gcc/target.def | 21 +++++++++++++++++++
|
||
|
|
gcc/tree-pass.h | 2 ++
|
||
|
|
6 files changed, 97 insertions(+)
|
||
|
|
|
||
|
|
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
|
||
|
|
index 5f0972356..d930d233d 100644
|
||
|
|
--- a/gcc/doc/tm.texi
|
||
|
|
+++ b/gcc/doc/tm.texi
|
||
|
|
@@ -11684,6 +11684,25 @@ of the if-block in the @code{struct ce_if_block} structure that is pointed
|
||
|
|
to by @var{ce_info}.
|
||
|
|
@end defmac
|
||
|
|
|
||
|
|
+@deftypefn {Target Hook} bool TARGET_USE_LATE_PROLOGUE_EPILOGUE ()
|
||
|
|
+Return true if the current function's prologue and epilogue should
|
||
|
|
+be emitted late in the pass pipeline, instead of at the usual point.
|
||
|
|
+
|
||
|
|
+Normally, the prologue and epilogue sequences are introduced soon after
|
||
|
|
+register allocation is complete. The advantage of this approach is that
|
||
|
|
+it allows the prologue and epilogue instructions to be optimized and
|
||
|
|
+scheduled with other code in the function. However, some targets
|
||
|
|
+require the prologue and epilogue to be the first and last sequences
|
||
|
|
+executed by the function, with no variation allowed. This hook should
|
||
|
|
+return true on such targets.
|
||
|
|
+
|
||
|
|
+The default implementation returns false, which is correct for most
|
||
|
|
+targets. The hook should only return true if there is a specific
|
||
|
|
+target limitation that cannot be described in RTL. For example,
|
||
|
|
+the hook might return true if the prologue and epilogue need to switch
|
||
|
|
+between instruction sets.
|
||
|
|
+@end deftypefn
|
||
|
|
+
|
||
|
|
@deftypefn {Target Hook} void TARGET_MACHINE_DEPENDENT_REORG (void)
|
||
|
|
If non-null, this hook performs a target-specific pass over the
|
||
|
|
instruction stream. The compiler will run it at all optimization levels,
|
||
|
|
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
|
||
|
|
index fcab21744..19eabec48 100644
|
||
|
|
--- a/gcc/doc/tm.texi.in
|
||
|
|
+++ b/gcc/doc/tm.texi.in
|
||
|
|
@@ -7708,6 +7708,8 @@ of the if-block in the @code{struct ce_if_block} structure that is pointed
|
||
|
|
to by @var{ce_info}.
|
||
|
|
@end defmac
|
||
|
|
|
||
|
|
+@hook TARGET_USE_LATE_PROLOGUE_EPILOGUE
|
||
|
|
+
|
||
|
|
@hook TARGET_MACHINE_DEPENDENT_REORG
|
||
|
|
|
||
|
|
@hook TARGET_INIT_BUILTINS
|
||
|
|
diff --git a/gcc/function.cc b/gcc/function.cc
|
||
|
|
index fc8eb5812..7c90b5f23 100644
|
||
|
|
--- a/gcc/function.cc
|
||
|
|
+++ b/gcc/function.cc
|
||
|
|
@@ -84,6 +84,7 @@ along with GCC; see the file COPYING3. If not see
|
||
|
|
#include "function-abi.h"
|
||
|
|
#include "value-range.h"
|
||
|
|
#include "gimple-range.h"
|
||
|
|
+#include "insn-attr.h"
|
||
|
|
|
||
|
|
/* So we can assign to cfun in this file. */
|
||
|
|
#undef cfun
|
||
|
|
@@ -6620,6 +6621,11 @@ public:
|
||
|
|
{}
|
||
|
|
|
||
|
|
/* opt_pass methods: */
|
||
|
|
+ bool gate (function *) final override
|
||
|
|
+ {
|
||
|
|
+ return !targetm.use_late_prologue_epilogue ();
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
unsigned int execute (function * fun) final override
|
||
|
|
{
|
||
|
|
rest_of_handle_thread_prologue_and_epilogue (fun);
|
||
|
|
@@ -6628,6 +6634,44 @@ public:
|
||
|
|
|
||
|
|
}; // class pass_thread_prologue_and_epilogue
|
||
|
|
|
||
|
|
+const pass_data pass_data_late_thread_prologue_and_epilogue =
|
||
|
|
+{
|
||
|
|
+ RTL_PASS, /* type */
|
||
|
|
+ "late_pro_and_epilogue", /* name */
|
||
|
|
+ OPTGROUP_NONE, /* optinfo_flags */
|
||
|
|
+ TV_THREAD_PROLOGUE_AND_EPILOGUE, /* tv_id */
|
||
|
|
+ 0, /* properties_required */
|
||
|
|
+ 0, /* properties_provided */
|
||
|
|
+ 0, /* properties_destroyed */
|
||
|
|
+ 0, /* todo_flags_start */
|
||
|
|
+ ( TODO_df_verify | TODO_df_finish ), /* todo_flags_finish */
|
||
|
|
+};
|
||
|
|
+
|
||
|
|
+class pass_late_thread_prologue_and_epilogue : public rtl_opt_pass
|
||
|
|
+{
|
||
|
|
+public:
|
||
|
|
+ pass_late_thread_prologue_and_epilogue (gcc::context *ctxt)
|
||
|
|
+ : rtl_opt_pass (pass_data_late_thread_prologue_and_epilogue, ctxt)
|
||
|
|
+ {}
|
||
|
|
+
|
||
|
|
+ /* opt_pass methods: */
|
||
|
|
+ bool gate (function *) final override
|
||
|
|
+ {
|
||
|
|
+ return targetm.use_late_prologue_epilogue ();
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ unsigned int execute (function *fn) final override
|
||
|
|
+ {
|
||
|
|
+ /* It's not currently possible to have both delay slots and
|
||
|
|
+ late prologue/epilogue, since the latter has to run before
|
||
|
|
+ the former, and the former won't honor whatever restrictions
|
||
|
|
+ the latter is trying to enforce. */
|
||
|
|
+ gcc_assert (!DELAY_SLOTS);
|
||
|
|
+ rest_of_handle_thread_prologue_and_epilogue (fn);
|
||
|
|
+ return 0;
|
||
|
|
+ }
|
||
|
|
+}; // class pass_late_thread_prologue_and_epilogue
|
||
|
|
+
|
||
|
|
} // anon namespace
|
||
|
|
|
||
|
|
rtl_opt_pass *
|
||
|
|
@@ -6636,6 +6680,12 @@ make_pass_thread_prologue_and_epilogue (gcc::context *ctxt)
|
||
|
|
return new pass_thread_prologue_and_epilogue (ctxt);
|
||
|
|
}
|
||
|
|
|
||
|
|
+rtl_opt_pass *
|
||
|
|
+make_pass_late_thread_prologue_and_epilogue (gcc::context *ctxt)
|
||
|
|
+{
|
||
|
|
+ return new pass_late_thread_prologue_and_epilogue (ctxt);
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
namespace {
|
||
|
|
|
||
|
|
const pass_data pass_data_zero_call_used_regs =
|
||
|
|
diff --git a/gcc/passes.def b/gcc/passes.def
|
||
|
|
index cdc600298..8797f166f 100644
|
||
|
|
--- a/gcc/passes.def
|
||
|
|
+++ b/gcc/passes.def
|
||
|
|
@@ -523,6 +523,9 @@ along with GCC; see the file COPYING3. If not see
|
||
|
|
NEXT_PASS (pass_stack_regs_run);
|
||
|
|
POP_INSERT_PASSES ()
|
||
|
|
POP_INSERT_PASSES ()
|
||
|
|
+ NEXT_PASS (pass_late_thread_prologue_and_epilogue);
|
||
|
|
+ /* No target-independent code motion is allowed beyond this point,
|
||
|
|
+ excepting the legacy delayed-branch pass. */
|
||
|
|
NEXT_PASS (pass_late_compilation);
|
||
|
|
PUSH_INSERT_PASSES_WITHIN (pass_late_compilation)
|
||
|
|
NEXT_PASS (pass_zero_call_used_regs);
|
||
|
|
diff --git a/gcc/target.def b/gcc/target.def
|
||
|
|
index 4d77c1523..fd4899612 100644
|
||
|
|
--- a/gcc/target.def
|
||
|
|
+++ b/gcc/target.def
|
||
|
|
@@ -4120,6 +4120,27 @@ returns @code{VOIDmode}.",
|
||
|
|
machine_mode, (machine_mode m1, machine_mode m2),
|
||
|
|
default_cc_modes_compatible)
|
||
|
|
|
||
|
|
+DEFHOOK
|
||
|
|
+(use_late_prologue_epilogue,
|
||
|
|
+ "Return true if the current function's prologue and epilogue should\n\
|
||
|
|
+be emitted late in the pass pipeline, instead of at the usual point.\n\
|
||
|
|
+\n\
|
||
|
|
+Normally, the prologue and epilogue sequences are introduced soon after\n\
|
||
|
|
+register allocation is complete. The advantage of this approach is that\n\
|
||
|
|
+it allows the prologue and epilogue instructions to be optimized and\n\
|
||
|
|
+scheduled with other code in the function. However, some targets\n\
|
||
|
|
+require the prologue and epilogue to be the first and last sequences\n\
|
||
|
|
+executed by the function, with no variation allowed. This hook should\n\
|
||
|
|
+return true on such targets.\n\
|
||
|
|
+\n\
|
||
|
|
+The default implementation returns false, which is correct for most\n\
|
||
|
|
+targets. The hook should only return true if there is a specific\n\
|
||
|
|
+target limitation that cannot be described in RTL. For example,\n\
|
||
|
|
+the hook might return true if the prologue and epilogue need to switch\n\
|
||
|
|
+between instruction sets.",
|
||
|
|
+ bool, (),
|
||
|
|
+ hook_bool_void_false)
|
||
|
|
+
|
||
|
|
/* Do machine-dependent code transformations. Called just before
|
||
|
|
delayed-branch scheduling. */
|
||
|
|
DEFHOOK
|
||
|
|
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
|
||
|
|
index 34e60bc38..1c983ef71 100644
|
||
|
|
--- a/gcc/tree-pass.h
|
||
|
|
+++ b/gcc/tree-pass.h
|
||
|
|
@@ -612,6 +612,8 @@ extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt);
|
||
|
|
extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt);
|
||
|
|
extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
|
||
|
|
*ctxt);
|
||
|
|
+extern rtl_opt_pass *make_pass_late_thread_prologue_and_epilogue (gcc::context
|
||
|
|
+ *ctxt);
|
||
|
|
extern rtl_opt_pass *make_pass_zero_call_used_regs (gcc::context *ctxt);
|
||
|
|
extern rtl_opt_pass *make_pass_split_complex_instructions (gcc::context *ctxt);
|
||
|
|
extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|