From 0ad41f11bea5c303ff39c54cae8e46afdfae6070 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 5 Dec 2023 10:11:29 +0000
Subject: [PATCH 113/157] [Backport][SME] aarch64: Add support for
 __arm_locally_streaming

Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3f6e5991fab507aa79121dc44d1afcd622c78744

This patch adds support for the __arm_locally_streaming attribute,
which allows a function to use SME internally without changing
the function's ABI.  The attribute is valid but redundant for
__arm_streaming functions.

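For example (see the new locally_streaming_1.c test below; this snippet
is only an illustration, not part of the patch), a non-streaming
function can opt in to using SME in its body like so:

    __arm_locally_streaming void f (void)
    {
      /* The prologue switches PSTATE.SM on and the epilogue switches it
         back off, so callers see an ordinary non-streaming function
         with an unchanged ABI.  */
    }
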
gcc/
	* config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add
	arm::locally_streaming.
	(aarch64_fndecl_is_locally_streaming): New function.
	(aarch64_fndecl_sm_state): Handle locally-streaming functions.
	(aarch64_cfun_enables_pstate_sm): New function.
	(aarch64_add_offset): Add an argument that specifies whether
	the streaming vector length should be used instead of the
	prevailing one.
	(aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
	(aarch64_allocate_and_probe_stack_space): Likewise.
	(aarch64_expand_mov_immediate): Update calls accordingly.
	(aarch64_need_old_pstate_sm): Return true for locally-streaming
	streaming-compatible functions.
	(aarch64_layout_frame): Force all call-preserved Z and P registers
	to be saved and restored if the function switches PSTATE.SM in the
	prologue.
	(aarch64_get_separate_components): Disable shrink-wrapping of
	such Z and P saves and restores.
	(aarch64_use_late_prologue_epilogue): New function.
	(aarch64_expand_prologue): Measure SVE lengths in the streaming
	vector length for locally-streaming functions, then emit code
	to enable streaming mode.
	(aarch64_expand_epilogue): Likewise in reverse.
	(TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
	* config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
	Define __arm_locally_streaming.

gcc/testsuite/
	* gcc.target/aarch64/sme/locally_streaming_1.c: New test.
	* gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
	* gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
	* gcc.target/aarch64/sme/locally_streaming_4.c: Likewise.
	* gcc.target/aarch64/sme/keyword_macros_1.c: Add
	__arm_locally_streaming.
	* g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
---
 gcc/config/aarch64/aarch64-c.cc | 1 +
 gcc/config/aarch64/aarch64.cc | 233 +++++++--
 .../g++.target/aarch64/sme/keyword_macros_1.C | 1 +
 .../gcc.target/aarch64/sme/keyword_macros_1.c | 1 +
 .../aarch64/sme/locally_streaming_1.c | 466 ++++++++++++++++++
 .../aarch64/sme/locally_streaming_2.c | 177 +++++++
 .../aarch64/sme/locally_streaming_3.c | 273 ++++++++++
 .../aarch64/sme/locally_streaming_4.c | 145 ++++++
 8 files changed, 1259 insertions(+), 38 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index cb8a6c2fc..745719d8b 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -86,6 +86,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)

 DEFINE_ARM_KEYWORD_MACRO ("streaming");
 DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
+ DEFINE_ARM_KEYWORD_MACRO ("locally_streaming");

 #undef DEFINE_ARM_KEYWORD_MACRO

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 113784e31..4cb43c2e2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3283,6 +3283,7 @@ static const attribute_spec aarch64_arm_attributes[] =
 NULL, attr_streaming_exclusions },
 { "streaming_compatible", 0, 0, false, true, true, true,
 NULL, attr_streaming_exclusions },
+ { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
 { "new", 1, -1, true, false, false, false,
 handle_arm_new, NULL },
 { "preserves", 1, -1, false, true, true, true,
@@ -4657,6 +4658,16 @@ aarch64_fntype_isa_mode (const_tree fntype)
 | aarch64_fntype_pstate_za (fntype));
 }

+/* Return true if FNDECL uses streaming mode internally, as an
+ implementation choice. */
+
+static bool
+aarch64_fndecl_is_locally_streaming (const_tree fndecl)
+{
+ return lookup_attribute ("arm", "locally_streaming",
+ DECL_ATTRIBUTES (fndecl));
+}
+
 /* Return the state of PSTATE.SM when compiling the body of
 function FNDECL. This might be different from the state of
 PSTATE.SM on entry. */
@@ -4664,6 +4675,9 @@ aarch64_fntype_isa_mode (const_tree fntype)
 static aarch64_feature_flags
 aarch64_fndecl_pstate_sm (const_tree fndecl)
 {
+ if (aarch64_fndecl_is_locally_streaming (fndecl))
+ return AARCH64_FL_SM_ON;
+
 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
 }

@@ -4739,6 +4753,16 @@ aarch64_cfun_has_new_state (const char *state_name)
 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
 }

+/* Return true if PSTATE.SM is 1 in the body of the current function,
+ but is not guaranteed to be 1 on entry. */
+
+static bool
+aarch64_cfun_enables_pstate_sm ()
+{
+ return (aarch64_fndecl_is_locally_streaming (cfun->decl)
+ && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
+}
+
 /* Return true if the current function has state STATE_NAME, either by
 creating new state itself or by sharing state with callers. */

@@ -6931,6 +6955,10 @@ aarch64_add_offset_temporaries (rtx x)
 TEMP2, if nonnull, is a second temporary register that doesn't
 overlap either DEST or REG.

+ FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
+ is measured relative to the SME vector length instead of the current
+ prevailing vector length. It is 0 otherwise.
+
 Since this function may be used to adjust the stack pointer, we must
 ensure that it cannot cause transient stack deallocation (for example
 by first incrementing SP and then decrementing when adjusting by a
@@ -6939,6 +6967,7 @@ aarch64_add_offset_temporaries (rtx x)
 static void
 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 poly_int64 offset, rtx temp1, rtx temp2,
+ aarch64_feature_flags force_isa_mode,
 bool frame_related_p, bool emit_move_imm = true)
 {
 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
@@ -6951,9 +6980,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 /* Try using ADDVL or ADDPL to add the whole value. */
 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
 {
- rtx offset_rtx = gen_int_mode (offset, mode);
+ gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
+ rtx offset_rtx;
+ if (force_isa_mode == 0)
+ offset_rtx = gen_int_mode (offset, mode);
+ else
+ offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
 RTX_FRAME_RELATED_P (insn) = frame_related_p;
+ if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
+ add_reg_note (insn, REG_CFA_ADJUST_CFA,
+ gen_rtx_SET (dest, plus_constant (Pmode, src,
+ offset)));
 return;
 }

@@ -6969,11 +7007,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 if (src != const0_rtx
 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
 {
- rtx offset_rtx = gen_int_mode (poly_offset, mode);
+ rtx offset_rtx;
+ if (force_isa_mode == 0)
+ offset_rtx = gen_int_mode (poly_offset, mode);
+ else
+ offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
 if (frame_related_p)
 {
 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
 RTX_FRAME_RELATED_P (insn) = true;
+ if (force_isa_mode & AARCH64_FL_SM_ON)
+ add_reg_note (insn, REG_CFA_ADJUST_CFA,
+ gen_rtx_SET (dest, plus_constant (Pmode, src,
+ poly_offset)));
 src = dest;
 }
 else
@@ -7004,9 +7050,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 rtx val;
 if (IN_RANGE (rel_factor, -32, 31))
 {
+ if (force_isa_mode & AARCH64_FL_SM_ON)
+ {
+ /* Try to use an unshifted RDSVL, otherwise fall back on
+ a shifted RDSVL #1. */
+ if (aarch64_sve_rdvl_addvl_factor_p (factor))
+ shift = 0;
+ else
+ factor = rel_factor * 16;
+ val = aarch64_sme_vq_immediate (mode, factor, 0);
+ }
 /* Try to use an unshifted CNT[BHWD] or RDVL. */
- if (aarch64_sve_cnt_factor_p (factor)
- || aarch64_sve_rdvl_addvl_factor_p (factor))
+ else if (aarch64_sve_cnt_factor_p (factor)
+ || aarch64_sve_rdvl_addvl_factor_p (factor))
 {
 val = gen_int_mode (poly_int64 (factor, factor), mode);
 shift = 0;
@@ -7036,11 +7092,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 a shift and add sequence for the multiplication.
 If CNTB << SHIFT is out of range, stick with the current
 shift factor. */
- if (IN_RANGE (low_bit, 2, 16 * 16))
+ if (force_isa_mode == 0
+ && IN_RANGE (low_bit, 2, 16 * 16))
 {
 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
 shift = 0;
 }
+ else if ((force_isa_mode & AARCH64_FL_SM_ON)
+ && aarch64_sve_rdvl_addvl_factor_p (low_bit))
+ {
+ val = aarch64_sme_vq_immediate (mode, low_bit, 0);
+ shift = 0;
+ }
 else
 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);

@@ -7128,30 +7191,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 rtx offset_rtx, rtx temp1, rtx temp2)
 {
 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
- temp1, temp2, false);
+ temp1, temp2, 0, false);
 }

 /* Add DELTA to the stack pointer, marking the instructions frame-related.
- TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
- if TEMP1 already contains abs (DELTA). */
+ TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
+ for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
+ contains abs (DELTA). */

 static inline void
-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
+ aarch64_feature_flags force_isa_mode, bool emit_move_imm)
 {
 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
- temp1, temp2, true, emit_move_imm);
+ temp1, temp2, force_isa_mode, true, emit_move_imm);
 }

 /* Subtract DELTA from the stack pointer, marking the instructions
- frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
- if nonnull. */
+ frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
+ aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */

 static inline void
-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
- bool emit_move_imm = true)
+aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
+ aarch64_feature_flags force_isa_mode,
+ bool frame_related_p, bool emit_move_imm = true)
 {
 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
- temp1, temp2, frame_related_p, emit_move_imm);
+ temp1, temp2, force_isa_mode, frame_related_p,
+ emit_move_imm);
 }

 /* A streaming-compatible function needs to switch temporarily to the known
@@ -8176,11 +8243,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 {
 base = aarch64_force_temporary (int_mode, dest, base);
 aarch64_add_offset (int_mode, dest, base, offset,
- NULL_RTX, NULL_RTX, false);
+ NULL_RTX, NULL_RTX, 0, false);
 }
 else
 aarch64_add_offset (int_mode, dest, base, offset,
- dest, NULL_RTX, false);
+ dest, NULL_RTX, 0, false);
 }
 return;
 }
@@ -8207,7 +8274,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 gcc_assert (can_create_pseudo_p ());
 base = aarch64_force_temporary (int_mode, dest, base);
 aarch64_add_offset (int_mode, dest, base, const_offset,
- NULL_RTX, NULL_RTX, false);
+ NULL_RTX, NULL_RTX, 0, false);
 return;
 }

@@ -8247,7 +8314,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 gcc_assert(can_create_pseudo_p ());
 base = aarch64_force_temporary (int_mode, dest, base);
 aarch64_add_offset (int_mode, dest, base, const_offset,
- NULL_RTX, NULL_RTX, false);
+ NULL_RTX, NULL_RTX, 0, false);
 return;
 }
 /* FALLTHRU */
@@ -9755,6 +9822,9 @@ aarch64_need_old_pstate_sm ()
 if (aarch64_cfun_incoming_pstate_sm () != 0)
 return false;

+ if (aarch64_cfun_enables_pstate_sm ())
+ return true;
+
 if (cfun->machine->call_switches_pstate_sm)
 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
@@ -9781,6 +9851,7 @@ aarch64_layout_frame (void)
 bool frame_related_fp_reg_p = false;
 aarch64_frame &frame = cfun->machine->frame;
 poly_int64 top_of_locals = -1;
+ bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();

 vec_safe_truncate (frame.saved_gprs, 0);
 vec_safe_truncate (frame.saved_fprs, 0);
@@ -9818,7 +9889,7 @@ aarch64_layout_frame (void)
 frame.reg_offset[regno] = SLOT_REQUIRED;

 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
- if (df_regs_ever_live_p (regno)
+ if ((enables_pstate_sm || df_regs_ever_live_p (regno))
 && !fixed_regs[regno]
 && !crtl->abi->clobbers_full_reg_p (regno))
 {
@@ -9847,7 +9918,7 @@ aarch64_layout_frame (void)
 }

 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
- if (df_regs_ever_live_p (regno)
+ if ((enables_pstate_sm || df_regs_ever_live_p (regno))
 && !fixed_regs[regno]
 && !crtl->abi->clobbers_full_reg_p (regno))
 frame.reg_offset[regno] = SLOT_REQUIRED;
@@ -9964,7 +10035,8 @@ aarch64_layout_frame (void)
 /* If the current function changes the SVE vector length, ensure that the
 old value of the DWARF VG register is saved and available in the CFI,
 so that outer frames with VL-sized offsets can be processed correctly. */
- if (cfun->machine->call_switches_pstate_sm)
+ if (cfun->machine->call_switches_pstate_sm
+ || aarch64_cfun_enables_pstate_sm ())
 {
 frame.reg_offset[VG_REGNUM] = offset;
 offset += UNITS_PER_WORD;
@@ -10749,9 +10821,16 @@ aarch64_get_separate_components (void)
 bitmap_clear (components);

 /* The registers we need saved to the frame. */
+ bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
 if (aarch64_register_saved_on_entry (regno))
 {
+ /* Disallow shrink wrapping for registers that will be clobbered
+ by an SMSTART SM in the prologue. */
+ if (enables_pstate_sm
+ && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
+ continue;
+
 /* Punt on saves and restores that use ST1D and LD1D. We could
 try to be smarter, but it would involve making sure that the
 spare predicate register itself is safe to use at the save
@@ -11070,11 +11149,16 @@ aarch64_emit_stack_tie (rtx reg)
 events, e.g. if we were to allow the stack to be dropped by more than a page
 and then have multiple probes up and we take a signal somewhere in between
 then the signal handler doesn't know the state of the stack and can make no
- assumptions about which pages have been probed. */
+ assumptions about which pages have been probed.
+
+ FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
+ is measured relative to the SME vector length instead of the current
+ prevailing vector length. It is 0 otherwise. */

 static void
 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 poly_int64 poly_size,
+ aarch64_feature_flags force_isa_mode,
 bool frame_related_p,
 bool final_adjustment_p)
 {
@@ -11116,7 +11200,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 if (known_lt (poly_size, min_probe_threshold)
 || !flag_stack_clash_protection)
 {
- aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
+ aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
+ frame_related_p);
 return;
 }

@@ -11133,7 +11218,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,

 /* First calculate the amount of bytes we're actually spilling. */
 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
- poly_size, temp1, temp2, false, true);
+ poly_size, temp1, temp2, force_isa_mode,
+ false, true);

 rtx_insn *insn = get_last_insn ();

@@ -11191,7 +11277,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 {
 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
 {
- aarch64_sub_sp (NULL, temp2, guard_size, true);
+ aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
 guard_used_by_caller));
 emit_insn (gen_blockage ());
@@ -11202,7 +11288,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 {
 /* Compute the ending address. */
 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
- temp1, NULL, false, true);
+ temp1, NULL, force_isa_mode, false, true);
 rtx_insn *insn = get_last_insn ();

 /* For the initial allocation, we don't have a frame pointer
@@ -11268,7 +11354,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 if (final_adjustment_p && rounded_size != 0)
 min_probe_threshold = 0;

- aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+ aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
 if (residual >= min_probe_threshold)
 {
 if (dump_file)
@@ -11333,6 +11419,14 @@ aarch64_epilogue_uses (int regno)
 return 0;
 }

+/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
+
+static bool
+aarch64_use_late_prologue_epilogue ()
+{
+ return aarch64_cfun_enables_pstate_sm ();
+}
+
 /* The current function's frame has a save slot for the incoming state
 of SVCR. Return a legitimate memory for the slot, based on the hard
 frame pointer. */
@@ -11469,6 +11563,9 @@ aarch64_expand_prologue (void)
 unsigned reg2 = frame.wb_push_candidate2;
 bool emit_frame_chain = frame.emit_frame_chain;
 rtx_insn *insn;
+ aarch64_feature_flags force_isa_mode = 0;
+ if (aarch64_cfun_enables_pstate_sm ())
+ force_isa_mode = AARCH64_FL_SM_ON;

 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
 {
@@ -11530,7 +11627,7 @@ aarch64_expand_prologue (void)
 less the amount of the guard reserved for use by the caller's
 outgoing args. */
 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
- true, false);
+ force_isa_mode, true, false);

 if (callee_adjust != 0)
 aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -11553,7 +11650,8 @@ aarch64_expand_prologue (void)
 gcc_assert (known_eq (chain_offset, 0));
 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
 stack_pointer_rtx, chain_offset,
- tmp1_rtx, tmp0_rtx, frame_pointer_needed);
+ tmp1_rtx, tmp0_rtx, force_isa_mode,
+ frame_pointer_needed);
 if (frame_pointer_needed && !frame_size.is_constant ())
 {
 /* Variable-sized frames need to describe the save slot
@@ -11600,6 +11698,7 @@ aarch64_expand_prologue (void)
 || known_eq (initial_adjust, 0));
 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
 sve_callee_adjust,
+ force_isa_mode,
 !frame_pointer_needed, false);
 bytes_below_sp -= sve_callee_adjust;
 }
@@ -11612,12 +11711,15 @@ aarch64_expand_prologue (void)
 that is assumed by the called. */
 gcc_assert (known_eq (bytes_below_sp, final_adjust));
 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+ force_isa_mode,
 !frame_pointer_needed, true);
 if (emit_frame_chain && maybe_ne (final_adjust, 0))
 aarch64_emit_stack_tie (hard_frame_pointer_rtx);

- /* Save the incoming value of PSTATE.SM, if required. */
- if (known_ge (frame.old_svcr_offset, 0))
+ /* Save the incoming value of PSTATE.SM, if required. Code further
+ down does this for locally-streaming functions. */
+ if (known_ge (frame.old_svcr_offset, 0)
+ && !aarch64_cfun_enables_pstate_sm ())
 {
 rtx mem = aarch64_old_svcr_mem ();
 MEM_VOLATILE_P (mem) = 1;
@@ -11649,6 +11751,34 @@ aarch64_expand_prologue (void)
 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
 }
 }
+
+ /* Enable PSTATE.SM, if required. */
+ if (aarch64_cfun_enables_pstate_sm ())
+ {
+ rtx_insn *guard_label = nullptr;
+ if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+ {
+ /* The current function is streaming-compatible. Save the
+ original state of PSTATE.SM. */
+ rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
+ emit_insn (gen_aarch64_read_svcr (svcr));
+ emit_move_insn (aarch64_old_svcr_mem (), svcr);
+ guard_label = aarch64_guard_switch_pstate_sm (svcr,
+ aarch64_isa_flags);
+ }
+ aarch64_sme_mode_switch_regs args_switch;
+ auto &args = crtl->args.info;
+ for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
+ {
+ rtx x = args.sme_mode_switch_args[i];
+ args_switch.add_reg (GET_MODE (x), REGNO (x));
+ }
+ args_switch.emit_prologue ();
+ emit_insn (gen_aarch64_smstart_sm ());
+ args_switch.emit_epilogue ();
+ if (guard_label)
+ emit_label (guard_label);
+ }
 }

 /* Return TRUE if we can use a simple_return insn.
@@ -11695,6 +11825,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
 HOST_WIDE_INT guard_size
 = 1 << param_stack_clash_protection_guard_size;
 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+ aarch64_feature_flags force_isa_mode = 0;
+ if (aarch64_cfun_enables_pstate_sm ())
+ force_isa_mode = AARCH64_FL_SM_ON;

 /* We can re-use the registers when:

@@ -11719,6 +11852,24 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
 = maybe_ne (get_frame_size ()
 + frame.saved_varargs_size, 0);

+ /* Reset PSTATE.SM, if required. */
+ if (aarch64_cfun_enables_pstate_sm ())
+ {
+ rtx_insn *guard_label = nullptr;
+ if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+ guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+ aarch64_isa_flags);
+ aarch64_sme_mode_switch_regs return_switch;
+ if (crtl->return_rtx && REG_P (crtl->return_rtx))
+ return_switch.add_reg (GET_MODE (crtl->return_rtx),
+ REGNO (crtl->return_rtx));
+ return_switch.emit_prologue ();
+ emit_insn (gen_aarch64_smstop_sm ());
+ return_switch.emit_epilogue ();
+ if (guard_label)
+ emit_label (guard_label);
+ }
+
 /* Emit a barrier to prevent loads from a deallocated stack. */
 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
 || cfun->calls_alloca
@@ -11739,19 +11890,21 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
 aarch64_add_offset (Pmode, stack_pointer_rtx,
 hard_frame_pointer_rtx,
 -bytes_below_hard_fp + final_adjust,
- tmp1_rtx, tmp0_rtx, callee_adjust == 0);
+ tmp1_rtx, tmp0_rtx, force_isa_mode,
+ callee_adjust == 0);
 else
 /* The case where we need to re-use the register here is very rare, so
 avoid the complicated condition and just always emit a move if the
 immediate doesn't fit. */
- aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
+ aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);

 /* Restore the vector registers before the predicate registers,
 so that we can use P4 as a temporary for big-endian SVE frames. */
 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
 if (maybe_ne (sve_callee_adjust, 0))
- aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+ aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
+ force_isa_mode, true);

 /* When shadow call stack is enabled, the scs_pop in the epilogue will
 restore x30, we don't need to restore x30 again in the traditional
@@ -11781,7 +11934,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)

 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
 add restriction on emit_move optimization to leaf functions. */
- aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+ aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
 (!can_inherit_p || !crtl->is_leaf
 || df_regs_ever_live_p (EP0_REGNUM)));

@@ -11914,7 +12067,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

 if (vcall_offset == 0)
- aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
+ aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
+ 0, false);
 else
 {
 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
@@ -11927,7 +12081,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
 plus_constant (Pmode, this_rtx, delta));
 else
 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
- temp1, temp0, false);
+ temp1, temp0, 0, false);
 }

 if (Pmode == ptr_mode)
@@ -30962,6 +31116,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_EXTRA_LIVE_ON_ENTRY
 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry

+#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
+#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
+
 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue

diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
index 8b0755014..dc5c097bd 100644
--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
 void f5 () __arm_inout("za");
 void f6 () __arm_preserves("za");
 __arm_new("za") void f7 () {}
+__arm_locally_streaming void f8 () {}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
index fcabe3edc..22f5facfd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
 void f5 () __arm_inout("za");
 void f6 () __arm_preserves("za");
 __arm_new("za") void f7 () {}
+__arm_locally_streaming void f8 () {}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
new file mode 100644
index 000000000..20ff4b87d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
@@ -0,0 +1,466 @@
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+void consume_za () [[arm::streaming, arm::inout("za")]];
+
+/*
+** n_ls:
+** sub sp, sp, #?80
+** cntd x16
+** str x16, \[sp\]
+** stp d8, d9, \[sp, #?16\]
+** stp d10, d11, \[sp, #?32\]
+** stp d12, d13, \[sp, #?48\]
+** stp d14, d15, \[sp, #?64\]
+** smstart sm
+** smstop sm
+** ldp d8, d9, \[sp, #?16\]
+** ldp d10, d11, \[sp, #?32\]
+** ldp d12, d13, \[sp, #?48\]
+** ldp d14, d15, \[sp, #?64\]
+** add sp, sp, #?80
+** ret
+*/
+[[arm::locally_streaming]] void
+n_ls ()
+{
+ asm ("");
+}
+
+/*
+** s_ls:
+** ret
+*/
+[[arm::locally_streaming]] void
+s_ls () [[arm::streaming]]
+{
+ asm ("");
+}
+
+/*
+** sc_ls:
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** cntd x16
+** str x16, \[sp, #?24\]
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** tbnz x16, 0, [^\n]+
+** smstart sm
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, [^\n]+
+** smstop sm
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** ret
+*/
+[[arm::locally_streaming]] void
+sc_ls () [[arm::streaming_compatible]]
+{
+ asm ("");
+}
+
+/*
+** n_ls_new_za:
+** str x30, \[sp, #?-80\]!
+** cntd x16
+** str x16, \[sp, #?8\]
+** stp d8, d9, \[sp, #?16\]
+** stp d10, d11, \[sp, #?32\]
+** stp d12, d13, \[sp, #?48\]
+** stp d14, d15, \[sp, #?64\]
+** smstart sm
+** mrs (x[0-9]+), tpidr2_el0
+** cbz \1, [^\n]+
+** bl __arm_tpidr2_save
+** msr tpidr2_el0, xzr
+** zero { za }
+** smstart za
+** bl consume_za
+** smstop za
+** smstop sm
+** ldp d8, d9, \[sp, #?16\]
+** ldp d10, d11, \[sp, #?32\]
+** ldp d12, d13, \[sp, #?48\]
+** ldp d14, d15, \[sp, #?64\]
+** ldr x30, \[sp\], #?80
+** ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+n_ls_new_za ()
+{
+ consume_za ();
+ asm ("");
+}
+
+/*
+** s_ls_new_za:
+** str x30, \[sp, #?-16\]!
+** mrs (x[0-9]+), tpidr2_el0
+** cbz \1, [^\n]+
+** bl __arm_tpidr2_save
+** msr tpidr2_el0, xzr
+** zero { za }
+** smstart za
+** bl consume_za
+** smstop za
+** ldr x30, \[sp\], #?16
+** ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+s_ls_new_za () [[arm::streaming]]
+{
+ consume_za ();
+ asm ("");
+}
+
+/*
+** sc_ls_new_za:
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** cntd x16
+** str x16, \[sp, #?24\]
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** tbnz x16, 0, [^\n]+
+** smstart sm
+** mrs (x[0-9]+), tpidr2_el0
+** cbz \1, [^\n]+
+** bl __arm_tpidr2_save
+** msr tpidr2_el0, xzr
+** zero { za }
+** smstart za
+** bl consume_za
+** smstop za
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, [^\n]+
+** smstop sm
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+sc_ls_new_za () [[arm::streaming_compatible]]
+{
+ consume_za ();
+ asm ("");
+}
+
+/*
+** n_ls_shared_za:
+** str x30, \[sp, #?-80\]!
+** cntd x16
+** str x16, \[sp, #?8\]
+** stp d8, d9, \[sp, #?16\]
+** stp d10, d11, \[sp, #?32\]
+** stp d12, d13, \[sp, #?48\]
+** stp d14, d15, \[sp, #?64\]
+** smstart sm
+** bl consume_za
+** smstop sm
+** ldp d8, d9, \[sp, #?16\]
+** ldp d10, d11, \[sp, #?32\]
+** ldp d12, d13, \[sp, #?48\]
+** ldp d14, d15, \[sp, #?64\]
+** ldr x30, \[sp\], #?80
+** ret
+*/
+[[arm::locally_streaming]] void
+n_ls_shared_za () [[arm::inout("za")]]
+{
+ consume_za ();
+ asm ("");
+}
+
+/*
+** s_ls_shared_za:
+** str x30, \[sp, #?-16\]!
+** bl consume_za
+** ldr x30, \[sp\], #?16
+** ret
+*/
+[[arm::locally_streaming]] void
+s_ls_shared_za () [[arm::streaming, arm::inout("za")]]
+{
+ consume_za ();
+ asm ("");
+}
+
+/*
+** sc_ls_shared_za:
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** cntd x16
+** str x16, \[sp, #?24\]
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** tbnz x16, 0, [^\n]+
+** smstart sm
+** bl consume_za
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, [^\n]+
+** smstop sm
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** ret
+*/
+[[arm::locally_streaming]] void
+sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]]
+{
+ consume_za ();
+ asm ("");
+}
+
+/*
+** n_ls_vector_pcs:
+** sub sp, sp, #?272
+** cntd x16
+** str x16, \[sp\]
+** stp q8, q9, \[sp, #?16\]
+** stp q10, q11, \[sp, #?48\]
+** stp q12, q13, \[sp, #?80\]
+** stp q14, q15, \[sp, #?112\]
+** stp q16, q17, \[sp, #?144\]
+** stp q18, q19, \[sp, #?176\]
+** stp q20, q21, \[sp, #?208\]
+** stp q22, q23, \[sp, #?240\]
+** smstart sm
+** smstop sm
+** ldp q8, q9, \[sp, #?16\]
+** ldp q10, q11, \[sp, #?48\]
+** ldp q12, q13, \[sp, #?80\]
+** ldp q14, q15, \[sp, #?112\]
+** ldp q16, q17, \[sp, #?144\]
+** ldp q18, q19, \[sp, #?176\]
+** ldp q20, q21, \[sp, #?208\]
+** ldp q22, q23, \[sp, #?240\]
+** add sp, sp, #?272
+** ret
+*/
+[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs))
+n_ls_vector_pcs ()
+{
+ asm ("");
+}
+
+/*
+** n_ls_sve_pcs:
+** sub sp, sp, #?16
+** cntd x16
+** str x16, \[sp\]
+** addsvl sp, sp, #-18
+** str p4, \[sp\]
+** str p5, \[sp, #1, mul vl\]
+** str p6, \[sp, #2, mul vl\]
+** str p7, \[sp, #3, mul vl\]
+** str p8, \[sp, #4, mul vl\]
+** str p9, \[sp, #5, mul vl\]
+** str p10, \[sp, #6, mul vl\]
+** str p11, \[sp, #7, mul vl\]
+** str p12, \[sp, #8, mul vl\]
+** str p13, \[sp, #9, mul vl\]
+** str p14, \[sp, #10, mul vl\]
+** str p15, \[sp, #11, mul vl\]
+** str z8, \[sp, #2, mul vl\]
+** str z9, \[sp, #3, mul vl\]
+** str z10, \[sp, #4, mul vl\]
+** str z11, \[sp, #5, mul vl\]
+** str z12, \[sp, #6, mul vl\]
+** str z13, \[sp, #7, mul vl\]
+** str z14, \[sp, #8, mul vl\]
+** str z15, \[sp, #9, mul vl\]
+** str z16, \[sp, #10, mul vl\]
+** str z17, \[sp, #11, mul vl\]
+** str z18, \[sp, #12, mul vl\]
+** str z19, \[sp, #13, mul vl\]
+** str z20, \[sp, #14, mul vl\]
+** str z21, \[sp, #15, mul vl\]
+** str z22, \[sp, #16, mul vl\]
+** str z23, \[sp, #17, mul vl\]
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** smstop sm
+** ldr z8, \[sp, #2, mul vl\]
+** ldr z9, \[sp, #3, mul vl\]
+** ldr z10, \[sp, #4, mul vl\]
+** ldr z11, \[sp, #5, mul vl\]
+** ldr z12, \[sp, #6, mul vl\]
+** ldr z13, \[sp, #7, mul vl\]
+** ldr z14, \[sp, #8, mul vl\]
+** ldr z15, \[sp, #9, mul vl\]
+** ldr z16, \[sp, #10, mul vl\]
+** ldr z17, \[sp, #11, mul vl\]
+** ldr z18, \[sp, #12, mul vl\]
+** ldr z19, \[sp, #13, mul vl\]
+** ldr z20, \[sp, #14, mul vl\]
+** ldr z21, \[sp, #15, mul vl\]
+** ldr z22, \[sp, #16, mul vl\]
+** ldr z23, \[sp, #17, mul vl\]
+** ldr p4, \[sp\]
+** ldr p5, \[sp, #1, mul vl\]
+** ldr p6, \[sp, #2, mul vl\]
+** ldr p7, \[sp, #3, mul vl\]
+** ldr p8, \[sp, #4, mul vl\]
+** ldr p9, \[sp, #5, mul vl\]
+** ldr p10, \[sp, #6, mul vl\]
+** ldr p11, \[sp, #7, mul vl\]
+** ldr p12, \[sp, #8, mul vl\]
+** ldr p13, \[sp, #9, mul vl\]
+** ldr p14, \[sp, #10, mul vl\]
+** ldr p15, \[sp, #11, mul vl\]
+** addsvl sp, sp, #18
+** add sp, sp, #?16
+** ret
+*/
+[[arm::locally_streaming]] void
+n_ls_sve_pcs (__SVBool_t x)
+{
+ asm ("");
+}
+
+/*
+** n_ls_v0:
+** addsvl sp, sp, #-1
+** ...
+** smstart sm
+** add x[0-9]+, [^\n]+
+** smstop sm
+** ...
+** addsvl sp, sp, #1
+** ...
+*/
+#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
+[[arm::locally_streaming]] void
+n_ls_v0 ()
+{
+ TEST (v0);
+}
+
+/*
+** n_ls_v32:
+** addsvl sp, sp, #-32
+** ...
+** smstart sm
+** ...
+** smstop sm
+** ...
+** rdsvl (x[0-9]+), #1
+** lsl (x[0-9]+), \1, #?5
+** add sp, sp, \2
+** ...
+*/
+[[arm::locally_streaming]] void
+n_ls_v32 ()
+{
+ TEST (v0);
+ TEST (v1);
+ TEST (v2);
+ TEST (v3);
+ TEST (v4);
+ TEST (v5);
+ TEST (v6);
+ TEST (v7);
+ TEST (v8);
+ TEST (v9);
+ TEST (v10);
+ TEST (v11);
+ TEST (v12);
+ TEST (v13);
+ TEST (v14);
+ TEST (v15);
+ TEST (v16);
+ TEST (v17);
+ TEST (v18);
+ TEST (v19);
+ TEST (v20);
+ TEST (v21);
+ TEST (v22);
+ TEST (v23);
+ TEST (v24);
+ TEST (v25);
+ TEST (v26);
+ TEST (v27);
+ TEST (v28);
+ TEST (v29);
+ TEST (v30);
+ TEST (v31);
+}
+
+/*
+** n_ls_v33:
+** rdsvl (x[0-9]+), #1
+** mov (x[0-9]+), #?33
+** mul (x[0-9]+), (?:\1, \2|\2, \1)
+** sub sp, sp, \3
+** ...
+** smstart sm
+** ...
+** smstop sm
+** ...
+** rdsvl (x[0-9]+), #1
+** mov (x[0-9]+), #?33
+** mul (x[0-9]+), (?:\4, \5|\5, \4)
+** add sp, sp, \6
+** ...
+*/
+[[arm::locally_streaming]] void
+n_ls_v33 ()
+{
+ TEST (v0);
+ TEST (v1);
+ TEST (v2);
+ TEST (v3);
+ TEST (v4);
+ TEST (v5);
+ TEST (v6);
+ TEST (v7);
+ TEST (v8);
+ TEST (v9);
+ TEST (v10);
+ TEST (v11);
+ TEST (v12);
+ TEST (v13);
+ TEST (v14);
+ TEST (v15);
+ TEST (v16);
+ TEST (v17);
+ TEST (v18);
+ TEST (v19);
+ TEST (v20);
+ TEST (v21);
+ TEST (v22);
+ TEST (v23);
+ TEST (v24);
+ TEST (v25);
+ TEST (v26);
+ TEST (v27);
+ TEST (v28);
+ TEST (v29);
+ TEST (v30);
+ TEST (v31);
+ TEST (v32);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
new file mode 100644
index 000000000..0eba99385
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
@@ -0,0 +1,177 @@
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+** ...
+** smstart sm
+** ...
+** fmov x10, d0
+** smstop sm
+** fmov d0, x10
+** ...
+*/
+[[arm::locally_streaming]] double
+test_d0 ()
+{
+ asm ("");
+ return 1.0f;
+}
+
+/*
+** test_d0_vec:
+** ...
+** smstart sm
+** ...
+** (
+** fmov x10, d0
+** |
+** umov x10, v0.d\[0\]
+** )
+** smstop sm
+** fmov d0, x10
+** ...
+*/
+[[arm::locally_streaming]] int8x8_t
+test_d0_vec ()
+{
+ asm ("");
+ return (int8x8_t) {};
+}
+
+/*
+** test_q0:
+** ...
+** smstart sm
+** ...
+** str q0, \[sp, #?-16\]!
+** smstop sm
+** ldr q0, \[sp\], #?16
+** ...
+*/
+[[arm::locally_streaming]] int8x16_t
+test_q0 ()
+{
+ asm ("");
+ return (int8x16_t) {};
+}
+
+/*
+** test_q1:
+** ...
+** smstart sm
+** ...
+** stp q0, q1, \[sp, #?-32\]!
+** smstop sm
+** ldp q0, q1, \[sp\], #?32
+** ...
+*/
+[[arm::locally_streaming]] int8x16x2_t
+test_q1 ()
+{
+ asm ("");
+ return (int8x16x2_t) {};
+}
+
+/*
+** test_q2:
+** ...
+** smstart sm
+** ...
+** stp q0, q1, \[sp, #?-48\]!
+** str q2, \[sp, #?32\]
+** smstop sm
+** ldr q2, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?48
+** ...
+*/
+[[arm::locally_streaming]] int8x16x3_t
+test_q2 ()
+{
+ asm ("");
+ return (int8x16x3_t) {};
+}
+
+/*
+** test_q3:
+** ...
+** smstart sm
+** ...
+** stp q0, q1, \[sp, #?-64\]!
+** stp q2, q3, \[sp, #?32\]
+** smstop sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?64
+** ...
+*/
+[[arm::locally_streaming]] int8x16x4_t
+test_q3 ()
+{
+ asm ("");
+ return (int8x16x4_t) {};
+}
+
+/*
+** test_z0:
+** ...
+** smstart sm
+** mov z0\.b, #0
+** addvl sp, sp, #-1
+** str z0, \[sp\]
+** smstop sm
+** ldr z0, \[sp\]
+** addvl sp, sp, #1
+** ...
+*/
+[[arm::locally_streaming]] svint8_t
+test_z0 ()
+{
+ asm ("");
+ return (svint8_t) {};
+}
+
+/*
+** test_z3:
+** ...
+** smstart sm
+** ...
+** addvl sp, sp, #-4
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** smstop sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** ...
+*/
+[[arm::locally_streaming]] svint8x4_t
+test_z3 ()
+{
+ asm ("");
+ return (svint8x4_t) {};
+}
+
+/*
+** test_p0:
+** ...
+** smstart sm
+** pfalse p0\.b
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** ...
+*/
+[[arm::locally_streaming]] svbool_t
+test_p0 ()
+{
+ asm ("");
+ return (svbool_t) {};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
new file mode 100644
index 000000000..2bdea6ac6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
@@ -0,0 +1,273 @@
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+** ...
+** fmov x10, d0
+** smstart sm
+** fmov d0, x10
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_d0 (double d0)
+{
+ asm ("");
+}
+
+/*
+** test_d7:
+** ...
+** fmov x10, d0
+** fmov x11, d1
+** fmov x12, d2
+** fmov x13, d3
+** fmov x14, d4
+** fmov x15, d5
+** fmov x16, d6
+** fmov x17, d7
+** smstart sm
+** fmov d0, x10
+** fmov d1, x11
+** fmov d2, x12
+** fmov d3, x13
+** fmov d4, x14
+** fmov d5, x15
+** fmov d6, x16
+** fmov d7, x17
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_d7 (double d0, double d1, double d2, double d3,
+ double d4, double d5, double d6, double d7)
+{
+ asm ("");
+}
+
+/*
+** test_d0_vec:
+** ...
+** (
+** fmov x10, d0
+** |
+** umov x10, v0.d\[0\]
+** )
+** smstart sm
+** fmov d0, x10
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_d0_vec (int8x8_t d0)
+{
+ asm ("");
+}
+
+/*
+** test_d7_vec:
+** ...
+** (
+** fmov x10, d0
+** fmov x11, d1
+** fmov x12, d2
+** fmov x13, d3
+** fmov x14, d4
+** fmov x15, d5
+** fmov x16, d6
+** fmov x17, d7
+** |
+** umov x10, v0.d\[0\]
+** umov x11, v1.d\[0\]
+** umov x12, v2.d\[0\]
+** umov x13, v3.d\[0\]
+** umov x14, v4.d\[0\]
+** umov x15, v5.d\[0\]
+** umov x16, v6.d\[0\]
+** umov x17, v7.d\[0\]
+** )
+** smstart sm
+** fmov d0, x10
+** fmov d1, x11
+** fmov d2, x12
+** fmov d3, x13
+** fmov d4, x14
+** fmov d5, x15
+** fmov d6, x16
+** fmov d7, x17
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
+ int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
+{
+ asm ("");
+}
+
+/*
+** test_q0:
+** ...
+** str q0, \[sp, #?-16\]!
+** smstart sm
+** ldr q0, \[sp\], #?16
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_q0 (int8x16_t q0)
+{
+ asm ("");
+}
+
+/*
+** test_q7:
+** ...
+** stp q0, q1, \[sp, #?-128\]!
+** stp q2, q3, \[sp, #?32\]
+** stp q4, q5, \[sp, #?64\]
+** stp q6, q7, \[sp, #?96\]
+** smstart sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q4, q5, \[sp, #?64\]
+** ldp q6, q7, \[sp, #?96\]
+** ldp q0, q1, \[sp\], #?128
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_q7 (int8x16x4_t q0, int8x16x4_t q4)
+{
+ asm ("");
+}
+
+/*
+** test_z0:
+** ...
+** addvl sp, sp, #-1
+** str z0, \[sp\]
+** smstart sm
+** ldr z0, \[sp\]
+** addvl sp, sp, #1
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_z0 (svint8_t z0)
+{
+ asm ("");
+}
+
+/*
+** test_z7:
+** ...
+** addvl sp, sp, #-8
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** str z4, \[sp, #4, mul vl\]
+** str z5, \[sp, #5, mul vl\]
+** str z6, \[sp, #6, mul vl\]
+** str z7, \[sp, #7, mul vl\]
+** smstart sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** ldr z4, \[sp, #4, mul vl\]
+** ldr z5, \[sp, #5, mul vl\]
+** ldr z6, \[sp, #6, mul vl\]
+** ldr z7, \[sp, #7, mul vl\]
+** addvl sp, sp, #8
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_z7 (svint8x4_t z0, svint8x4_t z4)
+{
+ asm ("");
+}
+
+/*
+** test_p0:
+** ...
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_p0 (svbool_t p0)
+{
+ asm ("");
+}
+
+/*
+** test_p3:
+** ...
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** str p1, \[sp, #1, mul vl\]
+** str p2, \[sp, #2, mul vl\]
+** str p3, \[sp, #3, mul vl\]
+** smstart sm
+** ldr p0, \[sp\]
+** ldr p1, \[sp, #1, mul vl\]
+** ldr p2, \[sp, #2, mul vl\]
+** ldr p3, \[sp, #3, mul vl\]
+** addvl sp, sp, #1
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+ asm ("");
+}
+
+/*
+** test_mixed:
+** ...
+** addvl sp, sp, #-3
+** str p0, \[sp\]
+** str p1, \[sp, #1, mul vl\]
+** str p2, \[sp, #2, mul vl\]
+** str p3, \[sp, #3, mul vl\]
+** str z3, \[sp, #1, mul vl\]
+** str z7, \[sp, #2, mul vl\]
+** stp q2, q6, \[sp, #?-32\]!
+** fmov w10, s0
+** fmov x11, d1
+** fmov w12, s4
+** fmov x13, d5
+** smstart sm
+** fmov s0, w10
+** fmov d1, x11
+** fmov s4, w12
+** fmov d5, x13
+** ldp q2, q6, \[sp\], #?32
+** ldr p0, \[sp\]
+** ldr p1, \[sp, #1, mul vl\]
+** ldr p2, \[sp, #2, mul vl\]
+** ldr p3, \[sp, #3, mul vl\]
+** ldr z3, \[sp, #1, mul vl\]
+** ldr z7, \[sp, #2, mul vl\]
+** addvl sp, sp, #3
+** smstop sm
+** ...
+*/
+[[arm::locally_streaming]] void
+test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
+ float s4, double d5, float64x2_t q6, svfloat64_t z7,
+ svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+ asm ("");
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
new file mode 100644
index 000000000..42adeb152
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
@@ -0,0 +1,145 @@
+// { dg-options "-O -fomit-frame-pointer" }
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+** ...
+** smstart sm
+** ...
+** fmov x10, d0
+** smstop sm
+** fmov d0, x10
+** ...
+** smstart sm
+** ...
+** smstop sm
+** ...
+*/
+void consume_d0 (double d0);
+
+__arm_locally_streaming void
+test_d0 ()
+{
+ asm ("");
+ consume_d0 (1.0);
+ asm ("");
+}
+
+/*
+** test_d7:
+** ...
+** fmov x10, d0
+** fmov x11, d1
+** fmov x12, d2
+** fmov x13, d3
+** fmov x14, d4
+** fmov x15, d5
+** fmov x16, d6
+** fmov x17, d7
+** smstop sm
+** fmov d0, x10
+** fmov d1, x11
+** fmov d2, x12
+** fmov d3, x13
+** fmov d4, x14
+** fmov d5, x15
+** fmov d6, x16
+** fmov d7, x17
+** ...
+*/
+void consume_d7 (double d0, double d1, double d2, double d3,
+ double d4, double d5, double d6, double d7);
+__arm_locally_streaming void
+test_d7 ()
+{
+ asm ("");
+ consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ asm ("");
+}
+
+/*
+** test_q7:
+** ...
+** stp q0, q1, \[sp, #?-128\]!
+** stp q2, q3, \[sp, #?32\]
+** stp q4, q5, \[sp, #?64\]
+** stp q6, q7, \[sp, #?96\]
+** smstop sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q4, q5, \[sp, #?64\]
+** ldp q6, q7, \[sp, #?96\]
+** ldp q0, q1, \[sp\], #?128
+** ...
+*/
+void consume_q7 (int8x16x4_t q0, int8x16x4_t q4);
+
+__arm_locally_streaming void
+test_q7 (int8x16x4_t *ptr)
+{
+ asm ("");
+ consume_q7 (ptr[0], ptr[1]);
+ asm ("");
+}
+
+/*
+** test_z7:
+** ...
+** addvl sp, sp, #-8
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** str z4, \[sp, #4, mul vl\]
+** str z5, \[sp, #5, mul vl\]
+** str z6, \[sp, #6, mul vl\]
+** str z7, \[sp, #7, mul vl\]
+** smstop sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** ldr z4, \[sp, #4, mul vl\]
+** ldr z5, \[sp, #5, mul vl\]
+** ldr z6, \[sp, #6, mul vl\]
+** ldr z7, \[sp, #7, mul vl\]
+** addvl sp, sp, #8
+** ...
+*/
+void consume_z7 (svint8x4_t z0, svint8x4_t z4);
+
+__arm_locally_streaming void
+test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2)
+{
+ asm ("");
+ consume_z7 (*ptr1, *ptr2);
+ asm ("");
+}
+
+/*
+** test_p3:
+** ...
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** str p1, \[sp, #1, mul vl\]
+** str p2, \[sp, #2, mul vl\]
+** str p3, \[sp, #3, mul vl\]
+** smstop sm
+** ldr p0, \[sp\]
+** ldr p1, \[sp, #1, mul vl\]
+** ldr p2, \[sp, #2, mul vl\]
+** ldr p3, \[sp, #3, mul vl\]
+** addvl sp, sp, #1
+** ...
+*/
+void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3);
+
+__arm_locally_streaming void
+test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4)
+{
+ asm ("");
+ consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4);
+ asm ("");
+}
--
2.33.0