gcc/0003-Sw64-Port-add-multi-prefetch-support-for-sw64.patch

From bfa51c2eda3e40fdfd64601f5e7df19049a006cd Mon Sep 17 00:00:00 2001
From: swcompiler <lc@wxiat.com>
Date: Mon, 25 Nov 2024 16:33:10 +0800
Subject: [PATCH 03/16] Sw64 Port: add multi-prefetch support for sw64
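
Add the __builtin_prefetch_sc and __builtin_prefetch_tc builtins together
with the corresponding prefetch_sc/prefetch_tc target insns, so that the
GIMPLE loop prefetcher (tree-ssa-loop-prefetch) can emit prefetches into
the second- and third-level caches on Sw64 in addition to the existing L1
prefetch.  New --param=pf1/pf2/pf3 options tune the L1/L2/L3 prefetch
distances; their unsigned 0-200 range is mapped onto -100..100 cache
lines by convert_default_to_sw.

The new builtins take the same address/rw/locality arguments as
__builtin_prefetch.  An illustrative sketch (not part of the patch):

    void
    warm_levels (const double *p)
    {
      __builtin_prefetch (p, 0, 3);     /* first-level cache  */
      __builtin_prefetch_sc (p, 0, 3);  /* second-level cache */
      __builtin_prefetch_tc (p, 0, 3);  /* third-level cache  */
    }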
---
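Notes (illustrative, not part of the commit message):

The pfN params are UInteger, so values 0-100 are used as-is and 101-200
encode the negative offsets -1..-100; e.g. --param=pf2=110 maps to
convert_default_to_sw (110) == -10 and pulls the L2 prefetch 10 cache
lines closer, while --param=pf1=2 pushes the L1 prefetch 2 cache lines
further ahead.  The flag_sw_prefetch_{dc,sc,tc} flags tested below are
assumed to be provided by the Sw64 backend patches in this series.  A
hypothetical invocation with an sw64-targeted compiler:

    sw64-gcc -O3 -fprefetch-loop-arrays --param=pf1=2 --param=pf2=110 foo.c
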
gcc/builtins.cc | 161 +++++++++++++++++++++++++++++++++-
gcc/builtins.def | 2 +
gcc/ipa-pure-const.cc | 2 +
gcc/opt-functions.awk | 4 +-
gcc/params.opt | 12 +++
gcc/target-insns.def | 3 +
gcc/tree-ssa-loop-prefetch.cc | 155 +++++++++++++++++++++++++++++++-
7 files changed, 336 insertions(+), 3 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 57929a42b..c2589f316 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -109,6 +109,8 @@ static int apply_args_size (void);
static int apply_result_size (void);
static rtx result_vector (int, rtx);
static void expand_builtin_prefetch (tree);
+static void expand_builtin_prefetch_sc (tree);
+static void expand_builtin_prefetch_tc (tree);
static rtx expand_builtin_apply_args (void);
static rtx expand_builtin_apply_args_1 (void);
static rtx expand_builtin_apply (rtx, rtx, rtx);
@@ -1352,6 +1354,156 @@ expand_builtin_prefetch (tree exp)
emit_insn (op0);
}
+static void
+expand_builtin_prefetch_sc (tree exp)
+{
+ tree arg0, arg1, arg2;
+ int nargs;
+ rtx op0, op1, op2;
+
+ if (!validate_arglist (exp, POINTER_TYPE, 0))
+ return;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+
+ /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to
+ * zero (read) and argument 2 (locality) defaults to 3 (high degree of
+ * locality). */
+ nargs = call_expr_nargs (exp);
+ if (nargs > 1)
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ else
+ arg1 = integer_zero_node;
+ if (nargs > 2)
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ else
+ arg2 = integer_three_node;
+
+ /* Argument 0 is an address. */
+ op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
+
+ /* Argument 1 (read/write flag) must be a compile-time constant int. */
+ if (TREE_CODE (arg1) != INTEGER_CST)
+ {
+ error ("second argument to %<__builtin_prefetch_sc%> must be a constant");
+ arg1 = integer_zero_node;
+ }
+ op1 = expand_normal (arg1);
+ /* Argument 1 must be either zero or one. */
+ if (INTVAL (op1) != 0 && INTVAL (op1) != 1)
+ {
+ warning (0, "invalid second argument to %<__builtin_prefetch_sc%>;"
+ " using zero");
+ op1 = const0_rtx;
+ }
+
+ /* Argument 2 (locality) must be a compile-time constant int. */
+ if (TREE_CODE (arg2) != INTEGER_CST)
+ {
+ error ("third argument to %<__builtin_prefetch_sc%> must be a constant");
+ arg2 = integer_zero_node;
+ }
+ op2 = expand_normal (arg2);
+ /* Argument 2 must be 0, 1, 2, or 3. */
+ if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+ {
+ warning (
+ 0, "invalid third argument to %<__builtin_prefetch_sc%>; using zero");
+ op2 = const0_rtx;
+ }
+
+ if (targetm.have_prefetch ())
+ {
+ class expand_operand ops[3];
+
+ create_address_operand (&ops[0], op0);
+ create_integer_operand (&ops[1], INTVAL (op1));
+ create_integer_operand (&ops[2], INTVAL (op2));
+ if (maybe_expand_insn (targetm.code_for_prefetch_sc, 3, ops))
+ return;
+ }
+
+ /* Don't do anything with direct references to volatile memory, but
+ * generate code to handle other side effects. */
+ if (!MEM_P (op0) && side_effects_p (op0))
+ emit_insn (op0);
+}
+
+static void
+expand_builtin_prefetch_tc (tree exp)
+{
+ tree arg0, arg1, arg2;
+ int nargs;
+ rtx op0, op1, op2;
+
+ if (!validate_arglist (exp, POINTER_TYPE, 0))
+ return;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+
+ /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to
+ * zero (read) and argument 2 (locality) defaults to 3 (high degree of
+ * locality). */
+ nargs = call_expr_nargs (exp);
+ if (nargs > 1)
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ else
+ arg1 = integer_zero_node;
+ if (nargs > 2)
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ else
+ arg2 = integer_three_node;
+
+ /* Argument 0 is an address. */
+ op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
+
+ /* Argument 1 (read/write flag) must be a compile-time constant int. */
+ if (TREE_CODE (arg1) != INTEGER_CST)
+ {
+ error ("second argument to %<__builtin_prefetch%> must be a constant");
+ arg1 = integer_zero_node;
+ }
+ op1 = expand_normal (arg1);
+ /* Argument 1 must be either zero or one. */
+ if (INTVAL (op1) != 0 && INTVAL (op1) != 1)
+ {
+ warning (0, "invalid second argument to %<__builtin_prefetch%>;"
+ " using zero");
+ op1 = const0_rtx;
+ }
+
+ /* Argument 2 (locality) must be a compile-time constant int. */
+ if (TREE_CODE (arg2) != INTEGER_CST)
+ {
+ error ("third argument to %<__builtin_prefetch%> must be a constant");
+ arg2 = integer_zero_node;
+ }
+ op2 = expand_normal (arg2);
+ /* Argument 2 must be 0, 1, 2, or 3. */
+ if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+ {
+ warning (0,
+ "invalid third argument to %<__builtin_prefetch%>; using zero");
+ op2 = const0_rtx;
+ }
+
+ if (targetm.have_prefetch ())
+ {
+ class expand_operand ops[3];
+
+ create_address_operand (&ops[0], op0);
+ create_integer_operand (&ops[1], INTVAL (op1));
+ create_integer_operand (&ops[2], INTVAL (op2));
+ if (maybe_expand_insn (targetm.code_for_prefetch_tc, 3, ops))
+ return;
+ }
+
+ /* Don't do anything with direct references to volatile memory, but
+ * generate code to handle other side effects. */
+ if (!MEM_P (op0) && side_effects_p (op0))
+ emit_insn (op0);
+}
+
/* Get a MEM rtx for expression EXP which is the address of an operand
to be used in a string instruction (cmpstrsi, cpymemsi, ..). LEN is
the maximum length of the block of memory that might be accessed or
@@ -7598,7 +7750,12 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
case BUILT_IN_PREFETCH:
expand_builtin_prefetch (exp);
return const0_rtx;
-
+ case BUILT_IN_PREFETCH_SC:
+ expand_builtin_prefetch_sc (exp);
+ return const0_rtx;
+ case BUILT_IN_PREFETCH_TC:
+ expand_builtin_prefetch_tc (exp);
+ return const0_rtx;
case BUILT_IN_INIT_TRAMPOLINE:
return expand_builtin_init_trampoline (exp, true);
case BUILT_IN_INIT_HEAP_TRAMPOLINE:
@@ -10989,6 +11146,8 @@ is_inexpensive_builtin (tree decl)
case BUILT_IN_LABS:
case BUILT_IN_LLABS:
case BUILT_IN_PREFETCH:
+ case BUILT_IN_PREFETCH_SC:
+ case BUILT_IN_PREFETCH_TC:
case BUILT_IN_ACC_ON_DEVICE:
return true;
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 005976f34..983de293e 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -924,6 +924,8 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C
DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
+DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_SC, "prefetch_sc", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
+DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_TC, "prefetch_tc", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST)
DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST)
diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc
index 2642df91e..89a950966 100644
--- a/gcc/ipa-pure-const.cc
+++ b/gcc/ipa-pure-const.cc
@@ -534,6 +534,8 @@ builtin_safe_for_const_function_p (bool *looping, tree callee)
*looping = false;
return true;
case BUILT_IN_PREFETCH:
+ case BUILT_IN_PREFETCH_SC:
+ case BUILT_IN_PREFETCH_TC:
*looping = true;
return true;
default:
diff --git a/gcc/opt-functions.awk b/gcc/opt-functions.awk
index 2aee0b9f1..0dabde89d 100644
--- a/gcc/opt-functions.awk
+++ b/gcc/opt-functions.awk
@@ -247,6 +247,8 @@ function var_type(flags)
return "HOST_WIDE_INT "
else if (flag_set_p("UInteger", flags))
return "int "
+ else if (flag_set_p("UInteger", flags))
+ return "int "
else
return "const char *"
}
@@ -256,7 +258,7 @@ function var_type(flags)
# type instead of int to save space.
function var_type_struct(flags)
{
- if (flag_set_p("UInteger", flags)) {
+ if (flag_set_p("UInteger", flags)) {
if (host_wide_int[var_name(flags)] == "yes")
return "HOST_WIDE_INT ";
if (flag_set_p("ByteSize", flags))
diff --git a/gcc/params.opt b/gcc/params.opt
index 3ddfaf5b2..5abc8ce82 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -385,6 +385,18 @@ The size of L1 cache.
Common Joined UInteger Var(param_l2_cache_size) Init(512) Param Optimization
The size of L2 cache.
+-param=pf1=
+Common Joined UInteger Var(PF1) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L1 prefetch delta.
+
+-param=pf2=
+Common Joined UInteger Var(PF2) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L2 prefetch delta.
+
+-param=pf3=
+Common Joined UInteger Var(PF3) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L3 prefetch delta.
+
-param=large-function-growth=
Common Joined UInteger Var(param_large_function_growth) Optimization Init(100) Param
Maximal growth due to inlining of large function (in percent).
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index de8c0092f..8b4da8bc4 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -77,6 +77,9 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (prefetch_sc, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (prefetch_tc, (rtx x0, rtx x1, rtx x2))
+/*********************/
DEF_TARGET_INSN (probe_stack, (rtx x0))
DEF_TARGET_INSN (probe_stack_address, (rtx x0))
DEF_TARGET_INSN (prologue, (void))
diff --git a/gcc/tree-ssa-loop-prefetch.cc b/gcc/tree-ssa-loop-prefetch.cc
index aebd7c920..6aa242260 100644
--- a/gcc/tree-ssa-loop-prefetch.cc
+++ b/gcc/tree-ssa-loop-prefetch.cc
@@ -193,6 +193,9 @@ along with GCC; see the file COPYING3. If not see
#define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
+#ifdef FLAG_SW64_PREFETCH
+#define L1_CACHE_LINE_SIZE ((unsigned) (param_l1_cache_line_size))
+#endif
/* We consider a memory access nontemporal if it is not reused sooner than
after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
@@ -1057,7 +1060,11 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
/* At most param_simultaneous_prefetches should be running
at the same time. */
+#ifdef FLAG_SW64_PREFETCH
+ remaining_prefetch_slots = param_simultaneous_prefetches * 5;
+#else
remaining_prefetch_slots = param_simultaneous_prefetches;
+#endif
/* The prefetch will run for AHEAD iterations of the original loop, i.e.,
AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
@@ -1081,8 +1088,10 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
/* The loop is far from being sufficiently unrolled for this
prefetch. Do not generate prefetch to avoid many redudant
prefetches. */
- if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
+#ifndef FLAG_SW64_PREFETCH
+ if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
continue;
+#endif
/* If we need to prefetch the reference each PREFETCH_MOD iterations,
and we unroll the loop UNROLL_FACTOR times, we need to insert
@@ -1153,6 +1162,19 @@ estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
return prefetch_count;
}
+#ifdef FLAG_SW64_PREFETCH
+/* SW needs to adjust the prefetch distance (PF) dynamically, so PF must be
+ * able to take negative values. However, the pfN params are declared as
+ * Common Joined UInteger Var(PFX), so map the unsigned 0-200 range onto
+ * -100..100: 0-100 stay as-is and 101-200 encode -1..-100. */
+int
+convert_default_to_sw (unsigned int pf_value)
+{
+ if (pf_value > 100)
+ return 100 - (int) pf_value;
+ return pf_value;
+}
+#endif
/* Issue prefetches for the reference REF into loop as decided before.
HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
is the factor by which LOOP was unrolled. */
@@ -1184,11 +1206,21 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
for (ap = 0; ap < n_prefetches; ap++)
{
+#ifdef FLAG_SW64_PREFETCH
+ if (flag_sw_prefetch_dc == 1)
+ {
+#endif
if (cst_and_fits_in_hwi (ref->group->step))
{
/* Determine the address to prefetch. */
+#ifdef FLAG_SW64_PREFETCH
+ delta = (ahead + ap * ref->prefetch_mod) *
+ int_cst_value (ref->group->step) * 2
+ + convert_default_to_sw (PF1) * L1_CACHE_LINE_SIZE;
+#else
delta = (ahead + ap * ref->prefetch_mod) *
int_cst_value (ref->group->step);
+#endif
addr = fold_build_pointer_plus_hwi (addr_base, delta);
addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
NULL, true, GSI_SAME_STMT);
@@ -1220,6 +1252,86 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
3, addr, write_p, local);
gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
+#ifdef FLAG_SW64_PREFETCH
+ }
+ /* Generate the L2 prefetch if the option is enabled. */
+ if (flag_sw_prefetch_sc == 1)
+ {
+ if (cst_and_fits_in_hwi (ref->group->step))
+ {
+ delta = (ahead + ap * ref->prefetch_mod) *
+ int_cst_value (ref->group->step) * 2
+ + (4 + convert_default_to_sw (PF2)) * L1_CACHE_LINE_SIZE;
+
+ addr = fold_build_pointer_plus_hwi (addr_base, delta);
+ addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+ NULL, true, GSI_SAME_STMT);
+ }
+ else
+ {
+ ahead += (unsigned) (convert_default_to_sw (PF2)
+ - convert_default_to_sw (PF1));
+ forward = fold_build2 (MULT_EXPR, sizetype,
+ fold_convert (sizetype, ref->group->step),
+ fold_convert (sizetype, size_int (ahead)));
+ addr = fold_build_pointer_plus (addr_base, forward);
+ addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+ NULL, true, GSI_SAME_STMT);
+ }
+
+ if (addr_base != addr && TREE_CODE (addr_base) == SSA_NAME
+ && TREE_CODE (addr) == SSA_NAME)
+ {
+ duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
+ if (SSA_NAME_PTR_INFO (addr))
+ mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
+ }
+
+ /* Create the L2 prefetch instruction. */
+ prefetch
+ = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_SC),
+ 3, addr, write_p, local);
+ gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
+ }
+ /* Generate the L3 prefetch if the option is enabled. */
+ if (flag_sw_prefetch_tc == 1)
+ {
+ if (cst_and_fits_in_hwi (ref->group->step))
+ {
+ delta = (ahead + ap * ref->prefetch_mod) *
+ int_cst_value (ref->group->step) * 2
+ + (10 + convert_default_to_sw (PF3)) * L1_CACHE_LINE_SIZE;
+
+ addr = fold_build_pointer_plus_hwi (addr_base, delta);
+ addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+ NULL, true, GSI_SAME_STMT);
+ }
+ else
+ {
+ ahead += (unsigned) (convert_default_to_sw (PF3)
+ - convert_default_to_sw (PF1));
+ forward = fold_build2 (MULT_EXPR, sizetype,
+ fold_convert (sizetype, ref->group->step),
+ fold_convert (sizetype, size_int (ahead)));
+ addr = fold_build_pointer_plus (addr_base, forward);
+ addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+ NULL, true, GSI_SAME_STMT);
+ }
+
+ if (addr_base != addr && TREE_CODE (addr_base) == SSA_NAME
+ && TREE_CODE (addr) == SSA_NAME)
+ {
+ duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
+ if (SSA_NAME_PTR_INFO (addr))
+ mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
+ }
+ /* Create the L3 prefetch instruction. */
+ prefetch
+ = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_TC),
+ 3, addr, write_p, local);
+ gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
+ }
+#endif
}
}
@@ -1375,9 +1487,22 @@ should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
as well; but the unrolling/prefetching is usually more profitable for
loops consisting of a single basic block, and we want to limit the
code growth. */
+#ifdef FLAG_SW64_PREFETCH
+ if (flag_sw_prefetch_unroll == 1)
+ {
+ if (loop->num_nodes > 7)
+ return false;
+ }
+ else
+ {
+ if (loop->num_nodes > 2)
+ return false;
+ }
+#else
if (loop->num_nodes > 2)
return false;
+#endif
return true;
}
@@ -1422,6 +1547,12 @@ determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
if (should_issue_prefetch_p (ref))
{
mod_constraint = ref->prefetch_mod;
+#ifdef FLAG_SW64_PREFETCH
+ /* TODO: mod_constraint is set to 4 empirically; it should be
+ * computed precisely. */
+ if (mod_constraint > upper_bound)
+ mod_constraint = 4;
+#endif
nfactor = least_common_multiple (mod_constraint, factor);
if (nfactor <= upper_bound)
factor = nfactor;
@@ -2022,6 +2153,28 @@ tree_ssa_prefetch_arrays (void)
DECL_IS_NOVOPS (decl) = true;
set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
}
+#ifdef FLAG_SW64_PREFETCH
+ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_SC))
+ {
+ tree type = build_function_type_list (void_type_node, const_ptr_type_node,
+ NULL_TREE);
+ tree decl = add_builtin_function ("__builtin_prefetch_sc", type,
+ BUILT_IN_PREFETCH_SC, BUILT_IN_NORMAL,
+ NULL, NULL_TREE);
+ DECL_IS_NOVOPS (decl) = true;
+ set_builtin_decl (BUILT_IN_PREFETCH_SC, decl, false);
+ }
+ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_TC))
+ {
+ tree type = build_function_type_list (void_type_node, const_ptr_type_node,
+ NULL_TREE);
+ tree decl = add_builtin_function ("__builtin_prefetch_tc", type,
+ BUILT_IN_PREFETCH_TC, BUILT_IN_NORMAL,
+ NULL, NULL_TREE);
+ DECL_IS_NOVOPS (decl) = true;
+ set_builtin_decl (BUILT_IN_PREFETCH_TC, decl, false);
+ }
+#endif
for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
{
--
2.25.1