From bfa51c2eda3e40fdfd64601f5e7df19049a006cd Mon Sep 17 00:00:00 2001
From: swcompiler <lc@wxiat.com>
Date: Mon, 25 Nov 2024 16:33:10 +0800
Subject: [PATCH 03/16] Sw64 Port: add multi-prefetch support for sw64

---
 gcc/builtins.cc               | 161 +++++++++++++++++++++++++++++++++-
 gcc/builtins.def              |   2 +
 gcc/ipa-pure-const.cc         |   2 +
 gcc/opt-functions.awk         |   4 +-
 gcc/params.opt                |  12 +++
 gcc/target-insns.def          |   3 +
 gcc/tree-ssa-loop-prefetch.cc | 155 +++++++++++++++++++++++++++++++-
 7 files changed, 336 insertions(+), 3 deletions(-)

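[ A minimal user-level sketch of the builtins this patch introduces (not
  part of the patch itself; it assumes the sw64 backend supplies the
  prefetch_sc/prefetch_tc insn patterns declared in target-insns.def
  below, and that both builtins mirror __builtin_prefetch, with the
  optional read/write flag defaulting to 0 and locality to 3):

    void
    scale (double *a, long n)
    {
      for (long i = 0; i < n; i++)
        {
          __builtin_prefetch_sc (&a[i + 64], 0, 3);  /* L2: read, locality 3.  */
          __builtin_prefetch_tc (&a[i + 128]);       /* L3: defaults 0, 3.  */
          a[i] *= 2.0;
        }
    }
]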
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 57929a42b..c2589f316 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -109,6 +109,8 @@ static int apply_args_size (void);
 static int apply_result_size (void);
 static rtx result_vector (int, rtx);
 static void expand_builtin_prefetch (tree);
+static void expand_builtin_prefetch_sc (tree);
+static void expand_builtin_prefetch_tc (tree);
 static rtx expand_builtin_apply_args (void);
 static rtx expand_builtin_apply_args_1 (void);
 static rtx expand_builtin_apply (rtx, rtx, rtx);
@@ -1352,6 +1354,156 @@ expand_builtin_prefetch (tree exp)
     emit_insn (op0);
 }
 
+static void
+expand_builtin_prefetch_sc (tree exp)
+{
+  tree arg0, arg1, arg2;
+  int nargs;
+  rtx op0, op1, op2;
+
+  if (!validate_arglist (exp, POINTER_TYPE, 0))
+    return;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+
+  /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to
+   * zero (read) and argument 2 (locality) defaults to 3 (high degree of
+   * locality).  */
+  nargs = call_expr_nargs (exp);
+  if (nargs > 1)
+    arg1 = CALL_EXPR_ARG (exp, 1);
+  else
+    arg1 = integer_zero_node;
+  if (nargs > 2)
+    arg2 = CALL_EXPR_ARG (exp, 2);
+  else
+    arg2 = integer_three_node;
+
+  /* Argument 0 is an address.  */
+  op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
+
+  /* Argument 1 (read/write flag) must be a compile-time constant int.  */
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    {
+      error ("second argument to %<__builtin_prefetch_sc%> must be a constant");
+      arg1 = integer_zero_node;
+    }
+  op1 = expand_normal (arg1);
+  /* Argument 1 must be either zero or one.  */
+  if (INTVAL (op1) != 0 && INTVAL (op1) != 1)
+    {
+      warning (0, "invalid second argument to %<__builtin_prefetch_sc%>;"
+		  " using zero");
+      op1 = const0_rtx;
+    }
+
+  /* Argument 2 (locality) must be a compile-time constant int.  */
+  if (TREE_CODE (arg2) != INTEGER_CST)
+    {
+      error ("third argument to %<__builtin_prefetch_sc%> must be a constant");
+      arg2 = integer_zero_node;
+    }
+  op2 = expand_normal (arg2);
+  /* Argument 2 must be 0, 1, 2, or 3.  */
+  if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+    {
+      warning (
+	0, "invalid third argument to %<__builtin_prefetch_sc%>; using zero");
+      op2 = const0_rtx;
+    }
+
+  if (targetm.have_prefetch ())
+    {
+      class expand_operand ops[3];
+
+      create_address_operand (&ops[0], op0);
+      create_integer_operand (&ops[1], INTVAL (op1));
+      create_integer_operand (&ops[2], INTVAL (op2));
+      if (maybe_expand_insn (targetm.code_for_prefetch_sc, 3, ops))
+	return;
+    }
+
+  /* Don't do anything with direct references to volatile memory, but
+   * generate code to handle other side effects.  */
+  if (!MEM_P (op0) && side_effects_p (op0))
+    emit_insn (op0);
+}
+
+static void
+expand_builtin_prefetch_tc (tree exp)
+{
+  tree arg0, arg1, arg2;
+  int nargs;
+  rtx op0, op1, op2;
+
+  if (!validate_arglist (exp, POINTER_TYPE, 0))
+    return;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+
+  /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to
+   * zero (read) and argument 2 (locality) defaults to 3 (high degree of
+   * locality).  */
+  nargs = call_expr_nargs (exp);
+  if (nargs > 1)
+    arg1 = CALL_EXPR_ARG (exp, 1);
+  else
+    arg1 = integer_zero_node;
+  if (nargs > 2)
+    arg2 = CALL_EXPR_ARG (exp, 2);
+  else
+    arg2 = integer_three_node;
+
+  /* Argument 0 is an address.  */
+  op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
+
+  /* Argument 1 (read/write flag) must be a compile-time constant int.  */
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    {
+      error ("second argument to %<__builtin_prefetch_tc%> must be a constant");
+      arg1 = integer_zero_node;
+    }
+  op1 = expand_normal (arg1);
+  /* Argument 1 must be either zero or one.  */
+  if (INTVAL (op1) != 0 && INTVAL (op1) != 1)
+    {
+      warning (0, "invalid second argument to %<__builtin_prefetch_tc%>;"
+		  " using zero");
+      op1 = const0_rtx;
+    }
+
+  /* Argument 2 (locality) must be a compile-time constant int.  */
+  if (TREE_CODE (arg2) != INTEGER_CST)
+    {
+      error ("third argument to %<__builtin_prefetch_tc%> must be a constant");
+      arg2 = integer_zero_node;
+    }
+  op2 = expand_normal (arg2);
+  /* Argument 2 must be 0, 1, 2, or 3.  */
+  if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+    {
+      warning (0,
+	       "invalid third argument to %<__builtin_prefetch_tc%>; using zero");
+      op2 = const0_rtx;
+    }
+
+  if (targetm.have_prefetch ())
+    {
+      class expand_operand ops[3];
+
+      create_address_operand (&ops[0], op0);
+      create_integer_operand (&ops[1], INTVAL (op1));
+      create_integer_operand (&ops[2], INTVAL (op2));
+      if (maybe_expand_insn (targetm.code_for_prefetch_tc, 3, ops))
+	return;
+    }
+
+  /* Don't do anything with direct references to volatile memory, but
+   * generate code to handle other side effects.  */
+  if (!MEM_P (op0) && side_effects_p (op0))
+    emit_insn (op0);
+}
+
 /* Get a MEM rtx for expression EXP which is the address of an operand
    to be used in a string instruction (cmpstrsi, cpymemsi, ..).  LEN is
    the maximum length of the block of memory that might be accessed or
@@ -7598,7 +7750,12 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
     case BUILT_IN_PREFETCH:
       expand_builtin_prefetch (exp);
       return const0_rtx;
-
+    case BUILT_IN_PREFETCH_SC:
+      expand_builtin_prefetch_sc (exp);
+      return const0_rtx;
+    case BUILT_IN_PREFETCH_TC:
+      expand_builtin_prefetch_tc (exp);
+      return const0_rtx;
     case BUILT_IN_INIT_TRAMPOLINE:
       return expand_builtin_init_trampoline (exp, true);
     case BUILT_IN_INIT_HEAP_TRAMPOLINE:
@@ -10989,6 +11146,8 @@ is_inexpensive_builtin (tree decl)
     case BUILT_IN_LABS:
     case BUILT_IN_LLABS:
     case BUILT_IN_PREFETCH:
+    case BUILT_IN_PREFETCH_SC:
+    case BUILT_IN_PREFETCH_TC:
     case BUILT_IN_ACC_ON_DEVICE:
       return true;
 
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 005976f34..983de293e 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -924,6 +924,8 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C
 DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
 DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
+DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_SC, "prefetch_sc", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
+DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_TC, "prefetch_tc", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
 DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
 DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST)
 DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST)
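[ BT_FN_VOID_CONST_PTR_VAR corresponds to a variadic prototype, so at the
  source level the two new entries declare, roughly (a sketch, matching
  the shape of __builtin_prefetch):

    void __builtin_prefetch_sc (const void *addr, ...);
    void __builtin_prefetch_tc (const void *addr, ...);
]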
diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc
index 2642df91e..89a950966 100644
--- a/gcc/ipa-pure-const.cc
+++ b/gcc/ipa-pure-const.cc
@@ -534,6 +534,8 @@ builtin_safe_for_const_function_p (bool *looping, tree callee)
       *looping = false;
       return true;
     case BUILT_IN_PREFETCH:
+    case BUILT_IN_PREFETCH_SC:
+    case BUILT_IN_PREFETCH_TC:
       *looping = true;
       return true;
     default:
diff --git a/gcc/opt-functions.awk b/gcc/opt-functions.awk
index 2aee0b9f1..0dabde89d 100644
--- a/gcc/opt-functions.awk
+++ b/gcc/opt-functions.awk
@@ -247,6 +247,8 @@ function var_type(flags)
 		return "HOST_WIDE_INT "
 	else if (flag_set_p("UInteger", flags))
 		return "int "
+	else if (flag_set_p("UInteger", flags))
+		return "int "
 	else
 		return "const char *"
 }
@@ -256,7 +258,7 @@ function var_type(flags)
 # type instead of int to save space.
 function var_type_struct(flags)
 {
-	if (flag_set_p("UInteger", flags)) {
+	if (flag_set_p("UInteger", flags)) {
 		if (host_wide_int[var_name(flags)] == "yes")
 			return "HOST_WIDE_INT ";
 		if (flag_set_p("ByteSize", flags))
diff --git a/gcc/params.opt b/gcc/params.opt
index 3ddfaf5b2..5abc8ce82 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -385,6 +385,18 @@ The size of L1 cache.
 Common Joined UInteger Var(param_l2_cache_size) Init(512) Param Optimization
 The size of L2 cache.
 
+-param=pf1=
+Common Joined UInteger Var(PF1) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L1 prefetch delta.
+
+-param=pf2=
+Common Joined UInteger Var(PF2) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L2 prefetch delta.
+
+-param=pf3=
+Common Joined UInteger Var(PF3) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L3 prefetch delta.
+
 -param=large-function-growth=
 Common Joined UInteger Var(param_large_function_growth) Optimization Init(100) Param
 Maximal growth due to inlining of large function (in percent).
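[ The pf1/pf2/pf3 params use an unsigned encoding that the prefetch pass
  decodes into a signed cache-line adjustment (see convert_default_to_sw
  in the tree-ssa-loop-prefetch.cc hunk below): values 0-100 stand for
  themselves and 101-200 stand for -1 through -100.  A standalone sketch
  of that decoding, for illustration only:

    #include <stdio.h>

    /* Mirrors convert_default_to_sw: map the option range 0..200 onto
       the signed range -100..100.  */
    static int
    decode_pf (unsigned int pf_value)
    {
      if (pf_value > 100)
        return 100 - (int) pf_value;
      return (int) pf_value;
    }

    int
    main (void)
    {
      /* Prints 30 -30 -100: e.g. --param pf1=130 moves the L1 prefetch
         address back by 30 cache lines.  */
      printf ("%d %d %d\n", decode_pf (30), decode_pf (130), decode_pf (200));
      return 0;
    }
]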
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index de8c0092f..8b4da8bc4 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -77,6 +77,9 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
 DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (prefetch_sc, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (prefetch_tc, (rtx x0, rtx x1, rtx x2))
+/*********************/
 DEF_TARGET_INSN (probe_stack, (rtx x0))
 DEF_TARGET_INSN (probe_stack_address, (rtx x0))
 DEF_TARGET_INSN (prologue, (void))
diff --git a/gcc/tree-ssa-loop-prefetch.cc b/gcc/tree-ssa-loop-prefetch.cc
index aebd7c920..6aa242260 100644
--- a/gcc/tree-ssa-loop-prefetch.cc
+++ b/gcc/tree-ssa-loop-prefetch.cc
@@ -193,6 +193,9 @@ along with GCC; see the file COPYING3.  If not see
 #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
 #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
 
+#ifdef FLAG_SW64_PREFETCH
+#define L1_CACHE_LINE_SIZE ((unsigned) (param_l1_cache_line_size))
+#endif
 /* We consider a memory access nontemporal if it is not reused sooner than
    after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
    accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
@@ -1057,7 +1060,11 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
 
   /* At most param_simultaneous_prefetches should be running
      at the same time.  */
+#ifdef FLAG_SW64_PREFETCH
+  remaining_prefetch_slots = param_simultaneous_prefetches * 5;
+#else
   remaining_prefetch_slots = param_simultaneous_prefetches;
+#endif
 
   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
@@ -1081,8 +1088,10 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
 	  /* The loop is far from being sufficiently unrolled for this
 	     prefetch.  Do not generate prefetch to avoid many redudant
 	     prefetches.  */
-	  if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
+#ifndef FLAG_SW64_PREFETCH
+	  if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
 	    continue;
+#endif
 
 	  /* If we need to prefetch the reference each PREFETCH_MOD iterations,
 	     and we unroll the loop UNROLL_FACTOR times, we need to insert
@@ -1153,6 +1162,19 @@ estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
   return prefetch_count;
 }
 
+#ifdef FLAG_SW64_PREFETCH
+/* SW needs to adjust the PF values dynamically during prefetching, so a
+ * PF value must be able to represent a negative adjustment.  Since the
+ * options are declared as Common Joined UInteger Var(PFx), this function
+ * converts the unsigned range 0-200 to the signed range -100 to 100.  */
+int
+convert_default_to_sw (unsigned int pf_value)
+{
+  if (pf_value > 100)
+    return 100 - (int) pf_value;
+  return pf_value;
+}
+#endif
 /* Issue prefetches for the reference REF into loop as decided before.
    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
    is the factor by which LOOP was unrolled.  */
@@ -1184,11 +1206,21 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
 
   for (ap = 0; ap < n_prefetches; ap++)
     {
+#ifdef FLAG_SW64_PREFETCH
+      if (flag_sw_prefetch_dc == 1)
+	{
+#endif
       if (cst_and_fits_in_hwi (ref->group->step))
 	{
 	  /* Determine the address to prefetch.  */
+#ifdef FLAG_SW64_PREFETCH
+	  delta = (ahead + ap * ref->prefetch_mod) *
+		  int_cst_value (ref->group->step) * 2
+		  + convert_default_to_sw (PF1) * L1_CACHE_LINE_SIZE;
+#else
 	  delta = (ahead + ap * ref->prefetch_mod) *
 		  int_cst_value (ref->group->step);
+#endif
 	  addr = fold_build_pointer_plus_hwi (addr_base, delta);
 	  addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
 					   NULL, true, GSI_SAME_STMT);
@@ -1220,6 +1252,86 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
   prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
 				3, addr, write_p, local);
   gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
+#ifdef FLAG_SW64_PREFETCH
+      }
+      /* Generate the L2 prefetch if the option is enabled.  */
+      if (flag_sw_prefetch_sc == 1)
+	{
+	  if (cst_and_fits_in_hwi (ref->group->step))
+	    {
+	      delta = (ahead + ap * ref->prefetch_mod) *
+		      int_cst_value (ref->group->step) * 2
+		      + (4 + convert_default_to_sw (PF2)) * L1_CACHE_LINE_SIZE;
+
+	      addr = fold_build_pointer_plus_hwi (addr_base, delta);
+	      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+					       NULL, true, GSI_SAME_STMT);
+	    }
+	  else
+	    {
+	      ahead += (unsigned) (convert_default_to_sw (PF2)
+				   - convert_default_to_sw (PF1));
+	      forward = fold_build2 (MULT_EXPR, sizetype,
+				     fold_convert (sizetype, ref->group->step),
+				     fold_convert (sizetype, size_int (ahead)));
+	      addr = fold_build_pointer_plus (addr_base, forward);
+	      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+					       NULL, true, GSI_SAME_STMT);
+	    }
+
+	  if (addr_base != addr && TREE_CODE (addr_base) == SSA_NAME
+	      && TREE_CODE (addr) == SSA_NAME)
+	    {
+	      duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
+	      if (SSA_NAME_PTR_INFO (addr))
+		mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
+	    }
+
+	  /* Create the L2 prefetch instruction.  */
+	  prefetch
+	    = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_SC),
+				 3, addr, write_p, local);
+	  gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
+	}
+      /* Generate the L3 prefetch if the option is enabled.  */
+      if (flag_sw_prefetch_tc == 1)
+	{
+	  if (cst_and_fits_in_hwi (ref->group->step))
+	    {
+	      delta = (ahead + ap * ref->prefetch_mod) *
+		      int_cst_value (ref->group->step) * 2
+		      + (10 + convert_default_to_sw (PF3)) * L1_CACHE_LINE_SIZE;
+
+	      addr = fold_build_pointer_plus_hwi (addr_base, delta);
+	      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+					       NULL, true, GSI_SAME_STMT);
+	    }
+	  else
+	    {
+	      ahead += (unsigned) (convert_default_to_sw (PF3)
+				   - convert_default_to_sw (PF1));
+	      forward = fold_build2 (MULT_EXPR, sizetype,
+				     fold_convert (sizetype, ref->group->step),
+				     fold_convert (sizetype, size_int (ahead)));
+	      addr = fold_build_pointer_plus (addr_base, forward);
+	      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
+					       NULL, true, GSI_SAME_STMT);
+	    }
+
+	  if (addr_base != addr && TREE_CODE (addr_base) == SSA_NAME
+	      && TREE_CODE (addr) == SSA_NAME)
+	    {
+	      duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
+	      if (SSA_NAME_PTR_INFO (addr))
+		mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
+	    }
+	  /* Create the L3 prefetch instruction.  */
+	  prefetch
+	    = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_TC),
+				 3, addr, write_p, local);
+	  gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
+	}
+#endif
     }
 }
 
@@ -1375,9 +1487,22 @@ should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
      as well; but the unrolling/prefetching is usually more profitable for
      loops consisting of a single basic block, and we want to limit the
      code growth.  */
+#ifdef FLAG_SW64_PREFETCH
+  if (flag_sw_prefetch_unroll == 1)
+    {
+      if (loop->num_nodes > 7)
+	return false;
+    }
+  else
+    {
+      if (loop->num_nodes > 2)
+	return false;
+    }
+#else
   if (loop->num_nodes > 2)
     return false;
 
+#endif
   return true;
 }
 
@@ -1422,6 +1547,12 @@ determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
       if (should_issue_prefetch_p (ref))
 	{
 	  mod_constraint = ref->prefetch_mod;
+#ifdef FLAG_SW64_PREFETCH
+	  /* TODO: mod_constraint is set to 4 from experience, but it
+	     should really be computed precisely.  */
+	  if (mod_constraint > upper_bound)
+	    mod_constraint = 4;
+#endif
 	  nfactor = least_common_multiple (mod_constraint, factor);
 	  if (nfactor <= upper_bound)
 	    factor = nfactor;
@@ -2022,6 +2153,28 @@ tree_ssa_prefetch_arrays (void)
       DECL_IS_NOVOPS (decl) = true;
       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
     }
+#ifdef FLAG_SW64_PREFETCH
+  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_SC))
+    {
+      tree type = build_function_type_list (void_type_node, const_ptr_type_node,
+					    NULL_TREE);
+      tree decl = add_builtin_function ("__builtin_prefetch_sc", type,
+					BUILT_IN_PREFETCH_SC, BUILT_IN_NORMAL,
+					NULL, NULL_TREE);
+      DECL_IS_NOVOPS (decl) = true;
+      set_builtin_decl (BUILT_IN_PREFETCH_SC, decl, false);
+    }
+  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_TC))
+    {
+      tree type = build_function_type_list (void_type_node, const_ptr_type_node,
+					    NULL_TREE);
+      tree decl = add_builtin_function ("__builtin_prefetch_tc", type,
+					BUILT_IN_PREFETCH_TC, BUILT_IN_NORMAL,
+					NULL, NULL_TREE);
+      DECL_IS_NOVOPS (decl) = true;
+      set_builtin_decl (BUILT_IN_PREFETCH_TC, decl, false);
+    }
+#endif
 
   for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
     {
--
2.25.1
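[ Taken together, with -fprefetch-loop-arrays the pass can now emit up to
  three prefetches per reference on sw64: the usual __builtin_prefetch
  for L1 (gated by flag_sw_prefetch_dc), plus __builtin_prefetch_sc and
  __builtin_prefetch_tc for L2/L3, whose deltas sit 4 and 10 cache lines
  (plus the PF2/PF3 adjustments) beyond the L1 delta.  A sketch of the
  shape of the emitted sequence, assuming default params and a 64-byte
  line; illustration only, since the actual deltas depend on the step,
  the lookahead, and the unroll factor:

    /* d is the L1 delta in bytes computed by issue_prefetch_ref.  */
    __builtin_prefetch    ((char *) &a[i] + d, 0, 3);            /* L1  */
    __builtin_prefetch_sc ((char *) &a[i] + d + 4 * 64, 0, 3);   /* L2  */
    __builtin_prefetch_tc ((char *) &a[i] + d + 10 * 64, 0, 3);  /* L3  */
]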