From bfa51c2eda3e40fdfd64601f5e7df19049a006cd Mon Sep 17 00:00:00 2001 From: swcompiler Date: Mon, 25 Nov 2024 16:33:10 +0800 Subject: [PATCH 03/16] Sw64 Port: add multi-prefetch support for sw64 --- gcc/builtins.cc | 161 +++++++++++++++++++++++++++++++++- gcc/builtins.def | 2 + gcc/ipa-pure-const.cc | 2 + gcc/opt-functions.awk | 4 +- gcc/params.opt | 12 +++ gcc/target-insns.def | 3 + gcc/tree-ssa-loop-prefetch.cc | 155 +++++++++++++++++++++++++++++++- 7 files changed, 336 insertions(+), 3 deletions(-) diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 57929a42b..c2589f316 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -109,6 +109,8 @@ static int apply_args_size (void); static int apply_result_size (void); static rtx result_vector (int, rtx); static void expand_builtin_prefetch (tree); +static void expand_builtin_prefetch_sc (tree); +static void expand_builtin_prefetch_tc (tree); static rtx expand_builtin_apply_args (void); static rtx expand_builtin_apply_args_1 (void); static rtx expand_builtin_apply (rtx, rtx, rtx); @@ -1352,6 +1354,156 @@ expand_builtin_prefetch (tree exp) emit_insn (op0); } +static void +expand_builtin_prefetch_sc (tree exp) +{ + tree arg0, arg1, arg2; + int nargs; + rtx op0, op1, op2; + + if (!validate_arglist (exp, POINTER_TYPE, 0)) + return; + + arg0 = CALL_EXPR_ARG (exp, 0); + + /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to + * zero (read) and argument 2 (locality) defaults to 3 (high degree of + * locality). */ + nargs = call_expr_nargs (exp); + if (nargs > 1) + arg1 = CALL_EXPR_ARG (exp, 1); + else + arg1 = integer_zero_node; + if (nargs > 2) + arg2 = CALL_EXPR_ARG (exp, 2); + else + arg2 = integer_three_node; + + /* Argument 0 is an address. */ + op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); + + /* Argument 1 (read/write flag) must be a compile-time constant int. */ + if (TREE_CODE (arg1) != INTEGER_CST) + { + error ("second argument to %<__builtin_prefetch_sc%> must be a constant"); + arg1 = integer_zero_node; + } + op1 = expand_normal (arg1); + /* Argument 1 must be either zero or one. */ + if (INTVAL (op1) != 0 && INTVAL (op1) != 1) + { + warning (0, "invalid second argument to %<__builtin_prefetch_sc%>;" + " using zero"); + op1 = const0_rtx; + } + + /* Argument 2 (locality) must be a compile-time constant int. */ + if (TREE_CODE (arg2) != INTEGER_CST) + { + error ("third argument to %<__builtin_prefetch_sc%> must be a constant"); + arg2 = integer_zero_node; + } + op2 = expand_normal (arg2); + /* Argument 2 must be 0, 1, 2, or 3. */ + if (INTVAL (op2) < 0 || INTVAL (op2) > 3) + { + warning ( + 0, "invalid third argument to %<__builtin_prefetch_sc%>; using zero"); + op2 = const0_rtx; + } + + if (targetm.have_prefetch ()) + { + class expand_operand ops[3]; + + create_address_operand (&ops[0], op0); + create_integer_operand (&ops[1], INTVAL (op1)); + create_integer_operand (&ops[2], INTVAL (op2)); + if (maybe_expand_insn (targetm.code_for_prefetch_sc, 3, ops)) + return; + } + + /* Don't do anything with direct references to volatile memory, but + * generate code to handle other side effects. 
*/ + if (!MEM_P (op0) && side_effects_p (op0)) + emit_insn (op0); +} + +static void +expand_builtin_prefetch_tc (tree exp) +{ + tree arg0, arg1, arg2; + int nargs; + rtx op0, op1, op2; + + if (!validate_arglist (exp, POINTER_TYPE, 0)) + return; + + arg0 = CALL_EXPR_ARG (exp, 0); + + /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to + * zero (read) and argument 2 (locality) defaults to 3 (high degree of + * locality). */ + nargs = call_expr_nargs (exp); + if (nargs > 1) + arg1 = CALL_EXPR_ARG (exp, 1); + else + arg1 = integer_zero_node; + if (nargs > 2) + arg2 = CALL_EXPR_ARG (exp, 2); + else + arg2 = integer_three_node; + + /* Argument 0 is an address. */ + op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); + + /* Argument 1 (read/write flag) must be a compile-time constant int. */ + if (TREE_CODE (arg1) != INTEGER_CST) + { + error ("second argument to %<__builtin_prefetch%> must be a constant"); + arg1 = integer_zero_node; + } + op1 = expand_normal (arg1); + /* Argument 1 must be either zero or one. */ + if (INTVAL (op1) != 0 && INTVAL (op1) != 1) + { + warning (0, "invalid second argument to %<__builtin_prefetch%>;" + " using zero"); + op1 = const0_rtx; + } + + /* Argument 2 (locality) must be a compile-time constant int. */ + if (TREE_CODE (arg2) != INTEGER_CST) + { + error ("third argument to %<__builtin_prefetch%> must be a constant"); + arg2 = integer_zero_node; + } + op2 = expand_normal (arg2); + /* Argument 2 must be 0, 1, 2, or 3. */ + if (INTVAL (op2) < 0 || INTVAL (op2) > 3) + { + warning (0, + "invalid third argument to %<__builtin_prefetch%>; using zero"); + op2 = const0_rtx; + } + + if (targetm.have_prefetch ()) + { + class expand_operand ops[3]; + + create_address_operand (&ops[0], op0); + create_integer_operand (&ops[1], INTVAL (op1)); + create_integer_operand (&ops[2], INTVAL (op2)); + if (maybe_expand_insn (targetm.code_for_prefetch_tc, 3, ops)) + return; + } + + /* Don't do anything with direct references to volatile memory, but + * generate code to handle other side effects. */ + if (!MEM_P (op0) && side_effects_p (op0)) + emit_insn (op0); +} + /* Get a MEM rtx for expression EXP which is the address of an operand to be used in a string instruction (cmpstrsi, cpymemsi, ..). 
LEN is the maximum length of the block of memory that might be accessed or
@@ -7598,7 +7750,12 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
     case BUILT_IN_PREFETCH:
       expand_builtin_prefetch (exp);
       return const0_rtx;
-
+    case BUILT_IN_PREFETCH_SC:
+      expand_builtin_prefetch_sc (exp);
+      return const0_rtx;
+    case BUILT_IN_PREFETCH_TC:
+      expand_builtin_prefetch_tc (exp);
+      return const0_rtx;
     case BUILT_IN_INIT_TRAMPOLINE:
       return expand_builtin_init_trampoline (exp, true);
     case BUILT_IN_INIT_HEAP_TRAMPOLINE:
@@ -10989,6 +11146,8 @@ is_inexpensive_builtin (tree decl)
       case BUILT_IN_LABS:
       case BUILT_IN_LLABS:
       case BUILT_IN_PREFETCH:
+      case BUILT_IN_PREFETCH_SC:
+      case BUILT_IN_PREFETCH_TC:
       case BUILT_IN_ACC_ON_DEVICE:
 	return true;
 
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 005976f34..983de293e 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -924,6 +924,8 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C
 DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
 DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
+DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_SC, "prefetch_sc", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
+DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_TC, "prefetch_tc", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
 DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
 DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST)
 DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST)
diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc
index 2642df91e..89a950966 100644
--- a/gcc/ipa-pure-const.cc
+++ b/gcc/ipa-pure-const.cc
@@ -534,6 +534,8 @@ builtin_safe_for_const_function_p (bool *looping, tree callee)
       *looping = false;
       return true;
     case BUILT_IN_PREFETCH:
+    case BUILT_IN_PREFETCH_SC:
+    case BUILT_IN_PREFETCH_TC:
       *looping = true;
       return true;
     default:
diff --git a/gcc/opt-functions.awk b/gcc/opt-functions.awk
index 2aee0b9f1..0dabde89d 100644
--- a/gcc/opt-functions.awk
+++ b/gcc/opt-functions.awk
@@ -247,6 +247,8 @@ function var_type(flags)
 		return "HOST_WIDE_INT "
 	else if (flag_set_p("UInteger", flags))
 		return "int "
+	else if (flag_set_p("UInteger", flags))
+		return "int "
 	else
 		return "const char *"
 }
@@ -256,7 +258,7 @@ function var_type(flags)
 # type instead of int to save space.
 function var_type_struct(flags)
 {
-	if (flag_set_p("UInteger", flags)) {
+	if (flag_set_p("UInteger", flags)) {
 		if (host_wide_int[var_name(flags)] == "yes")
 			return "HOST_WIDE_INT ";
 		if (flag_set_p("ByteSize", flags))
diff --git a/gcc/params.opt b/gcc/params.opt
index 3ddfaf5b2..5abc8ce82 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -385,6 +385,18 @@ The size of L1 cache.
 Common Joined UInteger Var(param_l2_cache_size) Init(512) Param Optimization
 The size of L2 cache.
 
+-param=pf1=
+Common Joined UInteger Var(PF1) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L1 prefetch delta.
+
+-param=pf2=
+Common Joined UInteger Var(PF2) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L2 prefetch delta.
+
+-param=pf3=
+Common Joined UInteger Var(PF3) Init(0) IntegerRange(0,200) Param Optimization
+The number of cache lines added to the L3 prefetch delta.
+
 -param=large-function-growth=
 Common Joined UInteger Var(param_large_function_growth) Optimization Init(100) Param
 Maximal growth due to inlining of large function (in percent).
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index de8c0092f..8b4da8bc4 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -77,6 +77,9 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
 DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (prefetch_sc, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (prefetch_tc, (rtx x0, rtx x1, rtx x2))
+/*********************/
 DEF_TARGET_INSN (probe_stack, (rtx x0))
 DEF_TARGET_INSN (probe_stack_address, (rtx x0))
 DEF_TARGET_INSN (prologue, (void))
diff --git a/gcc/tree-ssa-loop-prefetch.cc b/gcc/tree-ssa-loop-prefetch.cc
index aebd7c920..6aa242260 100644
--- a/gcc/tree-ssa-loop-prefetch.cc
+++ b/gcc/tree-ssa-loop-prefetch.cc
@@ -193,6 +193,9 @@ along with GCC; see the file COPYING3.  If not see
 #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
 #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
+#ifdef FLAG_SW64_PREFETCH
+#define L1_CACHE_LINE_SIZE ((unsigned) (param_l1_cache_line_size))
+#endif
 
 /* We consider a memory access nontemporal if it is not reused sooner than
    after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
    accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
@@ -1057,7 +1060,11 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
 
   /* At most param_simultaneous_prefetches should be running
      at the same time.  */
+#ifdef FLAG_SW64_PREFETCH
+  remaining_prefetch_slots = param_simultaneous_prefetches * 5;
+#else
   remaining_prefetch_slots = param_simultaneous_prefetches;
+#endif
 
   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
@@ -1081,8 +1088,10 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
       /* The loop is far from being sufficiently unrolled for this
	 prefetch.  Do not generate prefetch to avoid many redudant
	 prefetches.  */
-      if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
+#ifndef FLAG_SW64_PREFETCH
+      if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
 	continue;
+#endif
 
       /* If we need to prefetch the reference each PREFETCH_MOD iterations,
	 and we unroll the loop UNROLL_FACTOR times, we need to insert
@@ -1153,6 +1162,19 @@ estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
   return prefetch_count;
 }
 
+#ifdef FLAG_SW64_PREFETCH
+/* SW64 adjusts the PF values dynamically during prefetching, so they must
+   be able to go negative.  Since the params are declared Common Joined
+   UInteger Var(PFx), map the unsigned range [0, 200] onto [-100, 100]:
+   values above 100 become 100 - value.  */
+int
+convert_default_to_sw (unsigned int pf_value)
+{
+  if (pf_value > 100)
+    return 100 - (int) pf_value;
+  return pf_value;
+}
+#endif
 /* Issue prefetches for the reference REF into loop as decided before.
    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
    is the factor by which LOOP was unrolled.
*/ @@ -1184,11 +1206,21 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) for (ap = 0; ap < n_prefetches; ap++) { +#ifdef FLAG_SW64_PREFETCH + if (flag_sw_prefetch_dc == 1) + { +#endif if (cst_and_fits_in_hwi (ref->group->step)) { /* Determine the address to prefetch. */ +#ifdef FLAG_SW64_PREFETCH + delta = (ahead + ap * ref->prefetch_mod) * + int_cst_value (ref->group->step) * 2 + + convert_default_to_sw (PF1) * L1_CACHE_LINE_SIZE; +#else delta = (ahead + ap * ref->prefetch_mod) * int_cst_value (ref->group->step); +#endif addr = fold_build_pointer_plus_hwi (addr_base, delta); addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL, true, GSI_SAME_STMT); @@ -1220,6 +1252,86 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), 3, addr, write_p, local); gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); +#ifdef FLAG_SW64_PREFETCH + } + /* Generate L2 prefetch if the option is open. */ + if (flag_sw_prefetch_sc == 1) + { + if (cst_and_fits_in_hwi (ref->group->step)) + { + delta = (ahead + ap * ref->prefetch_mod) * + int_cst_value (ref->group->step) * 2 + + (4 + convert_default_to_sw (PF2)) * L1_CACHE_LINE_SIZE; + + addr = fold_build_pointer_plus_hwi (addr_base, delta); + addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + } + else + { + ahead += (unsigned) (convert_default_to_sw (PF2) + - convert_default_to_sw (PF1)); + forward = fold_build2 (MULT_EXPR, sizetype, + fold_convert (sizetype, ref->group->step), + fold_convert (sizetype, size_int (ahead))); + addr = fold_build_pointer_plus (addr_base, forward); + addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + } + + if (addr_base != addr && TREE_CODE (addr_base) == SSA_NAME + && TREE_CODE (addr) == SSA_NAME) + { + duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base)); + if (SSA_NAME_PTR_INFO (addr)) + mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr)); + } + + /* Create the L2 prefetch instruction. */ + prefetch + = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_SC), + 3, addr, write_p, local); + gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); + } + /* Generate L3 prefetch if the option is open. */ + if (flag_sw_prefetch_tc == 1) + { + if (cst_and_fits_in_hwi (ref->group->step)) + { + delta = (ahead + ap * ref->prefetch_mod) * + int_cst_value (ref->group->step) * 2 + + (10 + convert_default_to_sw (PF3)) * L1_CACHE_LINE_SIZE; + + addr = fold_build_pointer_plus_hwi (addr_base, delta); + addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + } + else + { + ahead += (unsigned) (convert_default_to_sw (PF3) + - convert_default_to_sw (PF1)); + forward = fold_build2 (MULT_EXPR, sizetype, + fold_convert (sizetype, ref->group->step), + fold_convert (sizetype, size_int (ahead))); + addr = fold_build_pointer_plus (addr_base, forward); + addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + } + + if (addr_base != addr && TREE_CODE (addr_base) == SSA_NAME + && TREE_CODE (addr) == SSA_NAME) + { + duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base)); + if (SSA_NAME_PTR_INFO (addr)) + mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr)); + } + /* Create the L3 prefetch instruction. 
*/ + prefetch + = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_TC), + 3, addr, write_p, local); + gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); + } +#endif } } @@ -1375,9 +1487,22 @@ should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc, as well; but the unrolling/prefetching is usually more profitable for loops consisting of a single basic block, and we want to limit the code growth. */ +#ifdef FLAG_SW64_PREFETCH + if (flag_sw_prefetch_unroll == 1) + { + if (loop->num_nodes > 7) + return false; + } + else + { + if (loop->num_nodes > 2) + return false; + } +#else if (loop->num_nodes > 2) return false; +#endif return true; } @@ -1422,6 +1547,12 @@ determine_unroll_factor (class loop *loop, struct mem_ref_group *refs, if (should_issue_prefetch_p (ref)) { mod_constraint = ref->prefetch_mod; +#ifdef FLAG_SW64_PREFETCH + /* TODO: mod_constraint is set to 4 by experience, but we should do it + * with precision. */ + if (mod_constraint > upper_bound) + mod_constraint = 4; +#endif nfactor = least_common_multiple (mod_constraint, factor); if (nfactor <= upper_bound) factor = nfactor; @@ -2022,6 +2153,28 @@ tree_ssa_prefetch_arrays (void) DECL_IS_NOVOPS (decl) = true; set_builtin_decl (BUILT_IN_PREFETCH, decl, false); } +#ifdef FLAG_SW64_PREFETCH + if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_SC)) + { + tree type = build_function_type_list (void_type_node, const_ptr_type_node, + NULL_TREE); + tree decl = add_builtin_function ("__builtin_prefetch_sc", type, + BUILT_IN_PREFETCH_SC, BUILT_IN_NORMAL, + NULL, NULL_TREE); + DECL_IS_NOVOPS (decl) = true; + set_builtin_decl (BUILT_IN_PREFETCH_SC, decl, false); + } + if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_TC)) + { + tree type = build_function_type_list (void_type_node, const_ptr_type_node, + NULL_TREE); + tree decl = add_builtin_function ("__builtin_prefetch_tc", type, + BUILT_IN_PREFETCH_TC, BUILT_IN_NORMAL, + NULL, NULL_TREE); + DECL_IS_NOVOPS (decl) = true; + set_builtin_decl (BUILT_IN_PREFETCH_TC, decl, false); + } +#endif for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) { -- 2.25.1
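
Usage sketch, not part of the patch.  The two new builtins keep
__builtin_prefetch's (addr, rw, locality) argument convention: rw defaults
to 0 (read) and locality to 3, as the expand_builtin_prefetch_sc/_tc
expanders above show, and they are only meaningful on a target that
provides the prefetch_sc/prefetch_tc insn patterns (Sw64 here).  The
daxpy kernel and the prefetch distances (16/64/256 elements) below are
illustrative values chosen for this note, not taken from the pass:

#include <stddef.h>

/* Issue L1/L2/L3 prefetch hints ahead of a streaming read.  */
void
daxpy (double *restrict y, const double *restrict x, double a, size_t n)
{
  for (size_t i = 0; i < n; i++)
    {
      __builtin_prefetch (&x[i + 16], 0, 3);   /* L1 hint, read, locality 3.  */
      __builtin_prefetch_sc (&x[i + 64]);      /* L2 hint; rw/locality default to 0/3.  */
      __builtin_prefetch_tc (&x[i + 256]);     /* L3 hint, furthest ahead.  */
      y[i] += a * x[i];
    }
}

For the pass-generated prefetches, the new --param=pf1=/pf2=/pf3= knobs
shift the corresponding prefetch delta in units of L1 cache lines; per
convert_default_to_sw above, a value in [0, 100] is added as-is and a value
in (100, 200] maps to 100 - value, so e.g. -param=pf1=120 subtracts 20
cache lines from the L1 prefetch delta.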