!436 [Sync] Sync from openEuler-24.03-LTS

From: @jubo-run 
Reviewed-by: @huang-xiaoquan 
Signed-off-by: @huang-xiaoquan
This commit is contained in:
openeuler-ci-bot 2024-05-31 02:56:24 +00:00 committed by Gitee
commit 14054239e6
170 changed files with 325236 additions and 9 deletions


@@ -0,0 +1,550 @@
From 72531376df5ed93c2d945469368ba5514eca8407 Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com>
Date: Tue, 5 Dec 2023 15:33:08 +0800
Subject: [PATCH] [AutoBOLT] Support saving feedback count info to ELF segment
1/3
---
gcc/common.opt | 8 +
gcc/final.cc | 405 ++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/opts.cc | 61 ++++++++
3 files changed, 473 insertions(+), 1 deletion(-)
diff --git a/gcc/common.opt b/gcc/common.opt
index b01df919e..e69947fc2 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2546,6 +2546,14 @@ freorder-functions
Common Var(flag_reorder_functions) Optimization
Reorder functions to improve code placement.
+fauto-bolt
+Common Var(flag_auto_bolt)
+Generate profile from AutoFDO or PGO and do BOLT optimization after linkage.
+
+fauto-bolt=
+Common Joined RejectNegative
+Specify the feedback data directory required by BOLT-plugin. The default is the current directory.
+
frerun-cse-after-loop
Common Var(flag_rerun_cse_after_loop) Optimization
Add a common subexpression elimination pass after loop optimizations.
diff --git a/gcc/final.cc b/gcc/final.cc
index a9868861b..d4c4fa08f 100644
--- a/gcc/final.cc
+++ b/gcc/final.cc
@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see
#include "rtl-iter.h"
#include "print-rtl.h"
#include "function-abi.h"
+#include "insn-codes.h"
#include "common/common-target.h"
#ifdef XCOFF_DEBUGGING_INFO
@@ -4266,7 +4267,403 @@ leaf_renumber_regs_insn (rtx in_rtx)
}
}
#endif
-
+
+#define ASM_FDO_SECTION_PREFIX ".text.fdo."
+
+#define ASM_FDO_CALLER_FLAG ".fdo.caller "
+#define ASM_FDO_CALLER_SIZE_FLAG ".fdo.caller.size "
+#define ASM_FDO_CALLER_BIND_FLAG ".fdo.caller.bind"
+
+#define ASM_FDO_CALLEE_FLAG ".fdo.callee"
+
+/* Return the relative offset address of the start instruction of BB,
+   or -1 if the block has no real instruction. */
+
+static int
+get_bb_start_addr (basic_block bb)
+{
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!INSN_P (insn))
+ {
+ continue;
+ }
+ /* The jump target of call is not in this function, so
+ it should be excluded. */
+ if (CALL_P (insn))
+ {
+ return -1;
+ }
+
+ int insn_code = recog_memoized (insn);
+
+ /* The instruction NOP in llvm-bolt belongs to the previous
+ BB, so it needs to be skipped. */
+ if (insn_code != CODE_FOR_nop)
+ {
+ return INSN_ADDRESSES (INSN_UID (insn));
+ }
+ }
+ return -1;
+}
+
+/* Return the relative offset address of the end instruction of BB,
+   or -1 if the block is empty or ends in a call instruction. */
+
+static int
+get_bb_end_addr (basic_block bb)
+{
+ rtx_insn *insn;
+ int num_succs = EDGE_COUNT (bb->succs);
+ FOR_BB_INSNS_REVERSE (bb, insn)
+ {
+ if (!INSN_P (insn))
+ {
+ continue;
+ }
+ /* The jump target of call is not in this function, so
+ it should be excluded. */
+ if (CALL_P (insn))
+ {
+ return -1;
+ }
+ if ((num_succs == 1)
+ || ((num_succs == 2) && any_condjump_p (insn)))
+ {
+ return INSN_ADDRESSES (INSN_UID (insn));
+ }
+ else
+ {
+ return -1;
+ }
+ }
+ return -1;
+}
+
+/* Return the end address of cfun. */
+
+static int
+get_function_end_addr ()
+{
+ rtx_insn *insn = get_last_insn ();
+ for (; insn != get_insns (); insn = PREV_INSN (insn))
+ {
+ if (!INSN_P (insn))
+ {
+ continue;
+ }
+ return INSN_ADDRESSES (INSN_UID (insn));
+ }
+
+ return -1;
+}
+
+/* Return the function profile status string. */
+
+static const char *
+get_function_profile_status ()
+{
+ const char *profile_status[] = {
+ "PROFILE_ABSENT",
+ "PROFILE_GUESSED",
+ "PROFILE_READ",
+ "PROFILE_LAST" /* Last value, used by profile streaming. */
+ };
+
+ return profile_status[profile_status_for_fn (cfun)];
+}
+
+/* Return the count from the feedback data, such as PGO or AFDO. */
+
+inline static gcov_type
+get_fdo_count (profile_count count)
+{
+ return count.quality () >= GUESSED
+ ? count.to_gcov_type () : 0;
+}
+
+/* Return the profile quality string. */
+
+static const char *
+get_fdo_count_quality (profile_count count)
+{
+ const char *profile_quality[] = {
+ "UNINITIALIZED_PROFILE",
+ "GUESSED_LOCAL",
+ "GUESSED_GLOBAL0",
+ "GUESSED_GLOBAL0_ADJUSTED",
+ "GUESSED",
+ "AFDO",
+ "ADJUSTED",
+ "PRECISE"
+ };
+
+ return profile_quality[count.quality ()];
+}
+
+static const char *
+alias_local_functions (const char *fnname)
+{
+ if (TREE_PUBLIC (cfun->decl))
+ {
+ return fnname;
+ }
+ return concat (fnname, "/", lbasename (dump_base_name), NULL);
+}
+
+/* Return function bind type string. */
+
+static const char *
+simple_get_function_bind ()
+{
+ const char *function_bind[] = {
+ "GLOBAL",
+ "WEAK",
+ "LOCAL",
+ "UNKNOWN"
+ };
+
+ if (TREE_PUBLIC (cfun->decl))
+ {
+ if (!(DECL_WEAK (cfun->decl)))
+ {
+ return function_bind[0];
+ }
+ else
+ {
+ return function_bind[1];
+ }
+ }
+ else
+ {
+ return function_bind[2];
+ }
+
+ return function_bind[3];
+}
+
+/* Dump the callee info for each call insn in BB, found via CALL_P (insn). */
+
+static void
+dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count)
+{
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (insn && CALL_P (insn))
+ {
+ tree callee = get_call_fndecl (insn);
+
+ if (callee)
+ {
+ fprintf (asm_out_file, "\t.string \"%x\"\n",
+ INSN_ADDRESSES (INSN_UID (insn)));
+
+ fprintf (asm_out_file, "\t.string \"%s%s\"\n",
+ ASM_FDO_CALLEE_FLAG,
+ alias_local_functions (get_fnname_from_decl (callee)));
+
+ fprintf (asm_out_file,
+ "\t.string \"" HOST_WIDE_INT_PRINT_DEC "\"\n",
+ call_count);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "call: %x --> %s \n",
+ INSN_ADDRESSES (INSN_UID (insn)),
+ alias_local_functions
+ (get_fnname_from_decl (callee)));
+ }
+ }
+ }
+ }
+}
+
+/* Dump the edge info into asm. */
+static void
+dump_edge_jump_info_to_asm (basic_block bb, gcov_type bb_count)
+{
+ edge e;
+ edge_iterator ei;
+ gcov_type edge_total_count = 0;
+
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ gcov_type edge_count = get_fdo_count (e->count ());
+ edge_total_count += edge_count;
+
+ int edge_start_addr = get_bb_end_addr (e->src);
+ int edge_end_addr = get_bb_start_addr (e->dest);
+
+ if (edge_start_addr == -1 || edge_end_addr == -1)
+ {
+ continue;
+ }
+
+ /* This is a reserved assert from the original design. If this
+ assert fires, use the address of the previous instruction
+ as edge_start_addr. */
+ gcc_assert (edge_start_addr != edge_end_addr);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "edge: %x --> %x = (%ld)\n",
+ edge_start_addr, edge_end_addr, edge_count);
+ }
+
+ if (edge_count > 0)
+ {
+ fprintf (asm_out_file, "\t.string \"%x\"\n", edge_start_addr);
+ fprintf (asm_out_file, "\t.string \"%x\"\n", edge_end_addr);
+ fprintf (asm_out_file, "\t.string \"" HOST_WIDE_INT_PRINT_DEC "\"\n",
+ edge_count);
+ }
+ }
+
+ gcov_type call_count = MAX (edge_total_count, bb_count);
+ if (call_count > 0)
+ {
+ dump_direct_callee_info_to_asm (bb, call_count);
+ }
+}
+
+/* Dump the bb info into asm. */
+
+static void
+dump_bb_info_to_asm (basic_block bb, gcov_type bb_count)
+{
+ int bb_start_addr = get_bb_start_addr (bb);
+ if (bb_start_addr != -1)
+ {
+ fprintf (asm_out_file, "\t.string \"%x\"\n", bb_start_addr);
+ fprintf (asm_out_file, "\t.string \"" HOST_WIDE_INT_PRINT_DEC "\"\n",
+ bb_count);
+ }
+}
+
+/* Dump the function info into asm. */
+
+static void
+dump_function_info_to_asm (const char *fnname)
+{
+ fprintf (asm_out_file, "\t.string \"%s%s\"\n",
+ ASM_FDO_CALLER_FLAG, alias_local_functions (fnname));
+ fprintf (asm_out_file, "\t.string \"%s%d\"\n",
+ ASM_FDO_CALLER_SIZE_FLAG, get_function_end_addr ());
+ fprintf (asm_out_file, "\t.string \"%s%s\"\n",
+ ASM_FDO_CALLER_BIND_FLAG, simple_get_function_bind ());
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\n FUNC_NAME: %s\n",
+ alias_local_functions (fnname));
+ fprintf (dump_file, " file: %s\n",
+ dump_base_name);
+ fprintf (dump_file, "profile_status: %s\n",
+ get_function_profile_status ());
+ fprintf (dump_file, " size: %x\n",
+ get_function_end_addr ());
+ fprintf (dump_file, " function_bind: %s\n",
+ simple_get_function_bind ());
+ }
+}
+
+/* Dump the function profile from AutoFDO or PGO to asm. */
+
+static void
+dump_fdo_info_to_asm (const char *fnname)
+{
+ basic_block bb;
+
+ dump_function_info_to_asm (fnname);
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ gcov_type bb_count = get_fdo_count (bb->count);
+ if (bb_count == 0)
+ {
+ continue;
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "BB: %x --> %x = (%ld) [%s]\n",
+ get_bb_start_addr (bb), get_bb_end_addr (bb),
+ bb_count, get_fdo_count_quality (bb->count));
+ }
+
+ if (flag_profile_use)
+ {
+ dump_edge_jump_info_to_asm (bb, bb_count);
+ }
+ else if (flag_auto_profile)
+ {
+ dump_bb_info_to_asm (bb, bb_count);
+ }
+ }
+}
+
+/* When the -fauto-bolt option is turned on, the .text.fdo section
+   will be generated in the *.s file if there is feedback information
+   from PGO or AutoFDO. This section is parsed by the BOLT plugin. */
+
+static void
+dump_profile_to_elf_sections ()
+{
+ if (!flag_function_sections)
+ {
+ error ("-fauto-bolt should work with -ffunction-section");
+ return;
+ }
+ if (!flag_ipa_ra)
+ {
+ error ("-fauto-bolt should work with -fipa-ra");
+ return;
+ }
+ if (flag_align_jumps)
+ {
+ error ("-fauto-bolt is not supported with -falign-jumps");
+ return;
+ }
+ if (flag_align_labels)
+ {
+ error ("-fauto-bolt is not spported with -falign-loops");
+ return;
+ }
+ if (flag_align_loops)
+ {
+ error ("-fauto-bolt is not supported with -falign-loops");
+ return;
+ }
+
+ /* Return if no feedback data. */
+ if (!flag_profile_use && !flag_auto_profile)
+ {
+ error ("-fauto-bolt should use with -profile-use or -fauto-profile");
+ return;
+ }
+
+ /* Avoid empty functions. */
+ if (TREE_CODE (cfun->decl) != FUNCTION_DECL)
+ {
+ return;
+ }
+ int flags = SECTION_DEBUG | SECTION_EXCLUDE;
+ const char *fnname = get_fnname_from_decl (current_function_decl);
+ char *profile_fnname = NULL;
+
+ asprintf (&profile_fnname, "%s%s", ASM_FDO_SECTION_PREFIX, fnname);
+ switch_to_section (get_section (profile_fnname, flags, NULL));
+ dump_fdo_info_to_asm (fnname);
+
+ if (profile_fnname)
+ {
+ free (profile_fnname);
+ profile_fnname = NULL;
+ }
+}
+
/* Turn the RTL into assembly. */
static unsigned int
rest_of_handle_final (void)
@@ -4334,6 +4731,12 @@ rest_of_handle_final (void)
targetm.asm_out.destructor (XEXP (DECL_RTL (current_function_decl), 0),
decl_fini_priority_lookup
(current_function_decl));
+
+ if (flag_auto_bolt)
+ {
+ dump_profile_to_elf_sections ();
+ }
+
return 0;
}
diff --git a/gcc/opts.cc b/gcc/opts.cc
index b868d189e..6d57e7d69 100644
--- a/gcc/opts.cc
+++ b/gcc/opts.cc
@@ -1279,6 +1279,10 @@ finish_options (struct gcc_options *opts, struct gcc_options *opts_set,
if (opts->x_flag_vtable_verify && opts->x_flag_lto)
sorry ("vtable verification is not supported with LTO");
+ /* Currently -fauto-bolt is not supported for LTO. */
+ if (opts->x_flag_auto_bolt && opts->x_flag_lto)
+ sorry ("%<-fauto-bolt%> is not supported with LTO");
+
/* Control IPA optimizations based on different -flive-patching level. */
if (opts->x_flag_live_patching)
control_options_for_live_patching (opts, opts_set,
@@ -1291,6 +1295,58 @@ finish_options (struct gcc_options *opts, struct gcc_options *opts_set,
= (opts->x_flag_unroll_loops
|| opts->x_flag_peel_loops
|| opts->x_optimize >= 3);
+
+ if (opts->x_flag_auto_bolt)
+ {
+ /* Record the function section to facilitate the feedback
+ data storage. */
+ if (!opts->x_flag_function_sections)
+ {
+ inform (loc,
+ "%<-fauto-bolt%> should work with %<-ffunction-sections%>,"
+ " enabling %<-ffunction-sections%>");
+ opts->x_flag_function_sections = true;
+ }
+
+ /* Cancel the internal alignment of the function. The binary
+ optimizer bolt will cancel the internal alignment optimization
+ of the function, so the alignment is meaningless at this time,
+ and if not, it will bring trouble to the calculation of the
+ offset address of the instruction. */
+ if (opts->x_flag_align_jumps)
+ {
+ inform (loc,
+ "%<-fauto-bolt%> should not work with %<-falign-jumps%>,"
+ " disabling %<-falign-jumps%>");
+ opts->x_flag_align_jumps = false;
+ }
+
+ if (opts->x_flag_align_labels)
+ {
+ inform (loc,
+ "%<-fauto-bolt%> should not work with %<-falign-labels%>,"
+ " disabling %<-falign-labels%>");
+ opts->x_flag_align_labels = false;
+ }
+
+ if (opts->x_flag_align_loops)
+ {
+ inform (loc,
+ "%<-fauto-bolt%> should not work with %<-falign-loops%>,"
+ " disabling %<-falign-loops%>");
+ opts->x_flag_align_loops = false;
+ }
+
+ /* When parsing instructions in RTL phase, we need to know
+ the call information of instructions to avoid being optimized. */
+ if (!opts->x_flag_ipa_ra)
+ {
+ inform (loc,
+ "%<-fauto-bolt%> should work with %<-fipa-ra%>,"
+ " enabling %<-fipa-ra%>");
+ opts->x_flag_ipa_ra = true;
+ }
+ }
/* With -fcx-limited-range, we do cheap and quick complex arithmetic. */
if (opts->x_flag_cx_limited_range)
@@ -3226,6 +3282,11 @@ common_handle_option (struct gcc_options *opts,
&opts->x_flag_align_functions,
&opts->x_str_align_functions);
break;
+
+ case OPT_fauto_bolt_:
+ case OPT_fauto_bolt:
+ /* Deferred. */
+ break;
case OPT_ftabstop_:
/* It is documented that we silently ignore silly values. */
--
2.33.0
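
For illustration, the pass above emits one excluded section per function, holding the profile as a flat list of .string literals. A hand-written sketch of what the generated assembly could look like for a function main (addresses and counts are invented for the example):

        .section .text.fdo.main,"e"
        .string ".fdo.caller main"          /* function name (ASM_FDO_CALLER_FLAG) */
        .string ".fdo.caller.size 612"      /* function end address (ASM_FDO_CALLER_SIZE_FLAG) */
        .string ".fdo.caller.bindGLOBAL"    /* binding (ASM_FDO_CALLER_BIND_FLAG, no separator) */
        .string "14"                        /* edge start address, hex */
        .string "28"                        /* edge end address, hex */
        .string "100000"                    /* edge count, decimal */

Because the section is created with SECTION_EXCLUDE, it never reaches the final binary; it exists only for the BOLT plugin to read at link time.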


@@ -0,0 +1,345 @@
From 94242286383a80e6ab83d824a4d7ea23ea311f75 Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com>
Date: Mon, 22 Jan 2024 15:38:24 +0800
Subject: [PATCH] [AutoBOLT] Enable BOLT linker plugin on aarch64 3/3
---
Makefile.def | 10 ++++++++++
configure | 27 ++++++++++++++++++++++++++-
configure.ac | 22 +++++++++++++++++++++-
gcc/config.host | 1 +
gcc/config.in | 13 +++++++++++++
gcc/configure | 10 ++++++++--
gcc/configure.ac | 4 ++++
gcc/gcc.cc | 23 +++++++++++++++++++++++
8 files changed, 106 insertions(+), 4 deletions(-)
diff --git a/Makefile.def b/Makefile.def
index 72d585496..0ba868890 100644
--- a/Makefile.def
+++ b/Makefile.def
@@ -145,6 +145,9 @@ host_modules= { module= gnattools; };
host_modules= { module= lto-plugin; bootstrap=true;
extra_configure_flags='--enable-shared @extra_linker_plugin_flags@ @extra_linker_plugin_configure_flags@';
extra_make_flags='@extra_linker_plugin_flags@'; };
+host_modules= { module= bolt-plugin; bootstrap=true;
+ extra_configure_flags='--enable-shared @extra_linker_plugin_flags@ @extra_linker_plugin_configure_flags@';
+ extra_make_flags='@extra_linker_plugin_flags@'; };
host_modules= { module= libcc1; extra_configure_flags=--enable-shared; };
host_modules= { module= gotools; };
host_modules= { module= libctf; bootstrap=true; };
@@ -349,6 +352,7 @@ dependencies = { module=configure-gcc; on=all-mpfr; };
dependencies = { module=configure-gcc; on=all-mpc; };
dependencies = { module=configure-gcc; on=all-isl; };
dependencies = { module=configure-gcc; on=all-lto-plugin; };
+dependencies = { module=configure-gcc; on=all-bolt-plugin; };
dependencies = { module=configure-gcc; on=all-binutils; };
dependencies = { module=configure-gcc; on=all-gas; };
dependencies = { module=configure-gcc; on=all-ld; };
@@ -374,6 +378,7 @@ dependencies = { module=all-gcc; on=all-libdecnumber; hard=true; };
dependencies = { module=all-gcc; on=all-libiberty; };
dependencies = { module=all-gcc; on=all-fixincludes; };
dependencies = { module=all-gcc; on=all-lto-plugin; };
+dependencies = { module=all-gcc; on=all-bolt-plugin; };
dependencies = { module=all-gcc; on=all-libiconv; };
dependencies = { module=info-gcc; on=all-build-libiberty; };
dependencies = { module=dvi-gcc; on=all-build-libiberty; };
@@ -381,8 +386,10 @@ dependencies = { module=pdf-gcc; on=all-build-libiberty; };
dependencies = { module=html-gcc; on=all-build-libiberty; };
dependencies = { module=install-gcc ; on=install-fixincludes; };
dependencies = { module=install-gcc ; on=install-lto-plugin; };
+dependencies = { module=install-gcc ; on=install-bolt-plugin; };
dependencies = { module=install-strip-gcc ; on=install-strip-fixincludes; };
dependencies = { module=install-strip-gcc ; on=install-strip-lto-plugin; };
+dependencies = { module=install-strip-gcc ; on=install-strip-bolt-plugin; };
dependencies = { module=configure-libcpp; on=configure-libiberty; hard=true; };
dependencies = { module=configure-libcpp; on=configure-intl; };
@@ -401,6 +408,9 @@ dependencies = { module=all-gnattools; on=all-target-libstdc++-v3; };
dependencies = { module=all-lto-plugin; on=all-libiberty; };
dependencies = { module=all-lto-plugin; on=all-libiberty-linker-plugin; };
+dependencies = { module=all-bolt-plugin; on=all-libiberty; };
+dependencies = { module=all-bolt-plugin; on=all-libiberty-linker-plugin; };
+
dependencies = { module=configure-libcc1; on=configure-gcc; };
dependencies = { module=all-libcc1; on=all-gcc; };
diff --git a/configure b/configure
index 5dcaab14a..aff62c464 100755
--- a/configure
+++ b/configure
@@ -826,6 +826,7 @@ with_isl
with_isl_include
with_isl_lib
enable_isl_version_check
+enable_bolt
enable_lto
enable_linker_plugin_configure_flags
enable_linker_plugin_flags
@@ -1550,6 +1551,7 @@ Optional Features:
enable the PGO build
--disable-isl-version-check
disable check for isl version
+ --enable-bolt enable bolt optimization support
--enable-lto enable link time optimization support
--enable-linker-plugin-configure-flags=FLAGS
additional flags for configuring linker plugins
@@ -8564,6 +8566,15 @@ fi
+# Check for BOLT support.
+# Check whether --enable-bolt was given.
+if test "${enable_bolt+set}" = set; then :
+ enableval=$enable_bolt; enable_bolt=$enableval
+else
+ enable_bolt=no; default_enable_bolt=no
+fi
+
+
# Check for LTO support.
# Check whether --enable-lto was given.
if test "${enable_lto+set}" = set; then :
@@ -8593,6 +8604,16 @@ if test $target_elf = yes; then :
# ELF platforms build the lto-plugin always.
build_lto_plugin=yes
+ # ELF platforms can build the bolt-plugin.
+ # BOLT is NOT built by default.
+ case $target in
+ aarch64*-*-linux*)
+ if test $enable_bolt = yes; then :
+ build_bolt_plugin=yes
+ fi
+ ;;
+ esac
+
else
if test x"$default_enable_lto" = x"yes" ; then
case $target in
@@ -8780,6 +8801,10 @@ if test -d ${srcdir}/gcc; then
fi
fi
+ if test "${build_bolt_plugin}" = "yes" ; then
+ configdirs="$configdirs bolt-plugin"
+ fi
+
# If we're building an offloading compiler, add the LTO front end.
if test x"$enable_as_accelerator_for" != x ; then
case ,${enable_languages}, in
@@ -9202,7 +9227,7 @@ fi
extra_host_libiberty_configure_flags=
extra_host_zlib_configure_flags=
case " $configdirs " in
- *" lto-plugin "* | *" libcc1 "*)
+ *" lto-plugin "* | *" libcc1 "* | *" bolt-plugin "*)
# When these are to be built as shared libraries, the same applies to
# libiberty.
extra_host_libiberty_configure_flags=--enable-shared
diff --git a/configure.ac b/configure.ac
index 85977482a..f310d75ca 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1863,6 +1863,12 @@ fi
AC_SUBST(isllibs)
AC_SUBST(islinc)
+# Check for BOLT support.
+AC_ARG_ENABLE(bolt,
+[AS_HELP_STRING([--enable-bolt], [enable bolt optimization support])],
+enable_bolt=$enableval,
+enable_bolt=no; default_enable_bolt=no)
+
# Check for LTO support.
AC_ARG_ENABLE(lto,
[AS_HELP_STRING([--enable-lto], [enable link time optimization support])],
@@ -1871,6 +1877,16 @@ enable_lto=yes; default_enable_lto=yes)
ACX_ELF_TARGET_IFELSE([# ELF platforms build the lto-plugin always.
build_lto_plugin=yes
+
+ # ELF platforms can build the bolt-plugin.
+ # BOLT is NOT built by default.
+ case $target in
+ aarch64*-*-linux*)
+ if test $enable_bolt = yes; then :
+ build_bolt_plugin=yes
+ fi
+ ;;
+ esac
],[if test x"$default_enable_lto" = x"yes" ; then
case $target in
*-apple-darwin[[912]]* | *-cygwin* | *-mingw* | *djgpp*) ;;
@@ -2049,6 +2065,10 @@ if test -d ${srcdir}/gcc; then
fi
fi
+ if test "${build_bolt_plugin}" = "yes" ; then
+ configdirs="$configdirs bolt-plugin"
+ fi
+
# If we're building an offloading compiler, add the LTO front end.
if test x"$enable_as_accelerator_for" != x ; then
case ,${enable_languages}, in
@@ -2457,7 +2477,7 @@ fi
extra_host_libiberty_configure_flags=
extra_host_zlib_configure_flags=
case " $configdirs " in
- *" lto-plugin "* | *" libcc1 "*)
+ *" lto-plugin "* | *" libcc1 "* | *" bolt-plugin "*)
# When these are to be built as shared libraries, the same applies to
# libiberty.
extra_host_libiberty_configure_flags=--enable-shared
diff --git a/gcc/config.host b/gcc/config.host
index 4ca300f11..bf7dcb4cc 100644
--- a/gcc/config.host
+++ b/gcc/config.host
@@ -75,6 +75,7 @@ out_host_hook_obj=host-default.o
host_can_use_collect2=yes
use_long_long_for_widest_fast_int=no
host_lto_plugin_soname=liblto_plugin.so
+host_bolt_plugin_soname=libbolt_plugin.so
# Unsupported hosts list. Generally, only include hosts known to fail here,
# since we allow hosts not listed to be supported generically.
diff --git a/gcc/config.in b/gcc/config.in
index 64c27c9cf..6bb25b25b 100644
--- a/gcc/config.in
+++ b/gcc/config.in
@@ -24,6 +24,13 @@
#endif
+/* Define to the name of the BOLT plugin DSO that must be passed to the
+ linker's -plugin=LIB option. */
+#ifndef USED_FOR_TARGET
+#undef BOLTPLUGINSONAME
+#endif
+
+
/* Define to the root for URLs about GCC changes. */
#ifndef USED_FOR_TARGET
#undef CHANGES_ROOT_URL
@@ -2208,6 +2215,12 @@
#endif
+/* Define which stat syscall is able to handle 64bit inodes. */
+#ifndef USED_FOR_TARGET
+#undef HOST_STAT_FOR_64BIT_INODES
+#endif
+
+
/* Define as const if the declaration of iconv() needs const. */
#ifndef USED_FOR_TARGET
#undef ICONV_CONST
diff --git a/gcc/configure b/gcc/configure
index 98bbf0f85..30f386789 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -13578,6 +13578,12 @@ case $use_collect2 in
esac
+cat >>confdefs.h <<_ACEOF
+#define BOLTPLUGINSONAME "${host_bolt_plugin_soname}"
+_ACEOF
+
+
+
cat >>confdefs.h <<_ACEOF
#define LTOPLUGINSONAME "${host_lto_plugin_soname}"
_ACEOF
@@ -19668,7 +19674,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 19671 "configure"
+#line 19677 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -19774,7 +19780,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
-#line 19777 "configure"
+#line 19783 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
diff --git a/gcc/configure.ac b/gcc/configure.ac
index c74f4b555..dd6cd60f8 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -2531,6 +2531,10 @@ case $use_collect2 in
;;
esac
+AC_DEFINE_UNQUOTED(BOLTPLUGINSONAME,"${host_bolt_plugin_soname}",
+[Define to the name of the BOLT plugin DSO that must be
+ passed to the linker's -plugin=LIB option.])
+
AC_DEFINE_UNQUOTED(LTOPLUGINSONAME,"${host_lto_plugin_soname}",
[Define to the name of the LTO plugin DSO that must be
passed to the linker's -plugin=LIB option.])
diff --git a/gcc/gcc.cc b/gcc/gcc.cc
index fbcc9d033..b0d03430e 100644
--- a/gcc/gcc.cc
+++ b/gcc/gcc.cc
@@ -1156,6 +1156,8 @@ proper position among the other output files. */
%{!fsyntax-only:%{!c:%{!M:%{!MM:%{!E:%{!S:\
%(linker) " \
LINK_PLUGIN_SPEC \
+ "%{fauto-bolt|fauto-bolt=*|fbolt-use|fbolt-use=*: \
+ -plugin %(linker_auto_bolt_plugin_file) }"\
"%{flto|flto=*:%<fcompare-debug*} \
%{flto} %{fno-lto} %{flto=*} %l " LINK_PIE_SPEC \
"%{fuse-ld=*:-fuse-ld=%*} " LINK_COMPRESS_DEBUG_SPEC \
@@ -1210,6 +1212,7 @@ static const char *endfile_spec = ENDFILE_SPEC;
static const char *startfile_spec = STARTFILE_SPEC;
static const char *linker_name_spec = LINKER_NAME;
static const char *linker_plugin_file_spec = "";
+static const char *linker_auto_bolt_plugin_file_spec = "";
static const char *lto_wrapper_spec = "";
static const char *lto_gcc_spec = "";
static const char *post_link_spec = POST_LINK_SPEC;
@@ -1723,6 +1726,8 @@ static struct spec_list static_specs[] =
INIT_STATIC_SPEC ("multilib_reuse", &multilib_reuse),
INIT_STATIC_SPEC ("linker", &linker_name_spec),
INIT_STATIC_SPEC ("linker_plugin_file", &linker_plugin_file_spec),
+ INIT_STATIC_SPEC ("linker_auto_bolt_plugin_file",
+ &linker_auto_bolt_plugin_file_spec),
INIT_STATIC_SPEC ("lto_wrapper", &lto_wrapper_spec),
INIT_STATIC_SPEC ("lto_gcc", &lto_gcc_spec),
INIT_STATIC_SPEC ("post_link", &post_link_spec),
@@ -9118,6 +9123,24 @@ driver::maybe_run_linker (const char *argv0) const
}
#endif
set_static_spec_shared (&lto_gcc_spec, argv0);
+
+ /* Set bolt-plugin. */
+ const char *fauto_bolt = "fauto-bolt";
+ const char *fbolt_use = "fbolt-use";
+ if (switch_matches (fauto_bolt, fauto_bolt + strlen (fauto_bolt), 1)
+ || switch_matches (fbolt_use, fbolt_use + strlen (fbolt_use), 1))
+ {
+ linker_auto_bolt_plugin_file_spec = find_a_file (&exec_prefixes,
+ BOLTPLUGINSONAME, X_OK, false);
+ if (!linker_auto_bolt_plugin_file_spec)
+ {
+ fatal_error (input_location,
+ "-fauto-bolt or -fbolt-use is used, but %s is not found",
+ BOLTPLUGINSONAME);
+
+ }
+ }
+
}
/* Rebuild the COMPILER_PATH and LIBRARY_PATH environment variables
--
2.33.0
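
A hedged end-to-end sketch of how this plumbing is exercised (the source file name and optimization flags are assumptions; --enable-bolt, -fauto-bolt, and the plugin soname come from the patch itself):

        # Build GCC with the BOLT plugin (aarch64-linux hosts only, off by default).
        ../gcc/configure --enable-bolt
        make && make install

        # The driver now appends "-plugin libbolt_plugin.so" to the link line.
        gcc -O2 -fauto-bolt app.c -o app

If libbolt_plugin.so cannot be found in the exec prefixes, the driver stops with the fatal_error added to gcc.cc above instead of silently linking without the plugin.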


@@ -0,0 +1,312 @@
From b020447c840c6e22440a9b9063298a06333fd2f1 Mon Sep 17 00:00:00 2001
From: zhenyu--zhao <zhaozhenyu17@huawei.com>
Date: Sat, 23 Mar 2024 22:56:09 +0800
Subject: [PATCH] [Autofdo] Enable discriminator and MCF algorithm on Autofdo
---
gcc/auto-profile.cc | 171 +++++++++++++++++++++++++++++++++++++++++++-
gcc/cfghooks.cc | 7 ++
gcc/opts.cc | 5 +-
gcc/tree-inline.cc | 14 ++++
4 files changed, 193 insertions(+), 4 deletions(-)
diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 2b34b80b8..f45f0ec66 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -466,6 +466,17 @@ string_table::get_index (const char *name) const
if (name == NULL)
return -1;
string_index_map::const_iterator iter = map_.find (name);
+ /* Function names may be duplicated. Try to distinguish them by the
+ #file_name#function_name form defined by the autofdo tool chain. */
+ if (iter == map_.end ())
+ {
+ char* file_name = get_original_name (lbasename (dump_base_name));
+ char* file_func_name
+ = concat ("#", file_name, "#", name, NULL);
+ iter = map_.find (file_func_name);
+ free (file_name);
+ free (file_func_name);
+ }
if (iter == map_.end ())
return -1;
@@ -654,7 +665,7 @@ function_instance::read_function_instance (function_instance_stack *stack,
for (unsigned i = 0; i < num_pos_counts; i++)
{
- unsigned offset = gcov_read_unsigned () & 0xffff0000;
+ unsigned offset = gcov_read_unsigned ();
unsigned num_targets = gcov_read_unsigned ();
gcov_type count = gcov_read_counter ();
s->pos_counts[offset].count = count;
@@ -733,6 +744,10 @@ autofdo_source_profile::get_count_info (gimple *stmt, count_info *info) const
function_instance *s = get_function_instance_by_inline_stack (stack);
if (s == NULL)
return false;
+ if (s->get_count_info (stack[0].second + stmt->bb->discriminator, info))
+ {
+ return true;
+ }
return s->get_count_info (stack[0].second, info);
}
@@ -1395,6 +1410,66 @@ afdo_propagate (bb_set *annotated_bb)
}
}
+/* Handle the scenario where afdo_propagate () inverts the branch
+   probabilities. E.g.
+   BB_NUM (sample count)
+        BB1 (1000)
+        /       \
+   BB2 (10)   BB3 (0)
+        \       /
+          BB4
+   In afdo_propagate (), the count of BB3 is calculated as
+   COUNT (BB3) = 990 (990 = COUNT (BB1) - COUNT (BB2) = 1000 - 10).
+   In fact, BB3 may be colder than BB2 by sample count.
+   This function allocates the source BB count to each succ BB by
+   sample rate, e.g.
+   BB2_COUNT = BB1_COUNT * (BB2_COUNT / (BB2_COUNT + BB3_COUNT)) */
+
+static void
+afdo_preprocess_bb_count ()
+{
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ if (bb->count.ipa_p () && EDGE_COUNT (bb->succs) > 1
+ && bb->count > profile_count::zero ().afdo ())
+ {
+ basic_block bb1 = EDGE_SUCC (bb, 0)->dest;
+ basic_block bb2 = EDGE_SUCC (bb, 1)->dest;
+ if (single_succ_edge (bb1) && single_succ_edge (bb2)
+ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
+ {
+ gcov_type max_count = 0;
+ gcov_type total_count = 0;
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ if (!e->dest->count.ipa_p ())
+ {
+ continue;
+ }
+ max_count = MAX (max_count, e->dest->count.to_gcov_type ());
+ total_count += e->dest->count.to_gcov_type ();
+ }
+ /* Only when bb_count > max_count * 2 will the branch probability
+ be inverted. */
+ if (max_count > 0 && bb->count.to_gcov_type () > max_count * 2)
+ {
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ gcov_type target_count = bb->count.to_gcov_type ()
+ * e->dest->count.to_gcov_type () / total_count;
+ e->dest->count
+ = profile_count::from_gcov_type
+ (target_count).afdo ();
+ }
+ }
+ }
+ }
+ }
+}
+
/* Propagate counts on control flow graph and calculate branch
probabilities. */
@@ -1420,6 +1495,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb)
}
afdo_find_equiv_class (annotated_bb);
+ afdo_preprocess_bb_count ();
afdo_propagate (annotated_bb);
FOR_EACH_BB_FN (bb, cfun)
@@ -1523,6 +1599,83 @@ afdo_vpt_for_early_inline (stmt_set *promoted_stmts)
return false;
}
+/* Preparation before executing MCF algorithm. */
+
+static void
+afdo_init_mcf ()
+{
+ basic_block bb;
+ edge e;
+ edge_iterator ei;
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\n init calling mcf_smooth_cfg (). \n");
+ }
+
+ /* Step 1: when using MCF, BB ids must be contiguous,
+ so we need compact_blocks (). */
+ compact_blocks ();
+
+ /* Step 2: allocate memory for the MCF input data. */
+ bb_gcov_counts.safe_grow_cleared (cfun->cfg->x_last_basic_block);
+ edge_gcov_counts = new hash_map<edge, gcov_type>;
+
+ /* Step 3: initialize the MCF input data from the CFG. */
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ /* Init BB count for MCF. */
+ bb_gcov_count (bb) = bb->count.to_gcov_type ();
+
+ gcov_type total_count = 0;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ total_count += e->dest->count.to_gcov_type ();
+ }
+
+ /* If there are no samples in the successor blocks, the source
+ BB samples are allocated to each edge by static branch probability. */
+
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ if (total_count == 0)
+ {
+ edge_gcov_count (e) = e->src->count.to_gcov_type ()
+ * e->probability.to_reg_br_prob_base () / REG_BR_PROB_BASE;
+ }
+ else
+ {
+ edge_gcov_count (e) = e->src->count.to_gcov_type ()
+ * e->dest->count.to_gcov_type () / total_count;
+ }
+ }
+ }
+}
+
+
+/* Free the resources used by MCF and reset BB counts from the MCF result.
+   Branch probabilities have been updated in mcf_smooth_cfg (). */
+
+static void
+afdo_process_after_mcf ()
+{
+ basic_block bb;
+ /* Reset BB count from MCF result. */
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ if (bb_gcov_count (bb))
+ {
+ bb->count
+ = profile_count::from_gcov_type (bb_gcov_count (bb)).afdo ();
+ }
+ }
+
+ /* Clean up MCF resource. */
+ bb_gcov_counts.release ();
+ delete edge_gcov_counts;
+ edge_gcov_counts = NULL;
+}
+
/* Annotate auto profile to the control flow graph. Do not annotate value
profile for stmts in PROMOTED_STMTS. */
@@ -1574,8 +1727,20 @@ afdo_annotate_cfg (const stmt_set &promoted_stmts)
afdo_source_profile->mark_annotated (cfun->function_end_locus);
if (max_count > profile_count::zero ())
{
- /* Calculate, propagate count and probability information on CFG. */
- afdo_calculate_branch_prob (&annotated_bb);
+ /* 1 means -fprofile-correction is enabled manually, and the MCF
+ algorithm will be used to calculate counts and probabilities.
+ Otherwise, use the default calculation algorithm. */
+ if (flag_profile_correction == 1)
+ {
+ afdo_init_mcf ();
+ mcf_smooth_cfg ();
+ afdo_process_after_mcf ();
+ }
+ else
+ {
+ /* Calculate, propagate count and probability information on CFG. */
+ afdo_calculate_branch_prob (&annotated_bb);
+ }
}
update_max_bb_count ();
profile_status_for_fn (cfun) = PROFILE_READ;
diff --git a/gcc/cfghooks.cc b/gcc/cfghooks.cc
index c0b7bdcd9..323663010 100644
--- a/gcc/cfghooks.cc
+++ b/gcc/cfghooks.cc
@@ -542,6 +542,9 @@ split_block_1 (basic_block bb, void *i)
return NULL;
new_bb->count = bb->count;
+ /* Copy the discriminator from the original bb to distinguish among
+ several basic blocks that share a common locus, allowing for
+ more accurate autofdo. */
new_bb->discriminator = bb->discriminator;
if (dom_info_available_p (CDI_DOMINATORS))
@@ -1113,6 +1116,10 @@ duplicate_block (basic_block bb, edge e, basic_block after, copy_bb_data *id)
move_block_after (new_bb, after);
new_bb->flags = (bb->flags & ~BB_DUPLICATED);
+ /* Copy the discriminator from the original bb to distinguish among
+ several basic blocks that share a common locus, allowing for
+ more accurate autofdo. */
+ new_bb->discriminator = bb->discriminator;
FOR_EACH_EDGE (s, ei, bb->succs)
{
/* Since we are creating edges from a new block to successors
diff --git a/gcc/opts.cc b/gcc/opts.cc
index 2bba88140..4b4925331 100644
--- a/gcc/opts.cc
+++ b/gcc/opts.cc
@@ -3014,7 +3014,10 @@ common_handle_option (struct gcc_options *opts,
/* FALLTHRU */
case OPT_fauto_profile:
enable_fdo_optimizations (opts, opts_set, value);
- SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, value);
+ /* 2 is special and means flag_profile_correction was turned on
+ by -fauto-profile. */
+ SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction,
+ (value ? 2 : 0));
break;
case OPT_fipa_struct_reorg_:
diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index f892cee3f..f50dbbc52 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -2038,6 +2038,10 @@ copy_bb (copy_body_data *id, basic_block bb,
basic_block_info automatically. */
copy_basic_block = create_basic_block (NULL, (basic_block) prev->aux);
copy_basic_block->count = bb->count.apply_scale (num, den);
+ /* Copy the discriminator from the original bb to distinguish among
+ several basic blocks that share a common locus, allowing for
+ more accurate autofdo. */
+ copy_basic_block->discriminator = bb->discriminator;
copy_gsi = gsi_start_bb (copy_basic_block);
@@ -3058,6 +3062,16 @@ copy_cfg_body (copy_body_data * id,
den += e->count ();
ENTRY_BLOCK_PTR_FOR_FN (cfun)->count = den;
}
+ /* When autofdo uses a PMU event as the sampling unit, the count of
+ ENTRY_BLOCK_PTR_FOR_FN cannot be obtained directly and will
+ be zero. Using it in adjust_for_ipa_scaling would cause the
+ inlined BB count to be incorrectly overestimated. So set den equal
+ to num, which is the count of the source inline BB, to avoid the
+ overestimate. */
+ if (den == profile_count::zero ().afdo ())
+ {
+ den = num;
+ }
profile_count::adjust_for_ipa_scaling (&num, &den);
--
2.33.0
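
With the opts.cc change, flag_profile_correction effectively becomes a tri-state: 0 means off, 1 means the user passed -fprofile-correction explicitly (selecting the MCF smoothing path in afdo_annotate_cfg), and 2 means it was implied by -fauto-profile (keeping the default propagation). A hypothetical pair of invocations exercising both paths (the profile file name is invented):

        gcc -O2 -fauto-profile=app.afdo app.c                        # default propagation (flag == 2)
        gcc -O2 -fauto-profile=app.afdo -fprofile-correction app.c   # MCF smoothing (flag == 1)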


@@ -0,0 +1,194 @@
From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 11:27:43 +0300
Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation
---
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.cc | 15 +++++++++
gcc/config/aarch64/aarch64.opt | 4 +++
gcc/config/aarch64/iterators.md | 3 +-
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ee7f0b89c..82f73805f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6454,6 +6454,54 @@
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
)
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (minus:<V_INT_EQUIV>
+ (ashift:<V_INT_EQUIV>
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+ (match_operand:VDQHSD 4 "half_size_operand"))
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+ [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*arch64_cmlt_tmp<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "#"
+ "&& reload_completed"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+ (set (match_dup 0)
+ (and:<V_INT_EQUIV>
+ (match_dup 0)
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ ""
+ [(set_attr "type" "neon_compare_zero")]
+)
+
(define_insn_and_split "aarch64_cm<optab>di"
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
(neg:DI
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a3da4ca30..04072ca25 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14064,6 +14064,21 @@ cost_minus:
return true;
}
+ /* Detect the aarch64_cmlt_as_arith instruction. Currently only this
+ pattern matches the condition. The costs of the cmlt and sub
+ instructions are comparable, so we do not increase the cost here. */
+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+ && GET_CODE (op1) == AND)
+ {
+ rtx op0_subop0 = XEXP (op0, 0);
+ if (rtx_equal_p (op0_subop0, op1))
+ {
+ rtx lshrt_op = XEXP (op0_subop0, 0);
+ if (GET_CODE (lshrt_op) == LSHIFTRT)
+ return true;
+ }
+ }
+
/* Look for SUB (extended register). */
if (is_a <scalar_int_mode> (mode)
&& aarch64_rtx_arith_op_extract_p (op1))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a64b927e9..101664c7c 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
This option is for use with fstack-protector-strong and not for use in
user-land code.
+mcmlt-arith
+Target Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
TargetVariable
long aarch64_stack_protector_guard_offset = 0
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 26a840d7f..967e6b0b1 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1485,7 +1485,8 @@
(V2DI "2s")])
;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+ (V8HI "16b") (V4SI "8h")
(V2DI "4s")])
;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac..07c14aacb 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -49,6 +49,31 @@
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
+(define_predicate "half_size_minus_one_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
(define_predicate "subreg_lowpart_operator"
(ior (match_code "truncate")
(and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo (). It's inspired by sources of x264 codec. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+ for (unsigned i = 0; i < 4; i++)
+ {
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+ b[i] = (a[i]+s)^s;
+ }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0
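
To see why the combine pattern is safe, note that the replaced arithmetic computes, per 16-bit half of each 32-bit element, an all-ones mask exactly when that half is negative, which is what cmlt ..., #0 returns. A scalar C sketch of the identity for one 32-bit lane (a hand-written illustration, not part of the patch):

        #include <stdint.h>

        /* Arithmetic form matched by *aarch64_cmlt_as_arith: one 32-bit lane
           holding two 16-bit elements.  */
        static uint32_t cmlt_as_arith (uint32_t a)
        {
          uint32_t t = (a >> 15) & 0x00010001u; /* sign bit of each 16-bit half */
          return (t << 16) - t;                 /* 0xFFFF for each negative half */
        }
        /* Equivalent to: for each int16_t element e, e < 0 ? 0xFFFF : 0,
           i.e. a per-element cmlt #0.  */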


@@ -0,0 +1,560 @@
From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 28 Feb 2024 23:05:12 +0800
Subject: [PATCH 02/18] [rtl-ifcvt] introduce rtl ifcvt enhancements
New option: -fifcvt-allow-complicated-cmps allows ifcvt to deal
with complicated cmps like
cmp reg1 (reg2 + reg3)
(can increase compilation time)
New param: -param=ifcvt-allow-register-renaming=[0,1,2]
1 : allows ifcvt to rename registers in the then and else bbs
2 : also allows renaming registers in the condition and else/then bbs
(can increase compilation time and register pressure)
---
gcc/common.opt | 4 +
gcc/ifcvt.cc | 291 +++++++++++++++---
gcc/params.opt | 4 +
.../gcc.c-torture/execute/ifcvt-renaming-1.c | 35 +++
gcc/testsuite/gcc.dg/ifcvt-6.c | 27 ++
5 files changed, 311 insertions(+), 50 deletions(-)
create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c
diff --git a/gcc/common.opt b/gcc/common.opt
index c7c6bc256..aa00fb7b0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3691,4 +3691,8 @@ fipa-ra
Common Var(flag_ipa_ra) Optimization
Use caller save register across calls if possible.
+fifcvt-allow-complicated-cmps
+Common Var(flag_ifcvt_allow_complicated_cmps) Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
; This comment is to ensure we retain the blank line above.
diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 2c1eba312..584db7b55 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
}
/* Don't even try if the comparison operands or the mode of X are weird. */
- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+ if (!flag_ifcvt_allow_complicated_cmps
+ && (cond_complex
+ || !SCALAR_INT_MODE_P (GET_MODE (x))))
return NULL_RTX;
return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
/* Currently support only simple single sets in test_bb. */
if (!sset
|| !noce_operand_ok (SET_DEST (sset))
- || contains_ccmode_rtx_p (SET_DEST (sset))
+ || (!flag_ifcvt_allow_complicated_cmps
+ && contains_ccmode_rtx_p (SET_DEST (sset)))
|| !noce_operand_ok (SET_SRC (sset)))
return false;
@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
in this function. */
static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+ basic_block bb_b,
+ rtx to_rename,
+ bitmap conflict_regs)
{
rtx_insn *a_insn;
bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+ bitmap intersections = BITMAP_ALLOC (&reg_obstack);
df_ref def;
df_ref use;
+ rtx_insn *last_a = last_active_insn (bb_a, FALSE);
FOR_BB_INSNS (bb_a, a_insn)
{
@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_a = single_set (a_insn);
if (!sset_a)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Record all registers that BB_A sets. */
FOR_EACH_INSN_DEF (def, a_insn)
- if (!(to_rename && DF_REF_REG (def) == to_rename))
+ if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a))
bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
}
+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
rtx_insn *b_insn;
-
FOR_BB_INSNS (bb_b, b_insn)
{
if (!active_insn_p (b_insn))
@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_b = single_set (b_insn);
if (!sset_b)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Make sure this is a REG and not some instance
of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
if (MEM_P (SET_DEST (sset_b)))
gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
else if (!REG_P (SET_DEST (sset_b)))
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
- /* If the insn uses a reg set in BB_A return false. */
+ /* If the insn uses a reg set in BB_A, return false
+ or try to collect the register list for renaming. */
FOR_EACH_INSN_USE (use, b_insn)
{
- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+ if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
{
- BITMAP_FREE (bba_sets);
- return false;
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto end_cmove_arith_check_and_fail;
+
+ /* Those regs should be renamed. We can't rename CC reg, but
+ possibly we can provide combined comparison in the future. */
+ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+ goto end_cmove_arith_check_and_fail;
+ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
}
}
-
}
BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
return true;
+
+end_cmove_arith_check_and_fail:
+ BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
+ return false;
}
/* Emit copies of all the active instructions in BB except the last.
@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
return true;
}
+/* This function tries to rename the regs of the condition expression
+   that intersect with the considered bb. The condition expression
+   will be moved down if the optimization is applied, so it is
+   essential that all intersecting registers are renamed; otherwise
+   the transformation can't be applied. Returns true if renaming was
+   successful and the optimization can proceed further. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+ bool success = true;
+ if (bitmap_empty_p (cond_rename_regs))
+ return true;
+ if (param_ifcvt_allow_register_renaming < 2)
+ return false;
+ df_ref use;
+ rtx_insn *cmp_insn = if_info->cond_earliest;
+ /* A jump instruction as the condition is currently unsupported. */
+ if (JUMP_P (cmp_insn))
+ return false;
+ rtx_insn *before_cmp = PREV_INSN (cmp_insn);
+ start_sequence ();
+ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+ FOR_EACH_INSN_USE (use, cmp_insn)
+ {
+ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+ {
+ rtx use_reg = DF_REF_REG (use);
+ rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+ {
+ end_sequence ();
+ return false;
+ }
+ noce_emit_move_insn (tmp, use_reg);
+ }
+ }
+
+ emit_insn (PATTERN (copy_of_cmp));
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+
+ emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+ delete_insn_and_edges (cmp_insn);
+ rtx_insn *insn;
+ FOR_BB_INSNS (cmp_block, insn)
+ df_insn_rescan (insn);
+
+ if_info->cond = noce_get_condition (if_info->jump,
+ &copy_of_cmp,
+ if_info->then_else_reversed);
+ if_info->cond_earliest = copy_of_cmp;
+ if_info->rev_cond = NULL_RTX;
+
+ return success;
+}
+
+/* This function tries to rename regs that intersect with the considered
+   bb. Returns true if the renaming was successful and the optimization
+   can proceed further, false otherwise. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+ if (bitmap_empty_p (rename_regs))
+ return true;
+ rtx_insn *insn;
+ rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+ bool res = true;
+ start_sequence ();
+ FOR_BB_INSNS (test_bb, insn)
+ {
+ if (!active_insn_p (insn))
+ continue;
+ /* Only ssets are supported for now. */
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
+ rtx x = SET_DEST (sset);
+ if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x)))
+ continue;
+ /* No need to rename the dest in the last instruction;
+ it will be renamed anyway. */
+ if (insn == last_insn)
+ continue;
+ machine_mode mode = GET_MODE (x);
+ rtx tmp = gen_reg_rtx (mode);
+ if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+ {
+ gcc_assert (insn != last_insn);
+ /* We could generate an additional move for such a case,
+ but it would increase register pressure.
+ For now just stop the transformation. */
+ rtx result_rtx = SET_DEST (single_set (last_insn));
+ if (REG_P (result_rtx) && (x != result_rtx))
+ {
+ res = false;
+ break;
+ }
+ if (!validate_replace_rtx (x, tmp, insn))
+ gcc_unreachable ();
+ noce_emit_move_insn (tmp, x);
+ }
+ set_used_flags (insn);
+ rtx_insn *rename_candidate;
+ for (rename_candidate = NEXT_INSN (insn);
+ rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
+ rename_candidate = NEXT_INSN (rename_candidate))
+ {
+ if (!reg_overlap_mentioned_p (x, rename_candidate))
+ continue;
+
+ int replace_res = TRUE;
+ if (rename_candidate == last_insn)
+ {
+ validate_replace_src_group (x, tmp, rename_candidate);
+ replace_res = apply_change_group ();
+ }
+ else
+ replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+ gcc_assert (replace_res);
+ set_used_flags (rename_candidate);
+ }
+ set_used_flags (x);
+ set_used_flags (tmp);
+ }
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+ emit_insn_before_setloc (seq, first_active_insn (test_bb),
+ INSN_LOCATION (first_active_insn (test_bb)));
+ FOR_BB_INSNS (test_bb, insn)
+ df_insn_rescan (insn);
+ return res;
+}
+
/* Try more complex cases involving conditional_move. */
static int
@@ -2185,11 +2331,30 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
std::swap (then_bb, else_bb);
}
}
-
+ bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+ bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (then_bb && else_bb
- && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
- || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
- return FALSE;
+ && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
+ if_info->orig_x,
+ then_bb_rename_regs)
+ || !bbs_ok_for_cmove_arith (else_bb, then_bb,
+ if_info->orig_x,
+ else_bb_rename_regs)))
+ {
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+ return FALSE;
+ }
+ bool prepass_renaming = noce_rename_regs_in_bb (then_bb,
+ then_bb_rename_regs)
+ && noce_rename_regs_in_bb (else_bb,
+ else_bb_rename_regs);
+
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+
+ if (!prepass_renaming)
+ return FALSE;
start_sequence ();
@@ -3072,7 +3237,8 @@ noce_operand_ok (const_rtx op)
static bool
bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
- unsigned int *cost, bool *simple_p)
+ unsigned int *cost, bool *simple_p,
+ bitmap cond_rename_regs)
{
if (!test_bb)
return false;
@@ -3112,8 +3278,9 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *prev_last_insn = PREV_INSN (last_insn);
gcc_assert (prev_last_insn);
- /* For now, disallow setting x multiple times in test_bb. */
- if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+ if (REG_P (x)
+ && reg_set_between_p (x, first_insn, prev_last_insn)
+ && param_ifcvt_allow_register_renaming < 1)
return false;
bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
@@ -3125,25 +3292,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *insn;
FOR_BB_INSNS (test_bb, insn)
{
- if (insn != last_insn)
- {
- if (!active_insn_p (insn))
- continue;
+ if (insn == last_insn)
+ continue;
+ if (!active_insn_p (insn))
+ continue;
- if (!insn_valid_noce_process_p (insn, cc))
- goto free_bitmap_and_fail;
+ if (!insn_valid_noce_process_p (insn, cc))
+ goto free_bitmap_and_fail;
- rtx sset = single_set (insn);
- gcc_assert (sset);
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
- if (contains_mem_rtx_p (SET_SRC (sset))
- || !REG_P (SET_DEST (sset))
- || reg_overlap_mentioned_p (SET_DEST (sset), cond))
- goto free_bitmap_and_fail;
+ if (contains_mem_rtx_p (SET_SRC (sset))
+ || !REG_P (SET_DEST (sset)))
+ goto free_bitmap_and_fail;
- potential_cost += pattern_cost (sset, speed_p);
- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+ if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+ {
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto free_bitmap_and_fail;
+ rtx sset_dest = SET_DEST (sset);
+ if (REG_P (sset_dest)
+ && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+ bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+ else
+ goto free_bitmap_and_fail;
}
+ potential_cost += pattern_cost (sset, speed_p);
+ if (SET_DEST (sset) != SET_DEST (last_set))
+ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
}
/* If any of the intermediate results in test_bb are live after test_bb
@@ -3777,15 +3954,29 @@ noce_process_if_block (struct noce_if_info *if_info)
bool speed_p = optimize_bb_for_speed_p (test_bb);
unsigned int then_cost = 0, else_cost = 0;
+ bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
- &if_info->then_simple))
- return false;
+ &if_info->then_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
if (else_bb
&& !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
- &if_info->else_simple))
- return false;
+ &if_info->else_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+ if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+ BITMAP_FREE (cond_rename_regs);
+ cond = if_info->cond;
if (speed_p)
if_info->original_cost += average_cost (then_cost, else_cost,
find_edge (test_bb, then_bb));
@@ -5823,12 +6014,13 @@ if_convert (bool after_combine)
{
basic_block bb;
int pass;
-
if (optimize == 1)
{
df_live_add_problem ();
df_live_set_all_dirty ();
}
+ free_dominance_info (CDI_DOMINATORS);
+ cleanup_cfg (CLEANUP_EXPENSIVE);
/* Record whether we are after combine pass. */
ifcvt_after_combine = after_combine;
@@ -5933,7 +6125,6 @@ rest_of_handle_if_conversion (void)
dump_reg_info (dump_file);
dump_flow_info (dump_file, dump_flags);
}
- cleanup_cfg (CLEANUP_EXPENSIVE);
if_convert (false);
if (num_updated_if_blocks)
/* Get rid of any dead CC-related instructions. */
diff --git a/gcc/params.opt b/gcc/params.opt
index d2196dc68..ba87f820b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -669,6 +669,10 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
-param=max-sched-extend-regions-iters=
Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
The maximum number of iterations through CFG to extend regions.
diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
new file mode 100644
index 000000000..65c4d4140
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
@@ -0,0 +1,35 @@
+
+extern void abort(void);
+
+__attribute__ ((noinline))
+int foo (int x, int y, int z, int a, int b)
+{
+ if (a < 2) {
+ if (a == 0) {
+ if (x - y < 0)
+ x = x - y + z;
+ else
+ x = x - y;
+ }
+ else {
+ if (x + y >= z)
+ x = x + y - z;
+ else
+ x = x + y;
+ }
+ }
+ return x;
+}
+
+int main(void) {
+ if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2
+ abort ();
+ if (foo (50,10,7,0,1) != 40) // x - y = 40
+ abort ();
+ if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8
+ abort ();
+ if (foo (5,10,70,1,1) != 15) // x + y = 15
+ abort ();
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c
new file mode 100644
index 000000000..be9a67b3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-6.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 -fifcvt-allow-complicated-cmps" } */
+
+typedef unsigned int uint16_t;
+
+uint16_t
+foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a,
+ uint16_t b, uint16_t c, uint16_t d) {
+ int i = 1;
+ int j = 1;
+ if (a > b) {
+ j = x;
+ if (b > c)
+ i = y;
+ else
+ i = z;
+ }
+ else {
+ j = y;
+ if (c > d)
+ i = z;
+ }
+ return i * j;
+}
+
+/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */
+
--
2.33.0
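For reference, a minimal sketch (not part of the patch) of the branchless
form that RTL if-conversion aims for once register renaming lets both arms
of foo () in ifcvt-renaming-1.c qualify; all names here are illustrative.

int foo_branchless (int x, int y, int z, int a, int b)
{
  int sub = x - y;
  int add = x + y;
  int r0 = (sub < 0) ? sub + z : sub;    /* the a == 0 arm */
  int r1 = (add >= z) ? add - z : add;   /* the a != 0 arm */
  return (a < 2) ? (a == 0 ? r0 : r1) : x;
}

Each ternary can become a conditional select (csel on AArch64) instead of a
branch, which is what the "7 true changes made" scan in ifcvt-6.c counts.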

View File

@@ -0,0 +1,109 @@
From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 12 Jan 2023 14:52:49 +0300
Subject: [PATCH 03/18] Perform early if-conversion of simple arithmetic
---
gcc/common.opt | 4 ++++
gcc/match.pd | 25 +++++++++++++++++++
gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
3 files changed, 66 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c
diff --git a/gcc/common.opt b/gcc/common.opt
index aa00fb7b0..dac477c04 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1821,6 +1821,10 @@ fif-conversion2
Common Var(flag_if_conversion2) Optimization
Perform conversion of conditional jumps to conditional execution.
+fif-conversion-gimple
+Common Var(flag_if_conversion_gimple) Optimization
+Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
fstack-reuse=
Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
-fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 6f24d5079..3cbaf2a5b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
)
)
)
+
+(if (flag_if_conversion_gimple)
+ (for simple_op (plus minus bit_and bit_ior bit_xor)
+ (simplify
+ (cond @0 (simple_op @1 INTEGER_CST@2) @1)
+ (switch
+ /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
+ (if (integer_onep (@2))
+ (simple_op @1 (convert (convert:boolean_type_node @0))))
+ /* a = cond ? a + powerof2cst : a ->
+ a = a + ((int) cond) << log2 (powerof2cst) */
+ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
+ (with
+ {
+ tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
+ }
+ (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
+ { shift; })
+ )
+ )
+ )
+ )
+ )
+ )
+)
#endif
#if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
new file mode 100644
index 000000000..0f7c87e5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
+
+int test_int (int optimizable_int) {
+ if (optimizable_int > 5)
+ ++optimizable_int;
+ return optimizable_int;
+}
+
+int test_int_pow2 (int optimizable_int_pow2) {
+ if (optimizable_int_pow2 <= 4)
+ optimizable_int_pow2 += 1024;
+ return optimizable_int_pow2;
+}
+
+int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
+ if (not_optimizable_int_non_pow2 == 1)
+ not_optimizable_int_non_pow2 += 513;
+ return not_optimizable_int_non_pow2;
+}
+
+float test_float (float not_optimizable_float) {
+ if (not_optimizable_float > 5)
+ not_optimizable_float += 1;
+ return not_optimizable_float;
+}
+
+/* Expecting if-else block in test_float and test_int_non_pow2 only. */
+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
+
+/* Expecting shifted result only for optimizable_int_pow2. */
+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
--
2.33.0
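A hedged illustration of the match.pd rule above: for a power-of-two
constant, the conditional increment is rewritten into an unconditional add
of the condition bit shifted into place, so no branch remains. The function
name is illustrative only.

int test_int_pow2_branchless (int v)
{
  /* if (v <= 4) v += 1024;  becomes the form below (1024 == 1 << 10),
     which matches the " << 10;" the testcase scans for.  */
  return v + ((int) (v <= 4) << 10);
}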

View File

@@ -0,0 +1,252 @@
From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 24 Jan 2023 16:43:40 +0300
Subject: [PATCH 04/18] Add option to allow matching uaddsub overflow for widen
ops too.
---
gcc/common.opt | 5 ++
gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
gcc/tree-ssa-math-opts.cc | 43 ++++++++--
3 files changed, 184 insertions(+), 7 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c
diff --git a/gcc/common.opt b/gcc/common.opt
index dac477c04..39c90604e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3106,6 +3106,11 @@ freciprocal-math
Common Var(flag_reciprocal_math) SetByCombined Optimization
Same as -fassociative-math for expressions which include division.
+fuaddsub-overflow-match-all
+Common Var(flag_uaddsub_overflow_match_all)
+Match unsigned add/sub overflow even if the target does not support
+the corresponding instruction.
+
; Nonzero means that unsafe floating-point math optimizations are allowed
; for the sake of speed. IEEE compliance is not guaranteed, and operations
; are allowed to assume that their arguments and results are "normal"
diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
new file mode 100644
index 000000000..96c26d308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/uaddsub.c
@@ -0,0 +1,143 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+ uint128_t lo;
+ uint128_t hi;
+} uint256_t;
+
+uint16_t add16 (uint8_t a, uint8_t b)
+{
+ uint8_t tmp = a + b;
+ uint8_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint16_t res = overflow;
+ res <<= 8;
+ res += tmp;
+ return res;
+}
+
+uint32_t add32 (uint16_t a, uint16_t b)
+{
+ uint16_t tmp = a + b;
+ uint16_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint32_t res = overflow;
+ res <<= 16;
+ res += tmp;
+ return res;
+}
+
+uint64_t add64 (uint32_t a, uint32_t b)
+{
+ uint32_t tmp = a + b;
+ uint32_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint64_t res = overflow;
+ res <<= 32;
+ res += tmp;
+ return res;
+}
+
+uint128_t add128 (uint64_t a, uint64_t b)
+{
+ uint64_t tmp = a + b;
+ uint64_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint128_t res = overflow;
+ res <<= 64;
+ res += tmp;
+ return res;
+}
+
+uint256_t add256 (uint128_t a, uint128_t b)
+{
+ uint128_t tmp = a + b;
+ uint128_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint256_t res;
+ res.hi = overflow;
+ res.lo = tmp;
+ return res;
+}
+
+uint16_t sub16 (uint8_t a, uint8_t b)
+{
+ uint8_t tmp = a - b;
+ uint8_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint16_t res = overflow;
+ res <<= 8;
+ res += tmp;
+ return res;
+}
+
+uint32_t sub32 (uint16_t a, uint16_t b)
+{
+ uint16_t tmp = a - b;
+ uint16_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint32_t res = overflow;
+ res <<= 16;
+ res += tmp;
+ return res;
+}
+
+uint64_t sub64 (uint32_t a, uint32_t b)
+{
+ uint32_t tmp = a - b;
+ uint32_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint64_t res = overflow;
+ res <<= 32;
+ res += tmp;
+ return res;
+}
+
+uint128_t sub128 (uint64_t a, uint64_t b)
+{
+ uint64_t tmp = a - b;
+ uint64_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint128_t res = overflow;
+ res <<= 64;
+ res += tmp;
+ return res;
+}
+
+uint256_t sub256 (uint128_t a, uint128_t b)
+{
+ uint128_t tmp = a - b;
+ uint128_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint256_t res;
+ res.hi = overflow;
+ res.lo = tmp;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 232e903b0..55d6ee8ae 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
}
}
+/* Check if the corresponding operation has a wider equivalent on the target. */
+
+static bool
+wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
+{
+ machine_mode wider_mode;
+ FOR_EACH_WIDER_MODE (wider_mode, mode)
+ {
+ machine_mode next_mode;
+ if (optab_handler (op, wider_mode) != CODE_FOR_nothing
+ || (op == smul_optab
+ && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
+ && (find_widening_optab_handler ((unsignedp
+ ? umul_widen_optab
+ : smul_widen_optab),
+ next_mode, mode))))
+ return true;
+ }
+
+ return false;
+}
/* Helper function of match_arith_overflow. For MUL_OVERFLOW, if we have
a check for non-zero like:
@@ -3903,15 +3924,22 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
|| code == MINUS_EXPR
|| code == MULT_EXPR
|| code == BIT_NOT_EXPR);
+ int unsignedp = TYPE_UNSIGNED (type);
if (!INTEGRAL_TYPE_P (type)
- || !TYPE_UNSIGNED (type)
- || has_zero_uses (lhs)
- || (code != PLUS_EXPR
- && code != MULT_EXPR
- && optab_handler (code == MINUS_EXPR ? usubv4_optab : uaddv4_optab,
- TYPE_MODE (type)) == CODE_FOR_nothing))
+ || !unsignedp
+ || has_zero_uses (lhs))
return false;
+ if (code == PLUS_EXPR || code == MINUS_EXPR)
+ {
+ machine_mode mode = TYPE_MODE (type);
+ optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
+ if (optab_handler (op, mode) == CODE_FOR_nothing
+ && (!flag_uaddsub_overflow_match_all
+ || !wider_optab_check_p (op, mode, unsignedp)))
+ return false;
+ }
+
tree rhs1 = gimple_assign_rhs1 (stmt);
tree rhs2 = gimple_assign_rhs2 (stmt);
FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
@@ -3986,7 +4014,8 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
|| (code != MULT_EXPR && (code == BIT_NOT_EXPR ? use_seen : !use_seen))
|| (code == PLUS_EXPR
&& optab_handler (uaddv4_optab,
- TYPE_MODE (type)) == CODE_FOR_nothing)
+ TYPE_MODE (type)) == CODE_FOR_nothing
+ && !flag_uaddsub_overflow_match_all)
|| (code == MULT_EXPR
&& optab_handler (cast_stmt ? mulv4_optab : umulv4_optab,
TYPE_MODE (type)) == CODE_FOR_nothing))
--
2.33.0
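A hedged sketch of what the matcher recognizes: the open-coded carry test
"tmp = a + b; if (tmp < a) overflow = 1;" in add64 () above is the same
computation as the builtin overflow form below, which GCC lowers to the
internal .ADD_OVERFLOW call the testcase scans for.

#include <stdint.h>

uint64_t add64_builtin (uint32_t a, uint32_t b)
{
  uint32_t tmp;
  uint32_t overflow = __builtin_add_overflow (a, b, &tmp) ? 1 : 0;
  return ((uint64_t) overflow << 32) + tmp;
}

With -fuaddsub-overflow-match-all the pattern is accepted even when the
target only provides the overflow instruction in a wider mode, which is
what wider_optab_check_p tests.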

View File

@@ -0,0 +1,488 @@
From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Wed, 25 Jan 2023 15:04:07 +0300
Subject: [PATCH 05/18] Match double sized mul pattern
---
gcc/match.pd | 136 +++++++++++++++++++++
gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++
gcc/tree-ssa-math-opts.cc | 80 ++++++++++++
4 files changed, 419 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c
diff --git a/gcc/match.pd b/gcc/match.pd
index 3cbaf2a5b..61866cb90 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7895,3 +7895,139 @@ and,
== TYPE_UNSIGNED (TREE_TYPE (@3))))
&& single_use (@4)
&& single_use (@5))))
+
+/* Match multiplication with double sized result.
+
+ Consider the following calculations:
+ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
+ * (2^(bit_size/2) * arg1_hi + arg1_lo)
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
+ + arg0_lo * arg1_lo
+
+ The products of the high and low parts fit in bit_size values, so they
+ are placed in the high and low parts of the result respectively.
+
+ The sum of the mixed products may overflow, so we need to detect that.
+ Also it has a bit_size/2 offset, thus it intersects with both high and low
+ parts of result. Overflow detection constant is bit_size/2 due to this.
+
+ With this info:
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+ + 2^(bit_size/2) * middle
+ + 2^bit_size * possible_middle_overflow
+ + arg0_lo * arg1_lo
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
+ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
+ + arg0_lo * arg1_lo
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
+ + possible_middle_overflow)
+ + 2^(bit_size/2) * middle_lo
+ + arg0_lo * arg1_lo
+
+ The last sum can produce overflow for the high result part. With this:
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
+ + possible_res_lo_overflow + middle_hi)
+ + res_lo
+ = res_hi + res_lo
+
+ This formula is too big to fit into one match pattern with all of the
+ combinations of terms inside it. There are many helpers for better code
+ readability.
+
+ The simplification is keyed on res_hi: computing res_lo alone is assumed
+ not to be a practical case for such calculations.
+
+ Overflow handling is done via matching complex calculations:
+ the realpart and imagpart are quite handy here. */
+/* Match low and high parts of the argument. */
+(match (double_size_mul_arg_lo @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1)
+ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
+(match (double_size_mul_arg_hi @0 @1)
+ (rshift @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
+
+/* Match various argument parts products. */
+(match (double_size_mul_lolo @0 @1)
+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_hihi @0 @1)
+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_lohi @0 @1)
+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+
+/* Match complex middle sum. */
+(match (double_size_mul_middle_complex @0 @1)
+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
+ (if (num_imm_uses (@2) == 2)))
+
+/* Match real middle results. */
+(match (double_size_mul_middle @0 @1)
+ (realpart@2 (double_size_mul_middle_complex @0 @1))
+ (if (num_imm_uses (@2) == 2)))
+(match (double_size_mul_middleres_lo @0 @1)
+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3))))
+(match (double_size_mul_middleres_hi @0 @1)
+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3))))
+
+/* Match low result part. */
+/* The number of uses may be < 2 when we are interested in
+ the high part only. */
+(match (double_size_mul_res_lo_complex @0 @1)
+ (IFN_ADD_OVERFLOW:c@2
+ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
+ (if (num_imm_uses (@2) <= 2)))
+(match (double_size_mul_res_lo @0 @1)
+ (realpart (double_size_mul_res_lo_complex @0 @1)))
+
+/* Match overflow terms. */
+(match (double_size_mul_overflow_check_lo @0 @1 @5)
+ (convert@4 (ne@3
+ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
+ (if (single_use (@2) && single_use (@3) && single_use (@4))))
+(match (double_size_mul_overflow_check_hi @0 @1)
+ (lshift@6 (convert@5 (ne@4
+ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
+ INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3) && single_use (@4) && single_use (@5)
+ && single_use (@6))))
+
+/* Match all possible permutations for high result part calculations. */
+(for op1 (double_size_mul_hihi
+ double_size_mul_overflow_check_hi
+ double_size_mul_middleres_hi)
+ op2 (double_size_mul_overflow_check_hi
+ double_size_mul_middleres_hi
+ double_size_mul_hihi)
+ op3 (double_size_mul_middleres_hi
+ double_size_mul_hihi
+ double_size_mul_overflow_check_hi)
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
+ (plus:c@4 (op1:c @0 @1)
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (op1:c @0 @1)
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (op1:c @0 @1)
+ (plus:c@4 (op2:c @0 @1)
+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5)))))
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
new file mode 100644
index 000000000..4d475cc8a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
@@ -0,0 +1,141 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
+ proper overflow detection in some cases. */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+
+uint16_t mul16 (uint8_t a, uint8_t b)
+{
+ uint8_t a_lo = a & 0xF;
+ uint8_t b_lo = b & 0xF;
+ uint8_t a_hi = a >> 4;
+ uint8_t b_hi = b >> 4;
+ uint8_t lolo = a_lo * b_lo;
+ uint8_t lohi = a_lo * b_hi;
+ uint8_t hilo = a_hi * b_lo;
+ uint8_t hihi = a_hi * b_hi;
+ uint8_t middle = hilo + lohi;
+ uint8_t middle_hi = middle >> 4;
+ uint8_t middle_lo = middle << 4;
+ uint8_t res_lo = lolo + middle_lo;
+ uint8_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10 : 0);
+ uint16_t res = ((uint16_t) res_hi) << 8;
+ res += res_lo;
+ return res;
+}
+
+uint32_t mul32 (uint16_t a, uint16_t b)
+{
+ uint16_t a_lo = a & 0xFF;
+ uint16_t b_lo = b & 0xFF;
+ uint16_t a_hi = a >> 8;
+ uint16_t b_hi = b >> 8;
+ uint16_t lolo = a_lo * b_lo;
+ uint16_t lohi = a_lo * b_hi;
+ uint16_t hilo = a_hi * b_lo;
+ uint16_t hihi = a_hi * b_hi;
+ uint16_t middle = hilo + lohi;
+ uint16_t middle_hi = middle >> 8;
+ uint16_t middle_lo = middle << 8;
+ uint16_t res_lo = lolo + middle_lo;
+ uint16_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x100 : 0);
+ uint32_t res = ((uint32_t) res_hi) << 16;
+ res += res_lo;
+ return res;
+}
+
+uint64_t mul64 (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10000 : 0);
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res;
+}
+
+uint128_t mul128 (uint64_t a, uint64_t b)
+{
+ uint64_t a_lo = a & 0xFFFFFFFF;
+ uint64_t b_lo = b & 0xFFFFFFFF;
+ uint64_t a_hi = a >> 32;
+ uint64_t b_hi = b >> 32;
+ uint64_t lolo = a_lo * b_lo;
+ uint64_t lohi = a_lo * b_hi;
+ uint64_t hilo = a_hi * b_lo;
+ uint64_t hihi = a_hi * b_hi;
+ uint64_t middle = hilo + lohi;
+ uint64_t middle_hi = middle >> 32;
+ uint64_t middle_lo = middle << 32;
+ uint64_t res_lo = lolo + middle_lo;
+ uint64_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x100000000 : 0);
+ uint128_t res = ((uint128_t) res_hi) << 64;
+ res += res_lo;
+ return res;
+}
+
+uint64_t mul64_perm (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+ res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res;
+}
+
+uint128_t mul128_perm (uint64_t a, uint64_t b)
+{
+ uint64_t a_lo = a & 0xFFFFFFFF;
+ uint64_t b_lo = b & 0xFFFFFFFF;
+ uint64_t a_hi = a >> 32;
+ uint64_t b_hi = b >> 32;
+ uint64_t lolo = a_lo * b_lo;
+ uint64_t lohi = a_lo * b_hi;
+ uint64_t hilo = a_hi * b_lo;
+ uint64_t hihi = a_hi * b_hi;
+ uint64_t middle = hilo + lohi;
+ uint64_t middle_hi = middle >> 32;
+ uint64_t middle_lo = middle << 32;
+ uint64_t res_lo = lolo + middle_lo;
+ uint64_t res_hi = hihi + middle_hi;
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+ res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
+ uint128_t res = ((uint128_t) res_hi) << 64;
+ res += res_lo;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
new file mode 100644
index 000000000..cc6e5af25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple is required for proper overflow detection
+ in some cases. */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+ uint128_t lo;
+ uint128_t hi;
+} uint256_t;
+
+uint64_t mul64_double_use (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10000 : 0);
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res + lolo;
+}
+
+uint256_t mul256 (uint128_t a, uint128_t b)
+{
+ uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
+ uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
+ uint128_t a_hi = a >> 64;
+ uint128_t b_hi = b >> 64;
+ uint128_t lolo = a_lo * b_lo;
+ uint128_t lohi = a_lo * b_hi;
+ uint128_t hilo = a_hi * b_lo;
+ uint128_t hihi = a_hi * b_hi;
+ uint128_t middle = hilo + lohi;
+ uint128_t middle_hi = middle >> 64;
+ uint128_t middle_lo = middle << 64;
+ uint128_t res_lo = lolo + middle_lo;
+ uint128_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ /* Workaround for the 'constant is too big' warning. */
+ uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
+ overflow_tmp <<= 64;
+ res_hi += overflow_tmp;
+ uint256_t res;
+ res.lo = res_lo;
+ res.hi = res_hi;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 55d6ee8ae..2c06b8a60 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -210,6 +210,9 @@ static struct
/* Number of highpart multiplication ops inserted. */
int highpart_mults_inserted;
+
+ /* Number of optimized double sized multiplications. */
+ int double_sized_mul_optimized;
} widen_mul_stats;
/* The instance of "struct occurrence" representing the highest
@@ -4893,6 +4896,78 @@ optimize_spaceship (gimple *stmt)
}
+/* Pattern matcher for double sized multiplication defined in match.pd. */
+extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
+
+static bool
+convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+ gimple *use_stmt, *complex_res_lo;
+ gimple_stmt_iterator insert_before;
+ imm_use_iterator use_iter;
+ tree match[4]; // arg0, arg1, res_hi, complex_res_lo
+ tree arg0, arg1, widen_mult, new_type, tmp;
+ tree lhs = gimple_assign_lhs (stmt);
+ location_t loc = UNKNOWN_LOCATION;
+ machine_mode mode;
+
+ if (!gimple_double_size_mul_candidate (lhs, match, NULL))
+ return false;
+
+ new_type = build_nonstandard_integer_type (
+ TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
+ mode = TYPE_MODE (new_type);
+
+ /* Return early if the widened multiplication doesn't exist on the target. */
+ if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
+ && !wider_optab_check_p (smul_optab, mode, 1))
+ return false;
+
+ /* Determine the point where the wide multiplication
+ should be inserted. Complex low res is OK since it is required
+ by both high and low part getters, thus it dominates both of them. */
+ complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
+ insert_before = gsi_for_stmt (complex_res_lo);
+ gsi_next (&insert_before);
+
+ /* Create the widen multiplication. */
+ arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
+ arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
+ widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
+ MULT_EXPR, arg0, arg1);
+
+ /* Find the mult low part getter. */
+ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
+ if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
+ break;
+
+ /* Create high and low (if needed) parts extractors. */
+ /* Low part. */
+ if (use_stmt)
+ {
+ loc = gimple_location (use_stmt);
+ tmp = build_and_insert_cast (&insert_before, loc,
+ TREE_TYPE (gimple_get_lhs (use_stmt)),
+ widen_mult);
+ gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
+ NOP_EXPR, tmp);
+ gsi_replace (&insert_before, new_stmt, true);
+ }
+
+ /* High part. */
+ loc = gimple_location (stmt);
+ tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
+ RSHIFT_EXPR, widen_mult,
+ build_int_cst (new_type,
+ TYPE_PRECISION (new_type) / 2));
+ tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
+ gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
+ gsi_replace (gsi, new_stmt, true);
+
+ widen_mul_stats.double_sized_mul_optimized++;
+ return true;
+}
+
/* Find integer multiplications where the operands are extended from
smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
or MULT_HIGHPART_EXPR where appropriate. */
@@ -4987,6 +5062,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
break;
case PLUS_EXPR:
+ if (convert_double_size_mul (&gsi, stmt))
+ break;
+ __attribute__ ((fallthrough));
case MINUS_EXPR:
if (!convert_plusminus_to_widen (&gsi, stmt, code))
match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
@@ -5091,6 +5169,8 @@ pass_optimize_widening_mul::execute (function *fun)
widen_mul_stats.divmod_calls_inserted);
statistics_counter_event (fun, "highpart multiplications inserted",
widen_mul_stats.highpart_mults_inserted);
+ statistics_counter_event (fun, "double sized mul optimized",
+ widen_mul_stats.double_sized_mul_optimized);
return cfg_changed ? TODO_cleanup_cfg : 0;
}
--
2.33.0
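The payoff of the pattern, as a hedged sketch: the whole lolo/lohi/hilo/hihi
schoolbook decomposition in mul64 () above collapses into a single widening
multiply once convert_double_size_mul proves the target (or a wider mode on
it) supports one.

#include <stdint.h>

uint64_t mul64_widen (uint32_t a, uint32_t b)
{
  /* One widening multiply; res_hi is r >> 32, res_lo is (uint32_t) r,
     exactly the two extractors the pass builds.  */
  uint64_t r = (uint64_t) a * b;
  return r;
}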

File diff suppressed because it is too large

View File

@@ -0,0 +1,100 @@
From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Tue, 27 Feb 2024 07:43:57 +0800
Subject: [PATCH 07/18] Port fixes in icp to GCC 12
---
gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++-------
1 file changed, 30 insertions(+), 7 deletions(-)
diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
index 383839189..318535d06 100644
--- a/gcc/ipa-devirt.cc
+++ b/gcc/ipa-devirt.cc
@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map)
if (!map->count (ftype_uid))
return;
type_set* s = (*map)[ftype_uid];
+ if (!s)
+ {
+ fprintf (dump_file, "%d (no set)", ftype_uid);
+ return;
+ }
for (type_set::const_iterator it = s->begin (); it != s->end (); it++)
fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it);
}
@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2)
if (register_ailas_type (type1, type2, ta_map))
analyze_pointees (type1, type2);
}
+ unsigned type1_uid = TYPE_UID (type1);
+ unsigned type2_uid = TYPE_UID (type2);
+ if (type_uid_map->count (type1_uid) == 0)
+ (*type_uid_map)[type1_uid] = type1;
+ if (type_uid_map->count (type2_uid) == 0)
+ (*type_uid_map)[type2_uid] = type2;
+
/* If function and non-function type pointers alias,
the function type is unsafe. */
if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2))
- unsafe_types->insert (TYPE_UID (type1));
+ unsafe_types->insert (type1_uid);
if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1))
- unsafe_types->insert (TYPE_UID (type2));
+ unsafe_types->insert (type2_uid);
/* Try to figure out with pointers to incomplete types. */
if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1)
static void
analyze_global_var (varpool_node *var)
{
- var->get_constructor();
tree decl = var->decl;
- if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl)
- || integer_zerop (DECL_INITIAL (decl)))
+ if (!decl || !DECL_INITIAL (decl))
+ return;
+ var->get_constructor ();
+ if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl))
+ || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK)
return;
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt)
{
rhs = TREE_OPERAND (rhs, 0);
if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST
- || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL)
+ || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL
+ || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL
+ || TREE_CODE (rhs) == RESULT_DECL)
rhs_type = build_pointer_type (TREE_TYPE (rhs));
else if (TREE_CODE (rhs) == COMPONENT_REF)
{
@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt)
gcc_assert (POINTER_TYPE_P (rhs_type));
}
else
- gcc_unreachable();
+ {
+ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
+ get_tree_code_name (TREE_CODE (rhs)));
+ print_gimple_stmt (dump_file, stmt, 0);
+ gcc_unreachable ();
+ }
}
else
rhs_type = TREE_TYPE (rhs);
@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases ()
decl_set *d_set = it1->second;
tree type = (*type_uid_map)[it1->first];
type_set *set = (*fta_map)[it1->first];
+ if (!set)
+ continue;
for (type_set::const_iterator it2 = set->begin ();
it2 != set->end (); it2++)
{
--
2.33.0
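A hedged stand-alone sketch (std:: containers stand in for GCC's internal
map types) of the defensive pattern the fix adds to print_type_set: a key
can be present while its set pointer is null, so the entry must be checked
before iterating.

#include <cstdio>
#include <map>
#include <set>

typedef std::set<int> type_set;
typedef std::map<unsigned, type_set *> type_alias_map;

static void
print_type_set_sketch (FILE *dump, unsigned uid, type_alias_map *map)
{
  if (!map->count (uid))
    return;
  type_set *s = (*map)[uid];
  if (!s)   /* key present, but no set attached */
    {
      fprintf (dump, "%d (no set)", uid);
      return;
    }
  for (type_set::const_iterator it = s->begin (); it != s->end (); ++it)
    fprintf (dump, it == s->begin () ? "%d" : ", %d", *it);
}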

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,378 @@
From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:07:24 +0800
Subject: [PATCH 12/18] Port maxmin patch to GCC 12
---
gcc/config/aarch64/aarch64-simd.md | 256 ++++++++++++++++++++++++++
gcc/config/aarch64/predicates.md | 19 ++
gcc/testsuite/gcc.dg/combine-maxmin.c | 46 +++++
3 files changed, 321 insertions(+)
create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 82f73805f..de92802f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1138,6 +1138,82 @@
[(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
)
+;; Simplify the extension with following truncation for shift+neg operation.
+
+(define_insn_and_split "*aarch64_sshr_neg_v8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 1)
+ (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
+ (match_dup 2)))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (ashiftrt:V8HI
+ (neg:V8HI
+ (match_operand:V8HI 1 "register_operand" "w"))
+ (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
+ {
+ /* Reduce the shift amount to smaller mode. */
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
+ - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
+ operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+ }
+ [(set_attr "type" "multiple")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_operand:V4SI 1 "register_operand" "w"))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_operand:V4SI 3 "register_operand" "w"))
+ (match_dup 2)))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_dup 1))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (set (match_operand:V4SI 3 "register_operand" "=w")
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_dup 3))
+ (match_dup 2)))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (match_dup 1))
+ (truncate:V4HI
+ (match_dup 3))))]
+ ""
+ [(set_attr "type" "multiple")]
+)
+
(define_insn "*aarch64_simd_sra<mode>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(plus:VDQ_I
@@ -1714,6 +1790,26 @@
}
)
+(define_insn "vec_pack_trunc_shifted_<mode>"
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
+ (vec_concat:<VNARROWQ2>
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
+ (match_operand:VQN 2 "half_size_operand" "w")))
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
+ (match_operand:VQN 4 "half_size_operand" "w")))))]
+ "TARGET_SIMD"
+ {
+ if (BYTES_BIG_ENDIAN)
+ return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
+ else
+ return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
+ }
+ [(set_attr "type" "neon_permute<q>")
+ (set_attr "length" "4")]
+)
+
(define_insn "aarch64_shrn<mode>_insn_le"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
@@ -6652,6 +6748,166 @@
[(set_attr "type" "neon_tst<q>")]
)
+;; Simplify the extension with following truncation for cmtst-like operation.
+
+(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (plus:V4HI
+ (truncate:V4HI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
+ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
+ (plus:V4HI
+ (truncate:V4HI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_dup 1)
+ (match_dup 2))
+ (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
+ (match_dup 4)))
+ (match_dup 5))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 6 "register_operand" "=w")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (plus:V8HI
+ (eq:V8HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand" "w")
+ (match_dup 6))
+ (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
+ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
+ {
+ if (can_create_pseudo_p ())
+ {
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
+ operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
+ operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);
+
+ operands[6] = gen_reg_rtx (V8HImode);
+ }
+ else
+ FAIL;
+ }
+ [(set_attr "type" "neon_tst_q")]
+)
+
+;; Three helper definitions that allow combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (neg:V4SI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+ (set (match_operand:V4SI 0 "register_operand" "=w")
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 5)
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+ (set (match_dup 0)
+ (neg:V4SI
+ (eq:V4SI
+ (match_dup 0)
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ {
+ if (can_create_pseudo_p ())
+ operands[5] = gen_reg_rtx (V8HImode);
+ else
+ FAIL;
+ }
+ [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (neg:V4SI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+ (set (match_operand:V4SI 0 "register_operand" "=w")
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 5)
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
+ (set (match_dup 0)
+ (neg:V4SI
+ (eq:V4SI
+ (match_dup 0)
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ {
+ if (can_create_pseudo_p ())
+ operands[5] = gen_reg_rtx (V8HImode);
+ else
+ FAIL;
+ }
+ [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (not:V4SI
+ (match_operand:V4SI 1 "register_operand" "w")))
+ (truncate:V4HI
+ (not:V4SI
+ (match_operand:V4SI 2 "register_operand" "w")))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
+ (not:V4SI
+ (match_dup 1)))
+ (set (match_operand:V4SI 2 "register_operand" "=w")
+ (not:V4SI
+ (match_dup 2)))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (match_dup 1))
+ (truncate:V4HI
+ (match_dup 2))))]
+ ""
+ [(set_attr "type" "multiple")]
+)
+
(define_insn_and_split "aarch64_cmtstdi"
[(set (match_operand:DI 0 "register_operand" "=w,r")
(neg:DI
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 07c14aacb..1b8496c07 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -118,6 +118,25 @@
(match_test "aarch64_simd_valid_immediate (op, NULL,
AARCH64_CHECK_ORR)"))))
+(define_predicate "aarch64_bic_imm_for_maxmin"
+ (match_code "const_vector")
+{
+ if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
+ return false;
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
+ return CONST_INT_P (op)
+ && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
+})
+
+(define_predicate "maxmin_arith_shift_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
(define_predicate "aarch64_reg_or_bic_imm"
(ior (match_operand 0 "register_operand")
(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
new file mode 100755
index 000000000..06bce7029
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -0,0 +1,46 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+
+/* The test checks usage of smax/smin insns for clip evaluation and
+ * uzp1/uzp2 insns for vector element narrowing. It's inspired by
+ * sources of x264 codec. */
+
+typedef unsigned char uint8_t;
+typedef long int intptr_t;
+typedef signed short int int16_t;
+
+static __attribute__((always_inline)) inline uint8_t clip (int x )
+{
+ return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
+}
+
+void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ intptr_t stride, int width, int height, int16_t *buf)
+{
+ const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+ for( int y = 0; y < height; y++ ) {
+ for( int x = -2; x < width+3; x++ ) {
+ int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+ dstv[x] = clip ( (v + 16) >> 5 );
+ buf[x+2] = v + pad;
+ }
+ for( int x = 0; x < width; x++ )
+ dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+ - 32*pad + 512) >> 10);
+ for( int x = 0; x < width; x++ )
+ dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ + 16) >> 5);
+ dsth += stride;
+ dstv += stride;
+ dstc += stride;
+ src += stride;
+ }
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
--
2.33.0
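A hedged brute-force check (not part of the patch) that the bit trick in
clip () above is the saturating clamp the smax/smin patterns implement; it
relies on GCC's arithmetic right shift of negative values.

#include <cassert>

static int clip_minmax (int x)
{
  int lo = x > 0 ? x : 0;       /* smax (x, 0)    */
  return lo < 255 ? lo : 255;   /* smin (lo, 255) */
}

int main ()
{
  for (int x = -100000; x <= 100000; ++x)
    {
      int ref = (x & ~((1 << 8) - 1)) ? (-x) >> 31 & ((1 << 8) - 1) : x;
      assert (ref == clip_minmax (x));
    }
  return 0;
}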

View File

@@ -0,0 +1,239 @@
From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 22 Feb 2024 17:13:27 +0800
Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12
---
gcc/common.opt | 4 +
gcc/match.pd | 104 ++++++++++++++++++++++++
gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++
gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++
gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++--
5 files changed, 151 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c
diff --git a/gcc/common.opt b/gcc/common.opt
index 6c6fabb31..3a5004271 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1846,6 +1846,10 @@ fif-conversion-gimple
Common Var(flag_if_conversion_gimple) Optimization
Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+fconvert-minmax
+Common Var(flag_convert_minmax) Optimization
+Convert saturating clipping to min max.
+
fstack-reuse=
Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
-fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 61866cb90..3a19e93b3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8031,3 +8031,107 @@ and,
(plus:c@4 (op2:c @0 @1)
(plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
(if (single_use (@4) && single_use (@5)))))
+
+/* MinMax pattern matching helpers. More info on the transformation below. */
+
+/* Match (a & 0b11..100..0) pattern. */
+(match (minmax_cmp_arg @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+
+/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
+ This statement is blocking for the transformation of unsigned integers.
+ Do type check here to avoid unnecessary duplications. */
+(match (minmax_sat_arg @0)
+ (rshift (negate @0) INTEGER_CST@1)
+ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
+ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
+
+/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
+ The matched pattern can be described as saturated clipping.
+
+ The pattern supports truncation via both casts and bit_and.
+ Also there are patterns for possible inverted conditions. */
+(if (flag_convert_minmax)
+/* Truncation via casts. Unfortunately convert? cannot be applied here
+ because convert and cond take different number of arguments. */
+ (simplify
+ (convert
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0)))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ (simplify
+ (convert
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (minmax_sat_arg @0))))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (minmax_sat_arg @0)))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ /* Truncation via bit_and with mask. Same concerns on convert? here. */
+ (simplify
+ (convert
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+ (convert? @0)))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+ (convert? @0))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ (simplify
+ (convert
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; }))))))
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
new file mode 100644
index 000000000..859ff7df8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int32_t *restrict a, int32_t *restrict x)
+{
+ for (int i = 0; i < 4; i++)
+ a[i] = ((((-x[i]) >> 31) ^ x[i])
+ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
+}
+
+/* { dg-final { scan-assembler-not {smax\t} } } */
+/* { dg-final { scan-assembler-not {smin\t} } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
new file mode 100644
index 000000000..63d4d85b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int8_t *restrict a, int32_t *restrict x)
+{
+ for (int i = 0; i < 8; i++)
+ a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index 06bce7029..a984fa560 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -1,5 +1,5 @@
/* { dg-do compile { target aarch64-*-* } } */
-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+/* { dg-options "-O3 -fconvert-minmax" } */
/* The test checks usage of smax/smin insns for clip evaluation and
* uzp1/uzp2 insns for vector element narrowing. It's inspired by
@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
{
const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
for( int y = 0; y < height; y++ ) {
+ /* This loop is not being vectorized now. */
for( int x = -2; x < width+3; x++ ) {
int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
dstv[x] = clip ( (v + 16) >> 5 );
buf[x+2] = v + pad;
}
+
+ /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN. */
for( int x = 0; x < width; x++ )
dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
- 32*pad + 512) >> 10);
+
+ /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN. */
for( int x = 0; x < width; x++ )
dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ 16) >> 5);
+
dsth += stride;
dstv += stride;
dstc += stride;
@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
}
}
-/* { dg-final { scan-assembler-times {smax\t} 4 } } */
-/* { dg-final { scan-assembler-times {smin\t} 4 } } */
-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
+/* Max is performed on 0 from signed values, match smax exactly. */
+/* { dg-final { scan-assembler-times {smax\t} 6 } } */
+/* Min is performed on signed val>0 and a mask, min sign doesn't matter. */
+/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
+/* All of the vectorized patterns are expected to be matched. */
+/* { dg-final { scan-assembler-not {cmtst\t} } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
--
2.33.0
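One identity behind the minmax_sat_arg matcher, as a hedged check: for
32-bit int, (-x) >> 31 broadcasts the inverted sign bit, i.e. it is -1
exactly when x > 0 and 0 otherwise (arithmetic shift assumed, INT_MIN
excluded). For unsigned x the shift would yield 0 or 1 rather than an
all-ones mask, which is why the matcher rejects unsigned types.

#include <cassert>

int main ()
{
  for (int x = -1000; x <= 1000; ++x)
    assert (((-x) >> 31) == -(x > 0));
  return 0;
}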

View File

@@ -0,0 +1,65 @@
From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:20:17 +0800
Subject: [PATCH 14/18] Add new pattern to pass the maxmin tests
---
gcc/match.pd | 24 ++++++++++++++++++++++++
gcc/testsuite/gcc.dg/combine-maxmin.c | 2 +-
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/gcc/match.pd b/gcc/match.pd
index 3a19e93b3..aee58e47b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8038,6 +8038,10 @@ and,
(match (minmax_cmp_arg @0 @1)
(bit_and @0 INTEGER_CST@1)
(if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+/* Match ((unsigned) a > 0b0..01..1) pattern. */
+(match (minmax_cmp_arg1 @0 @1)
+ (gt @0 INTEGER_CST@1)
+ (if (wi::popcount (wi::to_widest (@1) + 1) == 1)))
/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
This statement is blocking for the transformation of unsigned integers.
@@ -8095,6 +8099,26 @@ and,
(convert (min (max @0 { integer_zero_node; })
{ mask; })))))
+ (simplify
+ (convert
+ (cond
+ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0)))
+ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0))
+ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+ { mask; })))))
+
/* Truncation via bit_and with mask. Same concerns on convert? here. */
(simplify
(convert
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index a984fa560..5c0c9cc49 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
/* All of the vectorized patterns are expected to be matched. */
/* { dg-final { scan-assembler-not {cmtst\t} } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 2 } } */
--
2.33.0
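A hedged check of the new minmax_cmp_arg1 form: comparing the value as
unsigned against 2^k - 1 is the classic range test, equivalent to the
(x & ~mask) != 0 form matched by minmax_cmp_arg.

#include <cassert>

int main ()
{
  const int mask = (1 << 8) - 1;   /* 255 */
  for (int x = -1000; x <= 1000; ++x)
    assert (((unsigned) x > (unsigned) mask) == ((x & ~mask) != 0));
  return 0;
}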

File diff suppressed because it is too large

View File

@@ -0,0 +1,27 @@
From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Dec 2023 23:31:26 +0800
Subject: [PATCH 16/18] [crypto-accel] add optimization level requirement to
the gate
fix issue (src-openEuler/gcc: I8RRDW)
---
gcc/crypto-accel.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc
index f4e810a6b..e7766a585 100644
--- a/gcc/crypto-accel.cc
+++ b/gcc/crypto-accel.cc
@@ -2391,7 +2391,7 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
- if (flag_crypto_accel_aes <= 0)
+ if (flag_crypto_accel_aes <= 0 || optimize < 1)
return false;
return targetm.get_v16qi_mode
&& targetm.gen_rev32v16qi
--
2.33.0

View File

@@ -0,0 +1,239 @@
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Tue, 12 Mar 2024 23:30:56 +0800
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
vectorization
It takes the minimum of the iteration count and the segment length, which
helps speed up loops with a small number of iterations when only the tail
can be vectorized.
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.cc | 67 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.cc | 14 +++-
5 files changed, 95 insertions(+), 25 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 6176d4790..7e5c119cf 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use a more flexible segment length in runtime alias checks: currently the
+minimum of the iteration number and the vectorization length is used when
+this param is set.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c35..e6ae9e847 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index f643a95b2..9bc5f16ee 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -213,12 +213,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4e615b80b..04e68f621 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case we should remember scalar_loop_iters; this helps
+ to create a flexible aliasing check for a small number of
+ iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0
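
To make the intent concrete: with --param=vect-alias-flexible-segment-len=1 the runtime WAR/WAW test is built twice, once with the VF-based segment length and once with the iteration-count-based one, and the two results are ORed together (the TRUTH_OR_EXPR above). A hedged, self-contained sketch of that shape, with illustrative names rather than GCC internals:

#include <stdbool.h>
#include <stddef.h>

/* The vector loop is safe if EITHER segment-length estimate proves
   the two accesses disjoint; the smaller iteration-based length lets
   short loops pass the check.  */
static bool
accesses_disjoint (ptrdiff_t gap, size_t seg_len_vf, size_t seg_len_iters)
{
  size_t dist = (size_t) (gap < 0 ? -gap : gap);
  return dist >= seg_len_vf || dist >= seg_len_iters;
}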

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,94 @@
From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Fri, 29 Mar 2024 17:15:41 +0800
Subject: [PATCH 2/2] Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7)
---
gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
index 9537e4835..1ceb5137f 100644
--- a/gcc/ipa-prefetch.cc
+++ b/gcc/ipa-prefetch.cc
@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map;
typedef std::map<memref_t *, tree> memref_tree_map;
typedef std::set<gimple *> stmt_set;
+typedef std::set<tree> tree_set;
typedef std::map<tree, tree> tree_map;
tree_memref_map *tm_map;
@@ -1124,8 +1125,21 @@ analyse_loops ()
}
}
+/* Compare memrefs by IDs; helper for qsort. */
+
+static int
+memref_id_cmp (const void *p1, const void *p2)
+{
+ const memref_t *mr1 = *(const memref_t **) p1;
+ const memref_t *mr2 = *(const memref_t **) p2;
+
+ if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id)
+ return 1;
+ return -1;
+}
+
/* Reduce the set filtering out memrefs with the same memory references,
- return the result vector of memrefs. */
+ sort and return the result vector of memrefs. */
static void
reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
vec.safe_push (mr1);
}
}
+ vec.qsort (memref_id_cmp);
if (dump_file)
{
fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ());
@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn)
}
/* Create other new vars. Insert new stmts. */
+ vec<memref_t *> used_mr_vec = vNULL;
for (memref_set::const_iterator it = used_mrs.begin ();
it != used_mrs.end (); it++)
+ used_mr_vec.safe_push (*it);
+ used_mr_vec.qsort (memref_id_cmp);
+
+ for (unsigned int j = 0; j < used_mr_vec.length (); j++)
{
- memref_t *mr = *it;
+ memref_t *mr = used_mr_vec[j];
if (mr == comp_mr)
continue;
gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0,
@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn)
local = integer_three_node;
break;
}
+ tree_set prefetched_addrs;
for (unsigned int j = 0; j < vmrs.length (); j++)
{
memref_t *mr = vmrs[j];
@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn)
tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
if (decl_map->count (addr))
addr = (*decl_map)[addr];
+ if (prefetched_addrs.count (addr))
+ continue;
last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
3, addr, write_p, local);
pcalls.safe_push (last_stmt);
gimple_seq_add_stmt (&stmts, last_stmt);
+ prefetched_addrs.insert (addr);
if (dump_file)
{
fprintf (dump_file, "Insert %d prefetch stmt:\n", j);
--
2.33.0
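
The qsort comparator above never returns 0; since memref IDs are unique that is still a consistent ordering. A hedged standalone sketch of the determinism pattern (illustrative types, not GCC internals):

#include <stdlib.h>

struct rec { unsigned id; };

/* Sort an array of pointers by a unique ID so that iteration order
   over a pointer-keyed set becomes deterministic across runs.  */
static int
rec_id_cmp (const void *p1, const void *p2)
{
  const struct rec *r1 = *(const struct rec *const *) p1;
  const struct rec *r2 = *(const struct rec *const *) p2;
  return (r1->id > r2->id) - (r1->id < r2->id);
}

/* Usage: qsort (recs, n, sizeof (struct rec *), rec_id_cmp);  */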

File diff suppressed because it is too large

@ -0,0 +1,28 @@
From 9dc3df938b9ed2c27498c8548087fee1ce930366 Mon Sep 17 00:00:00 2001
From: Zheng Chenhui <zhengchenhui1@huawei.com>
Date: Tue, 2 Apr 2024 11:08:30 +0800
Subject: [PATCH] [Struct Reorg] Bugfix for structure pointer compression
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 2 ++
1 file changed, 2 insertions(+)
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index fa33f2d35..3922873f3 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7541,9 +7541,11 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void)
if (!type->has_legal_alloc_num)
{
if (current_layout_opt_level & POINTER_COMPRESSION_UNSAFE)
+ {
if (dump_file)
fprintf (dump_file, " has unknown alloc size, but"
" in unsafe mode, so");
+ }
else
{
if (dump_file)
--
2.33.0
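
The two added braces fix a classic dangling-else: before the change, the `else` paired with `if (dump_file)` instead of the pointer-compression condition. A hedged, self-contained illustration:

#include <stdio.h>

static void reject_type (void) { }

static void
check (int unsafe_mode, FILE *dump_file)
{
  if (unsafe_mode)
    {
      /* Without the braces, the `else` below would bind to this
         inner `if`, silently changing control flow.  */
      if (dump_file)
        fprintf (dump_file, " has unknown alloc size, but in unsafe mode, so");
    }
  else
    reject_type ();
}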


@ -0,0 +1,420 @@
From 55c547748af36ffc3f2d5ed154a91fb3fcb8431c Mon Sep 17 00:00:00 2001
From: Mingchuan Wu <wumingchuan1992@foxmail.com>
Date: Thu, 11 Apr 2024 15:49:59 +0800
Subject: [PATCH] [Struct Reorg] Port bugfixes to GCC 12.3.1
Migrated from commits in GCC10.3.1:
https://gitee.com/openeuler/gcc/commit/41af6d361a6d85ef4fce8a8438113d765596afdd
https://gitee.com/openeuler/gcc/commit/25d74b98caeaae881e374924886ee664aa1af5bc
https://gitee.com/openeuler/gcc/commit/b5a3bfe92f96cd0d2224d80ac4eaa80dab1bd6bf
https://gitee.com/openeuler/gcc/commit/708ffe6f132ee39441b66b6ab6b98847d35916b7
https://gitee.com/openeuler/gcc/commit/e875e4e7f3716aa268ffbbf55ee199ec82b6aeba
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 97 ++++++++++---------
gcc/testsuite/gcc.dg/struct/dfe_escape.c | 50 ++++++++++
gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c | 69 +++++++++++++
gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 2 +
gcc/testsuite/gcc.dg/struct/struct_reorg-10.c | 29 ++++++
gcc/testsuite/gcc.dg/struct/struct_reorg-11.c | 16 +++
gcc/testsuite/gcc.dg/struct/struct_reorg-12.c | 26 +++++
7 files changed, 243 insertions(+), 46 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_escape.c
create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c
create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-10.c
create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-11.c
create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-12.c
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index 6a202b4bd..f03d1d875 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -466,10 +466,19 @@ srtype::has_dead_field (void)
unsigned i;
FOR_EACH_VEC_ELT (fields, i, this_field)
{
- if (!(this_field->field_access & READ_FIELD))
- {
- may_dfe = true;
- break;
+ /* Function pointer members are not processed, because DFE
+ does not currently support accurate analysis of function
+ pointers, and we have not identified specific use cases. */
+ if (!(this_field->field_access & READ_FIELD)
+ && !FUNCTION_POINTER_TYPE_P (this_field->fieldtype))
+ {
+ /* Fields with escape risks should not be processed. */
+ if (this_field->type == NULL
+ || (this_field->type->escapes == does_not_escape))
+ {
+ may_dfe = true;
+ break;
+ }
}
}
return may_dfe;
@@ -1032,8 +1041,13 @@ srtype::create_new_type (void)
{
srfield *f = fields[i];
if (current_layout_opt_level & DEAD_FIELD_ELIMINATION
- && !(f->field_access & READ_FIELD))
- continue;
+ && !(f->field_access & READ_FIELD)
+ && !FUNCTION_POINTER_TYPE_P (f->fieldtype))
+ {
+ /* Fields with escape risks should not be processed. */
+ if (f->type == NULL || (f->type->escapes == does_not_escape))
+ continue;
+ }
f->create_new_fields (newtype, newfields, newlast);
}
@@ -3815,9 +3829,17 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other,
if (VOID_POINTER_P (TREE_TYPE (side))
&& TREE_CODE (side) == SSA_NAME)
{
- /* The type is other, the declaration is side. */
- current_function->record_decl (type, side, -1,
- isptrptr (TREE_TYPE (other)) ? TREE_TYPE (other) : NULL);
+ tree inner = SSA_NAME_VAR (side);
+ if (inner)
+ {
+ srdecl *in = find_decl (inner);
+ if (in && !in->type->has_escaped ())
+ {
+ /* The type is other, the declaration is side. */
+ current_function->record_decl (type, side, -1,
+ isptrptr (TREE_TYPE (other)) ? TREE_TYPE (other) : NULL);
+ }
+ }
}
else
/* *_1 = &MEM[(void *)&x + 8B]. */
@@ -3910,6 +3932,12 @@ ipa_struct_reorg::maybe_record_assign (cgraph_node *node, gassign *stmt)
maybe_mark_or_record_other_side (rhs, lhs, stmt);
if (TREE_CODE (lhs) == SSA_NAME)
maybe_mark_or_record_other_side (lhs, rhs, stmt);
+
+ /* Handle missing ARRAY_REF cases. */
+ if (TREE_CODE (lhs) == ARRAY_REF)
+ mark_type_as_escape (TREE_TYPE (lhs), escape_array, stmt);
+ if (TREE_CODE (rhs) == ARRAY_REF)
+ mark_type_as_escape (TREE_TYPE (rhs), escape_array, stmt);
}
}
@@ -5272,8 +5300,11 @@ ipa_struct_reorg::record_accesses (void)
record_function (cnode);
else
{
- tree return_type = TREE_TYPE (TREE_TYPE (cnode->decl));
- mark_type_as_escape (return_type, escape_return, NULL);
+ if (cnode->externally_visible)
+ {
+ tree return_type = TREE_TYPE (TREE_TYPE (cnode->decl));
+ mark_type_as_escape (return_type, escape_return, NULL);
+ }
}
}
@@ -5889,6 +5920,7 @@ ipa_struct_reorg::rewrite_expr (tree expr,
bool escape_from_base = false;
tree newbase[max_split];
+ memset (newbase, 0, sizeof (tree[max_split]));
memset (newexpr, 0, sizeof (tree[max_split]));
if (TREE_CODE (expr) == CONSTRUCTOR)
@@ -6912,7 +6944,7 @@ create_bb_for_group_diff_ne_0 (basic_block new_bb, tree &phi, tree ptr,
}
tree
-ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt,
+ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt ATTRIBUTE_UNUSED,
gimple_stmt_iterator *gsi,
tree ptr, tree offset,
srtype *type)
@@ -7889,41 +7921,14 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt,
should be removed. */
bool
-ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *)
+ipa_struct_reorg::rewrite_debug (gimple *, gimple_stmt_iterator *)
{
- if (current_layout_opt_level >= STRUCT_REORDER_FIELDS)
- /* Delete debug gimple now. */
- return true;
- bool remove = false;
- if (gimple_debug_bind_p (stmt))
- {
- tree var = gimple_debug_bind_get_var (stmt);
- tree newvar[max_split];
- if (rewrite_expr (var, newvar, true))
- remove = true;
- if (gimple_debug_bind_has_value_p (stmt))
- {
- var = gimple_debug_bind_get_value (stmt);
- if (TREE_CODE (var) == POINTER_PLUS_EXPR)
- var = TREE_OPERAND (var, 0);
- if (rewrite_expr (var, newvar, true))
- remove = true;
- }
- }
- else if (gimple_debug_source_bind_p (stmt))
- {
- tree var = gimple_debug_source_bind_get_var (stmt);
- tree newvar[max_split];
- if (rewrite_expr (var, newvar, true))
- remove = true;
- var = gimple_debug_source_bind_get_value (stmt);
- if (TREE_CODE (var) == POINTER_PLUS_EXPR)
- var = TREE_OPERAND (var, 0);
- if (rewrite_expr (var, newvar, true))
- remove = true;
- }
-
- return remove;
+ /* In debug statements, there might be some statements that have
+ been optimized out in gimple but left in debug gimple. Sometimes
+ these statements need to be analyzed for escape, but that should
+ not happen at the rewrite stage. Handling them would take great
+ care for little benefit, so we just delete debug gimple. */
+ return true;
}
/* Rewrite PHI nodes, return true if the PHI was replaced. */
diff --git a/gcc/testsuite/gcc.dg/struct/dfe_escape.c b/gcc/testsuite/gcc.dg/struct/dfe_escape.c
new file mode 100644
index 000000000..09efe8027
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/struct/dfe_escape.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef struct arc arc_t;
+typedef struct arc *arc_p;
+
+typedef struct network
+{
+ int x;
+} network_t;
+
+struct arc
+{
+ int flow;
+ network_t* net_add;
+};
+
+const int MAX = 100;
+
+/* let it escape_array, "Type is used in an array [not handled yet]". */
+network_t* net[2];
+arc_p stop_arcs = NULL;
+
+int
+main ()
+{
+ net[0] = (network_t*) calloc (1, sizeof(network_t));
+ stop_arcs = (arc_p) calloc (MAX, sizeof (arc_t));
+
+ net[0]->x = 100;
+
+ for (unsigned i = 0; i < 3; i++)
+ {
+ net[0]->x = net[0]->x + 2;
+ stop_arcs->flow = net[0]->x / 2;
+ stop_arcs->flow = stop_arcs->flow + 20;
+ stop_arcs->net_add = net[0];
+ stop_arcs++;
+ }
+
+ if( net[1] != 0 && stop_arcs != 0)
+ {
+ return -1;
+ }
+ return 0;
+}
+
+/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_reorg" } } */
diff --git a/gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c b/gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c
new file mode 100644
index 000000000..74ea93bbc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-do run } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef STACK_SIZE
+#if STACK_SIZE > 16000
+#define N 1000
+#else
+#define N (STACK_SIZE/16)
+#endif
+#else
+#define N 1000
+#endif
+
+int num;
+
+int (*foo)(int d);
+int f (int t);
+
+typedef struct str_t str_t1;
+struct str_t
+{
+ int a;
+ float b;
+ int (*foo)(int d);
+};
+
+int main ()
+{
+ int i, r;
+ r = rand ();
+ num = r > N ? N : r;
+ str_t1 * p1 = calloc (num, sizeof (str_t1));
+ if (p1 == NULL)
+ return 0;
+ for (i = 0; i < num; i++)
+ {
+ p1[i].foo = malloc (1 * sizeof (f));
+ p1[i].foo = f;
+ p1[i].foo (i);
+ }
+
+ for (i = 0; i < num; i++)
+ p1[i].a = 1;
+
+ for (i = 0; i < num; i++)
+ p1[i].b = 2;
+
+ for (i = 0; i < num; i++)
+ if (p1[i].a != 1)
+ abort ();
+
+ for (i = 0; i < num; i++)
+ if (abs (p1[i].b - 2) > 0.0001)
+ abort ();
+
+ return 0;
+}
+
+int f (int t)
+{
+ if ( t < 0)
+ abort ();
+ return 0;
+}
+
+/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_reorg" } } */
diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp
index c5a955b00..687f6609f 100644
--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp
+++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp
@@ -46,6 +46,8 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/rf_*.c]] \
# -fipa-struct-reorg=3
gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/dfe*.c]] \
"" "-fipa-struct-reorg=3 -fdump-ipa-all -flto-partition=one -fwhole-program"
+gcc-dg-runtest $srcdir/$subdir/struct_reorg-7.c \
+ "" "-fipa-struct-reorg=3 -fdump-ipa-all -flto-partition=one -fwhole-program"
# -fipa-struct-reorg=4
gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pc*.c]] \
diff --git a/gcc/testsuite/gcc.dg/struct/struct_reorg-10.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-10.c
new file mode 100644
index 000000000..ec422f76f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-10.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-w -g -O3 -flto-partition=one -fipa-struct-reorg -fwhole-program -S" } */
+
+struct a {
+ int b;
+ char c;
+};
+struct {
+ double d;
+ _Bool e;
+} * f;
+struct g {
+ struct a h;
+} i;
+long j;
+void k();
+void l() { k(i); }
+void k(struct a m) {
+ f->e = 0;
+ for (;;)
+ l();
+}
+int main() {
+ for (; j; f = 0) {
+ struct g *n = 0;
+ char o = n->h.c;
+ }
+ l();
+}
diff --git a/gcc/testsuite/gcc.dg/struct/struct_reorg-11.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-11.c
new file mode 100644
index 000000000..3e42aa84a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-11.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-w -g -O3 -flto-partition=one -fipa-struct-reorg -fwhole-program -S" } */
+
+struct a {
+ int b;
+ double c;
+};
+struct d {
+ struct a e;
+};
+int f;
+int main() {
+ _Bool g;
+ struct d **h = 0;
+ g = *h += f;
+}
diff --git a/gcc/testsuite/gcc.dg/struct/struct_reorg-12.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-12.c
new file mode 100644
index 000000000..d434f9fe0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-12.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-w -g -O3 -flto-partition=one -fipa-struct-reorg -fwhole-program -S" } */
+
+struct foo {
+ long element1;
+ long element2;
+};
+
+struct goo {
+ struct foo element_foo;
+};
+
+struct goo g1;
+
+void func () {
+ struct foo (*local)[] = 0;
+ long idx;
+ (g1).element_foo = (*local)[idx];
+}
+
+struct foo g2;
+int main () {
+ func ();
+ g2 = g1.element_foo;
+ return 0;
+}
--
2.33.0
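
For readers new to dead-field elimination (DFE), a hedged before/after sketch of what the checks above protect: a field that is never read can be dropped, but function-pointer fields and fields of possibly escaping type are now kept conservatively.

/* Illustrative only -- not GCC output.  `pad` is written but never
   read, so DFE may remove it; `cb` is a function pointer and is
   retained even if unread.  */
struct arc_before { int flow; int pad; int (*cb) (int); };
struct arc_after  { int flow; int (*cb) (int); };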


@ -0,0 +1,27 @@
From fa6f80044dcebd28506e871e6e5d25e2dfd7e105 Mon Sep 17 00:00:00 2001
From: tiancheng-bao <baotiancheng1@huawei.com>
Date: Fri, 12 Apr 2024 15:09:28 +0800
Subject: [PATCH 01/32] Fix bug where gimple verification failed when
 reorg-level > 5
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 3 +++
1 file changed, 3 insertions(+)
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index f03d1d875..e08577c0c 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7461,6 +7461,9 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
continue;
tree lhs_expr = newlhs[i] ? newlhs[i] : lhs;
tree rhs_expr = newrhs[i] ? newrhs[i] : rhs;
+ if (!useless_type_conversion_p (TREE_TYPE (lhs_expr),
+ TREE_TYPE (rhs_expr)))
+ rhs_expr = gimplify_build1 (gsi, NOP_EXPR, TREE_TYPE (lhs_expr), rhs_expr);
gimple *newstmt = gimple_build_assign (lhs_expr, rhs_expr);
if (dump_file && (dump_flags & TDF_DETAILS))
{
--
2.28.0.windows.1


@ -0,0 +1,90 @@
From 13e82fccba781b29e55a6e1934986514019b728d Mon Sep 17 00:00:00 2001
From: zhenyu--zhao <zhaozhenyu17@huawei.com>
Date: Sun, 24 Mar 2024 20:42:27 +0800
Subject: [PATCH 02/32] [AutoFdo] Fix memory leaks in autofdo
---
gcc/final.cc | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/gcc/final.cc b/gcc/final.cc
index d4c4fa08f..af4e529bb 100644
--- a/gcc/final.cc
+++ b/gcc/final.cc
@@ -4402,12 +4402,15 @@ get_fdo_count_quality (profile_count count)
return profile_quality[count.quality ()];
}
-static const char *
+/* If the function is not public, return the function_name/file_name for
+ disambiguation of local symbols since there could be identical function
+ names coming from identical file names. The caller needs to free memory. */
+static char *
alias_local_functions (const char *fnname)
{
if (TREE_PUBLIC (cfun->decl))
{
- return fnname;
+ return concat (fnname, NULL);
}
return concat (fnname, "/", lbasename (dump_base_name), NULL);
}
@@ -4457,12 +4460,13 @@ dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count)
if (callee)
{
+ char *func_name =
+ alias_local_functions (get_fnname_from_decl (callee));
fprintf (asm_out_file, "\t.string \"%x\"\n",
INSN_ADDRESSES (INSN_UID (insn)));
fprintf (asm_out_file, "\t.string \"%s%s\"\n",
- ASM_FDO_CALLEE_FLAG,
- alias_local_functions (get_fnname_from_decl (callee)));
+ ASM_FDO_CALLEE_FLAG, func_name);
fprintf (asm_out_file,
"\t.string \"" HOST_WIDE_INT_PRINT_DEC "\"\n",
@@ -4472,9 +4476,9 @@ dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count)
{
fprintf (dump_file, "call: %x --> %s \n",
INSN_ADDRESSES (INSN_UID (insn)),
- alias_local_functions
- (get_fnname_from_decl (callee)));
+ func_name);
}
+ free (func_name);
}
}
}
@@ -4547,8 +4551,9 @@ dump_bb_info_to_asm (basic_block bb, gcov_type bb_count)
static void
dump_function_info_to_asm (const char *fnname)
{
+ char *func_name = alias_local_functions (fnname);
fprintf (asm_out_file, "\t.string \"%s%s\"\n",
- ASM_FDO_CALLER_FLAG, alias_local_functions (fnname));
+ ASM_FDO_CALLER_FLAG, func_name);
fprintf (asm_out_file, "\t.string \"%s%d\"\n",
ASM_FDO_CALLER_SIZE_FLAG, get_function_end_addr ());
fprintf (asm_out_file, "\t.string \"%s%s\"\n",
@@ -4557,7 +4562,7 @@ dump_function_info_to_asm (const char *fnname)
if (dump_file)
{
fprintf (dump_file, "\n FUNC_NAME: %s\n",
- alias_local_functions (fnname));
+ func_name);
fprintf (dump_file, " file: %s\n",
dump_base_name);
fprintf (dump_file, "profile_status: %s\n",
@@ -4567,6 +4572,7 @@ dump_function_info_to_asm (const char *fnname)
fprintf (dump_file, " function_bind: %s\n",
simple_get_function_bind ());
}
+ free (func_name);
}
/* Dump function profile into form AutoFDO or PGO to asm. */
--
2.28.0.windows.1
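
The leak fix works by making the helper's ownership contract uniform: every path now returns freshly allocated memory, so every caller frees unconditionally. A hedged sketch of the same contract in isolation (names illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return a heap-allocated name in both branches; the caller always
   calls free(), matching the patch's use of concat() for both the
   public and the local case.  */
static char *
local_alias (const char *fnname, int is_public, const char *file)
{
  size_t n = strlen (fnname) + (is_public ? 0 : strlen (file) + 1) + 1;
  char *s = (char *) malloc (n);
  if (s == NULL)
    return NULL;
  if (is_public)
    snprintf (s, n, "%s", fnname);
  else
    snprintf (s, n, "%s/%s", fnname, file);  /* name/file disambiguates */
  return s;
}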


@ -0,0 +1,321 @@
From c546aad5d38165e2962456525a0f6a427e03583b Mon Sep 17 00:00:00 2001
From: "Vladimir N. Makarov" <vmakarov@redhat.com>
Date: Thu, 26 Oct 2023 09:50:40 -0400
Subject: [PATCH 31/32] Modify cost calculation for dealing with equivalences
RISCV target developers reported that pseudos with equivalence used in
a loop can be spilled. Simple changes of heuristics of cost
calculation of pseudos with equivalence or even ignoring equivalences
resulted in numerous testsuite failures on different targets or worse
spec2017 performance. This patch implements more sophisticated cost
calculations of pseudos with equivalences. The patch does not change
RA behaviour for targets still using the old reload pass instead of
LRA. The patch solves the reported problem and improves x86-64
specint2017 a bit (specfp2017 performance stays the same). The patch
takes into account how the equivalence will be used: will it be
integrated into the user insns or require an input reload insn. It
requires additional pass over insns. To compensate RA slow down, the
patch removes a pass over insns in the reload pass used by IRA before.
This also decouples IRA from reload more and will help to remove the
reload pass in the future if it ever happens.
gcc/ChangeLog:
* dwarf2out.cc (reg_loc_descriptor): Use lra_eliminate_regs when
LRA is used.
* ira-costs.cc: Include regset.h.
(equiv_can_be_consumed_p, get_equiv_regno, calculate_equiv_gains):
New functions.
(find_costs_and_classes): Call calculate_equiv_gains and redefine
mem_cost of pseudos with equivs when LRA is used.
* var-tracking.cc: Include ira.h and lra.h.
(vt_initialize): Use lra_eliminate_regs when LRA is used.
---
gcc/dwarf2out.cc | 4 +-
gcc/ira-costs.cc | 169 ++++++++++++++++++++++++++++++++++++++++++--
gcc/var-tracking.cc | 14 +++-
3 files changed, 179 insertions(+), 8 deletions(-)
diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc
index 0a5c081d8..f0f6f4fd4 100644
--- a/gcc/dwarf2out.cc
+++ b/gcc/dwarf2out.cc
@@ -14263,7 +14263,9 @@ reg_loc_descriptor (rtx rtl, enum var_init_status initialized)
argument pointer and soft frame pointer rtx's.
Use DW_OP_fbreg offset DW_OP_stack_value in this case. */
if ((rtl == arg_pointer_rtx || rtl == frame_pointer_rtx)
- && eliminate_regs (rtl, VOIDmode, NULL_RTX) != rtl)
+ && (ira_use_lra_p
+ ? lra_eliminate_regs (rtl, VOIDmode, NULL_RTX)
+ : eliminate_regs (rtl, VOIDmode, NULL_RTX)) != rtl)
{
dw_loc_descr_ref result = NULL;
diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 642fda529..c79311783 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see
#include "tm_p.h"
#include "insn-config.h"
#include "regs.h"
+#include "regset.h"
#include "ira.h"
#include "ira-int.h"
#include "addresses.h"
@@ -1750,6 +1751,145 @@ process_bb_node_for_costs (ira_loop_tree_node_t loop_tree_node)
process_bb_for_costs (bb);
}
+/* Check that reg REGNO can be changed by TO in INSN. Return true in case the
+ result insn would be valid one. */
+static bool
+equiv_can_be_consumed_p (int regno, rtx to, rtx_insn *insn)
+{
+ validate_replace_src_group (regno_reg_rtx[regno], to, insn);
+ bool res = verify_changes (0);
+ cancel_changes (0);
+ return res;
+}
+
+/* Return true if X contains a pseudo with equivalence. In this case also
+ return the pseudo through parameter REG. If the pseudo is a part of subreg,
+ return the subreg through parameter SUBREG. */
+
+static bool
+get_equiv_regno (rtx x, int &regno, rtx &subreg)
+{
+ subreg = NULL_RTX;
+ if (GET_CODE (x) == SUBREG)
+ {
+ subreg = x;
+ x = SUBREG_REG (x);
+ }
+ if (REG_P (x)
+ && (ira_reg_equiv[REGNO (x)].memory != NULL
+ || ira_reg_equiv[REGNO (x)].constant != NULL))
+ {
+ regno = REGNO (x);
+ return true;
+ }
+ RTX_CODE code = GET_CODE (x);
+ const char *fmt = GET_RTX_FORMAT (code);
+
+ for (int i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+ if (fmt[i] == 'e')
+ {
+ if (get_equiv_regno (XEXP (x, i), regno, subreg))
+ return true;
+ }
+ else if (fmt[i] == 'E')
+ {
+ for (int j = 0; j < XVECLEN (x, i); j++)
+ if (get_equiv_regno (XVECEXP (x, i, j), regno, subreg))
+ return true;
+ }
+ return false;
+}
+
+/* A pass through the current function insns. Calculate costs of using
+ equivalences for pseudos and store them in regno_equiv_gains. */
+
+static void
+calculate_equiv_gains (void)
+{
+ basic_block bb;
+ int regno, freq, cost;
+ rtx subreg;
+ rtx_insn *insn;
+ machine_mode mode;
+ enum reg_class rclass;
+ bitmap_head equiv_pseudos;
+
+ ira_assert (allocno_p);
+ bitmap_initialize (&equiv_pseudos, &reg_obstack);
+ for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--)
+ if (ira_reg_equiv[regno].init_insns != NULL
+ && (ira_reg_equiv[regno].memory != NULL
+ || (ira_reg_equiv[regno].constant != NULL
+ /* Ignore complicated constants which probably will be placed
+ in memory: */
+ && GET_CODE (ira_reg_equiv[regno].constant) != CONST_DOUBLE
+ && GET_CODE (ira_reg_equiv[regno].constant) != CONST_VECTOR
+ && GET_CODE (ira_reg_equiv[regno].constant) != LABEL_REF)))
+ {
+ rtx_insn_list *x;
+ for (x = ira_reg_equiv[regno].init_insns; x != NULL; x = x->next ())
+ {
+ insn = x->insn ();
+ rtx set = single_set (insn);
+
+ if (set == NULL_RTX || SET_DEST (set) != regno_reg_rtx[regno])
+ break;
+ bb = BLOCK_FOR_INSN (insn);
+ ira_curr_regno_allocno_map
+ = ira_bb_nodes[bb->index].parent->regno_allocno_map;
+ mode = PSEUDO_REGNO_MODE (regno);
+ rclass = pref[COST_INDEX (regno)];
+ ira_init_register_move_cost_if_necessary (mode);
+ if (ira_reg_equiv[regno].memory != NULL)
+ cost = ira_memory_move_cost[mode][rclass][1];
+ else
+ cost = ira_register_move_cost[mode][rclass][rclass];
+ freq = REG_FREQ_FROM_BB (bb);
+ regno_equiv_gains[regno] += cost * freq;
+ }
+ if (x != NULL)
+ /* We found complicated equiv or reverse equiv mem=reg. Ignore
+ them. */
+ regno_equiv_gains[regno] = 0;
+ else
+ bitmap_set_bit (&equiv_pseudos, regno);
+ }
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ freq = REG_FREQ_FROM_BB (bb);
+ ira_curr_regno_allocno_map
+ = ira_bb_nodes[bb->index].parent->regno_allocno_map;
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!INSN_P (insn) || !get_equiv_regno (PATTERN (insn), regno, subreg)
+ || !bitmap_bit_p (&equiv_pseudos, regno))
+ continue;
+ rtx subst = ira_reg_equiv[regno].memory;
+
+ if (subst == NULL)
+ subst = ira_reg_equiv[regno].constant;
+ ira_assert (subst != NULL);
+ mode = PSEUDO_REGNO_MODE (regno);
+ ira_init_register_move_cost_if_necessary (mode);
+ bool consumed_p = equiv_can_be_consumed_p (regno, subst, insn);
+
+ rclass = pref[COST_INDEX (regno)];
+ if (MEM_P (subst)
+ /* If it is a change of constant into double for example, the
+ result constant probably will be placed in memory. */
+ || (subreg != NULL_RTX && !INTEGRAL_MODE_P (GET_MODE (subreg))))
+ cost = ira_memory_move_cost[mode][rclass][1] + (consumed_p ? 0 : 1);
+ else if (consumed_p)
+ continue;
+ else
+ cost = ira_register_move_cost[mode][rclass][rclass];
+ regno_equiv_gains[regno] -= cost * freq;
+ }
+ }
+ bitmap_clear (&equiv_pseudos);
+}
+
/* Find costs of register classes and memory for allocnos or pseudos
and their best costs. Set up preferred, alternative and allocno
classes for pseudos. */
@@ -1848,6 +1988,12 @@ find_costs_and_classes (FILE *dump_file)
if (pass == 0)
pref = pref_buffer;
+ if (ira_use_lra_p && allocno_p && pass == 1)
+ /* It is a pass through all insns. So do it once and only for RA (not
+ for insn scheduler) when we already found preferable pseudo register
+ classes on the previous pass. */
+ calculate_equiv_gains ();
+
/* Now for each allocno look at how desirable each class is and
find which class is preferred. */
for (i = max_reg_num () - 1; i >= FIRST_PSEUDO_REGISTER; i--)
@@ -1940,6 +2086,17 @@ find_costs_and_classes (FILE *dump_file)
}
if (i >= first_moveable_pseudo && i < last_moveable_pseudo)
i_mem_cost = 0;
+ else if (ira_use_lra_p)
+ {
+ if (equiv_savings > 0)
+ {
+ i_mem_cost = 0;
+ if (ira_dump_file != NULL && internal_flag_ira_verbose > 5)
+ fprintf (ira_dump_file,
+ " Use MEM for r%d as the equiv savings is %d\n",
+ i, equiv_savings);
+ }
+ }
else if (equiv_savings < 0)
i_mem_cost = -equiv_savings;
else if (equiv_savings > 0)
@@ -2378,7 +2535,10 @@ ira_costs (void)
total_allocno_costs = (struct costs *) ira_allocate (max_struct_costs_size
* ira_allocnos_num);
initiate_regno_cost_classes ();
- calculate_elim_costs_all_insns ();
+ if (!ira_use_lra_p)
+ /* Process equivs in reload to update costs through hook
+ ira_adjust_equiv_reg_cost. */
+ calculate_elim_costs_all_insns ();
find_costs_and_classes (ira_dump_file);
setup_allocno_class_and_costs ();
finish_regno_cost_classes ();
@@ -2503,13 +2663,14 @@ ira_tune_allocno_costs (void)
}
}
-/* Add COST to the estimated gain for eliminating REGNO with its
- equivalence. If COST is zero, record that no such elimination is
- possible. */
+/* A hook from the reload pass. Add COST to the estimated gain for eliminating
+ REGNO with its equivalence. If COST is zero, record that no such
+ elimination is possible. */
void
ira_adjust_equiv_reg_cost (unsigned regno, int cost)
{
+ ira_assert (!ira_use_lra_p);
if (cost == 0)
regno_equiv_gains[regno] = 0;
else
diff --git a/gcc/var-tracking.cc b/gcc/var-tracking.cc
index 7c3ad0a55..b10c8c1eb 100644
--- a/gcc/var-tracking.cc
+++ b/gcc/var-tracking.cc
@@ -107,6 +107,8 @@
#include "cfgrtl.h"
#include "cfganal.h"
#include "reload.h"
+#include "ira.h"
+#include "lra.h"
#include "calls.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
@@ -10133,7 +10135,9 @@ vt_initialize (void)
#else
reg = arg_pointer_rtx;
#endif
- elim = eliminate_regs (reg, VOIDmode, NULL_RTX);
+ elim = (ira_use_lra_p
+ ? lra_eliminate_regs (reg, VOIDmode, NULL_RTX)
+ : eliminate_regs (reg, VOIDmode, NULL_RTX));
if (elim != reg)
{
if (GET_CODE (elim) == PLUS)
@@ -10153,7 +10157,9 @@ vt_initialize (void)
reg = arg_pointer_rtx;
fp_cfa_offset = ARG_POINTER_CFA_OFFSET (current_function_decl);
#endif
- elim = eliminate_regs (reg, VOIDmode, NULL_RTX);
+ elim = (ira_use_lra_p
+ ? lra_eliminate_regs (reg, VOIDmode, NULL_RTX)
+ : eliminate_regs (reg, VOIDmode, NULL_RTX));
if (elim != reg)
{
if (GET_CODE (elim) == PLUS)
@@ -10185,7 +10191,9 @@ vt_initialize (void)
#else
reg = arg_pointer_rtx;
#endif
- elim = eliminate_regs (reg, VOIDmode, NULL_RTX);
+ elim = (ira_use_lra_p
+ ? lra_eliminate_regs (reg, VOIDmode, NULL_RTX)
+ : eliminate_regs (reg, VOIDmode, NULL_RTX));
if (elim != reg)
{
if (GET_CODE (elim) == PLUS)
--
2.28.0.windows.1
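
The cost model above reduces to two rules, both weighted by basic-block frequency: each removable equivalence-defining insn adds a gain, and each use that cannot consume the equivalence in place subtracts the cost of the reload it would need. A hedged arithmetic sketch, not GCC internals:

/* If the accumulated gain ends up positive, find_costs_and_classes
   zeroes the pseudo's mem_cost, making a spill to the equivalence
   look free.  */
static void
account_equiv (int *gain, int def_cost, int def_freq,
               int reload_cost, int unconsumed_use_freq)
{
  *gain += def_cost * def_freq;               /* def insn removable */
  *gain -= reload_cost * unconsumed_use_freq; /* use needs a reload */
}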


@ -0,0 +1,49 @@
From 4965473a4211a9feb46a0d168180ab450cb18bcc Mon Sep 17 00:00:00 2001
From: "Vladimir N. Makarov" <vmakarov@redhat.com>
Date: Fri, 27 Oct 2023 08:28:24 -0400
Subject: [PATCH 32/32] Add cost calculation for reg equivalence invariants
My recent patch improving cost calculation for pseudos with equivalence
resulted in failure of gcc.target/arm/eliminate.c on aarch64. This patch
fixes this failure.
gcc/ChangeLog:
* ira-costs.cc: (get_equiv_regno, calculate_equiv_gains):
Process reg equivalence invariants.
---
gcc/ira-costs.cc | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index c79311783..d33104a30 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1777,6 +1777,7 @@ get_equiv_regno (rtx x, int &regno, rtx &subreg)
}
if (REG_P (x)
&& (ira_reg_equiv[REGNO (x)].memory != NULL
+ || ira_reg_equiv[REGNO (x)].invariant != NULL
|| ira_reg_equiv[REGNO (x)].constant != NULL))
{
regno = REGNO (x);
@@ -1819,6 +1820,7 @@ calculate_equiv_gains (void)
for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--)
if (ira_reg_equiv[regno].init_insns != NULL
&& (ira_reg_equiv[regno].memory != NULL
+ || ira_reg_equiv[regno].invariant != NULL
|| (ira_reg_equiv[regno].constant != NULL
/* Ignore complicated constants which probably will be placed
in memory: */
@@ -1869,6 +1871,8 @@ calculate_equiv_gains (void)
if (subst == NULL)
subst = ira_reg_equiv[regno].constant;
+ if (subst == NULL)
+ subst = ira_reg_equiv[regno].invariant;
ira_assert (subst != NULL);
mode = PSEUDO_REGNO_MODE (regno);
ira_init_register_move_cost_if_necessary (mode);
--
2.28.0.windows.1

File diff suppressed because it is too large

@ -0,0 +1,48 @@
From 302b7e15d6308c29c215db4c9901342e1106381a Mon Sep 17 00:00:00 2001
From: huang-xiaoquan <huangxiaoquan1@huawei.com>
Date: Mon, 29 Apr 2024 11:00:12 +0800
Subject: [PATCH] [StructReorderFields] Fix gimple call not rewritten due to
empty function node
Add parameter type escape for empty functions or inline functions.
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index e08577c0c..2257d3528 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -4366,6 +4366,17 @@ ipa_struct_reorg::maybe_record_call (cgraph_node *node, gcall *stmt)
argtype = argtype ? TREE_CHAIN (argtype) : NULL_TREE;
}
+
+ /* Types escape via an argument of an empty or inlined function. */
+ cgraph_node *callee = node->get_edge (stmt)->callee;
+ if (!gimple_call_builtin_p (stmt, BUILT_IN_FREE)
+ && gimple_call_num_args (stmt)
+ && callee && (!callee->has_gimple_body_p () || callee->inlined_to))
+ {
+ for (unsigned i = 0; i < gimple_call_num_args (stmt); i++)
+ mark_type_as_escape (TREE_TYPE (gimple_call_arg (stmt, i)),
+ escape_var_arg_function);
+ }
}
void
@@ -8068,6 +8079,11 @@ ipa_struct_reorg::rewrite_functions (void)
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "\nNo rewrite:\n");
+ if (current_function_decl == NULL)
+ {
+ fprintf (dump_file, "\ncurrent_function_decl == NULL\n");
+ continue;
+ }
if (current_function_decl)
dump_function_to_file (current_function_decl, dump_file,
dump_flags | TDF_VOPS);
--
2.33.0
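
The rule added above is conservative: when a callee has no gimple body to analyze (or was inlined away), every struct type reaching it through an argument is treated as escaping. A hedged sketch of the decision, with a stub standing in for mark_type_as_escape:

static void
mark_type_as_escape_stub (unsigned arg_index)
{
  (void) arg_index;  /* illustrative stand-in for mark_type_as_escape */
}

/* Conservatively escape all argument types when the callee body is
   unavailable for analysis.  */
static void
maybe_escape_args (int callee_has_body, int callee_inlined, unsigned nargs)
{
  if (!callee_has_body || callee_inlined)
    for (unsigned i = 0; i < nargs; i++)
      mark_type_as_escape_stub (i);
}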


@ -0,0 +1,40 @@
From 01517aa2397f854ffa96128a0fb23dd5542be709 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Tue, 30 Apr 2024 18:43:32 +0800
Subject: [PATCH 1/4] [double-sized-mul][testsuite] Add -march=armv8.2-a for dg
 tests
---
gcc/testsuite/gcc.dg/double_sized_mul-1.c | 2 +-
gcc/testsuite/gcc.dg/double_sized_mul-2.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
index 4d475cc8a..d32a25223 100644
--- a/gcc/testsuite/gcc.dg/double_sized_mul-1.c
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
proper overflow detection in some cases. */
-/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+/* { dg-options "-O2 -fif-conversion-gimple -march=armv8.2-a -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
#include <stdint.h>
typedef unsigned __int128 uint128_t;
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
index cc6e5af25..ff35902b7 100644
--- a/gcc/testsuite/gcc.dg/double_sized_mul-2.c
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* fif-conversion-gimple is required for proper overflow detection
in some cases. */
-/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+/* { dg-options "-O2 -fif-conversion-gimple -march=armv8.2-a -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
#include <stdint.h>
typedef unsigned __int128 uint128_t;
--
2.33.0


@ -0,0 +1,34 @@
From b84a896e2df214b08d6519a097cc410d3e582add Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Wed, 8 May 2024 21:28:32 +0800
Subject: [PATCH 2/4] [IPA][Bugfix] Fix fails in IPA prefetch
(src-openEuler/gcc: I9J6N6)
---
gcc/ipa-prefetch.cc | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
index 1ceb5137f..94290ea9c 100644
--- a/gcc/ipa-prefetch.cc
+++ b/gcc/ipa-prefetch.cc
@@ -1432,8 +1432,14 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data)
TREE_THIS_VOLATILE (*tp) = TREE_THIS_VOLATILE (old);
TREE_SIDE_EFFECTS (*tp) = TREE_SIDE_EFFECTS (old);
TREE_NO_WARNING (*tp) = TREE_NO_WARNING (old);
- /* TODO: maybe support this case. */
- gcc_assert (MR_DEPENDENCE_CLIQUE (old) == 0);
+ if (MR_DEPENDENCE_CLIQUE (old) != 0)
+ {
+ MR_DEPENDENCE_CLIQUE (*tp) = MR_DEPENDENCE_CLIQUE (old);
+ MR_DEPENDENCE_BASE (*tp) = MR_DEPENDENCE_BASE (old);
+ if (dump_file)
+ fprintf (dump_file, "Copy clique=%d base=%d info.\n",
+ MR_DEPENDENCE_CLIQUE (old), MR_DEPENDENCE_BASE (old));
+ }
/* We cannot propagate the TREE_THIS_NOTRAP flag if we have
remapped a parameter as the property might be valid only
for the parameter itself. */
--
2.33.0


@ -0,0 +1,29 @@
From acb6bbf0612aead00a879892ba8ed816c90fe788 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Wed, 8 May 2024 19:24:27 +0800
Subject: [PATCH 3/4] [AES][Bugfix] Change set_of to reg_set_p, and add check
 for global_regs
Fix for I9JDHE.
---
gcc/rtl-matcher.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/gcc/rtl-matcher.h b/gcc/rtl-matcher.h
index 6aed8d98d..5310f6266 100644
--- a/gcc/rtl-matcher.h
+++ b/gcc/rtl-matcher.h
@@ -56,8 +56,9 @@ check_def_chain_ref (df_ref ref, rtx reg)
if (!ref || !DF_REF_INSN_INFO (ref))
return false;
- return !global_regs[REGNO (reg)]
- || set_of (reg, DF_REF_INSN (ref));
+ return !(REGNO (reg) < FIRST_PSEUDO_REGISTER
+ && global_regs[REGNO (reg)])
+ || reg_set_p (reg, DF_REF_INSN (ref));
}
/* Get the single def instruction of the reg being used in the insn. */
--
2.33.0
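
The added `REGNO (reg) < FIRST_PSEUDO_REGISTER` guard matters because `global_regs[]` is indexed by hard register number only; consulting it with a pseudo's regno would read out of bounds. A hedged standalone sketch with an illustrative signature:

/* global_regs has FIRST_PSEUDO_REGISTER entries, so it is only
   meaningful for hard registers.  */
static int
is_global_hard_reg (unsigned regno, unsigned first_pseudo,
                    const char *global_regs)
{
  return regno < first_pseudo && global_regs[regno];
}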


@ -0,0 +1,26 @@
From 48724ee73cd58b67d59962ee4d56ac85db797e61 Mon Sep 17 00:00:00 2001
From: tiancheng-bao <baotiancheng1@huawei.com>
Date: Fri, 10 May 2024 17:52:27 +0800
Subject: [PATCH 4/4] fix bugs within pointer compression and DFE
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 3 ---
1 file changed, 3 deletions(-)
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index 2257d3528..1a169c635 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7472,9 +7472,6 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
continue;
tree lhs_expr = newlhs[i] ? newlhs[i] : lhs;
tree rhs_expr = newrhs[i] ? newrhs[i] : rhs;
- if (!useless_type_conversion_p (TREE_TYPE (lhs_expr),
- TREE_TYPE (rhs_expr)))
- rhs_expr = gimplify_build1 (gsi, NOP_EXPR, TREE_TYPE (lhs_expr), rhs_expr);
gimple *newstmt = gimple_build_assign (lhs_expr, rhs_expr);
if (dump_file && (dump_flags & TDF_DETAILS))
{
--
2.33.0


@ -0,0 +1,28 @@
From 4861c3db991e947060de54a4d20c1a13747a6024 Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com>
Date: Wed, 15 May 2024 14:41:45 +0800
Subject: [PATCH] [BUGFIX] AutoBOLT function missing bind type
---
gcc/final.cc | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gcc/final.cc b/gcc/final.cc
index af4e529bb..c440846f7 100644
--- a/gcc/final.cc
+++ b/gcc/final.cc
@@ -4272,9 +4272,9 @@ leaf_renumber_regs_insn (rtx in_rtx)
#define ASM_FDO_CALLER_FLAG ".fdo.caller "
#define ASM_FDO_CALLER_SIZE_FLAG ".fdo.caller.size "
-#define ASM_FDO_CALLER_BIND_FLAG ".fdo.caller.bind"
+#define ASM_FDO_CALLER_BIND_FLAG ".fdo.caller.bind "
-#define ASM_FDO_CALLEE_FLAG ".fdo.callee"
+#define ASM_FDO_CALLEE_FLAG ".fdo.callee "
/* Return the relative offset address of the start instruction of BB,
return -1 if it is empty instruction. */
--
2.33.0
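
The one-character fix above restores the separator between marker and value in the emitted .string directives; without the trailing space the two run together. A small runnable illustration:

#include <stdio.h>

#define FLAG_OLD ".fdo.caller.bind"   /* before the fix */
#define FLAG_NEW ".fdo.caller.bind "  /* after the fix  */

int
main (void)
{
  printf ("\t.string \"%s%s\"\n", FLAG_OLD, "GLOBAL"); /* ...bindGLOBAL  */
  printf ("\t.string \"%s%s\"\n", FLAG_NEW, "GLOBAL"); /* ...bind GLOBAL */
  return 0;
}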

File diff suppressed because it is too large

@ -0,0 +1,45 @@
From 06e86b362f74ba0706fb5d8377f78d24b658c300 Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com>
Date: Sat, 18 May 2024 12:22:23 +0800
Subject: [PATCH] [Bugfix] AutoFDO with PMU sampling: set num equal to den
---
gcc/final.cc | 2 +-
gcc/tree-cfg.cc | 8 ++++++++
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/gcc/final.cc b/gcc/final.cc
index f66c9d155..e4bfceabc 100644
--- a/gcc/final.cc
+++ b/gcc/final.cc
@@ -4604,7 +4604,7 @@ dump_profile_to_elf_sections ()
/* Return if no feedback data. */
if (!flag_profile_use && !flag_auto_profile)
{
- error ("-fauto-bolt should use with -profile-use or -fauto-profile");
+ error ("-fauto-bolt should use with -fprofile-use or -fauto-profile");
return;
}
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 05fc45147..48b52f785 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -9741,6 +9741,14 @@ execute_fixup_cfg (void)
/* Same scaling is also done by ipa_merge_profiles. */
profile_count num = node->count;
profile_count den = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
+ /* When autofdo uses PMU as the sampling unit, the count of the
+ node cannot be obtained directly and is sometimes zero, but the
+ execution count of a function should be at least 1. We set num
+ to den here to make sure num will not decrease. */
+ if (num == profile_count::zero ().afdo ()
+ && den.quality () == profile_quality::AFDO)
+ {
+ num = den;
+ }
bool scale = num.initialized_p () && !(num == den);
auto_bitmap dce_ssa_names;
--
2.33.0
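
execute_fixup_cfg scales block counts by num/den, so a spurious num of zero from PMU-based AutoFDO would wipe out every count; forcing num = den yields a scale factor of 1 instead. A hedged arithmetic sketch:

/* Illustrative integer version of the scaling; GCC's profile_count
   does this with saturation and quality tracking.  */
static long
scale_bb_count (long bb_count, long num, long den, int is_afdo_zero)
{
  if (is_afdo_zero)
    num = den;                    /* keep counts instead of zeroing */
  return den != 0 ? bb_count * num / den : bb_count;
}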


@ -0,0 +1,59 @@
From 62ea18c632200edbbf46b4e957bc4d997f1c66f0 Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chenglulu@loongson.cn>
Date: Tue, 27 Sep 2022 15:28:43 +0800
Subject: [PATCH 024/124] Libvtv: Add loongarch support.
The loongarch64 specification permits page sizes of 4KiB, 16KiB and 64KiB,
but only 16KiB pages are supported for now.
Co-Authored-By: qijingwen <qijingwen@loongson.cn>
include/ChangeLog:
* vtv-change-permission.h (defined): Determines whether the macro
__loongarch_lp64 is defined
(VTV_PAGE_SIZE): Set VTV_PAGE_SIZE to 16KiB for loongarch64.
libvtv/ChangeLog:
* configure.tgt: Add loongarch support.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
include/vtv-change-permission.h | 4 ++++
libvtv/configure.tgt | 3 +++
2 files changed, 7 insertions(+)
diff --git a/include/vtv-change-permission.h b/include/vtv-change-permission.h
index 70bdad92b..e7b9294a0 100644
--- a/include/vtv-change-permission.h
+++ b/include/vtv-change-permission.h
@@ -48,6 +48,10 @@ extern void __VLTChangePermission (int);
#else
#if defined(__sun__) && defined(__svr4__) && defined(__sparc__)
#define VTV_PAGE_SIZE 8192
+#elif defined(__loongarch_lp64)
+/* The page size is configurable by the kernel to be 4, 16 or 64 KiB.
+ For now, only the default page size of 16KiB is supported. */
+#define VTV_PAGE_SIZE 16384
#else
#define VTV_PAGE_SIZE 4096
#endif
diff --git a/libvtv/configure.tgt b/libvtv/configure.tgt
index aa2a3f675..6cdd1e97a 100644
--- a/libvtv/configure.tgt
+++ b/libvtv/configure.tgt
@@ -50,6 +50,9 @@ case "${target}" in
;;
x86_64-*-darwin[1]* | i?86-*-darwin[1]*)
;;
+ loongarch*-*-linux*)
+ VTV_SUPPORTED=yes
+ ;;
*)
;;
esac
--
2.33.0
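
libvtv's mprotect-based protection assumes the build-time VTV_PAGE_SIZE equals the kernel's runtime page size, which is why only the 16 KiB default is supported on loongarch64 for now. A hedged runtime sanity check:

#include <assert.h>
#include <unistd.h>

int
main (void)
{
  /* On a default loongarch64 kernel this should hold; 4 KiB or
     64 KiB kernels would need a different VTV_PAGE_SIZE.  */
  assert (sysconf (_SC_PAGESIZE) == 16384);
  return 0;
}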


@ -0,0 +1,332 @@
From c68463abbab98aa7f5a9b91e71ed6f6834c723df Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chenglulu@loongson.cn>
Date: Thu, 16 Nov 2023 20:43:53 +0800
Subject: [PATCH] LoongArch: Add LA664 support.
Define ISA_BASE_LA64V110, which represents the base instruction set defined in LoongArch 1.1.
Support the configure setting --with-arch=la664, and support -march=la664 and -mtune=la664.
gcc/ChangeLog:
* config.gcc: Support LA664.
* config/loongarch/genopts/loongarch-strings: Likewise.
* config/loongarch/genopts/loongarch.opt.in: Likewise.
* config/loongarch/loongarch-cpu.cc (fill_native_cpu_config): Likewise.
* config/loongarch/loongarch-def.c: Likewise.
* config/loongarch/loongarch-def.h (N_ISA_BASE_TYPES): Likewise.
(ISA_BASE_LA64V110): Define macro.
(N_ARCH_TYPES): Update value.
(N_TUNE_TYPES): Update value.
(CPU_LA664): New macro.
* config/loongarch/loongarch-opts.cc (isa_default_abi): Likewise.
(isa_base_compat_p): Likewise.
* config/loongarch/loongarch-opts.h (TARGET_64BIT): This parameter is enabled
when la_target.isa.base is equal to ISA_BASE_LA64V100 or ISA_BASE_LA64V110.
(TARGET_uARCH_LA664): Define macro.
* config/loongarch/loongarch-str.h (STR_CPU_LA664): Likewise.
* config/loongarch/loongarch.cc (loongarch_cpu_sched_reassociation_width):
Add LA664 support.
* config/loongarch/loongarch.opt: Regenerate.
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config.gcc | 10 ++++-----
.../loongarch/genopts/loongarch-strings | 1 +
gcc/config/loongarch/genopts/loongarch.opt.in | 3 +++
gcc/config/loongarch/loongarch-cpu.cc | 4 ++++
gcc/config/loongarch/loongarch-def.c | 21 +++++++++++++++++++
gcc/config/loongarch/loongarch-def.h | 8 ++++---
gcc/config/loongarch/loongarch-opts.cc | 8 +++----
gcc/config/loongarch/loongarch-opts.h | 4 +++-
gcc/config/loongarch/loongarch-str.h | 1 +
gcc/config/loongarch/loongarch.cc | 1 +
gcc/config/loongarch/loongarch.opt | 3 +++
11 files changed, 51 insertions(+), 13 deletions(-)
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 6d51bd93f3f..b88591b6fd8 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -5039,7 +5039,7 @@ case "${target}" in
# Perform initial sanity checks on --with-* options.
case ${with_arch} in
- "" | abi-default | loongarch64 | la464) ;; # OK, append here.
+ "" | abi-default | loongarch64 | la[46]64) ;; # OK, append here.
native)
if test x${host} != x${target}; then
echo "--with-arch=native is illegal for cross-compiler." 1>&2
@@ -5088,7 +5088,7 @@ case "${target}" in
case ${abi_base}/${abi_ext} in
lp64*/base)
# architectures that support lp64* ABI
- arch_pattern="native|abi-default|loongarch64|la464"
+ arch_pattern="native|abi-default|loongarch64|la[46]64"
# default architecture for lp64* ABI
arch_default="abi-default"
;;
@@ -5163,7 +5163,7 @@ case "${target}" in
# Check default with_tune configuration using with_arch.
case ${with_arch} in
loongarch64)
- tune_pattern="native|abi-default|loongarch64|la464"
+ tune_pattern="native|abi-default|loongarch64|la[46]64"
;;
*)
# By default, $with_tune == $with_arch
@@ -5219,7 +5219,7 @@ case "${target}" in
# Fixed: use the default gcc configuration for all multilib
# builds by default.
with_multilib_default="" ;;
- arch,native|arch,loongarch64|arch,la464) # OK, append here.
+ arch,native|arch,loongarch64|arch,la[46]64) # OK, append here.
with_multilib_default="/march=${component}" ;;
arch,*)
with_multilib_default="/march=abi-default"
@@ -5307,7 +5307,7 @@ case "${target}" in
if test x${parse_state} = x"arch"; then
# -march option
case ${component} in
- native | abi-default | loongarch64 | la464) # OK, append here.
+ native | abi-default | loongarch64 | la[46]64) # OK, append here.
# Append -march spec for each multilib variant.
loongarch_multilib_list_make="${loongarch_multilib_list_make}/march=${component}"
parse_state="opts"
diff --git a/gcc/config/loongarch/genopts/loongarch-strings b/gcc/config/loongarch/genopts/loongarch-strings
index 8e412f7536e..7bc4824007e 100644
--- a/gcc/config/loongarch/genopts/loongarch-strings
+++ b/gcc/config/loongarch/genopts/loongarch-strings
@@ -26,6 +26,7 @@ STR_CPU_NATIVE native
STR_CPU_ABI_DEFAULT abi-default
STR_CPU_LOONGARCH64 loongarch64
STR_CPU_LA464 la464
+STR_CPU_LA664 la664
# Base architecture
STR_ISA_BASE_LA64V100 la64
diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in
index 158701d327a..00b4733d75b 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -107,6 +107,9 @@ Enum(cpu_type) String(@@STR_CPU_LOONGARCH64@@) Value(CPU_LOONGARCH64)
EnumValue
Enum(cpu_type) String(@@STR_CPU_LA464@@) Value(CPU_LA464)
+EnumValue
+Enum(cpu_type) String(@@STR_CPU_LA664@@) Value(CPU_LA664)
+
m@@OPTSTR_ARCH@@=
Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) Init(M_OPT_UNSET)
-m@@OPTSTR_ARCH@@=PROCESSOR Generate code for the given PROCESSOR ISA.
diff --git a/gcc/config/loongarch/loongarch-cpu.cc b/gcc/config/loongarch/loongarch-cpu.cc
index 7a2866f60f9..f3a13414143 100644
--- a/gcc/config/loongarch/loongarch-cpu.cc
+++ b/gcc/config/loongarch/loongarch-cpu.cc
@@ -106,6 +106,10 @@ fill_native_cpu_config (struct loongarch_target *tgt)
native_cpu_type = CPU_LA464;
break;
+ case 0x0014d000: /* LA664 */
+ native_cpu_type = CPU_LA664;
+ break;
+
default:
/* Unknown PRID. */
if (tune_native_p)
diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c
index 430ef8b2d95..067629141b6 100644
--- a/gcc/config/loongarch/loongarch-def.c
+++ b/gcc/config/loongarch/loongarch-def.c
@@ -28,6 +28,7 @@ loongarch_cpu_strings[N_TUNE_TYPES] = {
[CPU_ABI_DEFAULT] = STR_CPU_ABI_DEFAULT,
[CPU_LOONGARCH64] = STR_CPU_LOONGARCH64,
[CPU_LA464] = STR_CPU_LA464,
+ [CPU_LA664] = STR_CPU_LA664,
};
struct loongarch_isa
@@ -42,6 +43,11 @@ loongarch_cpu_default_isa[N_ARCH_TYPES] = {
.fpu = ISA_EXT_FPU64,
.simd = ISA_EXT_SIMD_LASX,
},
+ [CPU_LA664] = {
+ .base = ISA_BASE_LA64V110,
+ .fpu = ISA_EXT_FPU64,
+ .simd = ISA_EXT_SIMD_LASX,
+ },
};
struct loongarch_cache
@@ -58,6 +64,12 @@ loongarch_cpu_cache[N_TUNE_TYPES] = {
.l2d_size = 256,
.simultaneous_prefetches = 4,
},
+ [CPU_LA664] = {
+ .l1d_line_size = 64,
+ .l1d_size = 64,
+ .l2d_size = 256,
+ .simultaneous_prefetches = 4,
+ },
};
struct loongarch_align
@@ -70,6 +82,10 @@ loongarch_cpu_align[N_TUNE_TYPES] = {
.function = "32",
.label = "16",
},
+ [CPU_LA664] = {
+ .function = "32",
+ .label = "16",
+ },
};
@@ -104,6 +120,9 @@ loongarch_cpu_rtx_cost_data[N_TUNE_TYPES] = {
[CPU_LA464] = {
DEFAULT_COSTS
},
+ [CPU_LA664] = {
+ DEFAULT_COSTS
+ },
};
/* RTX costs to use when optimizing for size. */
@@ -127,6 +146,7 @@ loongarch_cpu_issue_rate[N_TUNE_TYPES] = {
[CPU_NATIVE] = 4,
[CPU_LOONGARCH64] = 4,
[CPU_LA464] = 4,
+ [CPU_LA664] = 6,
};
int
@@ -134,6 +154,7 @@ loongarch_cpu_multipass_dfa_lookahead[N_TUNE_TYPES] = {
[CPU_NATIVE] = 4,
[CPU_LOONGARCH64] = 4,
[CPU_LA464] = 4,
+ [CPU_LA664] = 6,
};
/* Wiring string definitions from loongarch-str.h to global arrays
diff --git a/gcc/config/loongarch/loongarch-def.h b/gcc/config/loongarch/loongarch-def.h
index 6e2a6987910..db497f3ffe2 100644
--- a/gcc/config/loongarch/loongarch-def.h
+++ b/gcc/config/loongarch/loongarch-def.h
@@ -55,7 +55,8 @@ extern "C" {
/* enum isa_base */
extern const char* loongarch_isa_base_strings[];
#define ISA_BASE_LA64V100 0
-#define N_ISA_BASE_TYPES 1
+#define ISA_BASE_LA64V110 1
+#define N_ISA_BASE_TYPES 2
/* enum isa_ext_* */
extern const char* loongarch_isa_ext_strings[];
@@ -141,8 +142,9 @@ struct loongarch_target
#define CPU_ABI_DEFAULT 1
#define CPU_LOONGARCH64 2
#define CPU_LA464 3
-#define N_ARCH_TYPES 4
-#define N_TUNE_TYPES 4
+#define CPU_LA664 4
+#define N_ARCH_TYPES 5
+#define N_TUNE_TYPES 5
/* parallel tables. */
extern const char* loongarch_cpu_strings[];
diff --git a/gcc/config/loongarch/loongarch-opts.cc b/gcc/config/loongarch/loongarch-opts.cc
index e5921189a06..67a59152a01 100644
--- a/gcc/config/loongarch/loongarch-opts.cc
+++ b/gcc/config/loongarch/loongarch-opts.cc
@@ -552,17 +552,17 @@ isa_default_abi (const struct loongarch_isa *isa)
switch (isa->fpu)
{
case ISA_EXT_FPU64:
- if (isa->base == ISA_BASE_LA64V100)
+ if (isa->base >= ISA_BASE_LA64V100)
abi.base = ABI_BASE_LP64D;
break;
case ISA_EXT_FPU32:
- if (isa->base == ISA_BASE_LA64V100)
+ if (isa->base >= ISA_BASE_LA64V100)
abi.base = ABI_BASE_LP64F;
break;
case ISA_EXT_NONE:
- if (isa->base == ISA_BASE_LA64V100)
+ if (isa->base >= ISA_BASE_LA64V100)
abi.base = ABI_BASE_LP64S;
break;
@@ -582,7 +582,7 @@ isa_base_compat_p (const struct loongarch_isa *set1,
switch (set2->base)
{
case ISA_BASE_LA64V100:
- return (set1->base == ISA_BASE_LA64V100);
+ return (set1->base >= ISA_BASE_LA64V100);
default:
gcc_unreachable ();
diff --git a/gcc/config/loongarch/loongarch-opts.h b/gcc/config/loongarch/loongarch-opts.h
index 6dd309aad96..0e1b3e528a1 100644
--- a/gcc/config/loongarch/loongarch-opts.h
+++ b/gcc/config/loongarch/loongarch-opts.h
@@ -76,7 +76,8 @@ loongarch_update_gcc_opt_status (struct loongarch_target *target,
#define TARGET_DOUBLE_FLOAT (la_target.isa.fpu == ISA_EXT_FPU64)
#define TARGET_DOUBLE_FLOAT_ABI (la_target.abi.base == ABI_BASE_LP64D)
-#define TARGET_64BIT (la_target.isa.base == ISA_BASE_LA64V100)
+#define TARGET_64BIT (la_target.isa.base == ISA_BASE_LA64V100 \
+ || la_target.isa.base == ISA_BASE_LA64V110)
#define TARGET_ABI_LP64 (la_target.abi.base == ABI_BASE_LP64D \
|| la_target.abi.base == ABI_BASE_LP64F \
|| la_target.abi.base == ABI_BASE_LP64S)
@@ -88,6 +89,7 @@ loongarch_update_gcc_opt_status (struct loongarch_target *target,
/* TARGET_ macros for use in *.md template conditionals */
#define TARGET_uARCH_LA464 (la_target.cpu_tune == CPU_LA464)
+#define TARGET_uARCH_LA664 (la_target.cpu_tune == CPU_LA664)
/* Note: optimize_size may vary across functions,
while -m[no]-memcpy imposes a global constraint. */
diff --git a/gcc/config/loongarch/loongarch-str.h b/gcc/config/loongarch/loongarch-str.h
index 072558c28f1..fc4f41bfc1e 100644
--- a/gcc/config/loongarch/loongarch-str.h
+++ b/gcc/config/loongarch/loongarch-str.h
@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see
#define STR_CPU_ABI_DEFAULT "abi-default"
#define STR_CPU_LOONGARCH64 "loongarch64"
#define STR_CPU_LA464 "la464"
+#define STR_CPU_LA664 "la664"
#define STR_ISA_BASE_LA64V100 "la64"
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 22ca24a1878..4cd509f11c6 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -10177,6 +10177,7 @@ loongarch_cpu_sched_reassociation_width (struct loongarch_target *target,
{
case CPU_LOONGARCH64:
case CPU_LA464:
+ case CPU_LA664:
/* Vector part. */
if (LSX_SUPPORTED_MODE_P (mode) || LASX_SUPPORTED_MODE_P (mode))
{
diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt
index a5988411fbb..7f129e53ba5 100644
--- a/gcc/config/loongarch/loongarch.opt
+++ b/gcc/config/loongarch/loongarch.opt
@@ -114,6 +114,9 @@ Enum(cpu_type) String(loongarch64) Value(CPU_LOONGARCH64)
EnumValue
Enum(cpu_type) String(la464) Value(CPU_LA464)
+EnumValue
+Enum(cpu_type) String(la664) Value(CPU_LA664)
+
march=
Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) Init(M_OPT_UNSET)
-march=PROCESSOR Generate code for the given PROCESSOR ISA.
--
2.33.0
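
For context, the PRID case added to fill_native_cpu_config above is what
lets -march=native and -mtune=native select la664 on real hardware.  A
minimal user-space sketch of the same classification, assuming
<larchintrin.h> is available and that revision bits are masked off as in
fill_native_cpu_config (the 0x00ffff00 mask here is an assumption):

#include <larchintrin.h>

/* Return nonzero when the executing core reports the LA664 PRID.  */
int
running_on_la664 (void)
{
  unsigned int prid = __cpucfg (0);  /* CPUCFG word 0 holds the PRID.  */
  return (prid & 0x00ffff00) == 0x0014d000;
}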


@@ -0,0 +1,166 @@
From 7cfe6e057045ac794afbe9097b1b211c0e1ea723 Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chenglulu@loongson.cn>
Date: Thu, 6 Apr 2023 16:02:07 +0800
Subject: [PATCH 039/124] LoongArch: Add built-in functions description of
LoongArch Base instruction set instructions.
gcc/ChangeLog:
* doc/extend.texi: Add section for LoongArch Base Built-in functions.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/doc/extend.texi | 129 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 129 insertions(+)
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 3c101ca89..1d1bac255 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14678,6 +14678,7 @@ instructions, but allow the compiler to schedule those calls.
* Blackfin Built-in Functions::
* BPF Built-in Functions::
* FR-V Built-in Functions::
+* LoongArch Base Built-in Functions::
* MIPS DSP Built-in Functions::
* MIPS Paired-Single Support::
* MIPS Loongson Built-in Functions::
@@ -16128,6 +16129,134 @@ Use the @code{nldub} instruction to load the contents of address @var{x}
into the data cache. The instruction is issued in slot I1@.
@end table
+@node LoongArch Base Built-in Functions
+@subsection LoongArch Base Built-in Functions
+
+These built-in functions are available for LoongArch.
+
+Data Type Description:
+@itemize
+@item @code{imm0_31}, a compile-time constant in range 0 to 31;
+@item @code{imm0_16383}, a compile-time constant in range 0 to 16383;
+@item @code{imm0_32767}, a compile-time constant in range 0 to 32767;
+@item @code{imm_n2048_2047}, a compile-time constant in range -2048 to 2047;
+@end itemize
+
+The intrinsics provided are listed below:
+@smallexample
+ unsigned int __builtin_loongarch_movfcsr2gr (imm0_31)
+ void __builtin_loongarch_movgr2fcsr (imm0_31, unsigned int)
+ void __builtin_loongarch_cacop_d (imm0_31, unsigned long int, imm_n2048_2047)
+ unsigned int __builtin_loongarch_cpucfg (unsigned int)
+ void __builtin_loongarch_asrtle_d (long int, long int)
+ void __builtin_loongarch_asrtgt_d (long int, long int)
+ long int __builtin_loongarch_lddir_d (long int, imm0_31)
+ void __builtin_loongarch_ldpte_d (long int, imm0_31)
+
+ int __builtin_loongarch_crc_w_b_w (char, int)
+ int __builtin_loongarch_crc_w_h_w (short, int)
+ int __builtin_loongarch_crc_w_w_w (int, int)
+ int __builtin_loongarch_crc_w_d_w (long int, int)
+ int __builtin_loongarch_crcc_w_b_w (char, int)
+ int __builtin_loongarch_crcc_w_h_w (short, int)
+ int __builtin_loongarch_crcc_w_w_w (int, int)
+ int __builtin_loongarch_crcc_w_d_w (long int, int)
+
+ unsigned int __builtin_loongarch_csrrd_w (imm0_16383)
+ unsigned int __builtin_loongarch_csrwr_w (unsigned int, imm0_16383)
+ unsigned int __builtin_loongarch_csrxchg_w (unsigned int, unsigned int, imm0_16383)
+ unsigned long int __builtin_loongarch_csrrd_d (imm0_16383)
+ unsigned long int __builtin_loongarch_csrwr_d (unsigned long int, imm0_16383)
+ unsigned long int __builtin_loongarch_csrxchg_d (unsigned long int, unsigned long int, imm0_16383)
+
+ unsigned char __builtin_loongarch_iocsrrd_b (unsigned int)
+ unsigned short __builtin_loongarch_iocsrrd_h (unsigned int)
+ unsigned int __builtin_loongarch_iocsrrd_w (unsigned int)
+ unsigned long int __builtin_loongarch_iocsrrd_d (unsigned int)
+ void __builtin_loongarch_iocsrwr_b (unsigned char, unsigned int)
+ void __builtin_loongarch_iocsrwr_h (unsigned short, unsigned int)
+ void __builtin_loongarch_iocsrwr_w (unsigned int, unsigned int)
+ void __builtin_loongarch_iocsrwr_d (unsigned long int, unsigned int)
+
+ void __builtin_loongarch_dbar (imm0_32767)
+ void __builtin_loongarch_ibar (imm0_32767)
+
+ void __builtin_loongarch_syscall (imm0_32767)
+ void __builtin_loongarch_break (imm0_32767)
+@end smallexample
+
+@emph{Note:} The control registers may be 32-bit or 64-bit, but the
+access instructions do not distinguish between the two widths, so GCC
+renames the control instructions when implementing these intrinsics.
+
+Taking the csrrd instruction as an example, the built-in functions are implemented as follows:
+@smallexample
+ __builtin_loongarch_csrrd_w // Use when reading a 32-bit control register.
+ __builtin_loongarch_csrrd_d // Use when reading a 64-bit control register.
+@end smallexample
+
+For convenience, the built-in functions are wrapped in shorter
+functions; these wrappers, together with the types @code{__drdtime_t}
+and @code{__rdtime_t}, are defined in @code{larchintrin.h}.  So to call
+the following functions you need to include @code{larchintrin.h}.
+
+@smallexample
+ typedef struct drdtime@{
+ unsigned long dvalue;
+ unsigned long dtimeid;
+ @} __drdtime_t;
+
+ typedef struct rdtime@{
+ unsigned int value;
+ unsigned int timeid;
+ @} __rdtime_t;
+@end smallexample
+
+@smallexample
+ __drdtime_t __rdtime_d (void)
+ __rdtime_t __rdtimel_w (void)
+ __rdtime_t __rdtimeh_w (void)
+ unsigned int __movfcsr2gr (imm0_31)
+ void __movgr2fcsr (imm0_31, unsigned int)
+ void __cacop_d (imm0_31, unsigned long, imm_n2048_2047)
+ unsigned int __cpucfg (unsigned int)
+ void __asrtle_d (long int, long int)
+ void __asrtgt_d (long int, long int)
+ long int __lddir_d (long int, imm0_31)
+ void __ldpte_d (long int, imm0_31)
+
+ int __crc_w_b_w (char, int)
+ int __crc_w_h_w (short, int)
+ int __crc_w_w_w (int, int)
+ int __crc_w_d_w (long int, int)
+ int __crcc_w_b_w (char, int)
+ int __crcc_w_h_w (short, int)
+ int __crcc_w_w_w (int, int)
+ int __crcc_w_d_w (long int, int)
+
+ unsigned int __csrrd_w (imm0_16383)
+ unsigned int __csrwr_w (unsigned int, imm0_16383)
+ unsigned int __csrxchg_w (unsigned int, unsigned int, imm0_16383)
+ unsigned long __csrrd_d (imm0_16383)
+ unsigned long __csrwr_d (unsigned long, imm0_16383)
+ unsigned long __csrxchg_d (unsigned long, unsigned long, imm0_16383)
+
+ unsigned char __iocsrrd_b (unsigned int)
+ unsigned short __iocsrrd_h (unsigned int)
+ unsigned int __iocsrrd_w (unsigned int)
+ unsigned long __iocsrrd_d (unsigned int)
+ void __iocsrwr_b (unsigned char, unsigned int)
+ void __iocsrwr_h (unsigned short, unsigned int)
+ void __iocsrwr_w (unsigned int, unsigned int)
+ void __iocsrwr_d (unsigned long, unsigned int)
+
+ void __dbar (imm0_32767)
+ void __ibar (imm0_32767)
+
+ void __syscall (imm0_32767)
+ void __break (imm0_32767)
+@end smallexample
+
@node MIPS DSP Built-in Functions
@subsection MIPS DSP Built-in Functions
--
2.33.0
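
A minimal usage sketch for the wrapped intrinsics documented above,
assuming a LoongArch64 target whose compiler ships <larchintrin.h>:
__rdtime_d reads the stable counter and __cpucfg reads one CPU
configuration word.

#include <stdio.h>
#include <larchintrin.h>

int
main (void)
{
  __drdtime_t t = __rdtime_d ();   /* stable counter value + counter id */
  unsigned int w0 = __cpucfg (0);  /* CPUCFG word 0 (PRID) */

  printf ("counter=%lu id=%lu cpucfg0=0x%x\n", t.dvalue, t.dtimeid, w0);
  return 0;
}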


@@ -0,0 +1,107 @@
From 41a4945886631a1b2898ae957389d5db18a07141 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Fri, 4 Nov 2022 15:12:22 +0800
Subject: [PATCH 025/124] LoongArch: Add fcopysign instructions
Add fcopysign.{s,d} with the names copysign{sf,df}3 so GCC will expand
__builtin_copysign{f,} to a single instruction.
Link: https://sourceware.org/pipermail/libc-alpha/2022-November/143177.html
gcc/ChangeLog:
* config/loongarch/loongarch.md (UNSPEC_FCOPYSIGN): New unspec.
(type): Add fcopysign.
(copysign<mode>3): New instruction template.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/fcopysign.c: New test.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config/loongarch/loongarch.md | 22 ++++++++++++++++++-
.../gcc.target/loongarch/fcopysign.c | 16 ++++++++++++++
2 files changed, 37 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/fcopysign.c
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 214b14bdd..bda34d0f3 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -37,6 +37,7 @@
UNSPEC_FCLASS
UNSPEC_FMAX
UNSPEC_FMIN
+ UNSPEC_FCOPYSIGN
;; Override return address for exception handling.
UNSPEC_EH_RETURN
@@ -214,6 +215,7 @@
;; fabs floating point absolute value
;; fneg floating point negation
;; fcmp floating point compare
+;; fcopysign floating point copysign
;; fcvt floating point convert
;; fsqrt floating point square root
;; frsqrt floating point reciprocal square root
@@ -226,7 +228,7 @@
"unknown,branch,jump,call,load,fpload,fpidxload,store,fpstore,fpidxstore,
prefetch,prefetchx,condmove,mgtf,mftg,const,arith,logical,
shift,slt,signext,clz,trap,imul,idiv,move,
- fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,fneg,fcmp,fcvt,fsqrt,
+ fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,fneg,fcmp,fcopysign,fcvt,fsqrt,
frsqrt,accext,accmod,multi,atomic,syncloop,nop,ghost"
(cond [(eq_attr "jirl" "!unset") (const_string "call")
(eq_attr "got" "load") (const_string "load")
@@ -976,6 +978,24 @@
(set_attr "mode" "<UNITMODE>")])
;;
+;; ....................
+;;
+;; FLOATING POINT COPYSIGN
+;;
+;; ....................
+
+(define_insn "copysign<mode>3"
+ [(set (match_operand:ANYF 0 "register_operand" "=f")
+ (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")
+ (match_operand:ANYF 2 "register_operand" "f")]
+ UNSPEC_FCOPYSIGN))]
+ "TARGET_HARD_FLOAT"
+ "fcopysign.<fmt>\t%0,%1,%2"
+ [(set_attr "type" "fcopysign")
+ (set_attr "mode" "<UNITMODE>")])
+
+
+;;
;; ...................
;;
;; Count leading zeroes.
diff --git a/gcc/testsuite/gcc.target/loongarch/fcopysign.c b/gcc/testsuite/gcc.target/loongarch/fcopysign.c
new file mode 100644
index 000000000..058ba2cf5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/fcopysign.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mdouble-float" } */
+/* { dg-final { scan-assembler "fcopysign\\.s" } } */
+/* { dg-final { scan-assembler "fcopysign\\.d" } } */
+
+double
+my_copysign (double a, double b)
+{
+ return __builtin_copysign (a, b);
+}
+
+float
+my_copysignf (float a, float b)
+{
+ return __builtin_copysignf (a, b);
+}
--
2.33.0
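
Since fcopysign.{s,d} only transfers the sign bit, the single
instruction replaces the usual load-mask-or bit manipulation.  A
portable C model of the double-precision semantics (a sketch of what
the instruction computes, not of the expander):

#include <stdint.h>
#include <string.h>

/* Magnitude of A combined with the sign bit of B, as fcopysign.d does.  */
double
copysign_model (double a, double b)
{
  uint64_t ua, ub;
  memcpy (&ua, &a, sizeof ua);
  memcpy (&ub, &b, sizeof ub);
  ua = (ua & ~(1ULL << 63)) | (ub & (1ULL << 63));
  memcpy (&a, &ua, sizeof ua);
  return a;
}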


@@ -0,0 +1,123 @@
From 2ae587a86bba31b91a127e353c31c9f861ff5326 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Tue, 8 Nov 2022 13:42:20 +0800
Subject: [PATCH 030/124] LoongArch: Add flogb.{s,d} instructions and expand
logb{sf,df}2
On LoongArch, the flogb instructions extract the exponent of a
non-negative floating-point value but produce NaN for negative values,
so we need to add a fabs instruction when we expand logb.
gcc/ChangeLog:
* config/loongarch/loongarch.md (UNSPEC_FLOGB): New unspec.
(type): Add flogb.
(logb_non_negative<mode>2): New instruction template.
(logb<mode>2): New define_expand.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/flogb.c: New test.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config/loongarch/loongarch.md | 35 ++++++++++++++++++++--
gcc/testsuite/gcc.target/loongarch/flogb.c | 18 +++++++++++
2 files changed, 51 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/flogb.c
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index c141c9add..682ab9617 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -42,6 +42,7 @@
UNSPEC_FTINTRM
UNSPEC_FTINTRP
UNSPEC_FSCALEB
+ UNSPEC_FLOGB
;; Override return address for exception handling.
UNSPEC_EH_RETURN
@@ -217,6 +218,7 @@
;; fdiv floating point divide
;; frdiv floating point reciprocal divide
;; fabs floating point absolute value
+;; flogb floating point exponent extract
;; fneg floating point negation
;; fcmp floating point compare
;; fcopysign floating point copysign
@@ -233,8 +235,8 @@
"unknown,branch,jump,call,load,fpload,fpidxload,store,fpstore,fpidxstore,
prefetch,prefetchx,condmove,mgtf,mftg,const,arith,logical,
shift,slt,signext,clz,trap,imul,idiv,move,
- fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,fneg,fcmp,fcopysign,fcvt,fscaleb,
- fsqrt,frsqrt,accext,accmod,multi,atomic,syncloop,nop,ghost"
+ fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,flogb,fneg,fcmp,fcopysign,fcvt,
+ fscaleb,fsqrt,frsqrt,accext,accmod,multi,atomic,syncloop,nop,ghost"
(cond [(eq_attr "jirl" "!unset") (const_string "call")
(eq_attr "got" "load") (const_string "load")
@@ -1039,6 +1041,35 @@
(set_attr "mode" "<UNITMODE>")])
;;
+;; ....................
+;;
+;; FLOATING POINT EXPONENT EXTRACT
+;;
+;; ....................
+
+(define_insn "logb_non_negative<mode>2"
+ [(set (match_operand:ANYF 0 "register_operand" "=f")
+ (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")]
+ UNSPEC_FLOGB))]
+ "TARGET_HARD_FLOAT"
+ "flogb.<fmt>\t%0,%1"
+ [(set_attr "type" "flogb")
+ (set_attr "mode" "<UNITMODE>")])
+
+(define_expand "logb<mode>2"
+ [(set (match_operand:ANYF 0 "register_operand")
+ (unspec:ANYF [(abs:ANYF (match_operand:ANYF 1 "register_operand"))]
+ UNSPEC_FLOGB))]
+ "TARGET_HARD_FLOAT"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+
+ emit_insn (gen_abs<mode>2 (tmp, operands[1]));
+ emit_insn (gen_logb_non_negative<mode>2 (operands[0], tmp));
+ DONE;
+})
+
+;;
;; ...................
;;
;; Count leading zeroes.
diff --git a/gcc/testsuite/gcc.target/loongarch/flogb.c b/gcc/testsuite/gcc.target/loongarch/flogb.c
new file mode 100644
index 000000000..1daefe54e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/flogb.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-mdouble-float -fno-math-errno" } */
+/* { dg-final { scan-assembler "fabs\\.s" } } */
+/* { dg-final { scan-assembler "fabs\\.d" } } */
+/* { dg-final { scan-assembler "flogb\\.s" } } */
+/* { dg-final { scan-assembler "flogb\\.d" } } */
+
+double
+my_logb (double a)
+{
+ return __builtin_logb (a);
+}
+
+float
+my_logbf (float a)
+{
+ return __builtin_logbf (a);
+}
--
2.33.0
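
The fabs in the expander is what keeps the C semantics: logb is defined
on the magnitude of its argument, while a bare flogb would return NaN
for negative inputs.  A quick check of the values the expansion must
produce:

#include <stdio.h>

int
main (void)
{
  printf ("%f\n", __builtin_logb (-8.0));  /* 3.000000:  |-8.0| = 2^3 */
  printf ("%f\n", __builtin_logb (0.375)); /* -2.000000: 0.375 = 1.5 * 2^-2 */
  return 0;
}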


@@ -0,0 +1,155 @@
From e3d69a3b7a4e00e8bba88b8b4abaa1c17bc083d5 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Tue, 8 Nov 2022 12:14:35 +0800
Subject: [PATCH 029/124] LoongArch: Add fscaleb.{s,d} instructions as
ldexp{sf,df}3
This allows optimizing __builtin_ldexp{,f} and __builtin_scalbn{,f} with
-fno-math-errno.
IMODE is added because we can't hard-code SI for operand 2: the
fscaleb.d instruction always takes the high half of both source
registers into account.  See my_ldexp_long in the test case.
gcc/ChangeLog:
* config/loongarch/loongarch.md (UNSPEC_FSCALEB): New unspec.
(type): Add fscaleb.
(IMODE): New mode attr.
(ldexp<mode>3): New instruction template.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/fscaleb.c: New test.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config/loongarch/loongarch.md | 26 ++++++++++-
gcc/testsuite/gcc.target/loongarch/fscaleb.c | 48 ++++++++++++++++++++
2 files changed, 72 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/fscaleb.c
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index eb127c346..c141c9add 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -41,6 +41,7 @@
UNSPEC_FTINT
UNSPEC_FTINTRM
UNSPEC_FTINTRP
+ UNSPEC_FSCALEB
;; Override return address for exception handling.
UNSPEC_EH_RETURN
@@ -220,6 +221,7 @@
;; fcmp floating point compare
;; fcopysign floating point copysign
;; fcvt floating point convert
+;; fscaleb floating point scale
;; fsqrt floating point square root
;; frsqrt floating point reciprocal square root
;; multi multiword sequence (or user asm statements)
@@ -231,8 +233,8 @@
"unknown,branch,jump,call,load,fpload,fpidxload,store,fpstore,fpidxstore,
prefetch,prefetchx,condmove,mgtf,mftg,const,arith,logical,
shift,slt,signext,clz,trap,imul,idiv,move,
- fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,fneg,fcmp,fcopysign,fcvt,fsqrt,
- frsqrt,accext,accmod,multi,atomic,syncloop,nop,ghost"
+ fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,fneg,fcmp,fcopysign,fcvt,fscaleb,
+ fsqrt,frsqrt,accext,accmod,multi,atomic,syncloop,nop,ghost"
(cond [(eq_attr "jirl" "!unset") (const_string "call")
(eq_attr "got" "load") (const_string "load")
@@ -418,6 +420,10 @@
;; the controlling mode.
(define_mode_attr HALFMODE [(DF "SI") (DI "SI") (TF "DI")])
+;; This attribute gives the integer mode that has the same size of a
+;; floating-point mode.
+(define_mode_attr IMODE [(SF "SI") (DF "DI")])
+
;; This code iterator allows signed and unsigned widening multiplications
;; to use the same template.
(define_code_iterator any_extend [sign_extend zero_extend])
@@ -1014,7 +1020,23 @@
"fcopysign.<fmt>\t%0,%1,%2"
[(set_attr "type" "fcopysign")
(set_attr "mode" "<UNITMODE>")])
+
+;;
+;; ....................
+;;
+;; FLOATING POINT SCALE
+;;
+;; ....................
+(define_insn "ldexp<mode>3"
+ [(set (match_operand:ANYF 0 "register_operand" "=f")
+ (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")
+ (match_operand:<IMODE> 2 "register_operand" "f")]
+ UNSPEC_FSCALEB))]
+ "TARGET_HARD_FLOAT"
+ "fscaleb.<fmt>\t%0,%1,%2"
+ [(set_attr "type" "fscaleb")
+ (set_attr "mode" "<UNITMODE>")])
;;
;; ...................
diff --git a/gcc/testsuite/gcc.target/loongarch/fscaleb.c b/gcc/testsuite/gcc.target/loongarch/fscaleb.c
new file mode 100644
index 000000000..f18470fbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/fscaleb.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mdouble-float -fno-math-errno" } */
+/* { dg-final { scan-assembler-times "fscaleb\\.s" 3 } } */
+/* { dg-final { scan-assembler-times "fscaleb\\.d" 4 } } */
+/* { dg-final { scan-assembler-times "slli\\.w" 1 } } */
+
+double
+my_scalbln (double a, long b)
+{
+ return __builtin_scalbln (a, b);
+}
+
+double
+my_scalbn (double a, int b)
+{
+ return __builtin_scalbn (a, b);
+}
+
+double
+my_ldexp (double a, int b)
+{
+ return __builtin_ldexp (a, b);
+}
+
+float
+my_scalblnf (float a, long b)
+{
+ return __builtin_scalblnf (a, b);
+}
+
+float
+my_scalbnf (float a, int b)
+{
+ return __builtin_scalbnf (a, b);
+}
+
+float
+my_ldexpf (float a, int b)
+{
+ return __builtin_ldexpf (a, b);
+}
+
+/* b must be sign-extended */
+double
+my_ldexp_long (double a, long b)
+{
+ return __builtin_ldexp (a, b);
+}
--
2.33.0
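
fscaleb.{s,d} computes x * 2^n, which is exactly the ldexp/scalbn
operation, so with -fno-math-errno each call below should collapse to a
single instruction.  A small semantic check:

#include <stdio.h>

int
main (void)
{
  printf ("%f\n", __builtin_ldexp (3.0, 4));      /* 3 * 2^4  = 48.000000 */
  printf ("%f\n", __builtin_scalbnf (1.0f, -3));  /* 1 * 2^-3 = 0.125000  */
  return 0;
}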


@@ -0,0 +1,220 @@
From 76d599c6d8f9cf78b51cd76a7ca8fbe11e2cda2b Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Sun, 6 Nov 2022 23:16:49 +0800
Subject: [PATCH 028/124] LoongArch: Add ftint{,rm,rp}.{w,l}.{s,d} instructions
This allows the following built-ins to be optimized with -fno-math-errno:
- __builtin_lrint{,f}
- __builtin_lfloor{,f}
- __builtin_lceil{,f}
Inspired by
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605287.html.
ANYFI is added so the compiler won't try ftint.l.s with -mfpu=32.  If we
simply used a GPR here, an ICE would be triggered by __builtin_lrintf
with -mfpu=32.

The ftint{rm,rp} instructions may raise the inexact exception, so they
can only be used with -ffp-int-builtin-inexact or -fno-trapping-math.
Note that the .w.{s,d} variants are not tested because we don't support
ILP32 for now.
gcc/ChangeLog:
* config/loongarch/loongarch.md (UNSPEC_FTINT): New unspec.
(UNSPEC_FTINTRM): Likewise.
(UNSPEC_FTINTRP): Likewise.
(LRINT): New define_int_iterator.
(lrint_pattern): New define_int_attr.
(lrint_submenmonic): Likewise.
(lrint_allow_inexact): Likewise.
(ANYFI): New define_mode_iterator.
(lrint<ANYF><ANYFI>): New instruction template.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/ftint.c: New test.
* gcc.target/loongarch/ftint-no-inexact.c: New test.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config/loongarch/loongarch.md | 34 ++++++++++++++
.../gcc.target/loongarch/ftint-no-inexact.c | 44 +++++++++++++++++++
gcc/testsuite/gcc.target/loongarch/ftint.c | 44 +++++++++++++++++++
3 files changed, 122 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/ftint-no-inexact.c
create mode 100644 gcc/testsuite/gcc.target/loongarch/ftint.c
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index a14ab14ac..eb127c346 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -38,6 +38,9 @@
UNSPEC_FMAX
UNSPEC_FMIN
UNSPEC_FCOPYSIGN
+ UNSPEC_FTINT
+ UNSPEC_FTINTRM
+ UNSPEC_FTINTRP
;; Override return address for exception handling.
UNSPEC_EH_RETURN
@@ -374,6 +377,11 @@
(define_mode_iterator ANYF [(SF "TARGET_HARD_FLOAT")
(DF "TARGET_DOUBLE_FLOAT")])
+;; Iterator for fixed-point modes which can be held in a hardware
+;; floating-point register.
+(define_mode_iterator ANYFI [(SI "TARGET_HARD_FLOAT")
+ (DI "TARGET_DOUBLE_FLOAT")])
+
;; A mode for which moves involving FPRs may need to be split.
(define_mode_iterator SPLITF
[(DF "!TARGET_64BIT && TARGET_DOUBLE_FLOAT")
@@ -515,6 +523,19 @@
(define_code_attr sel [(eq "masknez") (ne "maskeqz")])
(define_code_attr selinv [(eq "maskeqz") (ne "masknez")])
+;; Iterator and attributes for floating-point to fixed-point conversion
+;; instructions.
+(define_int_iterator LRINT [UNSPEC_FTINT UNSPEC_FTINTRM UNSPEC_FTINTRP])
+(define_int_attr lrint_pattern [(UNSPEC_FTINT "lrint")
+ (UNSPEC_FTINTRM "lfloor")
+ (UNSPEC_FTINTRP "lceil")])
+(define_int_attr lrint_submenmonic [(UNSPEC_FTINT "")
+ (UNSPEC_FTINTRM "rm")
+ (UNSPEC_FTINTRP "rp")])
+(define_int_attr lrint_allow_inexact [(UNSPEC_FTINT "1")
+ (UNSPEC_FTINTRM "0")
+ (UNSPEC_FTINTRP "0")])
+
;;
;; ....................
;;
@@ -2022,6 +2043,19 @@
[(set_attr "type" "fcvt")
(set_attr "mode" "<MODE>")])
+;; Convert floating-point numbers to integers
+(define_insn "<lrint_pattern><ANYF:mode><ANYFI:mode>2"
+ [(set (match_operand:ANYFI 0 "register_operand" "=f")
+ (unspec:ANYFI [(match_operand:ANYF 1 "register_operand" "f")]
+ LRINT))]
+ "TARGET_HARD_FLOAT &&
+ (<lrint_allow_inexact>
+ || flag_fp_int_builtin_inexact
+ || !flag_trapping_math)"
+ "ftint<lrint_submenmonic>.<ANYFI:ifmt>.<ANYF:fmt> %0,%1"
+ [(set_attr "type" "fcvt")
+ (set_attr "mode" "<ANYF:MODE>")])
+
;; Load the low word of operand 0 with operand 1.
(define_insn "load_low<mode>"
[(set (match_operand:SPLITF 0 "register_operand" "=f,f")
diff --git a/gcc/testsuite/gcc.target/loongarch/ftint-no-inexact.c b/gcc/testsuite/gcc.target/loongarch/ftint-no-inexact.c
new file mode 100644
index 000000000..88b83a9c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/ftint-no-inexact.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -mdouble-float -fno-math-errno -fno-fp-int-builtin-inexact" } */
+/* { dg-final { scan-assembler "ftint\\.l\\.s" } } */
+/* { dg-final { scan-assembler "ftint\\.l\\.d" } } */
+/* { dg-final { scan-assembler-not "ftintrm\\.l\\.s" } } */
+/* { dg-final { scan-assembler-not "ftintrm\\.l\\.d" } } */
+/* { dg-final { scan-assembler-not "ftintrp\\.l\\.s" } } */
+/* { dg-final { scan-assembler-not "ftintrp\\.l\\.d" } } */
+
+long
+my_lrint (double a)
+{
+ return __builtin_lrint (a);
+}
+
+long
+my_lrintf (float a)
+{
+ return __builtin_lrintf (a);
+}
+
+long
+my_lfloor (double a)
+{
+ return __builtin_lfloor (a);
+}
+
+long
+my_lfloorf (float a)
+{
+ return __builtin_lfloorf (a);
+}
+
+long
+my_lceil (double a)
+{
+ return __builtin_lceil (a);
+}
+
+long
+my_lceilf (float a)
+{
+ return __builtin_lceilf (a);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/ftint.c b/gcc/testsuite/gcc.target/loongarch/ftint.c
new file mode 100644
index 000000000..7a326a454
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/ftint.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -mdouble-float -fno-math-errno -ffp-int-builtin-inexact" } */
+/* { dg-final { scan-assembler "ftint\\.l\\.s" } } */
+/* { dg-final { scan-assembler "ftint\\.l\\.d" } } */
+/* { dg-final { scan-assembler "ftintrm\\.l\\.s" } } */
+/* { dg-final { scan-assembler "ftintrm\\.l\\.d" } } */
+/* { dg-final { scan-assembler "ftintrp\\.l\\.s" } } */
+/* { dg-final { scan-assembler "ftintrp\\.l\\.d" } } */
+
+long
+my_lrint (double a)
+{
+ return __builtin_lrint (a);
+}
+
+long
+my_lrintf (float a)
+{
+ return __builtin_lrintf (a);
+}
+
+long
+my_lfloor (double a)
+{
+ return __builtin_lfloor (a);
+}
+
+long
+my_lfloorf (float a)
+{
+ return __builtin_lfloorf (a);
+}
+
+long
+my_lceil (double a)
+{
+ return __builtin_lceil (a);
+}
+
+long
+my_lceilf (float a)
+{
+ return __builtin_lceilf (a);
+}
--
2.33.0
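
The three unspecs correspond to three C-level rounding behaviours:
lrint follows the current rounding mode (round-to-nearest-even by
default), lfloor always rounds down, and lceil always rounds up.  A
semantic check under the default floating-point environment:

#include <stdio.h>

int
main (void)
{
  printf ("%ld\n", __builtin_lrint (2.5));   /* 2: ties-to-even (ftint)   */
  printf ("%ld\n", __builtin_lfloor (2.9));  /* 2: round down   (ftintrm) */
  printf ("%ld\n", __builtin_lceil (2.1));   /* 3: round up     (ftintrp) */
  return 0;
}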


@@ -0,0 +1,158 @@
From 52a41006c2e8141a42de93ffcc2c040e034244b2 Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chenglulu@loongson.cn>
Date: Wed, 16 Nov 2022 09:25:14 +0800
Subject: [PATCH 031/124] LoongArch: Add prefetch instructions.
Enable sw prefetching at -O3 and higher.
Co-Authored-By: xujiahao <xujiahao@loongson.cn>
gcc/ChangeLog:
* config/loongarch/constraints.md (ZD): New constraint.
* config/loongarch/loongarch-def.c: Initialize the number of parallel prefetches.
* config/loongarch/loongarch-tune.h (struct loongarch_cache):
Define the number of parallel prefetches.
* config/loongarch/loongarch.cc (loongarch_option_override_internal):
Set up parameters to be used in the prefetching algorithm.
* config/loongarch/loongarch.md (prefetch): New template.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config/loongarch/constraints.md | 10 ++++++++++
gcc/config/loongarch/loongarch-def.c | 2 ++
gcc/config/loongarch/loongarch-tune.h | 1 +
gcc/config/loongarch/loongarch.cc | 28 +++++++++++++++++++++++++++
gcc/config/loongarch/loongarch.md | 14 ++++++++++++++
5 files changed, 55 insertions(+)
diff --git a/gcc/config/loongarch/constraints.md b/gcc/config/loongarch/constraints.md
index 43cb7b5f0..46f7f63ae 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -86,6 +86,10 @@
;; "ZB"
;; "An address that is held in a general-purpose register.
;; The offset is zero"
+;; "ZD"
+;; "An address operand whose address is formed by a base register
+;; and offset that is suitable for use in instructions with the same
+;; addressing mode as @code{preld}."
;; "<" "Matches a pre-dec or post-dec operand." (Global non-architectural)
;; ">" "Matches a pre-inc or post-inc operand." (Global non-architectural)
@@ -190,3 +194,9 @@
The offset is zero"
(and (match_code "mem")
(match_test "REG_P (XEXP (op, 0))")))
+
+(define_address_constraint "ZD"
+ "An address operand whose address is formed by a base register
+ and offset that is suitable for use in instructions with the same
+ addressing mode as @code{preld}."
+ (match_test "loongarch_12bit_offset_address_p (op, mode)"))
diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c
index cbf995d81..80ab10a52 100644
--- a/gcc/config/loongarch/loongarch-def.c
+++ b/gcc/config/loongarch/loongarch-def.c
@@ -62,11 +62,13 @@ loongarch_cpu_cache[N_TUNE_TYPES] = {
.l1d_line_size = 64,
.l1d_size = 64,
.l2d_size = 256,
+ .simultaneous_prefetches = 4,
},
[CPU_LA464] = {
.l1d_line_size = 64,
.l1d_size = 64,
.l2d_size = 256,
+ .simultaneous_prefetches = 4,
},
};
diff --git a/gcc/config/loongarch/loongarch-tune.h b/gcc/config/loongarch/loongarch-tune.h
index 6f3530f5c..8e3eb2947 100644
--- a/gcc/config/loongarch/loongarch-tune.h
+++ b/gcc/config/loongarch/loongarch-tune.h
@@ -45,6 +45,7 @@ struct loongarch_cache {
int l1d_line_size; /* bytes */
int l1d_size; /* KiB */
int l2d_size; /* kiB */
+ int simultaneous_prefetches; /* number of parallel prefetches */
};
#endif /* LOONGARCH_TUNE_H */
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index d552b162a..622c9435b 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -63,6 +63,7 @@ along with GCC; see the file COPYING3. If not see
#include "context.h"
#include "builtins.h"
#include "rtl-iter.h"
+#include "opts.h"
/* This file should be included last. */
#include "target-def.h"
@@ -6099,6 +6100,33 @@ loongarch_option_override_internal (struct gcc_options *opts)
if (loongarch_branch_cost == 0)
loongarch_branch_cost = loongarch_cost->branch_cost;
+ /* Set up parameters to be used in prefetching algorithm. */
+ int simultaneous_prefetches
+ = loongarch_cpu_cache[LARCH_ACTUAL_TUNE].simultaneous_prefetches;
+
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_simultaneous_prefetches,
+ simultaneous_prefetches);
+
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_l1_cache_line_size,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_line_size);
+
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_l1_cache_size,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l1d_size);
+
+ SET_OPTION_IF_UNSET (opts, &global_options_set,
+ param_l2_cache_size,
+ loongarch_cpu_cache[LARCH_ACTUAL_TUNE].l2d_size);
+
+
+ /* Enable sw prefetching at -O3 and higher. */
+ if (opts->x_flag_prefetch_loop_arrays < 0
+ && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
+ && !opts->x_optimize_size)
+ opts->x_flag_prefetch_loop_arrays = 1;
+
if (TARGET_DIRECT_EXTERN_ACCESS && flag_shlib)
error ("%qs cannot be used for compiling a shared library",
"-mdirect-extern-access");
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 682ab9617..2fda53819 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -3282,6 +3282,20 @@
;; ....................
;;
+(define_insn "prefetch"
+ [(prefetch (match_operand 0 "address_operand" "ZD")
+ (match_operand 1 "const_int_operand" "n")
+ (match_operand 2 "const_int_operand" "n"))]
+ ""
+{
+ switch (INTVAL (operands[1]))
+ {
+ case 0: return "preld\t0,%a0";
+ case 1: return "preld\t8,%a0";
+ default: gcc_unreachable ();
+ }
+})
+
(define_insn "nop"
[(const_int 0)]
""
--
2.33.0
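
At the source level this template is reached through __builtin_prefetch:
operand 1 is the read/write flag, so a read hint emits preld 0 and a
write hint emits preld 8, while the locality argument (operand 2) does
not affect the chosen hint.  A sketch:

/* Prefetch a few iterations ahead of each stream; the distance of 8
   elements is illustrative, not tuned.  */
void
axpy_warm (double *dst, const double *src, int n)
{
  for (int i = 0; i < n; i++)
    {
      __builtin_prefetch (&src[i + 8], 0, 3);  /* read hint  -> preld 0 */
      __builtin_prefetch (&dst[i + 8], 1, 3);  /* write hint -> preld 8 */
      dst[i] += 2.0 * src[i];
    }
}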


@@ -0,0 +1,794 @@
From b1c92fb9dab678e4c9c23fa77185011494d145b9 Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chenglulu@loongson.cn>
Date: Thu, 18 Aug 2022 17:26:13 +0800
Subject: [PATCH 011/124] LoongArch: Add support for code model extreme.
Use five instructions to calculate a signed 64-bit offset relative to the pc.
gcc/ChangeLog:
* config/loongarch/loongarch-opts.cc: Allow cmodel to be extreme.
* config/loongarch/loongarch.cc (loongarch_call_tls_get_addr):
Add extreme support for TLS GD and LD types.
(loongarch_legitimize_tls_address): Add extreme support for TLS LE
and IE.
(loongarch_split_symbol): When compiling with -mcmodel=extreme,
the symbol address will be obtained through five instructions.
(loongarch_print_operand_reloc): Add support.
(loongarch_print_operand): Add support.
(loongarch_print_operand_address): Add support.
(loongarch_option_override_internal): Set '-mcmodel=extreme' option
incompatible with '-mno-explicit-relocs'.
* config/loongarch/loongarch.md (@lui_l_hi20<mode>):
Load bits 12-31 of the data into the register.
(lui_h_lo20): Load bits 32-51 of the data while preserving bits 0-31
of the source register.
(lui_h_hi12): Load bits 52-63 of the data while preserving bits 0-51
of the source register.
* config/loongarch/predicates.md: Symbols need to be decomposed
when the macro TARGET_CMODEL_EXTREME is defined.
* doc/invoke.texi: Update the description of cmodel in the document.
Document -W[no-]extreme-plt.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/func-call-1.c: Add option '-mcmodel=normal'.
* gcc.target/loongarch/func-call-2.c: Likewise.
* gcc.target/loongarch/func-call-3.c: Likewise.
* gcc.target/loongarch/func-call-4.c: Likewise.
* gcc.target/loongarch/func-call-5.c: Likewise.
* gcc.target/loongarch/func-call-6.c: Likewise.
* gcc.target/loongarch/func-call-7.c: Likewise.
* gcc.target/loongarch/func-call-8.c: Likewise.
* gcc.target/loongarch/relocs-symbol-noaddend.c: Likewise.
* gcc.target/loongarch/func-call-extreme-1.c: New test.
* gcc.target/loongarch/func-call-extreme-2.c: New test.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/config/loongarch/loongarch-opts.cc | 3 +-
gcc/config/loongarch/loongarch.cc | 222 +++++++++++++++---
gcc/config/loongarch/loongarch.md | 34 ++-
gcc/config/loongarch/predicates.md | 9 +-
gcc/doc/invoke.texi | 50 +---
.../gcc.target/loongarch/func-call-1.c | 2 +-
.../gcc.target/loongarch/func-call-2.c | 2 +-
.../gcc.target/loongarch/func-call-3.c | 2 +-
.../gcc.target/loongarch/func-call-4.c | 2 +-
.../gcc.target/loongarch/func-call-5.c | 2 +-
.../gcc.target/loongarch/func-call-6.c | 2 +-
.../gcc.target/loongarch/func-call-7.c | 2 +-
.../gcc.target/loongarch/func-call-8.c | 2 +-
.../loongarch/func-call-extreme-1.c | 32 +++
.../loongarch/func-call-extreme-2.c | 32 +++
.../loongarch/relocs-symbol-noaddend.c | 2 +-
16 files changed, 318 insertions(+), 82 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/func-call-extreme-1.c
create mode 100644 gcc/testsuite/gcc.target/loongarch/func-call-extreme-2.c
diff --git a/gcc/config/loongarch/loongarch-opts.cc b/gcc/config/loongarch/loongarch-opts.cc
index 3f70943de..2ae89f234 100644
--- a/gcc/config/loongarch/loongarch-opts.cc
+++ b/gcc/config/loongarch/loongarch-opts.cc
@@ -376,14 +376,13 @@ fallback:
/* 5. Target code model */
t.cmodel = constrained.cmodel ? opt_cmodel : CMODEL_NORMAL;
- if (t.cmodel != CMODEL_NORMAL)
+ if (t.cmodel != CMODEL_NORMAL && t.cmodel != CMODEL_EXTREME)
{
warning (0, "%qs is not supported, now cmodel is set to %qs",
loongarch_cmodel_strings[t.cmodel], "normal");
t.cmodel = CMODEL_NORMAL;
}
-
/* Cleanup and return. */
obstack_free (&msg_obstack, NULL);
*target = t;
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 76bf55ea4..1a33f668f 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -2436,7 +2436,19 @@ loongarch_call_tls_get_addr (rtx sym, enum loongarch_symbol_type type, rtx v0)
/* Split tls symbol to high and low. */
rtx high = gen_rtx_HIGH (Pmode, copy_rtx (loc));
high = loongarch_force_temporary (tmp, high);
- emit_insn (gen_tls_low (Pmode, a0, high, loc));
+
+ if (TARGET_CMODEL_EXTREME)
+ {
+ gcc_assert (TARGET_EXPLICIT_RELOCS);
+
+ rtx tmp1 = gen_reg_rtx (Pmode);
+ emit_insn (gen_tls_low (Pmode, tmp1, gen_rtx_REG (Pmode, 0), loc));
+ emit_insn (gen_lui_h_lo20 (tmp1, tmp1, loc));
+ emit_insn (gen_lui_h_hi12 (tmp1, tmp1, loc));
+ emit_move_insn (a0, gen_rtx_PLUS (Pmode, high, tmp1));
+ }
+ else
+ emit_insn (gen_tls_low (Pmode, a0, high, loc));
}
else
{
@@ -2449,14 +2461,44 @@ loongarch_call_tls_get_addr (rtx sym, enum loongarch_symbol_type type, rtx v0)
}
if (flag_plt)
- insn = emit_call_insn (gen_call_value_internal (v0, loongarch_tls_symbol,
+ insn = emit_call_insn (gen_call_value_internal (v0,
+ loongarch_tls_symbol,
const0_rtx));
else
{
rtx dest = gen_reg_rtx (Pmode);
- rtx high = gen_reg_rtx (Pmode);
- loongarch_emit_move (high, gen_rtx_HIGH (Pmode, loongarch_tls_symbol));
- emit_insn (gen_ld_from_got (Pmode, dest, high, loongarch_tls_symbol));
+
+ if (TARGET_CMODEL_EXTREME)
+ {
+ gcc_assert (TARGET_EXPLICIT_RELOCS);
+
+ rtx tmp1 = gen_reg_rtx (Pmode);
+ rtx high = gen_reg_rtx (Pmode);
+
+ loongarch_emit_move (high,
+ gen_rtx_HIGH (Pmode, loongarch_tls_symbol));
+ loongarch_emit_move (tmp1, gen_rtx_LO_SUM (Pmode,
+ gen_rtx_REG (Pmode, 0),
+ loongarch_tls_symbol));
+ emit_insn (gen_lui_h_lo20 (tmp1, tmp1, loongarch_tls_symbol));
+ emit_insn (gen_lui_h_hi12 (tmp1, tmp1, loongarch_tls_symbol));
+ loongarch_emit_move (dest,
+ gen_rtx_MEM (Pmode,
+ gen_rtx_PLUS (Pmode, high, tmp1)));
+ }
+ else
+ {
+ if (TARGET_EXPLICIT_RELOCS)
+ {
+ rtx high = gen_reg_rtx (Pmode);
+ loongarch_emit_move (high,
+ gen_rtx_HIGH (Pmode, loongarch_tls_symbol));
+ emit_insn (gen_ld_from_got (Pmode, dest, high,
+ loongarch_tls_symbol));
+ }
+ else
+ loongarch_emit_move (dest, loongarch_tls_symbol);
+ }
insn = emit_call_insn (gen_call_value_internal (v0, dest, const0_rtx));
}
@@ -2508,7 +2550,23 @@ loongarch_legitimize_tls_address (rtx loc)
tmp3 = gen_reg_rtx (Pmode);
rtx high = gen_rtx_HIGH (Pmode, copy_rtx (tmp2));
high = loongarch_force_temporary (tmp3, high);
- emit_insn (gen_ld_from_got (Pmode, tmp1, high, tmp2));
+
+ if (TARGET_CMODEL_EXTREME)
+ {
+ gcc_assert (TARGET_EXPLICIT_RELOCS);
+
+ rtx tmp3 = gen_reg_rtx (Pmode);
+ emit_insn (gen_tls_low (Pmode, tmp3,
+ gen_rtx_REG (Pmode, 0), tmp2));
+ emit_insn (gen_lui_h_lo20 (tmp3, tmp3, tmp2));
+ emit_insn (gen_lui_h_hi12 (tmp3, tmp3, tmp2));
+ emit_move_insn (tmp1,
+ gen_rtx_MEM (Pmode,
+ gen_rtx_PLUS (Pmode,
+ high, tmp3)));
+ }
+ else
+ emit_insn (gen_ld_from_got (Pmode, tmp1, high, tmp2));
}
else
emit_insn (loongarch_got_load_tls_ie (tmp1, loc));
@@ -2530,11 +2588,18 @@ loongarch_legitimize_tls_address (rtx loc)
rtx high = gen_rtx_HIGH (Pmode, copy_rtx (tmp2));
high = loongarch_force_temporary (tmp3, high);
emit_insn (gen_ori_l_lo12 (Pmode, tmp1, high, tmp2));
+
+ if (TARGET_CMODEL_EXTREME)
+ {
+ gcc_assert (TARGET_EXPLICIT_RELOCS);
+
+ emit_insn (gen_lui_h_lo20 (tmp1, tmp1, tmp2));
+ emit_insn (gen_lui_h_hi12 (tmp1, tmp1, tmp2));
+ }
}
else
emit_insn (loongarch_got_load_tls_le (tmp1, loc));
emit_insn (gen_add3_insn (dest, tmp1, tp));
-
}
break;
@@ -2603,7 +2668,6 @@ bool
loongarch_split_symbol (rtx temp, rtx addr, machine_mode mode, rtx *low_out)
{
enum loongarch_symbol_type symbol_type;
- rtx high;
/* If build with '-mno-explicit-relocs', don't split symbol. */
if (!TARGET_EXPLICIT_RELOCS)
@@ -2615,6 +2679,8 @@ loongarch_split_symbol (rtx temp, rtx addr, machine_mode mode, rtx *low_out)
|| !loongarch_split_symbol_type (symbol_type))
return false;
+ rtx high, temp1 = NULL;
+
if (temp == NULL)
temp = gen_reg_rtx (Pmode);
@@ -2622,20 +2688,42 @@ loongarch_split_symbol (rtx temp, rtx addr, machine_mode mode, rtx *low_out)
high = gen_rtx_HIGH (Pmode, copy_rtx (addr));
high = loongarch_force_temporary (temp, high);
+ if (TARGET_CMODEL_EXTREME && can_create_pseudo_p ())
+ {
+ gcc_assert (TARGET_EXPLICIT_RELOCS);
+
+ temp1 = gen_reg_rtx (Pmode);
+ emit_move_insn (temp1, gen_rtx_LO_SUM (Pmode, gen_rtx_REG (Pmode, 0),
+ addr));
+ emit_insn (gen_lui_h_lo20 (temp1, temp1, addr));
+ emit_insn (gen_lui_h_hi12 (temp1, temp1, addr));
+ }
+
if (low_out)
switch (symbol_type)
{
case SYMBOL_PCREL:
- *low_out = gen_rtx_LO_SUM (Pmode, high, addr);
- break;
+ {
+ if (TARGET_CMODEL_EXTREME && can_create_pseudo_p ())
+ *low_out = gen_rtx_PLUS (Pmode, high, temp1);
+ else
+ *low_out = gen_rtx_LO_SUM (Pmode, high, addr);
+ break;
+ }
case SYMBOL_GOT_DISP:
/* SYMBOL_GOT_DISP symbols are loaded from the GOT. */
{
- rtx low = gen_rtx_LO_SUM (Pmode, high, addr);
- rtx mem = gen_rtx_MEM (Pmode, low);
- *low_out = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, mem),
- UNSPEC_LOAD_FROM_GOT);
+ if (TARGET_CMODEL_EXTREME && can_create_pseudo_p ())
+ *low_out = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, high, temp1));
+ else
+ {
+ rtx low = gen_rtx_LO_SUM (Pmode, high, addr);
+ rtx mem = gen_rtx_MEM (Pmode, low);
+ *low_out = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, mem),
+ UNSPEC_LOAD_FROM_GOT);
+ }
+
break;
}
@@ -4584,34 +4672,86 @@ loongarch_memmodel_needs_release_fence (enum memmodel model)
in context CONTEXT. HI_RELOC indicates a high-part reloc. */
static void
-loongarch_print_operand_reloc (FILE *file, rtx op, bool hi_reloc)
+loongarch_print_operand_reloc (FILE *file, rtx op, bool hi64_part,
+ bool hi_reloc)
{
const char *reloc;
+ if (TARGET_CMODEL_EXTREME)
+ gcc_assert (TARGET_EXPLICIT_RELOCS);
+
switch (loongarch_classify_symbolic_expression (op))
{
case SYMBOL_PCREL:
- reloc = hi_reloc ? "%pc_hi20" : "%pc_lo12";
+ if (hi64_part)
+ {
+ if (TARGET_CMODEL_EXTREME)
+ reloc = hi_reloc ? "%pc64_hi12" : "%pc64_lo20";
+ else
+ gcc_unreachable ();
+ }
+ else
+ reloc = hi_reloc ? "%pc_hi20" : "%pc_lo12";
break;
case SYMBOL_GOT_DISP:
- reloc = hi_reloc ? "%got_pc_hi20" : "%got_pc_lo12";
+ if (hi64_part)
+ {
+ if (TARGET_CMODEL_EXTREME)
+ reloc = hi_reloc ? "%got64_pc_hi12" : "%got64_pc_lo20";
+ else
+ gcc_unreachable ();
+ }
+ else
+ reloc = hi_reloc ? "%got_pc_hi20" : "%got_pc_lo12";
break;
case SYMBOL_TLS_IE:
- reloc = hi_reloc ? "%ie_pc_hi20" : "%ie_pc_lo12";
+ if (hi64_part)
+ {
+ if (TARGET_CMODEL_EXTREME)
+ reloc = hi_reloc ? "%ie64_pc_hi12" : "%ie64_pc_lo20";
+ else
+ gcc_unreachable ();
+ }
+ else
+ reloc = hi_reloc ? "%ie_pc_hi20" : "%ie_pc_lo12";
break;
case SYMBOL_TLS_LE:
- reloc = hi_reloc ? "%le_hi20" : "%le_lo12";
+ if (hi64_part)
+ {
+ if (TARGET_CMODEL_EXTREME)
+ reloc = hi_reloc ? "%le64_hi12" : "%le64_lo20";
+ else
+ gcc_unreachable ();
+ }
+ else
+ reloc = hi_reloc ? "%le_hi20" : "%le_lo12";
break;
case SYMBOL_TLSGD:
- reloc = hi_reloc ? "%gd_pc_hi20" : "%got_pc_lo12";
+ if (hi64_part)
+ {
+ if (TARGET_CMODEL_EXTREME)
+ reloc = hi_reloc ? "%got64_pc_hi12" : "%got64_pc_lo20";
+ else
+ gcc_unreachable ();
+ }
+ else
+ reloc = hi_reloc ? "%gd_pc_hi20" : "%got_pc_lo12";
break;
case SYMBOL_TLSLDM:
- reloc = hi_reloc ? "%ld_pc_hi20" : "%got_pc_lo12";
+ if (hi64_part)
+ {
+ if (TARGET_CMODEL_EXTREME)
+ reloc = hi_reloc ? "%got64_pc_hi12" : "%got64_pc_lo20";
+ else
+ gcc_unreachable ();
+ }
+ else
+ reloc = hi_reloc ? "%ld_pc_hi20" : "%got_pc_lo12";
break;
default:
@@ -4637,6 +4777,8 @@ loongarch_print_operand_reloc (FILE *file, rtx op, bool hi_reloc)
'L' Print the low-part relocation associated with OP.
'm' Print one less than CONST_INT OP in decimal.
'N' Print the inverse of the integer branch condition for comparison OP.
+ 'r' Print address 12-31bit relocation associated with OP.
+ 'R' Print address 32-51bit relocation associated with OP.
'T' Print 'f' for (eq:CC ...), 't' for (ne:CC ...),
'z' for (eq:?I ...), 'n' for (ne:?I ...).
't' Like 'T', but with the EQ/NE cases reversed
@@ -4694,7 +4836,13 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
case 'h':
if (code == HIGH)
op = XEXP (op, 0);
- loongarch_print_operand_reloc (file, op, true /* hi_reloc */);
+ loongarch_print_operand_reloc (file, op, false /* hi64_part */,
+ true /* hi_reloc */);
+ break;
+
+ case 'H':
+ loongarch_print_operand_reloc (file, op, true /* hi64_part */,
+ true /* hi_reloc */);
break;
case 'i':
@@ -4703,7 +4851,8 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
break;
case 'L':
- loongarch_print_operand_reloc (file, op, false /* lo_reloc */);
+ loongarch_print_operand_reloc (file, op, false /* hi64_part*/,
+ false /* lo_reloc */);
break;
case 'm':
@@ -4718,6 +4867,16 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
letter);
break;
+ case 'r':
+ loongarch_print_operand_reloc (file, op, false /* hi64_part */,
+ true /* lo_reloc */);
+ break;
+
+ case 'R':
+ loongarch_print_operand_reloc (file, op, true /* hi64_part */,
+ false /* lo_reloc */);
+ break;
+
case 't':
case 'T':
{
@@ -4848,7 +5007,8 @@ loongarch_print_operand_address (FILE *file, machine_mode /* mode */, rtx x)
case ADDRESS_LO_SUM:
fprintf (file, "%s,", reg_names[REGNO (addr.reg)]);
- loongarch_print_operand_reloc (file, addr.offset, false /* hi_reloc */);
+ loongarch_print_operand_reloc (file, addr.offset, false /* hi64_part */,
+ false /* hi_reloc */);
return;
case ADDRESS_CONST_INT:
@@ -5821,13 +5981,21 @@ loongarch_option_override_internal (struct gcc_options *opts)
switch (la_target.cmodel)
{
- case CMODEL_TINY_STATIC:
case CMODEL_EXTREME:
+ if (!TARGET_EXPLICIT_RELOCS)
+ error ("code model %qs needs %s",
+ "extreme", "-mexplicit-relocs");
+
if (opts->x_flag_plt)
- error ("code model %qs and %qs not support %s mode",
- "tiny-static", "extreme", "plt");
+ {
+ if (global_options_set.x_flag_plt)
+ error ("code model %qs is not compatible with %s",
+ "extreme", "-fplt");
+ opts->x_flag_plt = 0;
+ }
break;
+ case CMODEL_TINY_STATIC:
case CMODEL_NORMAL:
case CMODEL_TINY:
case CMODEL_LARGE:
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 8e8868de9..8fc10444c 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -60,6 +60,9 @@
UNSPEC_LOAD_FROM_GOT
UNSPEC_ORI_L_LO12
+ UNSPEC_LUI_L_HI20
+ UNSPEC_LUI_H_LO20
+ UNSPEC_LUI_H_HI12
UNSPEC_TLS_LOW
])
@@ -1934,16 +1937,45 @@
[(set_attr "type" "move")]
)
+(define_insn "@lui_l_hi20<mode>"
+ [(set (match_operand:P 0 "register_operand" "=r")
+ (unspec:P [(match_operand:P 1 "symbolic_operand")]
+ UNSPEC_LUI_L_HI20))]
+ ""
+ "lu12i.w\t%0,%r1"
+ [(set_attr "type" "move")]
+)
+
(define_insn "@ori_l_lo12<mode>"
[(set (match_operand:P 0 "register_operand" "=r")
(unspec:P [(match_operand:P 1 "register_operand" "r")
- (match_operand:P 2 "symbolic_operand")]
+ (match_operand:P 2 "symbolic_operand")]
UNSPEC_ORI_L_LO12))]
""
"ori\t%0,%1,%L2"
[(set_attr "type" "move")]
)
+(define_insn "lui_h_lo20"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "register_operand" "0")
+ (match_operand:DI 2 "symbolic_operand")]
+ UNSPEC_LUI_H_LO20))]
+ "TARGET_64BIT"
+ "lu32i.d\t%0,%R2"
+ [(set_attr "type" "move")]
+)
+
+(define_insn "lui_h_hi12"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "register_operand" "r")
+ (match_operand:DI 2 "symbolic_operand")]
+ UNSPEC_LUI_H_HI12))]
+ "TARGET_64BIT"
+ "lu52i.d\t%0,%1,%H2"
+ [(set_attr "type" "move")]
+)
+
;; Convert floating-point numbers to integers
(define_insn "frint_<fmt>"
[(set (match_operand:ANYF 0 "register_operand" "=f")
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index cd3528c7c..e38c6fbdd 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -111,7 +111,7 @@
(match_code "const,symbol_ref,label_ref")
{
/* Split symbol to high and low if return false.
- If defined TARGET_CMODEL_LARGE, all symbol would be splited,
+ If TARGET_CMODEL_EXTREME is defined, all symbols would be split,
else if the offset is not zero, the symbol would be split. */
enum loongarch_symbol_type symbol_type;
@@ -126,10 +126,13 @@
switch (symbol_type)
{
case SYMBOL_PCREL:
- return 1;
+ if (TARGET_CMODEL_EXTREME)
+ return false;
+ else
+ return 1;
case SYMBOL_GOT_DISP:
- if (TARGET_CMODEL_LARGE || !flag_plt)
+ if (TARGET_CMODEL_EXTREME || !flag_plt)
return false;
else
return 1;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1de2b2bd4..c4f83e62a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1006,6 +1006,7 @@ Objective-C and Objective-C++ Dialects}.
-mcond-move-float -mno-cond-move-float @gol
-memcpy -mno-memcpy -mstrict-align -mno-strict-align @gol
-mmax-inline-memcpy-size=@var{n} @gol
+-mexplicit-relocs -mno-explicit-relocs @gol
-mcmodel=@var{code-model}}
@emph{M32R/D Options}
@@ -24617,50 +24618,19 @@ less than or equal to @var{n} bytes. The default value of @var{n} is 1024.
@item -mcmodel=@var{code-model}
Set the code model to one of:
@table @samp
-@item tiny-static
-@itemize @bullet
-@item
-local symbol and global strong symbol: The data section must be within +/-2MiB addressing space.
-The text section must be within +/-128MiB addressing space.
-@item
-global weak symbol: The got table must be within +/-2GiB addressing space.
-@end itemize
-
-@item tiny
-@itemize @bullet
-@item
-local symbol: The data section must be within +/-2MiB addressing space.
-The text section must be within +/-128MiB
-addressing space.
-@item
-global symbol: The got table must be within +/-2GiB addressing space.
-@end itemize
+@item tiny-static (Not implemented yet)
+@item tiny (Not implemented yet)
@item normal
-@itemize @bullet
-@item
-local symbol: The data section must be within +/-2GiB addressing space.
-The text section must be within +/-128MiB addressing space.
-@item
-global symbol: The got table must be within +/-2GiB addressing space.
-@end itemize
+The text segment must be within 128MB addressing space. The data segment must
+be within 2GB addressing space.
-@item large
-@itemize @bullet
-@item
-local symbol: The data section must be within +/-2GiB addressing space.
-The text section must be within +/-128GiB addressing space.
-@item
-global symbol: The got table must be within +/-2GiB addressing space.
-@end itemize
+@item large (Not implemented yet)
-@item extreme(Not implemented yet)
-@itemize @bullet
-@item
-local symbol: The data and text section must be within +/-8EiB addressing space.
-@item
-global symbol: The data got table must be within +/-8EiB addressing space.
-@end itemize
+@item extreme
+This mode does not limit the size of the code segment or the data segment.
+The @option{-mcmodel=extreme} option is incompatible with @option{-fplt} and
+@option{-mno-explicit-relocs}.
@end table
The default code model is @code{normal}.
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-1.c b/gcc/testsuite/gcc.target/loongarch/func-call-1.c
index 01b8ea23f..76bf11b0c 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-1.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fpic -fplt -mno-explicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fpic -fplt -mno-explicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*bl\t%plt\\(g\\)\n" } } */
/* { dg-final { scan-assembler "test1:.*bl\t%plt\\(f\\)\n" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-2.c b/gcc/testsuite/gcc.target/loongarch/func-call-2.c
index 4565baaec..4b468fef8 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-2.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-2.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fno-pic -fplt -mno-explicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fno-pic -fplt -mno-explicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*bl\t%plt\\(g\\)\n" } } */
/* { dg-final { scan-assembler "test1:.*bl\tf\n" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-3.c b/gcc/testsuite/gcc.target/loongarch/func-call-3.c
index 4f669a029..dd3a4882d 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-3.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-3.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fpic -fno-plt -mno-explicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fpic -fno-plt -mno-explicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*la\.global\t.*g\n\tjirl" } } */
/* { dg-final { scan-assembler "test1:.*la\.global\t.*f\n\tjirl" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-4.c b/gcc/testsuite/gcc.target/loongarch/func-call-4.c
index 943adb640..f8158ec34 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-4.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-4.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fno-pic -fno-plt -mno-explicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fno-pic -fno-plt -mno-explicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*la\.global\t.*g\n\tjirl" } } */
/* { dg-final { scan-assembler "test1:.*bl\tf\n" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-5.c b/gcc/testsuite/gcc.target/loongarch/func-call-5.c
index 2c2a1c8a1..37994af43 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-5.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-5.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fpic -fplt -mexplicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fpic -fplt -mexplicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*bl\t%plt\\(g\\)\n" } } */
/* { dg-final { scan-assembler "test1:.*bl\t%plt\\(f\\)\n" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-6.c b/gcc/testsuite/gcc.target/loongarch/func-call-6.c
index 4b0e4266e..8e366e376 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-6.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-6.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fno-pic -fplt -mexplicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fno-pic -fplt -mexplicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*bl\t%plt\\(g\\)\n" } } */
/* { dg-final { scan-assembler "test1:.*bl\tf\n" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-7.c b/gcc/testsuite/gcc.target/loongarch/func-call-7.c
index 51792711f..4177c3d96 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-7.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-7.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fpic -fno-plt -mexplicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fpic -fno-plt -mexplicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*pcalau12i\t.*%got_pc_hi20\\(g\\)\n\tld\.d\t.*%got_pc_lo12\\(g\\)\n\tjirl" } } */
/* { dg-final { scan-assembler "test1:.*pcalau12i\t.*%got_pc_hi20\\(f\\)\n\tld\.d\t.*%got_pc_lo12\\(f\\)\n\tjirl" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-8.c b/gcc/testsuite/gcc.target/loongarch/func-call-8.c
index 330140d88..4254eaa16 100644
--- a/gcc/testsuite/gcc.target/loongarch/func-call-8.c
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-8.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -O0 -fno-pic -fno-plt -mexplicit-relocs" } */
+/* { dg-options "-mabi=lp64d -O0 -fno-pic -fno-plt -mexplicit-relocs -mcmodel=normal" } */
/* { dg-final { scan-assembler "test:.*pcalau12i\t.*%got_pc_hi20\\(g\\)\n\tld\.d\t.*%got_pc_lo12\\(g\\)\n\tjirl" } } */
/* { dg-final { scan-assembler "test1:.*bl\tf\n" } } */
/* { dg-final { scan-assembler "test2:.*bl\tl\n" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-extreme-1.c b/gcc/testsuite/gcc.target/loongarch/func-call-extreme-1.c
new file mode 100644
index 000000000..db1e0f853
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-extreme-1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O0 -fno-pic -fno-plt -mexplicit-relocs -mcmodel=extreme" } */
+/* { dg-final { scan-assembler "test:.*pcalau12i.*%got_pc_hi20.*\n\taddi\.d.*%got_pc_lo12.*\n\tlu32i\.d.*%got64_pc_lo20.*\n\tlu52i\.d.*%got64_pc_hi12.*\n\tldx\.d" } } */
+/* { dg-final { scan-assembler "test1:.*pcalau12i.*%pc_hi20.*\n\taddi\.d.*%pc_lo12.*\n\tlu32i\.d.*%pc64_lo20.*\n\tlu52i\.d.*pc64_hi12.*\n\tadd\.d" } } */
+/* { dg-final { scan-assembler "test2:.*pcalau12i.*%pc_hi20.*\n\taddi\.d.*%pc_lo12.*\n\tlu32i\.d.*%pc64_lo20.*\n\tlu52i\.d.*pc64_hi12.*\n\tadd\.d" } } */
+
+extern void g (void);
+void
+f (void)
+{}
+
+static void
+l (void)
+{}
+
+void
+test (void)
+{
+ g ();
+}
+
+void
+test1 (void)
+{
+ f ();
+}
+
+void
+test2 (void)
+{
+ l ();
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/func-call-extreme-2.c b/gcc/testsuite/gcc.target/loongarch/func-call-extreme-2.c
new file mode 100644
index 000000000..21bf81ae8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/func-call-extreme-2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O0 -fpic -fno-plt -mexplicit-relocs -mcmodel=extreme" } */
+/* { dg-final { scan-assembler "test:.*pcalau12i.*%got_pc_hi20.*\n\taddi\.d.*%got_pc_lo12.*\n\tlu32i\.d.*%got64_pc_lo20.*\n\tlu52i\.d.*%got64_pc_hi12.*\n\tldx\.d" } } */
+/* { dg-final { scan-assembler "test1:.*pcalau12i.*%got_pc_hi20.*\n\taddi\.d.*%got_pc_lo12.*\n\tlu32i\.d.*%got64_pc_lo20.*\n\tlu52i\.d.*%got64_pc_hi12.*\n\tldx\.d" } } */
+/* { dg-final { scan-assembler "test2:.*pcalau12i.*%pc_hi20.*\n\taddi\.d.*%pc_lo12.*\n\tlu32i\.d.*%pc64_lo20.*\n\tlu52i\.d.*pc64_hi12.*\n\tadd\.d" } } */
+
+extern void g (void);
+void
+f (void)
+{}
+
+static void
+l (void)
+{}
+
+void
+test (void)
+{
+ g ();
+}
+
+void
+test1 (void)
+{
+ f ();
+}
+
+void
+test2 (void)
+{
+ l ();
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/relocs-symbol-noaddend.c b/gcc/testsuite/gcc.target/loongarch/relocs-symbol-noaddend.c
index bfcc9bc33..3ec8bd229 100644
--- a/gcc/testsuite/gcc.target/loongarch/relocs-symbol-noaddend.c
+++ b/gcc/testsuite/gcc.target/loongarch/relocs-symbol-noaddend.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-mabi=lp64d -mexplicit-relocs -fno-pic -O2" } */
+/* { dg-options "-mabi=lp64d -mexplicit-relocs -fno-pic -O2 -mcmodel=normal" } */
/* { dg-final { scan-assembler "pcalau12i.*%pc_hi20\\(\.LANCHOR0\\)\n" } } */
/* { dg-final { scan-assembler "addi\.d.*%pc_lo12\\(\.LANCHOR0\\)\n" } } */
/* { dg-final { scan-assembler "ldptr.d\t\\\$r4,.*,0\n" } } */
--
2.33.0
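Taken together, the -mcmodel=normal pins and the two new extreme-model tests cover three call flavors: an external callee, a locally defined global, and a static function. A minimal C sketch follows (illustrative only, not part of the patch); the comments paraphrase the dg-final scan-assembler expectations above and are not authoritative ABI documentation.

/* Call flavors exercised by the func-call tests; comments paraphrase
   the dg-final patterns above.  */

extern void g (void);   /* External: with -fno-plt the call goes through the
                           GOT (pcalau12i %got_pc_hi20 + ld.d %got_pc_lo12 +
                           jirl under -mcmodel=normal); -mcmodel=extreme adds
                           lu32i.d %got64_pc_lo20 + lu52i.d %got64_pc_hi12 and
                           loads the entry with ldx.d.  With -fplt it is a
                           plain "bl %plt(g)".  */

void f (void) {}        /* Global but locally defined: a direct "bl f" under
                           -mcmodel=normal (modulo PIC/PLT flags); the full
                           %pc_hi20/%pc_lo12/%pc64_lo20/%pc64_hi12 + add.d
                           sequence plus jirl under -mcmodel=extreme.  */

static void l (void) {} /* Static: always a direct "bl l" under normal; the
                           same four-instruction PC-relative sequence as f
                           under extreme.  */

void
call_all (void)
{
  g ();
  f ();
  l ();
}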

@@ -0,0 +1,65 @@
From 2ef90d604d7bae207d5b2067b4ce38d04d4835be Mon Sep 17 00:00:00 2001
From: Xiaolong Chen <chenxiaolong@loongson.cn>
Date: Tue, 12 Sep 2023 16:00:48 +0800
Subject: [PATCH 110/124] LoongArch: Add tests for ASX xvldrepl/xvstelm
 instruction generation.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vector/lasx/lasx-xvldrepl.c: New test.
	* gcc.target/loongarch/vector/lasx/lasx-xvstelm.c: New test.

Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
.../loongarch/vector/lasx/lasx-xvldrepl.c | 16 ++++++++++++++++
.../loongarch/vector/lasx/lasx-xvstelm.c | 14 ++++++++++++++
2 files changed, 30 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvldrepl.c
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvstelm.c
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvldrepl.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvldrepl.c
new file mode 100644
index 000000000..105567951
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvldrepl.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler-times "xvldrepl.w" 2} } */
+
+#define N 258
+
+float a[N], b[N], c[N];
+
+void
+test ()
+{
+ for (int i = 0; i < 256; i++)
+ {
+ a[i] = c[0] * b[i] + c[1];
+ }
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvstelm.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvstelm.c
new file mode 100644
index 000000000..1a7b0e86f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvstelm.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler-times "xvstelm.w" 8} } */
+
+#define LEN 256
+
+float a[LEN], b[LEN], c[LEN];
+
+void
+test ()
+{
+ for (int i = 0; i < LEN; i += 2)
+ a[i] = b[i] + c[i];
+}
--
2.33.0
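Each test relies on one property: in lasx-xvldrepl.c the loop-invariant scalars c[0] and c[1] are each broadcast once (hence scan-assembler-times 2), and in lasx-xvstelm.c the stride-2 store pattern is emitted as per-element stores. Below is a scalar model of the two instructions, assuming 256-bit vectors of 32-bit floats; it is a sketch of the semantics, not GCC's implementation.

#include <stdio.h>

#define LANES 8 /* 256-bit LASX register = 8 x 32-bit lanes.  */

/* xvldrepl.w: load one word from memory, replicate it into every lane.  */
static void
xvldrepl_w_model (float dst[LANES], const float *src)
{
  for (int i = 0; i < LANES; i++)
    dst[i] = *src;
}

/* xvstelm.w: store a single selected lane back to memory.  */
static void
xvstelm_w_model (float *dst, const float src[LANES], int idx)
{
  *dst = src[idx];
}

int
main (void)
{
  float v[LANES], c = 3.0f, out;
  xvldrepl_w_model (v, &c);
  xvstelm_w_model (&out, v, 5);
  printf ("%g %g\n", v[0], out); /* prints "3 3" */
  return 0;
}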

@@ -0,0 +1,715 @@
From 243656b5b87a3125c2a885d11f022a79cca98b39 Mon Sep 17 00:00:00 2001
From: Xiaolong Chen <chenxiaolong@loongson.cn>
Date: Mon, 11 Sep 2023 10:07:24 +0800
Subject: [PATCH 082/124] LoongArch: Add tests for SX vector addition vsadd
 instructions.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vector/lsx/lsx-vsadd-1.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-vsadd-2.c: New test.

Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
.../loongarch/vector/lsx/lsx-vsadd-1.c | 335 +++++++++++++++++
.../loongarch/vector/lsx/lsx-vsadd-2.c | 345 ++++++++++++++++++
2 files changed, 680 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-1.c
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-2.c
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-1.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-1.c
new file mode 100644
index 000000000..1bc27c983
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-1.c
@@ -0,0 +1,335 @@
+/* { dg-do run } */
+/* { dg-options "-mlsx -w -fno-strict-aliasing" } */
+#include "../simd_correctness_check.h"
+#include <lsxintrin.h>
+
+int
+main ()
+{
+ __m128i __m128i_op0, __m128i_op1, __m128i_op2, __m128i_out, __m128i_result;
+ __m128 __m128_op0, __m128_op1, __m128_op2, __m128_out, __m128_result;
+ __m128d __m128d_op0, __m128d_op1, __m128d_op2, __m128d_out, __m128d_result;
+
+ int int_op0, int_op1, int_op2, int_out, int_result, i = 1, fail;
+ long int long_op0, long_op1, long_op2, lont_out, lont_result;
+ long int long_int_out, long_int_result;
+ unsigned int unsigned_int_out, unsigned_int_result;
+ unsigned long int unsigned_long_int_out, unsigned_long_int_result;
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x00000000ffffffff;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00000000ffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0x00000000ffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0x00000000ffffffff;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[0]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0xfefefefefefefefe;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffff3c992b2e;
+ *((unsigned long *)&__m128i_op1[0]) = 0xffffffffffff730f;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffff3c992b2e;
+ *((unsigned long *)&__m128i_result[0]) = 0xffffffffffff730f;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x00007fff00007fff;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x000000002bfd9461;
+ *((unsigned long *)&__m128i_result[1]) = 0x00007fff00007fff;
+ *((unsigned long *)&__m128i_result[0]) = 0x000000002bfd9461;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x00d3012acc56f9bb;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000001021;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x00d3012acc56f9bb;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000001021;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000001000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000001000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000001000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000001000;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x80808080806b000b;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x80808080806b000b;
+ __m128i_out = __lsx_vsadd_b (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op0[0]) = 0xffffffffff01ff01;
+ *((unsigned long *)&__m128i_op1[1]) = 0x3c600000ff800000;
+ *((unsigned long *)&__m128i_op1[0]) = 0xfffffffffffffffe;
+ *((unsigned long *)&__m128i_result[1]) = 0x3c5fffffff7fffff;
+ *((unsigned long *)&__m128i_result[0]) = 0xfffefffeff00feff;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00ff00ff00ff00ff;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x00ff00ff00ff00ff;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00000000ffffffff;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x00000000ffffffff;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x3ff0000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x40f3fa0000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x3ff0000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x40f3fa0000000000;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000008a0000008a;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000008900000009;
+ *((unsigned long *)&__m128i_op1[1]) = 0x63637687636316bb;
+ *((unsigned long *)&__m128i_op1[0]) = 0x6363636363636363;
+ *((unsigned long *)&__m128i_result[1]) = 0x6363771163631745;
+ *((unsigned long *)&__m128i_result[0]) = 0x636363ec6363636c;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000004;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000004;
+ __m128i_out = __lsx_vsadd_h (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000080000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000080000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000080000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000080000000;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xfffffffffefefe6a;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00000000c2bac2c2;
+ *((unsigned long *)&__m128i_op1[1]) = 0x00000001fffffffe;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x00000000fefefe68;
+ *((unsigned long *)&__m128i_result[0]) = 0x00000000c2bac2c2;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x027c027c000027c0;
+ *((unsigned long *)&__m128i_op1[1]) = 0x001ffff0003ffff0;
+ *((unsigned long *)&__m128i_op1[0]) = 0x000fffefffefffef;
+ *((unsigned long *)&__m128i_result[1]) = 0x001ffff0003ffff0;
+ *((unsigned long *)&__m128i_result[0]) = 0x028c026bfff027af;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0007000000040000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0003000000010000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0007000000040000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0003000000010000;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x3f8000003f800000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x3f8000003f800000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x3fffff0000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x3fffff0000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x7f7fff003f800000;
+ *((unsigned long *)&__m128i_result[0]) = 0x7f7fff003f800000;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000820202020;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00fe01fc0005fff4;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000003a24;
+ *((unsigned long *)&__m128i_op1[0]) = 0x003dbe88077c78c1;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000820205a44;
+ *((unsigned long *)&__m128i_result[0]) = 0x013bc084078278b5;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000001;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000140001;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000001;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000140001;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x67eb85afb2ebb000;
+ *((unsigned long *)&__m128i_op0[0]) = 0xc8847ef6ed3f2000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000100000001;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x67eb85b0b2ebb001;
+ *((unsigned long *)&__m128i_result[0]) = 0xc8847ef6ed3f2000;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xffffffff00000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0xffff000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000014eb54ab;
+ *((unsigned long *)&__m128i_op1[0]) = 0x14eb6a002a406a00;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffff14eb54ab;
+ *((unsigned long *)&__m128i_result[0]) = 0x14ea6a002a406a00;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000004;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000004;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0xce9035c49ffff570;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000004;
+ *((unsigned long *)&__m128i_result[0]) = 0xce9035c49ffff574;
+ __m128i_out = __lsx_vsadd_w (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000010;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000010;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x000000000000000d;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000400;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x000000000000040d;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000001300000013;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000001300000013;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000001300000013;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000001300000013;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000100000100;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000100000100;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000100000100;
+ *((unsigned long *)&__m128i_result[0]) = 0x00000001000000ff;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000300000001;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000100010001;
+ *((unsigned long *)&__m128i_op1[1]) = 0xfffffffffffffffa;
+ *((unsigned long *)&__m128i_op1[0]) = 0xfffffffffffffffa;
+ *((unsigned long *)&__m128i_result[1]) = 0x00000002fffffffb;
+ *((unsigned long *)&__m128i_result[0]) = 0x000000010000fffb;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vadd_d (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-2.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-2.c
new file mode 100644
index 000000000..67d189991
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vsadd-2.c
@@ -0,0 +1,345 @@
+/* { dg-do run } */
+/* { dg-options "-mlsx -w -fno-strict-aliasing" } */
+#include "../simd_correctness_check.h"
+#include <lsxintrin.h>
+
+int
+main ()
+{
+ __m128i __m128i_op0, __m128i_op1, __m128i_op2, __m128i_out, __m128i_result;
+ __m128 __m128_op0, __m128_op1, __m128_op2, __m128_out, __m128_result;
+ __m128d __m128d_op0, __m128d_op1, __m128d_op2, __m128d_out, __m128d_result;
+
+ int int_op0, int_op1, int_op2, int_out, int_result, i = 1, fail;
+ long int long_op0, long_op1, long_op2, lont_out, lont_result;
+ long int long_int_out, long_int_result;
+ unsigned int unsigned_int_out, unsigned_int_result;
+ unsigned long int unsigned_long_int_out, unsigned_long_int_result;
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x10f917d72d3d01e4;
+ *((unsigned long *)&__m128i_op1[0]) = 0x203e16d116de012b;
+ *((unsigned long *)&__m128i_result[1]) = 0x10f917d72d3d01e4;
+ *((unsigned long *)&__m128i_result[0]) = 0x203e16d116de012b;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xfffebd06fffe820c;
+ *((unsigned long *)&__m128i_op0[0]) = 0x7fff7ffe7fff3506;
+ *((unsigned long *)&__m128i_op1[1]) = 0xfffebd06fffe820c;
+ *((unsigned long *)&__m128i_op1[0]) = 0x7fff7ffe7fff3506;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffff0cffffff18;
+ *((unsigned long *)&__m128i_result[0]) = 0xfefffefffeff6a0c;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op0[0]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[1]) = 0x4f804f804f804f80;
+ *((unsigned long *)&__m128i_op1[0]) = 0x4f804f804f804f80;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0xffffffffffffffff;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xfffff60ca7104649;
+ *((unsigned long *)&__m128i_op0[0]) = 0xfffff790a15db63d;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000001;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000001;
+ *((unsigned long *)&__m128i_result[1]) = 0xfffff60ca710464a;
+ *((unsigned long *)&__m128i_result[0]) = 0xfffff790a15db63e;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xfffffffffffffffe;
+ *((unsigned long *)&__m128i_op0[0]) = 0xffffffffffffff46;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[0]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0xffffffffffffffff;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x00fe000100cf005f;
+ *((unsigned long *)&__m128i_op0[0]) = 0x7fff7fff7fff7fff;
+ *((unsigned long *)&__m128i_op1[1]) = 0x5f675e96e29a5a60;
+ *((unsigned long *)&__m128i_op1[0]) = 0x7fff7fff7fff7fff;
+ *((unsigned long *)&__m128i_result[1]) = 0x5fff5e97e2ff5abf;
+ *((unsigned long *)&__m128i_result[0]) = 0xfefffefffefffeff;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000001000100010;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0001000100010058;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0001001100110068;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x7fffffff7fffffff;
+ *((unsigned long *)&__m128i_op0[0]) = 0x7fffffff7fffffff;
+ *((unsigned long *)&__m128i_op1[1]) = 0x7fff010181010102;
+ *((unsigned long *)&__m128i_op1[0]) = 0x7fffffff81010102;
+ *((unsigned long *)&__m128i_result[1]) = 0xfeffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0xfeffffffffffffff;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000ebd20000714f;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00012c8a0000a58a;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffffb81a6f70;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000d48eaa1a2;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffb81ae0bf;
+ *((unsigned long *)&__m128i_result[0]) = 0x00012c9748eaffff;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0177fff0fffffff0;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00000000011ff8bc;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[0]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0xffffffffffffffff;
+ __m128i_out = __lsx_vsadd_bu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000200;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000200;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000200;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000200;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000001;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000001;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000d0000000d;
+ *((unsigned long *)&__m128i_op1[1]) = 0x8006000000040000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x8002000000000007;
+ *((unsigned long *)&__m128i_result[1]) = 0x8006000000040000;
+ *((unsigned long *)&__m128i_result[0]) = 0x8002000d00000014;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000014;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000014;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000000000000000;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ int_out = __lsx_vpickve2gr_h (__m128i_op0, 0x1);
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000600007fff;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00000008ffffa209;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000600007fff;
+ *((unsigned long *)&__m128i_result[0]) = 0x00000008ffffa209;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x636363633f3e47c1;
+ *((unsigned long *)&__m128i_op0[0]) = 0x41f8e080f1ef4eaa;
+ *((unsigned long *)&__m128i_op1[1]) = 0x00000807bf0a1f80;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00000800ecedee68;
+ *((unsigned long *)&__m128i_result[1]) = 0x63636b6afe486741;
+ *((unsigned long *)&__m128i_result[0]) = 0x41f8e880ffffffff;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000ebd20000714f;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00012c8a0000a58a;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000ebd20000714f;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00012c8a0000a58a;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000ffff0000e29e;
+ *((unsigned long *)&__m128i_result[0]) = 0x000259140000ffff;
+ __m128i_out = __lsx_vsadd_hu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0xfffffffeffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[0]) = 0xfffffffeffffffff;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0c03e17edd781b11;
+ *((unsigned long *)&__m128i_op0[0]) = 0x342caf9be55700b5;
+ *((unsigned long *)&__m128i_op1[1]) = 0x00040003ff83ff84;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00040003ff4dffca;
+ *((unsigned long *)&__m128i_result[1]) = 0x0c07e181ffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0x3430af9effffffff;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x00000000ffa8ff9f;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000ffffffabff99;
+ *((unsigned long *)&__m128i_op1[1]) = 0x000100000002007d;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0001000000020001;
+ *((unsigned long *)&__m128i_result[1]) = 0x00010000ffab001c;
+ *((unsigned long *)&__m128i_result[0]) = 0x0001ffffffadff9a;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0800080008000800;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0800080008000800;
+ *((unsigned long *)&__m128i_result[1]) = 0x0800080008000800;
+ *((unsigned long *)&__m128i_result[0]) = 0x0800080008000800;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000001;
+ *((unsigned long *)&__m128i_op0[0]) = 0x76f424887fffffff;
+ *((unsigned long *)&__m128i_op1[1]) = 0xc110000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0xc00d060000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0xc110000000000001;
+ *((unsigned long *)&__m128i_result[0]) = 0xffffffff7fffffff;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x000000000000002f;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000029;
+ *((unsigned long *)&__m128i_op1[1]) = 0xfbfbfb17fbfb38ea;
+ *((unsigned long *)&__m128i_op1[0]) = 0xfbfb47fbfbfb0404;
+ *((unsigned long *)&__m128i_result[1]) = 0xfbfbfb17fbfb3919;
+ *((unsigned long *)&__m128i_result[0]) = 0xfbfb47fbfbfb042d;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x8080808080808081;
+ *((unsigned long *)&__m128i_op1[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00000000ffffffff;
+ *((unsigned long *)&__m128i_result[1]) = 0xffffffffffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0x80808080ffffffff;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x00123fff00120012;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0012001200120012;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x000000000005003a;
+ *((unsigned long *)&__m128i_result[1]) = 0x00123fff00120012;
+ *((unsigned long *)&__m128i_result[0]) = 0x001200120017004c;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0xbfd10d0d7b6b6b73;
+ *((unsigned long *)&__m128i_op1[0]) = 0xc5c534920000c4ed;
+ *((unsigned long *)&__m128i_result[1]) = 0xbfd10d0d7b6b6b73;
+ *((unsigned long *)&__m128i_result[0]) = 0xc5c534920000c4ed;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x000aa822a79308f6;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00000000084d12ce;
+ *((unsigned long *)&__m128i_op1[1]) = 0x000aa822a79308f6;
+ *((unsigned long *)&__m128i_op1[0]) = 0x03aa558e1d37b5a1;
+ *((unsigned long *)&__m128i_result[1]) = 0x00155044ffffffff;
+ *((unsigned long *)&__m128i_result[0]) = 0x03aa558e2584c86f;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x021b7d24c9678a35;
+ *((unsigned long *)&__m128i_op0[0]) = 0x030298a6a1030a49;
+ *((unsigned long *)&__m128i_op1[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_result[1]) = 0x021b7d24c9678a35;
+ *((unsigned long *)&__m128i_result[0]) = 0x030298a6a1030a49;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x00007a8000000480;
+ *((unsigned long *)&__m128i_op0[0]) = 0x00000485000004cc;
+ *((unsigned long *)&__m128i_op1[1]) = 0x00007a8000000480;
+ *((unsigned long *)&__m128i_op1[0]) = 0x00000485000004cc;
+ *((unsigned long *)&__m128i_result[1]) = 0x0000f50000000900;
+ *((unsigned long *)&__m128i_result[0]) = 0x0000090a00000998;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ *((unsigned long *)&__m128i_op0[1]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op0[0]) = 0x0000000000000000;
+ *((unsigned long *)&__m128i_op1[1]) = 0x004eff6200d2ff76;
+ *((unsigned long *)&__m128i_op1[0]) = 0xff70002800be00a0;
+ *((unsigned long *)&__m128i_result[1]) = 0x004eff6200d2ff76;
+ *((unsigned long *)&__m128i_result[0]) = 0xff70002800be00a0;
+ __m128i_out = __lsx_vsadd_wu (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m128i_result, __m128i_out);
+
+ return 0;
+}
--
2.33.0
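The vsadd expectations above follow from lane-wise saturating addition: vsadd.b/h/w/d clamp a signed sum to the lane's signed range, and vsadd.bu/hu/wu/du clamp an unsigned sum to the lane's unsigned maximum (the trailing vadd_d cases are plain modular addition). For instance, in the second vsadd_b case each byte computes (-1) + (-1) = -2, giving 0xfe per byte with no saturation. A per-byte scalar model (a sketch, not GCC's implementation):

#include <stdint.h>
#include <stdio.h>

/* Models of one byte lane of vsadd.b (signed) and vsadd.bu (unsigned).  */
static int8_t
sadd_b (int8_t a, int8_t b)
{
  int s = a + b;
  if (s > INT8_MAX) return INT8_MAX;
  if (s < INT8_MIN) return INT8_MIN;
  return (int8_t) s;
}

static uint8_t
sadd_bu (uint8_t a, uint8_t b)
{
  unsigned s = a + b;
  return s > UINT8_MAX ? UINT8_MAX : (uint8_t) s;
}

int
main (void)
{
  /* Signed: 0xff + 0xff is (-1) + (-1) = -2 = 0xfe (second vsadd_b case).  */
  printf ("0x%02x\n", (uint8_t) sadd_b (-1, -1)); /* 0xfe */
  /* Unsigned: 0xff + 0xff saturates to 0xff (vsadd_bu cases).  */
  printf ("0x%02x\n", sadd_bu (0xff, 0xff));      /* 0xff */
  return 0;
}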

@@ -0,0 +1,37 @@
From f07b91862055533d779fbf76c12cb7c0ae75b53d Mon Sep 17 00:00:00 2001
From: Xiaolong Chen <chenxiaolong@loongson.cn>
Date: Mon, 11 Sep 2023 09:35:24 +0800
Subject: [PATCH 076/124] LoongArch: Add tests of -mstrict-align option.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/strict-align.c: New test.

Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
gcc/testsuite/gcc.target/loongarch/strict-align.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/strict-align.c
diff --git a/gcc/testsuite/gcc.target/loongarch/strict-align.c b/gcc/testsuite/gcc.target/loongarch/strict-align.c
new file mode 100644
index 000000000..040d84958
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/strict-align.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mstrict-align -mlasx" } */
+/* { dg-final { scan-assembler-not "vfadd.s" } } */
+
+void
+foo (float *restrict x, float *restrict y)
+{
+ x[0] = x[0] + y[0];
+ x[1] = x[1] + y[1];
+ x[2] = x[2] + y[2];
+ x[3] = x[3] + y[3];
+}
--
2.33.0
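strict-align.c checks the negative case: with only float * restrict parameters the compiler cannot prove vector alignment, so under -mstrict-align no unaligned vector add (vfadd.s) may be emitted. A hypothetical counterpart is sketched below; whether GCC actually vectorizes this exact variant is an assumption, not something the patch establishes.

/* Hypothetical counterpart, not part of the patch: with provably aligned
   globals, -mstrict-align no longer forbids vector accesses.  Whether GCC
   emits vfadd.s for this function is an unverified assumption.  */
float x[4] __attribute__ ((aligned (16)));
float y[4] __attribute__ ((aligned (16)));

void
foo_aligned (void)
{
  x[0] += y[0];
  x[1] += y[1];
  x[2] += y[2];
  x[3] += y[3];
}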

@@ -0,0 +1,131 @@
From aebd03c944312be767f03d129eeebc0c4cdf5b4a Mon Sep 17 00:00:00 2001
From: Xiaolong Chen <chenxiaolong@loongson.cn>
Date: Mon, 11 Sep 2023 09:36:35 +0800
Subject: [PATCH 077/124] LoongArch: Add testsuite framework for Loongson
 SX/ASX.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vector/loongarch-vector.exp: New test.
	* gcc.target/loongarch/vector/simd_correctness_check.h: New test.

Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
.../loongarch/vector/loongarch-vector.exp | 42 +++++++++++++++
.../loongarch/vector/simd_correctness_check.h | 54 +++++++++++++++++++
2 files changed, 96 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/loongarch-vector.exp
create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/loongarch-vector.exp b/gcc/testsuite/gcc.target/loongarch/vector/loongarch-vector.exp
new file mode 100644
index 000000000..2c37aa91d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/loongarch-vector.exp
@@ -0,0 +1,42 @@
+#Copyright(C) 2023 Free Software Foundation, Inc.
+
+#This program is free software; you can redistribute it and / or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation; either version 3 of the License, or
+#(at your option) any later version.
+#
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the
+#GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License
+#along with GCC; see the file COPYING3.If not see
+# <http: //www.gnu.org/licenses/>.
+
+#GCC testsuite that uses the `dg.exp' driver.
+
+#Exit immediately if this isn't a LoongArch target.
+if ![istarget loongarch*-*-*] then {
+ return
+}
+
+#Load support procs.
+load_lib gcc-dg.exp
+
+#If a testcase doesn't have special options, use these.
+global DEFAULT_CFLAGS
+if ![info exists DEFAULT_CFLAGS] then {
+ set DEFAULT_CFLAGS " "
+}
+
+#Initialize `dg'.
+dg-init
+
+#Main loop.
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/lsx/*.\[cS\]]] \
+ " -mlsx" $DEFAULT_CFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/lasx/*.\[cS\]]] \
+ " -mlasx" $DEFAULT_CFLAGS
+# All done.
+dg-finish
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h b/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
new file mode 100644
index 000000000..eb7fbd59c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define ASSERTEQ_64(line, ref, res) \
+ do \
+ { \
+ int fail = 0; \
+ for (size_t i = 0; i < sizeof (res) / sizeof (res[0]); ++i) \
+ { \
+ long *temp_ref = &ref[i], *temp_res = &res[i]; \
+ if (abs (*temp_ref - *temp_res) > 0) \
+ { \
+ printf (" error: %s at line %ld , expected " #ref \
+ "[%ld]:0x%lx, got: 0x%lx\n", \
+ __FILE__, line, i, *temp_ref, *temp_res); \
+ fail = 1; \
+ } \
+ } \
+ if (fail == 1) \
+ abort (); \
+ } \
+ while (0)
+
+#define ASSERTEQ_32(line, ref, res) \
+ do \
+ { \
+ int fail = 0; \
+ for (size_t i = 0; i < sizeof (res) / sizeof (res[0]); ++i) \
+ { \
+ int *temp_ref = &ref[i], *temp_res = &res[i]; \
+ if (abs (*temp_ref - *temp_res) > 0) \
+ { \
+ printf (" error: %s at line %ld , expected " #ref \
+ "[%ld]:0x%x, got: 0x%x\n", \
+ __FILE__, line, i, *temp_ref, *temp_res); \
+ fail = 1; \
+ } \
+ } \
+ if (fail == 1) \
+ abort (); \
+ } \
+ while (0)
+
+#define ASSERTEQ_int(line, ref, res) \
+ do \
+ { \
+ if (ref != res) \
+ { \
+ printf (" error: %s at line %ld , expected %d, got %d\n", __FILE__, \
+ line, ref, res); \
+ } \
+ } \
+ while (0)
--
2.33.0
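ASSERTEQ_64 walks a __m128i value as two long lanes and reports every mismatching lane before aborting (the abs (...) > 0 test is simply an inequality check on the lane); ASSERTEQ_32 does the same with int lanes, and ASSERTEQ_int compares plain scalars without aborting. A minimal usage sketch in the style of the run tests above, assuming an LSX-capable target, compilation with -mlsx -w, and the header on the include path:

#include <lsxintrin.h>
#include "simd_correctness_check.h"

int
main (void)
{
  __m128i out, expected;

  /* 1 + 1 per 64-bit lane, checked the same way the lsx run tests do.  */
  *((unsigned long *) &expected[1]) = 0x0000000000000002;
  *((unsigned long *) &expected[0]) = 0x0000000000000002;
  out = __lsx_vadd_d (__lsx_vrepli_d (1), __lsx_vrepli_d (1));
  ASSERTEQ_64 (__LINE__, expected, out);
  return 0;
}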

Some files were not shown because too many files have changed in this diff.