gcc/0050-LoongArch-Use-LSX-for-scalar-FP-rounding-with-explic.patch

From 05fafb78b301ce9a545e0dad896b19339f716eaf Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Mon, 20 Nov 2023 03:51:56 +0800
Subject: [PATCH 050/188] LoongArch: Use LSX for scalar FP rounding with
 explicit rounding mode

In the LoongArch FP base ISA there is only the frint.{s/d} instruction, which
reads the global rounding mode.  Utilize LSX for explicit rounding modes
even if the operand is scalar.  This may look like a waste of CPU power, but
it is still much faster than calling the library function.

gcc/ChangeLog:

	* config/loongarch/simd.md (LSX_SCALAR_FRINT): New int iterator.
	(VLSX_FOR_FMODE): New mode attribute.
	(<simd_frint_pattern><mode>2): New expander,
	expanding to vreplvei.{w/d} + vfrint{rp/rz/rm/rne}.{s/d}.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vect-frint-scalar.c: New test.
	* gcc.target/loongarch/vect-frint-scalar-no-inexact.c: New test.
---
 gcc/config/loongarch/simd.md                  | 28 ++++++++++++
 .../loongarch/vect-frint-scalar-no-inexact.c  | 23 ++++++++++
 .../gcc.target/loongarch/vect-frint-scalar.c  | 43 +++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vect-frint-scalar-no-inexact.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vect-frint-scalar.c

diff --git a/gcc/config/loongarch/simd.md b/gcc/config/loongarch/simd.md
index 4ecf7a55e..843b1a41f 100644
--- a/gcc/config/loongarch/simd.md
+++ b/gcc/config/loongarch/simd.md
@@ -169,6 +169,34 @@
 		     UNSPEC_SIMD_FRINTRZ))]
   "")

+;; Use LSX for scalar ceil/floor/trunc/roundeven when -mlsx is enabled and
+;; either -ffp-int-builtin-inexact or -fno-trapping-math is in effect.  The
+;; base FP instruction set lacks these operations.  Yes, we are wasting 50%
+;; or even 75% of the CPU horsepower, but it's still much faster than calling
+;; a libc function: on LA464 and LA664 there is a 3x ~ 5x speed up.
+;;
+;; Note that a vreplvei instruction is needed or we'll also operate on the
+;; junk in high bits of the vector register and produce random FP exceptions.
+
+(define_int_iterator LSX_SCALAR_FRINT
+  [UNSPEC_SIMD_FRINTRP
+   UNSPEC_SIMD_FRINTRZ
+   UNSPEC_SIMD_FRINTRM
+   UNSPEC_SIMD_FRINTRNE])
+
+(define_mode_attr VLSX_FOR_FMODE [(DF "V2DF") (SF "V4SF")])
+
+(define_expand "<simd_frint_pattern><mode>2"
+  [(set (match_dup 2)
+     (vec_duplicate:<VLSX_FOR_FMODE>
+       (match_operand:ANYF 1 "register_operand")))
+   (set (match_dup 2)
+	(unspec:<VLSX_FOR_FMODE> [(match_dup 2)] LSX_SCALAR_FRINT))
+   (set (match_operand:ANYF 0 "register_operand")
+	(vec_select:ANYF (match_dup 2) (parallel [(const_int 0)])))]
+  "ISA_HAS_LSX && (flag_fp_int_builtin_inexact || !flag_trapping_math)"
+  "operands[2] = gen_reg_rtx (<VLSX_FOR_FMODE>mode);")
+
 ;; <x>vftint.{/rp/rz/rm}
 (define_insn
   "<simd_isa>_<x>vftint<simd_frint_rounding>_<simdifmt_for_f>_<simdfmt>"
diff --git a/gcc/testsuite/gcc.target/loongarch/vect-frint-scalar-no-inexact.c b/gcc/testsuite/gcc.target/loongarch/vect-frint-scalar-no-inexact.c
new file mode 100644
index 000000000..002e3b92d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vect-frint-scalar-no-inexact.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlsx -fno-fp-int-builtin-inexact" } */
+
+#include "vect-frint-scalar.c"
+
+/* cannot use LSX for these with -fno-fp-int-builtin-inexact,
+   call library function.  */
+/* { dg-final { scan-assembler "\tb\t%plt\\(ceil\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(ceilf\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(floor\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(floorf\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(trunc\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(truncf\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(roundeven\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(roundevenf\\)" } } */
+
+/* nearbyint has not been allowed to raise FE_INEXACT for decades */
+/* { dg-final { scan-assembler "\tb\t%plt\\(nearbyint\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(nearbyintf\\)" } } */
+
+/* rint should just use basic FP operation */
+/* { dg-final { scan-assembler "\tfrint\.s" } } */
+/* { dg-final { scan-assembler "\tfrint\.d" } } */
diff --git a/gcc/testsuite/gcc.target/loongarch/vect-frint-scalar.c b/gcc/testsuite/gcc.target/loongarch/vect-frint-scalar.c
new file mode 100644
index 000000000..c7cb40be7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vect-frint-scalar.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlsx" } */
+
+#define test(func, suffix) \
+__typeof__ (1.##suffix) \
+_##func##suffix (__typeof__ (1.##suffix) x) \
+{ \
+  return __builtin_##func##suffix (x); \
+}
+
+test (ceil, f)
+test (ceil, )
+test (floor, f)
+test (floor, )
+test (trunc, f)
+test (trunc, )
+test (roundeven, f)
+test (roundeven, )
+test (nearbyint, f)
+test (nearbyint, )
+test (rint, f)
+test (rint, )
+
+/* { dg-final { scan-assembler "\tvfrintrp\.s" } } */
+/* { dg-final { scan-assembler "\tvfrintrm\.s" } } */
+/* { dg-final { scan-assembler "\tvfrintrz\.s" } } */
+/* { dg-final { scan-assembler "\tvfrintrne\.s" } } */
+/* { dg-final { scan-assembler "\tvfrintrp\.d" } } */
+/* { dg-final { scan-assembler "\tvfrintrm\.d" } } */
+/* { dg-final { scan-assembler "\tvfrintrz\.d" } } */
+/* { dg-final { scan-assembler "\tvfrintrne\.d" } } */
+
+/* must do vreplvei first */
+/* { dg-final { scan-assembler-times "\tvreplvei\.w\t\\\$vr0,\\\$vr0,0" 4 } } */
+/* { dg-final { scan-assembler-times "\tvreplvei\.d\t\\\$vr0,\\\$vr0,0" 4 } } */
+
+/* nearbyint has not been allowed to raise FE_INEXACT for decades */
+/* { dg-final { scan-assembler "\tb\t%plt\\(nearbyint\\)" } } */
+/* { dg-final { scan-assembler "\tb\t%plt\\(nearbyintf\\)" } } */
+
+/* rint should just use basic FP operation */
+/* { dg-final { scan-assembler "\tfrint\.s" } } */
+/* { dg-final { scan-assembler "\tfrint\.d" } } */
--
2.43.0