gcc/0062-LoongArch-New-options-mrecip-and-mrecip-with-ffast-m.patch

From faac4efbee23e60691fc086a78284225ecf824a8 Mon Sep 17 00:00:00 2001
From: Jiahao Xu <xujiahao@loongson.cn>
Date: Wed, 6 Dec 2023 15:04:52 +0800
Subject: [PATCH 062/188] LoongArch: New options -mrecip and -mrecip= with
 ffast-math.

When both the -mrecip and -mfrecipe options are enabled, use approximate reciprocal
instructions and approximate reciprocal square root instructions with additional
Newton-Raphson steps to implement single precision floating-point division, square
root and reciprocal square root operations, for a better performance.

gcc/ChangeLog:

	* config/loongarch/genopts/loongarch.opt.in (recip_mask): New variable.
	(-mrecip, -mrecip=): New options.
	* config/loongarch/lasx.md (div<mode>3): New expander.
	(*div<mode>3): Rename.
	(sqrt<mode>2): New expander.
	(*sqrt<mode>2): Rename.
	(rsqrt<mode>2): New expander.
	* config/loongarch/loongarch-protos.h (loongarch_emit_swrsqrtsf): New prototype.
	(loongarch_emit_swdivsf): Ditto.
	* config/loongarch/loongarch.cc (loongarch_option_override_internal): Set
	recip_mask for -mrecip and -mrecip= options.
	(loongarch_emit_swrsqrtsf): New function.
	(loongarch_emit_swdivsf): Ditto.
	* config/loongarch/loongarch.h (RECIP_MASK_NONE, RECIP_MASK_DIV, RECIP_MASK_SQRT,
	RECIP_MASK_RSQRT, RECIP_MASK_VEC_DIV, RECIP_MASK_VEC_SQRT, RECIP_MASK_VEC_RSQRT,
	RECIP_MASK_ALL): New bitmasks.
	(TARGET_RECIP_DIV, TARGET_RECIP_SQRT, TARGET_RECIP_RSQRT, TARGET_RECIP_VEC_DIV,
	TARGET_RECIP_VEC_SQRT, TARGET_RECIP_VEC_RSQRT): New tests.
	* config/loongarch/loongarch.md (sqrt<mode>2): New expander.
	(*sqrt<mode>2): Rename.
	(rsqrt<mode>2): New expander.
	* config/loongarch/loongarch.opt (recip_mask): New variable.
	(-mrecip, -mrecip=): New options.
	* config/loongarch/lsx.md (div<mode>3): New expander.
	(*div<mode>3): Rename.
	(sqrt<mode>2): New expander.
	(*sqrt<mode>2): Rename.
	(rsqrt<mode>2): New expander.
	* config/loongarch/predicates.md (reg_or_vecotr_1_operand): New predicate.
	* doc/invoke.texi (LoongArch Options): Document new options.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/divf.c: New test.
	* gcc.target/loongarch/recip-divf.c: New test.
	* gcc.target/loongarch/recip-sqrtf.c: New test.
	* gcc.target/loongarch/sqrtf.c: New test.
	* gcc.target/loongarch/vector/lasx/lasx-divf.c: New test.
	* gcc.target/loongarch/vector/lasx/lasx-recip-divf.c: New test.
	* gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c: New test.
	* gcc.target/loongarch/vector/lasx/lasx-recip.c: New test.
	* gcc.target/loongarch/vector/lasx/lasx-sqrtf.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-divf.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-recip-divf.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-recip.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-sqrtf.c: New test.
---
 gcc/config/loongarch/genopts/loongarch.opt.in |  11 +
 gcc/config/loongarch/lasx.md                  |  53 ++++-
 gcc/config/loongarch/loongarch-protos.h       |   2 +
 gcc/config/loongarch/loongarch.cc             | 188 ++++++++++++++++++
 gcc/config/loongarch/loongarch.h              |  18 ++
 gcc/config/loongarch/loongarch.md             |  49 ++++-
 gcc/config/loongarch/loongarch.opt            |  11 +
 gcc/config/loongarch/lsx.md                   |  53 ++++-
 gcc/config/loongarch/predicates.md            |   4 +
 gcc/doc/invoke.texi                           |  55 ++++-
 gcc/testsuite/gcc.target/loongarch/divf.c     |  10 +
 .../gcc.target/loongarch/recip-divf.c         |   9 +
 .../gcc.target/loongarch/recip-sqrtf.c        |  23 +++
 gcc/testsuite/gcc.target/loongarch/sqrtf.c    |  24 +++
 .../loongarch/vector/lasx/lasx-divf.c         |  13 ++
 .../loongarch/vector/lasx/lasx-recip-divf.c   |  12 ++
 .../loongarch/vector/lasx/lasx-recip-sqrtf.c  |  28 +++
 .../loongarch/vector/lasx/lasx-recip.c        |  24 +++
 .../loongarch/vector/lasx/lasx-sqrtf.c        |  29 +++
 .../loongarch/vector/lsx/lsx-divf.c           |  13 ++
 .../loongarch/vector/lsx/lsx-recip-divf.c     |  12 ++
 .../loongarch/vector/lsx/lsx-recip-sqrtf.c    |  28 +++
 .../loongarch/vector/lsx/lsx-recip.c          |  24 +++
 .../loongarch/vector/lsx/lsx-sqrtf.c          |  29 +++
 24 files changed, 711 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/recip-divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/sqrtf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-sqrtf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-sqrtf.c

diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in
index cd5e75e4f..102202b03 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -23,6 +23,9 @@ config/loongarch/loongarch-opts.h
 HeaderInclude
 config/loongarch/loongarch-str.h

+TargetVariable
+unsigned int recip_mask = 0
+
 ; ISA related options
 ;; Base ISA
 Enum
@@ -194,6 +197,14 @@ mexplicit-relocs
 Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
 Use %reloc() assembly operators (for backward compatibility).

+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
 ; The code model option names for -mcmodel.
 Enum
 Name(cmodel) Type(int)
diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index ad49a3ffb..eeac8cd98 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -1194,7 +1194,25 @@
   [(set_attr "type" "simd_fmul")
    (set_attr "mode" "<MODE>")])

-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+  [(set (match_operand:FLASX 0 "register_operand")
+    (div:FLASX (match_operand:FLASX 1 "reg_or_vecotr_1_operand")
+	       (match_operand:FLASX 2 "register_operand")))]
+  "ISA_HAS_LASX"
+{
+  if (<MODE>mode == V8SFmode
+    && TARGET_RECIP_VEC_DIV
+    && optimize_insn_for_speed_p ()
+    && flag_finite_math_only && !flag_trapping_math
+    && flag_unsafe_math_optimizations)
+  {
+    loongarch_emit_swdivsf (operands[0], operands[1],
+	operands[2], V8SFmode);
+    DONE;
+  }
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
 	(div:FLASX (match_operand:FLASX 1 "register_operand" "f")
 		   (match_operand:FLASX 2 "register_operand" "f")))]
@@ -1223,7 +1241,23 @@
   [(set_attr "type" "simd_fmadd")
    (set_attr "mode" "<MODE>")])

-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:FLASX 0 "register_operand")
+    (sqrt:FLASX (match_operand:FLASX 1 "register_operand")))]
+  "ISA_HAS_LASX"
+{
+  if (<MODE>mode == V8SFmode
+      && TARGET_RECIP_VEC_SQRT
+      && flag_unsafe_math_optimizations
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math)
+    {
+      loongarch_emit_swrsqrtsf (operands[0], operands[1], V8SFmode, 0);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
 	(sqrt:FLASX (match_operand:FLASX 1 "register_operand" "f")))]
   "ISA_HAS_LASX"
@@ -1646,7 +1680,20 @@
   [(set_attr "type" "simd_fdiv")
    (set_attr "mode" "<MODE>")])

-(define_insn "rsqrt<mode>2"
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FLASX 0 "register_operand" "=f")
+    (unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
+	     UNSPEC_LASX_XVFRSQRT))]
+  "ISA_HAS_LASX"
+ {
+   if (<MODE>mode == V8SFmode && TARGET_RECIP_VEC_RSQRT)
+     {
+       loongarch_emit_swrsqrtsf (operands[0], operands[1], V8SFmode, 1);
+       DONE;
+     }
+})
+
+(define_insn "*rsqrt<mode>2"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
     (unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
 		  UNSPEC_LASX_XVFRSQRT))]
diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h
index 51d38177b..117669e9f 100644
--- a/gcc/config/loongarch/loongarch-protos.h
+++ b/gcc/config/loongarch/loongarch-protos.h
@@ -220,5 +220,7 @@ extern rtx loongarch_gen_const_int_vector_shuffle (machine_mode, int);
 extern tree loongarch_build_builtin_va_list (void);

 extern rtx loongarch_build_signbit_mask (machine_mode, bool, bool);
+extern void loongarch_emit_swrsqrtsf (rtx, rtx, machine_mode, bool);
+extern void loongarch_emit_swdivsf (rtx, rtx, rtx, machine_mode);
 extern bool loongarch_explicit_relocs_p (enum loongarch_symbol_type);
 #endif /* ! GCC_LOONGARCH_PROTOS_H */
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 95aa9453b..18326ce47 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -7548,6 +7548,71 @@ loongarch_option_override_internal (struct gcc_options *opts,

   /* Function to allocate machine-dependent function status.  */
   init_machine_status = &loongarch_init_machine_status;
+
+  /* -mrecip options.  */
+  static struct
+    {
+      const char *string;	    /* option name.  */
+      unsigned int mask;	    /* mask bits to set.  */
+    }
+  const recip_options[] = {
+	{ "all",       RECIP_MASK_ALL },
+	{ "none",      RECIP_MASK_NONE },
+	{ "div",       RECIP_MASK_DIV },
+	{ "sqrt",      RECIP_MASK_SQRT },
+	{ "rsqrt",     RECIP_MASK_RSQRT },
+	{ "vec-div",   RECIP_MASK_VEC_DIV },
+	{ "vec-sqrt",  RECIP_MASK_VEC_SQRT },
+	{ "vec-rsqrt", RECIP_MASK_VEC_RSQRT },
+  };
+
+  if (loongarch_recip_name)
+    {
+      char *p = ASTRDUP (loongarch_recip_name);
+      char *q;
+      unsigned int mask, i;
+      bool invert;
+
+      while ((q = strtok (p, ",")) != NULL)
+	{
+	  p = NULL;
+	  if (*q == '!')
+	    {
+	      invert = true;
+	      q++;
+	    }
+	  else
+	    invert = false;
+
+	  if (!strcmp (q, "default"))
+	    mask = RECIP_MASK_ALL;
+	  else
+	    {
+	      for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+		if (!strcmp (q, recip_options[i].string))
+		  {
+		    mask = recip_options[i].mask;
+		    break;
+		  }
+
+	      if (i == ARRAY_SIZE (recip_options))
+		{
+		  error ("unknown option for %<-mrecip=%s%>", q);
+		  invert = false;
+		  mask = RECIP_MASK_NONE;
+		}
+	    }
+
+	  if (invert)
+	    recip_mask &= ~mask;
+	  else
+	    recip_mask |= mask;
+	}
+    }
+  if (loongarch_recip)
+    recip_mask |= RECIP_MASK_ALL;
+  if (!TARGET_FRECIPE)
+    recip_mask = RECIP_MASK_NONE;
 }


@@ -11470,6 +11535,126 @@ loongarch_build_signbit_mask (machine_mode mode, bool vect, bool invert)
   return force_reg (vec_mode, v);
 }

+/* Set RES to a single precision approximation of sqrt (A), or of
+   1 / sqrt (A) if RECIP, from rsqrte plus one Newton-Raphson step.  */
+
+void loongarch_emit_swrsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
+{
+  rtx x0, e0, e1, e2, mhalf, monehalf;
+  REAL_VALUE_TYPE r;
+  int unspec;
+
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  e1 = gen_reg_rtx (mode);
+  e2 = gen_reg_rtx (mode);
+
+  real_arithmetic (&r, ABS_EXPR, &dconsthalf, NULL);
+  mhalf = const_double_from_real_value (r, SFmode);
+
+  real_arithmetic (&r, PLUS_EXPR, &dconsthalf, &dconst1);
+  monehalf = const_double_from_real_value (r, SFmode);
+  unspec = UNSPEC_RSQRTE;
+
+  if (VECTOR_MODE_P (mode))
+    {
+      mhalf = loongarch_build_const_vector (mode, true, mhalf);
+      monehalf = loongarch_build_const_vector (mode, true, monehalf);
+      unspec = GET_MODE_SIZE (mode) == 32 ? UNSPEC_LASX_XVFRSQRTE
+					  : UNSPEC_LSX_VFRSQRTE;
+    }
+
+  /* rsqrt(a) =  rsqrte(a) * (1.5 - 0.5 * a * rsqrte(a) * rsqrte(a))
+     sqrt(a)  =  a * rsqrte(a) * (1.5 - 0.5 * a * rsqrte(a) * rsqrte(a))  */
+
+  a = force_reg (mode, a);
+
+  /* x0 = rsqrt(a) estimate.  */
+  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+					      unspec)));
+
+  /* For sqrt, zero x0 where a == 0.0 so its infinity cannot produce NaN.  */
+  if (!recip)
+    {
+      rtx zero = force_reg (mode, CONST0_RTX (mode));
+
+      if (VECTOR_MODE_P (mode))
+	{
+	  machine_mode imode = related_int_vector_mode (mode).require ();
+	  rtx mask = gen_reg_rtx (imode);
+	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (imode, a, zero)));
+	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0,
+						   gen_lowpart (mode, mask))));
+	}
+      else
+	{
+	  rtx target = emit_conditional_move (x0, { GT, a, zero, mode },
+					      x0, zero, mode, 0);
+	  if (target != x0)
+	    emit_move_insn (x0, target);
+	}
+    }
+
+  /* e0 = x0 * a  */
+  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+  /* e1 = e0 * x0  */
+  emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+  /* e2 = 1.5 - e1 * 0.5  */
+  mhalf = force_reg (mode, mhalf);
+  monehalf = force_reg (mode, monehalf);
+  emit_insn (gen_rtx_SET (e2, gen_rtx_FMA (mode,
+					   gen_rtx_NEG (mode, e1),
+							mhalf, monehalf)));
+
+  if (recip)
+    /* res = e2 * x0  */
+    emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, x0, e2)));
+  else
+    /* res = e2 * e0  */
+    emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e0)));
+}
+
+/* Set RES to a single precision approximation of A / B in MODE, from
+   recipe (B) refined by one Newton-Raphson step.  */
+
+void loongarch_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
+{
+  rtx x0, e0, mtwo;
+  REAL_VALUE_TYPE r;
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  int unspec = UNSPEC_RECIPE;
+
+  real_arithmetic (&r, ABS_EXPR, &dconst2, NULL);
+  mtwo = const_double_from_real_value (r, SFmode);
+
+  if (VECTOR_MODE_P (mode))
+    {
+      mtwo = loongarch_build_const_vector (mode, true, mtwo);
+      unspec = GET_MODE_SIZE (mode) == 32 ? UNSPEC_LASX_XVFRECIPE
+					  : UNSPEC_LSX_VFRECIPE;
+    }
+
+  mtwo = force_reg (mode, mtwo);
+
+  /* a / b = a * recipe(b) * (2.0 - b * recipe(b))  */
+
+  /* x0 = 1./b estimate.  */
+  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					      unspec)));
+  /* e0 = 2.0 - b * x0  */
+  emit_insn (gen_rtx_SET (e0, gen_rtx_FMA (mode,
+					   gen_rtx_NEG (mode, b), x0, mtwo)));
+
+  /* x0 = a * x0, skipped when a is the constant 1.0.  */
+  if (a != CONST1_RTX (mode))
+    emit_insn (gen_rtx_SET (x0, gen_rtx_MULT (mode, a, x0)));
+
+  /* res = e0 * x0  */
+  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
+}
+
 static bool
 loongarch_builtin_support_vector_misalignment (machine_mode mode,
 					       const_tree type,
@@ -11665,6 +11850,9 @@ loongarch_asm_code_end (void)
 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
   loongarch_autovectorize_vector_modes

+#undef TARGET_OPTAB_SUPPORTED_P
+#define TARGET_OPTAB_SUPPORTED_P loongarch_optab_supported_p
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS loongarch_init_builtins
 #undef TARGET_BUILTIN_DECL
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index 8b28be0e4..fbc0f53e4 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -702,6 +702,24 @@ enum reg_class
    && (GET_MODE_CLASS (MODE) == MODE_VECTOR_INT		\
        || GET_MODE_CLASS (MODE) == MODE_VECTOR_FLOAT))

+#define RECIP_MASK_NONE         0x00
+#define RECIP_MASK_DIV          0x01
+#define RECIP_MASK_SQRT         0x02
+#define RECIP_MASK_RSQRT        0x04
+#define RECIP_MASK_VEC_DIV      0x08
+#define RECIP_MASK_VEC_SQRT     0x10
+#define RECIP_MASK_VEC_RSQRT    0x20
+#define RECIP_MASK_ALL (RECIP_MASK_DIV | RECIP_MASK_SQRT \
+			| RECIP_MASK_RSQRT | RECIP_MASK_VEC_SQRT \
+			| RECIP_MASK_VEC_DIV | RECIP_MASK_VEC_RSQRT)
+
+#define TARGET_RECIP_DIV        ((recip_mask & RECIP_MASK_DIV) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_SQRT       ((recip_mask & RECIP_MASK_SQRT) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_RSQRT      ((recip_mask & RECIP_MASK_RSQRT) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_DIV    ((recip_mask & RECIP_MASK_VEC_DIV) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_SQRT   ((recip_mask & RECIP_MASK_VEC_SQRT) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_RSQRT  ((recip_mask & RECIP_MASK_VEC_RSQRT) != 0 || TARGET_uARCH_LA664)
+
 /* 1 if N is a possible register number for function argument passing.
    We have no FP argument registers when soft-float.  */

diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 4dfe583e2..c6edd1dda 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -893,9 +893,21 @@
 ;; Float division and modulus.
 (define_expand "div<mode>3"
   [(set (match_operand:ANYF 0 "register_operand")
-	(div:ANYF (match_operand:ANYF 1 "reg_or_1_operand")
-		  (match_operand:ANYF 2 "register_operand")))]
-  "")
+    (div:ANYF (match_operand:ANYF 1 "reg_or_1_operand")
+	      (match_operand:ANYF 2 "register_operand")))]
+  ""
+{
+  if (<MODE>mode == SFmode
+    && TARGET_RECIP_DIV
+    && optimize_insn_for_speed_p ()
+    && flag_finite_math_only && !flag_trapping_math
+    && flag_unsafe_math_optimizations)
+  {
+    loongarch_emit_swdivsf (operands[0], operands[1],
+	operands[2], SFmode);
+    DONE;
+  }
+})

 (define_insn "*div<mode>3"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
@@ -1126,7 +1138,23 @@
 ;;
 ;;  ....................

-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand")
+    (sqrt:ANYF (match_operand:ANYF 1 "register_operand")))]
+  ""
+ {
+  if (<MODE>mode == SFmode
+      && TARGET_RECIP_SQRT
+      && flag_unsafe_math_optimizations
+      && !optimize_insn_for_size_p ()
+      && flag_finite_math_only && !flag_trapping_math)
+    {
+      loongarch_emit_swrsqrtsf (operands[0], operands[1], SFmode, 0);
+      DONE;
+    }
+ })
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
 	(sqrt:ANYF (match_operand:ANYF 1 "register_operand" "f")))]
   ""
@@ -1135,6 +1163,19 @@
    (set_attr "mode" "<UNITMODE>")
    (set_attr "insn_count" "1")])

+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand")
+    (unspec:ANYF [(match_operand:ANYF 1 "register_operand")]
+	   UNSPEC_RSQRT))]
+  "TARGET_HARD_FLOAT"
+{
+   if (<MODE>mode == SFmode && TARGET_RECIP_RSQRT)
+     {
+       loongarch_emit_swrsqrtsf (operands[0], operands[1], SFmode, 1);
+       DONE;
+     }
+})
+
 (define_insn "*rsqrt<mode>2"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
     (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")]
diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt
index e7bc8bed4..56f6a9564 100644
--- a/gcc/config/loongarch/loongarch.opt
+++ b/gcc/config/loongarch/loongarch.opt
@@ -31,6 +31,9 @@ config/loongarch/loongarch-opts.h
 HeaderInclude
 config/loongarch/loongarch-str.h

+TargetVariable
+unsigned int recip_mask = 0
+
 ; ISA related options
 ;; Base ISA
 Enum
@@ -202,6 +205,14 @@ mexplicit-relocs
 Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
 Use %reloc() assembly operators (for backward compatibility).

+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
 ; The code model option names for -mcmodel.
 Enum
 Name(cmodel) Type(int)
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index f2774f021..dbdb42301 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1083,7 +1083,25 @@
   [(set_attr "type" "simd_fmul")
    (set_attr "mode" "<MODE>")])

-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+  [(set (match_operand:FLSX 0 "register_operand")
+    (div:FLSX (match_operand:FLSX 1 "reg_or_vecotr_1_operand")
+	      (match_operand:FLSX 2 "register_operand")))]
+  "ISA_HAS_LSX"
+{
+  if (<MODE>mode == V4SFmode
+    && TARGET_RECIP_VEC_DIV
+    && optimize_insn_for_speed_p ()
+    && flag_finite_math_only && !flag_trapping_math
+    && flag_unsafe_math_optimizations)
+  {
+    loongarch_emit_swdivsf (operands[0], operands[1],
+	operands[2], V4SFmode);
+    DONE;
+  }
+})
+
+(define_insn "*div<mode>3"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
 	(div:FLSX (match_operand:FLSX 1 "register_operand" "f")
 		  (match_operand:FLSX 2 "register_operand" "f")))]
@@ -1112,7 +1130,23 @@
   [(set_attr "type" "simd_fmadd")
    (set_attr "mode" "<MODE>")])

-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:FLSX 0 "register_operand")
+    (sqrt:FLSX (match_operand:FLSX 1 "register_operand")))]
+  "ISA_HAS_LSX"
+{
+  if (<MODE>mode == V4SFmode
+      && TARGET_RECIP_VEC_SQRT
+      && flag_unsafe_math_optimizations
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math)
+    {
+      loongarch_emit_swrsqrtsf (operands[0], operands[1], V4SFmode, 0);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
 	(sqrt:FLSX (match_operand:FLSX 1 "register_operand" "f")))]
   "ISA_HAS_LSX"
@@ -1559,7 +1593,20 @@
   [(set_attr "type" "simd_fdiv")
    (set_attr "mode" "<MODE>")])

-(define_insn "rsqrt<mode>2"
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FLSX 0 "register_operand" "=f")
+    (unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
+	     UNSPEC_LSX_VFRSQRT))]
+ "ISA_HAS_LSX"
+{
+ if (<MODE>mode == V4SFmode && TARGET_RECIP_VEC_RSQRT)
+   {
+     loongarch_emit_swrsqrtsf (operands[0], operands[1], V4SFmode, 1);
+     DONE;
+   }
+})
+
+(define_insn "*rsqrt<mode>2"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
     (unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
 		 UNSPEC_LSX_VFRSQRT))]
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index 572550dbc..88e54c915 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -235,6 +235,10 @@
   (ior (match_operand 0 "const_1_operand")
        (match_operand 0 "register_operand")))

+(define_predicate "reg_or_vecotr_1_operand"
+  (ior (match_operand 0 "const_vector_1_operand")
+       (match_operand 0 "register_operand")))
+
 ;; These are used in vec_merge, hence accept bitmask as const_int.
 (define_predicate "const_exp_2_operand"
   (and (match_code "const_int")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 168f3d0db..76a8f20d1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1008,7 +1008,8 @@ Objective-C and Objective-C++ Dialects}.
 -mmax-inline-memcpy-size=@var{n} @gol
 -mexplicit-relocs -mno-explicit-relocs @gol
 -mdirect-extern-access -mno-direct-extern-access @gol
--mcmodel=@var{code-model}}
+-mcmodel=@var{code-model} -mrelax -mpass-mrelax-to-as @gol
+-mrecip  -mrecip=@var{opt}}

 @emph{M32R/D Options}
 @gccoptlist{-m32r2  -m32rx  -m32r @gol
@@ -24633,6 +24634,58 @@ kernels, executables linked with @option{-static} or @option{-static-pie}.
 @option{-mdirect-extern-access} is not compatible with @option{-fPIC} or
 @option{-fpic}.

+@opindex mrecip
+@item -mrecip
+This option enables use of the reciprocal estimate and reciprocal square
+root estimate instructions with additional Newton-Raphson steps to increase
+precision instead of doing a divide or square root and divide for
+floating-point arguments.
+These instructions are generated only when @option{-funsafe-math-optimizations}
+is enabled together with @option{-ffinite-math-only} and
+@option{-fno-trapping-math}.
+This option is off by default.  Before using it, make sure that the target
+CPU supports the frecipe and frsqrte instructions.
+Note that while the throughput of the sequence is higher than the throughput of
+the non-reciprocal instruction, the precision of the sequence can be decreased
+by up to 2 ulp (i.e. the inverse of 1.0 equals 0.99999994).
+
+@opindex mrecip=opt
+@item -mrecip=@var{opt}
+This option controls which reciprocal estimate instructions
+may be used.  @var{opt} is a comma-separated list of options, which may
+be preceded by a @samp{!} to invert the option:
+@table @samp
+@item all
+Enable all estimate instructions.
+
+@item default
+Enable the default instructions, equivalent to @option{-mrecip}.
+
+@item none
+Disable all estimate instructions, equivalent to @option{-mno-recip}.
+
+@item div
+Enable the approximation for scalar division.
+
+@item vec-div
+Enable the approximation for vectorized division.
+
+@item sqrt
+Enable the approximation for scalar square root.
+
+@item vec-sqrt
+Enable the approximation for vectorized square root.
+
+@item rsqrt
+Enable the approximation for scalar reciprocal square root.
+
+@item vec-rsqrt
+Enable the approximation for vectorized reciprocal square root.
+@end table
+
+So, for example, @option{-mrecip=all,!sqrt} enables
+all of the reciprocal approximations, except for scalar square root.
+
 @item loongarch-vect-unroll-limit
 The vectorizer will use available tuning information to determine whether it
 would be beneficial to unroll the main vectorized loop and by how much.  This
diff --git a/gcc/testsuite/gcc.target/loongarch/divf.c b/gcc/testsuite/gcc.target/loongarch/divf.c
new file mode 100644
index 000000000..6c831817c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/divf.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mfrecipe -fno-unsafe-math-optimizations" } */
+/* { dg-final { scan-assembler "fdiv.s" } } */
+/* { dg-final { scan-assembler-not "frecipe.s" } } */
+
+float
+foo(float a, float b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/recip-divf.c b/gcc/testsuite/gcc.target/loongarch/recip-divf.c
new file mode 100644
index 000000000..db5e3e488
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/recip-divf.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mfrecipe" } */
+/* { dg-final { scan-assembler "frecipe.s" } } */
+
+float
+foo(float a, float b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c b/gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c
new file mode 100644
index 000000000..7f45db6cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/recip-sqrtf.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mfrecipe" } */
+/* { dg-final { scan-assembler-times "frsqrte.s" 3 } } */
+
+extern float sqrtf (float);
+
+float
+foo1 (float a, float b)
+{
+  return a/sqrtf(b);
+}
+
+float
+foo2 (float a, float b)
+{
+  return sqrtf(a/b);
+}
+
+float
+foo3 (float a)
+{
+  return sqrtf(a);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/sqrtf.c b/gcc/testsuite/gcc.target/loongarch/sqrtf.c
new file mode 100644
index 000000000..c2720faac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/sqrtf.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mfrecipe -fno-unsafe-math-optimizations" } */
+/* { dg-final { scan-assembler-times "fsqrt.s" 3 } } */
+/* { dg-final { scan-assembler-not "frsqrte.s" } } */
+
+extern float sqrtf (float);
+
+float
+foo1 (float a, float b)
+{
+  return a/sqrtf(b);
+}
+
+float
+foo2 (float a, float b)
+{
+  return sqrtf(a/b);
+}
+
+float
+foo3 (float a)
+{
+  return sqrtf(a);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-divf.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-divf.c
new file mode 100644
index 000000000..748a82200
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-divf.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mrecip -mlasx -mfrecipe -fno-unsafe-math-optimizations" } */
+/* { dg-final { scan-assembler "xvfdiv.s" } } */
+/* { dg-final { scan-assembler-not "xvfrecipe.s" } } */
+
+float a[8],b[8],c[8];
+
+void
+foo ()
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = a[i] / b[i];
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c
new file mode 100644
index 000000000..6532756f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-divf.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlasx -mfrecipe" } */
+/* { dg-final { scan-assembler "xvfrecipe.s" } } */
+
+float a[8],b[8],c[8];
+
+void
+foo ()
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = a[i] / b[i];
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c
new file mode 100644
index 000000000..a623dff8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip-sqrtf.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlasx -mfrecipe" } */
+/* { dg-final { scan-assembler-times "xvfrsqrte.s" 3 } } */
+
+float a[8], b[8], c[8];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = sqrtf (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip.c
new file mode 100644
index 000000000..083c86840
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-recip.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlasx -fno-vect-cost-model" } */
+/* { dg-final { scan-assembler "xvfrecip.s" } } */
+/* { dg-final { scan-assembler "xvfrecip.d" } } */
+/* { dg-final { scan-assembler-not "xvfdiv.s" } } */
+/* { dg-final { scan-assembler-not "xvfdiv.d" } } */
+
+float a[8], b[8];
+
+void
+foo1(void)
+{
+  for (int i = 0; i < 8; i++)
+    a[i] = 1 / (b[i]);
+}
+
+double da[4], db[4];
+
+void
+foo2(void)
+{
+  for (int i = 0; i < 4; i++)
+    da[i] = 1 / (db[i]);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-sqrtf.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-sqrtf.c
new file mode 100644
index 000000000..a005a3886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-sqrtf.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -fno-unsafe-math-optimizations  -mrecip -mlasx -mfrecipe" } */
+/* { dg-final { scan-assembler-times "xvfsqrt.s" 3 } } */
+/* { dg-final { scan-assembler-not "xvfrsqrte.s" } } */
+
+float a[8], b[8], c[8];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+  for (int i = 0; i < 8; i++)
+    c[i] = sqrtf (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-divf.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-divf.c
new file mode 100644
index 000000000..1219b1ef8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-divf.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -mfrecipe -fno-unsafe-math-optimizations" } */
+/* { dg-final { scan-assembler "vfdiv.s" } } */
+/* { dg-final { scan-assembler-not "vfrecipe.s" } } */
+
+float a[4],b[4],c[4];
+
+void
+foo ()
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = a[i] / b[i];
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c
new file mode 100644
index 000000000..edbe8d909
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-divf.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -mfrecipe" } */
+/* { dg-final { scan-assembler "vfrecipe.s" } } */
+
+float a[4],b[4],c[4];
+
+void
+foo ()
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = a[i] / b[i];
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c
new file mode 100644
index 000000000..d356f915e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip-sqrtf.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -mfrecipe" } */
+/* { dg-final { scan-assembler-times "vfrsqrte.s" 3 } } */
+
+float a[4], b[4], c[4];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = sqrtf (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip.c
new file mode 100644
index 000000000..c4d6af4db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-recip.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlsx -fno-vect-cost-model" } */
+/* { dg-final { scan-assembler "vfrecip.s" } } */
+/* { dg-final { scan-assembler "vfrecip.d" } } */
+/* { dg-final { scan-assembler-not "vfdiv.s" } } */
+/* { dg-final { scan-assembler-not "vfdiv.d" } } */
+
+float a[4], b[4];
+
+void
+foo1(void)
+{
+  for (int i = 0; i < 4; i++)
+    a[i] = 1 / (b[i]);
+}
+
+double da[2], db[2];
+
+void
+foo2(void)
+{
+  for (int i = 0; i < 2; i++)
+    da[i] = 1 / (db[i]);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-sqrtf.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-sqrtf.c
new file mode 100644
index 000000000..3ff6570a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-sqrtf.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -mfrecipe -fno-unsafe-math-optimizations" } */
+/* { dg-final { scan-assembler-times "vfsqrt.s" 3 } } */
+/* { dg-final { scan-assembler-not "vfrsqrte.s" } } */
+
+float a[4], b[4], c[4];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+  for (int i = 0; i < 4; i++)
+    c[i] = sqrtf (a[i]);
+}
--
2.43.0