392 lines
11 KiB
Diff
392 lines
11 KiB
Diff
From 4498010fba61c1446286c96cbda24d5ed53c53c7 Mon Sep 17 00:00:00 2001
|
|
From: Xi Ruoyao <xry111@xry111.site>
|
|
Date: Mon, 6 Nov 2023 16:06:08 +0800
|
|
Subject: [PATCH 027/188] LoongArch: Remove redundant barrier instructions
|
|
before LL-SC loops
|
|
|
|
This is isomorphic to the LLVM changes [1-2].
|
|
|
|
On LoongArch, the LL and SC instructions have memory barrier semantics:
|
|
|
|
- LL: <memory-barrier> + <load-exclusive>
|
|
- SC: <store-conditional> + <memory-barrier>
|
|
|
|
But the compare and swap operation is allowed to fail, and if it fails
|
|
the SC instruction is not executed, thus the guarantee of acquiring
|
|
semantics cannot be ensured. Therefore, an acquire barrier needs to be
|
|
generated when failure_memorder includes an acquire operation.
|
|
|
|
On CPUs implementing LoongArch v1.10 or later, "dbar 0b10100" is an
|
|
acquire barrier; on CPUs implementing LoongArch v1.00, it is a full
|
|
barrier. So it's always enough for acquire semantics. OTOH if an
|
|
acquire semantic is not needed, we still need the "dbar 0x700" as the
|
|
load-load barrier like all LL-SC loops.
|
|
|
|
[1]:https://github.com/llvm/llvm-project/pull/67391
|
|
[2]:https://github.com/llvm/llvm-project/pull/69339
|
|
|
|
gcc/ChangeLog:
|
|
|
|
* config/loongarch/loongarch.cc
|
|
(loongarch_memmodel_needs_release_fence): Remove.
|
|
(loongarch_cas_failure_memorder_needs_acquire): New static
|
|
function.
|
|
(loongarch_print_operand): Redefine 'G' for the barrier on CAS
|
|
failure.
|
|
* config/loongarch/sync.md (atomic_cas_value_strong<mode>):
|
|
Remove the redundant barrier before the LL instruction, and
|
|
emit an acquire barrier on failure if needed by
|
|
failure_memorder.
|
|
(atomic_cas_value_cmp_and_7_<mode>): Likewise.
|
|
(atomic_cas_value_add_7_<mode>): Remove the unnecessary barrier
|
|
before the LL instruction.
|
|
(atomic_cas_value_sub_7_<mode>): Likewise.
|
|
(atomic_cas_value_and_7_<mode>): Likewise.
|
|
(atomic_cas_value_xor_7_<mode>): Likewise.
|
|
(atomic_cas_value_or_7_<mode>): Likewise.
|
|
(atomic_cas_value_nand_7_<mode>): Likewise.
|
|
(atomic_cas_value_exchange_7_<mode>): Likewise.
|
|
|
|
gcc/testsuite/ChangeLog:
|
|
|
|
* gcc.target/loongarch/cas-acquire.c: New test.
|
|
---
|
|
gcc/config/loongarch/loongarch.cc | 30 ++++---
|
|
gcc/config/loongarch/sync.md | 49 +++++------
|
|
.../gcc.target/loongarch/cas-acquire.c | 82 +++++++++++++++++++
|
|
3 files changed, 119 insertions(+), 42 deletions(-)
|
|
create mode 100644 gcc/testsuite/gcc.target/loongarch/cas-acquire.c
|
|
|
|
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
|
|
index 6d580ee75..8467f03cf 100644
|
|
--- a/gcc/config/loongarch/loongarch.cc
|
|
+++ b/gcc/config/loongarch/loongarch.cc
|
|
@@ -5829,27 +5829,27 @@ loongarch_memmodel_needs_rel_acq_fence (enum memmodel model)
|
|
}
|
|
}
|
|
|
|
-/* Return true if a FENCE should be emitted to before a memory access to
|
|
- implement the release portion of memory model MODEL. */
|
|
+/* Return true if a FENCE should be emitted after a failed CAS to
|
|
+ implement the acquire semantic of failure_memorder. */
|
|
|
|
static bool
|
|
-loongarch_memmodel_needs_release_fence (enum memmodel model)
|
|
+loongarch_cas_failure_memorder_needs_acquire (enum memmodel model)
|
|
{
|
|
- switch (model)
|
|
+ switch (memmodel_base (model))
|
|
{
|
|
+ case MEMMODEL_ACQUIRE:
|
|
case MEMMODEL_ACQ_REL:
|
|
case MEMMODEL_SEQ_CST:
|
|
- case MEMMODEL_SYNC_SEQ_CST:
|
|
- case MEMMODEL_RELEASE:
|
|
- case MEMMODEL_SYNC_RELEASE:
|
|
return true;
|
|
|
|
- case MEMMODEL_ACQUIRE:
|
|
- case MEMMODEL_CONSUME:
|
|
- case MEMMODEL_SYNC_ACQUIRE:
|
|
case MEMMODEL_RELAXED:
|
|
+ case MEMMODEL_RELEASE:
|
|
return false;
|
|
|
|
+ /* MEMMODEL_CONSUME is deliberately not handled because it's always
|
|
+ replaced by MEMMODEL_ACQUIRE as at now. If you see an ICE caused by
|
|
+ MEMMODEL_CONSUME, read the change (re)introducing it carefully and
|
|
+ decide what to do. See PR 59448 and get_memmodel in builtins.cc. */
|
|
default:
|
|
gcc_unreachable ();
|
|
}
|
|
@@ -5962,7 +5962,8 @@ loongarch_print_operand_reloc (FILE *file, rtx op, bool hi64_part,
|
|
'd' Print CONST_INT OP in decimal.
|
|
'E' Print CONST_INT OP element 0 of a replicated CONST_VECTOR in decimal.
|
|
'F' Print the FPU branch condition for comparison OP.
|
|
- 'G' Print a DBAR insn if the memory model requires a release.
|
|
+ 'G' Print a DBAR insn for CAS failure (with an acquire semantic if
|
|
+ needed, otherwise a simple load-load barrier).
|
|
'H' Print address 52-61bit relocation associated with OP.
|
|
'h' Print the high-part relocation associated with OP.
|
|
'i' Print i if the operand is not a register.
|
|
@@ -6053,8 +6054,11 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
|
|
break;
|
|
|
|
case 'G':
|
|
- if (loongarch_memmodel_needs_release_fence ((enum memmodel) INTVAL (op)))
|
|
- fputs ("dbar\t0", file);
|
|
+ if (loongarch_cas_failure_memorder_needs_acquire (
|
|
+ memmodel_from_int (INTVAL (op))))
|
|
+ fputs ("dbar\t0b10100", file);
|
|
+ else
|
|
+ fputs ("dbar\t0x700", file);
|
|
break;
|
|
|
|
case 'h':
|
|
diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md
|
|
index efa40f24c..dd1f98946 100644
|
|
--- a/gcc/config/loongarch/sync.md
|
|
+++ b/gcc/config/loongarch/sync.md
|
|
@@ -162,19 +162,18 @@
|
|
(clobber (match_scratch:GPR 6 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G5\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"bne\\t%0,%z2,2f\\n\\t"
|
|
"or%i3\\t%6,$zero,%3\\n\\t"
|
|
"sc.<amo>\\t%6,%1\\n\\t"
|
|
- "beq\\t$zero,%6,1b\\n\\t"
|
|
+ "beqz\\t%6,1b\\n\\t"
|
|
"b\\t3f\\n\\t"
|
|
"2:\\n\\t"
|
|
- "dbar\\t0x700\\n\\t"
|
|
+ "%G5\\n\\t"
|
|
"3:\\n\\t";
|
|
}
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_expand "atomic_compare_and_swap<mode>"
|
|
[(match_operand:SI 0 "register_operand" "") ;; bool output
|
|
@@ -267,8 +266,7 @@
|
|
(clobber (match_scratch:GPR 7 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%2\\n\\t"
|
|
"bne\\t%7,%z4,2f\\n\\t"
|
|
@@ -278,10 +276,10 @@
|
|
"beq\\t$zero,%7,1b\\n\\t"
|
|
"b\\t3f\\n\\t"
|
|
"2:\\n\\t"
|
|
- "dbar\\t0x700\\n\\t"
|
|
+ "%G6\\n\\t"
|
|
"3:\\n\\t";
|
|
}
|
|
- [(set (attr "length") (const_int 40))])
|
|
+ [(set (attr "length") (const_int 36))])
|
|
|
|
(define_expand "atomic_compare_and_swap<mode>"
|
|
[(match_operand:SI 0 "register_operand" "") ;; bool output
|
|
@@ -336,8 +334,7 @@
|
|
(clobber (match_scratch:GPR 8 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%3\\n\\t"
|
|
"add.w\\t%8,%0,%z5\\n\\t"
|
|
@@ -347,7 +344,7 @@
|
|
"beq\\t$zero,%7,1b";
|
|
}
|
|
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_insn "atomic_cas_value_sub_7_<mode>"
|
|
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
|
|
@@ -363,8 +360,7 @@
|
|
(clobber (match_scratch:GPR 8 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%3\\n\\t"
|
|
"sub.w\\t%8,%0,%z5\\n\\t"
|
|
@@ -373,7 +369,7 @@
|
|
"sc.<amo>\\t%7,%1\\n\\t"
|
|
"beq\\t$zero,%7,1b";
|
|
}
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_insn "atomic_cas_value_and_7_<mode>"
|
|
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
|
|
@@ -389,8 +385,7 @@
|
|
(clobber (match_scratch:GPR 8 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%3\\n\\t"
|
|
"and\\t%8,%0,%z5\\n\\t"
|
|
@@ -399,7 +394,7 @@
|
|
"sc.<amo>\\t%7,%1\\n\\t"
|
|
"beq\\t$zero,%7,1b";
|
|
}
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_insn "atomic_cas_value_xor_7_<mode>"
|
|
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
|
|
@@ -415,8 +410,7 @@
|
|
(clobber (match_scratch:GPR 8 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%3\\n\\t"
|
|
"xor\\t%8,%0,%z5\\n\\t"
|
|
@@ -426,7 +420,7 @@
|
|
"beq\\t$zero,%7,1b";
|
|
}
|
|
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_insn "atomic_cas_value_or_7_<mode>"
|
|
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
|
|
@@ -442,8 +436,7 @@
|
|
(clobber (match_scratch:GPR 8 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%3\\n\\t"
|
|
"or\\t%8,%0,%z5\\n\\t"
|
|
@@ -453,7 +446,7 @@
|
|
"beq\\t$zero,%7,1b";
|
|
}
|
|
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_insn "atomic_cas_value_nand_7_<mode>"
|
|
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
|
|
@@ -469,8 +462,7 @@
|
|
(clobber (match_scratch:GPR 8 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%3\\n\\t"
|
|
"and\\t%8,%0,%z5\\n\\t"
|
|
@@ -479,7 +471,7 @@
|
|
"sc.<amo>\\t%7,%1\\n\\t"
|
|
"beq\\t$zero,%7,1b";
|
|
}
|
|
- [(set (attr "length") (const_int 32))])
|
|
+ [(set (attr "length") (const_int 28))])
|
|
|
|
(define_insn "atomic_cas_value_exchange_7_<mode>"
|
|
[(set (match_operand:GPR 0 "register_operand" "=&r")
|
|
@@ -494,8 +486,7 @@
|
|
(clobber (match_scratch:GPR 7 "=&r"))]
|
|
""
|
|
{
|
|
- return "%G6\\n\\t"
|
|
- "1:\\n\\t"
|
|
+ return "1:\\n\\t"
|
|
"ll.<amo>\\t%0,%1\\n\\t"
|
|
"and\\t%7,%0,%z3\\n\\t"
|
|
"or%i5\\t%7,%7,%5\\n\\t"
|
|
diff --git a/gcc/testsuite/gcc.target/loongarch/cas-acquire.c b/gcc/testsuite/gcc.target/loongarch/cas-acquire.c
|
|
new file mode 100644
|
|
index 000000000..ff7ba866f
|
|
--- /dev/null
|
|
+++ b/gcc/testsuite/gcc.target/loongarch/cas-acquire.c
|
|
@@ -0,0 +1,82 @@
|
|
+/* { dg-do run } */
|
|
+/* { dg-require-effective-target c99_runtime } */
|
|
+/* { dg-require-effective-target pthread } */
|
|
+/* { dg-options "-std=c99 -pthread" } */
|
|
+
|
|
+/* https://github.com/llvm/llvm-project/pull/67391#issuecomment-1752403934
|
|
+ reported that this had failed with GCC and 3A6000. */
|
|
+
|
|
+#include <pthread.h>
|
|
+#include <stdatomic.h>
|
|
+#include <stdbool.h>
|
|
+#include <stdio.h>
|
|
+
|
|
+static unsigned int tags[32];
|
|
+static unsigned int vals[32];
|
|
+
|
|
+static void *
|
|
+writer_entry (void *data)
|
|
+{
|
|
+ atomic_uint *pt = (atomic_uint *)tags;
|
|
+ atomic_uint *pv = (atomic_uint *)vals;
|
|
+
|
|
+ for (unsigned int n = 1; n < 10000; n++)
|
|
+ {
|
|
+ atomic_store_explicit (&pv[n & 31], n, memory_order_release);
|
|
+ atomic_store_explicit (&pt[n & 31], n, memory_order_release);
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void *
|
|
+reader_entry (void *data)
|
|
+{
|
|
+ atomic_uint *pt = (atomic_uint *)tags;
|
|
+ atomic_uint *pv = (atomic_uint *)vals;
|
|
+ int i;
|
|
+
|
|
+ for (;;)
|
|
+ {
|
|
+ for (i = 0; i < 32; i++)
|
|
+ {
|
|
+ unsigned int tag = 0;
|
|
+ bool res;
|
|
+
|
|
+ res = atomic_compare_exchange_weak_explicit (
|
|
+ &pt[i], &tag, 0, memory_order_acquire, memory_order_acquire);
|
|
+ if (!res)
|
|
+ {
|
|
+ unsigned int val;
|
|
+
|
|
+ val = atomic_load_explicit (&pv[i], memory_order_relaxed);
|
|
+ if (val < tag)
|
|
+ __builtin_trap ();
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+int
|
|
+main (int argc, char *argv[])
|
|
+{
|
|
+ pthread_t writer;
|
|
+ pthread_t reader;
|
|
+ int res;
|
|
+
|
|
+ res = pthread_create (&writer, NULL, writer_entry, NULL);
|
|
+ if (res < 0)
|
|
+ __builtin_trap ();
|
|
+
|
|
+ res = pthread_create (&reader, NULL, reader_entry, NULL);
|
|
+ if (res < 0)
|
|
+ __builtin_trap ();
|
|
+
|
|
+ res = pthread_join (writer, NULL);
|
|
+ if (res < 0)
|
|
+ __builtin_trap ();
|
|
+
|
|
+ return 0;
|
|
+}
|
|
--
|
|
2.43.0
|
|
|