llvm-bolt/0009-support-aarch64-instrumentation.patch

2631 lines
94 KiB
Diff
Raw Permalink Normal View History

From a7d826d3985dd886523df050949f1c3c151df636 Mon Sep 17 00:00:00 2001
From: rfwang07 <wangrufeng5@huawei.com>
Date: Thu, 31 Oct 2024 15:34:10 +0800
Subject: [PATCH] support aarch64 instrumentation
---
bolt/CMakeLists.txt | 6 +-
bolt/include/bolt/Core/MCPlusBuilder.h | 24 +-
bolt/lib/Core/BinaryFunction.cpp | 6 +
bolt/lib/Passes/Instrumentation.cpp | 28 +-
bolt/lib/Passes/MCF.cpp | 1 +
bolt/lib/Passes/TailDuplication.cpp | 2 +-
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 446 +++++++++++++++++-
bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 67 +--
bolt/runtime/CMakeLists.txt | 12 +-
bolt/runtime/common.h | 417 ++--------------
bolt/runtime/instr.cpp | 61 ++-
bolt/runtime/sys_aarch64.h | 394 ++++++++++++++++
bolt/runtime/sys_x86_64.h | 360 ++++++++++++++
bolt/test/AArch64/exclusive-instrument.s | 39 ++
bolt/test/X86/asm-dump.c | 5 +-
...olt-address-translation-internal-call.test | 9 +-
.../test/X86/instrumentation-eh_frame_hdr.cpp | 2 +-
bolt/test/X86/internal-call-instrument.s | 24 +-
bolt/test/X86/tail-duplication-pass.s | 9 +
bolt/test/assume-abi.test | 7 +
.../AArch64/Inputs/basic-instrumentation.s | 9 +
.../AArch64/basic-instrumentation.test | 22 +
.../AArch64/instrumentation-ind-call.c | 38 ++
.../{X86 => }/Inputs/exceptions_split.cpp | 16 +-
.../runtime/X86/instrumentation-tail-call.s | 6 +-
.../{X86 => }/exceptions-instrumentation.test | 0
.../{X86 => }/pie-exceptions-split.test | 4 +-
27 files changed, 1545 insertions(+), 469 deletions(-)
create mode 100644 bolt/runtime/sys_aarch64.h
create mode 100644 bolt/runtime/sys_x86_64.h
create mode 100644 bolt/test/AArch64/exclusive-instrument.s
create mode 100644 bolt/test/assume-abi.test
create mode 100644 bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
create mode 100644 bolt/test/runtime/AArch64/basic-instrumentation.test
create mode 100644 bolt/test/runtime/AArch64/instrumentation-ind-call.c
rename bolt/test/runtime/{X86 => }/Inputs/exceptions_split.cpp (85%)
rename bolt/test/runtime/{X86 => }/exceptions-instrumentation.test (100%)
rename bolt/test/runtime/{X86 => }/pie-exceptions-split.test (95%)
diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 4ff90c1..89462f8 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -32,10 +32,10 @@ foreach (tgt ${BOLT_TARGETS_TO_BUILD})
endforeach()
set(BOLT_ENABLE_RUNTIME_default OFF)
-if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
+if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
+ OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
- OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
- AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD)
+ OR CMAKE_SYSTEM_NAME STREQUAL "Darwin"))
set(BOLT_ENABLE_RUNTIME_default ON)
endif()
option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default})
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index beb0675..e6945c9 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -498,9 +498,9 @@ public:
}
/// Create increment contents of target by 1 for Instrumentation
- virtual InstructionListType createInstrIncMemory(const MCSymbol *Target,
- MCContext *Ctx,
- bool IsLeaf) const {
+ virtual InstructionListType
+ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) const {
llvm_unreachable("not implemented");
return InstructionListType();
}
@@ -620,6 +620,11 @@ public:
return false;
}
+ virtual bool isAArch64Exclusive(const MCInst &Inst) const {
+ llvm_unreachable("not implemented");
+ return false;
+ }
+
virtual bool isCleanRegXOR(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
@@ -1597,18 +1602,11 @@ public:
return false;
}
- virtual void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest,
- uint32_t Imm) const {
+ virtual InstructionListType createLoadImmediate(const MCPhysReg Dest,
+ uint64_t Imm) const {
llvm_unreachable("not implemented");
}
- /// Create instruction to increment contents of target by 1
- virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
- MCContext *Ctx) const {
- llvm_unreachable("not implemented");
- return false;
- }
-
/// Create a fragment of code (sequence of instructions) that load a 32-bit
/// address from memory, zero-extends it to 64 and jump to it (indirect jump).
virtual bool
@@ -1969,7 +1967,7 @@ public:
}
virtual InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym,
- MCContext *Ctx) const {
+ MCContext *Ctx) {
llvm_unreachable("not implemented");
return InstructionListType();
}
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 5b44a76..b79bd58 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -2305,6 +2305,12 @@ void BinaryFunction::removeConditionalTailCalls() {
// This branch is no longer a conditional tail call.
BC.MIB->unsetConditionalTailCall(*CTCInstr);
+
+ // Move offset from CTCInstr to TailCallInstr.
+ if (std::optional<uint32_t> Offset = BC.MIB->getOffset(*CTCInstr)) {
+ BC.MIB->setOffset(TailCallInstr, *Offset);
+ BC.MIB->clearOffset(*CTCInstr);
+ }
}
insertBasicBlocks(std::prev(end()), std::move(NewBlocks),
diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
index fae6770..72adb31 100644
--- a/bolt/lib/Passes/Instrumentation.cpp
+++ b/bolt/lib/Passes/Instrumentation.cpp
@@ -13,6 +13,7 @@
#include "bolt/Passes/Instrumentation.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h"
+#include "bolt/Utils/CommandLineOpts.h"
#include "bolt/Utils/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/RWMutex.h"
@@ -85,6 +86,24 @@ cl::opt<bool> InstrumentCalls("instrument-calls",
namespace llvm {
namespace bolt {
+static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) {
+ // FIXME ARMv8-a architecture reference manual says that software must avoid
+ // having any explicit memory accesses between exclusive load and associated
+ // store instruction. So for now skip instrumentation for functions that have
+ // these instructions, since it might lead to runtime deadlock.
+ BinaryContext &BC = Function.getBinaryContext();
+ for (const BinaryBasicBlock &BB : Function)
+ for (const MCInst &Inst : BB)
+ if (BC.MIB->isAArch64Exclusive(Inst)) {
+ if (opts::Verbosity >= 1)
+ outs() << "BOLT-INSTRUMENTER: Function " << Function
+ << " has exclusive instructions, skip instrumentation\n";
+ return true;
+ }
+
+ return false;
+}
+
uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
auto Iter = FuncToStringIdx.find(&Function);
if (Iter != FuncToStringIdx.end())
@@ -176,7 +195,8 @@ Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) {
auto L = BC.scopeLock();
MCSymbol *Label = BC.Ctx->createNamedTempSymbol("InstrEntry");
Summary->Counters.emplace_back(Label);
- return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf);
+ return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf,
+ BC.AsmInfo->getCodePointerSize());
}
// Helper instruction sequence insertion function
@@ -287,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1"))
return;
+ if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function))
+ return;
+
SplitWorklistTy SplitWorklist;
SplitInstrsTy SplitInstrs;
@@ -504,9 +527,6 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
}
void Instrumentation::runOnFunctions(BinaryContext &BC) {
- if (!BC.isX86())
- return;
-
const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false,
/*IsText=*/false,
/*IsAllocatable=*/true);
diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp
index ec04012..c3898d2 100644
--- a/bolt/lib/Passes/MCF.cpp
+++ b/bolt/lib/Passes/MCF.cpp
@@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) {
continue;
Pred->getBranchInfo(*BB).Count = Guessed;
+ GuessedArcs.insert(std::make_pair(Pred, BB));
return true;
}
llvm_unreachable("Expected unguessed arc");
diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp
index c04efd7..7141d5d 100644
--- a/bolt/lib/Passes/TailDuplication.cpp
+++ b/bolt/lib/Passes/TailDuplication.cpp
@@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
if (isInCacheLine(BB, Tail))
return BlocksToDuplicate;
- BinaryBasicBlock *CurrBB = &BB;
+ BinaryBasicBlock *CurrBB = &Tail;
while (CurrBB) {
LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
<< CurrBB->getName() << " to duplication list\n";);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index cd66b65..3f6497e 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -16,6 +16,9 @@
#include "Utils/AArch64BaseInfo.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
@@ -28,6 +31,100 @@ using namespace bolt;
namespace {
+static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
+ Inst.setOpcode(AArch64::MRS);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createReg(RegName));
+ Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
+}
+
+static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) {
+ Inst.setOpcode(AArch64::MSR);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
+ Inst.addOperand(MCOperand::createReg(RegName));
+}
+
+static void createPushRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) {
+ Inst.clear();
+ unsigned NewOpcode = AArch64::STPXpre;
+ Inst.setOpcode(NewOpcode);
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createReg(Reg1));
+ Inst.addOperand(MCOperand::createReg(Reg2));
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createImm(-2));
+}
+
+static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) {
+ Inst.clear();
+ unsigned NewOpcode = AArch64::LDPXpost;
+ Inst.setOpcode(NewOpcode);
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createReg(Reg1));
+ Inst.addOperand(MCOperand::createReg(Reg2));
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createImm(2));
+}
+
+static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) {
+ Inst.setOpcode(AArch64::LDRXui);
+ Inst.clear();
+ if (From == AArch64::SP) {
+ Inst.setOpcode(AArch64::LDRXpost);
+ Inst.addOperand(MCOperand::createReg(From));
+ Inst.addOperand(MCOperand::createReg(To));
+ Inst.addOperand(MCOperand::createReg(From));
+ Inst.addOperand(MCOperand::createImm(16));
+ } else {
+ Inst.addOperand(MCOperand::createReg(To));
+ Inst.addOperand(MCOperand::createReg(From));
+ Inst.addOperand(MCOperand::createImm(0));
+ }
+}
+
+static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) {
+ Inst.setOpcode(AArch64::STRXui);
+ Inst.clear();
+ if (To == AArch64::SP) {
+ Inst.setOpcode(AArch64::STRXpre);
+ Inst.addOperand(MCOperand::createReg(To));
+ Inst.addOperand(MCOperand::createReg(From));
+ Inst.addOperand(MCOperand::createReg(To));
+ Inst.addOperand(MCOperand::createImm(-16));
+ } else {
+ Inst.addOperand(MCOperand::createReg(From));
+ Inst.addOperand(MCOperand::createReg(To));
+ Inst.addOperand(MCOperand::createImm(0));
+ }
+}
+
+static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) {
+ // NOTE: Requires an AArch64 target with the LSE extension (ARMv8.1-A)
+ Inst.setOpcode(AArch64::LDADDX);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createReg(AArch64::XZR));
+ Inst.addOperand(MCOperand::createReg(RegCnt));
+ Inst.addOperand(MCOperand::createReg(RegTo));
+}
+
+static void createMovz(MCInst &Inst, MCPhysReg Reg, uint64_t Imm) {
+ assert(Imm <= UINT16_MAX && "Invalid Imm size");
+ Inst.clear();
+ Inst.setOpcode(AArch64::MOVZXi);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF));
+ Inst.addOperand(MCOperand::createImm(0));
+}
+
+static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) {
+ InstructionListType Insts;
+ Insts.emplace_back();
+ createMovz(Insts.back(), RegTmp, 1);
+ Insts.emplace_back();
+ atomicAdd(Insts.back(), RegTo, RegTmp);
+ return Insts;
+}
class AArch64MCPlusBuilder : public MCPlusBuilder {
public:
AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
@@ -176,6 +273,34 @@ public:
return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst);
}
+ bool isAArch64Exclusive(const MCInst &Inst) const override {
+ return (Inst.getOpcode() == AArch64::LDXPX ||
+ Inst.getOpcode() == AArch64::LDXPW ||
+ Inst.getOpcode() == AArch64::LDXRX ||
+ Inst.getOpcode() == AArch64::LDXRW ||
+ Inst.getOpcode() == AArch64::LDXRH ||
+ Inst.getOpcode() == AArch64::LDXRB ||
+ Inst.getOpcode() == AArch64::STXPX ||
+ Inst.getOpcode() == AArch64::STXPW ||
+ Inst.getOpcode() == AArch64::STXRX ||
+ Inst.getOpcode() == AArch64::STXRW ||
+ Inst.getOpcode() == AArch64::STXRH ||
+ Inst.getOpcode() == AArch64::STXRB ||
+ Inst.getOpcode() == AArch64::LDAXPX ||
+ Inst.getOpcode() == AArch64::LDAXPW ||
+ Inst.getOpcode() == AArch64::LDAXRX ||
+ Inst.getOpcode() == AArch64::LDAXRW ||
+ Inst.getOpcode() == AArch64::LDAXRH ||
+ Inst.getOpcode() == AArch64::LDAXRB ||
+ Inst.getOpcode() == AArch64::STLXPX ||
+ Inst.getOpcode() == AArch64::STLXPW ||
+ Inst.getOpcode() == AArch64::STLXRX ||
+ Inst.getOpcode() == AArch64::STLXRW ||
+ Inst.getOpcode() == AArch64::STLXRH ||
+ Inst.getOpcode() == AArch64::STLXRB ||
+ Inst.getOpcode() == AArch64::CLREX);
+ }
+
bool isLoadFromStack(const MCInst &Inst) const {
if (!isLoad(Inst))
return false;
@@ -207,6 +332,40 @@ public:
return Inst.getOpcode() == AArch64::BLR;
}
+ MCPhysReg getSpRegister(int Size) const {
+ switch (Size) {
+ case 4:
+ return AArch64::WSP;
+ case 8:
+ return AArch64::SP;
+ default:
+ llvm_unreachable("Unexpected size");
+ }
+ }
+
+ MCPhysReg getIntArgRegister(unsigned ArgNo) const override {
+ switch (ArgNo) {
+ case 0:
+ return AArch64::X0;
+ case 1:
+ return AArch64::X1;
+ case 2:
+ return AArch64::X2;
+ case 3:
+ return AArch64::X3;
+ case 4:
+ return AArch64::X4;
+ case 5:
+ return AArch64::X5;
+ case 6:
+ return AArch64::X6;
+ case 7:
+ return AArch64::X7;
+ default:
+ return getNoRegister();
+ }
+ }
+
bool hasPCRelOperand(const MCInst &Inst) const override {
// ADRP is blacklisted and is an exception. Even though it has a
// PC-relative operand, this operand is not a complete symbol reference
@@ -313,6 +472,22 @@ public:
return true;
}
+ void getCalleeSavedRegs(BitVector &Regs) const override {
+ Regs |= getAliases(AArch64::X18);
+ Regs |= getAliases(AArch64::X19);
+ Regs |= getAliases(AArch64::X20);
+ Regs |= getAliases(AArch64::X21);
+ Regs |= getAliases(AArch64::X22);
+ Regs |= getAliases(AArch64::X23);
+ Regs |= getAliases(AArch64::X24);
+ Regs |= getAliases(AArch64::X25);
+ Regs |= getAliases(AArch64::X26);
+ Regs |= getAliases(AArch64::X27);
+ Regs |= getAliases(AArch64::X28);
+ Regs |= getAliases(AArch64::LR);
+ Regs |= getAliases(AArch64::FP);
+ }
+
const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr,
MCContext &Ctx,
uint64_t RelType) const override {
@@ -818,6 +993,22 @@ public:
int getUncondBranchEncodingSize() const override { return 28; }
+ InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm,
+ const MCSymbol *Target,
+ MCContext *Ctx) const override {
+ InstructionListType Code;
+ Code.emplace_back(MCInstBuilder(AArch64::SUBSXri)
+ .addReg(RegNo)
+ .addReg(RegNo)
+ .addImm(Imm)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::Bcc)
+ .addImm(Imm)
+ .addExpr(MCSymbolRefExpr::create(
+ Target, MCSymbolRefExpr::VK_None, *Ctx)));
+ return Code;
+ }
+
bool createCall(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) override {
Inst.setOpcode(AArch64::BL);
@@ -828,12 +1019,7 @@ public:
bool createTailCall(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) override {
- Inst.setOpcode(AArch64::B);
- Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
- Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
- *Ctx, 0)));
- setTailCall(Inst);
- return true;
+ return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
}
void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target,
@@ -882,6 +1068,18 @@ public:
bool isStore(const MCInst &Inst) const override { return false; }
+ bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+ bool IsTailCall) override {
+ Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
+ Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
+ *Ctx, 0)));
+ if (IsTailCall)
+ convertJmpToTailCall(Inst);
+ return true;
+ }
+
bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
const MCSymbol *&TBB, const MCSymbol *&FBB,
MCInst *&CondBranch,
@@ -1153,6 +1351,242 @@ public:
return true;
}
+ bool createStackPointerIncrement(
+ MCInst &Inst, int Size,
+ bool NoFlagsClobber = false /*unused for AArch64*/) const override {
+ Inst.setOpcode(AArch64::SUBXri);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createImm(Size));
+ Inst.addOperand(MCOperand::createImm(0));
+ return true;
+ }
+
+ bool createStackPointerDecrement(
+ MCInst &Inst, int Size,
+ bool NoFlagsClobber = false /*unused for AArch64*/) const override {
+ Inst.setOpcode(AArch64::ADDXri);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createReg(AArch64::SP));
+ Inst.addOperand(MCOperand::createImm(Size));
+ Inst.addOperand(MCOperand::createImm(0));
+ return true;
+ }
+
+ void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
+ int64_t Disp) const {
+ Inst.setOpcode(AArch64::BR);
+ Inst.addOperand(MCOperand::createReg(MemBaseReg));
+ }
+
+ InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
+ InstructionListType Insts(5);
+ // Code sequence for instrumented indirect call handler:
+ // msr nzcv, x1
+ // ldp x0, x1, [sp], #16
+ // ldr x16, [sp], #16
+ // ldp x0, x1, [sp], #16
+ // br x16
+ setSystemFlag(Insts[0], AArch64::X1);
+ createPopRegisters(Insts[1], AArch64::X0, AArch64::X1);
+ // Here we load address of the next function which should be called in the
+ // original binary to X16 register. Writing to X16 is permitted without
+ // needing to restore.
+ loadReg(Insts[2], AArch64::X16, AArch64::SP);
+ createPopRegisters(Insts[3], AArch64::X0, AArch64::X1);
+ createIndirectBranch(Insts[4], AArch64::X16, 0);
+ return Insts;
+ }
+
+ InstructionListType
+ createInstrumentedIndTailCallHandlerExitBB() const override {
+ return createInstrumentedIndCallHandlerExitBB();
+ }
+
+ InstructionListType createGetter(MCContext *Ctx, const char *name) const {
+ InstructionListType Insts(4);
+ MCSymbol *Locs = Ctx->getOrCreateSymbol(name);
+ InstructionListType Addr = materializeAddress(Locs, Ctx, AArch64::X0);
+ std::copy(Addr.begin(), Addr.end(), Insts.begin());
+ assert(Addr.size() == 2 && "Invalid Addr size");
+ loadReg(Insts[2], AArch64::X0, AArch64::X0);
+ createReturn(Insts[3]);
+ return Insts;
+ }
+
+ InstructionListType createNumCountersGetter(MCContext *Ctx) const override {
+ return createGetter(Ctx, "__bolt_num_counters");
+ }
+
+ InstructionListType
+ createInstrLocationsGetter(MCContext *Ctx) const override {
+ return createGetter(Ctx, "__bolt_instr_locations");
+ }
+
+ InstructionListType createInstrTablesGetter(MCContext *Ctx) const override {
+ return createGetter(Ctx, "__bolt_instr_tables");
+ }
+
+ InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override {
+ return createGetter(Ctx, "__bolt_instr_num_funcs");
+ }
+
+ void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override {
+ bool IsTailCall = isTailCall(Inst);
+ if (IsTailCall)
+ removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
+ if (Inst.getOpcode() == AArch64::BR || Inst.getOpcode() == AArch64::BLR) {
+ Inst.setOpcode(AArch64::ORRXrs);
+ Inst.insert(Inst.begin(), MCOperand::createReg(Reg));
+ Inst.insert(Inst.begin() + 1, MCOperand::createReg(AArch64::XZR));
+ Inst.insert(Inst.begin() + 3, MCOperand::createImm(0));
+ return;
+ }
+ llvm_unreachable("not implemented");
+ }
+
+ InstructionListType createLoadImmediate(const MCPhysReg Dest,
+ uint64_t Imm) const override {
+ InstructionListType Insts(4);
+ int Shift = 48;
+ for (int I = 0; I < 4; I++, Shift -= 16) {
+ Insts[I].setOpcode(AArch64::MOVKXi);
+ Insts[I].addOperand(MCOperand::createReg(Dest));
+ Insts[I].addOperand(MCOperand::createReg(Dest));
+ Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF));
+ Insts[I].addOperand(MCOperand::createImm(Shift));
+ }
+ return Insts;
+ }
+
+ void createIndirectCallInst(MCInst &Inst, bool IsTailCall,
+ MCPhysReg Reg) const {
+ Inst.clear();
+ Inst.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
+ InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
+ MCSymbol *HandlerFuncAddr,
+ int CallSiteID,
+ MCContext *Ctx) override {
+ InstructionListType Insts;
+ // Code sequence used to enter indirect call instrumentation helper:
+ // stp x0, x1, [sp, #-16]! createPushRegisters
+ // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr
+ // mov x1 CallSiteID createLoadImmediate ->
+ // movk x1, #0x0, lsl #48
+ // movk x1, #0x0, lsl #32
+ // movk x1, #0x0, lsl #16
+ // movk x1, #0x0
+ // stp x0, x1, [sp, #-16]!
+ // bl *HandlerFuncAddr createIndirectCall ->
+ // adr x0 *HandlerFuncAddr -> adrp + add
+ // blr x0
+ Insts.emplace_back();
+ createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+ Insts.emplace_back(CallInst);
+ convertIndirectCallToLoad(Insts.back(), AArch64::X0);
+ InstructionListType LoadImm =
+ createLoadImmediate(getIntArgRegister(1), CallSiteID);
+ Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
+ Insts.emplace_back();
+ createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+ Insts.resize(Insts.size() + 2);
+ InstructionListType Addr =
+ materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0);
+ assert(Addr.size() == 2 && "Invalid Addr size");
+ std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
+ Insts.emplace_back();
+ createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0);
+
+ // Carry over metadata including tail call marker if present.
+ stripAnnotations(Insts.back());
+ moveAnnotations(std::move(CallInst), Insts.back());
+
+ return Insts;
+ }
+
+ InstructionListType
+ createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
+ const MCSymbol *IndCallHandler,
+ MCContext *Ctx) override {
+ // Code sequence used to check whether InstrTrampoline was initialized
+ // and, if so, to call it; control then returns via IndCallHandler
+ // stp x0, x1, [sp, #-16]!
+ // mrs x1, nzcv
+ // adr x0, InstrTrampoline -> adrp + add
+ // ldr x0, [x0]
+ // subs x0, x0, #0x0
+ // b.eq IndCallHandler
+ // str x30, [sp, #-16]!
+ // blr x0
+ // ldr x30, [sp], #16
+ // b IndCallHandler
+ InstructionListType Insts;
+ Insts.emplace_back();
+ createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+ Insts.emplace_back();
+ getSystemFlag(Insts.back(), getIntArgRegister(1));
+ Insts.emplace_back();
+ Insts.emplace_back();
+ InstructionListType Addr =
+ materializeAddress(InstrTrampoline, Ctx, AArch64::X0);
+ std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
+ assert(Addr.size() == 2 && "Invalid Addr size");
+ Insts.emplace_back();
+ loadReg(Insts.back(), AArch64::X0, AArch64::X0);
+ InstructionListType cmpJmp =
+ createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx);
+ Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
+ Insts.emplace_back();
+ storeReg(Insts.back(), AArch64::LR, AArch64::SP);
+ Insts.emplace_back();
+ Insts.back().setOpcode(AArch64::BLR);
+ Insts.back().addOperand(MCOperand::createReg(AArch64::X0));
+ Insts.emplace_back();
+ loadReg(Insts.back(), AArch64::LR, AArch64::SP);
+ Insts.emplace_back();
+ createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
+ return Insts;
+ }
+
+ InstructionListType
+ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) const override {
+ unsigned int I = 0;
+ InstructionListType Instrs(IsLeaf ? 12 : 10);
+
+ if (IsLeaf)
+ createStackPointerIncrement(Instrs[I++], 128);
+ createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
+ getSystemFlag(Instrs[I++], AArch64::X1);
+ InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
+ assert(Addr.size() == 2 && "Invalid Addr size");
+ std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
+ I += Addr.size();
+ storeReg(Instrs[I++], AArch64::X2, AArch64::SP);
+ InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2);
+ assert(Insts.size() == 2 && "Invalid Insts size");
+ std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
+ I += Insts.size();
+ loadReg(Instrs[I++], AArch64::X2, AArch64::SP);
+ setSystemFlag(Instrs[I++], AArch64::X1);
+ createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
+ if (IsLeaf)
+ createStackPointerDecrement(Instrs[I++], 128);
+ return Instrs;
+ }
+
+ std::vector<MCInst> createSymbolTrampoline(const MCSymbol *TgtSym,
+ MCContext *Ctx) override {
+ std::vector<MCInst> Insts;
+ createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true);
+ return Insts;
+ }
+
InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx,
MCPhysReg RegName,
int64_t Addend = 0) const override {
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 5e3c01a..25b6970 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -61,6 +61,25 @@ bool isADDri(const MCInst &Inst) {
Inst.getOpcode() == X86::ADD64ri8;
}
+// Create instruction to increment contents of target by 1
+static InstructionListType createIncMemory(const MCSymbol *Target,
+ MCContext *Ctx) {
+ InstructionListType Insts;
+ Insts.emplace_back();
+ Insts.back().setOpcode(X86::LOCK_INC64m);
+ Insts.back().clear();
+ Insts.back().addOperand(MCOperand::createReg(X86::RIP)); // BaseReg
+ Insts.back().addOperand(MCOperand::createImm(1)); // ScaleAmt
+ Insts.back().addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
+
+ Insts.back().addOperand(MCOperand::createExpr(
+ MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None,
+ *Ctx))); // Displacement
+ Insts.back().addOperand(
+ MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
+ return Insts;
+}
+
#define GET_INSTRINFO_OPERAND_TYPES_ENUM
#define GET_INSTRINFO_OPERAND_TYPE
#define GET_INSTRINFO_MEM_OPERAND_SIZE
@@ -2309,28 +2328,15 @@ public:
return true;
}
- void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest,
- uint32_t Imm) const override {
- Inst.setOpcode(X86::MOV64ri32);
- Inst.clear();
- Inst.addOperand(MCOperand::createReg(Dest));
- Inst.addOperand(MCOperand::createImm(Imm));
- }
-
- bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
- MCContext *Ctx) const override {
-
- Inst.setOpcode(X86::LOCK_INC64m);
- Inst.clear();
- Inst.addOperand(MCOperand::createReg(X86::RIP)); // BaseReg
- Inst.addOperand(MCOperand::createImm(1)); // ScaleAmt
- Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
-
- Inst.addOperand(MCOperand::createExpr(
- MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None,
- *Ctx))); // Displacement
- Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
- return true;
+ InstructionListType createLoadImmediate(const MCPhysReg Dest,
+ uint64_t Imm) const override {
+ InstructionListType Insts;
+ Insts.emplace_back();
+ Insts.back().setOpcode(X86::MOV64ri32);
+ Insts.back().clear();
+ Insts.back().addOperand(MCOperand::createReg(Dest));
+ Insts.back().addOperand(MCOperand::createImm(Imm));
+ return Insts;
}
bool createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
@@ -3057,9 +3063,9 @@ public:
Inst.clear();
}
- InstructionListType createInstrIncMemory(const MCSymbol *Target,
- MCContext *Ctx,
- bool IsLeaf) const override {
+ InstructionListType
+ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) const override {
InstructionListType Instrs(IsLeaf ? 13 : 11);
unsigned int I = 0;
@@ -3079,7 +3085,10 @@ public:
createClearRegWithNoEFlagsUpdate(Instrs[I++], X86::RAX, 8);
createX86SaveOVFlagToRegister(Instrs[I++], X86::AL);
// LOCK INC
- createIncMemory(Instrs[I++], Target, Ctx);
+ InstructionListType IncMem = createIncMemory(Target, Ctx);
+ assert(IncMem.size() == 1 && "Invalid IncMem size");
+ std::copy(IncMem.begin(), IncMem.end(), Instrs.begin() + I);
+ I += IncMem.size();
// POPF
createAddRegImm(Instrs[I++], X86::AL, 127, 1);
createPopRegister(Instrs[I++], X86::RAX, 8);
@@ -3153,8 +3162,8 @@ public:
}
Insts.emplace_back();
createPushRegister(Insts.back(), TempReg, 8);
- Insts.emplace_back();
- createLoadImmediate(Insts.back(), TempReg, CallSiteID);
+ InstructionListType LoadImm = createLoadImmediate(TempReg, CallSiteID);
+ Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
Insts.emplace_back();
createPushRegister(Insts.back(), TempReg, 8);
@@ -3264,7 +3273,7 @@ public:
}
InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym,
- MCContext *Ctx) const override {
+ MCContext *Ctx) override {
InstructionListType Insts(1);
createUncondBranch(Insts[0], TgtSym, Ctx);
return Insts;
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 8472ce0..838c8cb 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -27,8 +27,14 @@ set(BOLT_RT_FLAGS
-fno-exceptions
-fno-rtti
-fno-stack-protector
- -mno-sse
- -fPIC)
+ -fPIC
+ -mgeneral-regs-only)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
+endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
+endif()
# Don't let the compiler think it can create calls to standard libs
target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS})
@@ -39,7 +45,7 @@ target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
-if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*")
+if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
add_library(bolt_rt_instr_osx STATIC
instr.cpp
${CMAKE_CURRENT_BINARY_DIR}/config.h
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
index 9e6f175..9b9965b 100644
--- a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -6,10 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#if !defined(__x86_64__)
-#error "For x86_64 only"
-#endif
-
#if defined(__linux__)
#include <cstddef>
@@ -44,44 +40,6 @@ typedef int int32_t;
#error "For Linux or MacOS only"
#endif
-// Save all registers while keeping 16B stack alignment
-#define SAVE_ALL \
- "push %%rax\n" \
- "push %%rbx\n" \
- "push %%rcx\n" \
- "push %%rdx\n" \
- "push %%rdi\n" \
- "push %%rsi\n" \
- "push %%rbp\n" \
- "push %%r8\n" \
- "push %%r9\n" \
- "push %%r10\n" \
- "push %%r11\n" \
- "push %%r12\n" \
- "push %%r13\n" \
- "push %%r14\n" \
- "push %%r15\n" \
- "sub $8, %%rsp\n"
-
-// Mirrors SAVE_ALL
-#define RESTORE_ALL \
- "add $8, %%rsp\n" \
- "pop %%r15\n" \
- "pop %%r14\n" \
- "pop %%r13\n" \
- "pop %%r12\n" \
- "pop %%r11\n" \
- "pop %%r10\n" \
- "pop %%r9\n" \
- "pop %%r8\n" \
- "pop %%rbp\n" \
- "pop %%rsi\n" \
- "pop %%rdi\n" \
- "pop %%rdx\n" \
- "pop %%rcx\n" \
- "pop %%rbx\n" \
- "pop %%rax\n"
-
#define PROT_READ 0x1 /* Page can be read. */
#define PROT_WRITE 0x2 /* Page can be written. */
#define PROT_EXEC 0x4 /* Page can be executed. */
@@ -165,141 +123,41 @@ int memcmp(const void *s1, const void *s2, size_t n) {
// Anonymous namespace covering everything but our library entry point
namespace {
-// Get the difference between runtime addrress of .text section and
-// static address in section header table. Can be extracted from arbitrary
-// pc value recorded at runtime to get the corresponding static address, which
-// in turn can be used to search for indirect call description. Needed because
-// indirect call descriptions are read-only non-relocatable data.
-uint64_t getTextBaseAddress() {
- uint64_t DynAddr;
- uint64_t StaticAddr;
- __asm__ volatile("leaq __hot_end(%%rip), %0\n\t"
- "movabsq $__hot_end, %1\n\t"
- : "=r"(DynAddr), "=r"(StaticAddr));
- return DynAddr - StaticAddr;
-}
-
-constexpr uint32_t BufSize = 10240;
-
-#define _STRINGIFY(x) #x
-#define STRINGIFY(x) _STRINGIFY(x)
-
-uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
- uint64_t ret;
-#if defined(__APPLE__)
-#define READ_SYSCALL 0x2000003
-#else
-#define READ_SYSCALL 0
-#endif
- __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(fd), "S"(buf), "d"(count)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
- uint64_t ret;
-#if defined(__APPLE__)
-#define WRITE_SYSCALL 0x2000004
-#else
-#define WRITE_SYSCALL 1
-#endif
- __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(fd), "S"(buf), "d"(count)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
- uint64_t fd, uint64_t offset) {
-#if defined(__APPLE__)
-#define MMAP_SYSCALL 0x20000c5
-#else
-#define MMAP_SYSCALL 9
-#endif
- void *ret;
- register uint64_t r8 asm("r8") = fd;
- register uint64_t r9 asm("r9") = offset;
- register uint64_t r10 asm("r10") = flags;
- __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
- "r"(r9)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-uint64_t __munmap(void *addr, uint64_t size) {
-#if defined(__APPLE__)
-#define MUNMAP_SYSCALL 0x2000049
-#else
-#define MUNMAP_SYSCALL 11
-#endif
- uint64_t ret;
- __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(addr), "S"(size)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
+struct dirent64 {
+ uint64_t d_ino; /* Inode number */
+ int64_t d_off; /* Offset to next linux_dirent */
+ unsigned short d_reclen; /* Length of this linux_dirent */
+ unsigned char d_type;
+ char d_name[]; /* Filename (null-terminated) */
+ /* length is actually (d_reclen - 2 -
+ offsetof(struct linux_dirent, d_name)) */
+};
-#define SIG_BLOCK 0
-#define SIG_UNBLOCK 1
-#define SIG_SETMASK 2
+/* Length of the entries in `struct utsname' is 65. */
+#define _UTSNAME_LENGTH 65
-static const uint64_t MaskAllSignals[] = {-1ULL};
+struct UtsNameTy {
+ char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */
+ char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined
+ network" */
+ char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */
+ char version[_UTSNAME_LENGTH]; /* Operating system version */
+ char machine[_UTSNAME_LENGTH]; /* Hardware identifier */
+ char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
+};
-uint64_t __sigprocmask(int how, const void *set, void *oldset) {
-#if defined(__APPLE__)
-#define SIGPROCMASK_SYSCALL 0x2000030
-#else
-#define SIGPROCMASK_SYSCALL 14
-#endif
- uint64_t ret;
- register long r10 asm("r10") = sizeof(uint64_t);
- __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(how), "S"(set), "d"(oldset), "r"(r10)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
+struct timespec {
+ uint64_t tv_sec; /* seconds */
+ uint64_t tv_nsec; /* nanoseconds */
+};
-uint64_t __getpid() {
- uint64_t ret;
-#if defined(__APPLE__)
-#define GETPID_SYSCALL 20
+#if defined(__aarch64__)
+#include "sys_aarch64.h"
#else
-#define GETPID_SYSCALL 39
+#include "sys_x86_64.h"
#endif
- __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- :
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-uint64_t __exit(uint64_t code) {
-#if defined(__APPLE__)
-#define EXIT_SYSCALL 0x2000001
-#else
-#define EXIT_SYSCALL 231
-#endif
- uint64_t ret;
- __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(code)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
+constexpr uint32_t BufSize = 10240;
// Helper functions for writing strings to the .fdata file. We intentionally
// avoid using libc names to make it clear it is our impl.
@@ -415,219 +273,6 @@ static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) {
return false;
}
-#if !defined(__APPLE__)
-// We use a stack-allocated buffer for string manipulation in many pieces of
-// this code, including the code that prints each line of the fdata file. This
-// buffer needs to accomodate large function names, but shouldn't be arbitrarily
-// large (dynamically allocated) for simplicity of our memory space usage.
-
-// Declare some syscall wrappers we use throughout this code to avoid linking
-// against system libc.
-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
- uint64_t ret;
- __asm__ __volatile__("movq $2, %%rax\n"
- "syscall"
- : "=a"(ret)
- : "D"(pathname), "S"(flags), "d"(mode)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-struct dirent {
- unsigned long d_ino; /* Inode number */
- unsigned long d_off; /* Offset to next linux_dirent */
- unsigned short d_reclen; /* Length of this linux_dirent */
- char d_name[]; /* Filename (null-terminated) */
- /* length is actually (d_reclen - 2 -
- offsetof(struct linux_dirent, d_name)) */
-};
-
-long __getdents(unsigned int fd, dirent *dirp, size_t count) {
- long ret;
- __asm__ __volatile__("movq $78, %%rax\n"
- "syscall"
- : "=a"(ret)
- : "D"(fd), "S"(dirp), "d"(count)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
- uint64_t ret;
- __asm__ __volatile__("movq $89, %%rax\n"
- "syscall"
- : "=a"(ret)
- : "D"(pathname), "S"(buf), "d"(bufsize)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
- uint64_t ret;
- __asm__ __volatile__("movq $8, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(fd), "S"(pos), "d"(whence)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __ftruncate(uint64_t fd, uint64_t length) {
- int ret;
- __asm__ __volatile__("movq $77, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(fd), "S"(length)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __close(uint64_t fd) {
- uint64_t ret;
- __asm__ __volatile__("movq $3, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(fd)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __madvise(void *addr, size_t length, int advice) {
- int ret;
- __asm__ __volatile__("movq $28, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(addr), "S"(length), "d"(advice)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-#define _UTSNAME_LENGTH 65
-
-struct UtsNameTy {
- char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */
- char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined
- network" */
- char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */
- char version[_UTSNAME_LENGTH]; /* Operating system version */
- char machine[_UTSNAME_LENGTH]; /* Hardware identifier */
- char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
-};
-
-int __uname(struct UtsNameTy *Buf) {
- int Ret;
- __asm__ __volatile__("movq $63, %%rax\n"
- "syscall\n"
- : "=a"(Ret)
- : "D"(Buf)
- : "cc", "rcx", "r11", "memory");
- return Ret;
-}
-
-struct timespec {
- uint64_t tv_sec; /* seconds */
- uint64_t tv_nsec; /* nanoseconds */
-};
-
-uint64_t __nanosleep(const timespec *req, timespec *rem) {
- uint64_t ret;
- __asm__ __volatile__("movq $35, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(req), "S"(rem)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int64_t __fork() {
- uint64_t ret;
- __asm__ __volatile__("movq $57, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- :
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __mprotect(void *addr, size_t len, int prot) {
- int ret;
- __asm__ __volatile__("movq $10, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(addr), "S"(len), "d"(prot)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-uint64_t __getppid() {
- uint64_t ret;
- __asm__ __volatile__("movq $110, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- :
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __setpgid(uint64_t pid, uint64_t pgid) {
- int ret;
- __asm__ __volatile__("movq $109, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(pid), "S"(pgid)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-uint64_t __getpgid(uint64_t pid) {
- uint64_t ret;
- __asm__ __volatile__("movq $121, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(pid)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __kill(uint64_t pid, int sig) {
- int ret;
- __asm__ __volatile__("movq $62, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(pid), "S"(sig)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-int __fsync(int fd) {
- int ret;
- __asm__ __volatile__("movq $74, %%rax\n"
- "syscall\n"
- : "=a"(ret)
- : "D"(fd)
- : "cc", "rcx", "r11", "memory");
- return ret;
-}
-
-// %rdi %rsi %rdx %r10 %r8
-// sys_prctl int option unsigned unsigned unsigned unsigned
-// long arg2 long arg3 long arg4 long arg5
-int __prctl(int Option, unsigned long Arg2, unsigned long Arg3,
- unsigned long Arg4, unsigned long Arg5) {
- int Ret;
- register long rdx asm("rdx") = Arg3;
- register long r8 asm("r8") = Arg5;
- register long r10 asm("r10") = Arg4;
- __asm__ __volatile__("movq $157, %%rax\n"
- "syscall\n"
- : "=a"(Ret)
- : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8)
- :);
- return Ret;
-}
-
-#endif
-
void reportError(const char *Msg, uint64_t Size) {
__write(2, Msg, Size);
__exit(1);
@@ -644,6 +289,12 @@ void assert(bool Assertion, const char *Msg) {
reportError(Buf, Ptr - Buf);
}
+#define SIG_BLOCK 0
+#define SIG_UNBLOCK 1
+#define SIG_SETMASK 2
+
+static const uint64_t MaskAllSignals[] = {-1ULL};
+
class Mutex {
volatile bool InUse{false};
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 96a43f6..cfd113e 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -40,7 +40,6 @@
//
//===----------------------------------------------------------------------===//
-#if defined (__x86_64__)
#include "common.h"
// Enables a very verbose logging to stderr useful when debugging
@@ -695,12 +694,12 @@ static char *getBinaryPath() {
assert(static_cast<int64_t>(FDdir) >= 0,
"failed to open /proc/self/map_files");
- while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) {
+ while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) {
assert(static_cast<int64_t>(Nread) != -1, "failed to get folder entries");
- struct dirent *d;
+ struct dirent64 *d;
for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) {
- d = (struct dirent *)(Buf + Bpos);
+ d = (struct dirent64 *)(Buf + Bpos);
uint64_t StartAddress, EndAddress;
if (!parseAddressRange(d->d_name, StartAddress, EndAddress))
@@ -1668,6 +1667,17 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
/// as well as the target address for the call
extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
{
+#if defined(__aarch64__)
+ // clang-format off
+ __asm__ __volatile__(SAVE_ALL
+ "ldp x0, x1, [sp, #288]\n"
+ "bl instrumentIndirectCall\n"
+ RESTORE_ALL
+ "ret\n"
+ :::);
+ // clang-format on
+#else
+ // clang-format off
__asm__ __volatile__(SAVE_ALL
"mov 0xa0(%%rsp), %%rdi\n"
"mov 0x98(%%rsp), %%rsi\n"
@@ -1675,10 +1685,23 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
RESTORE_ALL
"ret\n"
:::);
+ // clang-format on
+#endif
}
extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
{
+#if defined(__aarch64__)
+ // clang-format off
+ __asm__ __volatile__(SAVE_ALL
+ "ldp x0, x1, [sp, #288]\n"
+ "bl instrumentIndirectCall\n"
+ RESTORE_ALL
+ "ret\n"
+ :::);
+ // clang-format on
+#else
+ // clang-format off
__asm__ __volatile__(SAVE_ALL
"mov 0x98(%%rsp), %%rdi\n"
"mov 0x90(%%rsp), %%rsi\n"
@@ -1686,21 +1709,48 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
RESTORE_ALL
"ret\n"
:::);
+ // clang-format on
+#endif
}
/// This is hooking ELF's entry, it needs to save all machine state.
extern "C" __attribute((naked)) void __bolt_instr_start()
{
+#if defined(__aarch64__)
+ // clang-format off
+ __asm__ __volatile__(SAVE_ALL
+ "bl __bolt_instr_setup\n"
+ RESTORE_ALL
+ "adrp x16, __bolt_start_trampoline\n"
+ "add x16, x16, #:lo12:__bolt_start_trampoline\n"
+ "br x16\n"
+ :::);
+ // clang-format on
+#else
+ // clang-format off
__asm__ __volatile__(SAVE_ALL
"call __bolt_instr_setup\n"
RESTORE_ALL
"jmp __bolt_start_trampoline\n"
:::);
+ // clang-format on
+#endif
}
/// This is hooking into ELF's DT_FINI
extern "C" void __bolt_instr_fini() {
- __bolt_fini_trampoline();
+#if defined(__aarch64__)
+ // clang-format off
+ __asm__ __volatile__(SAVE_ALL
+ "adrp x16, __bolt_fini_trampoline\n"
+ "add x16, x16, #:lo12:__bolt_fini_trampoline\n"
+ "blr x16\n"
+ RESTORE_ALL
+ :::);
+ // clang-format on
+#else
+ __asm__ __volatile__("call __bolt_fini_trampoline\n" :::);
+#endif
if (__bolt_instr_sleep_time == 0) {
int FD = openProfile();
__bolt_instr_data_dump(FD);
@@ -1752,4 +1802,3 @@ void _bolt_instr_fini() {
}
#endif
-#endif
diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h
new file mode 100644
index 0000000..77c9cfc
--- /dev/null
+++ b/bolt/runtime/sys_aarch64.h
@@ -0,0 +1,394 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL \
+ "stp x0, x1, [sp, #-16]!\n" \
+ "stp x2, x3, [sp, #-16]!\n" \
+ "stp x4, x5, [sp, #-16]!\n" \
+ "stp x6, x7, [sp, #-16]!\n" \
+ "stp x8, x9, [sp, #-16]!\n" \
+ "stp x10, x11, [sp, #-16]!\n" \
+ "stp x12, x13, [sp, #-16]!\n" \
+ "stp x14, x15, [sp, #-16]!\n" \
+ "stp x16, x17, [sp, #-16]!\n" \
+ "stp x18, x19, [sp, #-16]!\n" \
+ "stp x20, x21, [sp, #-16]!\n" \
+ "stp x22, x23, [sp, #-16]!\n" \
+ "stp x24, x25, [sp, #-16]!\n" \
+ "stp x26, x27, [sp, #-16]!\n" \
+ "stp x28, x29, [sp, #-16]!\n" \
+ "str x30, [sp,#-16]!\n"
+// Mirrors SAVE_ALL
+#define RESTORE_ALL \
+ "ldr x30, [sp], #16\n" \
+ "ldp x28, x29, [sp], #16\n" \
+ "ldp x26, x27, [sp], #16\n" \
+ "ldp x24, x25, [sp], #16\n" \
+ "ldp x22, x23, [sp], #16\n" \
+ "ldp x20, x21, [sp], #16\n" \
+ "ldp x18, x19, [sp], #16\n" \
+ "ldp x16, x17, [sp], #16\n" \
+ "ldp x14, x15, [sp], #16\n" \
+ "ldp x12, x13, [sp], #16\n" \
+ "ldp x10, x11, [sp], #16\n" \
+ "ldp x8, x9, [sp], #16\n" \
+ "ldp x6, x7, [sp], #16\n" \
+ "ldp x4, x5, [sp], #16\n" \
+ "ldp x2, x3, [sp], #16\n" \
+ "ldp x0, x1, [sp], #16\n"
+
+// Anonymous namespace covering everything but our library entry point
+namespace {
+
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be extracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
+uint64_t getTextBaseAddress() {
+ uint64_t DynAddr;
+ uint64_t StaticAddr;
+ __asm__ volatile("b .instr%=\n\t"
+ ".StaticAddr%=:\n\t"
+ ".dword __hot_end\n\t"
+ ".instr%=:\n\t"
+ "ldr %0, .StaticAddr%=\n\t"
+ "adrp %1, __hot_end\n\t"
+ "add %1, %1, :lo12:__hot_end\n\t"
+ : "=r"(StaticAddr), "=r"(DynAddr));
+ return DynAddr - StaticAddr;
+}
+
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
+ uint64_t ret;
+ register uint64_t x0 __asm__("x0") = fd;
+ register const void *x1 __asm__("x1") = buf;
+ register uint64_t x2 __asm__("x2") = count;
+ register uint32_t w8 __asm__("w8") = 63;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+ uint64_t ret;
+ register uint64_t x0 __asm__("x0") = fd;
+ register const void *x1 __asm__("x1") = buf;
+ register uint64_t x2 __asm__("x2") = count;
+ register uint32_t w8 __asm__("w8") = 64;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+ uint64_t fd, uint64_t offset) {
+ void *ret;
+ register uint64_t x0 __asm__("x0") = addr;
+ register uint64_t x1 __asm__("x1") = size;
+ register uint64_t x2 __asm__("x2") = prot;
+ register uint64_t x3 __asm__("x3") = flags;
+ register uint64_t x4 __asm__("x4") = fd;
+ register uint64_t x5 __asm__("x5") = offset;
+ register uint32_t w8 __asm__("w8") = 222;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+ uint64_t ret;
+ register void *x0 __asm__("x0") = addr;
+ register uint64_t x1 __asm__("x1") = size;
+ register uint32_t w8 __asm__("w8") = 215;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+ uint64_t ret;
+ register uint64_t x0 __asm__("x0") = code;
+ register uint32_t w8 __asm__("w8") = 94;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0)
+ : "r"(w8)
+ : "cc", "memory", "x1");
+ return ret;
+}
+
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+ uint64_t ret;
+ register int x0 __asm__("x0") = -100;
+ register const char *x1 __asm__("x1") = pathname;
+ register uint64_t x2 __asm__("x2") = flags;
+ register uint64_t x3 __asm__("x3") = mode;
+ register uint32_t w8 __asm__("w8") = 56;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(x3), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+ long ret;
+ register unsigned int x0 __asm__("x0") = fd;
+ register dirent64 *x1 __asm__("x1") = dirp;
+ register size_t x2 __asm__("x2") = count;
+ register uint32_t w8 __asm__("w8") = 61;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+ uint64_t ret;
+ register int x0 __asm__("x0") = -100;
+ register const char *x1 __asm__("x1") = pathname;
+ register char *x2 __asm__("x2") = buf;
+ register size_t x3 __asm__("x3") = bufsize;
+ register uint32_t w8 __asm__("w8") = 78; // readlinkat
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(x3), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+ uint64_t ret;
+ register uint64_t x0 __asm__("x0") = fd;
+ register uint64_t x1 __asm__("x1") = pos;
+ register uint64_t x2 __asm__("x2") = whence;
+ register uint32_t w8 __asm__("w8") = 62;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int __ftruncate(uint64_t fd, uint64_t length) {
+ int ret;
+ register uint64_t x0 __asm__("x0") = fd;
+ register uint64_t x1 __asm__("x1") = length;
+ register uint32_t w8 __asm__("w8") = 46;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int __close(uint64_t fd) {
+ int ret;
+ register uint64_t x0 __asm__("x0") = fd;
+ register uint32_t w8 __asm__("w8") = 57;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0)
+ : "r"(w8)
+ : "cc", "memory", "x1");
+ return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+ int ret;
+ register void *x0 __asm__("x0") = addr;
+ register size_t x1 __asm__("x1") = length;
+ register int x2 __asm__("x2") = advice;
+ register uint32_t w8 __asm__("w8") = 233;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int __uname(struct UtsNameTy *buf) {
+ int ret;
+ register UtsNameTy *x0 __asm__("x0") = buf;
+ register uint32_t w8 __asm__("w8") = 160;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0)
+ : "r"(w8)
+ : "cc", "memory", "x1");
+ return ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+ uint64_t ret;
+ register const timespec *x0 __asm__("x0") = req;
+ register timespec *x1 __asm__("x1") = rem;
+ register uint32_t w8 __asm__("w8") = 101;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int64_t __fork() {
+ uint64_t ret;
+ // clone instead of fork with flags
+ // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD"
+ register uint64_t x0 __asm__("x0") = 0x1200011;
+ register uint64_t x1 __asm__("x1") = 0;
+ register uint64_t x2 __asm__("x2") = 0;
+ register uint64_t x3 __asm__("x3") = 0;
+ register uint64_t x4 __asm__("x4") = 0;
+ register uint32_t w8 __asm__("w8") = 220;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+ int ret;
+ register void *x0 __asm__("x0") = addr;
+ register size_t x1 __asm__("x1") = len;
+ register int x2 __asm__("x2") = prot;
+ register uint32_t w8 __asm__("w8") = 226;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __getpid() {
+ uint64_t ret;
+ register uint32_t w8 __asm__("w8") = 172;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret)
+ : "r"(w8)
+ : "cc", "memory", "x0", "x1");
+ return ret;
+}
+
+uint64_t __getppid() {
+ uint64_t ret;
+ register uint32_t w8 __asm__("w8") = 173;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret)
+ : "r"(w8)
+ : "cc", "memory", "x0", "x1");
+ return ret;
+}
+
+int __setpgid(uint64_t pid, uint64_t pgid) {
+ int ret;
+ register uint64_t x0 __asm__("x0") = pid;
+ register uint64_t x1 __asm__("x1") = pgid;
+ register uint32_t w8 __asm__("w8") = 154;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+ uint64_t ret;
+ register uint64_t x0 __asm__("x0") = pid;
+ register uint32_t w8 __asm__("w8") = 155;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0)
+ : "r"(w8)
+ : "cc", "memory", "x1");
+ return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+ int ret;
+ register uint64_t x0 __asm__("x0") = pid;
+ register int x1 __asm__("x1") = sig;
+ register uint32_t w8 __asm__("w8") = 129;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int __fsync(int fd) {
+ int ret;
+ register int x0 __asm__("x0") = fd;
+ register uint32_t w8 __asm__("w8") = 82;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0)
+ : "r"(w8)
+ : "cc", "memory", "x1");
+ return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+ uint64_t ret;
+ register int x0 __asm__("x0") = how;
+ register const void *x1 __asm__("x1") = set;
+ register void *x2 __asm__("x2") = oldset;
+ register long x3 asm("x3") = 8;
+ register uint32_t w8 __asm__("w8") = 135;
+ __asm__ __volatile__("svc #0\n"
+ "mov %0, x0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(x3), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+int __prctl(int option, unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5) {
+ int ret;
+ register int x0 __asm__("x0") = option;
+ register unsigned long x1 __asm__("x1") = arg2;
+ register unsigned long x2 __asm__("x2") = arg3;
+ register unsigned long x3 __asm__("x3") = arg4;
+ register unsigned long x4 __asm__("x4") = arg5;
+ register uint32_t w8 __asm__("w8") = 167;
+ __asm__ __volatile__("svc #0\n"
+ "mov %w0, w0"
+ : "=r"(ret), "+r"(x0), "+r"(x1)
+ : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+ : "cc", "memory");
+ return ret;
+}
+
+} // anonymous namespace
+
+#endif
diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h
new file mode 100644
index 0000000..ca2c693
--- /dev/null
+++ b/bolt/runtime/sys_x86_64.h
@@ -0,0 +1,360 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL \
+ "push %%rax\n" \
+ "push %%rbx\n" \
+ "push %%rcx\n" \
+ "push %%rdx\n" \
+ "push %%rdi\n" \
+ "push %%rsi\n" \
+ "push %%rbp\n" \
+ "push %%r8\n" \
+ "push %%r9\n" \
+ "push %%r10\n" \
+ "push %%r11\n" \
+ "push %%r12\n" \
+ "push %%r13\n" \
+ "push %%r14\n" \
+ "push %%r15\n" \
+ "sub $8, %%rsp\n"
+// Mirrors SAVE_ALL
+#define RESTORE_ALL \
+ "add $8, %%rsp\n" \
+ "pop %%r15\n" \
+ "pop %%r14\n" \
+ "pop %%r13\n" \
+ "pop %%r12\n" \
+ "pop %%r11\n" \
+ "pop %%r10\n" \
+ "pop %%r9\n" \
+ "pop %%r8\n" \
+ "pop %%rbp\n" \
+ "pop %%rsi\n" \
+ "pop %%rdi\n" \
+ "pop %%rdx\n" \
+ "pop %%rcx\n" \
+ "pop %%rbx\n" \
+ "pop %%rax\n"
+
+namespace {
+
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be extracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
+uint64_t getTextBaseAddress() {
+ uint64_t DynAddr;
+ uint64_t StaticAddr;
+ __asm__ volatile("leaq __hot_end(%%rip), %0\n\t"
+ "movabsq $__hot_end, %1\n\t"
+ : "=r"(DynAddr), "=r"(StaticAddr));
+ return DynAddr - StaticAddr;
+}
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
+ uint64_t ret;
+#if defined(__APPLE__)
+#define READ_SYSCALL 0x2000003
+#else
+#define READ_SYSCALL 0
+#endif
+ __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(buf), "d"(count)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+ uint64_t ret;
+#if defined(__APPLE__)
+#define WRITE_SYSCALL 0x2000004
+#else
+#define WRITE_SYSCALL 1
+#endif
+ __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(buf), "d"(count)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+ uint64_t fd, uint64_t offset) {
+#if defined(__APPLE__)
+#define MMAP_SYSCALL 0x20000c5
+#else
+#define MMAP_SYSCALL 9
+#endif
+ void *ret;
+ register uint64_t r8 asm("r8") = fd;
+ register uint64_t r9 asm("r9") = offset;
+ register uint64_t r10 asm("r10") = flags;
+ __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
+ "r"(r9)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+#if defined(__APPLE__)
+#define MUNMAP_SYSCALL 0x2000049
+#else
+#define MUNMAP_SYSCALL 11
+#endif
+ uint64_t ret;
+ __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(size)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+#if defined(__APPLE__)
+#define SIGPROCMASK_SYSCALL 0x2000030
+#else
+#define SIGPROCMASK_SYSCALL 14
+#endif
+ uint64_t ret;
+ register long r10 asm("r10") = sizeof(uint64_t);
+ __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(how), "S"(set), "d"(oldset), "r"(r10)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __getpid() {
+ uint64_t ret;
+#if defined(__APPLE__)
+#define GETPID_SYSCALL 20
+#else
+#define GETPID_SYSCALL 39
+#endif
+ __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ :
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+#if defined(__APPLE__)
+#define EXIT_SYSCALL 0x2000001
+#else
+#define EXIT_SYSCALL 231
+#endif
+ uint64_t ret;
+ __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(code)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+#if !defined(__APPLE__)
+// We use a stack-allocated buffer for string manipulation in many pieces of
+// this code, including the code that prints each line of the fdata file. This
+// buffer needs to accommodate large function names, but shouldn't be arbitrarily
+// large (dynamically allocated) for simplicity of our memory space usage.
+
+// Declare some syscall wrappers we use throughout this code to avoid linking
+// against system libc.
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $2, %%rax\n"
+ "syscall"
+ : "=a"(ret)
+ : "D"(pathname), "S"(flags), "d"(mode)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+ long ret;
+ __asm__ __volatile__("movq $217, %%rax\n"
+ "syscall"
+ : "=a"(ret)
+ : "D"(fd), "S"(dirp), "d"(count)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $89, %%rax\n"
+ "syscall"
+ : "=a"(ret)
+ : "D"(pathname), "S"(buf), "d"(bufsize)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $8, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(pos), "d"(whence)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __ftruncate(uint64_t fd, uint64_t length) {
+ int ret;
+ __asm__ __volatile__("movq $77, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(length)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __close(uint64_t fd) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $3, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+ int ret;
+ __asm__ __volatile__("movq $28, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(length), "d"(advice)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __uname(struct UtsNameTy *Buf) {
+ int Ret;
+ __asm__ __volatile__("movq $63, %%rax\n"
+ "syscall\n"
+ : "=a"(Ret)
+ : "D"(Buf)
+ : "cc", "rcx", "r11", "memory");
+ return Ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $35, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(req), "S"(rem)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int64_t __fork() {
+ uint64_t ret;
+ __asm__ __volatile__("movq $57, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ :
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+ int ret;
+ __asm__ __volatile__("movq $10, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(len), "d"(prot)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __getppid() {
+ uint64_t ret;
+ __asm__ __volatile__("movq $110, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ :
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __setpgid(uint64_t pid, uint64_t pgid) {
+ int ret;
+ __asm__ __volatile__("movq $109, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(pid), "S"(pgid)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $121, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(pid)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+ int ret;
+ __asm__ __volatile__("movq $62, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(pid), "S"(sig)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __fsync(int fd) {
+ int ret;
+ __asm__ __volatile__("movq $74, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+// %rdi %rsi %rdx %r10 %r8
+// sys_prctl int option unsigned unsigned unsigned unsigned
+// long arg2 long arg3 long arg4 long arg5
+int __prctl(int Option, unsigned long Arg2, unsigned long Arg3,
+ unsigned long Arg4, unsigned long Arg5) {
+ int Ret;
+ register long rdx asm("rdx") = Arg3;
+ register long r8 asm("r8") = Arg5;
+ register long r10 asm("r10") = Arg4;
+ __asm__ __volatile__("movq $157, %%rax\n"
+ "syscall\n"
+ : "=a"(Ret)
+ : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8)
+ :);
+ return Ret;
+}
+
+#endif
+
+} // anonymous namespace
+
+#endif
diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s
new file mode 100644
index 0000000..502dd83
--- /dev/null
+++ b/bolt/test/AArch64/exclusive-instrument.s
@@ -0,0 +1,39 @@
+// This test checks that the foo function having exclusive memory access
+// instructions won't be instrumented.
+
+// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+// RUN: %s -o %t.o
+// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy
+// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s
+
+// CHECK: Function foo has exclusive instructions, skip instrumentation
+
+.global foo
+.type foo, %function
+foo:
+ ldaxr w9, [x10]
+ cbnz w9, .Lret
+ stlxr w12, w11, [x9]
+ cbz w12, foo
+ clrex
+.Lret:
+ ret
+.size foo, .-foo
+
+.global _start
+.type _start, %function
+_start:
+ cmp x0, #0
+ b.eq .Lexit
+ bl foo
+.Lexit:
+ ret
+.size _start, .-_start
+
+.global dummy
+.type dummy, %function
+dummy:
+ ret
+.size dummy, .-dummy
diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/X86/asm-dump.c
index 5d85e2a..fdd448e 100644
--- a/bolt/test/X86/asm-dump.c
+++ b/bolt/test/X86/asm-dump.c
@@ -1,13 +1,14 @@
/**
* Test for asm-dump functionality.
*
- * REQUIRES: system-linux,bolt-runtime
+ * REQUIRES: x86_64-linux,bolt-runtime
*
* Compile the source
* RUN: %clang -fPIC %s -o %t.exe -Wl,-q
*
* Profile collection: instrument the binary
- * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o %t.instr
+ * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o \
+ * RUN: %t.instr
*
* Profile collection: run instrumented binary (and capture output)
* RUN: %t.instr > %t.result
diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test
index edc32d9..24cb635 100644
--- a/bolt/test/X86/bolt-address-translation-internal-call.test
+++ b/bolt/test/X86/bolt-address-translation-internal-call.test
@@ -4,12 +4,12 @@
# internal calls) might create new blocks without a mapping to an
# input block.
-# REQUIRES: system-linux,bolt-runtime
+# REQUIRES: x86_64-linux,bolt-runtime
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
# Delete our BB symbols so BOLT doesn't mark them as entry points
# RUN: llvm-strip --strip-unneeded %t.o
-# RUN: %clang %t.o -o %t.exe -Wl,-q
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s
# CHECK: BOLT-INFO: Wrote {{.*}} BAT maps
@@ -29,6 +29,7 @@ main:
push %rbx
sub $0x120,%rsp
mov $0x3,%rbx
+ movq rel(%rip), %rdi
.J1:
cmp $0x0,%rbx
je .J2
@@ -49,4 +50,8 @@ main:
.J4:
pop %rbp
retq
+end:
.size main, .-main
+
+ .data
+rel: .quad end
diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp
index f6ebd6b..4ed8be4 100644
--- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp
+++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp
@@ -1,7 +1,7 @@
// This test checks that .eh_frame_hdr address is in bounds of the last LOAD
// end address i.e. the section address is smaller then the LOAD end address.
-// REQUIRES: system-linux,bolt-runtime
+// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}}
// RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start
// RUN: llvm-bolt %t.exe -o %t.instr -instrument \
diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s
index c137174..c393f1d 100644
--- a/bolt/test/X86/internal-call-instrument.s
+++ b/bolt/test/X86/internal-call-instrument.s
@@ -1,15 +1,23 @@
# This reproduces a bug with instrumentation crashes on internal call
-# REQUIRES: system-linux,bolt-runtime
+# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}}
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
# Delete our BB symbols so BOLT doesn't mark them as entry points
# RUN: llvm-strip --strip-unneeded %t.o
-# RUN: %clang %t.o -o %t.exe -Wl,-q
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out
.text
+ .globl _start
+ .type _start, %function
+ .p2align 4
+_start:
+ call main
+ ret
+ .size _start, .-_start
+
.globl main
.type main, %function
.p2align 4
@@ -20,6 +28,7 @@ main:
push %rbx
sub $0x120,%rsp
mov $0x3,%rbx
+ movq rel(%rip), %rdi
.J1:
cmp $0x0,%rbx
je .J2
@@ -40,4 +49,15 @@ main:
.J4:
pop %rbp
retq
+end:
.size main, .-main
+
+ .globl _fini
+ .type _fini, %function
+ .p2align 4
+_fini:
+ hlt
+ .size _fini, .-_fini
+
+ .data
+rel: .quad end
diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s
index 677f498..ed50cc5 100644
--- a/bolt/test/X86/tail-duplication-pass.s
+++ b/bolt/test/X86/tail-duplication-pass.s
@@ -7,12 +7,21 @@
# RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \
# RUN: --print-finalized --tail-duplication=moderate \
# RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \
+# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \
+# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP
# FDATA: 1 main 2 1 main #.BB2# 0 10
# FDATA: 1 main 4 1 main #.BB2# 0 20
# CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions)
# CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1
+# Check that the successor of Ltail-dup0 is .LBB00, not itself.
+# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1)
+# CHECK-NOLOOP: Predecessors: .LBB00
+# CHECK-NOLOOP: retq
+# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1)
+
.text
.globl main
.type main, %function
diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test
new file mode 100644
index 0000000..688ab01
--- /dev/null
+++ b/bolt/test/assume-abi.test
@@ -0,0 +1,7 @@
+# Validate the usage of the `--assume-abi` option in conjunction with
+# options related to the RegAnalysis Pass.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all
diff --git a/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
new file mode 100644
index 0000000..fa1ac35
--- /dev/null
+++ b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
@@ -0,0 +1,9 @@
+ .globl main
+ .type main, %function
+main:
+ sub sp, sp, #16
+ mov w0, wzr
+ str wzr, [sp, #12]
+ add sp, sp, #16
+ ret
+.size main, .-main
diff --git a/bolt/test/runtime/AArch64/basic-instrumentation.test b/bolt/test/runtime/AArch64/basic-instrumentation.test
new file mode 100644
index 0000000..0f77b0c
--- /dev/null
+++ b/bolt/test/runtime/AArch64/basic-instrumentation.test
@@ -0,0 +1,22 @@
+# Try to instrument a very fast test. Input bin will not execute any code during
+# runtime besides returning zero in main, so it is a good trivial case.
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
+RUN: llvm-bolt %t.exe -o %t --instrument \
+RUN: --instrumentation-file=%t \
+RUN: --instrumentation-file-append-pid
+
+# Execute program to collect profile
+RUN: rm %t.*.fdata || echo Nothing to remove
+RUN: %t
+
+# Profile should be written to %t.PID.fdata, check it
+RUN: mv %t.*.fdata %t.fdata
+RUN: cat %t.fdata | FileCheck -check-prefix=CHECK %s
+
+# Check BOLT works with this profile
+RUN: llvm-bolt %t.exe --data %t.fdata -o %t.2 --reorder-blocks=cache
+
+# The instrumented profile should at least say main was called once
+CHECK: main 0 0 1{{$}}
diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
new file mode 100644
index 0000000..76ee8c0
--- /dev/null
+++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+typedef int (*func_ptr)(int, int);
+
+int add(int a, int b) { return a + b; }
+
+int main() {
+ func_ptr fun;
+ fun = add;
+ int sum = fun(10, 20); // indirect call to 'add'
+ printf("The sum is: %d\n", sum);
+ return 0;
+}
+/*
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %cflags %s -o %t.exe -Wl,-q -nopie -fpie
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
+RUN: -o %t.instrumented
+
+# Instrumented program needs to finish returning zero
+RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
+
+# Test that the instrumented data makes sense
+RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
+RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
+RUN: --print-only=main --print-finalized | FileCheck %s
+
+RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT
+
+CHECK-OUTPUT: The sum is: 30
+
+# Check that our indirect call has 1 hit recorded in the fdata file and that
+# this was processed correctly by BOLT
+CHECK: blr x8 # CallProfile: 1 (0 misses) :
+CHECK-NEXT: { add: 1 (0 misses) }
+*/
diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp
similarity index 85%
rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp
rename to bolt/test/runtime/Inputs/exceptions_split.cpp
index 2c136b9..de81adf 100644
--- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp
+++ b/bolt/test/runtime/Inputs/exceptions_split.cpp
@@ -3,31 +3,25 @@
//
// Record performance data with no args. Run test with 2 args.
-#include <stdio.h>
#include <stdint.h>
+#include <stdio.h>
-int foo()
-{
- return 0;
-}
+int foo() { return 0; }
void bar(int a) {
if (a > 2 && a % 2)
throw new int();
}
-void filter_only(){
- foo();
-}
+void filter_only() { foo(); }
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
unsigned r = 0;
uint64_t limit = (argc >= 2 ? 10 : 5000);
for (uint64_t i = 0; i < limit; ++i) {
i += foo();
- try {
+ try {
bar(argc);
try {
if (argc >= 2)
diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s
index 792d084..dfb12f0 100644
--- a/bolt/test/runtime/X86/instrumentation-tail-call.s
+++ b/bolt/test/runtime/X86/instrumentation-tail-call.s
@@ -14,6 +14,9 @@
# CHECK: leaq 0x80(%rsp), %rsp
+# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA
+# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1
+
.text
.globl main
.type main, %function
@@ -32,7 +35,8 @@ main:
movq %rbp, %rsp
pop %rbp
mov -0x10(%rsp),%rax
- jmp targetFunc
+ test %rsp, %rsp
+ jne targetFunc
.LBBerror:
addq $0x20, %rsp
diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test
similarity index 100%
rename from bolt/test/runtime/X86/exceptions-instrumentation.test
rename to bolt/test/runtime/exceptions-instrumentation.test
diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test
similarity index 95%
rename from bolt/test/runtime/X86/pie-exceptions-split.test
rename to bolt/test/runtime/pie-exceptions-split.test
index 124fef6..30f2d02 100644
--- a/bolt/test/runtime/X86/pie-exceptions-split.test
+++ b/bolt/test/runtime/pie-exceptions-split.test
@@ -16,9 +16,9 @@ RUN: --print-only=main 2>&1 | FileCheck %s
## All calls to printf() should be from exception handling code that was
## recorded as cold during the profile collection run. Check that the calls
## are placed after the split point.
-CHECK-NOT: callq printf
+CHECK-NOT: printf
CHECK: HOT-COLD SPLIT POINT
-CHECK: callq printf
+CHECK: printf
## Verify the output still executes correctly when the exception path is being
## taken.
--
2.39.5 (Apple Git-154)