From a7d826d3985dd886523df050949f1c3c151df636 Mon Sep 17 00:00:00 2001 From: rfwang07 Date: Thu, 31 Oct 2024 15:34:10 +0800 Subject: [PATCH] support aarch64 instrumentation --- bolt/CMakeLists.txt | 6 +- bolt/include/bolt/Core/MCPlusBuilder.h | 24 +- bolt/lib/Core/BinaryFunction.cpp | 6 + bolt/lib/Passes/Instrumentation.cpp | 28 +- bolt/lib/Passes/MCF.cpp | 1 + bolt/lib/Passes/TailDuplication.cpp | 2 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 446 +++++++++++++++++- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 67 +-- bolt/runtime/CMakeLists.txt | 12 +- bolt/runtime/common.h | 417 ++-------------- bolt/runtime/instr.cpp | 61 ++- bolt/runtime/sys_aarch64.h | 394 ++++++++++++++++ bolt/runtime/sys_x86_64.h | 360 ++++++++++++++ bolt/test/AArch64/exclusive-instrument.s | 39 ++ bolt/test/X86/asm-dump.c | 5 +- ...olt-address-translation-internal-call.test | 9 +- .../test/X86/instrumentation-eh_frame_hdr.cpp | 2 +- bolt/test/X86/internal-call-instrument.s | 24 +- bolt/test/X86/tail-duplication-pass.s | 9 + bolt/test/assume-abi.test | 7 + .../AArch64/Inputs/basic-instrumentation.s | 9 + .../AArch64/basic-instrumentation.test | 22 + .../AArch64/instrumentation-ind-call.c | 38 ++ .../{X86 => }/Inputs/exceptions_split.cpp | 16 +- .../runtime/X86/instrumentation-tail-call.s | 6 +- .../{X86 => }/exceptions-instrumentation.test | 0 .../{X86 => }/pie-exceptions-split.test | 4 +- 27 files changed, 1545 insertions(+), 469 deletions(-) create mode 100644 bolt/runtime/sys_aarch64.h create mode 100644 bolt/runtime/sys_x86_64.h create mode 100644 bolt/test/AArch64/exclusive-instrument.s create mode 100644 bolt/test/assume-abi.test create mode 100644 bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s create mode 100644 bolt/test/runtime/AArch64/basic-instrumentation.test create mode 100644 bolt/test/runtime/AArch64/instrumentation-ind-call.c rename bolt/test/runtime/{X86 => }/Inputs/exceptions_split.cpp (85%) rename bolt/test/runtime/{X86 => }/exceptions-instrumentation.test (100%) rename bolt/test/runtime/{X86 => }/pie-exceptions-split.test (95%) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 4ff90c1..89462f8 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -32,10 +32,10 @@ foreach (tgt ${BOLT_TARGETS_TO_BUILD}) endforeach() set(BOLT_ENABLE_RUNTIME_default OFF) -if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" +if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" + OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") AND (CMAKE_SYSTEM_NAME STREQUAL "Linux" - OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") - AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD) + OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) set(BOLT_ENABLE_RUNTIME_default ON) endif() option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default}) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index beb0675..e6945c9 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -498,9 +498,9 @@ public: } /// Create increment contents of target by 1 for Instrumentation - virtual InstructionListType createInstrIncMemory(const MCSymbol *Target, - MCContext *Ctx, - bool IsLeaf) const { + virtual InstructionListType + createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) const { llvm_unreachable("not implemented"); return InstructionListType(); } @@ -620,6 +620,11 @@ public: return false; } + virtual bool isAArch64Exclusive(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + 
virtual bool isCleanRegXOR(const MCInst &Inst) const {
    llvm_unreachable("not implemented");
    return false;
@@ -1597,18 +1602,11 @@ public:
     return false;
   }
 
-  virtual void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest,
-                                   uint32_t Imm) const {
+  virtual InstructionListType createLoadImmediate(const MCPhysReg Dest,
+                                                  uint64_t Imm) const {
     llvm_unreachable("not implemented");
   }
 
-  /// Create instruction to increment contents of target by 1
-  virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
-                               MCContext *Ctx) const {
-    llvm_unreachable("not implemented");
-    return false;
-  }
-
   /// Create a fragment of code (sequence of instructions) that load a 32-bit
   /// address from memory, zero-extends it to 64 and jump to it (indirect jump).
   virtual bool
@@ -1969,7 +1967,7 @@ public:
   }
 
   virtual InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym,
-                                                     MCContext *Ctx) const {
+                                                     MCContext *Ctx) {
     llvm_unreachable("not implemented");
     return InstructionListType();
   }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 5b44a76..b79bd58 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -2305,6 +2305,12 @@ void BinaryFunction::removeConditionalTailCalls() {
 
     // This branch is no longer a conditional tail call.
     BC.MIB->unsetConditionalTailCall(*CTCInstr);
+
+    // Move offset from CTCInstr to TailCallInstr.
+    if (std::optional<uint32_t> Offset = BC.MIB->getOffset(*CTCInstr)) {
+      BC.MIB->setOffset(TailCallInstr, *Offset);
+      BC.MIB->clearOffset(*CTCInstr);
+    }
   }
 
   insertBasicBlocks(std::prev(end()), std::move(NewBlocks),
diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
index fae6770..72adb31 100644
--- a/bolt/lib/Passes/Instrumentation.cpp
+++ b/bolt/lib/Passes/Instrumentation.cpp
@@ -13,6 +13,7 @@
 #include "bolt/Passes/Instrumentation.h"
 #include "bolt/Core/ParallelUtilities.h"
 #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h"
+#include "bolt/Utils/CommandLineOpts.h"
 #include "bolt/Utils/Utils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/RWMutex.h"
@@ -85,6 +86,24 @@ cl::opt<bool> InstrumentCalls("instrument-calls",
 namespace llvm {
 namespace bolt {
 
+static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) {
+  // FIXME: The ARMv8-A architecture reference manual says that software must
+  // avoid having any explicit memory accesses between an exclusive load and
+  // the associated store instruction. So for now, skip instrumentation for
+  // functions that have these instructions, since it might lead to runtime
+  // deadlock.
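+  // For example, in a retry loop such as
+  //   1: ldxr x0, [x2]; add x0, x0, #1; stxr w1, x0, [x2]; cbnz w1, 1b
+  // a counter update inserted between the paired instructions can clear the
+  // exclusive monitor on every iteration, so the store-exclusive never
+  // succeeds.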
+ BinaryContext &BC = Function.getBinaryContext(); + for (const BinaryBasicBlock &BB : Function) + for (const MCInst &Inst : BB) + if (BC.MIB->isAArch64Exclusive(Inst)) { + if (opts::Verbosity >= 1) + outs() << "BOLT-INSTRUMENTER: Function " << Function + << " has exclusive instructions, skip instrumentation\n"; + return true; + } + + return false; +} + uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) { auto Iter = FuncToStringIdx.find(&Function); if (Iter != FuncToStringIdx.end()) @@ -176,7 +195,8 @@ Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) { auto L = BC.scopeLock(); MCSymbol *Label = BC.Ctx->createNamedTempSymbol("InstrEntry"); Summary->Counters.emplace_back(Label); - return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf); + return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf, + BC.AsmInfo->getCodePointerSize()); } // Helper instruction sequence insertion function @@ -287,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) return; + if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) + return; + SplitWorklistTy SplitWorklist; SplitInstrsTy SplitInstrs; @@ -504,9 +527,6 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, } void Instrumentation::runOnFunctions(BinaryContext &BC) { - if (!BC.isX86()) - return; - const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false, /*IsText=*/false, /*IsAllocatable=*/true); diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp index ec04012..c3898d2 100644 --- a/bolt/lib/Passes/MCF.cpp +++ b/bolt/lib/Passes/MCF.cpp @@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { continue; Pred->getBranchInfo(*BB).Count = Guessed; + GuessedArcs.insert(std::make_pair(Pred, BB)); return true; } llvm_unreachable("Expected unguessed arc"); diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp index c04efd7..7141d5d 100644 --- a/bolt/lib/Passes/TailDuplication.cpp +++ b/bolt/lib/Passes/TailDuplication.cpp @@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, if (isInCacheLine(BB, Tail)) return BlocksToDuplicate; - BinaryBasicBlock *CurrBB = &BB; + BinaryBasicBlock *CurrBB = &Tail; while (CurrBB) { LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " << CurrBB->getName() << " to duplication list\n";); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index cd66b65..3f6497e 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -16,6 +16,9 @@ #include "Utils/AArch64BaseInfo.h" #include "bolt/Core/MCPlusBuilder.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" @@ -28,6 +31,100 @@ using namespace bolt; namespace { +static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { + Inst.setOpcode(AArch64::MRS); + Inst.clear(); + Inst.addOperand(MCOperand::createReg(RegName)); + Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); +} + +static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { + Inst.setOpcode(AArch64::MSR); + Inst.clear(); + Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); + 
Inst.addOperand(MCOperand::createReg(RegName)); +} + +static void createPushRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { + Inst.clear(); + unsigned NewOpcode = AArch64::STPXpre; + Inst.setOpcode(NewOpcode); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createReg(Reg1)); + Inst.addOperand(MCOperand::createReg(Reg2)); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createImm(-2)); +} + +static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { + Inst.clear(); + unsigned NewOpcode = AArch64::LDPXpost; + Inst.setOpcode(NewOpcode); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createReg(Reg1)); + Inst.addOperand(MCOperand::createReg(Reg2)); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createImm(2)); +} + +static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) { + Inst.setOpcode(AArch64::LDRXui); + Inst.clear(); + if (From == AArch64::SP) { + Inst.setOpcode(AArch64::LDRXpost); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createImm(16)); + } else { + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createImm(0)); + } +} + +static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) { + Inst.setOpcode(AArch64::STRXui); + Inst.clear(); + if (To == AArch64::SP) { + Inst.setOpcode(AArch64::STRXpre); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createImm(-16)); + } else { + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createImm(0)); + } +} + +static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) { + // NOTE: Supports only ARM with LSE extension + Inst.setOpcode(AArch64::LDADDX); + Inst.clear(); + Inst.addOperand(MCOperand::createReg(AArch64::XZR)); + Inst.addOperand(MCOperand::createReg(RegCnt)); + Inst.addOperand(MCOperand::createReg(RegTo)); +} + +static void createMovz(MCInst &Inst, MCPhysReg Reg, uint64_t Imm) { + assert(Imm <= UINT16_MAX && "Invalid Imm size"); + Inst.clear(); + Inst.setOpcode(AArch64::MOVZXi); + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF)); + Inst.addOperand(MCOperand::createImm(0)); +} + +static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { + InstructionListType Insts; + Insts.emplace_back(); + createMovz(Insts.back(), RegTmp, 1); + Insts.emplace_back(); + atomicAdd(Insts.back(), RegTo, RegTmp); + return Insts; +} class AArch64MCPlusBuilder : public MCPlusBuilder { public: AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info, @@ -176,6 +273,34 @@ public: return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); } + bool isAArch64Exclusive(const MCInst &Inst) const override { + return (Inst.getOpcode() == AArch64::LDXPX || + Inst.getOpcode() == AArch64::LDXPW || + Inst.getOpcode() == AArch64::LDXRX || + Inst.getOpcode() == AArch64::LDXRW || + Inst.getOpcode() == AArch64::LDXRH || + Inst.getOpcode() == AArch64::LDXRB || + Inst.getOpcode() == AArch64::STXPX || + Inst.getOpcode() == AArch64::STXPW || + Inst.getOpcode() == AArch64::STXRX || + Inst.getOpcode() == AArch64::STXRW || 
+           Inst.getOpcode() == AArch64::STXRH ||
+           Inst.getOpcode() == AArch64::STXRB ||
+           Inst.getOpcode() == AArch64::LDAXPX ||
+           Inst.getOpcode() == AArch64::LDAXPW ||
+           Inst.getOpcode() == AArch64::LDAXRX ||
+           Inst.getOpcode() == AArch64::LDAXRW ||
+           Inst.getOpcode() == AArch64::LDAXRH ||
+           Inst.getOpcode() == AArch64::LDAXRB ||
+           Inst.getOpcode() == AArch64::STLXPX ||
+           Inst.getOpcode() == AArch64::STLXPW ||
+           Inst.getOpcode() == AArch64::STLXRX ||
+           Inst.getOpcode() == AArch64::STLXRW ||
+           Inst.getOpcode() == AArch64::STLXRH ||
+           Inst.getOpcode() == AArch64::STLXRB ||
+           Inst.getOpcode() == AArch64::CLREX);
+  }
+
   bool isLoadFromStack(const MCInst &Inst) const {
     if (!isLoad(Inst))
       return false;
@@ -207,6 +332,40 @@ public:
     return Inst.getOpcode() == AArch64::BLR;
   }
 
+  MCPhysReg getSpRegister(int Size) const {
+    switch (Size) {
+    case 4:
+      return AArch64::WSP;
+    case 8:
+      return AArch64::SP;
+    default:
+      llvm_unreachable("Unexpected size");
+    }
+  }
+
+  MCPhysReg getIntArgRegister(unsigned ArgNo) const override {
+    switch (ArgNo) {
+    case 0:
+      return AArch64::X0;
+    case 1:
+      return AArch64::X1;
+    case 2:
+      return AArch64::X2;
+    case 3:
+      return AArch64::X3;
+    case 4:
+      return AArch64::X4;
+    case 5:
+      return AArch64::X5;
+    case 6:
+      return AArch64::X6;
+    case 7:
+      return AArch64::X7;
+    default:
+      return getNoRegister();
+    }
+  }
+
   bool hasPCRelOperand(const MCInst &Inst) const override {
     // ADRP is blacklisted and is an exception. Even though it has a
     // PC-relative operand, this operand is not a complete symbol reference
@@ -313,6 +472,22 @@ public:
     return true;
   }
 
+  void getCalleeSavedRegs(BitVector &Regs) const override {
+    Regs |= getAliases(AArch64::X18);
+    Regs |= getAliases(AArch64::X19);
+    Regs |= getAliases(AArch64::X20);
+    Regs |= getAliases(AArch64::X21);
+    Regs |= getAliases(AArch64::X22);
+    Regs |= getAliases(AArch64::X23);
+    Regs |= getAliases(AArch64::X24);
+    Regs |= getAliases(AArch64::X25);
+    Regs |= getAliases(AArch64::X26);
+    Regs |= getAliases(AArch64::X27);
+    Regs |= getAliases(AArch64::X28);
+    Regs |= getAliases(AArch64::LR);
+    Regs |= getAliases(AArch64::FP);
+  }
+
   const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr,
                                  MCContext &Ctx,
                                  uint64_t RelType) const override {
@@ -818,6 +993,22 @@ public:
 
   int getUncondBranchEncodingSize() const override { return 28; }
 
+  InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm,
+                                  const MCSymbol *Target,
+                                  MCContext *Ctx) const override {
+    InstructionListType Code;
+    Code.emplace_back(MCInstBuilder(AArch64::SUBSXri)
+                          .addReg(RegNo)
+                          .addReg(RegNo)
+                          .addImm(Imm)
+                          .addImm(0));
+    // Branch on the EQ condition code, not on the compared immediate.
+    Code.emplace_back(MCInstBuilder(AArch64::Bcc)
+                          .addImm(AArch64CC::EQ)
+                          .addExpr(MCSymbolRefExpr::create(
+                              Target, MCSymbolRefExpr::VK_None, *Ctx)));
+    return Code;
+  }
+
   bool createCall(MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) override {
     Inst.setOpcode(AArch64::BL);
@@ -828,12 +1019,7 @@ public:
 
   bool createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
-    Inst.setOpcode(AArch64::B);
-    Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
-        Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
-        *Ctx, 0)));
-    setTailCall(Inst);
-    return true;
+    return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
   }
 
   void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target,
@@ -882,6 +1068,18 @@ public:
 
   bool isStore(const MCInst &Inst) const override { return false; }
 
+  bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+                        bool IsTailCall) override {
+    
Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
+        Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
+        *Ctx, 0)));
+    if (IsTailCall)
+      convertJmpToTailCall(Inst);
+    return true;
+  }
+
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
                      const MCSymbol *&TBB, const MCSymbol *&FBB,
                      MCInst *&CondBranch,
@@ -1153,6 +1351,242 @@ public:
     return true;
   }
 
+  bool createStackPointerIncrement(
+      MCInst &Inst, int Size,
+      bool NoFlagsClobber = false /*unused for AArch64*/) const override {
+    Inst.setOpcode(AArch64::SUBXri);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createReg(AArch64::SP));
+    Inst.addOperand(MCOperand::createReg(AArch64::SP));
+    Inst.addOperand(MCOperand::createImm(Size));
+    Inst.addOperand(MCOperand::createImm(0));
+    return true;
+  }
+
+  bool createStackPointerDecrement(
+      MCInst &Inst, int Size,
+      bool NoFlagsClobber = false /*unused for AArch64*/) const override {
+    Inst.setOpcode(AArch64::ADDXri);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createReg(AArch64::SP));
+    Inst.addOperand(MCOperand::createReg(AArch64::SP));
+    Inst.addOperand(MCOperand::createImm(Size));
+    Inst.addOperand(MCOperand::createImm(0));
+    return true;
+  }
+
+  void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
+                            int64_t Disp) const {
+    Inst.setOpcode(AArch64::BR);
+    Inst.addOperand(MCOperand::createReg(MemBaseReg));
+  }
+
+  InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
+    InstructionListType Insts(5);
+    // Code sequence for instrumented indirect call handler:
+    //   msr nzcv, x1
+    //   ldp x0, x1, [sp], #16
+    //   ldr x16, [sp], #16
+    //   ldp x0, x1, [sp], #16
+    //   br x16
+    setSystemFlag(Insts[0], AArch64::X1);
+    createPopRegisters(Insts[1], AArch64::X0, AArch64::X1);
+    // Load into X16 the address of the function that should be called next in
+    // the original binary. X16 may be written to freely here, without any need
+    // to restore it.
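+    // Per AAPCS64, X16 (IP0) is an intra-procedure-call scratch register, so
+    // callees may not assume it survives a branch like this one.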
+ loadReg(Insts[2], AArch64::X16, AArch64::SP); + createPopRegisters(Insts[3], AArch64::X0, AArch64::X1); + createIndirectBranch(Insts[4], AArch64::X16, 0); + return Insts; + } + + InstructionListType + createInstrumentedIndTailCallHandlerExitBB() const override { + return createInstrumentedIndCallHandlerExitBB(); + } + + InstructionListType createGetter(MCContext *Ctx, const char *name) const { + InstructionListType Insts(4); + MCSymbol *Locs = Ctx->getOrCreateSymbol(name); + InstructionListType Addr = materializeAddress(Locs, Ctx, AArch64::X0); + std::copy(Addr.begin(), Addr.end(), Insts.begin()); + assert(Addr.size() == 2 && "Invalid Addr size"); + loadReg(Insts[2], AArch64::X0, AArch64::X0); + createReturn(Insts[3]); + return Insts; + } + + InstructionListType createNumCountersGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_num_counters"); + } + + InstructionListType + createInstrLocationsGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_instr_locations"); + } + + InstructionListType createInstrTablesGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_instr_tables"); + } + + InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_instr_num_funcs"); + } + + void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override { + bool IsTailCall = isTailCall(Inst); + if (IsTailCall) + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + if (Inst.getOpcode() == AArch64::BR || Inst.getOpcode() == AArch64::BLR) { + Inst.setOpcode(AArch64::ORRXrs); + Inst.insert(Inst.begin(), MCOperand::createReg(Reg)); + Inst.insert(Inst.begin() + 1, MCOperand::createReg(AArch64::XZR)); + Inst.insert(Inst.begin() + 3, MCOperand::createImm(0)); + return; + } + llvm_unreachable("not implemented"); + } + + InstructionListType createLoadImmediate(const MCPhysReg Dest, + uint64_t Imm) const override { + InstructionListType Insts(4); + int Shift = 48; + for (int I = 0; I < 4; I++, Shift -= 16) { + Insts[I].setOpcode(AArch64::MOVKXi); + Insts[I].addOperand(MCOperand::createReg(Dest)); + Insts[I].addOperand(MCOperand::createReg(Dest)); + Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF)); + Insts[I].addOperand(MCOperand::createImm(Shift)); + } + return Insts; + } + + void createIndirectCallInst(MCInst &Inst, bool IsTailCall, + MCPhysReg Reg) const { + Inst.clear(); + Inst.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR); + Inst.addOperand(MCOperand::createReg(Reg)); + } + + InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, + MCSymbol *HandlerFuncAddr, + int CallSiteID, + MCContext *Ctx) override { + InstructionListType Insts; + // Code sequence used to enter indirect call instrumentation helper: + // stp x0, x1, [sp, #-16]! createPushRegisters + // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr + // mov x1 CallSiteID createLoadImmediate -> + // movk x1, #0x0, lsl #48 + // movk x1, #0x0, lsl #32 + // movk x1, #0x0, lsl #16 + // movk x1, #0x0 + // stp x0, x1, [sp, #-16]! 
+    //   bl *HandlerFuncAddr createIndirectCall ->
+    //   adr x0 *HandlerFuncAddr -> adrp + add
+    //   blr x0
+    Insts.emplace_back();
+    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    Insts.emplace_back(CallInst);
+    convertIndirectCallToLoad(Insts.back(), AArch64::X0);
+    InstructionListType LoadImm =
+        createLoadImmediate(getIntArgRegister(1), CallSiteID);
+    Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
+    Insts.emplace_back();
+    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    Insts.resize(Insts.size() + 2);
+    InstructionListType Addr =
+        materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0);
+    assert(Addr.size() == 2 && "Invalid Addr size");
+    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
+    Insts.emplace_back();
+    createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0);
+
+    // Carry over metadata including tail call marker if present.
+    stripAnnotations(Insts.back());
+    moveAnnotations(std::move(CallInst), Insts.back());
+
+    return Insts;
+  }
+
+  InstructionListType
+  createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
+                                          const MCSymbol *IndCallHandler,
+                                          MCContext *Ctx) override {
+    // Code sequence used to check whether InstrTrampoline was initialized
+    // and to call it if so; returns via IndCallHandler:
+    //   stp x0, x1, [sp, #-16]!
+    //   mrs x1, nzcv
+    //   adr x0, InstrTrampoline -> adrp + add
+    //   ldr x0, [x0]
+    //   subs x0, x0, #0x0
+    //   b.eq IndCallHandler
+    //   str x30, [sp, #-16]!
+    //   blr x0
+    //   ldr x30, [sp], #16
+    //   b IndCallHandler
+    InstructionListType Insts;
+    Insts.emplace_back();
+    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    Insts.emplace_back();
+    getSystemFlag(Insts.back(), getIntArgRegister(1));
+    Insts.emplace_back();
+    Insts.emplace_back();
+    InstructionListType Addr =
+        materializeAddress(InstrTrampoline, Ctx, AArch64::X0);
+    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
+    assert(Addr.size() == 2 && "Invalid Addr size");
+    Insts.emplace_back();
+    loadReg(Insts.back(), AArch64::X0, AArch64::X0);
+    InstructionListType cmpJmp =
+        createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx);
+    Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
+    Insts.emplace_back();
+    storeReg(Insts.back(), AArch64::LR, AArch64::SP);
+    Insts.emplace_back();
+    Insts.back().setOpcode(AArch64::BLR);
+    Insts.back().addOperand(MCOperand::createReg(AArch64::X0));
+    Insts.emplace_back();
+    loadReg(Insts.back(), AArch64::LR, AArch64::SP);
+    Insts.emplace_back();
+    createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
+    return Insts;
+  }
+
+  InstructionListType
+  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
+                       unsigned CodePointerSize) const override {
+    unsigned int I = 0;
+    InstructionListType Instrs(IsLeaf ? 
12 : 10);
+
+    if (IsLeaf)
+      createStackPointerIncrement(Instrs[I++], 128);
+    createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
+    getSystemFlag(Instrs[I++], AArch64::X1);
+    InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
+    assert(Addr.size() == 2 && "Invalid Addr size");
+    std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
+    I += Addr.size();
+    storeReg(Instrs[I++], AArch64::X2, AArch64::SP);
+    InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2);
+    assert(Insts.size() == 2 && "Invalid Insts size");
+    std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
+    I += Insts.size();
+    loadReg(Instrs[I++], AArch64::X2, AArch64::SP);
+    setSystemFlag(Instrs[I++], AArch64::X1);
+    createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
+    if (IsLeaf)
+      createStackPointerDecrement(Instrs[I++], 128);
+    return Instrs;
+  }
+
+  std::vector<MCInst> createSymbolTrampoline(const MCSymbol *TgtSym,
+                                             MCContext *Ctx) override {
+    std::vector<MCInst> Insts;
+    createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true);
+    return Insts;
+  }
+
   InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx,
                                          MCPhysReg RegName,
                                          int64_t Addend = 0) const override {
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 5e3c01a..25b6970 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -61,6 +61,25 @@ bool isADDri(const MCInst &Inst) {
          Inst.getOpcode() == X86::ADD64ri8;
 }
 
+// Create instruction to increment contents of target by 1
+static InstructionListType createIncMemory(const MCSymbol *Target,
+                                           MCContext *Ctx) {
+  InstructionListType Insts;
+  Insts.emplace_back();
+  Insts.back().setOpcode(X86::LOCK_INC64m);
+  Insts.back().clear();
+  Insts.back().addOperand(MCOperand::createReg(X86::RIP)); // BaseReg
+  Insts.back().addOperand(MCOperand::createImm(1));        // ScaleAmt
+  Insts.back().addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
+
+  Insts.back().addOperand(MCOperand::createExpr(
+      MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None,
+                              *Ctx))); // Displacement
+  Insts.back().addOperand(
+      MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
+  return Insts;
+}
+
 #define GET_INSTRINFO_OPERAND_TYPES_ENUM
 #define GET_INSTRINFO_OPERAND_TYPE
 #define GET_INSTRINFO_MEM_OPERAND_SIZE
@@ -2309,28 +2328,15 @@ public:
     return true;
   }
 
-  void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest,
-                           uint32_t Imm) const override {
-    Inst.setOpcode(X86::MOV64ri32);
-    Inst.clear();
-    Inst.addOperand(MCOperand::createReg(Dest));
-    Inst.addOperand(MCOperand::createImm(Imm));
-  }
-
-  bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
-                       MCContext *Ctx) const override {
-
-    Inst.setOpcode(X86::LOCK_INC64m);
-    Inst.clear();
-    Inst.addOperand(MCOperand::createReg(X86::RIP)); // BaseReg
-    Inst.addOperand(MCOperand::createImm(1)); // ScaleAmt
-    Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
-
-    Inst.addOperand(MCOperand::createExpr(
-        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None,
-                                *Ctx))); // Displacement
-    Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-    return true;
+  InstructionListType createLoadImmediate(const MCPhysReg Dest,
+                                          uint64_t Imm) const override {
+    InstructionListType Insts;
+    Insts.emplace_back();
+    Insts.back().setOpcode(X86::MOV64ri32);
+    Insts.back().clear();
+    Insts.back().addOperand(MCOperand::createReg(Dest));
+    Insts.back().addOperand(MCOperand::createImm(Imm));
+    return 
Insts;
   }
 
   bool createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
@@ -3057,9 +3063,9 @@ public:
     Inst.clear();
   }
 
-  InstructionListType createInstrIncMemory(const MCSymbol *Target,
-                                           MCContext *Ctx,
-                                           bool IsLeaf) const override {
+  InstructionListType
+  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
+                       unsigned CodePointerSize) const override {
     InstructionListType Instrs(IsLeaf ? 13 : 11);
     unsigned int I = 0;
 
@@ -3079,7 +3085,10 @@ public:
     createClearRegWithNoEFlagsUpdate(Instrs[I++], X86::RAX, 8);
     createX86SaveOVFlagToRegister(Instrs[I++], X86::AL);
     // LOCK INC
-    createIncMemory(Instrs[I++], Target, Ctx);
+    InstructionListType IncMem = createIncMemory(Target, Ctx);
+    assert(IncMem.size() == 1 && "Invalid IncMem size");
+    std::copy(IncMem.begin(), IncMem.end(), Instrs.begin() + I);
+    I += IncMem.size();
     // POPF
     createAddRegImm(Instrs[I++], X86::AL, 127, 1);
     createPopRegister(Instrs[I++], X86::RAX, 8);
@@ -3153,8 +3162,8 @@ public:
     }
     Insts.emplace_back();
     createPushRegister(Insts.back(), TempReg, 8);
-    Insts.emplace_back();
-    createLoadImmediate(Insts.back(), TempReg, CallSiteID);
+    InstructionListType LoadImm = createLoadImmediate(TempReg, CallSiteID);
+    Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
     Insts.emplace_back();
     createPushRegister(Insts.back(), TempReg, 8);
@@ -3264,7 +3273,7 @@ public:
   }
 
   InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym,
-                                             MCContext *Ctx) const override {
+                                             MCContext *Ctx) override {
     InstructionListType Insts(1);
     createUncondBranch(Insts[0], TgtSym, Ctx);
     return Insts;
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 8472ce0..838c8cb 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -27,8 +27,14 @@ set(BOLT_RT_FLAGS
   -fno-exceptions
   -fno-rtti
   -fno-stack-protector
-  -mno-sse
-  -fPIC)
+  -fPIC
+  -mgeneral-regs-only)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
+endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
+endif()
 
 # Don't let the compiler think it can create calls to standard libs
 target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS})
@@ -39,7 +45,7 @@ target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
 install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
 
-if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*")
+if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
   add_library(bolt_rt_instr_osx STATIC
     instr.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/config.h
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
index 9e6f175..9b9965b 100644
--- a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -6,10 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if !defined(__x86_64__)
-#error "For x86_64 only"
-#endif
-
 #if defined(__linux__)
 
 #include <cstddef>
 
@@ -44,44 +40,6 @@ typedef int int32_t;
 #error "For Linux or MacOS only"
 #endif
 
-// Save all registers while keeping 16B stack alignment
-#define SAVE_ALL \
-  "push %%rax\n" \
-  "push %%rbx\n" \
-  "push %%rcx\n" \
-  "push %%rdx\n" \
-  "push %%rdi\n" \
-  "push %%rsi\n" \
-  "push %%rbp\n" \
-  "push %%r8\n" \
-  "push %%r9\n" \
-  "push %%r10\n" \
-  "push %%r11\n" \
-  "push %%r12\n" \
-  "push %%r13\n" \
-  "push %%r14\n" \
-  "push %%r15\n" \
-  "sub $8, %%rsp\n"
-
-// Mirrors SAVE_ALL
-#define RESTORE_ALL \
-  "add $8, %%rsp\n" \
-  "pop 
%%r15\n" \ - "pop %%r14\n" \ - "pop %%r13\n" \ - "pop %%r12\n" \ - "pop %%r11\n" \ - "pop %%r10\n" \ - "pop %%r9\n" \ - "pop %%r8\n" \ - "pop %%rbp\n" \ - "pop %%rsi\n" \ - "pop %%rdi\n" \ - "pop %%rdx\n" \ - "pop %%rcx\n" \ - "pop %%rbx\n" \ - "pop %%rax\n" - #define PROT_READ 0x1 /* Page can be read. */ #define PROT_WRITE 0x2 /* Page can be written. */ #define PROT_EXEC 0x4 /* Page can be executed. */ @@ -165,141 +123,41 @@ int memcmp(const void *s1, const void *s2, size_t n) { // Anonymous namespace covering everything but our library entry point namespace { -// Get the difference between runtime addrress of .text section and -// static address in section header table. Can be extracted from arbitrary -// pc value recorded at runtime to get the corresponding static address, which -// in turn can be used to search for indirect call description. Needed because -// indirect call descriptions are read-only non-relocatable data. -uint64_t getTextBaseAddress() { - uint64_t DynAddr; - uint64_t StaticAddr; - __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" - "movabsq $__hot_end, %1\n\t" - : "=r"(DynAddr), "=r"(StaticAddr)); - return DynAddr - StaticAddr; -} - -constexpr uint32_t BufSize = 10240; - -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) - -uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { - uint64_t ret; -#if defined(__APPLE__) -#define READ_SYSCALL 0x2000003 -#else -#define READ_SYSCALL 0 -#endif - __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(buf), "d"(count) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { - uint64_t ret; -#if defined(__APPLE__) -#define WRITE_SYSCALL 0x2000004 -#else -#define WRITE_SYSCALL 1 -#endif - __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(buf), "d"(count) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, - uint64_t fd, uint64_t offset) { -#if defined(__APPLE__) -#define MMAP_SYSCALL 0x20000c5 -#else -#define MMAP_SYSCALL 9 -#endif - void *ret; - register uint64_t r8 asm("r8") = fd; - register uint64_t r9 asm("r9") = offset; - register uint64_t r10 asm("r10") = flags; - __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), - "r"(r9) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __munmap(void *addr, uint64_t size) { -#if defined(__APPLE__) -#define MUNMAP_SYSCALL 0x2000049 -#else -#define MUNMAP_SYSCALL 11 -#endif - uint64_t ret; - __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(size) - : "cc", "rcx", "r11", "memory"); - return ret; -} +struct dirent64 { + uint64_t d_ino; /* Inode number */ + int64_t d_off; /* Offset to next linux_dirent */ + unsigned short d_reclen; /* Length of this linux_dirent */ + unsigned char d_type; + char d_name[]; /* Filename (null-terminated) */ + /* length is actually (d_reclen - 2 - + offsetof(struct linux_dirent, d_name)) */ +}; -#define SIG_BLOCK 0 -#define SIG_UNBLOCK 1 -#define SIG_SETMASK 2 +/* Length of the entries in `struct utsname' is 65. 
*/ +#define _UTSNAME_LENGTH 65 -static const uint64_t MaskAllSignals[] = {-1ULL}; +struct UtsNameTy { + char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ + char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined + network" */ + char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ + char version[_UTSNAME_LENGTH]; /* Operating system version */ + char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ + char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ +}; -uint64_t __sigprocmask(int how, const void *set, void *oldset) { -#if defined(__APPLE__) -#define SIGPROCMASK_SYSCALL 0x2000030 -#else -#define SIGPROCMASK_SYSCALL 14 -#endif - uint64_t ret; - register long r10 asm("r10") = sizeof(uint64_t); - __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(how), "S"(set), "d"(oldset), "r"(r10) - : "cc", "rcx", "r11", "memory"); - return ret; -} +struct timespec { + uint64_t tv_sec; /* seconds */ + uint64_t tv_nsec; /* nanoseconds */ +}; -uint64_t __getpid() { - uint64_t ret; -#if defined(__APPLE__) -#define GETPID_SYSCALL 20 +#if defined(__aarch64__) +#include "sys_aarch64.h" #else -#define GETPID_SYSCALL 39 +#include "sys_x86_64.h" #endif - __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} -uint64_t __exit(uint64_t code) { -#if defined(__APPLE__) -#define EXIT_SYSCALL 0x2000001 -#else -#define EXIT_SYSCALL 231 -#endif - uint64_t ret; - __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(code) - : "cc", "rcx", "r11", "memory"); - return ret; -} +constexpr uint32_t BufSize = 10240; // Helper functions for writing strings to the .fdata file. We intentionally // avoid using libc names to make it clear it is our impl. @@ -415,219 +273,6 @@ static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) { return false; } -#if !defined(__APPLE__) -// We use a stack-allocated buffer for string manipulation in many pieces of -// this code, including the code that prints each line of the fdata file. This -// buffer needs to accomodate large function names, but shouldn't be arbitrarily -// large (dynamically allocated) for simplicity of our memory space usage. - -// Declare some syscall wrappers we use throughout this code to avoid linking -// against system libc. 
-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { - uint64_t ret; - __asm__ __volatile__("movq $2, %%rax\n" - "syscall" - : "=a"(ret) - : "D"(pathname), "S"(flags), "d"(mode) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -struct dirent { - unsigned long d_ino; /* Inode number */ - unsigned long d_off; /* Offset to next linux_dirent */ - unsigned short d_reclen; /* Length of this linux_dirent */ - char d_name[]; /* Filename (null-terminated) */ - /* length is actually (d_reclen - 2 - - offsetof(struct linux_dirent, d_name)) */ -}; - -long __getdents(unsigned int fd, dirent *dirp, size_t count) { - long ret; - __asm__ __volatile__("movq $78, %%rax\n" - "syscall" - : "=a"(ret) - : "D"(fd), "S"(dirp), "d"(count) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { - uint64_t ret; - __asm__ __volatile__("movq $89, %%rax\n" - "syscall" - : "=a"(ret) - : "D"(pathname), "S"(buf), "d"(bufsize) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { - uint64_t ret; - __asm__ __volatile__("movq $8, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(pos), "d"(whence) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __ftruncate(uint64_t fd, uint64_t length) { - int ret; - __asm__ __volatile__("movq $77, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(length) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __close(uint64_t fd) { - uint64_t ret; - __asm__ __volatile__("movq $3, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __madvise(void *addr, size_t length, int advice) { - int ret; - __asm__ __volatile__("movq $28, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(length), "d"(advice) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -#define _UTSNAME_LENGTH 65 - -struct UtsNameTy { - char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ - char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined - network" */ - char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ - char version[_UTSNAME_LENGTH]; /* Operating system version */ - char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ - char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ -}; - -int __uname(struct UtsNameTy *Buf) { - int Ret; - __asm__ __volatile__("movq $63, %%rax\n" - "syscall\n" - : "=a"(Ret) - : "D"(Buf) - : "cc", "rcx", "r11", "memory"); - return Ret; -} - -struct timespec { - uint64_t tv_sec; /* seconds */ - uint64_t tv_nsec; /* nanoseconds */ -}; - -uint64_t __nanosleep(const timespec *req, timespec *rem) { - uint64_t ret; - __asm__ __volatile__("movq $35, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(req), "S"(rem) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int64_t __fork() { - uint64_t ret; - __asm__ __volatile__("movq $57, %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __mprotect(void *addr, size_t len, int prot) { - int ret; - __asm__ __volatile__("movq $10, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(len), "d"(prot) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __getppid() { - uint64_t ret; - __asm__ __volatile__("movq $110, %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __setpgid(uint64_t pid, uint64_t pgid) { - int ret; - __asm__ 
__volatile__("movq $109, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(pid), "S"(pgid) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __getpgid(uint64_t pid) { - uint64_t ret; - __asm__ __volatile__("movq $121, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(pid) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __kill(uint64_t pid, int sig) { - int ret; - __asm__ __volatile__("movq $62, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(pid), "S"(sig) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __fsync(int fd) { - int ret; - __asm__ __volatile__("movq $74, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -// %rdi %rsi %rdx %r10 %r8 -// sys_prctl int option unsigned unsigned unsigned unsigned -// long arg2 long arg3 long arg4 long arg5 -int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, - unsigned long Arg4, unsigned long Arg5) { - int Ret; - register long rdx asm("rdx") = Arg3; - register long r8 asm("r8") = Arg5; - register long r10 asm("r10") = Arg4; - __asm__ __volatile__("movq $157, %%rax\n" - "syscall\n" - : "=a"(Ret) - : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) - :); - return Ret; -} - -#endif - void reportError(const char *Msg, uint64_t Size) { __write(2, Msg, Size); __exit(1); @@ -644,6 +289,12 @@ void assert(bool Assertion, const char *Msg) { reportError(Buf, Ptr - Buf); } +#define SIG_BLOCK 0 +#define SIG_UNBLOCK 1 +#define SIG_SETMASK 2 + +static const uint64_t MaskAllSignals[] = {-1ULL}; + class Mutex { volatile bool InUse{false}; diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp index 96a43f6..cfd113e 100644 --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -40,7 +40,6 @@ // //===----------------------------------------------------------------------===// -#if defined (__x86_64__) #include "common.h" // Enables a very verbose logging to stderr useful when debugging @@ -695,12 +694,12 @@ static char *getBinaryPath() { assert(static_cast(FDdir) >= 0, "failed to open /proc/self/map_files"); - while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) { + while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) { assert(static_cast(Nread) != -1, "failed to get folder entries"); - struct dirent *d; + struct dirent64 *d; for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) { - d = (struct dirent *)(Buf + Bpos); + d = (struct dirent64 *)(Buf + Bpos); uint64_t StartAddress, EndAddress; if (!parseAddressRange(d->d_name, StartAddress, EndAddress)) @@ -1668,6 +1667,17 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { /// as well as the target address for the call extern "C" __attribute((naked)) void __bolt_instr_indirect_call() { +#if defined(__aarch64__) + // clang-format off + __asm__ __volatile__(SAVE_ALL + "ldp x0, x1, [sp, #288]\n" + "bl instrumentIndirectCall\n" + RESTORE_ALL + "ret\n" + :::); + // clang-format on +#else + // clang-format off __asm__ __volatile__(SAVE_ALL "mov 0xa0(%%rsp), %%rdi\n" "mov 0x98(%%rsp), %%rsi\n" @@ -1675,10 +1685,23 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() RESTORE_ALL "ret\n" :::); + // clang-format on +#endif } extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() { +#if defined(__aarch64__) + // clang-format off + __asm__ __volatile__(SAVE_ALL + "ldp x0, x1, [sp, #288]\n" + "bl instrumentIndirectCall\n" + RESTORE_ALL + "ret\n" + :::); + // clang-format on +#else + // clang-format off __asm__ __volatile__(SAVE_ALL "mov 0x98(%%rsp), 
%%rdi\n"
                        "mov 0x90(%%rsp), %%rsi\n"
@@ -1686,21 +1709,48 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
                        RESTORE_ALL
                        "ret\n"
                        :::);
+  // clang-format on
+#endif
 }
 
 /// This is hooking ELF's entry, it needs to save all machine state.
 extern "C" __attribute((naked)) void __bolt_instr_start()
 {
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "bl __bolt_instr_setup\n"
+                       RESTORE_ALL
+                       "adrp x16, __bolt_start_trampoline\n"
+                       "add x16, x16, #:lo12:__bolt_start_trampoline\n"
+                       "br x16\n"
+                       :::);
+  // clang-format on
+#else
+  // clang-format off
   __asm__ __volatile__(SAVE_ALL
                        "call __bolt_instr_setup\n"
                        RESTORE_ALL
                        "jmp __bolt_start_trampoline\n"
                        :::);
+  // clang-format on
+#endif
 }
 
 /// This is hooking into ELF's DT_FINI
 extern "C" void __bolt_instr_fini() {
-  __bolt_fini_trampoline();
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "adrp x16, __bolt_fini_trampoline\n"
+                       "add x16, x16, #:lo12:__bolt_fini_trampoline\n"
+                       "blr x16\n"
+                       RESTORE_ALL
+                       :::);
+  // clang-format on
+#else
+  __asm__ __volatile__("call __bolt_fini_trampoline\n" :::);
+#endif
   if (__bolt_instr_sleep_time == 0) {
     int FD = openProfile();
     __bolt_instr_data_dump(FD);
@@ -1752,4 +1802,3 @@ void _bolt_instr_fini() {
 }
 
 #endif
-#endif
diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h
new file mode 100644
index 0000000..77c9cfc
--- /dev/null
+++ b/bolt/runtime/sys_aarch64.h
@@ -0,0 +1,394 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL \
+  "stp x0, x1, [sp, #-16]!\n" \
+  "stp x2, x3, [sp, #-16]!\n" \
+  "stp x4, x5, [sp, #-16]!\n" \
+  "stp x6, x7, [sp, #-16]!\n" \
+  "stp x8, x9, [sp, #-16]!\n" \
+  "stp x10, x11, [sp, #-16]!\n" \
+  "stp x12, x13, [sp, #-16]!\n" \
+  "stp x14, x15, [sp, #-16]!\n" \
+  "stp x16, x17, [sp, #-16]!\n" \
+  "stp x18, x19, [sp, #-16]!\n" \
+  "stp x20, x21, [sp, #-16]!\n" \
+  "stp x22, x23, [sp, #-16]!\n" \
+  "stp x24, x25, [sp, #-16]!\n" \
+  "stp x26, x27, [sp, #-16]!\n" \
+  "stp x28, x29, [sp, #-16]!\n" \
+  "str x30, [sp, #-16]!\n"
+// Mirrors SAVE_ALL
+#define RESTORE_ALL \
+  "ldr x30, [sp], #16\n" \
+  "ldp x28, x29, [sp], #16\n" \
+  "ldp x26, x27, [sp], #16\n" \
+  "ldp x24, x25, [sp], #16\n" \
+  "ldp x22, x23, [sp], #16\n" \
+  "ldp x20, x21, [sp], #16\n" \
+  "ldp x18, x19, [sp], #16\n" \
+  "ldp x16, x17, [sp], #16\n" \
+  "ldp x14, x15, [sp], #16\n" \
+  "ldp x12, x13, [sp], #16\n" \
+  "ldp x10, x11, [sp], #16\n" \
+  "ldp x8, x9, [sp], #16\n" \
+  "ldp x6, x7, [sp], #16\n" \
+  "ldp x4, x5, [sp], #16\n" \
+  "ldp x2, x3, [sp], #16\n" \
+  "ldp x0, x1, [sp], #16\n"
+
+// Anonymous namespace covering everything but our library entry point
+namespace {
+
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be extracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
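+// Descriptive note: the asm below reads the link-time address from a local
+// literal (.dword __hot_end), while adrp/add computes the PC-relative runtime
+// address of the same symbol; their difference is the load bias.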
+uint64_t getTextBaseAddress() { + uint64_t DynAddr; + uint64_t StaticAddr; + __asm__ volatile("b .instr%=\n\t" + ".StaticAddr%=:\n\t" + ".dword __hot_end\n\t" + ".instr%=:\n\t" + "ldr %0, .StaticAddr%=\n\t" + "adrp %1, __hot_end\n\t" + "add %1, %1, :lo12:__hot_end\n\t" + : "=r"(StaticAddr), "=r"(DynAddr)); + return DynAddr - StaticAddr; +} + +uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = fd; + register const void *x1 __asm__("x1") = buf; + register uint64_t x2 __asm__("x2") = count; + register uint32_t w8 __asm__("w8") = 63; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = fd; + register const void *x1 __asm__("x1") = buf; + register uint64_t x2 __asm__("x2") = count; + register uint32_t w8 __asm__("w8") = 64; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, + uint64_t fd, uint64_t offset) { + void *ret; + register uint64_t x0 __asm__("x0") = addr; + register uint64_t x1 __asm__("x1") = size; + register uint64_t x2 __asm__("x2") = prot; + register uint64_t x3 __asm__("x3") = flags; + register uint64_t x4 __asm__("x4") = fd; + register uint64_t x5 __asm__("x5") = offset; + register uint32_t w8 __asm__("w8") = 222; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __munmap(void *addr, uint64_t size) { + uint64_t ret; + register void *x0 __asm__("x0") = addr; + register uint64_t x1 __asm__("x1") = size; + register uint32_t w8 __asm__("w8") = 215; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __exit(uint64_t code) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = code; + register uint32_t w8 __asm__("w8") = 94; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0) + : "r"(w8) + : "cc", "memory", "x1"); + return ret; +} + +uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { + uint64_t ret; + register int x0 __asm__("x0") = -100; + register const char *x1 __asm__("x1") = pathname; + register uint64_t x2 __asm__("x2") = flags; + register uint64_t x3 __asm__("x3") = mode; + register uint32_t w8 __asm__("w8") = 56; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(x3), "r"(w8) + : "cc", "memory"); + return ret; +} + +long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) { + long ret; + register unsigned int x0 __asm__("x0") = fd; + register dirent64 *x1 __asm__("x1") = dirp; + register size_t x2 __asm__("x2") = count; + register uint32_t w8 __asm__("w8") = 61; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { + uint64_t ret; + register int x0 __asm__("x0") = -100; + register const char *x1 __asm__("x1") = pathname; + register char *x2 __asm__("x2") = buf; + register size_t x3 __asm__("x3") = bufsize; + register uint32_t w8 __asm__("w8") = 78; // 
readlinkat + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(x3), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { + uint64_t ret; + register uint64_t x0 __asm__("x0") = fd; + register uint64_t x1 __asm__("x1") = pos; + register uint64_t x2 __asm__("x2") = whence; + register uint32_t w8 __asm__("w8") = 62; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +int __ftruncate(uint64_t fd, uint64_t length) { + int ret; + register uint64_t x0 __asm__("x0") = fd; + register uint64_t x1 __asm__("x1") = length; + register uint32_t w8 __asm__("w8") = 46; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(w8) + : "cc", "memory"); + return ret; +} + +int __close(uint64_t fd) { + int ret; + register uint64_t x0 __asm__("x0") = fd; + register uint32_t w8 __asm__("w8") = 57; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0) + : "r"(w8) + : "cc", "memory", "x1"); + return ret; +} + +int __madvise(void *addr, size_t length, int advice) { + int ret; + register void *x0 __asm__("x0") = addr; + register size_t x1 __asm__("x1") = length; + register int x2 __asm__("x2") = advice; + register uint32_t w8 __asm__("w8") = 233; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +int __uname(struct UtsNameTy *buf) { + int ret; + register UtsNameTy *x0 __asm__("x0") = buf; + register uint32_t w8 __asm__("w8") = 160; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0) + : "r"(w8) + : "cc", "memory", "x1"); + return ret; +} + +uint64_t __nanosleep(const timespec *req, timespec *rem) { + uint64_t ret; + register const timespec *x0 __asm__("x0") = req; + register timespec *x1 __asm__("x1") = rem; + register uint32_t w8 __asm__("w8") = 101; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(w8) + : "cc", "memory"); + return ret; +} + +int64_t __fork() { + uint64_t ret; + // clone instead of fork with flags + // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD" + register uint64_t x0 __asm__("x0") = 0x1200011; + register uint64_t x1 __asm__("x1") = 0; + register uint64_t x2 __asm__("x2") = 0; + register uint64_t x3 __asm__("x3") = 0; + register uint64_t x4 __asm__("x4") = 0; + register uint32_t w8 __asm__("w8") = 220; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(x3), "r"(x4), "r"(w8) + : "cc", "memory"); + return ret; +} + +int __mprotect(void *addr, size_t len, int prot) { + int ret; + register void *x0 __asm__("x0") = addr; + register size_t x1 __asm__("x1") = len; + register int x2 __asm__("x2") = prot; + register uint32_t w8 __asm__("w8") = 226; + __asm__ __volatile__("svc #0\n" + "mov %w0, w0" + : "=r"(ret), "+r"(x0), "+r"(x1) + : "r"(x2), "r"(w8) + : "cc", "memory"); + return ret; +} + +uint64_t __getpid() { + uint64_t ret; + register uint32_t w8 __asm__("w8") = 172; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret) + : "r"(w8) + : "cc", "memory", "x0", "x1"); + return ret; +} + +uint64_t __getppid() { + uint64_t ret; + register uint32_t w8 __asm__("w8") = 173; + __asm__ __volatile__("svc #0\n" + "mov %0, x0" + : "=r"(ret) + : "r"(w8) + : "cc", "memory", "x0", "x1"); + return ret; +} + +int __setpgid(uint64_t pid, uint64_t pgid) { 
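+  // setpgid is syscall 154 in the AArch64 (asm-generic) Linux syscall table.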
+  int ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register uint64_t x1 __asm__("x1") = pgid;
+  register uint32_t w8 __asm__("w8") = 154;
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register uint32_t w8 __asm__("w8") = 155;
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register int x1 __asm__("x1") = sig;
+  register uint32_t w8 __asm__("w8") = 129;
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __fsync(int fd) {
+  int ret;
+  register int x0 __asm__("x0") = fd;
+  register uint32_t w8 __asm__("w8") = 82;
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = how;
+  register const void *x1 __asm__("x1") = set;
+  register void *x2 __asm__("x2") = oldset;
+  register long x3 asm("x3") = 8;
+  register uint32_t w8 __asm__("w8") = 135;
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __prctl(int option, unsigned long arg2, unsigned long arg3,
+            unsigned long arg4, unsigned long arg5) {
+  int ret;
+  register int x0 __asm__("x0") = option;
+  register unsigned long x1 __asm__("x1") = arg2;
+  register unsigned long x2 __asm__("x2") = arg3;
+  register unsigned long x3 __asm__("x3") = arg4;
+  register unsigned long x4 __asm__("x4") = arg5;
+  register uint32_t w8 __asm__("w8") = 167;
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+} // anonymous namespace
+
+#endif
diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h
new file mode 100644
index 0000000..ca2c693
--- /dev/null
+++ b/bolt/runtime/sys_x86_64.h
@@ -0,0 +1,360 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL \
+  "push %%rax\n" \
+  "push %%rbx\n" \
+  "push %%rcx\n" \
+  "push %%rdx\n" \
+  "push %%rdi\n" \
+  "push %%rsi\n" \
+  "push %%rbp\n" \
+  "push %%r8\n" \
+  "push %%r9\n" \
+  "push %%r10\n" \
+  "push %%r11\n" \
+  "push %%r12\n" \
+  "push %%r13\n" \
+  "push %%r14\n" \
+  "push %%r15\n" \
+  "sub $8, %%rsp\n"
+// Mirrors SAVE_ALL
+#define RESTORE_ALL \
+  "add $8, %%rsp\n" \
+  "pop %%r15\n" \
+  "pop %%r14\n" \
+  "pop %%r13\n" \
+  "pop %%r12\n" \
+  "pop %%r11\n" \
+  "pop %%r10\n" \
+  "pop %%r9\n" \
+  "pop %%r8\n" \
+  "pop %%rbp\n" \
+  "pop %%rsi\n" \
+  "pop %%rdi\n" \
+  "pop %%rdx\n" \
+  "pop %%rcx\n" \
+  "pop %%rbx\n" \
+  "pop %%rax\n"
+
+namespace {
+
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be extracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
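+// Descriptive note: leaq with RIP-relative addressing yields the runtime
+// address, while movabsq embeds the absolute link-time address of __hot_end;
+// subtracting the two gives the load bias.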
+uint64_t getTextBaseAddress() {
+ uint64_t DynAddr;
+ uint64_t StaticAddr;
+ __asm__ volatile("leaq __hot_end(%%rip), %0\n\t"
+ "movabsq $__hot_end, %1\n\t"
+ : "=r"(DynAddr), "=r"(StaticAddr));
+ return DynAddr - StaticAddr;
+}
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
+ uint64_t ret;
+#if defined(__APPLE__)
+#define READ_SYSCALL 0x2000003
+#else
+#define READ_SYSCALL 0
+#endif
+ __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(buf), "d"(count)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+ uint64_t ret;
+#if defined(__APPLE__)
+#define WRITE_SYSCALL 0x2000004
+#else
+#define WRITE_SYSCALL 1
+#endif
+ __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(buf), "d"(count)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+ uint64_t fd, uint64_t offset) {
+#if defined(__APPLE__)
+#define MMAP_SYSCALL 0x20000c5
+#else
+#define MMAP_SYSCALL 9
+#endif
+ void *ret;
+ register uint64_t r8 asm("r8") = fd;
+ register uint64_t r9 asm("r9") = offset;
+ register uint64_t r10 asm("r10") = flags;
+ __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
+ "r"(r9)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+#if defined(__APPLE__)
+#define MUNMAP_SYSCALL 0x2000049
+#else
+#define MUNMAP_SYSCALL 11
+#endif
+ uint64_t ret;
+ __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(size)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+#if defined(__APPLE__)
+#define SIGPROCMASK_SYSCALL 0x2000030
+#else
+#define SIGPROCMASK_SYSCALL 14
+#endif
+ uint64_t ret;
+ register long r10 asm("r10") = sizeof(uint64_t);
+ __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(how), "S"(set), "d"(oldset), "r"(r10)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __getpid() {
+ uint64_t ret;
+#if defined(__APPLE__)
+#define GETPID_SYSCALL 20
+#else
+#define GETPID_SYSCALL 39
+#endif
+ __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ :
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+#if defined(__APPLE__)
+#define EXIT_SYSCALL 0x2000001
+#else
+#define EXIT_SYSCALL 231
+#endif
+ uint64_t ret;
+ __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(code)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+#if !defined(__APPLE__)
+// We use a stack-allocated buffer for string manipulation in many pieces of
+// this code, including the code that prints each line of the fdata file. This
+// buffer needs to accommodate large function names, but shouldn't be
+// arbitrarily large (dynamically allocated), to keep our memory usage simple.
+
+// Declare some syscall wrappers we use throughout this code to avoid linking
+// against system libc.
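+// As a reminder of the Linux x86-64 syscall convention relied on below: the
+// syscall number goes in %rax, arguments in %rdi, %rsi, %rdx, %r10, %r8 and
+// %r9, and the kernel clobbers %rcx and %r11 (hence the clobber lists).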
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $2, %%rax\n"
+ "syscall"
+ : "=a"(ret)
+ : "D"(pathname), "S"(flags), "d"(mode)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+ long ret;
+ __asm__ __volatile__("movq $217, %%rax\n"
+ "syscall"
+ : "=a"(ret)
+ : "D"(fd), "S"(dirp), "d"(count)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $89, %%rax\n"
+ "syscall"
+ : "=a"(ret)
+ : "D"(pathname), "S"(buf), "d"(bufsize)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $8, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(pos), "d"(whence)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __ftruncate(uint64_t fd, uint64_t length) {
+ int ret;
+ __asm__ __volatile__("movq $77, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd), "S"(length)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __close(uint64_t fd) {
+ int ret;
+ __asm__ __volatile__("movq $3, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+ int ret;
+ __asm__ __volatile__("movq $28, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(length), "d"(advice)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __uname(struct UtsNameTy *Buf) {
+ int Ret;
+ __asm__ __volatile__("movq $63, %%rax\n"
+ "syscall\n"
+ : "=a"(Ret)
+ : "D"(Buf)
+ : "cc", "rcx", "r11", "memory");
+ return Ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $35, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(req), "S"(rem)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int64_t __fork() {
+ uint64_t ret;
+ __asm__ __volatile__("movq $57, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ :
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+ int ret;
+ __asm__ __volatile__("movq $10, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(addr), "S"(len), "d"(prot)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __getppid() {
+ uint64_t ret;
+ __asm__ __volatile__("movq $110, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ :
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __setpgid(uint64_t pid, uint64_t pgid) {
+ int ret;
+ __asm__ __volatile__("movq $109, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(pid), "S"(pgid)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+ uint64_t ret;
+ __asm__ __volatile__("movq $121, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(pid)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+ int ret;
+ __asm__ __volatile__("movq $62, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(pid), "S"(sig)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+int __fsync(int fd) {
+ int ret;
+ __asm__ __volatile__("movq $74, %%rax\n"
+ "syscall\n"
+ : "=a"(ret)
+ : "D"(fd)
+ : "cc", "rcx", "r11", "memory");
+ return ret;
+}
+
+//              %rdi        %rsi       %rdx       %r10       %r8
+// sys_prctl    int option  unsigned   unsigned   unsigned   unsigned
+//                          long arg2  long arg3  long arg4  long arg5
+int __prctl(int Option, unsigned long Arg2, unsigned long Arg3,
+ unsigned long
Arg4, unsigned long Arg5) {
+ int Ret;
+ register long rdx asm("rdx") = Arg3;
+ register long r8 asm("r8") = Arg5;
+ register long r10 asm("r10") = Arg4;
+ __asm__ __volatile__("movq $157, %%rax\n"
+ "syscall\n"
+ : "=a"(Ret)
+ : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8)
+ : "cc", "rcx", "r11", "memory");
+ return Ret;
+}
+
+#endif
+
+} // anonymous namespace
+
+#endif
diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s
new file mode 100644
index 0000000..502dd83
--- /dev/null
+++ b/bolt/test/AArch64/exclusive-instrument.s
@@ -0,0 +1,39 @@
+// This test checks that the function foo, which contains exclusive memory
+// access instructions, is not instrumented.
+
+// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+// RUN: %s -o %t.o
+// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy
+// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s
+
+// CHECK: Function foo has exclusive instructions, skip instrumentation
+
+.global foo
+.type foo, %function
+foo:
+ ldaxr w9, [x10]
+ cbnz w9, .Lret
+ stlxr w12, w11, [x9]
+ cbz w12, foo
+ clrex
+.Lret:
+ ret
+.size foo, .-foo
+
+.global _start
+.type _start, %function
+_start:
+ cmp x0, #0
+ b.eq .Lexit
+ bl foo
+.Lexit:
+ ret
+.size _start, .-_start
+
+.global dummy
+.type dummy, %function
+dummy:
+ ret
+.size dummy, .-dummy
diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/X86/asm-dump.c
index 5d85e2a..fdd448e 100644
--- a/bolt/test/X86/asm-dump.c
+++ b/bolt/test/X86/asm-dump.c
@@ -1,13 +1,14 @@
 /**
  * Test for asm-dump functionality.
  *
- * REQUIRES: system-linux,bolt-runtime
+ * REQUIRES: x86_64-linux,bolt-runtime
  *
  * Compile the source
  * RUN: %clang -fPIC %s -o %t.exe -Wl,-q
  *
  * Profile collection: instrument the binary
- * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o %t.instr
+ * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o \
+ * RUN: %t.instr
  *
  * Profile collection: run instrumented binary (and capture output)
  * RUN: %t.instr > %t.result
diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test
index edc32d9..24cb635 100644
--- a/bolt/test/X86/bolt-address-translation-internal-call.test
+++ b/bolt/test/X86/bolt-address-translation-internal-call.test
@@ -4,12 +4,12 @@
 # internal calls) might create new blocks without a mapping to an
 # input block.
 
-# REQUIRES: system-linux,bolt-runtime
+# REQUIRES: x86_64-linux,bolt-runtime
 
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
 # Delete our BB symbols so BOLT doesn't mark them as entry points
 # RUN: llvm-strip --strip-unneeded %t.o
-# RUN: %clang %t.o -o %t.exe -Wl,-q
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s
 
 # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps
@@ -29,6 +29,7 @@ main:
 push %rbx
 sub $0x120,%rsp
 mov $0x3,%rbx
+ movq rel(%rip), %rdi
 .J1:
 cmp $0x0,%rbx
 je .J2
@@ -49,4 +50,8 @@ main:
 .J4:
 pop %rbp
 retq
+end:
 .size main, .-main
+
+ .data
+rel: .quad end
diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp
index f6ebd6b..4ed8be4 100644
--- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp
+++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp
@@ -1,7 +1,7 @@
 // This test checks that .eh_frame_hdr address is in bounds of the last LOAD
 // end address i.e.
the section address is smaller than the LOAD end address.
 
-// REQUIRES: system-linux,bolt-runtime
+// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}}
 
 // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start
 // RUN: llvm-bolt %t.exe -o %t.instr -instrument \
diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s
index c137174..c393f1d 100644
--- a/bolt/test/X86/internal-call-instrument.s
+++ b/bolt/test/X86/internal-call-instrument.s
@@ -1,15 +1,23 @@
 # This reproduces a bug where instrumentation crashes on an internal call
 
-# REQUIRES: system-linux,bolt-runtime
+# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}}
 
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
 # Delete our BB symbols so BOLT doesn't mark them as entry points
 # RUN: llvm-strip --strip-unneeded %t.o
-# RUN: %clang %t.o -o %t.exe -Wl,-q
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out
 
 .text
+ .globl _start
+ .type _start, %function
+ .p2align 4
+_start:
+ call main
+ ret
+ .size _start, .-_start
+
 .globl main
 .type main, %function
 .p2align 4
@@ -20,6 +28,7 @@ main:
 push %rbx
 sub $0x120,%rsp
 mov $0x3,%rbx
+ movq rel(%rip), %rdi
 .J1:
 cmp $0x0,%rbx
 je .J2
@@ -40,4 +49,15 @@ main:
 .J4:
 pop %rbp
 retq
+end:
 .size main, .-main
+
+ .globl _fini
+ .type _fini, %function
+ .p2align 4
+_fini:
+ hlt
+ .size _fini, .-_fini
+
+ .data
+rel: .quad end
diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s
index 677f498..ed50cc5 100644
--- a/bolt/test/X86/tail-duplication-pass.s
+++ b/bolt/test/X86/tail-duplication-pass.s
@@ -7,12 +7,21 @@
 # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \
 # RUN: --print-finalized --tail-duplication=moderate \
 # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \
+# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \
+# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP
 
 # FDATA: 1 main 2 1 main #.BB2# 0 10
 # FDATA: 1 main 4 1 main #.BB2# 0 20
 # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions)
 # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1
+# Check that .Ltail-dup0 does not branch back to itself: its only predecessor
+# is .LBB00 and its single instruction is a return.
+# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1)
+# CHECK-NOLOOP: Predecessors: .LBB00
+# CHECK-NOLOOP: retq
+# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1)
+
 .text
 .globl main
 .type main, %function
diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test
new file mode 100644
index 0000000..688ab01
--- /dev/null
+++ b/bolt/test/assume-abi.test
@@ -0,0 +1,7 @@
+# Validate the usage of the `--assume-abi` option in conjunction with
+# options related to the RegAnalysis Pass.
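+#
+# (--assume-abi lets BOLT assume the standard calling convention is never
+# violated; RegAnalysis-based passes such as indirect call promotion rely on
+# that assumption when reasoning about registers at call sites.)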
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all
diff --git a/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
new file mode 100644
index 0000000..fa1ac35
--- /dev/null
+++ b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
@@ -0,0 +1,9 @@
+ .globl main
+ .type main, %function
+main:
+ sub sp, sp, #16
+ mov w0, wzr
+ str wzr, [sp, #12]
+ add sp, sp, #16
+ ret
+.size main, .-main
diff --git a/bolt/test/runtime/AArch64/basic-instrumentation.test b/bolt/test/runtime/AArch64/basic-instrumentation.test
new file mode 100644
index 0000000..0f77b0c
--- /dev/null
+++ b/bolt/test/runtime/AArch64/basic-instrumentation.test
@@ -0,0 +1,22 @@
+# Try to instrument a very fast test. The input binary executes no code at
+# runtime besides returning zero from main, so it is a good trivial case.
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
+RUN: llvm-bolt %t.exe -o %t --instrument \
+RUN: --instrumentation-file=%t \
+RUN: --instrumentation-file-append-pid
+
+# Execute program to collect profile
+RUN: rm %t.*.fdata || echo Nothing to remove
+RUN: %t
+
+# Profile should be written to %t.PID.fdata, check it
+RUN: mv %t.*.fdata %t.fdata
+RUN: cat %t.fdata | FileCheck -check-prefix=CHECK %s
+
+# Check BOLT works with this profile
+RUN: llvm-bolt %t.exe --data %t.fdata -o %t.2 --reorder-blocks=cache
+
+# The instrumented profile should at least say main was called once
+CHECK: main 0 0 1{{$}}
diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
new file mode 100644
index 0000000..76ee8c0
--- /dev/null
+++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+typedef int (*func_ptr)(int, int);
+
+int add(int a, int b) { return a + b; }
+
+int main() {
+ func_ptr fun;
+ fun = add;
+ int sum = fun(10, 20); // indirect call to 'add'
+ printf("The sum is: %d\n", sum);
+ return 0;
+}
+/*
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %cflags %s -o %t.exe -Wl,-q -nopie -fpie
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
+RUN: -o %t.instrumented
+
+# Instrumented program needs to finish returning zero
+RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
+
+# Test that the instrumented data makes sense
+RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
+RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
+RUN: --print-only=main --print-finalized | FileCheck %s
+
+RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT
+
+CHECK-OUTPUT: The sum is: 30
+
+# Check that our indirect call has 1 hit recorded in the fdata file and that
+# this was processed correctly by BOLT
+CHECK: blr x8 # CallProfile: 1 (0 misses) :
+CHECK-NEXT: { add: 1 (0 misses) }
+*/
diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp
similarity index 85%
rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp
rename to bolt/test/runtime/Inputs/exceptions_split.cpp
index 2c136b9..de81adf 100644
--- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp
+++ b/bolt/test/runtime/Inputs/exceptions_split.cpp
@@ -3,31 +3,25 @@
 //
 // Record performance data with no args. Run test with 2 args.
-#include <stdio.h>
 #include <stdint.h>
+#include <stdio.h>
 
-int foo()
-{
- return 0;
-}
+int foo() { return 0; }
 
 void bar(int a) {
 if (a > 2 && a % 2)
 throw new int();
 }
 
-void filter_only(){
- foo();
-}
+void filter_only() { foo(); }
 
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
 unsigned r = 0;
 
 uint64_t limit = (argc >= 2 ? 10 : 5000);
 for (uint64_t i = 0; i < limit; ++i) {
 i += foo();
- try {
+ try {
 bar(argc);
 try {
 if (argc >= 2)
diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s
index 792d084..dfb12f0 100644
--- a/bolt/test/runtime/X86/instrumentation-tail-call.s
+++ b/bolt/test/runtime/X86/instrumentation-tail-call.s
@@ -14,6 +14,9 @@
 
 # CHECK: leaq 0x80(%rsp), %rsp
 
+# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA
+# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1
+
 .text
 .globl main
 .type main, %function
@@ -32,7 +35,8 @@ main:
 movq %rbp, %rsp
 pop %rbp
 mov -0x10(%rsp),%rax
- jmp targetFunc
+ test %rsp, %rsp
+ jne targetFunc
 
 .LBBerror:
 addq $0x20, %rsp
diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test
similarity index 100%
rename from bolt/test/runtime/X86/exceptions-instrumentation.test
rename to bolt/test/runtime/exceptions-instrumentation.test
diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test
similarity index 95%
rename from bolt/test/runtime/X86/pie-exceptions-split.test
rename to bolt/test/runtime/pie-exceptions-split.test
index 124fef6..30f2d02 100644
--- a/bolt/test/runtime/X86/pie-exceptions-split.test
+++ b/bolt/test/runtime/pie-exceptions-split.test
@@ -16,9 +16,9 @@
 RUN: --print-only=main 2>&1 | FileCheck %s
 
 ## All calls to printf() should be from exception handling code that was
 ## recorded as cold during the profile collection run. Check that the calls
 ## are placed after the split point.
-CHECK-NOT: callq printf
+CHECK-NOT: printf
 CHECK: HOT-COLD SPLIT POINT
-CHECK: callq printf
+CHECK: printf
 
 ## Verify the output still executes correctly when the exception path is being
 ## taken.
-- 
2.39.5 (Apple Git-154)