Merge "merge main into amd-staging" into amd-staging
ronlieb committed Nov 8, 2024
2 parents 5432d43 + c1df43c commit feb26e7
Showing 136 changed files with 3,536 additions and 1,008 deletions.
3 changes: 3 additions & 0 deletions bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
return OutputAddressRange;
}

uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }

bool hasLocSyms() const { return LocSyms != nullptr; }

/// Return mapping of input offsets to symbols in the output.
3 changes: 2 additions & 1 deletion bolt/include/bolt/Core/FunctionLayout.h
@@ -123,7 +123,8 @@ class FunctionFragment {
const_iterator begin() const;
iterator end();
const_iterator end() const;
const BinaryBasicBlock *front() const;
BinaryBasicBlock *front() const;
BinaryBasicBlock *back() const;

friend class FunctionLayout;
};
13 changes: 13 additions & 0 deletions bolt/include/bolt/Passes/LongJmp.h
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
uint32_t NumColdStubs{0};
uint32_t NumSharedStubs{0};

/// The shortest distance for any branch instruction on AArch64.
static constexpr size_t ShortestJumpBits = 16;
static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);

/// The longest single-instruction branch.
static constexpr size_t LongestJumpBits = 28;
static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);

/// Relax all internal function branches including those between fragments.
/// Assume that fragments are placed in different sections but are within
/// 128MB of each other.
void relaxLocalBranches(BinaryFunction &BF);

/// -- Layout estimation methods --
/// Try to do layout before running the emitter, by looking at BinaryFunctions
/// and MCInsts -- this is an estimation. To be correct for longjmp inserter
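For orientation, a brief editorial sketch (not part of the commit): on AArch64 the 16- and 28-bit figures above correspond to the signed byte-offset widths of the shortest-range branch (TBZ/TBNZ, a 14-bit immediate scaled by 4) and of an unconditional B/BL (a 26-bit immediate scaled by 4), so the two spans work out to +/-32 KiB and +/-128 MiB. The snippet below merely restates that arithmetic; fitsInSignedBits is an invented stand-in for the llvm::isIntN check the pass uses.

// Editorial illustration only, not part of this commit.
#include <cstddef>
#include <cstdint>

static constexpr size_t ShortestJumpSpan = 1ULL << (16 - 1); // TBZ/TBNZ reach
static constexpr size_t LongestJumpSpan = 1ULL << (28 - 1);  // B/BL reach
static_assert(ShortestJumpSpan == 32 * 1024, "+/-32 KiB");
static_assert(LongestJumpSpan == 128 * 1024 * 1024, "+/-128 MiB");

// Stand-in for the llvm::isIntN(Bits, Offset) test used by the pass: a signed
// byte offset is encodable iff it fits in the instruction's offset field.
constexpr bool fitsInSignedBits(int64_t Offset, unsigned Bits) {
  return Offset >= -(int64_t(1) << (Bits - 1)) &&
         Offset < (int64_t(1) << (Bits - 1));
}
static_assert(fitsInSignedBits(32 * 1024 - 4, 16), "just inside TBZ reach");
static_assert(!fitsInSignedBits(32 * 1024, 16), "just outside TBZ reach");

This is also why relaxLocalBranches() in the pass can return early for an unsplit function whose estimated size is under ShortestJumpSpan: every intra-fragment offset then fits even the shortest-range branch.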
4 changes: 3 additions & 1 deletion bolt/lib/Core/FunctionLayout.cpp
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
return const_iterator(Layout->block_begin() + StartIndex + Size);
}

const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }

BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }

FunctionLayout::FunctionLayout() { addFragment(); }

290 changes: 287 additions & 3 deletions bolt/lib/Passes/LongJmp.cpp
@@ -11,18 +11,26 @@
//===----------------------------------------------------------------------===//

#include "bolt/Passes/LongJmp.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "longjmp"

using namespace llvm;

namespace opts {
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern cl::opt<unsigned> AlignFunctions;
extern cl::opt<bool> UseOldText;
extern cl::opt<bool> HotFunctionsAtEnd;

static cl::opt<bool>
CompactCodeModel("compact-code-model",
cl::desc("generate code for binaries <128MB on AArch64"),
cl::init(false), cl::cat(BoltCategory));

static cl::opt<bool> GroupStubs("group-stubs",
cl::desc("share stubs across functions"),
cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
if (Next != E && (*Next)->isCold())
return *I;
}
llvm_unreachable("No hot-colt split point found");
llvm_unreachable("No hot-cold split point found");
}

static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
!BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
}
@@ -570,7 +578,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
if (BC.MIB->isPseudo(Inst))
continue;

if (!shouldInsertStub(BC, Inst)) {
if (!mayNeedStub(BC, Inst)) {
DotAddress += InsnSize;
continue;
}
@@ -634,7 +642,283 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
return Error::success();
}

void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
BinaryContext &BC = BF.getBinaryContext();
auto &MIB = BC.MIB;

// Quick path.
if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
return;

auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
return isIntN(Bits, Offset);
};

auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
const BinaryBasicBlock &BB) {
const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
return isBranchOffsetInRange(Inst, Offset);
};

// Keep track of *all* function trampolines that are going to be added to the
// function layout at the end of relaxation.
std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
FunctionTrampolines;

// Function fragments are relaxed independently.
for (FunctionFragment &FF : BF.getLayout().fragments()) {
// Fill out code size estimation for the fragment. Use output BB address
// ranges to store offsets from the start of the function fragment.
uint64_t CodeSize = 0;
for (BinaryBasicBlock *BB : FF) {
BB->setOutputStartAddress(CodeSize);
CodeSize += BB->estimateSize();
BB->setOutputEndAddress(CodeSize);
}

// Dynamically-updated size of the fragment.
uint64_t FragmentSize = CodeSize;

// Size of the trampoline in bytes.
constexpr uint64_t TrampolineSize = 4;

// Trampolines created for the fragment. DestinationBB -> TrampolineBB.
// NB: here we store only the first trampoline created for DestinationBB.
DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;

// Create trampoline code after \p BB or at the end of the fragment if BB
// is nullptr. If \p UpdateOffsets is true, update FragmentSize and offsets
// for basic blocks affected by the insertion of the trampoline.
auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
BinaryBasicBlock *TargetBB, uint64_t Count,
bool UpdateOffsets = true) {
FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
BF.createBasicBlock());
BinaryBasicBlock *TrampolineBB = FunctionTrampolines.back().second.get();

MCInst Inst;
{
auto L = BC.scopeLock();
MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
}
TrampolineBB->addInstruction(Inst);
TrampolineBB->addSuccessor(TargetBB, Count);
TrampolineBB->setExecutionCount(Count);
const uint64_t TrampolineAddress =
BB ? BB->getOutputEndAddress() : FragmentSize;
TrampolineBB->setOutputStartAddress(TrampolineAddress);
TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
TrampolineBB->setFragmentNum(FF.getFragmentNum());

if (!FragmentTrampolines.lookup(TargetBB))
FragmentTrampolines[TargetBB] = TrampolineBB;

if (!UpdateOffsets)
return TrampolineBB;

FragmentSize += TrampolineSize;

// If the trampoline was added at the end of the fragment, offsets of
// other fragments should stay intact.
if (!BB)
return TrampolineBB;

// Update offsets for blocks after BB.
for (BinaryBasicBlock *IBB : FF) {
if (IBB->getOutputStartAddress() >= TrampolineAddress) {
IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
TrampolineSize);
IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
}
}

// Update offsets for trampolines in this fragment that are placed after
// the new trampoline. Note that trampoline blocks are not part of the
// function/fragment layout until we add them right before the return
// from relaxLocalBranches().
for (auto &Pair : FunctionTrampolines) {
BinaryBasicBlock *IBB = Pair.second.get();
if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
continue;
if (IBB == TrampolineBB)
continue;
if (IBB->getOutputStartAddress() >= TrampolineAddress) {
IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
TrampolineSize);
IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
}
}

return TrampolineBB;
};

// Pre-populate trampolines by splitting unconditional branches from the
// containing basic block.
for (BinaryBasicBlock *BB : FF) {
MCInst *Inst = BB->getLastNonPseudoInstr();
if (!Inst || !MIB->isUnconditionalBranch(*Inst))
continue;

const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
BB->eraseInstruction(BB->findInstruction(Inst));
BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);

BinaryBasicBlock::BinaryBranchInfo BI;
BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);

BinaryBasicBlock *TrampolineBB =
addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
}

/// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB.
/// \p InstAddress contains offset of the branch from the start of the
/// containing function fragment.
auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
BinaryFunction *BF = BB->getParent();

// Use branch taken count for optimal relaxation.
const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"Expected valid branch execution count");

// Try to reuse an existing trampoline without introducing any new code.
BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
Count);
auto L = BC.scopeLock();
MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
return;
}

// For cold branches, check if we can introduce a trampoline at the end
// of the fragment that is within the branch's reach. Note that such a
// trampoline may change its address later and become unreachable, in which
// case further relaxation will be needed.
const int64_t OffsetToEnd = FragmentSize - InstAddress;
if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
auto L = BC.scopeLock();
MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());

return;
}

// Insert a new block after the current one and use it as a trampoline.
TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);

// If the other successor is a fall-through, invert the condition code.
const BinaryBasicBlock *const NextBB =
BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
if (BB->getConditionalSuccessor(false) == NextBB) {
BB->swapConditionalSuccessors();
auto L = BC.scopeLock();
MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
} else {
auto L = BC.scopeLock();
MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
}
BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
};

bool MayNeedRelaxation;
uint64_t NumIterations = 0;
do {
MayNeedRelaxation = false;
++NumIterations;
for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
BinaryBasicBlock *BB = *BBI;
uint64_t NextInstOffset = BB->getOutputStartAddress();
for (MCInst &Inst : *BB) {
const size_t InstAddress = NextInstOffset;
if (!MIB->isPseudo(Inst))
NextInstOffset += 4;

if (!mayNeedStub(BF.getBinaryContext(), Inst))
continue;

const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);

// Span of +/-128MB.
if (BitsAvailable == LongestJumpBits)
continue;

const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
assert(TargetBB &&
"Basic block target expected for conditional branch.");

// Check if the relaxation is needed.
if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
isBlockInRange(Inst, InstAddress, *TargetBB))
continue;

relaxBranch(BB, Inst, InstAddress, TargetBB);

MayNeedRelaxation = true;
}
}

// We may have added new instructions, but the whole fragment is less than
// the minimum branch span.
if (FragmentSize < ShortestJumpSpan)
MayNeedRelaxation = false;

} while (MayNeedRelaxation);

LLVM_DEBUG({
if (NumIterations > 2) {
dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
<< " of " << BF << " in " << NumIterations << " iterations\n";
}
});
(void)NumIterations;
}

// Add trampoline blocks from all fragments to the layout.
DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
Insertions;
for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
FunctionTrampolines) {
if (!Pair.second)
continue;
Insertions[Pair.first].emplace_back(std::move(Pair.second));
}

for (auto &Pair : Insertions) {
BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
/*UpdateLayout*/ true, /*UpdateCFI*/ true,
/*RecomputeLPs*/ false);
}
}
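An editorial, standalone sketch of the offset bookkeeping performed by the addTrampolineAfter() lambda above (plain C++, no BOLT types; all names below are invented for illustration): materializing a 4-byte trampoline shifts every block in the same fragment whose start offset is at or past the insertion point, and the fragment size grows accordingly. In the pass this shift is applied both to the fragment's layout blocks and to trampolines created earlier, since the latter are only added to the layout at the end of relaxLocalBranches().

// Editorial illustration only, not part of this commit.
#include <cstdint>
#include <vector>

// Stand-in for the fragment-relative offsets that the pass temporarily stores
// in each basic block's output address range during relaxation.
struct BlockOffsets {
  uint64_t Start;
  uint64_t End;
};

// Append a TrampolineSize-byte gap at InsertAt and slide every block placed at
// or after that point, mirroring the update loops in addTrampolineAfter().
void insertTrampolineAt(std::vector<BlockOffsets> &Blocks, uint64_t &FragmentSize,
                        uint64_t InsertAt, uint64_t TrampolineSize = 4) {
  for (BlockOffsets &B : Blocks) {
    if (B.Start >= InsertAt) {
      B.Start += TrampolineSize;
      B.End += TrampolineSize;
    }
  }
  FragmentSize += TrampolineSize;
}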

Error LongJmpPass::runOnFunctions(BinaryContext &BC) {

if (opts::CompactCodeModel) {
BC.outs()
<< "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";

ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
relaxLocalBranches(BF);
};

ParallelUtilities::PredicateTy SkipPredicate =
[&](const BinaryFunction &BF) {
return !BC.shouldEmit(BF) || !BF.isSimple();
};

ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "RelaxLocalBranches");

return Error::success();
}

BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
bool Modified;