//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements a general divergence analysis for loop vectorization
// and GPU programs. It determines which branches and values in a loop or GPU
// program are divergent. It can help branch optimizations such as jump
// threading and loop unswitching to make better decisions.
//
// GPU programs typically use the SIMD execution model, where multiple threads
// in the same execution group have to execute in lock-step. Therefore, if the
// code contains divergent branches (i.e., threads in a group do not agree on
// which path of the branch to take), the group of threads has to execute all
// the paths from that branch with different subsets of threads enabled until
// they re-converge.
//
// Due to this execution model, some optimizations such as jump
// threading and loop unswitching can interfere with thread re-convergence.
// Therefore, an analysis that computes which branches in a GPU program are
// divergent can help the compiler to selectively run these optimizations.
//
// This implementation is derived from the Vectorization Analysis of the
// Region Vectorizer (RV). That implementation in turn is based on the approach
// described in
//
//   Improving Performance of OpenCL on CPUs
//   Ralf Karrenberg and Sebastian Hack
//   CC '12
//
// This DivergenceAnalysis implementation is generic in the sense that it does
// not itself identify the original sources of divergence. Instead, specialized
// adapter classes (LoopDivergenceAnalysis for loops and GPUDivergenceAnalysis
// for GPU programs) identify the sources of divergence (e.g., special
// variables that hold the thread ID or the loop iteration variable).
//
// The generic implementation propagates divergence to variables that are data
// or sync dependent on a source of divergence.
//
// While data dependency is a well-known concept, the notion of sync dependency
// deserves more explanation. Sync dependence characterizes the control flow
// aspect of the propagation of branch divergence. For example,
//
//   %cond = icmp slt i32 %tid, 10
//   br i1 %cond, label %then, label %else
// then:
//   br label %merge
// else:
//   br label %merge
// merge:
//   %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
// The sync dependence detection (which branch induces divergence in which join
// points) is implemented in the SyncDependenceAnalysis.
//
// The current DivergenceAnalysis implementation has the following limitations:
// 1. intra-procedural. It conservatively considers the arguments of a
//    non-kernel-entry function and the return value of a function call as
//    divergent.
// 2. memory as black box. It conservatively considers values loaded from
//    generic or local address spaces as divergent. This can be improved by
//    leveraging pointer analysis and/or by modelling non-escaping memory
//    objects in SSA as done in RV.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "divergence-analysis"

// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
      IsLCSSAForm(IsLCSSAForm) {}

void DivergenceAnalysis::markDivergent(const Value &DivVal) {
  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
  assert(!isAlwaysUniform(DivVal) &&
         "cannot mark an always-uniform value as divergent");
  DivergentValues.insert(&DivVal);
}

void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
  UniformOverrides.insert(&UniVal);
}

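// returns whether the multi-way terminator Term is divergent, i.e., whether
// its branch or switch condition is divergent; invokes and single-successor
// terminators never diverge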
bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
  if (Term.getNumSuccessors() <= 1)
    return false;
  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
    assert(BranchTerm->isConditional());
    return isDivergent(*BranchTerm->getCondition());
  }
  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
    return isDivergent(*SwitchTerm->getCondition());
  }
  if (isa<InvokeInst>(Term)) {
    return false; // ignore abnormal executions through landingpad
  }

  llvm_unreachable("unexpected terminator");
}

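// a non-phi instruction becomes divergent if any of its operands is divergent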
bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
  // TODO: function calls with side effects, etc.
  for (const auto &Op : I.operands()) {
    if (isDivergent(*Op))
      return true;
  }
  return false;
}

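// returns whether Val is observed as divergent from ObservingBlock: a value
// that is uniform within its defining loop may still be observed as divergent
// outside of it if threads leave the loop through divergent exits in
// different iterations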
bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
                                             const Value &Val) const {
  const auto *Inst = dyn_cast<const Instruction>(&Val);
  if (!Inst)
    return false;
  // check whether any divergent loop carrying Val terminates before control
  // proceeds to ObservingBlock
  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
       Loop = Loop->getParentLoop()) {
    if (DivergentLoops.find(Loop) != DivergentLoops.end())
      return true;
  }

  return false;
}

bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
  // joining of divergent disjoint paths in the parent block of Phi
  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
    return true;
  }

  // An incoming value could be divergent by itself.
  // Otherwise, an incoming value could be uniform within the loop
  // that carries its definition but it may appear divergent
  // from outside the loop. This happens when divergent loop exits
  // drop definitions of that uniform value in different iterations.
  //
  //   for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
  //     if (i % thread_id == 0) break; // divergent loop exit
  //   }
  //   int divI = i; // divI is divergent
  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
    const auto *InVal = Phi.getIncomingValue(i);
    if (isDivergent(*InVal) ||
        isTemporalDivergent(*Phi.getParent(), *InVal)) {
      return true;
    }
  }
  return false;
}

bool DivergenceAnalysis::inRegion(const Instruction &I) const {
  return I.getParent() && inRegion(*I.getParent());
}

bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
  // avoid dereferencing a null RegionLoop for whole-function analyses
  return RegionLoop ? RegionLoop->contains(&BB) : (BB.getParent() == &F);
}

// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
  auto *DivLoop = LI.getLoopFor(&LoopHeader);
  assert(DivLoop && "loopHeader is not actually part of a loop");

  SmallVector<BasicBlock *, 8> TaintStack;
  DivLoop->getExitBlocks(TaintStack);

  // Without LCSSA form, potential users of loop-carried values could be
  // anywhere in the dominance region of DivLoop (including its fringes for
  // phi nodes)
  DenseSet<const BasicBlock *> Visited;
  for (auto *Block : TaintStack) {
    Visited.insert(Block);
  }
  Visited.insert(&LoopHeader);

  while (!TaintStack.empty()) {
    auto *UserBlock = TaintStack.back();
    TaintStack.pop_back();

    // don't spread divergence beyond the region
    if (!inRegion(*UserBlock))
      continue;

    assert(!DivLoop->contains(UserBlock) &&
           "irreducible control flow detected");

    // phi nodes at the fringes of the dominance region
    if (!DT.dominates(&LoopHeader, UserBlock)) {
      // all PHI nodes of UserBlock become divergent
      for (auto &Phi : UserBlock->phis()) {
        Worklist.push_back(&Phi);
      }
      continue;
    }

    // taint outside users of values carried by DivLoop
    for (auto &I : *UserBlock) {
      if (isAlwaysUniform(I))
        continue;
      if (isDivergent(I))
        continue;

      for (auto &Op : I.operands()) {
        auto *OpInst = dyn_cast<Instruction>(&Op);
        if (!OpInst)
          continue;
        if (DivLoop->contains(OpInst->getParent())) {
          markDivergent(I);
          pushUsers(I);
          break;
        }
      }
    }

    // visit all blocks in the dominance region
    for (auto *SuccBlock : successors(UserBlock)) {
      if (!Visited.insert(SuccBlock).second) {
        continue;
      }
      TaintStack.push_back(SuccBlock);
    }
  }
}

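// pushes all phi nodes of Block that are not yet divergent onto the worklist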
void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
  for (const auto &Phi : Block.phis()) {
    if (isDivergent(Phi))
      continue;
    Worklist.push_back(&Phi);
  }
}

void DivergenceAnalysis::pushUsers(const Value &V) {
  for (const auto *User : V.users()) {
    const auto *UserInst = dyn_cast<const Instruction>(User);
    if (!UserInst)
      continue;

    if (isDivergent(*UserInst))
      continue;

    // only propagate divergence inside the region
    if (!inRegion(*UserInst))
      continue;
    Worklist.push_back(UserInst);
  }
}

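// pushes the non-divergent phi nodes of JoinBlock onto the worklist; returns
// true iff JoinBlock lies outside of BranchLoop and is thus a divergent loop
// exit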
bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
                                                 const Loop *BranchLoop) {
  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");

  // ignore divergence outside the region
  if (!inRegion(JoinBlock)) {
    return false;
  }

  // push non-divergent phi nodes in JoinBlock to the worklist
  pushPHINodes(JoinBlock);

  // JoinBlock is a divergent loop exit
  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
    return true;
  }

  // disjoint-paths divergent at JoinBlock
  markBlockJoinDivergent(JoinBlock);
  return false;
}

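// marks Term as divergent and propagates the resulting control divergence:
// phi nodes at the join points computed by the SyncDependenceAnalysis become
// divergence candidates, and loops that Term exits divergently are marked as
// divergent loops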
void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");

  markDivergent(Term);

  const auto *BranchLoop = LI.getLoopFor(Term.getParent());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from Term within the
  // loop; this also covers loop exits that become divergent due to Term.
  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is a divergent loop due to the divergent branch Term
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}

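// propagates the divergence caused by the divergent exits of ExitingLoop:
// loop live-outs and join points outside of ExitingLoop may turn divergent,
// and parent loops that are in turn exited divergently are handled
// recursively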
void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");

  // don't propagate beyond the region
  if (!inRegion(*ExitingLoop.getHeader()))
    return;

  const auto *BranchLoop = ExitingLoop.getParentLoop();

  // Uses of loop-carried values could occur anywhere
  // within the dominance region of the definition. All loop-carried
  // definitions are dominated by the loop header (reducible control).
  // Thus all users have to be in the dominance region of the loop header,
  // except PHI nodes that can also live at the fringes of the dom region
  // (incoming defining value).
  if (!IsLCSSAForm)
    taintLoopLiveOuts(*ExitingLoop.getHeader());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from exits of
  // ExitingLoop; this also covers loop exits (of BranchLoop) that in turn
  // become divergent.
  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is divergent due to a divergent loop exit of ExitingLoop
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}

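// propagates divergence to a fixed point: the worklist is seeded with the
// users of all known divergent values; popped instructions are re-evaluated
// and their users pushed whenever a value newly turns divergent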
void DivergenceAnalysis::compute() {
  for (auto *DivVal : DivergentValues) {
    pushUsers(*DivVal);
  }

  // propagate divergence
  while (!Worklist.empty()) {
    const Instruction &I = *Worklist.back();
    Worklist.pop_back();

    // maintain uniformity of overrides
    if (isAlwaysUniform(I))
      continue;

    // already divergent; nothing to re-compute
    if (isDivergent(I))
      continue;

    // propagate divergence caused by terminator
    if (I.isTerminator()) {
      if (updateTerminator(I)) {
        // propagate control divergence to affected instructions
        propagateBranchDivergence(I);
        continue;
      }
    }

    // update divergence of I due to divergent operands
    bool DivergentUpd = false;
    if (const auto *Phi = dyn_cast<const PHINode>(&I)) {
      DivergentUpd = updatePHINode(*Phi);
    } else {
      DivergentUpd = updateNormalInstruction(I);
    }

    // propagate value divergence to users
    if (DivergentUpd) {
      markDivergent(I);
      pushUsers(I);
    }
  }
}

bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
  return UniformOverrides.find(&V) != UniformOverrides.end();
}

bool DivergenceAnalysis::isDivergent(const Value &V) const {
  return DivergentValues.find(&V) != DivergentValues.end();
}

void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
  if (DivergentValues.empty())
    return;
  // iterate instructions using instructions() to ensure a deterministic order
  for (auto &I : instructions(F)) {
    if (isDivergent(I))
      OS << "DIVERGENT:" << I << '\n';
  }
}

// class GPUDivergenceAnalysis
GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
                                             const DominatorTree &DT,
                                             const PostDominatorTree &PDT,
                                             const LoopInfo &LI,
                                             const TargetTransformInfo &TTI)
    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
  for (auto &I : instructions(F)) {
    if (TTI.isSourceOfDivergence(&I)) {
      DA.markDivergent(I);
    } else if (TTI.isAlwaysUniform(&I)) {
      DA.addUniformOverride(I);
    }
  }
  for (auto &Arg : F.args()) {
    if (TTI.isSourceOfDivergence(&Arg)) {
      DA.markDivergent(Arg);
    }
  }

  DA.compute();
}

bool GPUDivergenceAnalysis::isDivergent(const Value &Val) const {
  return DA.isDivergent(Val);
}

void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const {
  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
  DA.print(OS, Mod);
  OS << "}\n";
}
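
// A minimal usage sketch (hypothetical driver code; F and the required
// analyses DT, PDT, LI and TTI are assumed to be provided by the surrounding
// pass):
//
//   GPUDivergenceAnalysis GDA(F, DT, PDT, LI, TTI);
//   for (const Instruction &I : instructions(F))
//     if (GDA.isDivergent(I))
//       dbgs() << "DIVERGENT: " << I << '\n';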