LLVM 21.0.0git
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, strlen, etc.
25//
26// This could recognize common matrix multiplies and dot product idioms and
27// replace them with calls to BLAS (if linked in??).
28//
29//===----------------------------------------------------------------------===//
30
32#include "llvm/ADT/APInt.h"
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/DenseMap.h"
35#include "llvm/ADT/MapVector.h"
36#include "llvm/ADT/SetVector.h"
39#include "llvm/ADT/Statistic.h"
40#include "llvm/ADT/StringRef.h"
55#include "llvm/IR/BasicBlock.h"
56#include "llvm/IR/Constant.h"
57#include "llvm/IR/Constants.h"
58#include "llvm/IR/DataLayout.h"
59#include "llvm/IR/DebugLoc.h"
61#include "llvm/IR/Dominators.h"
62#include "llvm/IR/GlobalValue.h"
64#include "llvm/IR/IRBuilder.h"
65#include "llvm/IR/InstrTypes.h"
66#include "llvm/IR/Instruction.h"
69#include "llvm/IR/Intrinsics.h"
70#include "llvm/IR/LLVMContext.h"
71#include "llvm/IR/Module.h"
72#include "llvm/IR/PassManager.h"
74#include "llvm/IR/Type.h"
75#include "llvm/IR/User.h"
76#include "llvm/IR/Value.h"
77#include "llvm/IR/ValueHandle.h"
80#include "llvm/Support/Debug.h"
87#include <algorithm>
88#include <cassert>
89#include <cstdint>
90#include <utility>
91#include <vector>
92
93using namespace llvm;
94
95#define DEBUG_TYPE "loop-idiom"
96
97STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
98STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
99STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
101 NumShiftUntilBitTest,
102 "Number of uncountable loops recognized as 'shift until bitttest' idiom");
103STATISTIC(NumShiftUntilZero,
104 "Number of uncountable loops recognized as 'shift until zero' idiom");
105
108 DisableLIRPAll("disable-" DEBUG_TYPE "-all",
109 cl::desc("Options to disable Loop Idiom Recognize Pass."),
112
115 DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
116 cl::desc("Proceed with loop idiom recognize pass, but do "
117 "not convert loop(s) to memset."),
120
123 DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
124 cl::desc("Proceed with loop idiom recognize pass, but do "
125 "not convert loop(s) to memcpy."),
128
130 "use-lir-code-size-heurs",
131 cl::desc("Use loop idiom recognition code size heuristics when compiling "
132 "with -Os/-Oz"),
133 cl::init(true), cl::Hidden);
134
135namespace {
136
137class LoopIdiomRecognize {
138 Loop *CurLoop = nullptr;
139 AliasAnalysis *AA;
140 DominatorTree *DT;
141 LoopInfo *LI;
142 ScalarEvolution *SE;
145 const DataLayout *DL;
147 bool ApplyCodeSizeHeuristics;
148 std::unique_ptr<MemorySSAUpdater> MSSAU;
149
150public:
151 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
152 LoopInfo *LI, ScalarEvolution *SE,
154 const TargetTransformInfo *TTI, MemorySSA *MSSA,
155 const DataLayout *DL,
157 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
158 if (MSSA)
159 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
160 }
161
162 bool runOnLoop(Loop *L);
163
164private:
165 using StoreList = SmallVector<StoreInst *, 8>;
166 using StoreListMap = MapVector<Value *, StoreList>;
167
168 StoreListMap StoreRefsForMemset;
169 StoreListMap StoreRefsForMemsetPattern;
170 StoreList StoreRefsForMemcpy;
171 bool HasMemset;
172 bool HasMemsetPattern;
173 bool HasMemcpy;
174
175 /// Return code for isLegalStore()
176 enum LegalStoreKind {
177 None = 0,
178 Memset,
179 MemsetPattern,
180 Memcpy,
181 UnorderedAtomicMemcpy,
182 DontUse // Dummy retval never to be used. Allows catching errors in retval
183 // handling.
184 };
185
186 /// \name Countable Loop Idiom Handling
187 /// @{
188
189 bool runOnCountableLoop();
190 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
192
193 void collectStores(BasicBlock *BB);
194 LegalStoreKind isLegalStore(StoreInst *SI);
195 enum class ForMemset { No, Yes };
196 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
197 ForMemset For);
198
199 template <typename MemInst>
200 bool processLoopMemIntrinsic(
201 BasicBlock *BB,
202 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
203 const SCEV *BECount);
204 bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
205 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
206
207 bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
208 MaybeAlign StoreAlignment, Value *StoredVal,
209 Instruction *TheStore,
211 const SCEVAddRecExpr *Ev, const SCEV *BECount,
212 bool IsNegStride, bool IsLoopMemset = false);
213 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
214 bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
215 const SCEV *StoreSize, MaybeAlign StoreAlign,
216 MaybeAlign LoadAlign, Instruction *TheStore,
217 Instruction *TheLoad,
218 const SCEVAddRecExpr *StoreEv,
219 const SCEVAddRecExpr *LoadEv,
220 const SCEV *BECount);
221 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
222 bool IsLoopMemset = false);
223
224 /// @}
225 /// \name Noncountable Loop Idiom Handling
226 /// @{
227
228 bool runOnNoncountableLoop();
229
230 bool recognizePopcount();
231 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
232 PHINode *CntPhi, Value *Var);
233 bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
234 bool ZeroCheck, size_t CanonicalSize);
235 bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
236 Instruction *DefX, PHINode *CntPhi,
237 Instruction *CntInst);
238 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
239 bool recognizeShiftUntilLessThan();
240 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
241 Instruction *CntInst, PHINode *CntPhi,
242 Value *Var, Instruction *DefX,
243 const DebugLoc &DL, bool ZeroCheck,
244 bool IsCntPhiUsedOutsideLoop,
245 bool InsertSub = false);
246
247 bool recognizeShiftUntilBitTest();
248 bool recognizeShiftUntilZero();
249
250 /// @}
251};
252} // end anonymous namespace
253
256 LPMUpdater &) {
258 return PreservedAnalyses::all();
259
260 const auto *DL = &L.getHeader()->getDataLayout();
261
262 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
263 // pass. Function analyses need to be preserved across loop transformations
264 // but ORE cannot be preserved (see comment before the pass definition).
265 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
266
267 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
268 AR.MSSA, DL, ORE);
269 if (!LIR.runOnLoop(&L))
270 return PreservedAnalyses::all();
271
273 if (AR.MSSA)
274 PA.preserve<MemorySSAAnalysis>();
275 return PA;
276}
277
279 I->replaceAllUsesWith(PoisonValue::get(I->getType()));
280 I->eraseFromParent();
281}
282
283//===----------------------------------------------------------------------===//
284//
285// Implementation of LoopIdiomRecognize
286//
287//===----------------------------------------------------------------------===//
288
289bool LoopIdiomRecognize::runOnLoop(Loop *L) {
290 CurLoop = L;
291 // If the loop could not be converted to canonical form, it must have an
292 // indirectbr in it, just give up.
293 if (!L->getLoopPreheader())
294 return false;
295
296 // Disable loop idiom recognition if the function's name is a common idiom.
297 StringRef Name = L->getHeader()->getParent()->getName();
298 if (Name == "memset" || Name == "memcpy")
299 return false;
300
301 // Determine if code size heuristics need to be applied.
302 ApplyCodeSizeHeuristics =
303 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
304
305 HasMemset = TLI->has(LibFunc_memset);
306 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
307 HasMemcpy = TLI->has(LibFunc_memcpy);
308
309 if (HasMemset || HasMemsetPattern || HasMemcpy)
311 return runOnCountableLoop();
312
313 return runOnNoncountableLoop();
314}
315
316bool LoopIdiomRecognize::runOnCountableLoop() {
317 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
318 assert(!isa<SCEVCouldNotCompute>(BECount) &&
319 "runOnCountableLoop() called on a loop without a predictable"
320 "backedge-taken count");
321
322 // If this loop executes exactly one time, then it should be peeled, not
323 // optimized by this pass.
324 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
325 if (BECst->getAPInt() == 0)
326 return false;
327
329 CurLoop->getUniqueExitBlocks(ExitBlocks);
330
331 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
332 << CurLoop->getHeader()->getParent()->getName()
333 << "] Countable Loop %" << CurLoop->getHeader()->getName()
334 << "\n");
335
336 // The following transforms hoist stores/memsets into the loop pre-header.
337 // Give up if the loop has instructions that may throw.
338 SimpleLoopSafetyInfo SafetyInfo;
339 SafetyInfo.computeLoopSafetyInfo(CurLoop);
340 if (SafetyInfo.anyBlockMayThrow())
341 return false;
342
343 bool MadeChange = false;
344
345 // Scan all the blocks in the loop that are not in subloops.
346 for (auto *BB : CurLoop->getBlocks()) {
347 // Ignore blocks in subloops.
348 if (LI->getLoopFor(BB) != CurLoop)
349 continue;
350
351 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
352 }
353 return MadeChange;
354}
355
356static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
357 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
358 return ConstStride->getAPInt();
359}
360
361/// getMemSetPatternValue - If a strided store of the specified value is safe to
362/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
363/// be passed in. Otherwise, return null.
364///
365/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
366/// just replicate their input array and then pass on to memset_pattern16.
368 // FIXME: This could check for UndefValue because it can be merged into any
369 // other valid pattern.
370
371 // If the value isn't a constant, we can't promote it to being in a constant
372 // array. We could theoretically do a store to an alloca or something, but
373 // that doesn't seem worthwhile.
374 Constant *C = dyn_cast<Constant>(V);
375 if (!C || isa<ConstantExpr>(C))
376 return nullptr;
377
378 // Only handle simple values that are a power of two bytes in size.
379 uint64_t Size = DL->getTypeSizeInBits(V->getType());
380 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
381 return nullptr;
382
383 // Don't care enough about darwin/ppc to implement this.
384 if (DL->isBigEndian())
385 return nullptr;
386
387 // Convert to size in bytes.
388 Size /= 8;
389
390 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
391 // if the top and bottom are the same (e.g. for vectors and large integers).
392 if (Size > 16)
393 return nullptr;
394
395 // If the constant is exactly 16 bytes, just use it.
396 if (Size == 16)
397 return C;
398
399 // Otherwise, we'll use an array of the constants.
400 unsigned ArraySize = 16 / Size;
401 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
402 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
403}
404
405LoopIdiomRecognize::LegalStoreKind
406LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
407 // Don't touch volatile stores.
408 if (SI->isVolatile())
409 return LegalStoreKind::None;
410 // We only want simple or unordered-atomic stores.
411 if (!SI->isUnordered())
412 return LegalStoreKind::None;
413
414 // Avoid merging nontemporal stores.
415 if (SI->getMetadata(LLVMContext::MD_nontemporal))
416 return LegalStoreKind::None;
417
418 Value *StoredVal = SI->getValueOperand();
419 Value *StorePtr = SI->getPointerOperand();
420
421 // Don't convert stores of non-integral pointer types to memsets (which stores
422 // integers).
423 if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
424 return LegalStoreKind::None;
425
426 // Reject stores that are so large that they overflow an unsigned.
427 // When storing out scalable vectors we bail out for now, since the code
428 // below currently only works for constant strides.
429 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
430 if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
431 (SizeInBits.getFixedValue() >> 32) != 0)
432 return LegalStoreKind::None;
433
434 // See if the pointer expression is an AddRec like {base,+,1} on the current
435 // loop, which indicates a strided store. If we have something else, it's a
436 // random store we can't handle.
437 const SCEVAddRecExpr *StoreEv =
438 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
439 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
440 return LegalStoreKind::None;
441
442 // Check to see if we have a constant stride.
443 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
444 return LegalStoreKind::None;
445
446 // See if the store can be turned into a memset.
447
448 // If the stored value is a byte-wise value (like i32 -1), then it may be
449 // turned into a memset of i8 -1, assuming that all the consecutive bytes
450 // are stored. A store of i32 0x01020304 can never be turned into a memset,
451 // but it can be turned into memset_pattern if the target supports it.
452 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
453
454 // Note: memset and memset_pattern on unordered-atomic is yet not supported
455 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
456
457 // If we're allowed to form a memset, and the stored value would be
458 // acceptable for memset, use it.
459 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
460 // Verify that the stored value is loop invariant. If not, we can't
461 // promote the memset.
462 CurLoop->isLoopInvariant(SplatValue)) {
463 // It looks like we can use SplatValue.
464 return LegalStoreKind::Memset;
465 }
466 if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
467 // Don't create memset_pattern16s with address spaces.
468 StorePtr->getType()->getPointerAddressSpace() == 0 &&
469 getMemSetPatternValue(StoredVal, DL)) {
470 // It looks like we can use PatternValue!
471 return LegalStoreKind::MemsetPattern;
472 }
473
474 // Otherwise, see if the store can be turned into a memcpy.
475 if (HasMemcpy && !DisableLIRP::Memcpy) {
476 // Check to see if the stride matches the size of the store. If so, then we
477 // know that every byte is touched in the loop.
478 APInt Stride = getStoreStride(StoreEv);
479 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
480 if (StoreSize != Stride && StoreSize != -Stride)
481 return LegalStoreKind::None;
482
483 // The store must be feeding a non-volatile load.
484 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
485
486 // Only allow non-volatile loads
487 if (!LI || LI->isVolatile())
488 return LegalStoreKind::None;
489 // Only allow simple or unordered-atomic loads
490 if (!LI->isUnordered())
491 return LegalStoreKind::None;
492
493 // See if the pointer expression is an AddRec like {base,+,1} on the current
494 // loop, which indicates a strided load. If we have something else, it's a
495 // random load we can't handle.
496 const SCEVAddRecExpr *LoadEv =
497 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
498 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
499 return LegalStoreKind::None;
500
501 // The store and load must share the same stride.
502 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
503 return LegalStoreKind::None;
504
505 // Success. This store can be converted into a memcpy.
506 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
507 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
508 : LegalStoreKind::Memcpy;
509 }
510 // This store can't be transformed into a memset/memcpy.
511 return LegalStoreKind::None;
512}
513
514void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
515 StoreRefsForMemset.clear();
516 StoreRefsForMemsetPattern.clear();
517 StoreRefsForMemcpy.clear();
518 for (Instruction &I : *BB) {
519 StoreInst *SI = dyn_cast<StoreInst>(&I);
520 if (!SI)
521 continue;
522
523 // Make sure this is a strided store with a constant stride.
524 switch (isLegalStore(SI)) {
525 case LegalStoreKind::None:
526 // Nothing to do
527 break;
528 case LegalStoreKind::Memset: {
529 // Find the base pointer.
530 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
531 StoreRefsForMemset[Ptr].push_back(SI);
532 } break;
533 case LegalStoreKind::MemsetPattern: {
534 // Find the base pointer.
535 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
536 StoreRefsForMemsetPattern[Ptr].push_back(SI);
537 } break;
538 case LegalStoreKind::Memcpy:
539 case LegalStoreKind::UnorderedAtomicMemcpy:
540 StoreRefsForMemcpy.push_back(SI);
541 break;
542 default:
543 assert(false && "unhandled return value");
544 break;
545 }
546 }
547}
548
549/// runOnLoopBlock - Process the specified block, which lives in a counted loop
550/// with the specified backedge count. This block is known to be in the current
551/// loop and not in any subloops.
552bool LoopIdiomRecognize::runOnLoopBlock(
553 BasicBlock *BB, const SCEV *BECount,
554 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
555 // We can only promote stores in this block if they are unconditionally
556 // executed in the loop. For a block to be unconditionally executed, it has
557 // to dominate all the exit blocks of the loop. Verify this now.
558 for (BasicBlock *ExitBlock : ExitBlocks)
559 if (!DT->dominates(BB, ExitBlock))
560 return false;
561
562 bool MadeChange = false;
563 // Look for store instructions, which may be optimized to memset/memcpy.
564 collectStores(BB);
565
566 // Look for a single store or sets of stores with a common base, which can be
567 // optimized into a memset (memset_pattern). The latter most commonly happens
568 // with structs and handunrolled loops.
569 for (auto &SL : StoreRefsForMemset)
570 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
571
572 for (auto &SL : StoreRefsForMemsetPattern)
573 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
574
575 // Optimize the store into a memcpy, if it feeds an similarly strided load.
576 for (auto &SI : StoreRefsForMemcpy)
577 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
578
579 MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
580 BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
581 MadeChange |= processLoopMemIntrinsic<MemSetInst>(
582 BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
583
584 return MadeChange;
585}
586
587/// See if this store(s) can be promoted to a memset.
588bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
589 const SCEV *BECount, ForMemset For) {
590 // Try to find consecutive stores that can be transformed into memsets.
591 SetVector<StoreInst *> Heads, Tails;
593
594 // Do a quadratic search on all of the given stores and find
595 // all of the pairs of stores that follow each other.
596 SmallVector<unsigned, 16> IndexQueue;
597 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
598 assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
599
600 Value *FirstStoredVal = SL[i]->getValueOperand();
601 Value *FirstStorePtr = SL[i]->getPointerOperand();
602 const SCEVAddRecExpr *FirstStoreEv =
603 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
604 APInt FirstStride = getStoreStride(FirstStoreEv);
605 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
606
607 // See if we can optimize just this store in isolation.
608 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
609 Heads.insert(SL[i]);
610 continue;
611 }
612
613 Value *FirstSplatValue = nullptr;
614 Constant *FirstPatternValue = nullptr;
615
616 if (For == ForMemset::Yes)
617 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
618 else
619 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
620
621 assert((FirstSplatValue || FirstPatternValue) &&
622 "Expected either splat value or pattern value.");
623
624 IndexQueue.clear();
625 // If a store has multiple consecutive store candidates, search Stores
626 // array according to the sequence: from i+1 to e, then from i-1 to 0.
627 // This is because usually pairing with immediate succeeding or preceding
628 // candidate create the best chance to find memset opportunity.
629 unsigned j = 0;
630 for (j = i + 1; j < e; ++j)
631 IndexQueue.push_back(j);
632 for (j = i; j > 0; --j)
633 IndexQueue.push_back(j - 1);
634
635 for (auto &k : IndexQueue) {
636 assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
637 Value *SecondStorePtr = SL[k]->getPointerOperand();
638 const SCEVAddRecExpr *SecondStoreEv =
639 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
640 APInt SecondStride = getStoreStride(SecondStoreEv);
641
642 if (FirstStride != SecondStride)
643 continue;
644
645 Value *SecondStoredVal = SL[k]->getValueOperand();
646 Value *SecondSplatValue = nullptr;
647 Constant *SecondPatternValue = nullptr;
648
649 if (For == ForMemset::Yes)
650 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
651 else
652 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
653
654 assert((SecondSplatValue || SecondPatternValue) &&
655 "Expected either splat value or pattern value.");
656
657 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
658 if (For == ForMemset::Yes) {
659 if (isa<UndefValue>(FirstSplatValue))
660 FirstSplatValue = SecondSplatValue;
661 if (FirstSplatValue != SecondSplatValue)
662 continue;
663 } else {
664 if (isa<UndefValue>(FirstPatternValue))
665 FirstPatternValue = SecondPatternValue;
666 if (FirstPatternValue != SecondPatternValue)
667 continue;
668 }
669 Tails.insert(SL[k]);
670 Heads.insert(SL[i]);
671 ConsecutiveChain[SL[i]] = SL[k];
672 break;
673 }
674 }
675 }
676
677 // We may run into multiple chains that merge into a single chain. We mark the
678 // stores that we transformed so that we don't visit the same store twice.
679 SmallPtrSet<Value *, 16> TransformedStores;
680 bool Changed = false;
681
682 // For stores that start but don't end a link in the chain:
683 for (StoreInst *I : Heads) {
684 if (Tails.count(I))
685 continue;
686
687 // We found a store instr that starts a chain. Now follow the chain and try
688 // to transform it.
689 SmallPtrSet<Instruction *, 8> AdjacentStores;
690 StoreInst *HeadStore = I;
691 unsigned StoreSize = 0;
692
693 // Collect the chain into a list.
694 while (Tails.count(I) || Heads.count(I)) {
695 if (TransformedStores.count(I))
696 break;
697 AdjacentStores.insert(I);
698
699 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
700 // Move to the next value in the chain.
701 I = ConsecutiveChain[I];
702 }
703
704 Value *StoredVal = HeadStore->getValueOperand();
705 Value *StorePtr = HeadStore->getPointerOperand();
706 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
707 APInt Stride = getStoreStride(StoreEv);
708
709 // Check to see if the stride matches the size of the stores. If so, then
710 // we know that every byte is touched in the loop.
711 if (StoreSize != Stride && StoreSize != -Stride)
712 continue;
713
714 bool IsNegStride = StoreSize == -Stride;
715
716 Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
717 const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
718 if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
719 MaybeAlign(HeadStore->getAlign()), StoredVal,
720 HeadStore, AdjacentStores, StoreEv, BECount,
721 IsNegStride)) {
722 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
723 Changed = true;
724 }
725 }
726
727 return Changed;
728}
729
730/// processLoopMemIntrinsic - Template function for calling different processor
731/// functions based on mem intrinsic type.
732template <typename MemInst>
733bool LoopIdiomRecognize::processLoopMemIntrinsic(
734 BasicBlock *BB,
735 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
736 const SCEV *BECount) {
737 bool MadeChange = false;
738 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
739 Instruction *Inst = &*I++;
740 // Look for memory instructions, which may be optimized to a larger one.
741 if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
742 WeakTrackingVH InstPtr(&*I);
743 if (!(this->*Processor)(MI, BECount))
744 continue;
745 MadeChange = true;
746
747 // If processing the instruction invalidated our iterator, start over from
748 // the top of the block.
749 if (!InstPtr)
750 I = BB->begin();
751 }
752 }
753 return MadeChange;
754}
755
756/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
757bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
758 const SCEV *BECount) {
759 // We can only handle non-volatile memcpys with a constant size.
760 if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
761 return false;
762
763 // If we're not allowed to hack on memcpy, we fail.
764 if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
765 return false;
766
767 Value *Dest = MCI->getDest();
768 Value *Source = MCI->getSource();
769 if (!Dest || !Source)
770 return false;
771
772 // See if the load and store pointer expressions are AddRec like {base,+,1} on
773 // the current loop, which indicates a strided load and store. If we have
774 // something else, it's a random load or store we can't handle.
775 const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
776 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
777 return false;
778 const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
779 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
780 return false;
781
782 // Reject memcpys that are so large that they overflow an unsigned.
783 uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
784 if ((SizeInBytes >> 32) != 0)
785 return false;
786
787 // Check if the stride matches the size of the memcpy. If so, then we know
788 // that every byte is touched in the loop.
789 const SCEVConstant *ConstStoreStride =
790 dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
791 const SCEVConstant *ConstLoadStride =
792 dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
793 if (!ConstStoreStride || !ConstLoadStride)
794 return false;
795
796 APInt StoreStrideValue = ConstStoreStride->getAPInt();
797 APInt LoadStrideValue = ConstLoadStride->getAPInt();
798 // Huge stride value - give up
799 if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
800 return false;
801
802 if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
803 ORE.emit([&]() {
804 return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
805 << ore::NV("Inst", "memcpy") << " in "
806 << ore::NV("Function", MCI->getFunction())
807 << " function will not be hoisted: "
808 << ore::NV("Reason", "memcpy size is not equal to stride");
809 });
810 return false;
811 }
812
813 int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
814 int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
815 // Check if the load stride matches the store stride.
816 if (StoreStrideInt != LoadStrideInt)
817 return false;
818
819 return processLoopStoreOfLoopLoad(
820 Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
821 MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
822 BECount);
823}
824
825/// processLoopMemSet - See if this memset can be promoted to a large memset.
826bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
827 const SCEV *BECount) {
828 // We can only handle non-volatile memsets.
829 if (MSI->isVolatile())
830 return false;
831
832 // If we're not allowed to hack on memset, we fail.
833 if (!HasMemset || DisableLIRP::Memset)
834 return false;
835
836 Value *Pointer = MSI->getDest();
837
838 // See if the pointer expression is an AddRec like {base,+,1} on the current
839 // loop, which indicates a strided store. If we have something else, it's a
840 // random store we can't handle.
841 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
842 if (!Ev || Ev->getLoop() != CurLoop)
843 return false;
844 if (!Ev->isAffine()) {
845 LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
846 return false;
847 }
848
849 const SCEV *PointerStrideSCEV = Ev->getOperand(1);
850 const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
851 if (!PointerStrideSCEV || !MemsetSizeSCEV)
852 return false;
853
854 bool IsNegStride = false;
855 const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
856
857 if (IsConstantSize) {
858 // Memset size is constant.
859 // Check if the pointer stride matches the memset size. If so, then
860 // we know that every byte is touched in the loop.
861 LLVM_DEBUG(dbgs() << " memset size is constant\n");
862 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
863 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
864 if (!ConstStride)
865 return false;
866
867 APInt Stride = ConstStride->getAPInt();
868 if (SizeInBytes != Stride && SizeInBytes != -Stride)
869 return false;
870
871 IsNegStride = SizeInBytes == -Stride;
872 } else {
873 // Memset size is non-constant.
874 // Check if the pointer stride matches the memset size.
875 // To be conservative, the pass would not promote pointers that aren't in
876 // address space zero. Also, the pass only handles memset length and stride
877 // that are invariant for the top level loop.
878 LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
879 if (Pointer->getType()->getPointerAddressSpace() != 0) {
880 LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
881 << "abort\n");
882 return false;
883 }
884 if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
885 LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
886 << "abort\n");
887 return false;
888 }
889
890 // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
891 IsNegStride = PointerStrideSCEV->isNonConstantNegative();
892 const SCEV *PositiveStrideSCEV =
893 IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
894 : PointerStrideSCEV;
895 LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
896 << " PositiveStrideSCEV: " << *PositiveStrideSCEV
897 << "\n");
898
899 if (PositiveStrideSCEV != MemsetSizeSCEV) {
900 // If an expression is covered by the loop guard, compare again and
901 // proceed with optimization if equal.
902 const SCEV *FoldedPositiveStride =
903 SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
904 const SCEV *FoldedMemsetSize =
905 SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
906
907 LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
908 << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
909 << " FoldedPositiveStride: " << *FoldedPositiveStride
910 << "\n");
911
912 if (FoldedPositiveStride != FoldedMemsetSize) {
913 LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
914 return false;
915 }
916 }
917 }
918
919 // Verify that the memset value is loop invariant. If not, we can't promote
920 // the memset.
921 Value *SplatValue = MSI->getValue();
922 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
923 return false;
924
926 MSIs.insert(MSI);
927 return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
928 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
929 BECount, IsNegStride, /*IsLoopMemset=*/true);
930}
931
932/// mayLoopAccessLocation - Return true if the specified loop might access the
933/// specified pointer location, which is a loop-strided access. The 'Access'
934/// argument specifies what the verboten forms of access are (read or write).
935static bool
937 const SCEV *BECount, const SCEV *StoreSizeSCEV,
938 AliasAnalysis &AA,
939 SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
940 // Get the location that may be stored across the loop. Since the access is
941 // strided positively through memory, we say that the modified location starts
942 // at the pointer and has infinite size.
944
945 // If the loop iterates a fixed number of times, we can refine the access size
946 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
947 const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
948 const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
949 if (BECst && ConstSize) {
950 std::optional<uint64_t> BEInt = BECst->getAPInt().tryZExtValue();
951 std::optional<uint64_t> SizeInt = ConstSize->getAPInt().tryZExtValue();
952 // FIXME: Should this check for overflow?
953 if (BEInt && SizeInt)
954 AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
955 }
956
957 // TODO: For this to be really effective, we have to dive into the pointer
958 // operand in the store. Store to &A[i] of 100 will always return may alias
959 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
960 // which will then no-alias a store to &A[100].
961 MemoryLocation StoreLoc(Ptr, AccessSize);
962
963 for (BasicBlock *B : L->blocks())
964 for (Instruction &I : *B)
965 if (!IgnoredInsts.contains(&I) &&
966 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
967 return true;
968 return false;
969}
970
971// If we have a negative stride, Start refers to the end of the memory location
972// we're trying to memset. Therefore, we need to recompute the base pointer,
973// which is just Start - BECount*Size.
974static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
975 Type *IntPtr, const SCEV *StoreSizeSCEV,
976 ScalarEvolution *SE) {
977 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
978 if (!StoreSizeSCEV->isOne()) {
979 // index = back edge count * store size
980 Index = SE->getMulExpr(Index,
981 SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
983 }
984 // base pointer = start - index * store size
985 return SE->getMinusSCEV(Start, Index);
986}
987
988/// Compute the number of bytes as a SCEV from the backedge taken count.
989///
990/// This also maps the SCEV into the provided type and tries to handle the
991/// computation in a way that will fold cleanly.
992static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
993 const SCEV *StoreSizeSCEV, Loop *CurLoop,
994 const DataLayout *DL, ScalarEvolution *SE) {
995 const SCEV *TripCountSCEV =
996 SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
997 return SE->getMulExpr(TripCountSCEV,
998 SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1000}
1001
1002/// processLoopStridedStore - We see a strided store of some value. If we can
1003/// transform this into a memset or memset_pattern in the loop preheader, do so.
1004bool LoopIdiomRecognize::processLoopStridedStore(
1005 Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
1006 Value *StoredVal, Instruction *TheStore,
1008 const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
1009 Module *M = TheStore->getModule();
1010 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
1011 Constant *PatternValue = nullptr;
1012
1013 if (!SplatValue)
1014 PatternValue = getMemSetPatternValue(StoredVal, DL);
1015
1016 assert((SplatValue || PatternValue) &&
1017 "Expected either splat value or pattern value.");
1018
1019 // The trip count of the loop and the base pointer of the addrec SCEV is
1020 // guaranteed to be loop invariant, which means that it should dominate the
1021 // header. This allows us to insert code for it in the preheader.
1022 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1023 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1024 IRBuilder<> Builder(Preheader->getTerminator());
1025 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1026 SCEVExpanderCleaner ExpCleaner(Expander);
1027
1028 Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
1029 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
1030
1031 bool Changed = false;
1032 const SCEV *Start = Ev->getStart();
1033 // Handle negative strided loops.
1034 if (IsNegStride)
1035 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
1036
1037 // TODO: ideally we should still be able to generate memset if SCEV expander
1038 // is taught to generate the dependencies at the latest point.
1039 if (!Expander.isSafeToExpand(Start))
1040 return Changed;
1041
1042 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
1043 // this into a memset in the loop preheader now if we want. However, this
1044 // would be unsafe to do if there is anything else in the loop that may read
1045 // or write to the aliased location. Check for any overlap by generating the
1046 // base pointer and checking the region.
1047 Value *BasePtr =
1048 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
1049
1050 // From here on out, conservatively report to the pass manager that we've
1051 // changed the IR, even if we later clean up these added instructions. There
1052 // may be structural differences e.g. in the order of use lists not accounted
1053 // for in just a textual dump of the IR. This is written as a variable, even
1054 // though statically all the places this dominates could be replaced with
1055 // 'true', with the hope that anyone trying to be clever / "more precise" with
1056 // the return value will read this comment, and leave them alone.
1057 Changed = true;
1058
1059 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1060 StoreSizeSCEV, *AA, Stores))
1061 return Changed;
1062
1063 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
1064 return Changed;
1065
1066 // Okay, everything looks good, insert the memset.
1067
1068 const SCEV *NumBytesS =
1069 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1070
1071 // TODO: ideally we should still be able to generate memset if SCEV expander
1072 // is taught to generate the dependencies at the latest point.
1073 if (!Expander.isSafeToExpand(NumBytesS))
1074 return Changed;
1075
1076 Value *NumBytes =
1077 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1078
1079 if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
1080 return Changed;
1081
1082 AAMDNodes AATags = TheStore->getAAMetadata();
1083 for (Instruction *Store : Stores)
1084 AATags = AATags.merge(Store->getAAMetadata());
1085 if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1086 AATags = AATags.extendTo(CI->getZExtValue());
1087 else
1088 AATags = AATags.extendTo(-1);
1089
1090 CallInst *NewCall;
1091 if (SplatValue) {
1092 NewCall = Builder.CreateMemSet(
1093 BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
1094 /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
1095 } else {
1096 assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
1097 // Everything is emitted in default address space
1098 Type *Int8PtrTy = DestInt8PtrTy;
1099
1100 StringRef FuncName = "memset_pattern16";
1101 FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
1102 Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
1103 inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
1104
1105 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1106 // an constant array of 16-bytes. Plop the value into a mergable global.
1107 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1109 PatternValue, ".memset_pattern");
1110 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1111 GV->setAlignment(Align(16));
1112 Value *PatternPtr = GV;
1113 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1114
1115 // Set the TBAA info if present.
1116 if (AATags.TBAA)
1117 NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA);
1118
1119 if (AATags.Scope)
1120 NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope);
1121
1122 if (AATags.NoAlias)
1123 NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias);
1124 }
1125
1126 NewCall->setDebugLoc(TheStore->getDebugLoc());
1127
1128 if (MSSAU) {
1129 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1130 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1131 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1132 }
1133
1134 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1135 << " from store to: " << *Ev << " at: " << *TheStore
1136 << "\n");
1137
1138 ORE.emit([&]() {
1139 OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
1140 NewCall->getDebugLoc(), Preheader);
1141 R << "Transformed loop-strided store in "
1142 << ore::NV("Function", TheStore->getFunction())
1143 << " function into a call to "
1144 << ore::NV("NewFunction", NewCall->getCalledFunction())
1145 << "() intrinsic";
1146 if (!Stores.empty())
1147 R << ore::setExtraArgs();
1148 for (auto *I : Stores) {
1149 R << ore::NV("FromBlock", I->getParent()->getName())
1150 << ore::NV("ToBlock", Preheader->getName());
1151 }
1152 return R;
1153 });
1154
1155 // Okay, the memset has been formed. Zap the original store and anything that
1156 // feeds into it.
1157 for (auto *I : Stores) {
1158 if (MSSAU)
1159 MSSAU->removeMemoryAccess(I, true);
1161 }
1162 if (MSSAU && VerifyMemorySSA)
1163 MSSAU->getMemorySSA()->verifyMemorySSA();
1164 ++NumMemSet;
1165 ExpCleaner.markResultUsed();
1166 return true;
1167}
1168
1169/// If the stored value is a strided load in the same loop with the same stride
1170/// this may be transformable into a memcpy. This kicks in for stuff like
1171/// for (i) A[i] = B[i];
1172bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1173 const SCEV *BECount) {
1174 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1175
1176 Value *StorePtr = SI->getPointerOperand();
1177 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1178 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1179
1180 // The store must be feeding a non-volatile load.
1181 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1182 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1183
1184 // See if the pointer expression is an AddRec like {base,+,1} on the current
1185 // loop, which indicates a strided load. If we have something else, it's a
1186 // random load we can't handle.
1187 Value *LoadPtr = LI->getPointerOperand();
1188 const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1189
1190 const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
1191 return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
1192 SI->getAlign(), LI->getAlign(), SI, LI,
1193 StoreEv, LoadEv, BECount);
1194}
1195
1196namespace {
1197class MemmoveVerifier {
1198public:
1199 explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
1200 const DataLayout &DL)
1202 LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
1204 StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
1205 IsSameObject(BP1 == BP2) {}
1206
1207 bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
1208 const Instruction &TheLoad,
1209 bool IsMemCpy) const {
1210 if (IsMemCpy) {
1211 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1212 // for negative stride.
1213 if ((!IsNegStride && LoadOff <= StoreOff) ||
1214 (IsNegStride && LoadOff >= StoreOff))
1215 return false;
1216 } else {
1217 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1218 // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1219 int64_t LoadSize =
1220 DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
1221 if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
1222 return false;
1223 if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
1224 (IsNegStride && LoadOff + LoadSize > StoreOff))
1225 return false;
1226 }
1227 return true;
1228 }
1229
1230private:
1231 const DataLayout &DL;
1232 int64_t LoadOff = 0;
1233 int64_t StoreOff = 0;
1234 const Value *BP1;
1235 const Value *BP2;
1236
1237public:
1238 const bool IsSameObject;
1239};
1240} // namespace
1241
1242bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1243 Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
1244 MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1245 Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
1246 const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
1247
1248 // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1249 // conservatively bail here, since otherwise we may have to transform
1250 // llvm.memcpy.inline into llvm.memcpy which is illegal.
1251 if (isa<MemCpyInlineInst>(TheStore))
1252 return false;
1253
1254 // The trip count of the loop and the base pointer of the addrec SCEV is
1255 // guaranteed to be loop invariant, which means that it should dominate the
1256 // header. This allows us to insert code for it in the preheader.
1257 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1258 IRBuilder<> Builder(Preheader->getTerminator());
1259 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1260
1261 SCEVExpanderCleaner ExpCleaner(Expander);
1262
1263 bool Changed = false;
1264 const SCEV *StrStart = StoreEv->getStart();
1265 unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1266 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1267
1268 APInt Stride = getStoreStride(StoreEv);
1269 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1270
1271 // TODO: Deal with non-constant size; Currently expect constant store size
1272 assert(ConstStoreSize && "store size is expected to be a constant");
1273
1274 int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1275 bool IsNegStride = StoreSize == -Stride;
1276
1277 // Handle negative strided loops.
1278 if (IsNegStride)
1279 StrStart =
1280 getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1281
1282 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1283 // this into a memcpy in the loop preheader now if we want. However, this
1284 // would be unsafe to do if there is anything else in the loop that may read
1285 // or write the memory region we're storing to. This includes the load that
1286 // feeds the stores. Check for an alias by generating the base address and
1287 // checking everything.
1288 Value *StoreBasePtr = Expander.expandCodeFor(
1289 StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());
1290
1291 // From here on out, conservatively report to the pass manager that we've
1292 // changed the IR, even if we later clean up these added instructions. There
1293 // may be structural differences e.g. in the order of use lists not accounted
1294 // for in just a textual dump of the IR. This is written as a variable, even
1295 // though statically all the places this dominates could be replaced with
1296 // 'true', with the hope that anyone trying to be clever / "more precise" with
1297 // the return value will read this comment, and leave them alone.
1298 Changed = true;
1299
1300 SmallPtrSet<Instruction *, 2> IgnoredInsts;
1301 IgnoredInsts.insert(TheStore);
1302
1303 bool IsMemCpy = isa<MemCpyInst>(TheStore);
1304 const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1305
1306 bool LoopAccessStore =
1307 mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1308 StoreSizeSCEV, *AA, IgnoredInsts);
1309 if (LoopAccessStore) {
1310 // For memmove case it's not enough to guarantee that loop doesn't access
1311 // TheStore and TheLoad. Additionally we need to make sure that TheStore is
1312 // the only user of TheLoad.
1313 if (!TheLoad->hasOneUse())
1314 return Changed;
1315 IgnoredInsts.insert(TheLoad);
1316 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1317 BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
1318 ORE.emit([&]() {
1319 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1320 TheStore)
1321 << ore::NV("Inst", InstRemark) << " in "
1322 << ore::NV("Function", TheStore->getFunction())
1323 << " function will not be hoisted: "
1324 << ore::NV("Reason", "The loop may access store location");
1325 });
1326 return Changed;
1327 }
1328 IgnoredInsts.erase(TheLoad);
1329 }
1330
1331 const SCEV *LdStart = LoadEv->getStart();
1332 unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1333
1334 // Handle negative strided loops.
1335 if (IsNegStride)
1336 LdStart =
1337 getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1338
1339 // For a memcpy, we have to make sure that the input array is not being
1340 // mutated by the loop.
1341 Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
1342 Preheader->getTerminator());
1343
1344 // If the store is a memcpy instruction, we must check if it will write to
1345 // the load memory locations. So remove it from the ignored stores.
1346 MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1347 if (IsMemCpy && !Verifier.IsSameObject)
1348 IgnoredInsts.erase(TheStore);
1349 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1350 StoreSizeSCEV, *AA, IgnoredInsts)) {
1351 ORE.emit([&]() {
1352 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1353 << ore::NV("Inst", InstRemark) << " in "
1354 << ore::NV("Function", TheStore->getFunction())
1355 << " function will not be hoisted: "
1356 << ore::NV("Reason", "The loop may access load location");
1357 });
1358 return Changed;
1359 }
1360
1361 bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
1362 bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1363
1364 if (IsAtomic) {
1365 // For now don't support unordered atomic memmove.
1366 if (UseMemMove)
1367 return Changed;
1368
1369 // We cannot allow unaligned ops for unordered load/store, so reject
1370 // anything where the alignment isn't at least the element size.
1371 assert((StoreAlign && LoadAlign) &&
1372 "Expect unordered load/store to have align.");
1373 if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
1374 return Changed;
1375
1376 // If the element.atomic memcpy is not lowered into explicit
1377 // loads/stores later, then it will be lowered into an element-size
1378 // specific lib call. If the lib call doesn't exist for our store size, then
1379 // we shouldn't generate the memcpy.
1380 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1381 return Changed;
1382 }
1383
1384 if (UseMemMove)
1385 if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1386 IsMemCpy))
1387 return Changed;
1388
1389 if (avoidLIRForMultiBlockLoop())
1390 return Changed;
1391
1392 // Okay, everything is safe, we can transform this!
1393
1394 const SCEV *NumBytesS =
1395 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1396
1397 Value *NumBytes =
1398 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1399
1400 AAMDNodes AATags = TheLoad->getAAMetadata();
1401 AAMDNodes StoreAATags = TheStore->getAAMetadata();
1402 AATags = AATags.merge(StoreAATags);
1403 if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1404 AATags = AATags.extendTo(CI->getZExtValue());
1405 else
1406 AATags = AATags.extendTo(-1);
1407
1408 CallInst *NewCall = nullptr;
1409 // Check whether to generate an unordered atomic memcpy:
1410 // If the load or store are atomic, then they must necessarily be unordered
1411 // by previous checks.
1412 if (!IsAtomic) {
1413 if (UseMemMove)
1414 NewCall = Builder.CreateMemMove(
1415 StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1416 /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
1417 else
1418 NewCall =
1419 Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
1420 NumBytes, /*isVolatile=*/false, AATags.TBAA,
1421 AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
1422 } else {
1423 // Create the call.
1424 // Note that unordered atomic loads/stores are *required* by the spec to
1425 // have an alignment but non-atomic loads/stores may not.
1426 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1427 StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
1428 AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
1429 }
1430 NewCall->setDebugLoc(TheStore->getDebugLoc());
1431
1432 if (MSSAU) {
1433 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1434 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1435 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1436 }
1437
1438 LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1439 << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1440 << "\n"
1441 << " from store ptr=" << *StoreEv << " at: " << *TheStore
1442 << "\n");
1443
1444 ORE.emit([&]() {
1445 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1446 NewCall->getDebugLoc(), Preheader)
1447 << "Formed a call to "
1448 << ore::NV("NewFunction", NewCall->getCalledFunction())
1449 << "() intrinsic from " << ore::NV("Inst", InstRemark)
1450 << " instruction in " << ore::NV("Function", TheStore->getFunction())
1451 << " function"
1453 << ore::NV("FromBlock", TheStore->getParent()->getName())
1454 << ore::NV("ToBlock", Preheader->getName());
1455 });
1456
1457 // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1458 // and anything that feeds into it.
1459 if (MSSAU)
1460 MSSAU->removeMemoryAccess(TheStore, true);
1461 deleteDeadInstruction(TheStore);
1462 if (MSSAU && VerifyMemorySSA)
1463 MSSAU->getMemorySSA()->verifyMemorySSA();
1464 if (UseMemMove)
1465 ++NumMemMove;
1466 else
1467 ++NumMemCpy;
1468 ExpCleaner.markResultUsed();
1469 return true;
1470}
1471
1472// When compiling for codesize we avoid idiom recognition for a multi-block loop
1473// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1474//
1475bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1476 bool IsLoopMemset) {
1477 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1478 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1479 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1480 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1481 << " avoided: multi-block top-level loop\n");
1482 return true;
1483 }
1484 }
1485
1486 return false;
1487}
1488
1489bool LoopIdiomRecognize::runOnNoncountableLoop() {
1490 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1491 << CurLoop->getHeader()->getParent()->getName()
1492 << "] Noncountable Loop %"
1493 << CurLoop->getHeader()->getName() << "\n");
1494
1495 return recognizePopcount() || recognizeAndInsertFFS() ||
1496 recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1497 recognizeShiftUntilLessThan();
1498}
1499
1500/// Check if the given conditional branch is based on the comparison between
1501/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1502/// true), the control yields to the loop entry. If the branch matches the
1503/// behavior, the variable involved in the comparison is returned. This function
1504/// will be called to see if the precondition and postcondition of the loop are
1505/// in desirable form.
1507 bool JmpOnZero = false) {
1508 if (!BI || !BI->isConditional())
1509 return nullptr;
1510
1511 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1512 if (!Cond)
1513 return nullptr;
1514
1515 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1516 if (!CmpZero || !CmpZero->isZero())
1517 return nullptr;
1518
1519 BasicBlock *TrueSucc = BI->getSuccessor(0);
1520 BasicBlock *FalseSucc = BI->getSuccessor(1);
1521 if (JmpOnZero)
1522 std::swap(TrueSucc, FalseSucc);
1523
1524 ICmpInst::Predicate Pred = Cond->getPredicate();
1525 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1526 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1527 return Cond->getOperand(0);
1528
1529 return nullptr;
1530}
1531
1532/// Check if the given conditional branch is based on an unsigned less-than
1533/// comparison between a variable and a constant, and if the comparison is false
1534/// the control yields to the loop entry. If the branch matches the behaviour,
1535/// the variable involved in the comparison is returned.
1537 APInt &Threshold) {
1538 if (!BI || !BI->isConditional())
1539 return nullptr;
1540
1541 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1542 if (!Cond)
1543 return nullptr;
1544
1545 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
1546 if (!CmpConst)
1547 return nullptr;
1548
1549 BasicBlock *FalseSucc = BI->getSuccessor(1);
1550 ICmpInst::Predicate Pred = Cond->getPredicate();
1551
1552 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
1553 Threshold = CmpConst->getValue();
1554 return Cond->getOperand(0);
1555 }
1556
1557 return nullptr;
1558}
1559
1560// Check if the recurrence variable `VarX` is in the right form to create
1561// the idiom. Returns the value coerced to a PHINode if so.
1563 BasicBlock *LoopEntry) {
1564 auto *PhiX = dyn_cast<PHINode>(VarX);
1565 if (PhiX && PhiX->getParent() == LoopEntry &&
1566 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1567 return PhiX;
1568 return nullptr;
1569}
1570
1571/// Return true if the idiom is detected in the loop.
1572///
1573/// Additionally:
1574/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1575/// or nullptr if there is no such.
1576/// 2) \p CntPhi is set to the corresponding phi node
1577/// or nullptr if there is no such.
1578/// 3) \p InitX is set to the value whose CTLZ could be used.
1579/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1580/// 5) \p Threshold is set to the constant involved in the unsigned less-than
1581/// comparison.
1582///
1583/// The core idiom we are trying to detect is:
1584/// \code
1585/// if (x0 < 2)
1586/// goto loop-exit // the precondition of the loop
1587/// cnt0 = init-val
1588/// do {
1589/// x = phi (x0, x.next); //PhiX
1590/// cnt = phi (cnt0, cnt.next)
1591///
1592/// cnt.next = cnt + 1;
1593/// ...
1594/// x.next = x >> 1; // DefX
1595/// } while (x >= 4)
1596/// loop-exit:
1597/// \endcode
1599 Intrinsic::ID &IntrinID,
1600 Value *&InitX, Instruction *&CntInst,
1601 PHINode *&CntPhi, Instruction *&DefX,
1602 APInt &Threshold) {
1603 BasicBlock *LoopEntry;
1604
1605 DefX = nullptr;
1606 CntInst = nullptr;
1607 CntPhi = nullptr;
1608 LoopEntry = *(CurLoop->block_begin());
1609
1610 // step 1: Check if the loop-back branch is in desirable form.
1612 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
1613 Threshold))
1614 DefX = dyn_cast<Instruction>(T);
1615 else
1616 return false;
1617
1618 // step 2: Check the recurrence of variable X
1619 if (!DefX || !isa<PHINode>(DefX))
1620 return false;
1621
1622 PHINode *VarPhi = cast<PHINode>(DefX);
1623 int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
1624 if (Idx == -1)
1625 return false;
1626
1627 DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
1628 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
1629 return false;
1630
1631 // step 3: detect instructions corresponding to "x.next = x >> 1"
1632 if (DefX->getOpcode() != Instruction::LShr)
1633 return false;
1634
1635 IntrinID = Intrinsic::ctlz;
1636 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1637 if (!Shft || !Shft->isOne())
1638 return false;
1639
1640 InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1641
1642 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1643 // or cnt.next = cnt + -1.
1644 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1645 // then all uses of "cnt.next" could be optimized to the trip count
1646 // plus "cnt0". Currently it is not optimized.
1647 // This step could be used to detect POPCNT instruction:
1648 // cnt.next = cnt + (x.next & 1)
1649 for (Instruction &Inst :
1650 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
1651 if (Inst.getOpcode() != Instruction::Add)
1652 continue;
1653
1654 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1655 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1656 continue;
1657
1658 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1659 if (!Phi)
1660 continue;
1661
1662 CntInst = &Inst;
1663 CntPhi = Phi;
1664 break;
1665 }
1666 if (!CntInst)
1667 return false;
1668
1669 return true;
1670}
1671
1672/// Return true iff the idiom is detected in the loop.
1673///
1674/// Additionally:
1675/// 1) \p CntInst is set to the instruction counting the population bit.
1676/// 2) \p CntPhi is set to the corresponding phi node.
1677/// 3) \p Var is set to the value whose population bits are being counted.
1678///
1679/// The core idiom we are trying to detect is:
1680/// \code
1681/// if (x0 != 0)
1682/// goto loop-exit // the precondition of the loop
1683/// cnt0 = init-val;
1684/// do {
1685/// x1 = phi (x0, x2);
1686/// cnt1 = phi(cnt0, cnt2);
1687///
1688/// cnt2 = cnt1 + 1;
1689/// ...
1690/// x2 = x1 & (x1 - 1);
1691/// ...
1692/// } while(x != 0);
1693///
1694/// loop-exit:
1695/// \endcode
1696static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1697 Instruction *&CntInst, PHINode *&CntPhi,
1698 Value *&Var) {
1699 // step 1: Check to see if the look-back branch match this pattern:
1700 // "if (a!=0) goto loop-entry".
1701 BasicBlock *LoopEntry;
1702 Instruction *DefX2, *CountInst;
1703 Value *VarX1, *VarX0;
1704 PHINode *PhiX, *CountPhi;
1705
1706 DefX2 = CountInst = nullptr;
1707 VarX1 = VarX0 = nullptr;
1708 PhiX = CountPhi = nullptr;
1709 LoopEntry = *(CurLoop->block_begin());
1710
1711 // step 1: Check if the loop-back branch is in desirable form.
1712 {
1713 if (Value *T = matchCondition(
1714 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1715 DefX2 = dyn_cast<Instruction>(T);
1716 else
1717 return false;
1718 }
1719
1720 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1721 {
1722 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1723 return false;
1724
1725 BinaryOperator *SubOneOp;
1726
1727 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1728 VarX1 = DefX2->getOperand(1);
1729 else {
1730 VarX1 = DefX2->getOperand(0);
1731 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1732 }
1733 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1734 return false;
1735
1736 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1737 if (!Dec ||
1738 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1739 (SubOneOp->getOpcode() == Instruction::Add &&
1740 Dec->isMinusOne()))) {
1741 return false;
1742 }
1743 }
1744
1745 // step 3: Check the recurrence of variable X
1746 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1747 if (!PhiX)
1748 return false;
1749
1750 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1751 {
1752 CountInst = nullptr;
1753 for (Instruction &Inst :
1754 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
1755 if (Inst.getOpcode() != Instruction::Add)
1756 continue;
1757
1758 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1759 if (!Inc || !Inc->isOne())
1760 continue;
1761
1762 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1763 if (!Phi)
1764 continue;
1765
1766 // Check if the result of the instruction is live of the loop.
1767 bool LiveOutLoop = false;
1768 for (User *U : Inst.users()) {
1769 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1770 LiveOutLoop = true;
1771 break;
1772 }
1773 }
1774
1775 if (LiveOutLoop) {
1776 CountInst = &Inst;
1777 CountPhi = Phi;
1778 break;
1779 }
1780 }
1781
1782 if (!CountInst)
1783 return false;
1784 }
1785
1786 // step 5: check if the precondition is in this form:
1787 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1788 {
1789 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1790 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1791 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1792 return false;
1793
1794 CntInst = CountInst;
1795 CntPhi = CountPhi;
1796 Var = T;
1797 }
1798
1799 return true;
1800}
1801
1802/// Return true if the idiom is detected in the loop.
1803///
1804/// Additionally:
1805/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1806/// or nullptr if there is no such.
1807/// 2) \p CntPhi is set to the corresponding phi node
1808/// or nullptr if there is no such.
1809/// 3) \p Var is set to the value whose CTLZ could be used.
1810/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1811///
1812/// The core idiom we are trying to detect is:
1813/// \code
1814/// if (x0 == 0)
1815/// goto loop-exit // the precondition of the loop
1816/// cnt0 = init-val;
1817/// do {
1818/// x = phi (x0, x.next); //PhiX
1819/// cnt = phi(cnt0, cnt.next);
1820///
1821/// cnt.next = cnt + 1;
1822/// ...
1823/// x.next = x >> 1; // DefX
1824/// ...
1825/// } while(x.next != 0);
1826///
1827/// loop-exit:
1828/// \endcode
1829static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1830 Intrinsic::ID &IntrinID, Value *&InitX,
1831 Instruction *&CntInst, PHINode *&CntPhi,
1832 Instruction *&DefX) {
1833 BasicBlock *LoopEntry;
1834 Value *VarX = nullptr;
1835
1836 DefX = nullptr;
1837 CntInst = nullptr;
1838 CntPhi = nullptr;
1839 LoopEntry = *(CurLoop->block_begin());
1840
1841 // step 1: Check if the loop-back branch is in desirable form.
1842 if (Value *T = matchCondition(
1843 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1844 DefX = dyn_cast<Instruction>(T);
1845 else
1846 return false;
1847
1848 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1849 if (!DefX || !DefX->isShift())
1850 return false;
1851 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1852 Intrinsic::ctlz;
1853 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1854 if (!Shft || !Shft->isOne())
1855 return false;
1856 VarX = DefX->getOperand(0);
1857
1858 // step 3: Check the recurrence of variable X
1859 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1860 if (!PhiX)
1861 return false;
1862
1863 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1864
1865 // Make sure the initial value can't be negative otherwise the ashr in the
1866 // loop might never reach zero which would make the loop infinite.
1867 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1868 return false;
1869
1870 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1871 // or cnt.next = cnt + -1.
1872 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1873 // then all uses of "cnt.next" could be optimized to the trip count
1874 // plus "cnt0". Currently it is not optimized.
1875 // This step could be used to detect POPCNT instruction:
1876 // cnt.next = cnt + (x.next & 1)
1877 for (Instruction &Inst :
1878 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
1879 if (Inst.getOpcode() != Instruction::Add)
1880 continue;
1881
1882 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1883 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1884 continue;
1885
1886 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1887 if (!Phi)
1888 continue;
1889
1890 CntInst = &Inst;
1891 CntPhi = Phi;
1892 break;
1893 }
1894 if (!CntInst)
1895 return false;
1896
1897 return true;
1898}
1899
1900// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1901// profitable if we delete the loop.
1902bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
1903 Value *InitX, bool ZeroCheck,
1904 size_t CanonicalSize) {
1905 const Value *Args[] = {InitX,
1906 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1907
1908 // @llvm.dbg doesn't count as they have no semantic effect.
1909 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1911 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1912
1913 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1916 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
1917 return false;
1918
1919 return true;
1920}
1921
1922/// Convert CTLZ / CTTZ idiom loop into countable loop.
1923/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
1924/// returns false.
1925bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
1926 Value *InitX, Instruction *DefX,
1927 PHINode *CntPhi,
1928 Instruction *CntInst) {
1929 bool IsCntPhiUsedOutsideLoop = false;
1930 for (User *U : CntPhi->users())
1931 if (!CurLoop->contains(cast<Instruction>(U))) {
1932 IsCntPhiUsedOutsideLoop = true;
1933 break;
1934 }
1935 bool IsCntInstUsedOutsideLoop = false;
1936 for (User *U : CntInst->users())
1937 if (!CurLoop->contains(cast<Instruction>(U))) {
1938 IsCntInstUsedOutsideLoop = true;
1939 break;
1940 }
1941 // If both CntInst and CntPhi are used outside the loop the profitability
1942 // is questionable.
1943 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1944 return false;
1945
1946 // For some CPUs result of CTLZ(X) intrinsic is undefined
1947 // when X is 0. If we can not guarantee X != 0, we need to check this
1948 // when expand.
1949 bool ZeroCheck = false;
1950 // It is safe to assume Preheader exist as it was checked in
1951 // parent function RunOnLoop.
1952 BasicBlock *PH = CurLoop->getLoopPreheader();
1953
1954 // If we are using the count instruction outside the loop, make sure we
1955 // have a zero check as a precondition. Without the check the loop would run
1956 // one iteration for before any check of the input value. This means 0 and 1
1957 // would have identical behavior in the original loop and thus
1958 if (!IsCntPhiUsedOutsideLoop) {
1959 auto *PreCondBB = PH->getSinglePredecessor();
1960 if (!PreCondBB)
1961 return false;
1962 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1963 if (!PreCondBI)
1964 return false;
1965 if (matchCondition(PreCondBI, PH) != InitX)
1966 return false;
1967 ZeroCheck = true;
1968 }
1969
1970 // FFS idiom loop has only 6 instructions:
1971 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1972 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1973 // %shr = ashr %n.addr.0, 1
1974 // %tobool = icmp eq %shr, 0
1975 // %inc = add nsw %i.0, 1
1976 // br i1 %tobool
1977 size_t IdiomCanonicalSize = 6;
1978 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
1979 return false;
1980
1981 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1982 DefX->getDebugLoc(), ZeroCheck,
1983 IsCntPhiUsedOutsideLoop);
1984 return true;
1985}
1986
1987/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1988/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1989/// trip count returns true; otherwise, returns false.
1990bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1991 // Give up if the loop has multiple blocks or multiple backedges.
1992 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1993 return false;
1994
1995 Intrinsic::ID IntrinID;
1996 Value *InitX;
1997 Instruction *DefX = nullptr;
1998 PHINode *CntPhi = nullptr;
1999 Instruction *CntInst = nullptr;
2000
2001 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
2002 DefX))
2003 return false;
2004
2005 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2006}
2007
2008bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2009 // Give up if the loop has multiple blocks or multiple backedges.
2010 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2011 return false;
2012
2013 Intrinsic::ID IntrinID;
2014 Value *InitX;
2015 Instruction *DefX = nullptr;
2016 PHINode *CntPhi = nullptr;
2017 Instruction *CntInst = nullptr;
2018
2019 APInt LoopThreshold;
2020 if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2021 CntPhi, DefX, LoopThreshold))
2022 return false;
2023
2024 if (LoopThreshold == 2) {
2025 // Treat as regular FFS.
2026 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2027 }
2028
2029 // Look for Floor Log2 Idiom.
2030 if (LoopThreshold != 4)
2031 return false;
2032
2033 // Abort if CntPhi is used outside of the loop.
2034 for (User *U : CntPhi->users())
2035 if (!CurLoop->contains(cast<Instruction>(U)))
2036 return false;
2037
2038 // It is safe to assume Preheader exist as it was checked in
2039 // parent function RunOnLoop.
2040 BasicBlock *PH = CurLoop->getLoopPreheader();
2041 auto *PreCondBB = PH->getSinglePredecessor();
2042 if (!PreCondBB)
2043 return false;
2044 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2045 if (!PreCondBI)
2046 return false;
2047
2048 APInt PreLoopThreshold;
2049 if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2050 PreLoopThreshold != 2)
2051 return false;
2052
2053 bool ZeroCheck = true;
2054
2055 // the loop has only 6 instructions:
2056 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2057 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2058 // %shr = ashr %n.addr.0, 1
2059 // %tobool = icmp ult %n.addr.0, C
2060 // %inc = add nsw %i.0, 1
2061 // br i1 %tobool
2062 size_t IdiomCanonicalSize = 6;
2063 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2064 return false;
2065
2066 // log2(x) = w − 1 − clz(x)
2067 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2068 DefX->getDebugLoc(), ZeroCheck,
2069 /*IsCntPhiUsedOutsideLoop=*/false,
2070 /*InsertSub=*/true);
2071 return true;
2072}
2073
2074/// Recognizes a population count idiom in a non-countable loop.
2075///
2076/// If detected, transforms the relevant code to issue the popcount intrinsic
2077/// function call, and returns true; otherwise, returns false.
2078bool LoopIdiomRecognize::recognizePopcount() {
2080 return false;
2081
2082 // Counting population are usually conducted by few arithmetic instructions.
2083 // Such instructions can be easily "absorbed" by vacant slots in a
2084 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
2085 // in a compact loop.
2086
2087 // Give up if the loop has multiple blocks or multiple backedges.
2088 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2089 return false;
2090
2091 BasicBlock *LoopBody = *(CurLoop->block_begin());
2092 if (LoopBody->size() >= 20) {
2093 // The loop is too big, bail out.
2094 return false;
2095 }
2096
2097 // It should have a preheader containing nothing but an unconditional branch.
2098 BasicBlock *PH = CurLoop->getLoopPreheader();
2099 if (!PH || &PH->front() != PH->getTerminator())
2100 return false;
2101 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
2102 if (!EntryBI || EntryBI->isConditional())
2103 return false;
2104
2105 // It should have a precondition block where the generated popcount intrinsic
2106 // function can be inserted.
2107 auto *PreCondBB = PH->getSinglePredecessor();
2108 if (!PreCondBB)
2109 return false;
2110 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2111 if (!PreCondBI || PreCondBI->isUnconditional())
2112 return false;
2113
2114 Instruction *CntInst;
2115 PHINode *CntPhi;
2116 Value *Val;
2117 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
2118 return false;
2119
2120 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
2121 return true;
2122}
2123
2125 const DebugLoc &DL) {
2126 Value *Ops[] = {Val};
2127 Type *Tys[] = {Val->getType()};
2128
2129 CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
2130 CI->setDebugLoc(DL);
2131
2132 return CI;
2133}
2134
2136 const DebugLoc &DL, bool ZeroCheck,
2137 Intrinsic::ID IID) {
2138 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2139 Type *Tys[] = {Val->getType()};
2140
2141 CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
2142 CI->setDebugLoc(DL);
2143
2144 return CI;
2145}
2146
2147/// Transform the following loop (Using CTLZ, CTTZ is similar):
2148/// loop:
2149/// CntPhi = PHI [Cnt0, CntInst]
2150/// PhiX = PHI [InitX, DefX]
2151/// CntInst = CntPhi + 1
2152/// DefX = PhiX >> 1
2153/// LOOP_BODY
2154/// Br: loop if (DefX != 0)
2155/// Use(CntPhi) or Use(CntInst)
2156///
2157/// Into:
2158/// If CntPhi used outside the loop:
2159/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2160/// Count = CountPrev + 1
2161/// else
2162/// Count = BitWidth(InitX) - CTLZ(InitX)
2163/// loop:
2164/// CntPhi = PHI [Cnt0, CntInst]
2165/// PhiX = PHI [InitX, DefX]
2166/// PhiCount = PHI [Count, Dec]
2167/// CntInst = CntPhi + 1
2168/// DefX = PhiX >> 1
2169/// Dec = PhiCount - 1
2170/// LOOP_BODY
2171/// Br: loop if (Dec != 0)
2172/// Use(CountPrev + Cnt0) // Use(CntPhi)
2173/// or
2174/// Use(Count + Cnt0) // Use(CntInst)
2175///
2176/// If LOOP_BODY is empty the loop will be deleted.
2177/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2178void LoopIdiomRecognize::transformLoopToCountable(
2179 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2180 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2181 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2182 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2183
2184 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2185 IRBuilder<> Builder(PreheaderBr);
2186 Builder.SetCurrentDebugLocation(DL);
2187
2188 // If there are no uses of CntPhi crate:
2189 // Count = BitWidth - CTLZ(InitX);
2190 // NewCount = Count;
2191 // If there are uses of CntPhi create:
2192 // NewCount = BitWidth - CTLZ(InitX >> 1);
2193 // Count = NewCount + 1;
2194 Value *InitXNext;
2195 if (IsCntPhiUsedOutsideLoop) {
2196 if (DefX->getOpcode() == Instruction::AShr)
2197 InitXNext = Builder.CreateAShr(InitX, 1);
2198 else if (DefX->getOpcode() == Instruction::LShr)
2199 InitXNext = Builder.CreateLShr(InitX, 1);
2200 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2201 InitXNext = Builder.CreateShl(InitX, 1);
2202 else
2203 llvm_unreachable("Unexpected opcode!");
2204 } else
2205 InitXNext = InitX;
2206 Value *Count =
2207 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2208 Type *CountTy = Count->getType();
2209 Count = Builder.CreateSub(
2210 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2211 if (InsertSub)
2212 Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
2213 Value *NewCount = Count;
2214 if (IsCntPhiUsedOutsideLoop)
2215 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2216
2217 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2218
2219 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2220 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2221 // If the counter was being incremented in the loop, add NewCount to the
2222 // counter's initial value, but only if the initial value is not zero.
2223 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2224 if (!InitConst || !InitConst->isZero())
2225 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2226 } else {
2227 // If the count was being decremented in the loop, subtract NewCount from
2228 // the counter's initial value.
2229 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2230 }
2231
2232 // Step 2: Insert new IV and loop condition:
2233 // loop:
2234 // ...
2235 // PhiCount = PHI [Count, Dec]
2236 // ...
2237 // Dec = PhiCount - 1
2238 // ...
2239 // Br: loop if (Dec != 0)
2240 BasicBlock *Body = *(CurLoop->block_begin());
2241 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2242 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2243
2244 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2245 TcPhi->insertBefore(Body->begin());
2246
2247 Builder.SetInsertPoint(LbCond);
2248 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2249 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2250
2251 TcPhi->addIncoming(Count, Preheader);
2252 TcPhi->addIncoming(TcDec, Body);
2253
2254 CmpInst::Predicate Pred =
2255 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2256 LbCond->setPredicate(Pred);
2257 LbCond->setOperand(0, TcDec);
2258 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2259
2260 // Step 3: All the references to the original counter outside
2261 // the loop are replaced with the NewCount
2262 if (IsCntPhiUsedOutsideLoop)
2263 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2264 else
2265 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2266
2267 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2268 // loop. The loop would otherwise not be deleted even if it becomes empty.
2269 SE->forgetLoop(CurLoop);
2270}
2271
2272void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2273 Instruction *CntInst,
2274 PHINode *CntPhi, Value *Var) {
2275 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2276 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2277 const DebugLoc &DL = CntInst->getDebugLoc();
2278
2279 // Assuming before transformation, the loop is following:
2280 // if (x) // the precondition
2281 // do { cnt++; x &= x - 1; } while(x);
2282
2283 // Step 1: Insert the ctpop instruction at the end of the precondition block
2284 IRBuilder<> Builder(PreCondBr);
2285 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2286 {
2287 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2288 NewCount = PopCntZext =
2289 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2290
2291 if (NewCount != PopCnt)
2292 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2293
2294 // TripCnt is exactly the number of iterations the loop has
2295 TripCnt = NewCount;
2296
2297 // If the population counter's initial value is not zero, insert Add Inst.
2298 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2299 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2300 if (!InitConst || !InitConst->isZero()) {
2301 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2302 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2303 }
2304 }
2305
2306 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2307 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2308 // function would be partial dead code, and downstream passes will drag
2309 // it back from the precondition block to the preheader.
2310 {
2311 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2312
2313 Value *Opnd0 = PopCntZext;
2314 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2315 if (PreCond->getOperand(0) != Var)
2316 std::swap(Opnd0, Opnd1);
2317
2318 ICmpInst *NewPreCond = cast<ICmpInst>(
2319 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2320 PreCondBr->setCondition(NewPreCond);
2321
2323 }
2324
2325 // Step 3: Note that the population count is exactly the trip count of the
2326 // loop in question, which enable us to convert the loop from noncountable
2327 // loop into a countable one. The benefit is twofold:
2328 //
2329 // - If the loop only counts population, the entire loop becomes dead after
2330 // the transformation. It is a lot easier to prove a countable loop dead
2331 // than to prove a noncountable one. (In some C dialects, an infinite loop
2332 // isn't dead even if it computes nothing useful. In general, DCE needs
2333 // to prove a noncountable loop finite before safely delete it.)
2334 //
2335 // - If the loop also performs something else, it remains alive.
2336 // Since it is transformed to countable form, it can be aggressively
2337 // optimized by some optimizations which are in general not applicable
2338 // to a noncountable loop.
2339 //
2340 // After this step, this loop (conceptually) would look like following:
2341 // newcnt = __builtin_ctpop(x);
2342 // t = newcnt;
2343 // if (x)
2344 // do { cnt++; x &= x-1; t--) } while (t > 0);
2345 BasicBlock *Body = *(CurLoop->block_begin());
2346 {
2347 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2348 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2349 Type *Ty = TripCnt->getType();
2350
2351 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2352 TcPhi->insertBefore(Body->begin());
2353
2354 Builder.SetInsertPoint(LbCond);
2355 Instruction *TcDec = cast<Instruction>(
2356 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2357 "tcdec", false, true));
2358
2359 TcPhi->addIncoming(TripCnt, PreHead);
2360 TcPhi->addIncoming(TcDec, Body);
2361
2362 CmpInst::Predicate Pred =
2363 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2364 LbCond->setPredicate(Pred);
2365 LbCond->setOperand(0, TcDec);
2366 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2367 }
2368
2369 // Step 4: All the references to the original population counter outside
2370 // the loop are replaced with the NewCount -- the value returned from
2371 // __builtin_ctpop().
2372 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2373
2374 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2375 // loop. The loop would otherwise not be deleted even if it becomes empty.
2376 SE->forgetLoop(CurLoop);
2377}
2378
2379/// Match loop-invariant value.
2380template <typename SubPattern_t> struct match_LoopInvariant {
2381 SubPattern_t SubPattern;
2382 const Loop *L;
2383
2384 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2385 : SubPattern(SP), L(L) {}
2386
2387 template <typename ITy> bool match(ITy *V) {
2388 return L->isLoopInvariant(V) && SubPattern.match(V);
2389 }
2390};
2391
2392/// Matches if the value is loop-invariant.
2393template <typename Ty>
2394inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2395 return match_LoopInvariant<Ty>(M, L);
2396}
2397
2398/// Return true if the idiom is detected in the loop.
2399///
2400/// The core idiom we are trying to detect is:
2401/// \code
2402/// entry:
2403/// <...>
2404/// %bitmask = shl i32 1, %bitpos
2405/// br label %loop
2406///
2407/// loop:
2408/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2409/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2410/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2411/// %x.next = shl i32 %x.curr, 1
2412/// <...>
2413/// br i1 %x.curr.isbitunset, label %loop, label %end
2414///
2415/// end:
2416/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2417/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2418/// <...>
2419/// \endcode
2420static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2421 Value *&BitMask, Value *&BitPos,
2422 Value *&CurrX, Instruction *&NextX) {
2424 " Performing shift-until-bittest idiom detection.\n");
2425
2426 // Give up if the loop has multiple blocks or multiple backedges.
2427 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2428 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2429 return false;
2430 }
2431
2432 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2433 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2434 assert(LoopPreheaderBB && "There is always a loop preheader.");
2435
2436 using namespace PatternMatch;
2437
2438 // Step 1: Check if the loop backedge is in desirable form.
2439
2440 CmpPredicate Pred;
2441 Value *CmpLHS, *CmpRHS;
2442 BasicBlock *TrueBB, *FalseBB;
2443 if (!match(LoopHeaderBB->getTerminator(),
2444 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2445 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2446 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2447 return false;
2448 }
2449
2450 // Step 2: Check if the backedge's condition is in desirable form.
2451
2452 auto MatchVariableBitMask = [&]() {
2453 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2454 match(CmpLHS,
2455 m_c_And(m_Value(CurrX),
2457 m_Value(BitMask),
2458 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2459 CurLoop))));
2460 };
2461 auto MatchConstantBitMask = [&]() {
2462 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2463 match(CmpLHS, m_And(m_Value(CurrX),
2464 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2465 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2466 };
2467 auto MatchDecomposableConstantBitMask = [&]() {
2468 auto Res = llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred);
2469 if (Res && Res->Mask.isPowerOf2()) {
2470 assert(ICmpInst::isEquality(Res->Pred));
2471 Pred = Res->Pred;
2472 CurrX = Res->X;
2473 BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
2474 BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
2475 return true;
2476 }
2477 return false;
2478 };
2479
2480 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2481 !MatchDecomposableConstantBitMask()) {
2482 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2483 return false;
2484 }
2485
2486 // Step 3: Check if the recurrence is in desirable form.
2487 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2488 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2489 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2490 return false;
2491 }
2492
2493 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2494 NextX =
2495 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2496
2497 assert(CurLoop->isLoopInvariant(BaseX) &&
2498 "Expected BaseX to be available in the preheader!");
2499
2500 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2501 // FIXME: support right-shift?
2502 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2503 return false;
2504 }
2505
2506 // Step 4: Check if the backedge's destinations are in desirable form.
2507
2509 "Should only get equality predicates here.");
2510
2511 // cmp-br is commutative, so canonicalize to a single variant.
2512 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2513 Pred = ICmpInst::getInversePredicate(Pred);
2514 std::swap(TrueBB, FalseBB);
2515 }
2516
2517 // We expect to exit loop when comparison yields false,
2518 // so when it yields true we should branch back to loop header.
2519 if (TrueBB != LoopHeaderBB) {
2520 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2521 return false;
2522 }
2523
2524 // Okay, idiom checks out.
2525 return true;
2526}
2527
2528/// Look for the following loop:
2529/// \code
2530/// entry:
2531/// <...>
2532/// %bitmask = shl i32 1, %bitpos
2533/// br label %loop
2534///
2535/// loop:
2536/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2537/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2538/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2539/// %x.next = shl i32 %x.curr, 1
2540/// <...>
2541/// br i1 %x.curr.isbitunset, label %loop, label %end
2542///
2543/// end:
2544/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2545/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2546/// <...>
2547/// \endcode
2548///
2549/// And transform it into:
2550/// \code
2551/// entry:
2552/// %bitmask = shl i32 1, %bitpos
2553/// %lowbitmask = add i32 %bitmask, -1
2554/// %mask = or i32 %lowbitmask, %bitmask
2555/// %x.masked = and i32 %x, %mask
2556/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2557/// i1 true)
2558/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2559/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2560/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2561/// %tripcount = add i32 %backedgetakencount, 1
2562/// %x.curr = shl i32 %x, %backedgetakencount
2563/// %x.next = shl i32 %x, %tripcount
2564/// br label %loop
2565///
2566/// loop:
2567/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2568/// %loop.iv.next = add nuw i32 %loop.iv, 1
2569/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2570/// <...>
2571/// br i1 %loop.ivcheck, label %end, label %loop
2572///
2573/// end:
2574/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2575/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2576/// <...>
2577/// \endcode
2578bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2579 bool MadeChange = false;
2580
2581 Value *X, *BitMask, *BitPos, *XCurr;
2582 Instruction *XNext;
2583 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2584 XNext)) {
2586 " shift-until-bittest idiom detection failed.\n");
2587 return MadeChange;
2588 }
2589 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2590
2591 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2592 // but is it profitable to transform?
2593
2594 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2595 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2596 assert(LoopPreheaderBB && "There is always a loop preheader.");
2597
2598 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2599 assert(SuccessorBB && "There is only a single successor.");
2600
2601 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2602 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2603
2604 Intrinsic::ID IntrID = Intrinsic::ctlz;
2605 Type *Ty = X->getType();
2606 unsigned Bitwidth = Ty->getScalarSizeInBits();
2607
2610
2611 // The rewrite is considered to be unprofitable iff and only iff the
2612 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2613 // making the loop countable, even if nothing else changes.
2615 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
2619 " Intrinsic is too costly, not beneficial\n");
2620 return MadeChange;
2621 }
2622 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2624 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2625 return MadeChange;
2626 }
2627
2628 // Ok, transform appears worthwhile.
2629 MadeChange = true;
2630
2631 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
2632 // BitMask may be computed from BitPos, Freeze BitPos so we can increase
2633 // it's use count.
2634 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
2635 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
2636 InsertPt = BitPosI->getInsertionPointAfterDef();
2637 else
2638 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2639 if (!InsertPt)
2640 return false;
2641 FreezeInst *BitPosFrozen =
2642 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
2643 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
2644 return U.getUser() != BitPosFrozen;
2645 });
2646 BitPos = BitPosFrozen;
2647 }
2648
2649 // Step 1: Compute the loop trip count.
2650
2651 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2652 BitPos->getName() + ".lowbitmask");
2653 Value *Mask =
2654 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2655 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2656 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2657 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
2658 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2659 Value *XMaskedNumActiveBits = Builder.CreateSub(
2660 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2661 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2662 /*HasNSW=*/Bitwidth != 2);
2663 Value *XMaskedLeadingOnePos =
2664 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2665 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2666 /*HasNSW=*/Bitwidth > 2);
2667
2668 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2669 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2670 /*HasNUW=*/true, /*HasNSW=*/true);
2671 // We know loop's backedge-taken count, but what's loop's trip count?
2672 // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2673 Value *LoopTripCount =
2674 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2675 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2676 /*HasNSW=*/Bitwidth != 2);
2677
2678 // Step 2: Compute the recurrence's final value without a loop.
2679
2680 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2681 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2682 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2683 NewX->takeName(XCurr);
2684 if (auto *I = dyn_cast<Instruction>(NewX))
2685 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2686
2687 Value *NewXNext;
2688 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2689 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2690 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2691 // that isn't the case, we'll need to emit an alternative, safe IR.
2692 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2696 Ty->getScalarSizeInBits() - 1))))
2697 NewXNext = Builder.CreateShl(X, LoopTripCount);
2698 else {
2699 // Otherwise, just additionally shift by one. It's the smallest solution,
2700 // alternatively, we could check that NewX is INT_MIN (or BitPos is )
2701 // and select 0 instead.
2702 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2703 }
2704
2705 NewXNext->takeName(XNext);
2706 if (auto *I = dyn_cast<Instruction>(NewXNext))
2707 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2708
2709 // Step 3: Adjust the successor basic block to recieve the computed
2710 // recurrence's final value instead of the recurrence itself.
2711
2712 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2713 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2714
2715 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2716
2717 // The new canonical induction variable.
2718 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
2719 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2720
2721 // The induction itself.
2722 // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2723 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2724 auto *IVNext =
2725 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2726 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2727
2728 // The loop trip count check.
2729 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2730 CurLoop->getName() + ".ivcheck");
2731 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2732 LoopHeaderBB->getTerminator()->eraseFromParent();
2733
2734 // Populate the IV PHI.
2735 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2736 IV->addIncoming(IVNext, LoopHeaderBB);
2737
2738 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2739 // loop. The loop would otherwise not be deleted even if it becomes empty.
2740
2741 SE->forgetLoop(CurLoop);
2742
2743 // Other passes will take care of actually deleting the loop if possible.
2744
2745 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2746
2747 ++NumShiftUntilBitTest;
2748 return MadeChange;
2749}
2750
2751/// Return true if the idiom is detected in the loop.
2752///
2753/// The core idiom we are trying to detect is:
2754/// \code
2755/// entry:
2756/// <...>
2757/// %start = <...>
2758/// %extraoffset = <...>
2759/// <...>
2760/// br label %for.cond
2761///
2762/// loop:
2763/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2764/// %nbits = add nsw i8 %iv, %extraoffset
2765/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2766/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2767/// %iv.next = add i8 %iv, 1
2768/// <...>
2769/// br i1 %val.shifted.iszero, label %end, label %loop
2770///
2771/// end:
2772/// %iv.res = phi i8 [ %iv, %loop ] <...>
2773/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2774/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2775/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2776/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2777/// <...>
2778/// \endcode
2780 Instruction *&ValShiftedIsZero,
2781 Intrinsic::ID &IntrinID, Instruction *&IV,
2782 Value *&Start, Value *&Val,
2783 const SCEV *&ExtraOffsetExpr,
2784 bool &InvertedCond) {
2786 " Performing shift-until-zero idiom detection.\n");
2787
2788 // Give up if the loop has multiple blocks or multiple backedges.
2789 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2790 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2791 return false;
2792 }
2793
2794 Instruction *ValShifted, *NBits, *IVNext;
2795 Value *ExtraOffset;
2796
2797 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2798 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2799 assert(LoopPreheaderBB && "There is always a loop preheader.");
2800
2801 using namespace PatternMatch;
2802
2803 // Step 1: Check if the loop backedge, condition is in desirable form.
2804
2805 CmpPredicate Pred;
2806 BasicBlock *TrueBB, *FalseBB;
2807 if (!match(LoopHeaderBB->getTerminator(),
2808 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2809 m_BasicBlock(FalseBB))) ||
2810 !match(ValShiftedIsZero,
2811 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2812 !ICmpInst::isEquality(Pred)) {
2813 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2814 return false;
2815 }
2816
2817 // Step 2: Check if the comparison's operand is in desirable form.
2818 // FIXME: Val could be a one-input PHI node, which we should look past.
2819 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2820 m_Instruction(NBits)))) {
2821 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2822 return false;
2823 }
2824 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2825 : Intrinsic::ctlz;
2826
2827 // Step 3: Check if the shift amount is in desirable form.
2828
2829 if (match(NBits, m_c_Add(m_Instruction(IV),
2830 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2831 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2832 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2833 else if (match(NBits,
2835 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2836 NBits->hasNoSignedWrap())
2837 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2838 else {
2839 IV = NBits;
2840 ExtraOffsetExpr = SE->getZero(NBits->getType());
2841 }
2842
2843 // Step 4: Check if the recurrence is in desirable form.
2844 auto *IVPN = dyn_cast<PHINode>(IV);
2845 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2846 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2847 return false;
2848 }
2849
2850 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2851 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2852
2853 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2854 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2855 return false;
2856 }
2857
2858 // Step 4: Check if the backedge's destinations are in desirable form.
2859
2861 "Should only get equality predicates here.");
2862
2863 // cmp-br is commutative, so canonicalize to a single variant.
2864 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2865 if (InvertedCond) {
2866 Pred = ICmpInst::getInversePredicate(Pred);
2867 std::swap(TrueBB, FalseBB);
2868 }
2869
2870 // We expect to exit loop when comparison yields true,
2871 // so when it yields false we should branch back to loop header.
2872 if (FalseBB != LoopHeaderBB) {
2873 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2874 return false;
2875 }
2876
2877 // The new, countable, loop will certainly only run a known number of
2878 // iterations, It won't be infinite. But the old loop might be infinite
2879 // under certain conditions. For logical shifts, the value will become zero
2880 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2881 // right-shift, iff the sign bit was set, the value will never become zero,
2882 // and the loop may never finish.
2883 if (ValShifted->getOpcode() == Instruction::AShr &&
2884 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2885 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2886 return false;
2887 }
2888
2889 // Okay, idiom checks out.
2890 return true;
2891}
2892
2893/// Look for the following loop:
2894/// \code
2895/// entry:
2896/// <...>
2897/// %start = <...>
2898/// %extraoffset = <...>
2899/// <...>
2900/// br label %for.cond
2901///
2902/// loop:
2903/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2904/// %nbits = add nsw i8 %iv, %extraoffset
2905/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2906/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2907/// %iv.next = add i8 %iv, 1
2908/// <...>
2909/// br i1 %val.shifted.iszero, label %end, label %loop
2910///
2911/// end:
2912/// %iv.res = phi i8 [ %iv, %loop ] <...>
2913/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2914/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2915/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2916/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2917/// <...>
2918/// \endcode
2919///
2920/// And transform it into:
2921/// \code
2922/// entry:
2923/// <...>
2924/// %start = <...>
2925/// %extraoffset = <...>
2926/// <...>
2927/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2928/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2929/// %extraoffset.neg = sub i8 0, %extraoffset
2930/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2931/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2932/// %loop.tripcount = sub i8 %iv.final, %start
2933/// br label %loop
2934///
2935/// loop:
2936/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2937/// %loop.iv.next = add i8 %loop.iv, 1
2938/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2939/// %iv = add i8 %loop.iv, %start
2940/// <...>
2941/// br i1 %loop.ivcheck, label %end, label %loop
2942///
2943/// end:
2944/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2945/// <...>
2946/// \endcode
2947bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2948 bool MadeChange = false;
2949
2950 Instruction *ValShiftedIsZero;
2951 Intrinsic::ID IntrID;
2952 Instruction *IV;
2953 Value *Start, *Val;
2954 const SCEV *ExtraOffsetExpr;
2955 bool InvertedCond;
2956 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2957 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2959 " shift-until-zero idiom detection failed.\n");
2960 return MadeChange;
2961 }
2962 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2963
2964 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2965 // but is it profitable to transform?
2966
2967 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2968 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2969 assert(LoopPreheaderBB && "There is always a loop preheader.");
2970
2971 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2972 assert(SuccessorBB && "There is only a single successor.");
2973
2974 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2975 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2976
2977 Type *Ty = Val->getType();
2978 unsigned Bitwidth = Ty->getScalarSizeInBits();
2979
2982
2983 // The rewrite is considered to be unprofitable iff and only iff the
2984 // intrinsic we'll use are not cheap. Note that we are okay with *just*
2985 // making the loop countable, even if nothing else changes.
2987 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
2991 " Intrinsic is too costly, not beneficial\n");
2992 return MadeChange;
2993 }
2994
2995 // Ok, transform appears worthwhile.
2996 MadeChange = true;
2997
2998 bool OffsetIsZero = false;
2999 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
3000 OffsetIsZero = ExtraOffsetExprC->isZero();
3001
3002 // Step 1: Compute the loop's final IV value / trip count.
3003
3004 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3005 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
3006 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
3007 Value *ValNumActiveBits = Builder.CreateSub(
3008 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
3009 Val->getName() + ".numactivebits", /*HasNUW=*/true,
3010 /*HasNSW=*/Bitwidth != 2);
3011
3012 SCEVExpander Expander(*SE, *DL, "loop-idiom");
3013 Expander.setInsertPoint(&*Builder.GetInsertPoint());
3014 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
3015
3016 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3017 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
3018 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
3019 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
3020 {ValNumActiveBitsOffset, Start},
3021 /*FMFSource=*/nullptr, "iv.final");
3022
3023 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
3024 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
3025 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
3026 // FIXME: or when the offset was `add nuw`
3027
3028 // We know loop's backedge-taken count, but what's loop's trip count?
3029 Value *LoopTripCount =
3030 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3031 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3032 /*HasNSW=*/Bitwidth != 2);
3033
3034 // Step 2: Adjust the successor basic block to recieve the original
3035 // induction variable's final value instead of the orig. IV itself.
3036
3037 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
3038
3039 // Step 3: Rewrite the loop into a countable form, with canonical IV.
3040
3041 // The new canonical induction variable.
3042 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3043 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3044
3045 // The induction itself.
3046 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
3047 auto *CIVNext =
3048 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
3049 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3050
3051 // The loop trip count check.
3052 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
3053 CurLoop->getName() + ".ivcheck");
3054 auto *NewIVCheck = CIVCheck;
3055 if (InvertedCond) {
3056 NewIVCheck = Builder.CreateNot(CIVCheck);
3057 NewIVCheck->takeName(ValShiftedIsZero);
3058 }
3059
3060 // The original IV, but rebased to be an offset to the CIV.
3061 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
3062 /*HasNSW=*/true); // FIXME: what about NUW?
3063 IVDePHId->takeName(IV);
3064
3065 // The loop terminator.
3066 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3067 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
3068 LoopHeaderBB->getTerminator()->eraseFromParent();
3069
3070 // Populate the IV PHI.
3071 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3072 CIV->addIncoming(CIVNext, LoopHeaderBB);
3073
3074 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3075 // loop. The loop would otherwise not be deleted even if it becomes empty.
3076
3077 SE->forgetLoop(CurLoop);
3078
3079 // Step 5: Try to cleanup the loop's body somewhat.
3080 IV->replaceAllUsesWith(IVDePHId);
3081 IV->eraseFromParent();
3082
3083 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
3084 ValShiftedIsZero->eraseFromParent();
3085
3086 // Other passes will take care of actually deleting the loop if possible.
3087
3088 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3089
3090 ++NumShiftUntilZero;
3091 return MadeChange;
3092}
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
DXIL Resource Access
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX, APInt &Threshold)
Return true if the idiom is detected in the loop.
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
static Value * matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry, APInt &Threshold)
Check if the given conditional branch is based on an unsigned less-than comparison between a variable...
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
#define DEBUG_TYPE
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
static void deleteDeadInstruction(Instruction *I)
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden)
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition: blake3_impl.h:78
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:78
std::optional< uint64_t > tryZExtValue() const
Get zero extended value if possible.
Definition: APInt.h:1532
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:461
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:250
InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:381
const Instruction & front() const
Definition: BasicBlock.h:484
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:481
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
Definition: BasicBlock.cpp:450
size_t size() const
Definition: BasicBlock.h:482
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:240
BinaryOps getOpcode() const
Definition: InstrTypes.h:370
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:766
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2663
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:220
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:214
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:880
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
NodeT * getRoot() const
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
This class represents a freeze function that returns random concrete value if an operand is either a ...
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:143
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:232
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:480
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:99
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:68
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1750
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:310
bool isShift() const
Definition: Instruction.h:318
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:508
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:205
bool isUnordered() const
Definition: Instructions.h:249
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
block_iterator block_begin() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
StringRef getName() const
Definition: LoopInfo.h:388
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:109
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:49
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:45
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Align getAlign() const
Definition: Instructions.h:333
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Basic
The cost of a typical 'add' instruction.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:592
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1094
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition: BTF.h:61
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:619
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:245
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:189
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eg/ne/...) to Threshold.
Definition: PatternMatch.h:698
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:543
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1162
@ None
Definition: CodeGenData.h:106
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:84
std::optional< DecomposedBitTest > decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, bool LookThroughTrunc=true, bool AllowNonZeroC=false)
Decompose an icmp into the form ((X & Mask) pred C) if possible.
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:764
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:784
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:787
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:781
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:790
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition: Metadata.h:831
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)