//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
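//
// For example, a loop of the form
//
//   for (i = 0; i < n; ++i)
//     a[i] = 0;
//
// is replaced with a single memset of the n bytes of 'a' in the loop
// preheader (a sketch; the pass actually emits the llvm.memset intrinsic).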
//
// If compiling for code size, we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize: memcmp, etc.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;
using namespace SCEVPatternMatch;
#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bit test' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");
bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Strlen;
static cl::opt<bool, true>
    DisableLIRPStrlen("disable-loop-idiom-strlen",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to strlen."),
                      cl::location(DisableLIRP::Strlen), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Wcslen;
static cl::opt<bool, true>
    EnableLIRPWcslen("enable-loop-idiom-wcslen",
                     cl::desc("Proceed with loop idiom recognize pass, "
                              "enable conversion of loop(s) to wcslen."),
                     cl::location(DisableLIRP::Wcslen), cl::init(false),
                     cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceMemsetPatternIntrinsic(
    "loop-idiom-force-memset-pattern-intrinsic",
    cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
    cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  const TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              const TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
                               bool ZeroCheck, size_t CanonicalSize);
  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                             Instruction *DefX, PHINode *CntPhi,
                             Instruction *CntInst);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  bool recognizeShiftUntilLessThan();
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop,
                                bool InsertSub = false);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();
  bool recognizeAndInsertStrLen();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it; just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy" || Name == "strlen" ||
      Name == "wcslen")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  // TODO: Unconditionally enable use of the memset pattern intrinsic (or at
  // least, opt-in via target hook) once we are confident it will never result
  // in worse codegen than without. For now, use it only when the target
  // supports the memset_pattern16 libcall (unless this is overridden by a
  // command line option).
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (BECount->isZero())
    return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset.pattern intrinsic, return the Constant that should
/// be passed in. Otherwise, return null.
///
/// TODO: this function could allow more constants than it does today (e.g.
/// those over 16 bytes), now that it has transitioned to being used for the
/// memset.pattern intrinsic rather than directly for the memset_pattern16
/// libcall.
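///
/// For example, a strided store of the i32 constant 0x01020304 is not a
/// bytewise splat, but this function returns it as a pattern constant that
/// memset.pattern can replicate across the stored range.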
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // For now, don't handle types that aren't ints, floats, or pointers.
  Type *CTy = C->getType();
  if (!CTy->isIntOrPtrTy() && !CTy->isFloatingPointTy())
    return nullptr;

  return C;
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(StorePtr);
  const SCEVConstant *Stride;
  if (!match(StoreEv, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Stride),
                                          m_SpecificLoop(CurLoop))))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic stores are not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) &&
      !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    APInt StrideAP = Stride->getAPInt();
    if (StoreSize != StrideAP && StoreSize != -StrideAP)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the
    // current loop, which indicates a strided load. If we have something
    // else, it's a random load we can't handle.
    const SCEV *LoadEv = SE->getSCEV(LI->getPointerOperand());

    // The store and load must share the same stride.
    if (!match(LoadEv, m_scev_AffineAddRec(m_SCEV(), m_scev_Specific(Stride),
                                           m_SpecificLoop(CurLoop))))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
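  // For example (a sketch), a hand-unrolled loop storing to adjacent struct
  // fields,
  //   p[i].a = 0; p[i].b = 0;
  // yields two stores with a common base that can merge into one memset.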
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
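    // E.g. with i == 2 and e == 6, IndexQueue is {3, 4, 5, 1, 0}.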
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert_range(AdjacentStores);
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !MCI->isForceInlined()) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(Dest);
  const SCEV *LoadEv = SE->getSCEV(Source);
  const APInt *StoreStrideValue, *LoadStrideValue;
  if (!match(StoreEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StoreStrideValue),
                                 m_SpecificLoop(CurLoop))) ||
      !match(LoadEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(LoadStrideValue),
                                 m_SpecificLoop(CurLoop))))
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Huge stride value - give up.
  if (StoreStrideValue->getBitWidth() > 64 ||
      LoadStrideValue->getBitWidth() > 64)
    return false;

  if (SizeInBytes != *StoreStrideValue && SizeInBytes != -*StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI,
      cast<SCEVAddRecExpr>(StoreEv), cast<SCEVAddRecExpr>(LoadEv), BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *Ev = SE->getSCEV(Pointer);
  const SCEV *PointerStrideSCEV;
  if (!match(Ev, m_scev_AffineAddRec(m_SCEV(), m_SCEV(PointerStrideSCEV),
                                     m_SpecificLoop(CurLoop)))) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const APInt *Stride;
    if (!match(PointerStrideSCEV, m_scev_APInt(Stride)))
      return false;

    if (SizeInBytes != *Stride && SizeInBytes != -*Stride)
      return false;

    IsNegStride = SizeInBytes == -*Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare the positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs,
                                 cast<SCEVAddRecExpr>(Ev), BECount, IsNegStride,
                                 /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location
  // starts at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize.
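  // For example, with BECount 9 and a 4-byte store per iteration, the loop
  // accesses exactly (9 + 1) * 4 = 40 bytes.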
  const APInt *BECst, *ConstSize;
  if (match(BECount, m_scev_APInt(BECst)) &&
      match(StoreSizeSCEV, m_scev_APInt(ConstSize))) {
    std::optional<uint64_t> BEInt = BECst->tryZExtValue();
    std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
    // FIXME: Should this check for overflow?
    if (BEInt && SizeInt)
      AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
  }

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] in a loop of 100 iterations will
  // always return MayAlias with a store to &A[100]; we need StoreLoc to be
  // "A" with a size of 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
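// For example (a sketch): for a loop that stores one byte each to a[n-1],
// a[n-2], ..., a[0], Start is &a[n-1] and BECount is n-1, so the recomputed
// base pointer is &a[n-1] - (n-1) = &a[0].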
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index * store size
  return SE->getMinusSCEV(Start, Index);
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
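///
/// Conceptually the result is (BECount + 1) * StoreSize: the trip count times
/// the number of bytes stored per iteration.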
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;
  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  // MemsetArg is the number of bytes for the memset libcall, and the number
  // of pattern repetitions if the memset.pattern intrinsic is being used.
  Value *MemsetArg;
  std::optional<int64_t> BytesWritten;

  if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
    const SCEV *TripCountS =
        SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop);
    if (!Expander.isSafeToExpand(TripCountS))
      return Changed;
    const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
    if (!ConstStoreSize)
      return Changed;
    Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy,
                                              Preheader->getTerminator());
    uint64_t PatternRepsPerTrip =
        (ConstStoreSize->getValue()->getZExtValue() * 8) /
        DL->getTypeSizeInBits(PatternValue->getType());
    // If ConstStoreSize is not equal to the width of PatternValue, then
    // MemsetArg is TripCount * (ConstStoreSize / PatternValueWidth). Else
    // MemsetArg is just TripCount.
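    // For example (a sketch), with a 16-byte store of an i32 pattern,
    // PatternRepsPerTrip is (16 * 8) / 32 = 4, so MemsetArg is TripCount * 4.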
    MemsetArg =
        PatternRepsPerTrip == 1
            ? TripCount
            : Builder.CreateMul(TripCount,
                                Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
                                                PatternRepsPerTrip));
    if (auto *CI = dyn_cast<ConstantInt>(TripCount))
      BytesWritten =
          CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();

  } else {
    const SCEV *NumBytesS =
        getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

    // TODO: ideally we should still be able to generate memset if SCEV expander
    // is taught to generate the dependencies at the latest point.
    if (!Expander.isSafeToExpand(NumBytesS))
      return Changed;
    MemsetArg =
        Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
    if (auto *CI = dyn_cast<ConstantInt>(MemsetArg))
      BytesWritten = CI->getZExtValue();
  }
  assert(MemsetArg && "MemsetArg should have been set");

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (BytesWritten)
    AATags = AATags.extendTo(BytesWritten.value());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg,
                                   MaybeAlign(StoreAlignment),
                                   /*isVolatile=*/false, AATags);
  } else if (ForceMemsetPatternIntrinsic ||
             isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");

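    // A sketch of the call this emits (exact type mangling depends on the
    // pattern and index types; illustrative only):
    //   call void @llvm.experimental.memset.pattern.p0.i32.i64(
    //       ptr %base, i32 <pattern>, i64 %count, i1 false)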
    NewCall = Builder.CreateIntrinsic(
        Intrinsic::experimental_memset_pattern,
        {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
        {BasePtr, PatternValue, MemsetArg,
         ConstantInt::getFalse(M->getContext())});
    if (StoreAlignment)
      cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment);
    NewCall->setAAMetadata(AATags);
  } else {
    // Neither a memset, nor memset_pattern16.
    return Changed;
  }

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
                    StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (auto *MCI = dyn_cast<MemCpyInst>(TheStore); MCI && MCI->isForceInlined())
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant sizes; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
                                              Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;

  if (IsAtomic) {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;

    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;
  }

  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  // If the load or store is atomic, then it must necessarily be unordered
  // by previous checks.
  if (!IsAtomic) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
                                      LoadAlign, NumBytes,
                                      /*isVolatile=*/false, AATags);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags);
  } else {
    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
         recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
}

1528 /// Check if the given conditional branch is based on a comparison between
1529 /// a variable and zero: if the variable is non-zero (or zero, when JmpOnZero
1530 /// is true), control yields to the loop entry. If the branch matches this
1531 /// behavior, the variable involved in the comparison is returned. This
1532 /// function will be called to see if the precondition and postcondition of
1533 /// the loop are in desirable form.
1534 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1535 bool JmpOnZero = false) {
1536 if (!BI || !BI->isConditional())
1537 return nullptr;
1538
1539 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1540 if (!Cond)
1541 return nullptr;
1542
1543 auto *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1544 if (!CmpZero || !CmpZero->isZero())
1545 return nullptr;
1546
1547 BasicBlock *TrueSucc = BI->getSuccessor(0);
1548 BasicBlock *FalseSucc = BI->getSuccessor(1);
1549 if (JmpOnZero)
1550 std::swap(TrueSucc, FalseSucc);
1551
1552 ICmpInst::Predicate Pred = Cond->getPredicate();
1553 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1554 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1555 return Cond->getOperand(0);
1556
1557 return nullptr;
1558}
1559
1560namespace {
1561
1562class StrlenVerifier {
1563public:
1564 explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
1565 const TargetLibraryInfo *TLI)
1566 : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1567
1568 bool isValidStrlenIdiom() {
1569 // Give up if the loop has multiple blocks, multiple backedges, or
1570 // multiple exit blocks
1571 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
1572 !CurLoop->getUniqueExitBlock())
1573 return false;
1574
1575 // It should have a preheader and a branch instruction.
1576 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1577 if (!Preheader)
1578 return false;
1579
1580 BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
1581 if (!EntryBI)
1582 return false;
1583
1584 // The loop exit must be conditioned on an icmp against 0, the null
1585 // terminator. The icmp operand has to be a load whose pointer operand
1586 // is incremented by 1 each iteration of the loop.
1587 BasicBlock *LoopBody = *CurLoop->block_begin();
1588
1589 // Skip if the body is too big as it most likely is not a strlen idiom.
1590 if (!LoopBody || LoopBody->size() >= 15)
1591 return false;
1592
1593 BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
1594 Value *LoopCond = matchCondition(LoopTerm, LoopBody);
1595 if (!LoopCond)
1596 return false;
1597
1598 LoadInst *LoopLoad = dyn_cast<LoadInst>(LoopCond);
1599 if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
1600 return false;
1601
1602 OperandType = LoopLoad->getType();
1603 if (!OperandType || !OperandType->isIntegerTy())
1604 return false;
1605
1606 // See if the pointer expression is an AddRec with a constant step a, i.e.
1607 // of the form {n,+,a}, where a is the byte width of the char type.
1608 Value *IncPtr = LoopLoad->getPointerOperand();
1609 const SCEV *LoadEv = SE->getSCEV(IncPtr);
1610 const APInt *Step;
1611 if (!match(LoadEv,
1612 m_scev_AffineAddRec(m_SCEV(LoadBaseEv), m_scev_APInt(Step))))
1613 return false;
1614
1615 LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1616
1617 unsigned StepSize = Step->getZExtValue();
1618
1619 // Verify that StepSize is consistent with platform char width.
1620 OpWidth = OperandType->getIntegerBitWidth();
1621 unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
1622 if (OpWidth != StepSize * 8)
1623 return false;
1624 if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
1625 return false;
1626 if (OpWidth >= 16)
1627 if (OpWidth != WcharSize * 8)
1628 return false;
1629
1630 // Scan every instruction in the loop to ensure there are no side effects.
1631 for (Instruction &I : *LoopBody)
1632 if (I.mayHaveSideEffects())
1633 return false;
1634
1635 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1636 if (!LoopExitBB)
1637 return false;
1638
1639 for (PHINode &PN : LoopExitBB->phis()) {
1640 if (!SE->isSCEVable(PN.getType()))
1641 return false;
1642
1643 const SCEV *Ev = SE->getSCEV(&PN);
1644 if (!Ev)
1645 return false;
1646
1647 LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1648
1649 // Since we verified that the loop trip count will be a valid strlen
1650 // idiom, we can expand all lcssa phis with {n,+,1} as (n + strlen) and use
1651 // SCEVExpander to materialize the loop output.
1652 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1653 if (!AddRecEv || !AddRecEv->isAffine())
1654 return false;
1655
1656 // We only want an AddRecExpr with a constant recurrence step. This
1657 // is good enough for all the idioms we want to recognize. Later we expand
1658 // and materialize the recurrence as {base,+,a} -> (base + a * strlen)
1659 if (!isa<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
1660 return false;
1661 }
1662
1663 return true;
1664 }
1665
1666public:
1667 const Loop *CurLoop;
1668 ScalarEvolution *SE;
1669 const TargetLibraryInfo *TLI;
1670
1671 unsigned OpWidth;
1672 ConstantInt *StepSizeCI;
1673 const SCEV *LoadBaseEv;
1674 Type *OperandType;
1675 };
1676
1677} // namespace
1678
1679/// The Strlen Idiom we are trying to detect has the following structure
1680///
1681/// preheader:
1682/// ...
1683/// br label %body, ...
1684///
1685/// body:
1686/// ... ; %0 is incremented by a gep
1687/// %1 = load i8, ptr %0, align 1
1688/// %2 = icmp eq i8 %1, 0
1689/// br i1 %2, label %exit, label %body
1690///
1691/// exit:
1692/// %lcssa = phi [%0, %body], ...
1693///
1694 /// We expect the strlen idiom to have a load of a character type that
1695 /// is compared against '\0', and such load's pointer operand must have a scev
1696 /// expression of the form {%str,+,c} where c is a ConstantInt of the
1697 /// appropriate character width for the idiom, and %str is the base of the string.
1698 /// Additionally, all lcssa phis must have the form {...,+,n} where n is a constant.
1699 ///
1700 /// When transforming the output of the strlen idiom, the lcssa phis are
1701/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
1702/// and all subsequent uses are replaced. For example,
1703///
1704/// \code{.c}
1705/// const char* base = str;
1706/// while (*str != '\0')
1707/// ++str;
1708/// size_t result = str - base;
1709/// \endcode
1710///
1711/// will be transformed as follows: The idiom will be replaced by a strlen
1712/// computation to compute the address of the null terminator of the string.
1713///
1714/// \code{.c}
1715/// const char* base = str;
1716/// const char* end = base + strlen(str);
1717/// size_t result = end - base;
1718/// \endcode
1719///
1720/// In the case we index by an induction variable, as long as the induction
1721/// variable has a constant int increment, we can replace all such indvars
1722/// with the closed form computation of strlen
1723///
1724/// \code{.c}
1725/// size_t i = 0;
1726/// while (str[i] != '\0')
1727/// ++i;
1728/// size_t result = i;
1729/// \endcode
1730///
1731/// Will be replaced by
1732///
1733/// \code{.c}
1734/// size_t i = 0 + strlen(str);
1735/// size_t result = i;
1736/// \endcode
1737///
1738bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1739 if (DisableLIRP::All)
1740 return false;
1741
1742 StrlenVerifier Verifier(CurLoop, SE, TLI);
1743
1744 if (!Verifier.isValidStrlenIdiom())
1745 return false;
1746
1747 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1748 BasicBlock *LoopBody = *CurLoop->block_begin();
1749 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1750 BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
1751 assert(Preheader && LoopBody && LoopExitBB && LoopTerm &&
1752 "Should be verified to be valid by StrlenVerifier");
1753
1754 if (Verifier.OpWidth == 8) {
1755 if (DisableLIRP::Strlen)
1756 return false;
1757 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
1758 return false;
1759 } else {
1760 if (DisableLIRP::Wcslen)
1761 return false;
1762 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
1763 return false;
1764 }
1765
1766 IRBuilder<> Builder(Preheader->getTerminator());
1767 Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1768 SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(),
1769 "strlen_idiom");
1770 Value *MaterialzedBase = Expander.expandCodeFor(
1771 Verifier.LoadBaseEv, Verifier.LoadBaseEv->getType(),
1772 Builder.GetInsertPoint());
1773
1774 Value *StrLenFunc = nullptr;
1775 if (Verifier.OpWidth == 8) {
1776 StrLenFunc = emitStrLen(MaterialzedBase, Builder, *DL, TLI);
1777 } else {
1778 StrLenFunc = emitWcsLen(MaterialzedBase, Builder, *DL, TLI);
1779 }
1780 assert(StrLenFunc && "Failed to emit strlen function.");
1781
1782 const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
1783 SmallVector<PHINode *, 4> Cleanup;
1784 for (PHINode &PN : LoopExitBB->phis()) {
1785 // We can now materialize the loop output, as all phis have scev {base,+,a}.
1786 // We expand the phi as:
1787 // %strlen = call i64 @strlen(%str)
1788 // %phi.new = base expression + step * %strlen
1789 const SCEV *Ev = SE->getSCEV(&PN);
1790 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1791 const SCEVConstant *Step =
1792 dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE));
1793 const SCEV *Base = AddRecEv->getStart();
1794
1795 // It is safe to truncate to base since if base is narrower than size_t
1796 // the equivalent user code will have to truncate anyway.
1797 const SCEV *NewEv = SE->getAddExpr(
1798 Base, SE->getMulExpr(Step, SE->getTruncateOrSignExtend(
1799 StrlenEv, Base->getType())));
1800
1801 Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
1802 Builder.GetInsertPoint());
1803 Expander.clear();
1804 PN.replaceAllUsesWith(MaterializedPHI);
1805 Cleanup.push_back(&PN);
1806 }
1807
1808 // All LCSSA loop phis are dead; the leftover dead loop body can be cleaned
1809 // up by later passes.
1810 for (PHINode *PN : Cleanup)
1811 RecursivelyDeleteDeadPHINode(PN);
1813 // LoopDeletion only deletes invariant loops with a known trip count. We can
1814 // update the condition so it will reliably delete the invariant loop.
1815 assert(LoopTerm->getNumSuccessors() == 2 &&
1816 (LoopTerm->getSuccessor(0) == LoopBody ||
1817 LoopTerm->getSuccessor(1) == LoopBody) &&
1818 "loop body must have a successor that is itself");
1819 ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
1820 ? Builder.getFalse()
1821 : Builder.getTrue();
1822 LoopTerm->setCondition(NewLoopCond);
1823 SE->forgetLoop(CurLoop);
1824
1825 ++NumStrLen;
1826 LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
1827 ORE.emit([&]() {
1828 return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
1829 CurLoop->getStartLoc(), Preheader)
1830 << "Transformed " << StrLenFunc->getName() << " loop idiom";
1831 });
1832
1833 return true;
1834}
1835
1836/// Check if the given conditional branch is based on an unsigned less-than
1837/// comparison between a variable and a constant, and if the comparison is false
1838/// the control yields to the loop entry. If the branch matches the behaviour,
1839/// the variable involved in the comparison is returned.
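///
/// For illustration (hypothetical IR, not from the source): the following
/// branch matches with Threshold set to 4 and %x returned, because the
/// false edge of the ult comparison re-enters the loop:
/// \code
/// %cmp = icmp ult i32 %x, 4
/// br i1 %cmp, label %exit, label %loop.entry
/// \endcode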
1840 static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
1841 APInt &Threshold) {
1842 if (!BI || !BI->isConditional())
1843 return nullptr;
1844
1845 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1846 if (!Cond)
1847 return nullptr;
1848
1849 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
1850 if (!CmpConst)
1851 return nullptr;
1852
1853 BasicBlock *FalseSucc = BI->getSuccessor(1);
1854 ICmpInst::Predicate Pred = Cond->getPredicate();
1855
1856 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
1857 Threshold = CmpConst->getValue();
1858 return Cond->getOperand(0);
1859 }
1860
1861 return nullptr;
1862}
1863
1864// Check if the recurrence variable `VarX` is in the right form to create
1865// the idiom. Returns the value coerced to a PHINode if so.
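// For illustration (hypothetical IR, not from the source): given
//   %x = phi i32 [ %x0, %preheader ], [ %x.next, %loop ]
//   %x.next = lshr i32 %x, 1
// getRecurrenceVar(%x, %x.next, %loop) returns the phi %x, since the phi
// lives in the loop header and one of its incoming values is %x.next.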
1866 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1867 BasicBlock *LoopEntry) {
1868 auto *PhiX = dyn_cast<PHINode>(VarX);
1869 if (PhiX && PhiX->getParent() == LoopEntry &&
1870 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1871 return PhiX;
1872 return nullptr;
1873}
1874
1875/// Return true if the idiom is detected in the loop.
1876///
1877/// Additionally:
1878/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1879/// or nullptr if there is no such.
1880/// 2) \p CntPhi is set to the corresponding phi node
1881/// or nullptr if there is no such.
1882/// 3) \p InitX is set to the value whose CTLZ could be used.
1883/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1884/// 5) \p Threshold is set to the constant involved in the unsigned less-than
1885/// comparison.
1886///
1887/// The core idiom we are trying to detect is:
1888/// \code
1889/// if (x0 < 2)
1890/// goto loop-exit // the precondition of the loop
1891/// cnt0 = init-val
1892/// do {
1893/// x = phi (x0, x.next); //PhiX
1894/// cnt = phi (cnt0, cnt.next)
1895///
1896/// cnt.next = cnt + 1;
1897/// ...
1898/// x.next = x >> 1; // DefX
1899/// } while (x >= 4)
1900/// loop-exit:
1901/// \endcode
1902 static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
1903 Intrinsic::ID &IntrinID,
1904 Value *&InitX, Instruction *&CntInst,
1905 PHINode *&CntPhi, Instruction *&DefX,
1906 APInt &Threshold) {
1907 BasicBlock *LoopEntry;
1908
1909 DefX = nullptr;
1910 CntInst = nullptr;
1911 CntPhi = nullptr;
1912 LoopEntry = *(CurLoop->block_begin());
1913
1914 // step 1: Check if the loop-back branch is in desirable form.
1915 if (Value *T = matchShiftULTCondition(
1916 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
1917 Threshold))
1918 DefX = dyn_cast<Instruction>(T);
1919 else
1920 return false;
1921
1922 // step 2: Check the recurrence of variable X
1923 if (!DefX || !isa<PHINode>(DefX))
1924 return false;
1925
1926 PHINode *VarPhi = cast<PHINode>(DefX);
1927 int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
1928 if (Idx == -1)
1929 return false;
1930
1931 DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
1932 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
1933 return false;
1934
1935 // step 3: detect instructions corresponding to "x.next = x >> 1"
1936 if (DefX->getOpcode() != Instruction::LShr)
1937 return false;
1938
1939 IntrinID = Intrinsic::ctlz;
1940 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1941 if (!Shft || !Shft->isOne())
1942 return false;
1943
1944 InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1945
1946 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1947 // or cnt.next = cnt + -1.
1948 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1949 // then all uses of "cnt.next" could be optimized to the trip count
1950 // plus "cnt0". Currently it is not optimized.
1951 // This step could be used to detect POPCNT instruction:
1952 // cnt.next = cnt + (x.next & 1)
1953 for (Instruction &Inst :
1954 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
1955 if (Inst.getOpcode() != Instruction::Add)
1956 continue;
1957
1958 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1959 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1960 continue;
1961
1962 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1963 if (!Phi)
1964 continue;
1965
1966 CntInst = &Inst;
1967 CntPhi = Phi;
1968 break;
1969 }
1970 if (!CntInst)
1971 return false;
1972
1973 return true;
1974}
1975
1976/// Return true iff the idiom is detected in the loop.
1977///
1978/// Additionally:
1979/// 1) \p CntInst is set to the instruction counting the population bit.
1980/// 2) \p CntPhi is set to the corresponding phi node.
1981/// 3) \p Var is set to the value whose population bits are being counted.
1982///
1983/// The core idiom we are trying to detect is:
1984/// \code
1985/// if (x0 != 0)
1986/// goto loop-exit // the precondition of the loop
1987/// cnt0 = init-val;
1988/// do {
1989/// x1 = phi (x0, x2);
1990/// cnt1 = phi(cnt0, cnt2);
1991///
1992/// cnt2 = cnt1 + 1;
1993/// ...
1994/// x2 = x1 & (x1 - 1);
1995/// ...
1996/// } while(x != 0);
1997///
1998/// loop-exit:
1999/// \endcode
2000static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
2001 Instruction *&CntInst, PHINode *&CntPhi,
2002 Value *&Var) {
2003 // step 1: Check to see if the loop-back branch matches this pattern:
2004 // "if (a != 0) goto loop-entry".
2005 BasicBlock *LoopEntry;
2006 Instruction *DefX2, *CountInst;
2007 Value *VarX1, *VarX0;
2008 PHINode *PhiX, *CountPhi;
2009
2010 DefX2 = CountInst = nullptr;
2011 VarX1 = VarX0 = nullptr;
2012 PhiX = CountPhi = nullptr;
2013 LoopEntry = *(CurLoop->block_begin());
2014
2015 // step 1: Check if the loop-back branch is in desirable form.
2016 {
2017 if (Value *T = matchCondition(
2018 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
2019 DefX2 = dyn_cast<Instruction>(T);
2020 else
2021 return false;
2022 }
2023
2024 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
2025 {
2026 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
2027 return false;
2028
2029 BinaryOperator *SubOneOp;
2030
2031 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
2032 VarX1 = DefX2->getOperand(1);
2033 else {
2034 VarX1 = DefX2->getOperand(0);
2035 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
2036 }
2037 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
2038 return false;
2039
2040 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
2041 if (!Dec ||
2042 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
2043 (SubOneOp->getOpcode() == Instruction::Add &&
2044 Dec->isMinusOne()))) {
2045 return false;
2046 }
2047 }
2048
2049 // step 3: Check the recurrence of variable X
2050 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
2051 if (!PhiX)
2052 return false;
2053
2054 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
2055 {
2056 CountInst = nullptr;
2057 for (Instruction &Inst :
2058 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2059 if (Inst.getOpcode() != Instruction::Add)
2060 continue;
2061
2062 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2063 if (!Inc || !Inc->isOne())
2064 continue;
2065
2066 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2067 if (!Phi)
2068 continue;
2069
2070 // Check if the result of the instruction is live out of the loop.
2071 bool LiveOutLoop = false;
2072 for (User *U : Inst.users()) {
2073 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
2074 LiveOutLoop = true;
2075 break;
2076 }
2077 }
2078
2079 if (LiveOutLoop) {
2080 CountInst = &Inst;
2081 CountPhi = Phi;
2082 break;
2083 }
2084 }
2085
2086 if (!CountInst)
2087 return false;
2088 }
2089
2090 // step 5: check if the precondition is in this form:
2091 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2092 {
2093 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2094 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
2095 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
2096 return false;
2097
2098 CntInst = CountInst;
2099 CntPhi = CountPhi;
2100 Var = T;
2101 }
2102
2103 return true;
2104}
2105
2106/// Return true if the idiom is detected in the loop.
2107///
2108/// Additionally:
2109/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2110/// or nullptr if there is no such.
2111/// 2) \p CntPhi is set to the corresponding phi node
2112/// or nullptr if there is no such.
2113/// 3) \p Var is set to the value whose CTLZ could be used.
2114/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2115///
2116/// The core idiom we are trying to detect is:
2117/// \code
2118/// if (x0 == 0)
2119/// goto loop-exit // the precondition of the loop
2120/// cnt0 = init-val;
2121/// do {
2122/// x = phi (x0, x.next); //PhiX
2123/// cnt = phi(cnt0, cnt.next);
2124///
2125/// cnt.next = cnt + 1;
2126/// ...
2127/// x.next = x >> 1; // DefX
2128/// ...
2129/// } while(x.next != 0);
2130///
2131/// loop-exit:
2132/// \endcode
2133static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
2134 Intrinsic::ID &IntrinID, Value *&InitX,
2135 Instruction *&CntInst, PHINode *&CntPhi,
2136 Instruction *&DefX) {
2137 BasicBlock *LoopEntry;
2138 Value *VarX = nullptr;
2139
2140 DefX = nullptr;
2141 CntInst = nullptr;
2142 CntPhi = nullptr;
2143 LoopEntry = *(CurLoop->block_begin());
2144
2145 // step 1: Check if the loop-back branch is in desirable form.
2146 if (Value *T = matchCondition(
2147 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
2148 DefX = dyn_cast<Instruction>(T);
2149 else
2150 return false;
2151
2152 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2153 if (!DefX || !DefX->isShift())
2154 return false;
2155 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2156 Intrinsic::ctlz;
2157 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
2158 if (!Shft || !Shft->isOne())
2159 return false;
2160 VarX = DefX->getOperand(0);
2161
2162 // step 3: Check the recurrence of variable X
2163 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2164 if (!PhiX)
2165 return false;
2166
2167 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2168
2169 // Make sure the initial value can't be negative; otherwise the ashr in the
2170 // loop might never reach zero, which would make the loop infinite.
2171 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
2172 return false;
2173
2174 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
2175 // or cnt.next = cnt + -1.
2176 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2177 // then all uses of "cnt.next" could be optimized to the trip count
2178 // plus "cnt0". Currently it is not optimized.
2179 // This step could be used to detect POPCNT instruction:
2180 // cnt.next = cnt + (x.next & 1)
2181 for (Instruction &Inst :
2182 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2183 if (Inst.getOpcode() != Instruction::Add)
2184 continue;
2185
2186 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2187 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2188 continue;
2189
2190 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2191 if (!Phi)
2192 continue;
2193
2194 CntInst = &Inst;
2195 CntPhi = Phi;
2196 break;
2197 }
2198 if (!CntInst)
2199 return false;
2200
2201 return true;
2202}
2203
2204// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
2205// profitable if we delete the loop.
2206bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2207 Value *InitX, bool ZeroCheck,
2208 size_t CanonicalSize) {
2209 const Value *Args[] = {InitX,
2210 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
2211
2212 // @llvm.dbg intrinsics don't count as they have no semantic effect.
2213 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
2214 uint64_t HeaderSize =
2215 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
2216
2217 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2218 InstructionCost Cost = TTI->getIntrinsicInstrCost(
2219 Attrs, TargetTransformInfo::TCK_SizeAndLatency);
2220 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2221 return false;
2222
2223 return true;
2224}
2225
2226/// Convert CTLZ / CTTZ idiom loop into countable loop.
2227/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
2228/// returns false.
2229bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2230 Value *InitX, Instruction *DefX,
2231 PHINode *CntPhi,
2232 Instruction *CntInst) {
2233 bool IsCntPhiUsedOutsideLoop = false;
2234 for (User *U : CntPhi->users())
2235 if (!CurLoop->contains(cast<Instruction>(U))) {
2236 IsCntPhiUsedOutsideLoop = true;
2237 break;
2238 }
2239 bool IsCntInstUsedOutsideLoop = false;
2240 for (User *U : CntInst->users())
2241 if (!CurLoop->contains(cast<Instruction>(U))) {
2242 IsCntInstUsedOutsideLoop = true;
2243 break;
2244 }
2245 // If both CntInst and CntPhi are used outside the loop the profitability
2246 // is questionable.
2247 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2248 return false;
2249
2250 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
2251 // when X is 0. If we cannot guarantee X != 0, we need to check this
2252 // when expanding.
2253 bool ZeroCheck = false;
2254 // It is safe to assume the Preheader exists, as it was checked in the
2255 // parent function runOnLoop.
2256 BasicBlock *PH = CurLoop->getLoopPreheader();
2257
2258 // If we are using the count instruction outside the loop, make sure we
2259 // have a zero check as a precondition. Without the check the loop would run
2260 // one iteration before any check of the input value; 0 and 1 would then
2261 // behave identically in the original loop, so the zero check is required.
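// Illustrative C shape of the guarded idiom (an exposition-only sketch, not
// from the source):
//   if (x != 0)                            // precondition matched below
//     do { cnt++; x >>= 1; } while (x != 0);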
2262 if (!IsCntPhiUsedOutsideLoop) {
2263 auto *PreCondBB = PH->getSinglePredecessor();
2264 if (!PreCondBB)
2265 return false;
2266 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2267 if (!PreCondBI)
2268 return false;
2269 if (matchCondition(PreCondBI, PH) != InitX)
2270 return false;
2271 ZeroCheck = true;
2272 }
2273
2274 // FFS idiom loop has only 6 instructions:
2275 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2276 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2277 // %shr = ashr %n.addr.0, 1
2278 // %tobool = icmp eq %shr, 0
2279 // %inc = add nsw %i.0, 1
2280 // br i1 %tobool
2281 size_t IdiomCanonicalSize = 6;
2282 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2283 return false;
2284
2285 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2286 DefX->getDebugLoc(), ZeroCheck,
2287 IsCntPhiUsedOutsideLoop);
2288 return true;
2289}
2290
2291/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
2292/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
2293/// trip count returns true; otherwise, returns false.
2294bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2295 // Give up if the loop has multiple blocks or multiple backedges.
2296 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2297 return false;
2298
2299 Intrinsic::ID IntrinID;
2300 Value *InitX;
2301 Instruction *DefX = nullptr;
2302 PHINode *CntPhi = nullptr;
2303 Instruction *CntInst = nullptr;
2304
2305 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
2306 DefX))
2307 return false;
2308
2309 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2310}
2311
2312bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2313 // Give up if the loop has multiple blocks or multiple backedges.
2314 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2315 return false;
2316
2317 Intrinsic::ID IntrinID;
2318 Value *InitX;
2319 Instruction *DefX = nullptr;
2320 PHINode *CntPhi = nullptr;
2321 Instruction *CntInst = nullptr;
2322
2323 APInt LoopThreshold;
2324 if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2325 CntPhi, DefX, LoopThreshold))
2326 return false;
2327
2328 if (LoopThreshold == 2) {
2329 // Treat as regular FFS.
2330 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2331 }
2332
2333 // Look for Floor Log2 Idiom.
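// Illustrative C form of the floor-log2 idiom (exposition only, not from
// the source): after loop rotation, the precondition `x < 2` and the latch
// bound `x >= 4` matched below correspond to
//   while (x >= 2) { cnt++; x >>= 1; }
// which increments cnt exactly floor(log2(x0)) times for x0 >= 1.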
2334 if (LoopThreshold != 4)
2335 return false;
2336
2337 // Abort if CntPhi is used outside of the loop.
2338 for (User *U : CntPhi->users())
2339 if (!CurLoop->contains(cast<Instruction>(U)))
2340 return false;
2341
2342 // It is safe to assume the Preheader exists, as it was checked in the
2343 // parent function runOnLoop.
2344 BasicBlock *PH = CurLoop->getLoopPreheader();
2345 auto *PreCondBB = PH->getSinglePredecessor();
2346 if (!PreCondBB)
2347 return false;
2348 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2349 if (!PreCondBI)
2350 return false;
2351
2352 APInt PreLoopThreshold;
2353 if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2354 PreLoopThreshold != 2)
2355 return false;
2356
2357 bool ZeroCheck = true;
2358
2359 // The loop has only 6 instructions:
2360 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2361 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2362 // %shr = ashr %n.addr.0, 1
2363 // %tobool = icmp ult %n.addr.0, C
2364 // %inc = add nsw %i.0, 1
2365 // br i1 %tobool
2366 size_t IdiomCanonicalSize = 6;
2367 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2368 return false;
2369
2370 // log2(x) = w − 1 − clz(x)
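// For instance (illustrative): for i32 x = 13 (0b1101), clz(x) = 28, so
// log2(x) = 32 - 1 - 28 = 3, matching floor(log2(13)).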
2371 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2372 DefX->getDebugLoc(), ZeroCheck,
2373 /*IsCntPhiUsedOutsideLoop=*/false,
2374 /*InsertSub=*/true);
2375 return true;
2376}
2377
2378/// Recognizes a population count idiom in a non-countable loop.
2379///
2380/// If detected, transforms the relevant code to issue the popcount intrinsic
2381/// function call, and returns true; otherwise, returns false.
2382bool LoopIdiomRecognize::recognizePopcount() {
2383 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
2384 return false;
2385
2386 // Counting population is usually done with a few arithmetic instructions.
2387 // Such instructions can be easily "absorbed" by vacant slots in a
2388 // non-compact loop. Therefore, recognizing the popcount idiom only makes
2389 // sense in a compact loop.
2390
2391 // Give up if the loop has multiple blocks or multiple backedges.
2392 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2393 return false;
2394
2395 BasicBlock *LoopBody = *(CurLoop->block_begin());
2396 if (LoopBody->size() >= 20) {
2397 // The loop is too big, bail out.
2398 return false;
2399 }
2400
2401 // It should have a preheader containing nothing but an unconditional branch.
2402 BasicBlock *PH = CurLoop->getLoopPreheader();
2403 if (!PH || &PH->front() != PH->getTerminator())
2404 return false;
2405 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
2406 if (!EntryBI || EntryBI->isConditional())
2407 return false;
2408
2409 // It should have a precondition block where the generated popcount intrinsic
2410 // function can be inserted.
2411 auto *PreCondBB = PH->getSinglePredecessor();
2412 if (!PreCondBB)
2413 return false;
2414 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2415 if (!PreCondBI || PreCondBI->isUnconditional())
2416 return false;
2417
2418 Instruction *CntInst;
2419 PHINode *CntPhi;
2420 Value *Val;
2421 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
2422 return false;
2423
2424 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
2425 return true;
2426}
2427
2428 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2429 const DebugLoc &DL) {
2430 Value *Ops[] = {Val};
2431 Type *Tys[] = {Val->getType()};
2432
2433 CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
2434 CI->setDebugLoc(DL);
2435
2436 return CI;
2437}
2438
2439 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2440 const DebugLoc &DL, bool ZeroCheck,
2441 Intrinsic::ID IID) {
2442 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2443 Type *Tys[] = {Val->getType()};
2444
2445 CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
2446 CI->setDebugLoc(DL);
2447
2448 return CI;
2449}
2450
2451/// Transform the following loop (Using CTLZ, CTTZ is similar):
2452/// loop:
2453/// CntPhi = PHI [Cnt0, CntInst]
2454/// PhiX = PHI [InitX, DefX]
2455/// CntInst = CntPhi + 1
2456/// DefX = PhiX >> 1
2457/// LOOP_BODY
2458/// Br: loop if (DefX != 0)
2459/// Use(CntPhi) or Use(CntInst)
2460///
2461/// Into:
2462/// If CntPhi used outside the loop:
2463/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2464/// Count = CountPrev + 1
2465/// else
2466/// Count = BitWidth(InitX) - CTLZ(InitX)
2467/// loop:
2468/// CntPhi = PHI [Cnt0, CntInst]
2469/// PhiX = PHI [InitX, DefX]
2470/// PhiCount = PHI [Count, Dec]
2471/// CntInst = CntPhi + 1
2472/// DefX = PhiX >> 1
2473/// Dec = PhiCount - 1
2474/// LOOP_BODY
2475/// Br: loop if (Dec != 0)
2476/// Use(CountPrev + Cnt0) // Use(CntPhi)
2477/// or
2478/// Use(Count + Cnt0) // Use(CntInst)
2479///
2480/// If LOOP_BODY is empty the loop will be deleted.
2481/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2482void LoopIdiomRecognize::transformLoopToCountable(
2483 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2484 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2485 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2486 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2487
2488 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2489 IRBuilder<> Builder(PreheaderBr);
2490 Builder.SetCurrentDebugLocation(DL);
2491
2492 // If there are no uses of CntPhi, create:
2493 // Count = BitWidth - CTLZ(InitX);
2494 // NewCount = Count;
2495 // If there are uses of CntPhi create:
2496 // NewCount = BitWidth - CTLZ(InitX >> 1);
2497 // Count = NewCount + 1;
2498 Value *InitXNext;
2499 if (IsCntPhiUsedOutsideLoop) {
2500 if (DefX->getOpcode() == Instruction::AShr)
2501 InitXNext = Builder.CreateAShr(InitX, 1);
2502 else if (DefX->getOpcode() == Instruction::LShr)
2503 InitXNext = Builder.CreateLShr(InitX, 1);
2504 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2505 InitXNext = Builder.CreateShl(InitX, 1);
2506 else
2507 llvm_unreachable("Unexpected opcode!");
2508 } else
2509 InitXNext = InitX;
2510 Value *Count =
2511 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2512 Type *CountTy = Count->getType();
2513 Count = Builder.CreateSub(
2514 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2515 if (InsertSub)
2516 Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
2517 Value *NewCount = Count;
2518 if (IsCntPhiUsedOutsideLoop)
2519 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2520
2521 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2522
2523 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2524 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2525 // If the counter was being incremented in the loop, add NewCount to the
2526 // counter's initial value, but only if the initial value is not zero.
2527 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2528 if (!InitConst || !InitConst->isZero())
2529 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2530 } else {
2531 // If the count was being decremented in the loop, subtract NewCount from
2532 // the counter's initial value.
2533 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2534 }
2535
2536 // Step 2: Insert new IV and loop condition:
2537 // loop:
2538 // ...
2539 // PhiCount = PHI [Count, Dec]
2540 // ...
2541 // Dec = PhiCount - 1
2542 // ...
2543 // Br: loop if (Dec != 0)
2544 BasicBlock *Body = *(CurLoop->block_begin());
2545 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2546 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2547
2548 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2549 TcPhi->insertBefore(Body->begin());
2550
2551 Builder.SetInsertPoint(LbCond);
2552 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2553 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2554
2555 TcPhi->addIncoming(Count, Preheader);
2556 TcPhi->addIncoming(TcDec, Body);
2557
2558 CmpInst::Predicate Pred =
2559 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2560 LbCond->setPredicate(Pred);
2561 LbCond->setOperand(0, TcDec);
2562 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2563
2564 // Step 3: All the references to the original counter outside
2565 // the loop are replaced with the NewCount
2566 if (IsCntPhiUsedOutsideLoop)
2567 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2568 else
2569 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2570
2571 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2572 // loop. The loop would otherwise not be deleted even if it becomes empty.
2573 SE->forgetLoop(CurLoop);
2574}
2575
2576void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2577 Instruction *CntInst,
2578 PHINode *CntPhi, Value *Var) {
2579 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2580 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2581 const DebugLoc &DL = CntInst->getDebugLoc();
2582
2583 // Assume that before the transformation the loop looks as follows:
2584 // if (x) // the precondition
2585 // do { cnt++; x &= x - 1; } while(x);
2586
2587 // Step 1: Insert the ctpop instruction at the end of the precondition block
2588 IRBuilder<> Builder(PreCondBr);
2589 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2590 {
2591 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2592 NewCount = PopCntZext =
2593 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2594
2595 if (NewCount != PopCnt)
2596 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2597
2598 // TripCnt is exactly the number of iterations the loop has
2599 TripCnt = NewCount;
2600
2601 // If the population counter's initial value is not zero, insert Add Inst.
2602 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2603 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2604 if (!InitConst || !InitConst->isZero()) {
2605 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2606 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2607 }
2608 }
2609
2610 // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
2611 // "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
2612 // call would be partially dead code, and downstream passes would drag
2613 // it back from the precondition block to the preheader.
2614 {
2615 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2616
2617 Value *Opnd0 = PopCntZext;
2618 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2619 if (PreCond->getOperand(0) != Var)
2620 std::swap(Opnd0, Opnd1);
2621
2622 ICmpInst *NewPreCond = cast<ICmpInst>(
2623 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2624 PreCondBr->setCondition(NewPreCond);
2625
2626 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2627 }
2628
2629 // Step 3: Note that the population count is exactly the trip count of the
2630 // loop in question, which enables us to convert the loop from a noncountable
2631 // loop into a countable one. The benefit is twofold:
2632 //
2633 // - If the loop only counts population, the entire loop becomes dead after
2634 // the transformation. It is a lot easier to prove a countable loop dead
2635 // than to prove a noncountable one. (In some C dialects, an infinite loop
2636 // isn't dead even if it computes nothing useful. In general, DCE needs
2637 // to prove a noncountable loop finite before safely deleting it.)
2638 //
2639 // - If the loop also performs something else, it remains alive.
2640 // Since it is transformed to countable form, it can be aggressively
2641 // optimized by some optimizations which are in general not applicable
2642 // to a noncountable loop.
2643 //
2644 // After this step, this loop (conceptually) would look like the following:
2645 // newcnt = __builtin_ctpop(x);
2646 // t = newcnt;
2647 // if (x)
2648 // do { cnt++; x &= x - 1; t--; } while (t > 0);
2649 BasicBlock *Body = *(CurLoop->block_begin());
2650 {
2651 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2652 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2653 Type *Ty = TripCnt->getType();
2654
2655 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2656 TcPhi->insertBefore(Body->begin());
2657
2658 Builder.SetInsertPoint(LbCond);
2659 Instruction *TcDec = cast<Instruction>(
2660 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2661 "tcdec", false, true));
2662
2663 TcPhi->addIncoming(TripCnt, PreHead);
2664 TcPhi->addIncoming(TcDec, Body);
2665
2666 CmpInst::Predicate Pred =
2667 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2668 LbCond->setPredicate(Pred);
2669 LbCond->setOperand(0, TcDec);
2670 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2671 }
2672
2673 // Step 4: All the references to the original population counter outside
2674 // the loop are replaced with the NewCount -- the value returned from
2675 // __builtin_ctpop().
2676 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2677
2678 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2679 // loop. The loop would otherwise not be deleted even if it becomes empty.
2680 SE->forgetLoop(CurLoop);
2681}
2682
2683/// Match loop-invariant value.
2684template <typename SubPattern_t> struct match_LoopInvariant {
2685 SubPattern_t SubPattern;
2686 const Loop *L;
2687
2688 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2689 : SubPattern(SP), L(L) {}
2690
2691 template <typename ITy> bool match(ITy *V) const {
2692 return L->isLoopInvariant(V) && SubPattern.match(V);
2693 }
2694};
2695
2696/// Matches if the value is loop-invariant.
2697template <typename Ty>
2698inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2699 return match_LoopInvariant<Ty>(M, L);
2700}
2701
2702/// Return true if the idiom is detected in the loop.
2703///
2704/// The core idiom we are trying to detect is:
2705/// \code
2706/// entry:
2707/// <...>
2708/// %bitmask = shl i32 1, %bitpos
2709/// br label %loop
2710///
2711/// loop:
2712/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2713/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2714/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2715/// %x.next = shl i32 %x.curr, 1
2716/// <...>
2717/// br i1 %x.curr.isbitunset, label %loop, label %end
2718///
2719/// end:
2720/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2721/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2722/// <...>
2723/// \endcode
2724static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2725 Value *&BitMask, Value *&BitPos,
2726 Value *&CurrX, Instruction *&NextX) {
2728 " Performing shift-until-bittest idiom detection.\n");
2729
2730 // Give up if the loop has multiple blocks or multiple backedges.
2731 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2732 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2733 return false;
2734 }
2735
2736 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2737 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2738 assert(LoopPreheaderBB && "There is always a loop preheader.");
2739
2740 using namespace PatternMatch;
2741
2742 // Step 1: Check if the loop backedge is in desirable form.
2743
2744 CmpPredicate Pred;
2745 Value *CmpLHS, *CmpRHS;
2746 BasicBlock *TrueBB, *FalseBB;
2747 if (!match(LoopHeaderBB->getTerminator(),
2748 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2749 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2750 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2751 return false;
2752 }
2753
2754 // Step 2: Check if the backedge's condition is in desirable form.
2755
2756 auto MatchVariableBitMask = [&]() {
2757 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2758 match(CmpLHS,
2759 m_c_And(m_Value(CurrX),
2760 m_CombineAnd(
2761 m_Value(BitMask),
2762 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2763 CurLoop))));
2764 };
2765
2766 auto MatchDecomposableConstantBitMask = [&]() {
2767 auto Res = llvm::decomposeBitTestICmp(
2768 CmpLHS, CmpRHS, Pred, /*LookThroughTrunc=*/true,
2769 /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
2770 if (Res && Res->Mask.isPowerOf2()) {
2771 assert(ICmpInst::isEquality(Res->Pred));
2772 Pred = Res->Pred;
2773 CurrX = Res->X;
2774 BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
2775 BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
2776 return true;
2777 }
2778 return false;
2779 };
2780
2781 if (!MatchVariableBitMask() && !MatchDecomposableConstantBitMask()) {
2782 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2783 return false;
2784 }
2785
2786 // Step 3: Check if the recurrence is in desirable form.
2787 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2788 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2789 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2790 return false;
2791 }
2792
2793 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2794 NextX =
2795 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2796
2797 assert(CurLoop->isLoopInvariant(BaseX) &&
2798 "Expected BaseX to be available in the preheader!");
2799
2800 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2801 // FIXME: support right-shift?
2802 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2803 return false;
2804 }
2805
2806 // Step 4: Check if the backedge's destinations are in desirable form.
2807
2809 "Should only get equality predicates here.");
2810
2811 // cmp-br is commutative, so canonicalize to a single variant.
2812 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2813 Pred = ICmpInst::getInversePredicate(Pred);
2814 std::swap(TrueBB, FalseBB);
2815 }
2816
2817 // We expect to exit the loop when the comparison yields false,
2818 // so when it yields true we should branch back to the loop header.
2819 if (TrueBB != LoopHeaderBB) {
2820 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2821 return false;
2822 }
2823
2824 // Okay, idiom checks out.
2825 return true;
2826}
2827
2828/// Look for the following loop:
2829/// \code
2830/// entry:
2831/// <...>
2832/// %bitmask = shl i32 1, %bitpos
2833/// br label %loop
2834///
2835/// loop:
2836/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2837/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2838/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2839/// %x.next = shl i32 %x.curr, 1
2840/// <...>
2841/// br i1 %x.curr.isbitunset, label %loop, label %end
2842///
2843/// end:
2844/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2845/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2846/// <...>
2847/// \endcode
2848///
2849/// And transform it into:
2850/// \code
2851/// entry:
2852/// %bitmask = shl i32 1, %bitpos
2853/// %lowbitmask = add i32 %bitmask, -1
2854/// %mask = or i32 %lowbitmask, %bitmask
2855/// %x.masked = and i32 %x, %mask
2856/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2857/// i1 true)
2858/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2859/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2860/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2861/// %tripcount = add i32 %backedgetakencount, 1
2862/// %x.curr = shl i32 %x, %backedgetakencount
2863/// %x.next = shl i32 %x, %tripcount
2864/// br label %loop
2865///
2866/// loop:
2867/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2868/// %loop.iv.next = add nuw i32 %loop.iv, 1
2869/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2870/// <...>
2871/// br i1 %loop.ivcheck, label %end, label %loop
2872///
2873/// end:
2874/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2875/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2876/// <...>
2877/// \endcode
2878bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2879 bool MadeChange = false;
2880
2881 Value *X, *BitMask, *BitPos, *XCurr;
2882 Instruction *XNext;
2883 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2884 XNext)) {
2886 " shift-until-bittest idiom detection failed.\n");
2887 return MadeChange;
2888 }
2889 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2890
2891 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2892 // but is it profitable to transform?
2893
2894 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2895 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2896 assert(LoopPreheaderBB && "There is always a loop preheader.");
2897
2898 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2899 assert(SuccessorBB && "There is only a single successor.");
2900
2901 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2902 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2903
2904 Intrinsic::ID IntrID = Intrinsic::ctlz;
2905 Type *Ty = X->getType();
2906 unsigned Bitwidth = Ty->getScalarSizeInBits();
2907
2908 TargetTransformInfo::TargetCostKind CostKind =
2909 TargetTransformInfo::TCK_SizeAndLatency;
2910
2911 // The rewrite is considered to be unprofitable if and only if the
2912 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2913 // making the loop countable, even if nothing else changes.
2914 IntrinsicCostAttributes Attrs(
2915 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
2916 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2917 if (Cost > TargetTransformInfo::TCC_Basic) {
2918 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2919 " Intrinsic is too costly, not beneficial\n");
2920 return MadeChange;
2921 }
2922 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2923 TargetTransformInfo::TCC_Basic) {
2924 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2925 return MadeChange;
2926 }
2927
2928 // Ok, transform appears worthwhile.
2929 MadeChange = true;
2930
2931 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
2932 // BitMask may be computed from BitPos; freeze BitPos so we can increase
2933 // its use count.
2934 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
2935 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
2936 InsertPt = BitPosI->getInsertionPointAfterDef();
2937 else
2938 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2939 if (!InsertPt)
2940 return false;
2941 FreezeInst *BitPosFrozen =
2942 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
2943 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
2944 return U.getUser() != BitPosFrozen;
2945 });
2946 BitPos = BitPosFrozen;
2947 }
2948
2949 // Step 1: Compute the loop trip count.
2950
2951 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2952 BitPos->getName() + ".lowbitmask");
2953 Value *Mask =
2954 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2955 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2956 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2957 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
2958 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2959 Value *XMaskedNumActiveBits = Builder.CreateSub(
2960 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2961 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2962 /*HasNSW=*/Bitwidth != 2);
2963 Value *XMaskedLeadingOnePos =
2964 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2965 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2966 /*HasNSW=*/Bitwidth > 2);
2967
2968 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2969 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2970 /*HasNUW=*/true, /*HasNSW=*/true);
2971 // We know the loop's backedge-taken count, but what's the loop's trip count?
2972 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2973 Value *LoopTripCount =
2974 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2975 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2976 /*HasNSW=*/Bitwidth != 2);
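// For instance (illustrative): with i32 %x = 1 and %bitpos = 5, XMasked is 1,
// its leading-one position is 0, so the backedge-taken count is 5 - 0 = 5 and
// the trip count is 6; bit 5 of %x.curr is first set after 5 shifts, and the
// original loop ran 6 iterations before exiting.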
2977
2978 // Step 2: Compute the recurrence's final value without a loop.
2979
2980 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2981 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2982 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2983 NewX->takeName(XCurr);
2984 if (auto *I = dyn_cast<Instruction>(NewX))
2985 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2986
2987 Value *NewXNext;
2988 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2989 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2990 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2991 // that isn't the case, we'll need to emit an alternative, safe IR.
2992 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2993 PatternMatch::match(
2994 BitPos, PatternMatch::m_SpecificInt_ICMP(
2995 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2996 Ty->getScalarSizeInBits() - 1))))
2997 NewXNext = Builder.CreateShl(X, LoopTripCount);
2998 else {
2999 // Otherwise, just additionally shift by one. It's the smallest solution;
3000 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
3001 // bitwidth(x) - 1) and select 0 instead.
3002 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
3003 }
3004
3005 NewXNext->takeName(XNext);
3006 if (auto *I = dyn_cast<Instruction>(NewXNext))
3007 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3008
3009 // Step 3: Adjust the successor basic block to receive the computed
3010 // recurrence's final value instead of the recurrence itself.
3011
3012 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
3013 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
3014
3015 // Step 4: Rewrite the loop into a countable form, with canonical IV.
3016
3017 // The new canonical induction variable.
3018 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3019 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3020
3021 // The induction itself.
3022 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
3023 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3024 auto *IVNext =
3025 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
3026 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3027
3028 // The loop trip count check.
3029 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
3030 CurLoop->getName() + ".ivcheck");
3031 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
3032 LoopHeaderBB->getTerminator()->eraseFromParent();
3033
3034 // Populate the IV PHI.
3035 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3036 IV->addIncoming(IVNext, LoopHeaderBB);
3037
3038 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
3039 // loop. The loop would otherwise not be deleted even if it becomes empty.
3040
3041 SE->forgetLoop(CurLoop);
3042
3043 // Other passes will take care of actually deleting the loop if possible.
3044
3045 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
3046
3047 ++NumShiftUntilBitTest;
3048 return MadeChange;
3049}
3050
3051/// Return true if the idiom is detected in the loop.
3052///
3053/// The core idiom we are trying to detect is:
3054/// \code
3055/// entry:
3056/// <...>
3057/// %start = <...>
3058/// %extraoffset = <...>
3059/// <...>
3060/// br label %for.cond
3061///
3062/// loop:
3063/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
3064/// %nbits = add nsw i8 %iv, %extraoffset
3065/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3066/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3067/// %iv.next = add i8 %iv, 1
3068/// <...>
3069/// br i1 %val.shifted.iszero, label %end, label %loop
3070///
3071/// end:
3072/// %iv.res = phi i8 [ %iv, %loop ] <...>
3073/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3074/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3075/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3076/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3077/// <...>
3078/// \endcode
3079 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
3080 Instruction *&ValShiftedIsZero,
3081 Intrinsic::ID &IntrinID, Instruction *&IV,
3082 Value *&Start, Value *&Val,
3083 const SCEV *&ExtraOffsetExpr,
3084 bool &InvertedCond) {
3086 " Performing shift-until-zero idiom detection.\n");
3087
3088 // Give up if the loop has multiple blocks or multiple backedges.
3089 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
3090 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
3091 return false;
3092 }
3093
3094 Instruction *ValShifted, *NBits, *IVNext;
3095 Value *ExtraOffset;
3096
3097 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3098 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3099 assert(LoopPreheaderBB && "There is always a loop preheader.");
3100
3101 using namespace PatternMatch;
3102
3103 // Step 1: Check if the loop backedge and its condition are in desirable form.
3104
3105 CmpPredicate Pred;
3106 BasicBlock *TrueBB, *FalseBB;
3107 if (!match(LoopHeaderBB->getTerminator(),
3108 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
3109 m_BasicBlock(FalseBB))) ||
3110 !match(ValShiftedIsZero,
3111 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
3112 !ICmpInst::isEquality(Pred)) {
3113 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
3114 return false;
3115 }
3116
3117 // Step 2: Check if the comparison's operand is in desirable form.
3118 // FIXME: Val could be a one-input PHI node, which we should look past.
3119 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
3120 m_Instruction(NBits)))) {
3121 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
3122 return false;
3123 }
3124 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
3125 : Intrinsic::ctlz;
3126
3127 // Step 3: Check if the shift amount is in desirable form.
3128
3129 if (match(NBits, m_c_Add(m_Instruction(IV),
3130 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3131 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
3132 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
3133 else if (match(NBits,
3134 m_Sub(m_Instruction(IV),
3135 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3136 NBits->hasNoSignedWrap())
3137 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
3138 else {
3139 IV = NBits;
3140 ExtraOffsetExpr = SE->getZero(NBits->getType());
3141 }
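  // (In all three cases ExtraOffsetExpr is chosen so that
  // IV == NBits + ExtraOffsetExpr; the rewrite later relies on this to
  // recover the final IV value from the active-bit count.)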
3142
3143 // Step 4: Check if the recurrence is in desirable form.
3144 auto *IVPN = dyn_cast<PHINode>(IV);
3145 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
3146 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
3147 return false;
3148 }
3149
3150 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
3151 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
3152
3153 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
3154 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
3155 return false;
3156 }
3157
3158  // Step 5: Check if the backedge's destinations are in desirable form.
3159
3160  assert(ICmpInst::isEquality(Pred) &&
3161         "Should only get equality predicates here.");
3162
3163 // cmp-br is commutative, so canonicalize to a single variant.
3164 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
3165 if (InvertedCond) {
3166 Pred = ICmpInst::getInversePredicate(Pred);
3167 std::swap(TrueBB, FalseBB);
3168 }
3169
3170  // We expect to exit the loop when the comparison yields true,
3171  // so when it yields false we should branch back to the loop header.
3172 if (FalseBB != LoopHeaderBB) {
3173 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
3174 return false;
3175 }
3176
3177  // The new, countable loop will certainly run only a known number of
3178  // iterations; it won't be infinite. But the old loop might be infinite
3179  // under certain conditions: for logical shifts, the value will become zero
3180  // after at most bitwidth(%Val) loop iterations. However, for an arithmetic
3181  // right-shift, if the sign bit was set, the value will never become zero
3182  // (e.g. i8 -1 stays -1 under ashr), and the loop may never finish.
3183 if (ValShifted->getOpcode() == Instruction::AShr &&
3184 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
3185    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Cannot prove the loop is finite.\n");
3186 return false;
3187 }
3188
3189 // Okay, idiom checks out.
3190 return true;
3191}
3192
3193/// Look for the following loop:
3194/// \code
3195/// entry:
3196/// <...>
3197/// %start = <...>
3198/// %extraoffset = <...>
3199/// <...>
3200/// br label %for.cond
3201///
3202/// loop:
3203/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
3204/// %nbits = add nsw i8 %iv, %extraoffset
3205/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3206/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3207/// %iv.next = add i8 %iv, 1
3208/// <...>
3209/// br i1 %val.shifted.iszero, label %end, label %loop
3210///
3211/// end:
3212/// %iv.res = phi i8 [ %iv, %loop ] <...>
3213/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3214/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3215/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3216/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3217/// <...>
3218/// \endcode
3219///
3220/// And transform it into:
3221/// \code
3222/// entry:
3223/// <...>
3224/// %start = <...>
3225/// %extraoffset = <...>
3226/// <...>
3227/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
3228/// %val.numactivebits = sub i8 8, %val.numleadingzeros
3229/// %extraoffset.neg = sub i8 0, %extraoffset
3230/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
3231/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
3232/// %loop.tripcount = sub i8 %iv.final, %start
3233/// br label %loop
3234///
3235/// loop:
3236/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
3237/// %loop.iv.next = add i8 %loop.iv, 1
3238/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
3239/// %iv = add i8 %loop.iv, %start
3240/// <...>
3241/// br i1 %loop.ivcheck, label %end, label %loop
3242///
3243/// end:
3244/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
3245/// <...>
3246/// \endcode
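///
/// That is, the final IV value is now computed in closed form; for the ctlz
/// case sketched above (an i8 example, following the IR):
/// \code
///   %iv.final = smax((8 - ctlz(%val)) - %extraoffset, %start)
/// \endcode
/// which is what makes the rewritten loop countable.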
3247bool LoopIdiomRecognize::recognizeShiftUntilZero() {
3248 bool MadeChange = false;
3249
3250 Instruction *ValShiftedIsZero;
3251 Intrinsic::ID IntrID;
3252 Instruction *IV;
3253 Value *Start, *Val;
3254 const SCEV *ExtraOffsetExpr;
3255 bool InvertedCond;
3256 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
3257 Start, Val, ExtraOffsetExpr, InvertedCond)) {
3258    LLVM_DEBUG(dbgs() << DEBUG_TYPE
3259               " shift-until-zero idiom detection failed.\n");
3260 return MadeChange;
3261 }
3262 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
3263
3264  // Ok, it is the idiom we were looking for; we *could* transform this loop,
3265  // but is it profitable to do so?
3266
3267 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3268 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3269 assert(LoopPreheaderBB && "There is always a loop preheader.");
3270
3271 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3272 assert(SuccessorBB && "There is only a single successor.");
3273
3274 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3275 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
3276
3277 Type *Ty = Val->getType();
3278 unsigned Bitwidth = Ty->getScalarSizeInBits();
3279
3279
3280  TargetTransformInfo::TargetCostKind CostKind =
3281      TargetTransformInfo::TCK_SizeAndLatency;
3282
3283  // The rewrite is considered to be unprofitable if and only if the
3284  // intrinsic we'll use is not cheap. Note that we are okay with *just*
3285 // making the loop countable, even if nothing else changes.
3286  IntrinsicCostAttributes Attrs(
3287      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
3288  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3289  if (Cost > TargetTransformInfo::TCC_Basic) {
3290    LLVM_DEBUG(dbgs() << DEBUG_TYPE
3291                      " Intrinsic is too costly, not beneficial\n");
3292 return MadeChange;
3293 }
3294
3295 // Ok, transform appears worthwhile.
3296 MadeChange = true;
3297
3298 bool OffsetIsZero = ExtraOffsetExpr->isZero();
3299
3300 // Step 1: Compute the loop's final IV value / trip count.
3301
3302 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3303 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
3304 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
3305 Value *ValNumActiveBits = Builder.CreateSub(
3306 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
3307 Val->getName() + ".numactivebits", /*HasNUW=*/true,
3308 /*HasNSW=*/Bitwidth != 2);
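  // (In i2 the constant bitwidth value 2 is already the signed minimum, so
  // the subtraction above could wrap in the signed sense; hence nsw is
  // dropped for Bitwidth == 2.)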
3309
3310 SCEVExpander Expander(*SE, *DL, "loop-idiom");
3311 Expander.setInsertPoint(&*Builder.GetInsertPoint());
3312 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
3313
3314 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3315 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
3316 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
3317 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
3318 {ValNumActiveBitsOffset, Start},
3319 /*FMFSource=*/nullptr, "iv.final");
3320
3321 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
3322 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
3323 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
3324 // FIXME: or when the offset was `add nuw`
3325
3326  // We know the loop's backedge-taken count, but what's the loop's trip count?
3327 Value *LoopTripCount =
3328 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3329 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3330 /*HasNSW=*/Bitwidth != 2);
3331
3332  // Step 2: Adjust the successor basic block to receive the original
3333  // induction variable's final value instead of the orig. IV itself.
3334
3335 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
3336
3337 // Step 3: Rewrite the loop into a countable form, with canonical IV.
3338
3339 // The new canonical induction variable.
3340 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3341 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3342
3343 // The induction itself.
3344 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
3345 auto *CIVNext =
3346 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
3347 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3348
3349 // The loop trip count check.
3350 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
3351 CurLoop->getName() + ".ivcheck");
3352 auto *NewIVCheck = CIVCheck;
3353 if (InvertedCond) {
3354 NewIVCheck = Builder.CreateNot(CIVCheck);
3355 NewIVCheck->takeName(ValShiftedIsZero);
3356 }
3357
3358 // The original IV, but rebased to be an offset to the CIV.
3359 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
3360 /*HasNSW=*/true); // FIXME: what about NUW?
3361 IVDePHId->takeName(IV);
3362
3363 // The loop terminator.
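  // (The new conditional branch is inserted before the old terminator, which
  // is then erased, completing the rewrite of the loop's control flow.)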
3364 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3365 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
3366 LoopHeaderBB->getTerminator()->eraseFromParent();
3367
3368 // Populate the IV PHI.
3369 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3370 CIV->addIncoming(CIVNext, LoopHeaderBB);
3371
3372 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3373 // loop. The loop would otherwise not be deleted even if it becomes empty.
3374
3375 SE->forgetLoop(CurLoop);
3376
3377 // Step 5: Try to cleanup the loop's body somewhat.
3378 IV->replaceAllUsesWith(IVDePHId);
3379 IV->eraseFromParent();
3380
3381 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
3382 ValShiftedIsZero->eraseFromParent();
3383
3384 // Other passes will take care of actually deleting the loop if possible.
3385
3386 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3387
3388 ++NumShiftUntilZero;
3389 return MadeChange;
3390}