X86ISelDAGToDAG.cpp
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
46
47 static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
51 extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
58 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
59 /// numbers for the leaves of the matched tree.
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
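// Illustrative sketch (editorial note, not from the original source): for a
// memory operand such as [rbx + 4*rcx + 16] with no symbolic part, a matched
// X86ISelAddressMode would roughly hold:
//   BaseType = RegBase;        // register base, not a frame index
//   Base_Reg = <node for RBX>;
//   Scale    = 4;
//   IndexReg = <node for RCX>;
//   Disp     = 16;
// with GV/CP/ES/MCSym/BlockAddr left null and JT left at -1.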
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 return SelectionDAGISel::runOnMachineFunction(MF);
188 }
189
190 void emitFunctionEntryCode() override;
191
192 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
193
194 void PreprocessISelDAG() override;
195 void PostprocessISelDAG() override;
196
197// Include the pieces autogenerated from the target description.
198#include "X86GenDAGISel.inc"
199
200 private:
201 void Select(SDNode *N) override;
202
203 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
204 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
205 bool AllowSegmentRegForX32 = false);
206 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
207 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
208 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
209 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
210 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
211 unsigned Depth);
212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
217 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
218 SDValue &Scale, SDValue &Index, SDValue &Disp,
219 SDValue &Segment);
220 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
221 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
222 SDValue &Index, SDValue &Disp, SDValue &Segment);
223 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
224 bool selectLEAAddr(SDValue N, SDValue &Base,
225 SDValue &Scale, SDValue &Index, SDValue &Disp,
226 SDValue &Segment);
227 bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
228 SDValue &Index, SDValue &Disp, SDValue &Segment);
229 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectRelocImm(SDValue N, SDValue &Op);
233
234 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
235 SDValue &Base, SDValue &Scale,
236 SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238
239 // Convenience method where P is also root.
240 bool tryFoldLoad(SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment) {
244 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
245 }
246
247 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
248 SDValue &Base, SDValue &Scale,
249 SDValue &Index, SDValue &Disp,
250 SDValue &Segment);
251
252 bool isProfitableToFormMaskedOp(SDNode *N) const;
253
254 /// Implement addressing mode selection for inline asm expressions.
255 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
256 InlineAsm::ConstraintCode ConstraintID,
257 std::vector<SDValue> &OutOps) override;
258
259 void emitSpecialCodeForMain();
260
261 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
262 MVT VT, SDValue &Base, SDValue &Scale,
263 SDValue &Index, SDValue &Disp,
264 SDValue &Segment) {
265 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
266 Base = CurDAG->getTargetFrameIndex(
267 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
268 else if (AM.Base_Reg.getNode())
269 Base = AM.Base_Reg;
270 else
271 Base = CurDAG->getRegister(0, VT);
272
273 Scale = getI8Imm(AM.Scale, DL);
274
275#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
276 // Negate the index if needed.
277 if (AM.NegateIndex) {
278 unsigned NegOpc;
279 switch (VT.SimpleTy) {
280 default:
281 llvm_unreachable("Unsupported VT!");
282 case MVT::i64:
283 NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
284 break;
285 case MVT::i32:
286 NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
287 break;
288 case MVT::i16:
289 NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
290 break;
291 case MVT::i8:
292 NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
293 break;
294 }
295 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
296 AM.IndexReg), 0);
297 AM.IndexReg = Neg;
298 }
299
300 if (AM.IndexReg.getNode())
301 Index = AM.IndexReg;
302 else
303 Index = CurDAG->getRegister(0, VT);
304
305 // These are 32-bit even in 64-bit mode since RIP-relative offset
306 // is 32-bit.
307 if (AM.GV)
308 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
309 MVT::i32, AM.Disp,
310 AM.SymbolFlags);
311 else if (AM.CP)
312 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
313 AM.Disp, AM.SymbolFlags);
314 else if (AM.ES) {
315 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
316 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
317 } else if (AM.MCSym) {
318 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
319 assert(AM.SymbolFlags == 0 && "oo");
320 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
321 } else if (AM.JT != -1) {
322 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
323 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
324 } else if (AM.BlockAddr)
325 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
326 AM.SymbolFlags);
327 else
328 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
329
330 if (AM.Segment.getNode())
331 Segment = AM.Segment;
332 else
333 Segment = CurDAG->getRegister(0, MVT::i16);
334 }
335
336 // Utility function to determine whether N is an AMX SDNode right after
337 // lowering but before ISEL.
338 bool isAMXSDNode(SDNode *N) const {
339 // Check if N is AMX SDNode:
340 // 1. check specific opcode since these carry MVT::Untyped instead of
341 // x86amx_type;
342 // 2. check result type;
343 // 3. check operand type;
344 switch (N->getOpcode()) {
345 default:
346 break;
347 case X86::PT2RPNTLVWZ0V:
348 case X86::PT2RPNTLVWZ0T1V:
349 case X86::PT2RPNTLVWZ1V:
350 case X86::PT2RPNTLVWZ1T1V:
351 case X86::PT2RPNTLVWZ0RSV:
352 case X86::PT2RPNTLVWZ0RST1V:
353 case X86::PT2RPNTLVWZ1RSV:
354 case X86::PT2RPNTLVWZ1RST1V:
355 return true;
356 }
357 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
358 if (N->getValueType(Idx) == MVT::x86amx)
359 return true;
360 }
361 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
362 SDValue Op = N->getOperand(Idx);
363 if (Op.getValueType() == MVT::x86amx)
364 return true;
365 }
366 return false;
367 }
368
369 // Utility function to determine whether we should avoid selecting
370 // immediate forms of instructions for better code size.
371 // At a high level, we'd like to avoid such instructions when
372 // we have similar constants used within the same basic block
373 // that can be kept in a register.
374 //
375 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
376 uint32_t UseCount = 0;
377
378 // Do not want to hoist if we're not optimizing for size.
379 // TODO: We'd like to remove this restriction.
380 // See the comment in X86InstrInfo.td for more info.
381 if (!CurDAG->shouldOptForSize())
382 return false;
383
384 // Walk all the users of the immediate.
385 for (const SDNode *User : N->users()) {
386 if (UseCount >= 2)
387 break;
388
389 // This user is already selected. Count it as a legitimate use and
390 // move on.
391 if (User->isMachineOpcode()) {
392 UseCount++;
393 continue;
394 }
395
396 // We want to count stores of immediates as real uses.
397 if (User->getOpcode() == ISD::STORE &&
398 User->getOperand(1).getNode() == N) {
399 UseCount++;
400 continue;
401 }
402
403 // We don't currently match users that have > 2 operands (except
404 // for stores, which are handled above).
405 // Those instructions won't match in ISEL, for now, and would
406 // be counted incorrectly.
407 // This may change in the future as we add additional instruction
408 // types.
409 if (User->getNumOperands() != 2)
410 continue;
411
412 // If this is a sign-extended 8-bit integer immediate used in an ALU
413 // instruction, there is probably an opcode encoding to save space.
414 auto *C = dyn_cast<ConstantSDNode>(N);
415 if (C && isInt<8>(C->getSExtValue()))
416 continue;
417
418 // Immediates that are used for offsets as part of stack
419 // manipulation should be left alone. These are typically
420 // used to indicate SP offsets for argument passing and
421 // will get pulled into stores/pushes (implicitly).
422 if (User->getOpcode() == X86ISD::ADD ||
423 User->getOpcode() == ISD::ADD ||
424 User->getOpcode() == X86ISD::SUB ||
425 User->getOpcode() == ISD::SUB) {
426
427 // Find the other operand of the add/sub.
428 SDValue OtherOp = User->getOperand(0);
429 if (OtherOp.getNode() == N)
430 OtherOp = User->getOperand(1);
431
432 // Don't count if the other operand is SP.
433 RegisterSDNode *RegNode;
434 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
435 (RegNode = dyn_cast_or_null<RegisterSDNode>(
436 OtherOp->getOperand(1).getNode())))
437 if ((RegNode->getReg() == X86::ESP) ||
438 (RegNode->getReg() == X86::RSP))
439 continue;
440 }
441
442 // ... otherwise, count this and move on.
443 UseCount++;
444 }
445
446 // If we have more than 1 use, then recommend for hoisting.
447 return (UseCount > 1);
448 }
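// Illustrative sketch (editorial note, not from the original source) of the
// size tradeoff shouldAvoidImmediateInstFormsForSize targets: under minsize,
// if the same 32-bit constant feeds two ALU operations in a block,
//   movl $0x12345678, %ecx
//   addl %ecx, %eax
//   andl %ecx, %edx
// is typically smaller than encoding the 4-byte immediate into both
// instructions. Sign-extended 8-bit immediates are exempted above because
// their imm8 encodings are already compact.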
449
450 /// Return a target constant with the specified value of type i8.
451 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
452 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
453 }
454
455 /// Return a target constant with the specified value, of type i32.
456 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
457 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
458 }
459
460 /// Return a target constant with the specified value, of type i64.
461 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
462 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
463 }
464
465 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
466 const SDLoc &DL) {
467 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
468 uint64_t Index = N->getConstantOperandVal(1);
469 MVT VecVT = N->getOperand(0).getSimpleValueType();
470 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
471 }
472
473 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
474 const SDLoc &DL) {
475 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
476 uint64_t Index = N->getConstantOperandVal(2);
477 MVT VecVT = N->getSimpleValueType(0);
478 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
479 }
480
481 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
482 const SDLoc &DL) {
483 assert(VecWidth == 128 && "Unexpected vector width");
484 uint64_t Index = N->getConstantOperandVal(2);
485 MVT VecVT = N->getSimpleValueType(0);
486 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
487 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
488 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
489 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
490 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
491 }
492
493 SDValue getSBBZero(SDNode *N) {
494 SDLoc dl(N);
495 MVT VT = N->getSimpleValueType(0);
496
497 // Create zero.
498 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
499 SDValue Zero =
500 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
501 if (VT == MVT::i64) {
502 Zero = SDValue(
503 CurDAG->getMachineNode(
504 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
505 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
506 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
507 0);
508 }
509
510 // Copy flags to the EFLAGS register and glue it to next node.
511 unsigned Opcode = N->getOpcode();
512 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
513 "Unexpected opcode for SBB materialization");
514 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
515 SDValue EFLAGS =
516 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
517 N->getOperand(FlagOpIndex), SDValue());
518
519 // Create a 64-bit instruction if the result is 64-bits otherwise use the
520 // 32-bit version.
521 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
522 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
523 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
524 return SDValue(
525 CurDAG->getMachineNode(Opc, dl, VTs,
526 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
527 0);
528 }
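// Illustrative sketch (editorial note, not from the original source) of the
// idiom getSBBZero builds: zero a register, then subtract it from itself with
// borrow, so the result is 0 when CF is clear and all-ones when CF is set:
//   xorl %eax, %eax     ; MOV32r0, scheduled before the flag producer
//   cmpl %ecx, %edx     ; some instruction that defines CF
//   sbbl %eax, %eax     ; eax = 0 - 0 - CF = 0 or -1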
529
530 // Helper to detect unneeded and instructions on shift amounts. Called
531 // from PatFrags in tablegen.
532 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
533 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
534 const APInt &Val = N->getConstantOperandAPInt(1);
535
536 if (Val.countr_one() >= Width)
537 return true;
538
539 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
540 return Mask.countr_one() >= Width;
541 }
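// Worked example (editorial sketch): a 32-bit shift only consumes the low 5
// bits of its count, so tablegen calls this with Width == 5. For an amount of
// (and %ecx, 31), Val is 31 and countr_one(31) == 5 >= 5, so the AND is
// unneeded. A mask like 15 only qualifies if computeKnownBits proves that
// bit 4 of the shift amount is already zero.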
542
543 /// Return an SDNode that returns the value of the global base register.
544 /// Output instructions required to initialize the global base register,
545 /// if necessary.
546 SDNode *getGlobalBaseReg();
547
548 /// Return a reference to the TargetMachine, casted to the target-specific
549 /// type.
550 const X86TargetMachine &getTargetMachine() const {
551 return static_cast<const X86TargetMachine &>(TM);
552 }
553
554 /// Return a reference to the TargetInstrInfo, casted to the target-specific
555 /// type.
556 const X86InstrInfo *getInstrInfo() const {
557 return Subtarget->getInstrInfo();
558 }
559
560 /// Return a condition code of the given SDNode
561 X86::CondCode getCondFromNode(SDNode *N) const;
562
563 /// Address-mode matching performs shift-of-and to and-of-shift
564 /// reassociation in order to expose more scaled addressing
565 /// opportunities.
566 bool ComplexPatternFuncMutatesDAG() const override {
567 return true;
568 }
569
570 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
571
572 // Indicates we should prefer to use a non-temporal load for this load.
573 bool useNonTemporalLoad(LoadSDNode *N) const {
574 if (!N->isNonTemporal())
575 return false;
576
577 unsigned StoreSize = N->getMemoryVT().getStoreSize();
578
579 if (N->getAlign().value() < StoreSize)
580 return false;
581
582 switch (StoreSize) {
583 default: llvm_unreachable("Unsupported store size");
584 case 4:
585 case 8:
586 return false;
587 case 16:
588 return Subtarget->hasSSE41();
589 case 32:
590 return Subtarget->hasAVX2();
591 case 64:
592 return Subtarget->hasAVX512();
593 }
594 }
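// Illustrative mapping for the switch above (editorial sketch): a 16-byte
// aligned non-temporal load can use (V)MOVNTDQA and therefore needs SSE4.1,
// the 32-byte form needs AVX2 and the 64-byte form needs AVX512, while 4- and
// 8-byte loads have no non-temporal load instruction and may be folded freely.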
595
596 bool foldLoadStoreIntoMemOperand(SDNode *Node);
597 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
598 bool matchBitExtract(SDNode *Node);
599 bool shrinkAndImmediate(SDNode *N);
600 bool isMaskZeroExtended(SDNode *N) const;
601 bool tryShiftAmountMod(SDNode *N);
602 bool tryShrinkShlLogicImm(SDNode *N);
603 bool tryVPTERNLOG(SDNode *N);
604 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
605 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
606 uint8_t Imm);
607 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
608 bool tryMatchBitSelect(SDNode *N);
609
610 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
611 const SDLoc &dl, MVT VT, SDNode *Node);
612 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
613 const SDLoc &dl, MVT VT, SDNode *Node,
614 SDValue &InGlue);
615
616 bool tryOptimizeRem8Extend(SDNode *N);
617
618 bool onlyUsesZeroFlag(SDValue Flags) const;
619 bool hasNoSignFlagUses(SDValue Flags) const;
620 bool hasNoCarryFlagUses(SDValue Flags) const;
621 };
622
623 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
624 public:
625 static char ID;
626 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
627 CodeGenOptLevel OptLevel)
628 : SelectionDAGISelLegacy(
629 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
630 };
631}
632
633char X86DAGToDAGISelLegacy::ID = 0;
634
635INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
636
637// Returns true if this masked compare can be implemented legally with this
638// type.
639static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
640 unsigned Opcode = N->getOpcode();
641 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
642 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
643 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
644 // We can get 256-bit 8 element types here without VLX being enabled. When
645 // this happens we will use 512-bit operations and the mask will not be
646 // zero extended.
647 EVT OpVT = N->getOperand(0).getValueType();
648 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
649 // second operand.
650 if (Opcode == X86ISD::STRICT_CMPM)
651 OpVT = N->getOperand(1).getValueType();
652 if (OpVT.is256BitVector() || OpVT.is128BitVector())
653 return Subtarget->hasVLX();
654
655 return true;
656 }
657 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
658 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
659 Opcode == X86ISD::FSETCCM_SAE)
660 return true;
661
662 return false;
663}
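// Example (editorial sketch): a v8i32 X86ISD::CMPM yields a k-register whose
// upper bits are known zero only when VLX provides the native 256-bit
// compare; without VLX the operation is widened to 512 bits and the extra
// mask bits are not guaranteed to be zero, which is why 128/256-bit inputs
// return Subtarget->hasVLX() above.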
664
665// Returns true if we can assume the writer of the mask has zero extended it
666// for us.
667bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
668 // If this is an AND, check if we have a compare on either side. As long as
669 // one side guarantees the mask is zero extended, the AND will preserve those
670 // zeros.
671 if (N->getOpcode() == ISD::AND)
672 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
673 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
674
675 return isLegalMaskCompare(N, Subtarget);
676}
677
678bool
679X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
680 if (OptLevel == CodeGenOptLevel::None)
681 return false;
682
683 if (!N.hasOneUse())
684 return false;
685
686 if (N.getOpcode() != ISD::LOAD)
687 return true;
688
689 // Don't fold non-temporal loads if we have an instruction for them.
690 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
691 return false;
692
693 // If N is a load, do additional profitability checks.
694 if (U == Root) {
695 switch (U->getOpcode()) {
696 default: break;
697 case X86ISD::ADD:
698 case X86ISD::ADC:
699 case X86ISD::SUB:
700 case X86ISD::SBB:
701 case X86ISD::AND:
702 case X86ISD::XOR:
703 case X86ISD::OR:
704 case ISD::ADD:
705 case ISD::UADDO_CARRY:
706 case ISD::AND:
707 case ISD::OR:
708 case ISD::XOR: {
709 SDValue Op1 = U->getOperand(1);
710
711 // If the other operand is an 8-bit immediate we should fold the immediate
712 // instead. This reduces code size.
713 // e.g.
714 // movl 4(%esp), %eax
715 // addl $4, %eax
716 // vs.
717 // movl $4, %eax
718 // addl 4(%esp), %eax
719 // The former is 2 bytes shorter. In the case where the increment is 1,
720 // the saving can be 4 bytes (by using incl %eax).
721 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
722 if (Imm->getAPIntValue().isSignedIntN(8))
723 return false;
724
725 // If this is a 64-bit AND with an immediate that fits in 32-bits,
726 // prefer using the smaller and over folding the load. This is needed to
727 // make sure immediates created by shrinkAndImmediate are always folded.
728 // Ideally we would narrow the load during DAG combine and get the
729 // best of both worlds.
730 if (U->getOpcode() == ISD::AND &&
731 Imm->getAPIntValue().getBitWidth() == 64 &&
732 Imm->getAPIntValue().isIntN(32))
733 return false;
734
735 // If this is really a zext_inreg that can be represented with a movzx
736 // instruction, prefer that.
737 // TODO: We could shrink the load and fold if it is non-volatile.
738 if (U->getOpcode() == ISD::AND &&
739 (Imm->getAPIntValue() == UINT8_MAX ||
740 Imm->getAPIntValue() == UINT16_MAX ||
741 Imm->getAPIntValue() == UINT32_MAX))
742 return false;
743
744 // ADD/SUB can negate the immediate and use the opposite operation
745 // to fit 128 into a sign-extended 8-bit immediate.
746 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
747 (-Imm->getAPIntValue()).isSignedIntN(8))
748 return false;
749
750 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
751 (-Imm->getAPIntValue()).isSignedIntN(8) &&
752 hasNoCarryFlagUses(SDValue(U, 1)))
753 return false;
754 }
755
756 // If the other operand is a TLS address, we should fold it instead.
757 // This produces
758 // movl %gs:0, %eax
759 // leal i@NTPOFF(%eax), %eax
760 // instead of
761 // movl $i@NTPOFF, %eax
762 // addl %gs:0, %eax
763 // if the block also has an access to a second TLS address this will save
764 // a load.
765 // FIXME: This is probably also true for non-TLS addresses.
766 if (Op1.getOpcode() == X86ISD::Wrapper) {
767 SDValue Val = Op1.getOperand(0);
768 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
769 return false;
770 }
771
772 // Don't fold load if this matches the BTS/BTR/BTC patterns.
773 // BTS: (or X, (shl 1, n))
774 // BTR: (and X, (rotl -2, n))
775 // BTC: (xor X, (shl 1, n))
776 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
777 if (U->getOperand(0).getOpcode() == ISD::SHL &&
778 isOneConstant(U->getOperand(0).getOperand(0)))
779 return false;
780
781 if (U->getOperand(1).getOpcode() == ISD::SHL &&
782 isOneConstant(U->getOperand(1).getOperand(0)))
783 return false;
784 }
785 if (U->getOpcode() == ISD::AND) {
786 SDValue U0 = U->getOperand(0);
787 SDValue U1 = U->getOperand(1);
788 if (U0.getOpcode() == ISD::ROTL) {
789 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
790 if (C && C->getSExtValue() == -2)
791 return false;
792 }
793
794 if (U1.getOpcode() == ISD::ROTL) {
795 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
796 if (C && C->getSExtValue() == -2)
797 return false;
798 }
799 }
800
801 break;
802 }
803 case ISD::SHL:
804 case ISD::SRA:
805 case ISD::SRL:
806 // Don't fold a load into a shift by immediate. The BMI2 instructions
807 // support folding a load, but not an immediate. The legacy instructions
808 // support folding an immediate, but can't fold a load. Folding an
809 // immediate is preferable to folding a load.
810 if (isa<ConstantSDNode>(U->getOperand(1)))
811 return false;
812
813 break;
814 }
815 }
816
817 // Prevent folding a load if this can be implemented with an insert_subreg or
818 // a move that implicitly zeroes.
819 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
820 isNullConstant(Root->getOperand(2)) &&
821 (Root->getOperand(0).isUndef() ||
822 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
823 return false;
824
825 return true;
826}
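// Example (editorial sketch) for the BTS/BTR/BTC guard in the code above:
// given (or X, (shl 1, n)), declining to fold a load into the OR keeps the
// node matchable as a register-form BTS, which takes the bit index directly
// instead of requiring the shifted mask; the XOR (BTC) and AND-of-rotate
// (BTR) shapes are protected for the same reason.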
827
828// Indicates it is profitable to form an AVX512 masked operation. Returning
829 // false will favor a masked register-register move or vblendm and the
830// operation will be selected separately.
831bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
832 assert(
833 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
834 "Unexpected opcode!");
835
836 // If the operation has additional users, the operation will be duplicated.
837 // Check the use count to prevent that.
838 // FIXME: Are there cheap opcodes we might want to duplicate?
839 return N->getOperand(1).hasOneUse();
840}
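// Example (editorial sketch): for (vselect M, (add X, Y), X), returning true
// lets isel emit a single masked VPADD; if the add had other users it would
// be duplicated, so returning false instead selects the add once and merges
// the result with a blend or masked move.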
841
842/// Replace the original chain operand of the call with
843/// load's chain operand and move load below the call's chain operand.
844static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
845 SDValue Call, SDValue OrigChain) {
846 SmallVector<SDValue, 8> Ops;
847 SDValue Chain = OrigChain.getOperand(0);
848 if (Chain.getNode() == Load.getNode())
849 Ops.push_back(Load.getOperand(0));
850 else {
851 assert(Chain.getOpcode() == ISD::TokenFactor &&
852 "Unexpected chain operand");
853 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
854 if (Chain.getOperand(i).getNode() == Load.getNode())
855 Ops.push_back(Load.getOperand(0));
856 else
857 Ops.push_back(Chain.getOperand(i));
858 SDValue NewChain =
859 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
860 Ops.clear();
861 Ops.push_back(NewChain);
862 }
863 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
864 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
865 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
866 Load.getOperand(1), Load.getOperand(2));
867
868 Ops.clear();
869 Ops.push_back(SDValue(Load.getNode(), 1));
870 Ops.append(Call->op_begin() + 1, Call->op_end());
871 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
872}
873
874/// Return true if call address is a load and it can be
875/// moved below CALLSEQ_START and the chains leading up to the call.
876/// Return the CALLSEQ_START by reference as a second output.
877/// In the case of a tail call, there isn't a callseq node between the call
878/// chain and the load.
879static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
880 // The transformation is somewhat dangerous if the call's chain was glued to
881 // the call. After MoveBelowOrigChain the load is moved between the call and
882 // the chain, this can create a cycle if the load is not folded. So it is
883 // *really* important that we are sure the load will be folded.
884 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
885 return false;
886 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
887 if (!LD ||
888 !LD->isSimple() ||
889 LD->getAddressingMode() != ISD::UNINDEXED ||
890 LD->getExtensionType() != ISD::NON_EXTLOAD)
891 return false;
892
893 // Now let's find the callseq_start.
894 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
895 if (!Chain.hasOneUse())
896 return false;
897 Chain = Chain.getOperand(0);
898 }
899
900 if (!Chain.getNumOperands())
901 return false;
902 // Since we are not checking for AA here, conservatively abort if the chain
903 // writes to memory. It's not safe to move the callee (a load) across a store.
904 if (isa<MemSDNode>(Chain.getNode()) &&
905 cast<MemSDNode>(Chain.getNode())->writeMem())
906 return false;
907 if (Chain.getOperand(0).getNode() == Callee.getNode())
908 return true;
909 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
910 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
911 Callee.getValue(1).hasOneUse())
912 return true;
913 return false;
914}
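// Illustrative goal of isCalleeLoad/moveBelowOrigChain (editorial sketch):
// when the callee address is a plain load, moving the load below
// CALLSEQ_START lets isel fold it into the call, turning
//   movq fnptr(%rip), %rax
//   callq *%rax
// into
//   callq *fnptr(%rip)
// which is why the load must be simple, unindexed, non-extending and
// single-use before the chains are rewritten.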
915
916static bool isEndbrImm64(uint64_t Imm) {
917 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
918 // e.g.: 0xF3660F1EFA, 0xF3670F1EFA
919 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
920 return false;
921
922 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
923 0x65, 0x66, 0x67, 0xf0, 0xf2};
924 int i = 24; // the low 24 bits (0x0F1EFA) have already matched
925 while (i < 64) {
926 uint8_t Byte = (Imm >> i) & 0xFF;
927 if (Byte == 0xF3)
928 return true;
929 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
930 return false;
931 i += 8;
932 }
933
934 return false;
935}
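// Example inputs (editorial sketch): 0xF3660F1EFA and 0xF366360F1EFA are
// accepted because 0x66/0x36 are listed optional prefixes and a 0xF3 byte is
// eventually found above the low 0x0F1EFA pattern, while 0xF3990F1EFA is
// rejected as soon as the non-prefix byte 0x99 is encountered.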
936
937static bool needBWI(MVT VT) {
938 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
939}
940
941void X86DAGToDAGISel::PreprocessISelDAG() {
942 bool MadeChange = false;
943 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
944 E = CurDAG->allnodes_end(); I != E; ) {
945 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
946
947 // This is for CET enhancement.
948 //
949 // ENDBR32 and ENDBR64 have specific opcodes:
950 // ENDBR32: F3 0F 1E FB
951 // ENDBR64: F3 0F 1E FA
952 // We want to ensure that attackers cannot find unintended ENDBR32/64
953 // opcode matches in the binary.
954 // Here's an example:
955 // If the compiler had to generate asm for the following code:
956 // a = 0xF30F1EFA
957 // it could, for example, generate:
958 // mov 0xF30F1EFA, dword ptr[a]
959 // In such a case, the binary would include a gadget that starts
960 // with a fake ENDBR64 opcode. Therefore, we split such generation
961 // into multiple operations so that the pattern does not show up in the binary.
962 if (N->getOpcode() == ISD::Constant) {
963 MVT VT = N->getSimpleValueType(0);
964 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
965 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
966 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
967 // Check that the cf-protection-branch is enabled.
968 Metadata *CFProtectionBranch =
970 "cf-protection-branch");
971 if (CFProtectionBranch || IndirectBranchTracking) {
972 SDLoc dl(N);
973 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
974 Complement = CurDAG->getNOT(dl, Complement, VT);
975 --I;
976 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
977 ++I;
978 MadeChange = true;
979 continue;
980 }
981 }
982 }
983
984 // If this is a target specific AND node with no flag usages, turn it back
985 // into ISD::AND to enable test instruction matching.
986 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
987 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
988 N->getOperand(0), N->getOperand(1));
989 --I;
990 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
991 ++I;
992 MadeChange = true;
993 continue;
994 }
995
996 // Convert vector increment or decrement to sub/add with an all-ones
997 // constant:
998 // add X, <1, 1...> --> sub X, <-1, -1...>
999 // sub X, <1, 1...> --> add X, <-1, -1...>
1000 // The all-ones vector constant can be materialized using a pcmpeq
1001 // instruction that is commonly recognized as an idiom (has no register
1002 // dependency), so that's better/smaller than loading a splat 1 constant.
1003 //
1004 // But don't do this if it would inhibit a potentially profitable load
1005 // folding opportunity for the other operand. That only occurs with the
1006 // intersection of:
1007 // (1) The other operand (op0) is load foldable.
1008 // (2) The op is an add (otherwise, we are *creating* an add and can still
1009 // load fold the other op).
1010 // (3) The target has AVX (otherwise, we have a destructive add and can't
1011 // load fold the other op without killing the constant op).
1012 // (4) The constant 1 vector has multiple uses (so it is profitable to load
1013 // into a register anyway).
1014 auto mayPreventLoadFold = [&]() {
1015 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1016 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1017 !N->getOperand(1).hasOneUse();
1018 };
1019 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1020 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1021 APInt SplatVal;
1022 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1023 SplatVal.isOne()) {
1024 SDLoc DL(N);
1025
1026 MVT VT = N->getSimpleValueType(0);
1027 unsigned NumElts = VT.getSizeInBits() / 32;
1028 SDValue AllOnes =
1029 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1030 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1031
1032 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1033 SDValue Res =
1034 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1035 --I;
1036 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1037 ++I;
1038 MadeChange = true;
1039 continue;
1040 }
1041 }
1042
1043 switch (N->getOpcode()) {
1044 case X86ISD::VBROADCAST: {
1045 MVT VT = N->getSimpleValueType(0);
1046 // Emulate v32i16/v64i8 broadcast without BWI.
1047 if (!Subtarget->hasBWI() && needBWI(VT)) {
1048 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1049 SDLoc dl(N);
1050 SDValue NarrowBCast =
1051 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1052 SDValue Res =
1053 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1054 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1055 unsigned Index = NarrowVT.getVectorMinNumElements();
1056 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1057 CurDAG->getIntPtrConstant(Index, dl));
1058
1059 --I;
1060 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1061 ++I;
1062 MadeChange = true;
1063 continue;
1064 }
1065
1066 break;
1067 }
1068 case X86ISD::VBROADCAST_LOAD: {
1069 MVT VT = N->getSimpleValueType(0);
1070 // Emulate v32i16/v64i8 broadcast without BWI.
1071 if (!Subtarget->hasBWI() && needBWI(VT)) {
1072 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1073 auto *MemNode = cast<MemSDNode>(N);
1074 SDLoc dl(N);
1075 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1076 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1077 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1078 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1079 MemNode->getMemOperand());
1080 SDValue Res =
1081 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1082 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1083 unsigned Index = NarrowVT.getVectorMinNumElements();
1084 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1085 CurDAG->getIntPtrConstant(Index, dl));
1086
1087 --I;
1088 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1089 CurDAG->ReplaceAllUsesWith(N, To);
1090 ++I;
1091 MadeChange = true;
1092 continue;
1093 }
1094
1095 break;
1096 }
1097 case ISD::LOAD: {
1098 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1099 // load, then just extract the lower subvector and avoid the second load.
1100 auto *Ld = cast<LoadSDNode>(N);
1101 MVT VT = N->getSimpleValueType(0);
1102 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1103 !(VT.is128BitVector() || VT.is256BitVector()))
1104 break;
1105
1106 MVT MaxVT = VT;
1107 SDNode *MaxLd = nullptr;
1108 SDValue Ptr = Ld->getBasePtr();
1109 SDValue Chain = Ld->getChain();
1110 for (SDNode *User : Ptr->users()) {
1111 auto *UserLd = dyn_cast<LoadSDNode>(User);
1112 MVT UserVT = User->getSimpleValueType(0);
1113 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1114 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1115 !User->hasAnyUseOfValue(1) &&
1116 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1117 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1118 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1119 MaxLd = User;
1120 MaxVT = UserVT;
1121 }
1122 }
1123 if (MaxLd) {
1124 SDLoc dl(N);
1125 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1126 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1127 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1128 SDValue(MaxLd, 0),
1129 CurDAG->getIntPtrConstant(0, dl));
1130 SDValue Res = CurDAG->getBitcast(VT, Extract);
1131
1132 --I;
1133 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1134 CurDAG->ReplaceAllUsesWith(N, To);
1135 ++I;
1136 MadeChange = true;
1137 continue;
1138 }
1139 break;
1140 }
1141 case ISD::VSELECT: {
1142 // Replace VSELECT nodes with non-mask conditions with BLENDV/VPTERNLOG.
1143 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1144 if (EleVT == MVT::i1)
1145 break;
1146
1147 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1148 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1149 "We can't replace VSELECT with BLENDV in vXi16!");
1150 SDValue R;
1151 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1152 EleVT.getSizeInBits()) {
1153 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1154 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1155 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1156 } else {
1157 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1158 N->getOperand(0), N->getOperand(1),
1159 N->getOperand(2));
1160 }
1161 --I;
1162 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1163 ++I;
1164 MadeChange = true;
1165 continue;
1166 }
1167 case ISD::FP_ROUND:
1168 case ISD::STRICT_FP_ROUND:
1169 case ISD::FP_TO_SINT:
1170 case ISD::FP_TO_UINT:
1171 case ISD::STRICT_FP_TO_SINT:
1172 case ISD::STRICT_FP_TO_UINT: {
1173 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1174 // don't need 2 sets of patterns.
1175 if (!N->getSimpleValueType(0).isVector())
1176 break;
1177
1178 unsigned NewOpc;
1179 switch (N->getOpcode()) {
1180 default: llvm_unreachable("Unexpected opcode!");
1181 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1182 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1183 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1184 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1185 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1186 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1187 }
1188 SDValue Res;
1189 if (N->isStrictFPOpcode())
1190 Res =
1191 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1192 {N->getOperand(0), N->getOperand(1)});
1193 else
1194 Res =
1195 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1196 N->getOperand(0));
1197 --I;
1198 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1199 ++I;
1200 MadeChange = true;
1201 continue;
1202 }
1203 case ISD::SHL:
1204 case ISD::SRA:
1205 case ISD::SRL: {
1206 // Replace vector shifts with their X86 specific equivalent so we don't
1207 // need 2 sets of patterns.
1208 if (!N->getValueType(0).isVector())
1209 break;
1210
1211 unsigned NewOpc;
1212 switch (N->getOpcode()) {
1213 default: llvm_unreachable("Unexpected opcode!");
1214 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1215 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1216 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1217 }
1218 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1219 N->getOperand(0), N->getOperand(1));
1220 --I;
1221 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1222 ++I;
1223 MadeChange = true;
1224 continue;
1225 }
1226 case ISD::ANY_EXTEND:
1227 case ISD::ANY_EXTEND_VECTOR_INREG: {
1228 // Replace vector any extend with the zero extend equivalents so we don't
1229 // need 2 sets of patterns. Ignore vXi1 extensions.
1230 if (!N->getValueType(0).isVector())
1231 break;
1232
1233 unsigned NewOpc;
1234 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1235 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1236 "Unexpected opcode for mask vector!");
1237 NewOpc = ISD::SIGN_EXTEND;
1238 } else {
1239 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1240 ? ISD::ZERO_EXTEND
1241 : ISD::ZERO_EXTEND_VECTOR_INREG;
1242 }
1243
1244 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1245 N->getOperand(0));
1246 --I;
1247 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1248 ++I;
1249 MadeChange = true;
1250 continue;
1251 }
1252 case ISD::FCEIL:
1253 case ISD::STRICT_FCEIL:
1254 case ISD::FFLOOR:
1255 case ISD::STRICT_FFLOOR:
1256 case ISD::FTRUNC:
1257 case ISD::STRICT_FTRUNC:
1258 case ISD::FROUNDEVEN:
1259 case ISD::STRICT_FROUNDEVEN:
1260 case ISD::FNEARBYINT:
1261 case ISD::STRICT_FNEARBYINT:
1262 case ISD::FRINT:
1263 case ISD::STRICT_FRINT: {
1264 // Replace fp rounding with their X86 specific equivalent so we don't
1265 // need 2 sets of patterns.
1266 unsigned Imm;
1267 switch (N->getOpcode()) {
1268 default: llvm_unreachable("Unexpected opcode!");
1269 case ISD::STRICT_FCEIL:
1270 case ISD::FCEIL: Imm = 0xA; break;
1271 case ISD::STRICT_FFLOOR:
1272 case ISD::FFLOOR: Imm = 0x9; break;
1273 case ISD::STRICT_FTRUNC:
1274 case ISD::FTRUNC: Imm = 0xB; break;
1275 case ISD::STRICT_FROUNDEVEN:
1276 case ISD::FROUNDEVEN: Imm = 0x8; break;
1277 case ISD::STRICT_FNEARBYINT:
1278 case ISD::FNEARBYINT: Imm = 0xC; break;
1279 case ISD::STRICT_FRINT:
1280 case ISD::FRINT: Imm = 0x4; break;
1281 }
1282 SDLoc dl(N);
1283 bool IsStrict = N->isStrictFPOpcode();
1284 SDValue Res;
1285 if (IsStrict)
1286 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1287 {N->getValueType(0), MVT::Other},
1288 {N->getOperand(0), N->getOperand(1),
1289 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1290 else
1291 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1292 N->getOperand(0),
1293 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1294 --I;
1295 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1296 ++I;
1297 MadeChange = true;
1298 continue;
1299 }
1300 case X86ISD::FANDN:
1301 case X86ISD::FAND:
1302 case X86ISD::FOR:
1303 case X86ISD::FXOR: {
1304 // Widen scalar fp logic ops to vector to reduce isel patterns.
1305 // FIXME: Can we do this during lowering/combine?
1306 MVT VT = N->getSimpleValueType(0);
1307 if (VT.isVector() || VT == MVT::f128)
1308 break;
1309
1310 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1311 : VT == MVT::f32 ? MVT::v4f32
1312 : MVT::v8f16;
1313
1314 SDLoc dl(N);
1315 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1316 N->getOperand(0));
1317 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1318 N->getOperand(1));
1319
1320 SDValue Res;
1321 if (Subtarget->hasSSE2()) {
1322 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1323 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1324 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1325 unsigned Opc;
1326 switch (N->getOpcode()) {
1327 default: llvm_unreachable("Unexpected opcode!");
1328 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1329 case X86ISD::FAND: Opc = ISD::AND; break;
1330 case X86ISD::FOR: Opc = ISD::OR; break;
1331 case X86ISD::FXOR: Opc = ISD::XOR; break;
1332 }
1333 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1334 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1335 } else {
1336 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1337 }
1338 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1339 CurDAG->getIntPtrConstant(0, dl));
1340 --I;
1341 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1342 ++I;
1343 MadeChange = true;
1344 continue;
1345 }
1346 }
1347
1348 if (OptLevel != CodeGenOptLevel::None &&
1349 // Only do this when the target can fold the load into the call or
1350 // jmp.
1351 !Subtarget->useIndirectThunkCalls() &&
1352 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1353 (N->getOpcode() == X86ISD::TC_RETURN &&
1354 (Subtarget->is64Bit() ||
1355 !getTargetMachine().isPositionIndependent())))) {
1356 /// Also try moving call address load from outside callseq_start to just
1357 /// before the call to allow it to be folded.
1358 ///
1359 /// [Load chain]
1360 /// ^
1361 /// |
1362 /// [Load]
1363 /// ^ ^
1364 /// | |
1365 /// / \--
1366 /// / |
1367 ///[CALLSEQ_START] |
1368 /// ^ |
1369 /// | |
1370 /// [LOAD/C2Reg] |
1371 /// | |
1372 /// \ /
1373 /// \ /
1374 /// [CALL]
1375 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1376 SDValue Chain = N->getOperand(0);
1377 SDValue Load = N->getOperand(1);
1378 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1379 continue;
1380 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1381 ++NumLoadMoved;
1382 MadeChange = true;
1383 continue;
1384 }
1385
1386 // Lower fpround and fpextend nodes that target the FP stack to be a store and
1387 // load to the stack. This is a gross hack. We would like to simply mark
1388 // these as being illegal, but when we do that, legalize produces these when
1389 // it expands calls, then expands these in the same legalize pass. We would
1390 // like dag combine to be able to hack on these between the call expansion
1391 // and the node legalization. As such this pass basically does "really
1392 // late" legalization of these inline with the X86 isel pass.
1393 // FIXME: This should only happen when not compiled with -O0.
1394 switch (N->getOpcode()) {
1395 default: continue;
1396 case ISD::FP_ROUND:
1397 case ISD::FP_EXTEND:
1398 {
1399 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1400 MVT DstVT = N->getSimpleValueType(0);
1401
1402 // If any of the sources are vectors, no fp stack involved.
1403 if (SrcVT.isVector() || DstVT.isVector())
1404 continue;
1405
1406 // If the source and destination are SSE registers, then this is a legal
1407 // conversion that should not be lowered.
1408 const X86TargetLowering *X86Lowering =
1409 static_cast<const X86TargetLowering *>(TLI);
1410 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1411 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1412 if (SrcIsSSE && DstIsSSE)
1413 continue;
1414
1415 if (!SrcIsSSE && !DstIsSSE) {
1416 // If this is an FPStack extension, it is a noop.
1417 if (N->getOpcode() == ISD::FP_EXTEND)
1418 continue;
1419 // If this is a value-preserving FPStack truncation, it is a noop.
1420 if (N->getConstantOperandVal(1))
1421 continue;
1422 }
1423
1424 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1425 // FPStack has extload and truncstore. SSE can fold direct loads into other
1426 // operations. Based on this, decide what we want to do.
1427 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1428 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1429 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1430 MachinePointerInfo MPI =
1431 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1432 SDLoc dl(N);
1433
1434 // FIXME: optimize the case where the src/dest is a load or store?
1435
1436 SDValue Store = CurDAG->getTruncStore(
1437 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1438 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1439 MemTmp, MPI, MemVT);
1440
1441 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1442 // extload we created. This will cause general havok on the dag because
1443 // anything below the conversion could be folded into other existing nodes.
1444 // To avoid invalidating 'I', back it up to the convert node.
1445 --I;
1446 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1447 break;
1448 }
1449
1450 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1451 // dealing with the chain differently, as there is already a preexisting chain.
1452 case ISD::STRICT_FP_ROUND:
1453 case ISD::STRICT_FP_EXTEND:
1454 {
1455 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1456 MVT DstVT = N->getSimpleValueType(0);
1457
1458 // If any of the sources are vectors, no fp stack involved.
1459 if (SrcVT.isVector() || DstVT.isVector())
1460 continue;
1461
1462 // If the source and destination are SSE registers, then this is a legal
1463 // conversion that should not be lowered.
1464 const X86TargetLowering *X86Lowering =
1465 static_cast<const X86TargetLowering *>(TLI);
1466 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1467 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1468 if (SrcIsSSE && DstIsSSE)
1469 continue;
1470
1471 if (!SrcIsSSE && !DstIsSSE) {
1472 // If this is an FPStack extension, it is a noop.
1473 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1474 continue;
1475 // If this is a value-preserving FPStack truncation, it is a noop.
1476 if (N->getConstantOperandVal(2))
1477 continue;
1478 }
1479
1480 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1481 // FPStack has extload and truncstore. SSE can fold direct loads into other
1482 // operations. Based on this, decide what we want to do.
1483 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1484 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1485 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1486 MachinePointerInfo MPI =
1487 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1488 SDLoc dl(N);
1489
1490 // FIXME: optimize the case where the src/dest is a load or store?
1491
1492 // Since the operation is StrictFP, use the preexisting chain.
1493 SDValue Store, Result;
1494 if (!SrcIsSSE) {
1495 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1496 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1497 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1498 MPI, /*Align*/ std::nullopt,
1499 MachineMemOperand::MOStore);
1500 if (N->getFlags().hasNoFPExcept()) {
1501 SDNodeFlags Flags = Store->getFlags();
1502 Flags.setNoFPExcept(true);
1503 Store->setFlags(Flags);
1504 }
1505 } else {
1506 assert(SrcVT == MemVT && "Unexpected VT!");
1507 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1508 MPI);
1509 }
1510
1511 if (!DstIsSSE) {
1512 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1513 SDValue Ops[] = {Store, MemTmp};
1514 Result = CurDAG->getMemIntrinsicNode(
1515 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1516 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1517 if (N->getFlags().hasNoFPExcept()) {
1518 SDNodeFlags Flags = Result->getFlags();
1519 Flags.setNoFPExcept(true);
1520 Result->setFlags(Flags);
1521 }
1522 } else {
1523 assert(DstVT == MemVT && "Unexpected VT!");
1524 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1525 }
1526
1527 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1528 // extload we created. This will cause general havok on the dag because
1529 // anything below the conversion could be folded into other existing nodes.
1530 // To avoid invalidating 'I', back it up to the convert node.
1531 --I;
1532 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1533 break;
1534 }
1535 }
1536
1537
1538 // Now that we did that, the node is dead. Increment the iterator to the
1539 // next node to process, then delete N.
1540 ++I;
1541 MadeChange = true;
1542 }
1543
1544 // Remove any dead nodes that may have been left behind.
1545 if (MadeChange)
1546 CurDAG->RemoveDeadNodes();
1547}
1548
1549// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1550bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1551 unsigned Opc = N->getMachineOpcode();
1552 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1553 Opc != X86::MOVSX64rr8)
1554 return false;
1555
1556 SDValue N0 = N->getOperand(0);
1557
1558 // We need to be extracting the lower bit of an extend.
1559 if (!N0.isMachineOpcode() ||
1560 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1561 N0.getConstantOperandVal(1) != X86::sub_8bit)
1562 return false;
1563
1564 // We're looking for either a movsx or movzx to match the original opcode.
1565 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1566 : X86::MOVSX32rr8_NOREX;
1567 SDValue N00 = N0.getOperand(0);
1568 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1569 return false;
1570
1571 if (Opc == X86::MOVSX64rr8) {
1572 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1573 // to 64.
1574 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1575 MVT::i64, N00);
1576 ReplaceUses(N, Extend);
1577 } else {
1578 // Ok we can drop this extend and just use the original extend.
1579 ReplaceUses(N, N00.getNode());
1580 }
1581
1582 return true;
1583}
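// Example of the redundancy this peephole removes (editorial sketch): after
// an 8-bit divrem the AH result is typically extracted as
//   %t = MOVZX32rr8_NOREX ...           ; zero-extend of the 8-bit value
//   %u = EXTRACT_SUBREG %t, sub_8bit
//   %v = MOVZX32rr8 %u                  ; redundant re-extend
// so uses of %v can be rewired to %t directly, with an extra MOVSX64rr32 only
// for the 8-to-64-bit sign-extend variant.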
1584
1585void X86DAGToDAGISel::PostprocessISelDAG() {
1586 // Skip peepholes at -O0.
1587 if (TM.getOptLevel() == CodeGenOptLevel::None)
1588 return;
1589
1590 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1591
1592 bool MadeChange = false;
1593 while (Position != CurDAG->allnodes_begin()) {
1594 SDNode *N = &*--Position;
1595 // Skip dead nodes and any non-machine opcodes.
1596 if (N->use_empty() || !N->isMachineOpcode())
1597 continue;
1598
1599 if (tryOptimizeRem8Extend(N)) {
1600 MadeChange = true;
1601 continue;
1602 }
1603
1604 unsigned Opc = N->getMachineOpcode();
1605 switch (Opc) {
1606 default:
1607 continue;
1608 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1609 case X86::TEST8rr:
1610 case X86::TEST16rr:
1611 case X86::TEST32rr:
1612 case X86::TEST64rr:
1613 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1614 case X86::CTEST8rr:
1615 case X86::CTEST16rr:
1616 case X86::CTEST32rr:
1617 case X86::CTEST64rr: {
1618 auto &Op0 = N->getOperand(0);
1619 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1620 !Op0.isMachineOpcode())
1621 continue;
1622 SDValue And = N->getOperand(0);
1623#define CASE_ND(OP) \
1624 case X86::OP: \
1625 case X86::OP##_ND:
1626 switch (And.getMachineOpcode()) {
1627 default:
1628 continue;
1629 CASE_ND(AND8rr)
1630 CASE_ND(AND16rr)
1631 CASE_ND(AND32rr)
1632 CASE_ND(AND64rr) {
1633 if (And->hasAnyUseOfValue(1))
1634 continue;
1635 SmallVector<SDValue> Ops(N->op_values());
1636 Ops[0] = And.getOperand(0);
1637 Ops[1] = And.getOperand(1);
1638 MachineSDNode *Test =
1639 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1640 ReplaceUses(N, Test);
1641 MadeChange = true;
1642 continue;
1643 }
1644 CASE_ND(AND8rm)
1645 CASE_ND(AND16rm)
1646 CASE_ND(AND32rm)
1647 CASE_ND(AND64rm) {
1648 if (And->hasAnyUseOfValue(1))
1649 continue;
1650 unsigned NewOpc;
1651 bool IsCTESTCC = X86::isCTESTCC(Opc);
1652#define FROM_TO(A, B) \
1653 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1654 break;
1655 switch (And.getMachineOpcode()) {
1656 FROM_TO(AND8rm, TEST8mr);
1657 FROM_TO(AND16rm, TEST16mr);
1658 FROM_TO(AND32rm, TEST32mr);
1659 FROM_TO(AND64rm, TEST64mr);
1660 }
1661#undef FROM_TO
1662#undef CASE_ND
1663 // Need to swap the memory and register operand.
1664 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1665 And.getOperand(3), And.getOperand(4),
1666 And.getOperand(5), And.getOperand(0)};
1667 // CC, Cflags.
1668 if (IsCTESTCC) {
1669 Ops.push_back(N->getOperand(2));
1670 Ops.push_back(N->getOperand(3));
1671 }
1672 // Chain of memory load
1673 Ops.push_back(And.getOperand(6));
1674 // Glue
1675 if (IsCTESTCC)
1676 Ops.push_back(N->getOperand(4));
1677
1678 MachineSDNode *Test = CurDAG->getMachineNode(
1679 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1680 CurDAG->setNodeMemRefs(
1681 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1682 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1683 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1684 MadeChange = true;
1685 continue;
1686 }
1687 }
1688 }
1689 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1690 // used. We're doing this late so we can prefer to fold the AND into masked
1691 // comparisons. Doing that can be better for the live range of the mask
1692 // register.
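 // For instance (a hypothetical sequence): %k0 = KANDWkk %k1, %k2 feeding
 // KORTESTWkk %k0, %k0 where only ZF is read can become KTESTWkk %k1, %k2,
 // which tests (%k1 & %k2) == 0 directly.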
1693 case X86::KORTESTBkk:
1694 case X86::KORTESTWkk:
1695 case X86::KORTESTDkk:
1696 case X86::KORTESTQkk: {
1697 SDValue Op0 = N->getOperand(0);
1698 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1699 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1700 continue;
1701#define CASE(A) \
1702 case X86::A: \
1703 break;
1704 switch (Op0.getMachineOpcode()) {
1705 default:
1706 continue;
1707 CASE(KANDBkk)
1708 CASE(KANDWkk)
1709 CASE(KANDDkk)
1710 CASE(KANDQkk)
1711 }
1712 unsigned NewOpc;
1713#define FROM_TO(A, B) \
1714 case X86::A: \
1715 NewOpc = X86::B; \
1716 break;
1717 switch (Opc) {
1718 FROM_TO(KORTESTBkk, KTESTBkk)
1719 FROM_TO(KORTESTWkk, KTESTWkk)
1720 FROM_TO(KORTESTDkk, KTESTDkk)
1721 FROM_TO(KORTESTQkk, KTESTQkk)
1722 }
1723 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1724 // KAND instructions and KTEST use the same ISA feature.
1725 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1726 continue;
1727#undef FROM_TO
1728 MachineSDNode *KTest = CurDAG->getMachineNode(
1729 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1730 ReplaceUses(N, KTest);
1731 MadeChange = true;
1732 continue;
1733 }
1734 // Attempt to remove vector moves that were inserted to zero upper bits.
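 // A typical instance (illustrative): (SUBREG_TO_REG 0, (VMOVAPSrr %x),
 // sub_xmm) where %x comes from a VEX/EVEX-encoded instruction. Such
 // instructions already zero the upper bits of the wide register, so the
 // extra move adds nothing and %x can be used directly.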
1735 case TargetOpcode::SUBREG_TO_REG: {
1736 unsigned SubRegIdx = N->getConstantOperandVal(2);
1737 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1738 continue;
1739
1740 SDValue Move = N->getOperand(1);
1741 if (!Move.isMachineOpcode())
1742 continue;
1743
1744 // Make sure it's one of the move opcodes we recognize.
1745 switch (Move.getMachineOpcode()) {
1746 default:
1747 continue;
1748 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1749 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1750 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1751 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1752 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1753 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1754 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1755 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1756 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1757 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1758 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1759 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1760 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1761 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1762 }
1763#undef CASE
1764
1765 SDValue In = Move.getOperand(0);
1766 if (!In.isMachineOpcode() ||
1767 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1768 continue;
1769
1770 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1771 // the SHA instructions, which use a legacy encoding.
1772 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1773 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1774 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1775 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1776 continue;
1777
1778 // The producing instruction is another vector instruction, so we can drop
1779 // the move.
1780 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1781 MadeChange = true;
1782 }
1783 }
1784 }
1785
1786 if (MadeChange)
1787 CurDAG->RemoveDeadNodes();
1788}
1789
1790
1791/// Emit any code that needs to be executed only in the main function.
1792void X86DAGToDAGISel::emitSpecialCodeForMain() {
1793 if (Subtarget->isTargetCygMing()) {
1794 TargetLowering::ArgListTy Args;
1795 auto &DL = CurDAG->getDataLayout();
1796
1797 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1798 CLI.setChain(CurDAG->getRoot())
1799 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1800 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1801 std::move(Args));
1802 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1803 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1804 CurDAG->setRoot(Result.second);
1805 }
1806}
1807
1808void X86DAGToDAGISel::emitFunctionEntryCode() {
1809 // If this is main, emit special code for main.
1810 const Function &F = MF->getFunction();
1811 if (F.hasExternalLinkage() && F.getName() == "main")
1812 emitSpecialCodeForMain();
1813}
1814
1815static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1816 // We can run into an issue where a frame index or a register base
1817 // includes a displacement that, when added to the explicit displacement,
1818 // will overflow the displacement field. Assuming that the
1819 // displacement fits into a 31-bit integer (which is only slightly more
1820 // aggressive than the current fundamental assumption that it fits into
1821 // a 32-bit integer), a 31-bit disp should always be safe.
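 // For example, if the explicit displacement were allowed to use the full
 // 32-bit range, adding the frame object's own layout offset later could
 // overflow the field; capping the known part at 31 bits leaves headroom.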
1822 return isInt<31>(Val);
1823}
1824
1825bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1826 X86ISelAddressMode &AM) {
1827 // We may have already matched a displacement and the caller just added the
1828 // symbolic displacement. So we still need to do the checks even if Offset
1829 // is zero.
1830
1831 int64_t Val = AM.Disp + Offset;
1832
1833 // Cannot combine ExternalSymbol displacements with integer offsets.
1834 if (Val != 0 && (AM.ES || AM.MCSym))
1835 return true;
1836
1837 CodeModel::Model M = TM.getCodeModel();
1838 if (Subtarget->is64Bit()) {
1839 if (Val != 0 &&
1840 !X86::isOffsetSuitableForCodeModel(Val, M,
1841 AM.hasSymbolicDisplacement()))
1842 return true;
1843 // In addition to the checks required for a register base, check that
1844 // we do not try to use an unsafe Disp with a frame index.
1845 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1846 !isDispSafeForFrameIndexOrRegBase(Val))
1847 return true;
1848 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1849 // 64 bits. Instructions with 32-bit register addresses perform this zero
1850 // extension for us and we can safely ignore the high bits of Offset.
1851 // Instructions with only a 32-bit immediate address do not, though: they
1852 // sign extend instead. This means only the low 2GB of the address space
1853 // is directly addressable; we need indirect addressing for the high 2GB of
1854 // address space.
1855 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1856 // implicit zero extension of instructions would cover up any problem.
1857 // However, we have asserts elsewhere that get triggered if we do, so keep
1858 // the checks for now.
1859 // TODO: We would actually be able to accept these, as well as the same
1860 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1861 // to get an address size override to be emitted. However, this
1862 // pseudo-register is not part of any register class and therefore causes
1863 // MIR verification to fail.
1864 if (Subtarget->isTarget64BitILP32() &&
1865 !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
1866 !AM.hasBaseOrIndexReg())
1867 return true;
1868 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1869 // For 32-bit X86, make sure the displacement still isn't close to the
1870 // expressible limit.
1871 return true;
1872 AM.Disp = Val;
1873 return false;
1874}
1875
1876bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1877 bool AllowSegmentRegForX32) {
1878 SDValue Address = N->getOperand(1);
1879
1880 // load gs:0 -> GS segment register.
1881 // load fs:0 -> FS segment register.
1882 //
1883 // This optimization is generally valid because the GNU TLS model defines that
1884 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1885 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1886 // zero-extended to 64 bits and then added to the base address, which gives
1887 // unwanted results when the register holds a negative value.
1888 // For more information see http://people.redhat.com/drepper/tls.pdf
1889 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1890 !IndirectTlsSegRefs &&
1891 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1892 Subtarget->isTargetFuchsia())) {
1893 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1894 return true;
1895 switch (N->getPointerInfo().getAddrSpace()) {
1896 case X86AS::GS:
1897 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1898 return false;
1899 case X86AS::FS:
1900 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1901 return false;
1902 // Address space X86AS::SS is not handled here, because it is not used to
1903 // address TLS areas.
1904 }
1905 }
1906
1907 return true;
1908}
1909
1910/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1911/// mode. These wrap things that will resolve down into a symbol reference.
1912/// If no match is possible, this returns true, otherwise it returns false.
1913bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1914 // If the addressing mode already has a symbol as the displacement, we can
1915 // never match another symbol.
1916 if (AM.hasSymbolicDisplacement())
1917 return true;
1918
1919 bool IsRIPRelTLS = false;
1920 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1921 if (IsRIPRel) {
1922 SDValue Val = N.getOperand(0);
1923 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1924 IsRIPRelTLS = true;
1925 }
1926
1927 // We can't use an addressing mode in the 64-bit large code model.
1928 // Global TLS addressing is an exception. In the medium code model,
1929 // we can use such a mode when RIP wrappers are present.
1930 // That signifies access to globals that are known to be "near",
1931 // such as the GOT itself.
1932 CodeModel::Model M = TM.getCodeModel();
1933 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1934 return true;
1935
1936 // Base and index reg must be 0 in order to use %rip as base.
1937 if (IsRIPRel && AM.hasBaseOrIndexReg())
1938 return true;
1939
1940 // Make a local copy in case we can't do this fold.
1941 X86ISelAddressMode Backup = AM;
1942
1943 int64_t Offset = 0;
1944 SDValue N0 = N.getOperand(0);
1945 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1946 AM.GV = G->getGlobal();
1947 AM.SymbolFlags = G->getTargetFlags();
1948 Offset = G->getOffset();
1949 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1950 AM.CP = CP->getConstVal();
1951 AM.Alignment = CP->getAlign();
1952 AM.SymbolFlags = CP->getTargetFlags();
1953 Offset = CP->getOffset();
1954 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1955 AM.ES = S->getSymbol();
1956 AM.SymbolFlags = S->getTargetFlags();
1957 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1958 AM.MCSym = S->getMCSymbol();
1959 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1960 AM.JT = J->getIndex();
1961 AM.SymbolFlags = J->getTargetFlags();
1962 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1963 AM.BlockAddr = BA->getBlockAddress();
1964 AM.SymbolFlags = BA->getTargetFlags();
1965 Offset = BA->getOffset();
1966 } else
1967 llvm_unreachable("Unhandled symbol reference node.");
1968
1969 // Can't use an addressing mode with large globals.
1970 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1971 TM.isLargeGlobalValue(AM.GV)) {
1972 AM = Backup;
1973 return true;
1974 }
1975
1976 if (foldOffsetIntoAddress(Offset, AM)) {
1977 AM = Backup;
1978 return true;
1979 }
1980
1981 if (IsRIPRel)
1982 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1983
1984 // Commit the changes now that we know this fold is safe.
1985 return false;
1986}
1987
1988/// Add the specified node to the specified addressing mode, returning true if
1989/// it cannot be done. This just pattern matches for the addressing mode.
1990bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1991 if (matchAddressRecursively(N, AM, 0))
1992 return true;
1993
1994 // Post-processing: Make a second attempt to fold a load, if we now know
1995 // that there will not be any other register. This is only performed for
1996 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1997 // any foldable load the first time.
1998 if (Subtarget->isTarget64BitILP32() &&
1999 AM.BaseType == X86ISelAddressMode::RegBase &&
2000 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
2001 SDValue Save_Base_Reg = AM.Base_Reg;
2002 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
2003 AM.Base_Reg = SDValue();
2004 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
2005 AM.Base_Reg = Save_Base_Reg;
2006 }
2007 }
2008
2009 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
2010 // a smaller encoding and avoids a scaled-index.
2011 if (AM.Scale == 2 &&
2012 AM.BaseType == X86ISelAddressMode::RegBase &&
2013 AM.Base_Reg.getNode() == nullptr) {
2014 AM.Base_Reg = AM.IndexReg;
2015 AM.Scale = 1;
2016 }
2017
2018 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2019 // because it has a smaller encoding.
2020 if (TM.getCodeModel() != CodeModel::Large &&
2021 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2022 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2023 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2024 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2025 // However, when GV is a local function symbol and in the same section as
2026 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2027 // referencing GV+Disp generates a relocation referencing the section symbol
2028 // with an even smaller offset, which might underflow. We should bail out if
2029 // the negative offset is too close to INT32_MIN. Actually, we are more
2030 // conservative here, using a smaller magic number also used by
2031 // isOffsetSuitableForCodeModel.
2032 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2033 return true;
2034
2035 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2036 }
2037
2038 return false;
2039}
2040
2041bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2042 unsigned Depth) {
2043 // Add an artificial use to this node so that we can keep track of
2044 // it if it gets CSE'd with a different node.
2045 HandleSDNode Handle(N);
2046
2047 X86ISelAddressMode Backup = AM;
2048 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2049 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2050 return false;
2051 AM = Backup;
2052
2053 // Try again after commuting the operands.
2054 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2055 Depth + 1) &&
2056 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2057 return false;
2058 AM = Backup;
2059
2060 // If we couldn't fold both operands into the address at the same time,
2061 // see if we can just put each operand into a register and fold at least
2062 // the add.
2063 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2064 !AM.Base_Reg.getNode() &&
2065 !AM.IndexReg.getNode()) {
2066 N = Handle.getValue();
2067 AM.Base_Reg = N.getOperand(0);
2068 AM.IndexReg = N.getOperand(1);
2069 AM.Scale = 1;
2070 return false;
2071 }
2072 N = Handle.getValue();
2073 return true;
2074}
2075
2076// Insert a node into the DAG at least before the Pos node's position. This
2077// will reposition the node as needed, and will assign it a node ID that is <=
2078// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2079// IDs! The selection DAG must no longer depend on their uniqueness when this
2080// is used.
2081static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2082 if (N->getNodeId() == -1 ||
2083 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2084 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2085 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2086 // Mark Node as invalid for pruning as after this it may be a successor to a
2087 // selected node but otherwise be in the same position of Pos.
2088 // Conservatively mark it with the same -abs(Id) to assure node id
2089 // invariant is preserved.
2090 N->setNodeId(Pos->getNodeId());
2091 SelectionDAGISel::InvalidateNodeId(N.getNode());
2092 }
2093}
2094
2095// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2096 // safe. This allows us to convert the shift and AND into an h-register
2097// extract and a scaled index. Returns false if the simplification is
2098// performed.
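// Worked example (assuming a 32-bit X): with C1 == 2 the pattern is
// (X >> 6) & 0x3fc, which is rewritten to ((X >> 8) & 0xff) << 2. The
// (X >> 8) & 0xff part can then be selected as an h-register copy (e.g.
// movzbl from AH) and the << 2 becomes a scale-4 index.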
2099static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2100 uint64_t Mask,
2101 SDValue Shift, SDValue X,
2102 X86ISelAddressMode &AM) {
2103 if (Shift.getOpcode() != ISD::SRL ||
2104 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2105 !Shift.hasOneUse())
2106 return true;
2107
2108 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2109 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2110 Mask != (0xffu << ScaleLog))
2111 return true;
2112
2113 MVT XVT = X.getSimpleValueType();
2114 MVT VT = N.getSimpleValueType();
2115 SDLoc DL(N);
2116 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2117 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2118 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2119 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2120 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2121 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2122 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2123
2124 // Insert the new nodes into the topological ordering. We must do this in
2125 // a valid topological ordering as nothing is going to go back and re-sort
2126 // these nodes. We continually insert before 'N' in sequence as this is
2127 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2128 // hierarchy left to express.
2129 insertDAGNode(DAG, N, Eight);
2130 insertDAGNode(DAG, N, NewMask);
2131 insertDAGNode(DAG, N, Srl);
2132 insertDAGNode(DAG, N, And);
2133 insertDAGNode(DAG, N, Ext);
2134 insertDAGNode(DAG, N, ShlCount);
2135 insertDAGNode(DAG, N, Shl);
2136 DAG.ReplaceAllUsesWith(N, Shl);
2137 DAG.RemoveDeadNode(N.getNode());
2138 AM.IndexReg = Ext;
2139 AM.Scale = (1 << ScaleLog);
2140 return false;
2141}
2142
2143// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2144// allows us to fold the shift into this addressing mode. Returns false if the
2145// transform succeeded.
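// Worked example (illustrative): (X << 2) & 0xff00 becomes (X & 0x3fc0) << 2,
// after which the shl-by-2 folds into a scale-4 index and only the narrower
// AND remains as a separate instruction.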
2146static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2147 X86ISelAddressMode &AM) {
2148 SDValue Shift = N.getOperand(0);
2149
2150 // Use a signed mask so that shifting right will insert sign bits. These
2151 // bits will be removed when we shift the result left so it doesn't matter
2152 // what we use. This might allow a smaller immediate encoding.
2153 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2154
2155 // If we have an any_extend feeding the AND, look through it to see if there
2156 // is a shift behind it. But only if the AND doesn't use the extended bits.
2157 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2158 bool FoundAnyExtend = false;
2159 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2160 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2161 isUInt<32>(Mask)) {
2162 FoundAnyExtend = true;
2163 Shift = Shift.getOperand(0);
2164 }
2165
2166 if (Shift.getOpcode() != ISD::SHL ||
2167 !isa<ConstantSDNode>(Shift.getOperand(1)))
2168 return true;
2169
2170 SDValue X = Shift.getOperand(0);
2171
2172 // Not likely to be profitable if either the AND or SHIFT node has more
2173 // than one use (unless all uses are for address computation). Besides,
2174 // isel mechanism requires their node ids to be reused.
2175 if (!N.hasOneUse() || !Shift.hasOneUse())
2176 return true;
2177
2178 // Verify that the shift amount is something we can fold.
2179 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2180 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2181 return true;
2182
2183 MVT VT = N.getSimpleValueType();
2184 SDLoc DL(N);
2185 if (FoundAnyExtend) {
2186 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2187 insertDAGNode(DAG, N, NewX);
2188 X = NewX;
2189 }
2190
2191 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2192 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2193 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2194
2195 // Insert the new nodes into the topological ordering. We must do this in
2196 // a valid topological ordering as nothing is going to go back and re-sort
2197 // these nodes. We continually insert before 'N' in sequence as this is
2198 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2199 // hierarchy left to express.
2200 insertDAGNode(DAG, N, NewMask);
2201 insertDAGNode(DAG, N, NewAnd);
2202 insertDAGNode(DAG, N, NewShift);
2203 DAG.ReplaceAllUsesWith(N, NewShift);
2204 DAG.RemoveDeadNode(N.getNode());
2205
2206 AM.Scale = 1 << ShiftAmt;
2207 AM.IndexReg = NewAnd;
2208 return false;
2209}
2210
2211// Implement some heroics to detect shifts of masked values where the mask can
2212// be replaced by extending the shift and undoing that in the addressing mode
2213// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2214// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2215// the addressing mode. This results in code such as:
2216//
2217// int f(short *y, int *lookup_table) {
2218// ...
2219// return *y + lookup_table[*y >> 11];
2220// }
2221//
2222// Turning into:
2223// movzwl (%rdi), %eax
2224// movl %eax, %ecx
2225// shrl $11, %ecx
2226// addl (%rsi,%rcx,4), %eax
2227//
2228// Instead of:
2229// movzwl (%rdi), %eax
2230// movl %eax, %ecx
2231// shrl $9, %ecx
2232 // andl $124, %ecx
2233// addl (%rsi,%rcx), %eax
2234//
2235// Note that this function assumes the mask is provided as a mask *after* the
2236// value is shifted. The input chain may or may not match that, but computing
2237// such a mask is trivial.
2238static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2239 uint64_t Mask,
2240 SDValue Shift, SDValue X,
2241 X86ISelAddressMode &AM) {
2242 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2243 !isa<ConstantSDNode>(Shift.getOperand(1)))
2244 return true;
2245
2246 // We need to ensure that the mask is a contiguous run of bits.
2247 unsigned MaskIdx, MaskLen;
2248 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2249 return true;
2250 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2251
2252 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2253
2254 // The amount of shift we're trying to fit into the addressing mode is taken
2255 // from the shifted mask index (number of trailing zeros of the mask).
2256 unsigned AMShiftAmt = MaskIdx;
2257
2258 // There is nothing we can do here unless the mask is removing some bits.
2259 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2260 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2261
2262 // Scale the leading zero count down based on the actual size of the value.
2263 // Also scale it down based on the size of the shift.
2264 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2265 if (MaskLZ < ScaleDown)
2266 return true;
2267 MaskLZ -= ScaleDown;
2268
2269 // The final check is to ensure that any masked out high bits of X are
2270 // already known to be zero. Otherwise, the mask has a semantic impact
2271 // other than masking out a couple of low bits. Unfortunately, because of
2272 // the mask, zero extensions will be removed from operands in some cases.
2273 // This code works extra hard to look through extensions because we can
2274 // replace them with zero extensions cheaply if necessary.
2275 bool ReplacingAnyExtend = false;
2276 if (X.getOpcode() == ISD::ANY_EXTEND) {
2277 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2278 X.getOperand(0).getSimpleValueType().getSizeInBits();
2279 // Assume that we'll replace the any-extend with a zero-extend, and
2280 // narrow the search to the extended value.
2281 X = X.getOperand(0);
2282 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2283 ReplacingAnyExtend = true;
2284 }
2285 APInt MaskedHighBits =
2286 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2287 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2288 return true;
2289
2290 // We've identified a pattern that can be transformed into a single shift
2291 // and an addressing mode. Make it so.
2292 MVT VT = N.getSimpleValueType();
2293 if (ReplacingAnyExtend) {
2294 assert(X.getValueType() != VT);
2295 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2296 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2297 insertDAGNode(DAG, N, NewX);
2298 X = NewX;
2299 }
2300
2301 MVT XVT = X.getSimpleValueType();
2302 SDLoc DL(N);
2303 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2304 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2305 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2306 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2307 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2308
2309 // Insert the new nodes into the topological ordering. We must do this in
2310 // a valid topological ordering as nothing is going to go back and re-sort
2311 // these nodes. We continually insert before 'N' in sequence as this is
2312 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2313 // hierarchy left to express.
2314 insertDAGNode(DAG, N, NewSRLAmt);
2315 insertDAGNode(DAG, N, NewSRL);
2316 insertDAGNode(DAG, N, NewExt);
2317 insertDAGNode(DAG, N, NewSHLAmt);
2318 insertDAGNode(DAG, N, NewSHL);
2319 DAG.ReplaceAllUsesWith(N, NewSHL);
2320 DAG.RemoveDeadNode(N.getNode());
2321
2322 AM.Scale = 1 << AMShiftAmt;
2323 AM.IndexReg = NewExt;
2324 return false;
2325}
2326
2327// Transform "(X >> SHIFT) & (MASK << C1)" to
2328// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2329// matched to a BEXTR later. Returns false if the simplification is performed.
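// Worked example (illustrative): (X >> 4) & (0x3f << 2) becomes
// ((X >> 6) & 0x3f) << 2; the inner shift-and-mask is the shape
// matchBEXTRFromAndImm can select as BEXTR, and the << 2 is absorbed as a
// scale-4 index.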
2330static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2331 uint64_t Mask,
2332 SDValue Shift, SDValue X,
2333 X86ISelAddressMode &AM,
2334 const X86Subtarget &Subtarget) {
2335 if (Shift.getOpcode() != ISD::SRL ||
2336 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2337 !Shift.hasOneUse() || !N.hasOneUse())
2338 return true;
2339
2340 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2341 if (!Subtarget.hasTBM() &&
2342 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2343 return true;
2344
2345 // We need to ensure that the mask is a contiguous run of bits.
2346 unsigned MaskIdx, MaskLen;
2347 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2348 return true;
2349
2350 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2351
2352 // The amount of shift we're trying to fit into the addressing mode is taken
2353 // from the shifted mask index (number of trailing zeros of the mask).
2354 unsigned AMShiftAmt = MaskIdx;
2355
2356 // There is nothing we can do here unless the mask is removing some bits.
2357 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2358 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2359
2360 MVT XVT = X.getSimpleValueType();
2361 MVT VT = N.getSimpleValueType();
2362 SDLoc DL(N);
2363 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2364 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2365 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2366 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2367 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2368 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2369 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2370
2371 // Insert the new nodes into the topological ordering. We must do this in
2372 // a valid topological ordering as nothing is going to go back and re-sort
2373 // these nodes. We continually insert before 'N' in sequence as this is
2374 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2375 // hierarchy left to express.
2376 insertDAGNode(DAG, N, NewSRLAmt);
2377 insertDAGNode(DAG, N, NewSRL);
2378 insertDAGNode(DAG, N, NewMask);
2379 insertDAGNode(DAG, N, NewAnd);
2380 insertDAGNode(DAG, N, NewExt);
2381 insertDAGNode(DAG, N, NewSHLAmt);
2382 insertDAGNode(DAG, N, NewSHL);
2383 DAG.ReplaceAllUsesWith(N, NewSHL);
2384 DAG.RemoveDeadNode(N.getNode());
2385
2386 AM.Scale = 1 << AMShiftAmt;
2387 AM.IndexReg = NewExt;
2388 return false;
2389}
2390
2391// Attempt to peek further into a scaled index register, collecting additional
2392 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2393SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2394 X86ISelAddressMode &AM,
2395 unsigned Depth) {
2396 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2397 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2398 "Illegal index scale");
2399
2400 // Limit recursion.
2401 if (Depth >= SelectionDAG::MaxRecursionDepth)
2402 return N;
2403
2404 EVT VT = N.getValueType();
2405 unsigned Opc = N.getOpcode();
2406
2407 // index: add(x,c) -> index: x, disp + c
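 // For example (with AM.Scale == 4): an index of add(x, 3) keeps x as the
 // index and adds 3 * 4 == 12 to the displacement, i.e. [base + x*4 + 12].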
2408 if (CurDAG->isBaseWithConstantOffset(N)) {
2409 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2410 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2411 if (!foldOffsetIntoAddress(Offset, AM))
2412 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2413 }
2414
2415 // index: add(x,x) -> index: x, scale * 2
2416 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2417 if (AM.Scale <= 4) {
2418 AM.Scale *= 2;
2419 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2420 }
2421 }
2422
2423 // index: shl(x,i) -> index: x, scale * (1 << i)
2424 if (Opc == X86ISD::VSHLI) {
2425 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2426 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2427 if ((AM.Scale * ScaleAmt) <= 8) {
2428 AM.Scale *= ScaleAmt;
2429 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2430 }
2431 }
2432
2433 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2434 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2435 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2436 SDValue Src = N.getOperand(0);
2437 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2438 Src.hasOneUse()) {
2439 if (CurDAG->isBaseWithConstantOffset(Src)) {
2440 SDValue AddSrc = Src.getOperand(0);
2441 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2442 int64_t Offset = AddVal->getSExtValue();
2443 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2444 SDLoc DL(N);
2445 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2446 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2447 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2448 insertDAGNode(*CurDAG, N, ExtSrc);
2449 insertDAGNode(*CurDAG, N, ExtVal);
2450 insertDAGNode(*CurDAG, N, ExtAdd);
2451 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2452 CurDAG->RemoveDeadNode(N.getNode());
2453 return ExtSrc;
2454 }
2455 }
2456 }
2457 }
2458
2459 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2460 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2461 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2462 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2463 SDValue Src = N.getOperand(0);
2464 unsigned SrcOpc = Src.getOpcode();
2465 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2466 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2467 Src.hasOneUse()) {
2468 if (CurDAG->isBaseWithConstantOffset(Src)) {
2469 SDValue AddSrc = Src.getOperand(0);
2470 uint64_t Offset = Src.getConstantOperandVal(1);
2471 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2472 SDLoc DL(N);
2473 SDValue Res;
2474 // If we're also scaling, see if we can use that as well.
2475 if (AddSrc.getOpcode() == ISD::SHL &&
2476 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2477 SDValue ShVal = AddSrc.getOperand(0);
2478 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2479 APInt HiBits =
2480 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2481 uint64_t ScaleAmt = 1ULL << ShAmt;
2482 if ((AM.Scale * ScaleAmt) <= 8 &&
2483 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2484 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2485 AM.Scale *= ScaleAmt;
2486 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2487 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2488 AddSrc.getOperand(1));
2489 insertDAGNode(*CurDAG, N, ExtShVal);
2490 insertDAGNode(*CurDAG, N, ExtShift);
2491 AddSrc = ExtShift;
2492 Res = ExtShVal;
2493 }
2494 }
2495 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2496 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2497 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2498 insertDAGNode(*CurDAG, N, ExtSrc);
2499 insertDAGNode(*CurDAG, N, ExtVal);
2500 insertDAGNode(*CurDAG, N, ExtAdd);
2501 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2502 CurDAG->RemoveDeadNode(N.getNode());
2503 return Res ? Res : ExtSrc;
2504 }
2505 }
2506 }
2507 }
2508
2509 // TODO: Handle extensions, shifted masks etc.
2510 return N;
2511}
2512
2513bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2514 unsigned Depth) {
2515 LLVM_DEBUG({
2516 dbgs() << "MatchAddress: ";
2517 AM.dump(CurDAG);
2518 });
2519 // Limit recursion.
2520 if (Depth >= SelectionDAG::MaxRecursionDepth)
2521 return matchAddressBase(N, AM);
2522
2523 // If this is already a %rip relative address, we can only merge immediates
2524 // into it. Instead of handling this in every case, we handle it here.
2525 // RIP relative addressing: %rip + 32-bit displacement!
2526 if (AM.isRIPRelative()) {
2527 // FIXME: JumpTable and ExternalSymbol address currently don't like
2528 // displacements. It isn't very important, but this should be fixed for
2529 // consistency.
2530 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2531 return true;
2532
2533 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2534 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2535 return false;
2536 return true;
2537 }
2538
2539 switch (N.getOpcode()) {
2540 default: break;
2541 case ISD::LOCAL_RECOVER: {
2542 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2543 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2544 // Use the symbol and don't prefix it.
2545 AM.MCSym = ESNode->getMCSymbol();
2546 return false;
2547 }
2548 break;
2549 }
2550 case ISD::Constant: {
2551 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2552 if (!foldOffsetIntoAddress(Val, AM))
2553 return false;
2554 break;
2555 }
2556
2557 case X86ISD::Wrapper:
2558 case X86ISD::WrapperRIP:
2559 if (!matchWrapper(N, AM))
2560 return false;
2561 break;
2562
2563 case ISD::LOAD:
2564 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2565 return false;
2566 break;
2567
2568 case ISD::FrameIndex:
2569 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2570 AM.Base_Reg.getNode() == nullptr &&
2571 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
2572 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2573 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2574 return false;
2575 }
2576 break;
2577
2578 case ISD::SHL:
2579 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2580 break;
2581
2582 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2583 unsigned Val = CN->getZExtValue();
2584 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2585 // that the base operand remains free for further matching. If
2586 // the base doesn't end up getting used, a post-processing step
2587 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2588 if (Val == 1 || Val == 2 || Val == 3) {
2589 SDValue ShVal = N.getOperand(0);
2590 AM.Scale = 1 << Val;
2591 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2592 return false;
2593 }
2594 }
2595 break;
2596
2597 case ISD::SRL: {
2598 // Scale must not be used already.
2599 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2600
2601 // We only handle up to 64-bit values here as those are what matter for
2602 // addressing mode optimizations.
2603 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2604 "Unexpected value size!");
2605
2606 SDValue And = N.getOperand(0);
2607 if (And.getOpcode() != ISD::AND) break;
2608 SDValue X = And.getOperand(0);
2609
2610 // The mask used for the transform is expected to be post-shift, but we
2611 // found the shift first so just apply the shift to the mask before passing
2612 // it down.
2613 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2614 !isa<ConstantSDNode>(And.getOperand(1)))
2615 break;
2616 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2617
2618 // Try to fold the mask and shift into the scale, and return false if we
2619 // succeed.
2620 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2621 return false;
2622 break;
2623 }
2624
2625 case ISD::SMUL_LOHI:
2626 case ISD::UMUL_LOHI:
2627 // A mul_lohi where we need the low part can be folded as a plain multiply.
2628 if (N.getResNo() != 0) break;
2629 [[fallthrough]];
2630 case ISD::MUL:
2631 case X86ISD::MUL_IMM:
2632 // X*[3,5,9] -> X+X*[2,4,8]
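 // For example (illustrative): X*9 becomes base = X, index = X, scale = 8,
 // i.e. lea (%rX,%rX,8).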
2633 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2634 AM.Base_Reg.getNode() == nullptr &&
2635 AM.IndexReg.getNode() == nullptr) {
2636 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2637 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2638 CN->getZExtValue() == 9) {
2639 AM.Scale = unsigned(CN->getZExtValue())-1;
2640
2641 SDValue MulVal = N.getOperand(0);
2642 SDValue Reg;
2643
2644 // Okay, we know that we have a scale by now. However, if the scaled
2645 // value is an add of something and a constant, we can fold the
2646 // constant into the disp field here.
2647 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2648 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2649 Reg = MulVal.getOperand(0);
2650 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2651 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2652 if (foldOffsetIntoAddress(Disp, AM))
2653 Reg = N.getOperand(0);
2654 } else {
2655 Reg = N.getOperand(0);
2656 }
2657
2658 AM.IndexReg = AM.Base_Reg = Reg;
2659 return false;
2660 }
2661 }
2662 break;
2663
2664 case ISD::SUB: {
2665 // Given A-B, if A can be completely folded into the address while
2666 // leaving the index field unused, use -B as the index.
2667 // This is a win if A has multiple parts that can be folded into
2668 // the address. Also, this saves a mov if the base register has
2669 // other uses, since it avoids a two-address sub instruction; however,
2670 // it costs an additional mov if the index register has other uses.
2671
2672 // Add an artificial use to this node so that we can keep track of
2673 // it if it gets CSE'd with a different node.
2674 HandleSDNode Handle(N);
2675
2676 // Test if the LHS of the sub can be folded.
2677 X86ISelAddressMode Backup = AM;
2678 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2679 N = Handle.getValue();
2680 AM = Backup;
2681 break;
2682 }
2683 N = Handle.getValue();
2684 // Test if the index field is free for use.
2685 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2686 AM = Backup;
2687 break;
2688 }
2689
2690 int Cost = 0;
2691 SDValue RHS = N.getOperand(1);
2692 // If the RHS involves a register with multiple uses, this
2693 // transformation incurs an extra mov, due to the neg instruction
2694 // clobbering its operand.
2695 if (!RHS.getNode()->hasOneUse() ||
2696 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2697 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2698 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2699 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2700 RHS.getOperand(0).getValueType() == MVT::i32))
2701 ++Cost;
2702 // If the base is a register with multiple uses, this
2703 // transformation may save a mov.
2704 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2705 !AM.Base_Reg.getNode()->hasOneUse()) ||
2706 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2707 --Cost;
2708 // If the folded LHS was interesting, this transformation saves
2709 // address arithmetic.
2710 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2711 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2712 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2713 --Cost;
2714 // If it doesn't look like it may be an overall win, don't do it.
2715 if (Cost >= 0) {
2716 AM = Backup;
2717 break;
2718 }
2719
2720 // Ok, the transformation is legal and appears profitable. Go for it.
2721 // Negation will be emitted later to avoid creating dangling nodes if this
2722 // was an unprofitable LEA.
2723 AM.IndexReg = RHS;
2724 AM.NegateIndex = true;
2725 AM.Scale = 1;
2726 return false;
2727 }
2728
2729 case ISD::OR:
2730 case ISD::XOR:
2731 // See if we can treat the OR/XOR node as an ADD node.
2732 if (!CurDAG->isADDLike(N))
2733 break;
2734 [[fallthrough]];
2735 case ISD::ADD:
2736 if (!matchAdd(N, AM, Depth))
2737 return false;
2738 break;
2739
2740 case ISD::AND: {
2741 // Perform some heroic transforms on an and of a constant-count shift
2742 // with a constant to enable use of the scaled offset field.
2743
2744 // Scale must not be used already.
2745 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2746
2747 // We only handle up to 64-bit values here as those are what matter for
2748 // addressing mode optimizations.
2749 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2750 "Unexpected value size!");
2751
2752 if (!isa<ConstantSDNode>(N.getOperand(1)))
2753 break;
2754
2755 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2756 SDValue Shift = N.getOperand(0);
2757 SDValue X = Shift.getOperand(0);
2758
2759 uint64_t Mask = N.getConstantOperandVal(1);
2760
2761 // Try to fold the mask and shift into an extract and scale.
2762 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2763 return false;
2764
2765 // Try to fold the mask and shift directly into the scale.
2766 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2767 return false;
2768
2769 // Try to fold the mask and shift into BEXTR and scale.
2770 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2771 return false;
2772 }
2773
2774 // Try to swap the mask and shift to place shifts which can be done as
2775 // a scale on the outside of the mask.
2776 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2777 return false;
2778
2779 break;
2780 }
2781 case ISD::ZERO_EXTEND: {
2782 // Try to widen a zexted shift left to the same size as its use, so we can
2783 // match the shift as a scale factor.
2784 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2785 break;
2786
2787 SDValue Src = N.getOperand(0);
2788
2789 // See if we can match a zext(addlike(x,c)).
2790 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2791 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2792 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2793 if (Index != N) {
2794 AM.IndexReg = Index;
2795 return false;
2796 }
2797
2798 // Peek through mask: zext(and(shl(x,c1),c2))
2799 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2800 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2801 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2802 Mask = MaskC->getAPIntValue();
2803 Src = Src.getOperand(0);
2804 }
2805
2806 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2807 // Give up if the shift is not a valid scale factor [1,2,3].
2808 SDValue ShlSrc = Src.getOperand(0);
2809 SDValue ShlAmt = Src.getOperand(1);
2810 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2811 if (!ShAmtC)
2812 break;
2813 unsigned ShAmtV = ShAmtC->getZExtValue();
2814 if (ShAmtV > 3)
2815 break;
2816
2817 // The narrow shift must only shift out zero bits (it must be 'nuw').
2818 // That makes it safe to widen to the destination type.
2819 APInt HighZeros =
2820 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2821 if (!Src->getFlags().hasNoUnsignedWrap() &&
2822 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2823 break;
2824
2825 // zext (shl nuw i8 %x, C1) to i32
2826 // --> shl (zext i8 %x to i32), (zext C1)
2827 // zext (and (shl nuw i8 %x, C1), C2) to i32
2828 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2829 MVT SrcVT = ShlSrc.getSimpleValueType();
2830 MVT VT = N.getSimpleValueType();
2831 SDLoc DL(N);
2832
2833 SDValue Res = ShlSrc;
2834 if (!Mask.isAllOnes()) {
2835 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2836 insertDAGNode(*CurDAG, N, Res);
2837 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2838 insertDAGNode(*CurDAG, N, Res);
2839 }
2840 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2841 insertDAGNode(*CurDAG, N, Zext);
2842 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2843 insertDAGNode(*CurDAG, N, NewShl);
2844 CurDAG->ReplaceAllUsesWith(N, NewShl);
2845 CurDAG->RemoveDeadNode(N.getNode());
2846
2847 // Convert the shift to scale factor.
2848 AM.Scale = 1 << ShAmtV;
2849 // If matchIndexRecursively is not called here, Zext may be replaced by
2850 // other nodes but still be used later when calling a builder
2851 // method.
2852 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2853 return false;
2854 }
2855
2856 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2857 // Try to fold the mask and shift into an extract and scale.
2858 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2859 Src.getOperand(0), AM))
2860 return false;
2861
2862 // Try to fold the mask and shift directly into the scale.
2863 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2864 Src.getOperand(0), AM))
2865 return false;
2866
2867 // Try to fold the mask and shift into BEXTR and scale.
2868 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2869 Src.getOperand(0), AM, *Subtarget))
2870 return false;
2871 }
2872
2873 break;
2874 }
2875 }
2876
2877 return matchAddressBase(N, AM);
2878}
2879
2880/// Helper for MatchAddress. Add the specified node to the
2881/// specified addressing mode without any further recursion.
2882bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2883 // Is the base register already occupied?
2884 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2885 // If so, check to see if the scale index register is set.
2886 if (!AM.IndexReg.getNode()) {
2887 AM.IndexReg = N;
2888 AM.Scale = 1;
2889 return false;
2890 }
2891
2892 // Otherwise, we cannot select it.
2893 return true;
2894 }
2895
2896 // Default, generate it as a register.
2897 AM.BaseType = X86ISelAddressMode::RegBase;
2898 AM.Base_Reg = N;
2899 return false;
2900}
2901
2902bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2903 X86ISelAddressMode &AM,
2904 unsigned Depth) {
2905 LLVM_DEBUG({
2906 dbgs() << "MatchVectorAddress: ";
2907 AM.dump(CurDAG);
2908 });
2909 // Limit recursion.
2910 if (Depth >= SelectionDAG::MaxRecursionDepth)
2911 return matchAddressBase(N, AM);
2912
2913 // TODO: Support other operations.
2914 switch (N.getOpcode()) {
2915 case ISD::Constant: {
2916 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2917 if (!foldOffsetIntoAddress(Val, AM))
2918 return false;
2919 break;
2920 }
2921 case X86ISD::Wrapper:
2922 if (!matchWrapper(N, AM))
2923 return false;
2924 break;
2925 case ISD::ADD: {
2926 // Add an artificial use to this node so that we can keep track of
2927 // it if it gets CSE'd with a different node.
2928 HandleSDNode Handle(N);
2929
2930 X86ISelAddressMode Backup = AM;
2931 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2932 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2933 Depth + 1))
2934 return false;
2935 AM = Backup;
2936
2937 // Try again after commuting the operands.
2938 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2939 Depth + 1) &&
2940 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2941 Depth + 1))
2942 return false;
2943 AM = Backup;
2944
2945 N = Handle.getValue();
2946 break;
2947 }
2948 }
2949
2950 return matchAddressBase(N, AM);
2951}
2952
2953/// Helper for selectVectorAddr. Handles things that can be folded into a
2954/// gather/scatter address. The index register and scale should have already
2955/// been handled.
2956bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2957 return matchVectorAddressRecursively(N, AM, 0);
2958}
2959
2960bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2961 SDValue IndexOp, SDValue ScaleOp,
2962 SDValue &Base, SDValue &Scale,
2963 SDValue &Index, SDValue &Disp,
2964 SDValue &Segment) {
2965 X86ISelAddressMode AM;
2966 AM.Scale = ScaleOp->getAsZExtVal();
2967
2968 // Attempt to match index patterns, as long as we're not relying on implicit
2969 // sign-extension, which is performed BEFORE scale.
2970 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2971 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2972 else
2973 AM.IndexReg = IndexOp;
2974
2975 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2976 if (AddrSpace == X86AS::GS)
2977 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2978 if (AddrSpace == X86AS::FS)
2979 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2980 if (AddrSpace == X86AS::SS)
2981 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2982
2983 SDLoc DL(BasePtr);
2984 MVT VT = BasePtr.getSimpleValueType();
2985
2986 // Try to match into the base and displacement fields.
2987 if (matchVectorAddress(BasePtr, AM))
2988 return false;
2989
2990 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2991 return true;
2992}
2993
2994/// Returns true if it is able to pattern match an addressing mode.
2995/// It returns the operands which make up the maximal addressing mode it can
2996/// match by reference.
2997///
2998/// Parent is the parent node of the addr operand that is being matched. It
2999/// is always a load, store, atomic node, or null. It is only null when
3000/// checking memory operands for inline asm nodes.
3001bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
3002 SDValue &Scale, SDValue &Index,
3003 SDValue &Disp, SDValue &Segment) {
3004 X86ISelAddressMode AM;
3005
3006 if (Parent &&
3007 // These opcodes are all the nodes that have an "addr:$ptr" operand
3008 // that are not a MemSDNode, and thus don't have proper addrspace info.
3009 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3010 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3011 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3012 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3013 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3014 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3015 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3016 unsigned AddrSpace =
3017 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3018 if (AddrSpace == X86AS::GS)
3019 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3020 if (AddrSpace == X86AS::FS)
3021 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3022 if (AddrSpace == X86AS::SS)
3023 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3024 }
3025
3026 // Save the DL and VT before calling matchAddress, it can invalidate N.
3027 SDLoc DL(N);
3028 MVT VT = N.getSimpleValueType();
3029
3030 if (matchAddress(N, AM))
3031 return false;
3032
3033 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3034 return true;
3035}
3036
3037bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3038 // Cannot use 32 bit constants to reference objects in kernel/large code
3039 // model.
3040 if (TM.getCodeModel() == CodeModel::Kernel ||
3041 TM.getCodeModel() == CodeModel::Large)
3042 return false;
3043
3044 // In static codegen with small code model, we can get the address of a label
3045 // into a register with 'movl'
3046 if (N->getOpcode() != X86ISD::Wrapper)
3047 return false;
3048
3049 N = N.getOperand(0);
3050
3051 // At least GNU as does not accept 'movl' for TPOFF relocations.
3052 // FIXME: We could use 'movl' when we know we are targeting MC.
3053 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3054 return false;
3055
3056 Imm = N;
3057 // Small/medium code model can reference non-TargetGlobalAddress objects with
3058 // 32 bit constants.
3059 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3060 return TM.getCodeModel() == CodeModel::Small ||
3061 TM.getCodeModel() == CodeModel::Medium;
3062 }
3063
3064 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3065 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3066 return CR->getUnsignedMax().ult(1ull << 32);
3067
3068 return !TM.isLargeGlobalValue(GV);
3069}
3070
3071bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3072 SDValue &Index, SDValue &Disp,
3073 SDValue &Segment) {
3074 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3075 SDLoc DL(N);
3076
3077 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3078 return false;
3079
3080 EVT BaseType = Base.getValueType();
3081 unsigned SubReg;
3082 if (BaseType == MVT::i8)
3083 SubReg = X86::sub_8bit;
3084 else if (BaseType == MVT::i16)
3085 SubReg = X86::sub_16bit;
3086 else
3087 SubReg = X86::sub_32bit;
3088
3089 auto *RN = dyn_cast<RegisterSDNode>(Base);
3090 if (RN && RN->getReg() == 0)
3091 Base = CurDAG->getRegister(0, MVT::i64);
3092 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3093 BaseType == MVT::i32) &&
3094 !isa<FrameIndexSDNode>(Base)) {
3095 // Base could already be %rip, particularly in the x32 ABI.
3096 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3097 MVT::i64), 0);
3098 Base = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Base);
3099 }
3100
3101 [[maybe_unused]] EVT IndexType = Index.getValueType();
3102 RN = dyn_cast<RegisterSDNode>(Index);
3103 if (RN && RN->getReg() == 0)
3104 Index = CurDAG->getRegister(0, MVT::i64);
3105 else {
3106 assert((IndexType == BaseType) &&
3107 "Expect to be extending 8/16/32-bit registers for use in LEA");
3108 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3109 MVT::i64), 0);
3110 Index = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Index);
3111 }
3112
3113 return true;
3114}
3115
3116/// Calls SelectAddr and determines if the maximal addressing
3117/// mode it matches can be cost effectively emitted as an LEA instruction.
3118bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3119 SDValue &Base, SDValue &Scale,
3120 SDValue &Index, SDValue &Disp,
3121 SDValue &Segment) {
3122 X86ISelAddressMode AM;
3123
3124 // Save the DL and VT before calling matchAddress, it can invalidate N.
3125 SDLoc DL(N);
3126 MVT VT = N.getSimpleValueType();
3127
3128 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3129 // segments.
3130 SDValue Copy = AM.Segment;
3131 SDValue T = CurDAG->getRegister(0, MVT::i32);
3132 AM.Segment = T;
3133 if (matchAddress(N, AM))
3134 return false;
3135 assert(T == AM.Segment);
3136 AM.Segment = Copy;
3137
3138 unsigned Complexity = 0;
3139 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3140 Complexity = 1;
3141 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3142 Complexity = 4;
3143
3144 if (AM.IndexReg.getNode())
3145 Complexity++;
3146
3147 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3148 // a simple shift.
3149 if (AM.Scale > 1)
3150 Complexity++;
3151
3152 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3153 // to a LEA. This is determined with some experimentation but is by no means
3154 // optimal (especially for code size consideration). LEA is nice because of
3155 // its three-address nature. Tweak the cost function again when we can run
3156 // convertToThreeAddress() at register allocation time.
3157 if (AM.hasSymbolicDisplacement()) {
3158 // For X86-64, always use LEA to materialize RIP-relative addresses.
3159 if (Subtarget->is64Bit())
3160 Complexity = 4;
3161 else
3162 Complexity += 2;
3163 }
3164
3165 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3166 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3167 // duplicating flag-producing instructions later in the pipeline.
3168 if (N.getOpcode() == ISD::ADD) {
3169 auto isMathWithFlags = [](SDValue V) {
3170 switch (V.getOpcode()) {
3171 case X86ISD::ADD:
3172 case X86ISD::SUB:
3173 case X86ISD::ADC:
3174 case X86ISD::SBB:
3175 case X86ISD::SMUL:
3176 case X86ISD::UMUL:
3177 /* TODO: These opcodes can be added safely, but we may want to justify
3178 their inclusion for different reasons (better for reg-alloc).
3179 case X86ISD::OR:
3180 case X86ISD::XOR:
3181 case X86ISD::AND:
3182 */
3183 // Value 1 is the flag output of the node - verify it's not dead.
3184 return !SDValue(V.getNode(), 1).use_empty();
3185 default:
3186 return false;
3187 }
3188 };
3189 // TODO: We might want to factor in whether there's a load folding
3190 // opportunity for the math op that disappears with LEA.
3191 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3192 Complexity++;
3193 }
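// A hedged sketch of the intent: if N is `(X86ISD::ADD a, b) + c` and the
// EFLAGS result of the inner ADD has users, selecting the outer add as LEA
// leaves EFLAGS untouched, while selecting it as ADD would clobber the flags
// and could force the flag-producing instruction to be duplicated.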
3194
3195 if (AM.Disp)
3196 Complexity++;
3197
3198 // If it isn't worth using an LEA, reject it.
3199 if (Complexity <= 2)
3200 return false;
3201
3202 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3203 return true;
3204}
3205
3206/// This is only run on TargetGlobalTLSAddress nodes.
3207bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3208 SDValue &Scale, SDValue &Index,
3209 SDValue &Disp, SDValue &Segment) {
3210 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3211 N.getOpcode() == ISD::TargetExternalSymbol);
3212
3213 X86ISelAddressMode AM;
3214 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3215 AM.GV = GA->getGlobal();
3216 AM.Disp += GA->getOffset();
3217 AM.SymbolFlags = GA->getTargetFlags();
3218 } else {
3219 auto *SA = cast<ExternalSymbolSDNode>(N);
3220 AM.ES = SA->getSymbol();
3221 AM.SymbolFlags = SA->getTargetFlags();
3222 }
3223
3224 if (Subtarget->is32Bit()) {
3225 AM.Scale = 1;
3226 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3227 }
3228
3229 MVT VT = N.getSimpleValueType();
3230 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3231 return true;
3232}
3233
3234bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3235 // Keep track of the original value type and whether this value was
3236 // truncated. If we see a truncation from pointer type to VT that truncates
3237 // bits that are known to be zero, we can use a narrow reference.
3238 EVT VT = N.getValueType();
3239 bool WasTruncated = false;
3240 if (N.getOpcode() == ISD::TRUNCATE) {
3241 WasTruncated = true;
3242 N = N.getOperand(0);
3243 }
3244
3245 if (N.getOpcode() != X86ISD::Wrapper)
3246 return false;
3247
3248 // We can only use non-GlobalValues as immediates if they were not truncated,
3249 // as we do not have any range information. If we have a GlobalValue and the
3250 // address was not truncated, we can select it as an operand directly.
3251 unsigned Opc = N.getOperand(0)->getOpcode();
3252 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3253 Op = N.getOperand(0);
3254 // We can only select the operand directly if we didn't have to look past a
3255 // truncate.
3256 return !WasTruncated;
3257 }
3258
3259 // Check that the global's range fits into VT.
3260 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3261 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3262 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3263 return false;
3264
3265 // Okay, we can use a narrow reference.
3266 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3267 GA->getOffset(), GA->getTargetFlags());
3268 return true;
3269}
3270
3271bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3272 SDValue &Base, SDValue &Scale,
3273 SDValue &Index, SDValue &Disp,
3274 SDValue &Segment) {
3275 assert(Root && P && "Unknown root/parent nodes");
3276 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3277 !IsProfitableToFold(N, P, Root) ||
3278 !IsLegalToFold(N, P, Root, OptLevel))
3279 return false;
3280
3281 return selectAddr(N.getNode(),
3282 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3283}
3284
3285bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3286 SDValue &Base, SDValue &Scale,
3287 SDValue &Index, SDValue &Disp,
3288 SDValue &Segment) {
3289 assert(Root && P && "Unknown root/parent nodes");
3290 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3291 !IsProfitableToFold(N, P, Root) ||
3292 !IsLegalToFold(N, P, Root, OptLevel))
3293 return false;
3294
3295 return selectAddr(N.getNode(),
3296 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3297}
3298
3299/// Return an SDNode that returns the value of the global base register.
3300/// Output instructions required to initialize the global base register,
3301/// if necessary.
3302SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3303 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3304 auto &DL = MF->getDataLayout();
3305 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3306}
3307
3308bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3309 if (N->getOpcode() == ISD::TRUNCATE)
3310 N = N->getOperand(0).getNode();
3311 if (N->getOpcode() != X86ISD::Wrapper)
3312 return false;
3313
3314 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3315 if (!GA)
3316 return false;
3317
3318 auto *GV = GA->getGlobal();
3319 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3320 if (CR)
3321 return CR->getSignedMin().sge(-1ull << Width) &&
3322 CR->getSignedMax().slt(1ull << Width);
3323 // In the kernel code model, globals are in the negative 2GB of the address
3324 // space, so globals can be a sign extended 32-bit immediate.
3325 // In other code models, small globals are in the low 2GB of the address
3326 // space, so sign extending them is equivalent to zero extending them.
3327 return Width == 32 && !TM.isLargeGlobalValue(GV);
3328}
3329
3330X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3331 assert(N->isMachineOpcode() && "Unexpected node");
3332 unsigned Opc = N->getMachineOpcode();
3333 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3334 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3335 if (CondNo < 0)
3336 return X86::COND_INVALID;
3337
3338 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3339}
3340
3341/// Test whether the given X86ISD::CMP node has any users that use a flag
3342/// other than ZF.
3343bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3344 // Examine each user of the node.
3345 for (SDUse &Use : Flags->uses()) {
3346 // Only check things that use the flags.
3347 if (Use.getResNo() != Flags.getResNo())
3348 continue;
3349 SDNode *User = Use.getUser();
3350 // Only examine CopyToReg uses that copy to EFLAGS.
3351 if (User->getOpcode() != ISD::CopyToReg ||
3352 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3353 return false;
3354 // Examine each user of the CopyToReg use.
3355 for (SDUse &FlagUse : User->uses()) {
3356 // Only examine the Flag result.
3357 if (FlagUse.getResNo() != 1)
3358 continue;
3359 // Anything unusual: assume conservatively.
3360 if (!FlagUse.getUser()->isMachineOpcode())
3361 return false;
3362 // Examine the condition code of the user.
3363 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3364
3365 switch (CC) {
3366 // Comparisons which only use the zero flag.
3367 case X86::COND_E: case X86::COND_NE:
3368 continue;
3369 // Anything else: assume conservatively.
3370 default:
3371 return false;
3372 }
3373 }
3374 }
3375 return true;
3376}
3377
3378/// Test whether the given X86ISD::CMP node has any uses which require the SF
3379/// flag to be accurate.
3380bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3381 // Examine each user of the node.
3382 for (SDUse &Use : Flags->uses()) {
3383 // Only check things that use the flags.
3384 if (Use.getResNo() != Flags.getResNo())
3385 continue;
3386 SDNode *User = Use.getUser();
3387 // Only examine CopyToReg uses that copy to EFLAGS.
3388 if (User->getOpcode() != ISD::CopyToReg ||
3389 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3390 return false;
3391 // Examine each user of the CopyToReg use.
3392 for (SDUse &FlagUse : User->uses()) {
3393 // Only examine the Flag result.
3394 if (FlagUse.getResNo() != 1)
3395 continue;
3396 // Anything unusual: assume conservatively.
3397 if (!FlagUse.getUser()->isMachineOpcode())
3398 return false;
3399 // Examine the condition code of the user.
3400 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3401
3402 switch (CC) {
3403 // Comparisons which don't examine the SF flag.
3404 case X86::COND_A: case X86::COND_AE:
3405 case X86::COND_B: case X86::COND_BE:
3406 case X86::COND_E: case X86::COND_NE:
3407 case X86::COND_O: case X86::COND_NO:
3408 case X86::COND_P: case X86::COND_NP:
3409 continue;
3410 // Anything else: assume conservatively.
3411 default:
3412 return false;
3413 }
3414 }
3415 }
3416 return true;
3417}
3418
3419 static bool mayUseCarryFlag(X86::CondCode CC) {
3420 switch (CC) {
3421 // Comparisons which don't examine the CF flag.
3422 case X86::COND_O: case X86::COND_NO:
3423 case X86::COND_E: case X86::COND_NE:
3424 case X86::COND_S: case X86::COND_NS:
3425 case X86::COND_P: case X86::COND_NP:
3426 case X86::COND_L: case X86::COND_GE:
3427 case X86::COND_G: case X86::COND_LE:
3428 return false;
3429 // Anything else: assume conservatively.
3430 default:
3431 return true;
3432 }
3433}
3434
3435/// Test whether the given node which sets flags has any uses which require the
3436/// CF flag to be accurate.
3437 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3438 // Examine each user of the node.
3439 for (SDUse &Use : Flags->uses()) {
3440 // Only check things that use the flags.
3441 if (Use.getResNo() != Flags.getResNo())
3442 continue;
3443
3444 SDNode *User = Use.getUser();
3445 unsigned UserOpc = User->getOpcode();
3446
3447 if (UserOpc == ISD::CopyToReg) {
3448 // Only examine CopyToReg uses that copy to EFLAGS.
3449 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3450 return false;
3451 // Examine each user of the CopyToReg use.
3452 for (SDUse &FlagUse : User->uses()) {
3453 // Only examine the Flag result.
3454 if (FlagUse.getResNo() != 1)
3455 continue;
3456 // Anything unusual: assume conservatively.
3457 if (!FlagUse.getUser()->isMachineOpcode())
3458 return false;
3459 // Examine the condition code of the user.
3460 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3461
3462 if (mayUseCarryFlag(CC))
3463 return false;
3464 }
3465
3466 // This CopyToReg is ok. Move on to the next user.
3467 continue;
3468 }
3469
3470 // This might be an unselected node. So look for the pre-isel opcodes that
3471 // use flags.
3472 unsigned CCOpNo;
3473 switch (UserOpc) {
3474 default:
3475 // Something unusual. Be conservative.
3476 return false;
3477 case X86ISD::SETCC: CCOpNo = 0; break;
3478 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3479 case X86ISD::CMOV: CCOpNo = 2; break;
3480 case X86ISD::BRCOND: CCOpNo = 2; break;
3481 }
3482
3483 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3484 if (mayUseCarryFlag(CC))
3485 return false;
3486 }
3487 return true;
3488}
3489
3490/// Check whether or not the chain ending in StoreNode is suitable for doing
3491/// the {load; op; store} to modify transformation.
3492 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3493 SDValue StoredVal, SelectionDAG *CurDAG,
3494 unsigned LoadOpNo,
3495 LoadSDNode *&LoadNode,
3496 SDValue &InputChain) {
3497 // Is the stored value result 0 of the operation?
3498 if (StoredVal.getResNo() != 0) return false;
3499
3500 // Are there other uses of the operation other than the store?
3501 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3502
3503 // Is the store non-extending and non-indexed?
3504 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3505 return false;
3506
3507 SDValue Load = StoredVal->getOperand(LoadOpNo);
3508 // Is the stored value a non-extending and non-indexed load?
3509 if (!ISD::isNormalLoad(Load.getNode())) return false;
3510
3511 // Return LoadNode by reference.
3512 LoadNode = cast<LoadSDNode>(Load);
3513
3514 // Is store the only read of the loaded value?
3515 if (!Load.hasOneUse())
3516 return false;
3517
3518 // Is the address of the store the same as the load?
3519 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3520 LoadNode->getOffset() != StoreNode->getOffset())
3521 return false;
3522
3523 bool FoundLoad = false;
3524 SmallVector<SDValue, 4> ChainOps;
3525 SmallVector<const SDNode *, 4> LoopWorklist;
3526 SmallPtrSet<const SDNode *, 16> Visited;
3527 const unsigned int Max = 1024;
3528
3529 // Visualization of Load-Op-Store fusion:
3530 // -------------------------
3531 // Legend:
3532 // *-lines = Chain operand dependencies.
3533 // |-lines = Normal operand dependencies.
3534 // Dependencies flow down and right. n-suffix references multiple nodes.
3535 //
3536 // C Xn C
3537 // * * *
3538 // * * *
3539 // Xn A-LD Yn TF Yn
3540 // * * \ | * |
3541 // * * \ | * |
3542 // * * \ | => A--LD_OP_ST
3543 // * * \| \
3544 // TF OP \
3545 // * | \ Zn
3546 // * | \
3547 // A-ST Zn
3548 //
3549
3550 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3551 // #2: Yn -> LD
3552 // #3: ST -> Zn
3553
3554 // Ensure the transform is safe by checking for the dual
3555 // dependencies to make sure we do not induce a loop.
3556
3557 // As LD is a predecessor to both OP and ST we can do this by checking:
3558 // a). if LD is a predecessor to a member of Xn or Yn.
3559 // b). if a Zn is a predecessor to ST.
3560
3561 // However, (b) can only occur through being a chain predecessor to
3562 // ST, which is the same as Zn being a member or predecessor of Xn,
3563 // which is a subset of LD being a predecessor of Xn. So it's
3564 // subsumed by check (a).
3565
3566 SDValue Chain = StoreNode->getChain();
3567
3568 // Gather X elements in ChainOps.
3569 if (Chain == Load.getValue(1)) {
3570 FoundLoad = true;
3571 ChainOps.push_back(Load.getOperand(0));
3572 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3573 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3574 SDValue Op = Chain.getOperand(i);
3575 if (Op == Load.getValue(1)) {
3576 FoundLoad = true;
3577 // Drop Load, but keep its chain. No cycle check necessary.
3578 ChainOps.push_back(Load.getOperand(0));
3579 continue;
3580 }
3581 LoopWorklist.push_back(Op.getNode());
3582 ChainOps.push_back(Op);
3583 }
3584 }
3585
3586 if (!FoundLoad)
3587 return false;
3588
3589 // Worklist is currently Xn. Add Yn to worklist.
3590 for (SDValue Op : StoredVal->ops())
3591 if (Op.getNode() != LoadNode)
3592 LoopWorklist.push_back(Op.getNode());
3593
3594 // Check (a) if Load is a predecessor to Xn + Yn
3595 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3596 true))
3597 return false;
3598
3599 InputChain =
3600 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3601 return true;
3602}
3603
3604// Change a chain of {load; op; store} of the same value into a simple op
3605// through memory of that value, if the uses of the modified value and its
3606// address are suitable.
3607//
3608 // The tablegen memory operand pattern is currently not able to match
3609 // the case where the EFLAGS of the original operation are used.
3610//
3611// To move this to tablegen, we'll need to improve tablegen to allow flags to
3612// be transferred from a node in the pattern to the result node, probably with
3613// a new keyword. For example, we have this
3614// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3615// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3616// but maybe need something like this
3617// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3618// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3619// (transferrable EFLAGS)]>;
3620//
3621// Until then, we manually fold these and instruction select the operation
3622// here.
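// As an illustrative example of the fold (assembly operands are arbitrary):
//   movl (%rdi), %eax
//   addl $5, %eax          ==>   addl $5, (%rdi)
//   movl %eax, (%rdi)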
3623bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3624 auto *StoreNode = cast<StoreSDNode>(Node);
3625 SDValue StoredVal = StoreNode->getOperand(1);
3626 unsigned Opc = StoredVal->getOpcode();
3627
3628 // Before we try to select anything, make sure this is memory operand size
3629 // and opcode we can handle. Note that this must match the code below that
3630 // actually lowers the opcodes.
3631 EVT MemVT = StoreNode->getMemoryVT();
3632 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3633 MemVT != MVT::i8)
3634 return false;
3635
3636 bool IsCommutable = false;
3637 bool IsNegate = false;
3638 switch (Opc) {
3639 default:
3640 return false;
3641 case X86ISD::SUB:
3642 IsNegate = isNullConstant(StoredVal.getOperand(0));
3643 break;
3644 case X86ISD::SBB:
3645 break;
3646 case X86ISD::ADD:
3647 case X86ISD::ADC:
3648 case X86ISD::AND:
3649 case X86ISD::OR:
3650 case X86ISD::XOR:
3651 IsCommutable = true;
3652 break;
3653 }
3654
3655 unsigned LoadOpNo = IsNegate ? 1 : 0;
3656 LoadSDNode *LoadNode = nullptr;
3657 SDValue InputChain;
3658 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3659 LoadNode, InputChain)) {
3660 if (!IsCommutable)
3661 return false;
3662
3663 // This operation is commutable, try the other operand.
3664 LoadOpNo = 1;
3665 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3666 LoadNode, InputChain))
3667 return false;
3668 }
3669
3670 SDValue Base, Scale, Index, Disp, Segment;
3671 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3672 Segment))
3673 return false;
3674
3675 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3676 unsigned Opc8) {
3677 switch (MemVT.getSimpleVT().SimpleTy) {
3678 case MVT::i64:
3679 return Opc64;
3680 case MVT::i32:
3681 return Opc32;
3682 case MVT::i16:
3683 return Opc16;
3684 case MVT::i8:
3685 return Opc8;
3686 default:
3687 llvm_unreachable("Invalid size!");
3688 }
3689 };
3690
3691 MachineSDNode *Result;
3692 switch (Opc) {
3693 case X86ISD::SUB:
3694 // Handle negate.
3695 if (IsNegate) {
3696 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3697 X86::NEG8m);
3698 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3699 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3700 MVT::Other, Ops);
3701 break;
3702 }
3703 [[fallthrough]];
3704 case X86ISD::ADD:
3705 // Try to match inc/dec.
3706 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3707 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3708 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3709 // An ADD/SUB by 1/-1 whose carry flag result is unused can use inc/dec.
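// For example, `addl $1, (%rdi)` may become `incl (%rdi)`; this is only legal
// when no user reads CF, because INC/DEC leave CF unmodified (operands here
// are purely illustrative).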
3710 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3711 unsigned NewOpc =
3712 ((Opc == X86ISD::ADD) == IsOne)
3713 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3714 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3715 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3716 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3717 MVT::Other, Ops);
3718 break;
3719 }
3720 }
3721 [[fallthrough]];
3722 case X86ISD::ADC:
3723 case X86ISD::SBB:
3724 case X86ISD::AND:
3725 case X86ISD::OR:
3726 case X86ISD::XOR: {
3727 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3728 switch (Opc) {
3729 case X86ISD::ADD:
3730 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3731 X86::ADD8mr);
3732 case X86ISD::ADC:
3733 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3734 X86::ADC8mr);
3735 case X86ISD::SUB:
3736 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3737 X86::SUB8mr);
3738 case X86ISD::SBB:
3739 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3740 X86::SBB8mr);
3741 case X86ISD::AND:
3742 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3743 X86::AND8mr);
3744 case X86ISD::OR:
3745 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3746 case X86ISD::XOR:
3747 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3748 X86::XOR8mr);
3749 default:
3750 llvm_unreachable("Invalid opcode!");
3751 }
3752 };
3753 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3754 switch (Opc) {
3755 case X86ISD::ADD:
3756 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3757 X86::ADD8mi);
3758 case X86ISD::ADC:
3759 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3760 X86::ADC8mi);
3761 case X86ISD::SUB:
3762 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3763 X86::SUB8mi);
3764 case X86ISD::SBB:
3765 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3766 X86::SBB8mi);
3767 case X86ISD::AND:
3768 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3769 X86::AND8mi);
3770 case X86ISD::OR:
3771 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3772 X86::OR8mi);
3773 case X86ISD::XOR:
3774 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3775 X86::XOR8mi);
3776 default:
3777 llvm_unreachable("Invalid opcode!");
3778 }
3779 };
3780
3781 unsigned NewOpc = SelectRegOpcode(Opc);
3782 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3783
3784 // See if the operand is a constant that we can fold into an immediate
3785 // operand.
3786 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3787 int64_t OperandV = OperandC->getSExtValue();
3788
3789 // Check if we can shrink the operand enough to fit in an immediate (or
3790 // fit into a smaller immediate) by negating it and switching the
3791 // operation.
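// For example (illustrative constant): `add $128` needs a 16/32-bit
// immediate, but the equivalent `sub $-128` fits in a sign-extended imm8.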
3792 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3793 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3794 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3795 isInt<32>(-OperandV))) &&
3796 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3797 OperandV = -OperandV;
3798 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3799 }
3800
3801 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3802 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3803 NewOpc = SelectImmOpcode(Opc);
3804 }
3805 }
3806
3807 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3808 SDValue CopyTo =
3809 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3810 StoredVal.getOperand(2), SDValue());
3811
3812 const SDValue Ops[] = {Base, Scale, Index, Disp,
3813 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3814 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3815 Ops);
3816 } else {
3817 const SDValue Ops[] = {Base, Scale, Index, Disp,
3818 Segment, Operand, InputChain};
3819 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3820 Ops);
3821 }
3822 break;
3823 }
3824 default:
3825 llvm_unreachable("Invalid opcode!");
3826 }
3827
3828 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3829 LoadNode->getMemOperand()};
3830 CurDAG->setNodeMemRefs(Result, MemOps);
3831
3832 // Update Load Chain uses as well.
3833 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3834 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3835 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3836 CurDAG->RemoveDeadNode(Node);
3837 return true;
3838}
3839
3840// See if this is an X & Mask that we can match to BEXTR/BZHI.
3841// Where Mask is one of the following patterns:
3842// a) x & (1 << nbits) - 1
3843// b) x & ~(-1 << nbits)
3844// c) x & (-1 >> (32 - y))
3845// d) x << (32 - y) >> (32 - y)
3846// e) (1 << nbits) - 1
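// A hedged sketch of the end result: with BMI2, pattern a),
//   x & ((1 << nbits) - 1),
// becomes a single BZHI, which clears all bits of x at position nbits and
// above; with only BMI1, a BEXTR with a zero start position is used instead.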
3847bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3848 assert(
3849 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3850 Node->getOpcode() == ISD::SRL) &&
3851 "Should be either an and-mask, or right-shift after clearing high bits.");
3852
3853 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3854 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3855 return false;
3856
3857 MVT NVT = Node->getSimpleValueType(0);
3858
3859 // Only supported for 32 and 64 bits.
3860 if (NVT != MVT::i32 && NVT != MVT::i64)
3861 return false;
3862
3863 SDValue NBits;
3864 bool NegateNBits;
3865
3866 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3867 // Else, if we only have BMI1's BEXTR, we require one-use.
3868 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3869 auto checkUses = [AllowExtraUsesByDefault](
3870 SDValue Op, unsigned NUses,
3871 std::optional<bool> AllowExtraUses) {
3872 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3873 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3874 };
3875 auto checkOneUse = [checkUses](SDValue Op,
3876 std::optional<bool> AllowExtraUses =
3877 std::nullopt) {
3878 return checkUses(Op, 1, AllowExtraUses);
3879 };
3880 auto checkTwoUse = [checkUses](SDValue Op,
3881 std::optional<bool> AllowExtraUses =
3882 std::nullopt) {
3883 return checkUses(Op, 2, AllowExtraUses);
3884 };
3885
3886 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3887 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3888 assert(V.getSimpleValueType() == MVT::i32 &&
3889 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3890 "Expected i64 -> i32 truncation");
3891 V = V.getOperand(0);
3892 }
3893 return V;
3894 };
3895
3896 // a) x & ((1 << nbits) + (-1))
3897 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3898 &NegateNBits](SDValue Mask) -> bool {
3899 // Match `add`. Must only have one use!
3900 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3901 return false;
3902 // We should be adding all-ones constant (i.e. subtracting one.)
3903 if (!isAllOnesConstant(Mask->getOperand(1)))
3904 return false;
3905 // Match `1 << nbits`. Might be truncated. Must only have one use!
3906 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3907 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3908 return false;
3909 if (!isOneConstant(M0->getOperand(0)))
3910 return false;
3911 NBits = M0->getOperand(1);
3912 NegateNBits = false;
3913 return true;
3914 };
3915
3916 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3917 V = peekThroughOneUseTruncation(V);
3918 return CurDAG->MaskedValueIsAllOnes(
3919 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3920 NVT.getSizeInBits()));
3921 };
3922
3923 // b) x & ~(-1 << nbits)
3924 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3925 &NBits, &NegateNBits](SDValue Mask) -> bool {
3926 // Match `~()`. Must only have one use!
3927 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3928 return false;
3929 // The -1 only has to be all-ones for the final Node's NVT.
3930 if (!isAllOnes(Mask->getOperand(1)))
3931 return false;
3932 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3933 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3934 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3935 return false;
3936 // The -1 only has to be all-ones for the final Node's NVT.
3937 if (!isAllOnes(M0->getOperand(0)))
3938 return false;
3939 NBits = M0->getOperand(1);
3940 NegateNBits = false;
3941 return true;
3942 };
3943
3944 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3945 // or leave the shift amount as-is, but then we'll have to negate it.
3946 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3947 unsigned Bitwidth) {
3948 NBits = ShiftAmt;
3949 NegateNBits = true;
3950 // Skip over a truncate of the shift amount, if any.
3951 if (NBits.getOpcode() == ISD::TRUNCATE)
3952 NBits = NBits.getOperand(0);
3953 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3954 // If it doesn't match, that's fine, we'll just negate it ourselves.
3955 if (NBits.getOpcode() != ISD::SUB)
3956 return;
3957 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3958 if (!V0 || V0->getZExtValue() != Bitwidth)
3959 return;
3960 NBits = NBits.getOperand(1);
3961 NegateNBits = false;
3962 };
3963
3964 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3965 // or
3966 // c) x & (-1 >> (32 - y))
3967 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3968 canonicalizeShiftAmt](SDValue Mask) -> bool {
3969 // The mask itself may be truncated.
3970 Mask = peekThroughOneUseTruncation(Mask);
3971 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3972 // Match `l>>`. Must only have one use!
3973 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3974 return false;
3975 // We should be shifting truly all-ones constant.
3976 if (!isAllOnesConstant(Mask.getOperand(0)))
3977 return false;
3978 SDValue M1 = Mask.getOperand(1);
3979 // The shift amount should not be used externally.
3980 if (!checkOneUse(M1))
3981 return false;
3982 canonicalizeShiftAmt(M1, Bitwidth);
3983 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3984 // is no extra use of the mask. Clearly, there was one since we are here.
3985 // But at the same time, if we need to negate the shift amount,
3986 // then we don't want the mask to stick around, else it's unprofitable.
3987 return !NegateNBits;
3988 };
3989
3990 SDValue X;
3991
3992 // d) x << z >> z but then we'll have to subtract z from bitwidth
3993 // or
3994 // d) x << (32 - y) >> (32 - y)
3995 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3996 AllowExtraUsesByDefault, &NegateNBits,
3997 &X](SDNode *Node) -> bool {
3998 if (Node->getOpcode() != ISD::SRL)
3999 return false;
4000 SDValue N0 = Node->getOperand(0);
4001 if (N0->getOpcode() != ISD::SHL)
4002 return false;
4003 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
4004 SDValue N1 = Node->getOperand(1);
4005 SDValue N01 = N0->getOperand(1);
4006 // Both of the shifts must be by the exact same value.
4007 if (N1 != N01)
4008 return false;
4009 canonicalizeShiftAmt(N1, Bitwidth);
4010 // There should not be any external uses of the inner shift / shift amount.
4011 // Note that while we are generally okay with external uses given BMI2,
4012 // iff we need to negate the shift amount, we are not okay with extra uses.
4013 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4014 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4015 return false;
4016 X = N0->getOperand(0);
4017 return true;
4018 };
4019
4020 auto matchLowBitMask = [matchPatternA, matchPatternB,
4021 matchPatternC](SDValue Mask) -> bool {
4022 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4023 };
4024
4025 if (Node->getOpcode() == ISD::AND) {
4026 X = Node->getOperand(0);
4027 SDValue Mask = Node->getOperand(1);
4028
4029 if (matchLowBitMask(Mask)) {
4030 // Great.
4031 } else {
4032 std::swap(X, Mask);
4033 if (!matchLowBitMask(Mask))
4034 return false;
4035 }
4036 } else if (matchLowBitMask(SDValue(Node, 0))) {
4037 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4038 } else if (!matchPatternD(Node))
4039 return false;
4040
4041 // If we need to negate the shift amount, require BMI2 BZHI support.
4042 // It's just too unprofitable for BMI1 BEXTR.
4043 if (NegateNBits && !Subtarget->hasBMI2())
4044 return false;
4045
4046 SDLoc DL(Node);
4047
4048 // Truncate the shift amount.
4049 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4050 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4051
4052 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4053 // All the other bits are undefined, we do not care about them.
4054 SDValue ImplDef = SDValue(
4055 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4056 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4057
4058 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4059 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4060 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4061 MVT::i32, ImplDef, NBits, SRIdxVal),
4062 0);
4063 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4064
4065 // We might have matched the amount of high bits to be cleared,
4066 // but we want the amount of low bits to be kept, so negate it then.
4067 if (NegateNBits) {
4068 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4069 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4070
4071 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4072 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4073 }
4074
4075 if (Subtarget->hasBMI2()) {
4076 // Great, just emit the BZHI.
4077 if (NVT != MVT::i32) {
4078 // But have to place the bit count into the wide-enough register first.
4079 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4080 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4081 }
4082
4083 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4084 ReplaceNode(Node, Extract.getNode());
4085 SelectCode(Extract.getNode());
4086 return true;
4087 }
4088
4089 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
4090 // *logically* shifted (potentially with a one-use trunc in between),
4091 // and the truncation was the only use of the shift,
4092 // and if so look past one-use truncation.
4093 {
4094 SDValue RealX = peekThroughOneUseTruncation(X);
4095 // FIXME: only if the shift is one-use?
4096 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4097 X = RealX;
4098 }
4099
4100 MVT XVT = X.getSimpleValueType();
4101
4102 // Else, emitting BEXTR requires one more step.
4103 // The 'control' of BEXTR has the pattern of:
4104 // [15...8 bit][ 7...0 bit] location
4105 // [ bit count][ shift] name
4106 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4107
4108 // Shift NBits left by 8 bits, thus producing 'control'.
4109 // This makes the low 8 bits to be zero.
4110 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4111 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4112 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4113 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4114
4115 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4116 // FIXME: only if the shift is one-use?
4117 if (X.getOpcode() == ISD::SRL) {
4118 SDValue ShiftAmt = X.getOperand(1);
4119 X = X.getOperand(0);
4120
4121 assert(ShiftAmt.getValueType() == MVT::i8 &&
4122 "Expected shift amount to be i8");
4123
4124 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4125 // We could zext to i16 in some form, but we intentionally don't do that.
4126 SDValue OrigShiftAmt = ShiftAmt;
4127 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4128 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4129
4130 // And now 'or' these low 8 bits of shift amount into the 'control'.
4131 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4132 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4133 }
4134
4135 // But have to place the 'control' into the wide-enough register first.
4136 if (XVT != MVT::i32) {
4137 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4138 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4139 }
4140
4141 // And finally, form the BEXTR itself.
4142 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4143
4144 // The 'X' was originally truncated. Do that now.
4145 if (XVT != NVT) {
4146 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4147 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4148 }
4149
4150 ReplaceNode(Node, Extract.getNode());
4151 SelectCode(Extract.getNode());
4152
4153 return true;
4154}
4155
4156// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
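// For example (illustrative constants): (x >> 4) & 0xff has a shift of 4 and
// an 8-bit mask, giving a BEXTR control value of 4 | (8 << 8) = 0x0804.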
4157MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4158 MVT NVT = Node->getSimpleValueType(0);
4159 SDLoc dl(Node);
4160
4161 SDValue N0 = Node->getOperand(0);
4162 SDValue N1 = Node->getOperand(1);
4163
4164 // If we have TBM we can use an immediate for the control. If we have BMI
4165 // we should only do this if the BEXTR instruction is implemented well.
4166 // Otherwise moving the control into a register makes this more costly.
4167 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4168 // hoisting the move immediate would make it worthwhile with a less optimal
4169 // BEXTR?
4170 bool PreferBEXTR =
4171 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4172 if (!PreferBEXTR && !Subtarget->hasBMI2())
4173 return nullptr;
4174
4175 // Must have a shift right.
4176 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4177 return nullptr;
4178
4179 // Shift can't have additional users.
4180 if (!N0->hasOneUse())
4181 return nullptr;
4182
4183 // Only supported for 32 and 64 bits.
4184 if (NVT != MVT::i32 && NVT != MVT::i64)
4185 return nullptr;
4186
4187 // Shift amount and RHS of and must be constant.
4188 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4189 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4190 if (!MaskCst || !ShiftCst)
4191 return nullptr;
4192
4193 // And RHS must be a mask.
4194 uint64_t Mask = MaskCst->getZExtValue();
4195 if (!isMask_64(Mask))
4196 return nullptr;
4197
4198 uint64_t Shift = ShiftCst->getZExtValue();
4199 uint64_t MaskSize = llvm::popcount(Mask);
4200
4201 // Don't interfere with something that can be handled by extracting AH.
4202 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4203 if (Shift == 8 && MaskSize == 8)
4204 return nullptr;
4205
4206 // Make sure we are only using bits that were in the original value, not
4207 // shifted in.
4208 if (Shift + MaskSize > NVT.getSizeInBits())
4209 return nullptr;
4210
4211 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4212 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4213 // does not fit into 32 bits. Load folding is not a sufficient reason.
4214 if (!PreferBEXTR && MaskSize <= 32)
4215 return nullptr;
4216
4217 SDValue Control;
4218 unsigned ROpc, MOpc;
4219
4220#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4221 if (!PreferBEXTR) {
4222 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4223 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4224 // Let's perform the mask first, and apply shift later. Note that we need to
4225 // widen the mask to account for the fact that we'll apply shift afterwards!
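// For example (illustrative constants), (x >> 4) & 0xff becomes a BZHI with
// index 4 + 8 = 12 (keep the low 12 bits) followed by a SHR by 4.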
4226 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4227 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4228 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4229 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4230 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4231 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4232 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4233 } else {
4234 // The 'control' of BEXTR has the pattern of:
4235 // [15...8 bit][ 7...0 bit] location
4236 // [ bit count][ shift] name
4237 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4238 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4239 if (Subtarget->hasTBM()) {
4240 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4241 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4242 } else {
4243 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4244 // BMI requires the immediate to be placed in a register.
4245 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4246 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4247 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4248 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4249 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4250 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4251 }
4252 }
4253
4254 MachineSDNode *NewNode;
4255 SDValue Input = N0->getOperand(0);
4256 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4257 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4258 SDValue Ops[] = {
4259 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4260 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4261 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4262 // Update the chain.
4263 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4264 // Record the mem-refs
4265 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4266 } else {
4267 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4268 }
4269
4270 if (!PreferBEXTR) {
4271 // We still need to apply the shift.
4272 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4273 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4274 : GET_ND_IF_ENABLED(X86::SHR32ri);
4275 NewNode =
4276 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4277 }
4278
4279 return NewNode;
4280}
4281
4282// Emit a PCMISTR(I/M) instruction.
4283MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4284 bool MayFoldLoad, const SDLoc &dl,
4285 MVT VT, SDNode *Node) {
4286 SDValue N0 = Node->getOperand(0);
4287 SDValue N1 = Node->getOperand(1);
4288 SDValue Imm = Node->getOperand(2);
4289 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4290 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4291
4292 // Try to fold a load. No need to check alignment.
4293 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4294 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4295 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4296 N1.getOperand(0) };
4297 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4298 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4299 // Update the chain.
4300 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4301 // Record the mem-refs
4302 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4303 return CNode;
4304 }
4305
4306 SDValue Ops[] = { N0, N1, Imm };
4307 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4308 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4309 return CNode;
4310}
4311
4312// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
4313// to emit a second instruction after this one. This is needed since we have two
4314// copyToReg nodes glued before this and we need to continue that glue through.
4315MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4316 bool MayFoldLoad, const SDLoc &dl,
4317 MVT VT, SDNode *Node,
4318 SDValue &InGlue) {
4319 SDValue N0 = Node->getOperand(0);
4320 SDValue N2 = Node->getOperand(2);
4321 SDValue Imm = Node->getOperand(4);
4322 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4323 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4324
4325 // Try to fold a load. No need to check alignment.
4326 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4327 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4328 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4329 N2.getOperand(0), InGlue };
4330 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4331 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4332 InGlue = SDValue(CNode, 3);
4333 // Update the chain.
4334 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4335 // Record the mem-refs
4336 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4337 return CNode;
4338 }
4339
4340 SDValue Ops[] = { N0, N2, Imm, InGlue };
4341 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4342 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4343 InGlue = SDValue(CNode, 2);
4344 return CNode;
4345}
4346
4347bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4348 EVT VT = N->getValueType(0);
4349
4350 // Only handle scalar shifts.
4351 if (VT.isVector())
4352 return false;
4353
4354 // Narrower shifts only mask to 5 bits in hardware.
4355 unsigned Size = VT == MVT::i64 ? 64 : 32;
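// For example, a 32-bit `shll %cl, %eax` only reads CL & 31, so a shift
// amount of (x + 32) behaves exactly like x and the add can be dropped
// (register names are illustrative).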
4356
4357 SDValue OrigShiftAmt = N->getOperand(1);
4358 SDValue ShiftAmt = OrigShiftAmt;
4359 SDLoc DL(N);
4360
4361 // Skip over a truncate of the shift amount.
4362 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4363 ShiftAmt = ShiftAmt->getOperand(0);
4364
4365 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4366 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4367
4368 SDValue NewShiftAmt;
4369 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4370 ShiftAmt->getOpcode() == ISD::XOR) {
4371 SDValue Add0 = ShiftAmt->getOperand(0);
4372 SDValue Add1 = ShiftAmt->getOperand(1);
4373 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4374 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4375 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4376 // to avoid the ADD/SUB/XOR.
4377 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4378 NewShiftAmt = Add0;
4379
4380 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4381 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4382 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4383 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4384 // we can replace it with a NOT. In the XOR case it may save some code
4385 // size, in the SUB case it also may save a move.
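// For a 32-bit shift, (31 - x) and (31 ^ x) agree with ~x in the low 5 bits
// that the hardware reads, so a single NOT of x is sufficient (a sketch of
// the reasoning, not tied to particular registers).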
4386 assert(Add0C == nullptr || Add1C == nullptr);
4387
4388 // We can only do N-X, not X-N
4389 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4390 return false;
4391
4392 EVT OpVT = ShiftAmt.getValueType();
4393
4394 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4395 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4396 Add0C == nullptr ? Add0 : Add1, AllOnes);
4397 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4398 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4399 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4400 // -X to generate a NEG instead of a SUB of a constant.
4401 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4402 Add0C->getZExtValue() != 0) {
4403 EVT SubVT = ShiftAmt.getValueType();
4404 SDValue X;
4405 if (Add0C->getZExtValue() % Size == 0)
4406 X = Add1;
4407 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4408 Add0C->getZExtValue() % 32 == 0) {
4409 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4410 // This is mainly beneficial if we already compute (x+n*32).
4411 if (Add1.getOpcode() == ISD::TRUNCATE) {
4412 Add1 = Add1.getOperand(0);
4413 SubVT = Add1.getValueType();
4414 }
4415 if (Add0.getValueType() != SubVT) {
4416 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4417 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4418 }
4419
4420 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4421 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4422 } else
4423 return false;
4424 // Insert a negate op.
4425 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4426 // that uses it that's not a shift.
4427 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4428 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4429 NewShiftAmt = Neg;
4430
4431 // Insert these operands into a valid topological order so they can
4432 // get selected independently.
4433 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4434 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4435 } else
4436 return false;
4437 } else
4438 return false;
4439
4440 if (NewShiftAmt.getValueType() != MVT::i8) {
4441 // Need to truncate the shift amount.
4442 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4443 // Add to a correct topological ordering.
4444 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4445 }
4446
4447 // Insert a new mask to keep the shift amount legal. This should be removed
4448 // by isel patterns.
4449 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4450 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4451 // Place in a correct topological ordering.
4452 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4453
4454 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4455 NewShiftAmt);
4456 if (UpdatedNode != N) {
4457 // If we found an existing node, we should replace ourselves with that node
4458 // and wait for it to be selected after its other users.
4459 ReplaceNode(N, UpdatedNode);
4460 return true;
4461 }
4462
4463 // If the original shift amount is now dead, delete it so that we don't run
4464 // it through isel.
4465 if (OrigShiftAmt.getNode()->use_empty())
4466 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4467
4468 // Now that we've optimized the shift amount, defer to normal isel to get
4469 // load folding and legacy vs BMI2 selection without repeating it here.
4470 SelectCode(N);
4471 return true;
4472}
4473
4474bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4475 MVT NVT = N->getSimpleValueType(0);
4476 unsigned Opcode = N->getOpcode();
4477 SDLoc dl(N);
4478
4479 // For operations of the form (x << C1) op C2, check if we can use a smaller
4480 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
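// For example (illustrative constants): (x << 8) | 0x4400 can be rewritten as
// (x | 0x44) << 8, shrinking the OR immediate from imm32/imm16 to imm8.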
4481 SDValue Shift = N->getOperand(0);
4482 SDValue N1 = N->getOperand(1);
4483
4484 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4485 if (!Cst)
4486 return false;
4487
4488 int64_t Val = Cst->getSExtValue();
4489
4490 // If we have an any_extend feeding the AND, look through it to see if there
4491 // is a shift behind it. But only if the AND doesn't use the extended bits.
4492 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4493 bool FoundAnyExtend = false;
4494 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4495 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4496 isUInt<32>(Val)) {
4497 FoundAnyExtend = true;
4498 Shift = Shift.getOperand(0);
4499 }
4500
4501 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4502 return false;
4503
4504 // i8 is unshrinkable, i16 should be promoted to i32.
4505 if (NVT != MVT::i32 && NVT != MVT::i64)
4506 return false;
4507
4508 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4509 if (!ShlCst)
4510 return false;
4511
4512 uint64_t ShAmt = ShlCst->getZExtValue();
4513
4514 // Make sure that we don't change the operation by removing bits.
4515 // This only matters for OR and XOR, AND is unaffected.
4516 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4517 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4518 return false;
4519
4520 // Check the minimum bitwidth for the new constant.
4521 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4522 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4523 if (Opcode == ISD::AND) {
4524 // AND32ri is the same as AND64ri32 with zext imm.
4525 // Try this before sign extended immediates below.
4526 ShiftedVal = (uint64_t)Val >> ShAmt;
4527 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4528 return true;
4529 // Also swap order when the AND can become MOVZX.
4530 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4531 return true;
4532 }
4533 ShiftedVal = Val >> ShAmt;
4534 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4535 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4536 return true;
4537 if (Opcode != ISD::AND) {
4538 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4539 ShiftedVal = (uint64_t)Val >> ShAmt;
4540 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4541 return true;
4542 }
4543 return false;
4544 };
4545
4546 int64_t ShiftedVal;
4547 if (!CanShrinkImmediate(ShiftedVal))
4548 return false;
4549
4550 // Ok, we can reorder to get a smaller immediate.
4551
4552 // But, it's possible the original immediate allowed an AND to become MOVZX.
4553 // We do this check late so the MaskedValueIsZero call happens as late as
4554 // possible.
4555 if (Opcode == ISD::AND) {
4556 // Find the smallest zext this could possibly be.
4557 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4558 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4559
4560 // Figure out which bits need to be zero to achieve that mask.
4561 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4562 ZExtWidth);
4563 NeededMask &= ~Cst->getAPIntValue();
4564
4565 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4566 return false;
4567 }
4568
4569 SDValue X = Shift.getOperand(0);
4570 if (FoundAnyExtend) {
4571 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4572 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4573 X = NewX;
4574 }
4575
4576 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4577 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4578 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4579 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4580 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4581 Shift.getOperand(1));
4582 ReplaceNode(N, NewSHL.getNode());
4583 SelectCode(NewSHL.getNode());
4584 return true;
4585}
4586
4587bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4588 SDNode *ParentB, SDNode *ParentC,
4589 SDValue A, SDValue B, SDValue C,
4590 uint8_t Imm) {
4591 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4592 C.isOperandOf(ParentC) && "Incorrect parent node");
4593
4594 auto tryFoldLoadOrBCast =
4595 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4596 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4597 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4598 return true;
4599
4600 // Not a load, check for broadcast which may be behind a bitcast.
4601 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4602 P = L.getNode();
4603 L = L.getOperand(0);
4604 }
4605
4606 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4607 return false;
4608
4609 // Only 32 and 64 bit broadcasts are supported.
4610 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4611 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4612 if (Size != 32 && Size != 64)
4613 return false;
4614
4615 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4616 };
4617
4618 bool FoldedLoad = false;
4619 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4620 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4621 FoldedLoad = true;
4622 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4623 Tmp4)) {
4624 FoldedLoad = true;
4625 std::swap(A, C);
4626 // Swap bits 1/4 and 3/6.
4627 uint8_t OldImm = Imm;
4628 Imm = OldImm & 0xa5;
4629 if (OldImm & 0x02) Imm |= 0x10;
4630 if (OldImm & 0x10) Imm |= 0x02;
4631 if (OldImm & 0x08) Imm |= 0x40;
4632 if (OldImm & 0x40) Imm |= 0x08;
4633 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4634 Tmp4)) {
4635 FoldedLoad = true;
4636 std::swap(B, C);
4637 // Swap bits 1/2 and 5/6.
4638 uint8_t OldImm = Imm;
4639 Imm = OldImm & 0x99;
4640 if (OldImm & 0x02) Imm |= 0x04;
4641 if (OldImm & 0x04) Imm |= 0x02;
4642 if (OldImm & 0x20) Imm |= 0x40;
4643 if (OldImm & 0x40) Imm |= 0x20;
4644 }
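// Why the swaps above are just bit permutations: bit i of the immediate is
// the output for the input triple (a, b, c) taken from the bits of i
// (a = bit 2, b = bit 1, c = bit 0). Swapping A and C exchanges truth-table
// rows 0b001<->0b100 and 0b011<->0b110, i.e. immediate bits 1/4 and 3/6;
// swapping B and C exchanges bits 1/2 and 5/6, matching the code above.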
4645
4646 SDLoc DL(Root);
4647
4648 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4649
4650 MVT NVT = Root->getSimpleValueType(0);
4651
4652 MachineSDNode *MNode;
4653 if (FoldedLoad) {
4654 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4655
4656 unsigned Opc;
4657 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4658 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4659 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4660 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4661
4662 bool UseD = EltSize == 32;
4663 if (NVT.is128BitVector())
4664 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4665 else if (NVT.is256BitVector())
4666 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4667 else if (NVT.is512BitVector())
4668 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4669 else
4670 llvm_unreachable("Unexpected vector size!");
4671 } else {
4672 bool UseD = NVT.getVectorElementType() == MVT::i32;
4673 if (NVT.is128BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4675 else if (NVT.is256BitVector())
4676 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4677 else if (NVT.is512BitVector())
4678 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4679 else
4680 llvm_unreachable("Unexpected vector size!");
4681 }
4682
4683 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4684 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4685
4686 // Update the chain.
4687 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4688 // Record the mem-refs
4689 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4690 } else {
4691 bool UseD = NVT.getVectorElementType() == MVT::i32;
4692 unsigned Opc;
4693 if (NVT.is128BitVector())
4694 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4695 else if (NVT.is256BitVector())
4696 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4697 else if (NVT.is512BitVector())
4698 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4699 else
4700 llvm_unreachable("Unexpected vector size!");
4701
4702 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4703 }
4704
4705 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4706 CurDAG->RemoveDeadNode(Root);
4707 return true;
4708}
4709
4710// Try to match two logic ops to a VPTERNLOG.
4711// FIXME: Handle more complex patterns that use an operand more than once?
4712bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4713 MVT NVT = N->getSimpleValueType(0);
4714
4715 // Make sure we support VPTERNLOG.
4716 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4717 NVT.getVectorElementType() == MVT::i1)
4718 return false;
4719
4720 // We need VLX for 128/256-bit.
4721 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4722 return false;
4723
4724 SDValue N0 = N->getOperand(0);
4725 SDValue N1 = N->getOperand(1);
4726
4727 auto getFoldableLogicOp = [](SDValue Op) {
4728 // Peek through single use bitcast.
4729 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4730 Op = Op.getOperand(0);
4731
4732 if (!Op.hasOneUse())
4733 return SDValue();
4734
4735 unsigned Opc = Op.getOpcode();
4736 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4737 Opc == X86ISD::ANDNP)
4738 return Op;
4739
4740 return SDValue();
4741 };
4742
4743 SDValue A, FoldableOp;
4744 if ((FoldableOp = getFoldableLogicOp(N1))) {
4745 A = N0;
4746 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4747 A = N1;
4748 } else
4749 return false;
4750
4751 SDValue B = FoldableOp.getOperand(0);
4752 SDValue C = FoldableOp.getOperand(1);
4753 SDNode *ParentA = N;
4754 SDNode *ParentB = FoldableOp.getNode();
4755 SDNode *ParentC = FoldableOp.getNode();
4756
4757 // We can build the appropriate control immediate by performing the logic
4758 // operation we're matching using these constants for A, B, and C.
4759 uint8_t TernlogMagicA = 0xf0;
4760 uint8_t TernlogMagicB = 0xcc;
4761 uint8_t TernlogMagicC = 0xaa;
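 // These are the truth tables of the projections A, B and C over the ternlog
 // index (A<<2)|(B<<1)|C. Combining them with the matched logic ops below
 // yields the final immediate; e.g. matching (or A, (and B, C)) gives
 // Imm = (0xcc & 0xaa) | 0xf0 = 0xf8.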
4762
4763 // Some of the inputs may be inverted, peek through them and invert the
4764 // magic values accordingly.
4765 // TODO: There may be a bitcast before the xor that we should peek through.
4766 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4767 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4768 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4769 Magic = ~Magic;
4770 Parent = Op.getNode();
4771 Op = Op.getOperand(0);
4772 }
4773 };
4774
4775 PeekThroughNot(A, ParentA, TernlogMagicA);
4776 PeekThroughNot(B, ParentB, TernlogMagicB);
4777 PeekThroughNot(C, ParentC, TernlogMagicC);
4778
4779 uint8_t Imm;
4780 switch (FoldableOp.getOpcode()) {
4781 default: llvm_unreachable("Unexpected opcode!");
4782 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4783 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4784 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4785 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4786 }
4787
4788 switch (N->getOpcode()) {
4789 default: llvm_unreachable("Unexpected opcode!");
4790 case X86ISD::ANDNP:
4791 if (A == N0)
4792 Imm &= ~TernlogMagicA;
4793 else
4794 Imm = ~(Imm) & TernlogMagicA;
4795 break;
4796 case ISD::AND: Imm &= TernlogMagicA; break;
4797 case ISD::OR: Imm |= TernlogMagicA; break;
4798 case ISD::XOR: Imm ^= TernlogMagicA; break;
4799 }
4800
4801 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4802}
4803
4804/// If the high bits of an 'and' operand are known zero, try setting the
4805/// high bits of an 'and' constant operand to produce a smaller encoding by
4806/// creating a small, sign-extended negative immediate rather than a large
4807/// positive one. This reverses a transform in SimplifyDemandedBits that
4808/// shrinks mask constants by clearing bits. There is also a possibility that
4809/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4810/// case, just replace the 'and'. Return 'true' if the node is replaced.
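/// For example, if the sign bit of an i32 operand is known zero, the mask
/// 0x7ffffff0 (a 4-byte immediate) can become 0xfffffff0 == -16, which
/// encodes as a sign-extended 8-bit immediate.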
4811bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4812 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4813 // have immediate operands.
4814 MVT VT = And->getSimpleValueType(0);
4815 if (VT != MVT::i32 && VT != MVT::i64)
4816 return false;
4817
4818 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4819 if (!And1C)
4820 return false;
4821
4822 // Bail out if the mask constant is already negative. It can't shrink more.
4823 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4824 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4825 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4826 // are negative too.
4827 APInt MaskVal = And1C->getAPIntValue();
4828 unsigned MaskLZ = MaskVal.countl_zero();
4829 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4830 return false;
4831
4832 // Don't extend into the upper 32 bits of a 64 bit mask.
4833 if (VT == MVT::i64 && MaskLZ >= 32) {
4834 MaskLZ -= 32;
4835 MaskVal = MaskVal.trunc(32);
4836 }
4837
4838 SDValue And0 = And->getOperand(0);
4839 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4840 APInt NegMaskVal = MaskVal | HighZeros;
4841
4842 // If a negative constant would not allow a smaller encoding, there's no need
4843 // to continue. Only change the constant when we know it's a win.
4844 unsigned MinWidth = NegMaskVal.getSignificantBits();
4845 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4846 return false;
4847
4848 // Extend masks if we truncated above.
4849 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4850 NegMaskVal = NegMaskVal.zext(64);
4851 HighZeros = HighZeros.zext(64);
4852 }
4853
4854 // The variable operand must be all zeros in the top bits to allow using the
4855 // new, negative constant as the mask.
4856 // TODO: Handle constant folding?
4857 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4858 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4859 return false;
4860
4861 // Check if the mask is -1. In that case, this is an unnecessary instruction
4862 // that escaped earlier analysis.
4863 if (NegMaskVal.isAllOnes()) {
4864 ReplaceNode(And, And0.getNode());
4865 return true;
4866 }
4867
4868 // A negative mask allows a smaller encoding. Create a new 'and' node.
4869 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4870 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4871 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4872 ReplaceNode(And, NewAnd.getNode());
4873 SelectCode(NewAnd.getNode());
4874 return true;
4875}
4876
4877static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4878 bool FoldedBCast, bool Masked) {
4879#define VPTESTM_CASE(VT, SUFFIX) \
4880case MVT::VT: \
4881 if (Masked) \
4882 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4883 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4884
4885
4886#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4887default: llvm_unreachable("Unexpected VT!"); \
4888VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4889VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4890VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4891VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4892VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4893VPTESTM_CASE(v8i64, QZ##SUFFIX)
4894
4895#define VPTESTM_FULL_CASES(SUFFIX) \
4896VPTESTM_BROADCAST_CASES(SUFFIX) \
4897VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4898VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4899VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4900VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4901VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4902VPTESTM_CASE(v32i16, WZ##SUFFIX)
4903
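 // The SUFFIX argument selects the operand form of the generated opcode name:
 // register-register, folded load, or folded broadcast, mirroring the
 // rri/rmi/rmbi naming used for VPTERNLOG above.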
4904 if (FoldedBCast) {
4905 switch (TestVT.SimpleTy) {
4906 VPTESTM_BROADCAST_CASES(rmb)
4907 }
4908 }
4909
4910 if (FoldedLoad) {
4911 switch (TestVT.SimpleTy) {
4912 VPTESTM_FULL_CASES(rm)
4913 }
4914 }
4915
4916 switch (TestVT.SimpleTy) {
4917 VPTESTM_FULL_CASES(rr)
4918 }
4919
4920#undef VPTESTM_FULL_CASES
4921#undef VPTESTM_BROADCAST_CASES
4922#undef VPTESTM_CASE
4923}
4924
4925// Try to create VPTESTM instruction. If InMask is not null, it will be used
4926// to form a masked operation.
4927bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4928 SDValue InMask) {
4929 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4930 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4931 "Unexpected VT!");
4932
4933 // Look for equal and not equal compares.
4934 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4935 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4936 return false;
4937
4938 SDValue SetccOp0 = Setcc.getOperand(0);
4939 SDValue SetccOp1 = Setcc.getOperand(1);
4940
4941 // Canonicalize the all zero vector to the RHS.
4942 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4943 std::swap(SetccOp0, SetccOp1);
4944
4945 // See if we're comparing against zero.
4946 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4947 return false;
4948
4949 SDValue N0 = SetccOp0;
4950
4951 MVT CmpVT = N0.getSimpleValueType();
4952 MVT CmpSVT = CmpVT.getVectorElementType();
4953
4954 // Start with both operands the same. We'll try to refine this.
4955 SDValue Src0 = N0;
4956 SDValue Src1 = N0;
4957
4958 {
4959 // Look through single use bitcasts.
4960 SDValue N0Temp = N0;
4961 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4962 N0Temp = N0.getOperand(0);
4963
4964 // Look for single use AND.
4965 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4966 Src0 = N0Temp.getOperand(0);
4967 Src1 = N0Temp.getOperand(1);
4968 }
4969 }
4970
4971 // Without VLX we need to widen the operation.
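 // Only 512-bit VPTESTM/VPTESTNM exist without VLX, so 128/256-bit inputs are
 // widened to 512 bits below and the resulting mask is narrowed back with a
 // COPY_TO_REGCLASS after the instruction is created.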
4972 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4973
4974 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4975 SDValue &Base, SDValue &Scale, SDValue &Index,
4976 SDValue &Disp, SDValue &Segment) {
4977 // If we need to widen, we can't fold the load.
4978 if (!Widen)
4979 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4980 return true;
4981
4982 // If we didn't fold a load, try to match a broadcast. There is no widening
4983 // limitation for this, but only 32- and 64-bit element types are supported.
4984 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4985 return false;
4986
4987 // Look through single use bitcasts.
4988 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4989 P = L.getNode();
4990 L = L.getOperand(0);
4991 }
4992
4993 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4994 return false;
4995
4996 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4997 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4998 return false;
4999
5000 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
5001 };
5002
5003 // We can only fold loads if the sources are unique.
5004 bool CanFoldLoads = Src0 != Src1;
5005
5006 bool FoldedLoad = false;
5007 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5008 if (CanFoldLoads) {
5009 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5010 Tmp3, Tmp4);
5011 if (!FoldedLoad) {
5012 // And is commutative.
5013 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5014 Tmp2, Tmp3, Tmp4);
5015 if (FoldedLoad)
5016 std::swap(Src0, Src1);
5017 }
5018 }
5019
5020 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5021
5022 bool IsMasked = InMask.getNode() != nullptr;
5023
5024 SDLoc dl(Root);
5025
5026 MVT ResVT = Setcc.getSimpleValueType();
5027 MVT MaskVT = ResVT;
5028 if (Widen) {
5029 // Widen the inputs using insert_subreg or copy_to_regclass.
5030 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5031 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5032 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5033 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5034 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5035 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5036 CmpVT), 0);
5037 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5038
5039 if (!FoldedBCast)
5040 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5041
5042 if (IsMasked) {
5043 // Widen the mask.
5044 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5045 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5046 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5047 dl, MaskVT, InMask, RC), 0);
5048 }
5049 }
5050
5051 bool IsTestN = CC == ISD::SETEQ;
5052 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5053 IsMasked);
5054
5055 MachineSDNode *CNode;
5056 if (FoldedLoad) {
5057 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5058
5059 if (IsMasked) {
5060 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5061 Src1.getOperand(0) };
5062 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5063 } else {
5064 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5065 Src1.getOperand(0) };
5066 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5067 }
5068
5069 // Update the chain.
5070 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5071 // Record the mem-refs
5072 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5073 } else {
5074 if (IsMasked)
5075 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5076 else
5077 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5078 }
5079
5080 // If we widened, we need to shrink the mask VT.
5081 if (Widen) {
5082 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5083 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5084 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5085 dl, ResVT, SDValue(CNode, 0), RC);
5086 }
5087
5088 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5089 CurDAG->RemoveDeadNode(Root);
5090 return true;
5091}
5092
5093// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5094// into vpternlog.
5095bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5096 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5097
5098 MVT NVT = N->getSimpleValueType(0);
5099
5100 // Make sure we support VPTERNLOG.
5101 if (!NVT.isVector() || !Subtarget->hasAVX512())
5102 return false;
5103
5104 // We need VLX for 128/256-bit.
5105 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5106 return false;
5107
5108 SDValue N0 = N->getOperand(0);
5109 SDValue N1 = N->getOperand(1);
5110
5111 // Canonicalize AND to LHS.
5112 if (N1.getOpcode() == ISD::AND)
5113 std::swap(N0, N1);
5114
5115 if (N0.getOpcode() != ISD::AND ||
5116 N1.getOpcode() != X86ISD::ANDNP ||
5117 !N0.hasOneUse() || !N1.hasOneUse())
5118 return false;
5119
5120 // ANDN is not commutable, so use it to pin down A and C.
5121 SDValue A = N1.getOperand(0);
5122 SDValue C = N1.getOperand(1);
5123
5124 // AND is commutable, if one operand matches A, the other operand is B.
5125 // Otherwise this isn't a match.
5126 SDValue B;
5127 if (N0.getOperand(0) == A)
5128 B = N0.getOperand(1);
5129 else if (N0.getOperand(1) == A)
5130 B = N0.getOperand(0);
5131 else
5132 return false;
5133
5134 SDLoc dl(N);
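 // 0xCA is the truth table of the bitwise select (A & B) | (~A & C) over the
 // index (A<<2)|(B<<1)|C, i.e. "if A then B else C" for each bit.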
5135 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5136 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5137 ReplaceNode(N, Ternlog.getNode());
5138
5139 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5140 Ternlog.getNode(), A, B, C, 0xCA);
5141}
5142
5143void X86DAGToDAGISel::Select(SDNode *Node) {
5144 MVT NVT = Node->getSimpleValueType(0);
5145 unsigned Opcode = Node->getOpcode();
5146 SDLoc dl(Node);
5147
5148 if (Node->isMachineOpcode()) {
5149 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5150 Node->setNodeId(-1);
5151 return; // Already selected.
5152 }
5153
5154 switch (Opcode) {
5155 default: break;
5156 case ISD::INTRINSIC_W_CHAIN: {
5157 unsigned IntNo = Node->getConstantOperandVal(1);
5158 switch (IntNo) {
5159 default: break;
5160 case Intrinsic::x86_encodekey128:
5161 case Intrinsic::x86_encodekey256: {
5162 if (!Subtarget->hasKL())
5163 break;
5164
5165 unsigned Opcode;
5166 switch (IntNo) {
5167 default: llvm_unreachable("Impossible intrinsic");
5168 case Intrinsic::x86_encodekey128:
5169 Opcode = X86::ENCODEKEY128;
5170 break;
5171 case Intrinsic::x86_encodekey256:
5172 Opcode = X86::ENCODEKEY256;
5173 break;
5174 }
5175
5176 SDValue Chain = Node->getOperand(0);
5177 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5178 SDValue());
5179 if (Opcode == X86::ENCODEKEY256)
5180 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5181 Chain.getValue(1));
5182
5183 MachineSDNode *Res = CurDAG->getMachineNode(
5184 Opcode, dl, Node->getVTList(),
5185 {Node->getOperand(2), Chain, Chain.getValue(1)});
5186 ReplaceNode(Node, Res);
5187 return;
5188 }
5189 case Intrinsic::x86_tileloaddrs64_internal:
5190 case Intrinsic::x86_tileloaddrst164_internal:
5191 if (!Subtarget->hasAMXMOVRS())
5192 break;
5193 [[fallthrough]];
5194 case Intrinsic::x86_tileloadd64_internal:
5195 case Intrinsic::x86_tileloaddt164_internal: {
5196 if (!Subtarget->hasAMXTILE())
5197 break;
5198 auto *MFI =
5199 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5200 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5201 unsigned Opc;
5202 switch (IntNo) {
5203 default:
5204 llvm_unreachable("Unexpected intrinsic!");
5205 case Intrinsic::x86_tileloaddrs64_internal:
5206 Opc = X86::PTILELOADDRSV;
5207 break;
5208 case Intrinsic::x86_tileloaddrst164_internal:
5209 Opc = X86::PTILELOADDRST1V;
5210 break;
5211 case Intrinsic::x86_tileloadd64_internal:
5212 Opc = X86::PTILELOADDV;
5213 break;
5214 case Intrinsic::x86_tileloaddt164_internal:
5215 Opc = X86::PTILELOADDT1V;
5216 break;
5217 }
5218 // _tile_loadd_internal(row, col, buf, STRIDE)
5219 SDValue Base = Node->getOperand(4);
5220 SDValue Scale = getI8Imm(1, dl);
5221 SDValue Index = Node->getOperand(5);
5222 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5223 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5224 SDValue Chain = Node->getOperand(0);
5225 MachineSDNode *CNode;
5226 SDValue Ops[] = {Node->getOperand(2),
5227 Node->getOperand(3),
5228 Base,
5229 Scale,
5230 Index,
5231 Disp,
5232 Segment,
5233 Chain};
5234 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5235 ReplaceNode(Node, CNode);
5236 return;
5237 }
5238 }
5239 break;
5240 }
5241 case ISD::INTRINSIC_VOID: {
5242 unsigned IntNo = Node->getConstantOperandVal(1);
5243 switch (IntNo) {
5244 default: break;
5245 case Intrinsic::x86_sse3_monitor:
5246 case Intrinsic::x86_monitorx:
5247 case Intrinsic::x86_clzero: {
5248 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5249
5250 unsigned Opc = 0;
5251 switch (IntNo) {
5252 default: llvm_unreachable("Unexpected intrinsic!");
5253 case Intrinsic::x86_sse3_monitor:
5254 if (!Subtarget->hasSSE3())
5255 break;
5256 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5257 break;
5258 case Intrinsic::x86_monitorx:
5259 if (!Subtarget->hasMWAITX())
5260 break;
5261 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5262 break;
5263 case Intrinsic::x86_clzero:
5264 if (!Subtarget->hasCLZERO())
5265 break;
5266 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5267 break;
5268 }
5269
5270 if (Opc) {
5271 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5272 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5273 Node->getOperand(2), SDValue());
5274 SDValue InGlue = Chain.getValue(1);
5275
5276 if (IntNo == Intrinsic::x86_sse3_monitor ||
5277 IntNo == Intrinsic::x86_monitorx) {
5278 // Copy the other two operands to ECX and EDX.
5279 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5280 InGlue);
5281 InGlue = Chain.getValue(1);
5282 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5283 InGlue);
5284 InGlue = Chain.getValue(1);
5285 }
5286
5287 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5288 { Chain, InGlue});
5289 ReplaceNode(Node, CNode);
5290 return;
5291 }
5292
5293 break;
5294 }
5295 case Intrinsic::x86_tilestored64_internal: {
5296 auto *MFI =
5297 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5298 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5299 unsigned Opc = X86::PTILESTOREDV;
5300 // _tile_stored_internal(row, col, buf, STRIDE, c)
5301 SDValue Base = Node->getOperand(4);
5302 SDValue Scale = getI8Imm(1, dl);
5303 SDValue Index = Node->getOperand(5);
5304 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5305 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5306 SDValue Chain = Node->getOperand(0);
5307 MachineSDNode *CNode;
5308 SDValue Ops[] = {Node->getOperand(2),
5309 Node->getOperand(3),
5310 Base,
5311 Scale,
5312 Index,
5313 Disp,
5314 Segment,
5315 Node->getOperand(6),
5316 Chain};
5317 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5318 ReplaceNode(Node, CNode);
5319 return;
5320 }
5321 case Intrinsic::x86_tileloaddrs64:
5322 case Intrinsic::x86_tileloaddrst164:
5323 if (!Subtarget->hasAMXMOVRS())
5324 break;
5325 [[fallthrough]];
5326 case Intrinsic::x86_tileloadd64:
5327 case Intrinsic::x86_tileloaddt164:
5328 case Intrinsic::x86_tilestored64: {
5329 if (!Subtarget->hasAMXTILE())
5330 break;
5331 auto *MFI =
5332 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5333 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5334 unsigned Opc;
5335 switch (IntNo) {
5336 default: llvm_unreachable("Unexpected intrinsic!");
5337 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5338 case Intrinsic::x86_tileloaddrs64:
5339 Opc = X86::PTILELOADDRS;
5340 break;
5341 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5342 case Intrinsic::x86_tileloaddrst164:
5343 Opc = X86::PTILELOADDRST1;
5344 break;
5345 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5346 }
5347 // FIXME: Match displacement and scale.
5348 unsigned TIndex = Node->getConstantOperandVal(2);
5349 SDValue TReg = getI8Imm(TIndex, dl);
5350 SDValue Base = Node->getOperand(3);
5351 SDValue Scale = getI8Imm(1, dl);
5352 SDValue Index = Node->getOperand(4);
5353 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5354 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5355 SDValue Chain = Node->getOperand(0);
5356 MachineSDNode *CNode;
5357 if (Opc == X86::PTILESTORED) {
5358 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5359 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5360 } else {
5361 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5362 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5363 }
5364 ReplaceNode(Node, CNode);
5365 return;
5366 }
5367 case Intrinsic::x86_t2rpntlvwz0rs:
5368 case Intrinsic::x86_t2rpntlvwz0rst1:
5369 case Intrinsic::x86_t2rpntlvwz1rs:
5370 case Intrinsic::x86_t2rpntlvwz1rst1:
5371 if (!Subtarget->hasAMXMOVRS())
5372 break;
5373 [[fallthrough]];
5374 case Intrinsic::x86_t2rpntlvwz0:
5375 case Intrinsic::x86_t2rpntlvwz0t1:
5376 case Intrinsic::x86_t2rpntlvwz1:
5377 case Intrinsic::x86_t2rpntlvwz1t1: {
5378 if (!Subtarget->hasAMXTRANSPOSE())
5379 break;
5380 auto *MFI =
5381 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5382 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5383 unsigned Opc;
5384 switch (IntNo) {
5385 default:
5386 llvm_unreachable("Unexpected intrinsic!");
5387 case Intrinsic::x86_t2rpntlvwz0:
5388 Opc = X86::PT2RPNTLVWZ0;
5389 break;
5390 case Intrinsic::x86_t2rpntlvwz0t1:
5391 Opc = X86::PT2RPNTLVWZ0T1;
5392 break;
5393 case Intrinsic::x86_t2rpntlvwz1:
5394 Opc = X86::PT2RPNTLVWZ1;
5395 break;
5396 case Intrinsic::x86_t2rpntlvwz1t1:
5397 Opc = X86::PT2RPNTLVWZ1T1;
5398 break;
5399 case Intrinsic::x86_t2rpntlvwz0rs:
5400 Opc = X86::PT2RPNTLVWZ0RS;
5401 break;
5402 case Intrinsic::x86_t2rpntlvwz0rst1:
5403 Opc = X86::PT2RPNTLVWZ0RST1;
5404 break;
5405 case Intrinsic::x86_t2rpntlvwz1rs:
5406 Opc = X86::PT2RPNTLVWZ1RS;
5407 break;
5408 case Intrinsic::x86_t2rpntlvwz1rst1:
5409 Opc = X86::PT2RPNTLVWZ1RST1;
5410 break;
5411 }
5412 // FIXME: Match displacement and scale.
5413 unsigned TIndex = Node->getConstantOperandVal(2);
5414 SDValue TReg = getI8Imm(TIndex, dl);
5415 SDValue Base = Node->getOperand(3);
5416 SDValue Scale = getI8Imm(1, dl);
5417 SDValue Index = Node->getOperand(4);
5418 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5419 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5420 SDValue Chain = Node->getOperand(0);
5421 SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5422 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5423 ReplaceNode(Node, CNode);
5424 return;
5425 }
5426 }
5427 break;
5428 }
5429 case ISD::BRIND:
5430 case X86ISD::NT_BRIND: {
5431 if (Subtarget->isTarget64BitILP32()) {
5432 // Converts a 32-bit register to a 64-bit, zero-extended version of
5433 // it. This is needed because x86-64 can do many things, but jmp %r32
5434 // ain't one of them.
5435 SDValue Target = Node->getOperand(1);
5436 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5437 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5438 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5439 Node->getOperand(0), ZextTarget);
5440 ReplaceNode(Node, Brind.getNode());
5441 SelectCode(ZextTarget.getNode());
5442 SelectCode(Brind.getNode());
5443 return;
5444 }
5445 break;
5446 }
5447 case X86ISD::GlobalBaseReg:
5448 ReplaceNode(Node, getGlobalBaseReg());
5449 return;
5450
5451 case ISD::BITCAST:
5452 // Just drop all 128/256/512-bit bitcasts.
5453 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5454 NVT == MVT::f128) {
5455 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5456 CurDAG->RemoveDeadNode(Node);
5457 return;
5458 }
5459 break;
5460
5461 case ISD::SRL:
5462 if (matchBitExtract(Node))
5463 return;
5464 [[fallthrough]];
5465 case ISD::SRA:
5466 case ISD::SHL:
5467 if (tryShiftAmountMod(Node))
5468 return;
5469 break;
5470
5471 case X86ISD::VPTERNLOG: {
5472 uint8_t Imm = Node->getConstantOperandVal(3);
5473 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5474 Node->getOperand(1), Node->getOperand(2), Imm))
5475 return;
5476 break;
5477 }
5478
5479 case X86ISD::ANDNP:
5480 if (tryVPTERNLOG(Node))
5481 return;
5482 break;
5483
5484 case ISD::AND:
5485 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5486 // Try to form a masked VPTESTM. Operands can be in either order.
5487 SDValue N0 = Node->getOperand(0);
5488 SDValue N1 = Node->getOperand(1);
5489 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5490 tryVPTESTM(Node, N0, N1))
5491 return;
5492 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5493 tryVPTESTM(Node, N1, N0))
5494 return;
5495 }
5496
5497 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5498 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5499 CurDAG->RemoveDeadNode(Node);
5500 return;
5501 }
5502 if (matchBitExtract(Node))
5503 return;
5504 if (AndImmShrink && shrinkAndImmediate(Node))
5505 return;
5506
5507 [[fallthrough]];
5508 case ISD::OR:
5509 case ISD::XOR:
5510 if (tryShrinkShlLogicImm(Node))
5511 return;
5512 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5513 return;
5514 if (tryVPTERNLOG(Node))
5515 return;
5516
5517 [[fallthrough]];
5518 case ISD::ADD:
5519 if (Opcode == ISD::ADD && matchBitExtract(Node))
5520 return;
5521 [[fallthrough]];
5522 case ISD::SUB: {
5523 // Try to avoid folding immediates with multiple uses for optsize.
5524 // This code tries to select to register form directly to avoid going
5525 // through the isel table which might fold the immediate. We can't change
5526 // the add/sub/and/or/xor-with-immediate patterns in the
5527 // tablegen files to check immediate use count without making the patterns
5528 // unavailable to the fast-isel table.
5529 if (!CurDAG->shouldOptForSize())
5530 break;
5531
5532 // Only handle i8/i16/i32/i64.
5533 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5534 break;
5535
5536 SDValue N0 = Node->getOperand(0);
5537 SDValue N1 = Node->getOperand(1);
5538
5539 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5540 if (!Cst)
5541 break;
5542
5543 int64_t Val = Cst->getSExtValue();
5544
5545 // Make sure it's an immediate that is considered foldable.
5546 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5547 if (!isInt<8>(Val) && !isInt<32>(Val))
5548 break;
5549
5550 // If this can match to INC/DEC, let it go.
5551 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5552 break;
5553
5554 // Check if we should avoid folding this immediate.
5555 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5556 break;
5557
5558 // We should not fold the immediate. So we need a register form instead.
5559 unsigned ROpc, MOpc;
5560 switch (NVT.SimpleTy) {
5561 default: llvm_unreachable("Unexpected VT!");
5562 case MVT::i8:
5563 switch (Opcode) {
5564 default: llvm_unreachable("Unexpected opcode!");
5565 case ISD::ADD:
5566 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5567 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5568 break;
5569 case ISD::SUB:
5570 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5571 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5572 break;
5573 case ISD::AND:
5574 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5575 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5576 break;
5577 case ISD::OR:
5578 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5579 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5580 break;
5581 case ISD::XOR:
5582 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5583 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5584 break;
5585 }
5586 break;
5587 case MVT::i16:
5588 switch (Opcode) {
5589 default: llvm_unreachable("Unexpected opcode!");
5590 case ISD::ADD:
5591 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5592 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5593 break;
5594 case ISD::SUB:
5595 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5596 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5597 break;
5598 case ISD::AND:
5599 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5600 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5601 break;
5602 case ISD::OR:
5603 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5604 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5605 break;
5606 case ISD::XOR:
5607 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5608 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5609 break;
5610 }
5611 break;
5612 case MVT::i32:
5613 switch (Opcode) {
5614 default: llvm_unreachable("Unexpected opcode!");
5615 case ISD::ADD:
5616 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5617 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5618 break;
5619 case ISD::SUB:
5620 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5621 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5622 break;
5623 case ISD::AND:
5624 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5625 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5626 break;
5627 case ISD::OR:
5628 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5629 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5630 break;
5631 case ISD::XOR:
5632 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5633 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5634 break;
5635 }
5636 break;
5637 case MVT::i64:
5638 switch (Opcode) {
5639 default: llvm_unreachable("Unexpected opcode!");
5640 case ISD::ADD:
5641 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5642 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5643 break;
5644 case ISD::SUB:
5645 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5646 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5647 break;
5648 case ISD::AND:
5649 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5650 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5651 break;
5652 case ISD::OR:
5653 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5654 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5655 break;
5656 case ISD::XOR:
5657 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5658 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5659 break;
5660 }
5661 break;
5662 }
5663
5664 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5665
5666 // If this is not a subtract, we can still try to fold a load.
5667 if (Opcode != ISD::SUB) {
5668 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5669 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5670 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5671 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5672 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5673 // Update the chain.
5674 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5675 // Record the mem-refs
5676 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5677 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5678 CurDAG->RemoveDeadNode(Node);
5679 return;
5680 }
5681 }
5682
5683 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5684 return;
5685 }
5686
5687 case X86ISD::SMUL:
5688 // i16/i32/i64 are handled with isel patterns.
5689 if (NVT != MVT::i8)
5690 break;
5691 [[fallthrough]];
5692 case X86ISD::UMUL: {
5693 SDValue N0 = Node->getOperand(0);
5694 SDValue N1 = Node->getOperand(1);
5695
5696 unsigned LoReg, ROpc, MOpc;
5697 switch (NVT.SimpleTy) {
5698 default: llvm_unreachable("Unsupported VT!");
5699 case MVT::i8:
5700 LoReg = X86::AL;
5701 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5702 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5703 break;
5704 case MVT::i16:
5705 LoReg = X86::AX;
5706 ROpc = X86::MUL16r;
5707 MOpc = X86::MUL16m;
5708 break;
5709 case MVT::i32:
5710 LoReg = X86::EAX;
5711 ROpc = X86::MUL32r;
5712 MOpc = X86::MUL32m;
5713 break;
5714 case MVT::i64:
5715 LoReg = X86::RAX;
5716 ROpc = X86::MUL64r;
5717 MOpc = X86::MUL64m;
5718 break;
5719 }
5720
5721 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5722 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5723 // Multiply is commutative.
5724 if (!FoldedLoad) {
5725 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5726 if (FoldedLoad)
5727 std::swap(N0, N1);
5728 }
5729
5730 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5731 N0, SDValue()).getValue(1);
5732
5733 MachineSDNode *CNode;
5734 if (FoldedLoad) {
5735 // i16/i32/i64 use an instruction that produces a low and high result even
5736 // though only the low result is used.
5737 SDVTList VTs;
5738 if (NVT == MVT::i8)
5739 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5740 else
5741 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5742
5743 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5744 InGlue };
5745 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5746
5747 // Update the chain.
5748 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5749 // Record the mem-refs
5750 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5751 } else {
5752 // i16/i32/i64 use an instruction that produces a low and high result even
5753 // though only the low result is used.
5754 SDVTList VTs;
5755 if (NVT == MVT::i8)
5756 VTs = CurDAG->getVTList(NVT, MVT::i32);
5757 else
5758 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5759
5760 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5761 }
5762
5763 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5764 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5765 CurDAG->RemoveDeadNode(Node);
5766 return;
5767 }
5768
5769 case ISD::SMUL_LOHI:
5770 case ISD::UMUL_LOHI: {
5771 SDValue N0 = Node->getOperand(0);
5772 SDValue N1 = Node->getOperand(1);
5773
5774 unsigned Opc, MOpc;
5775 unsigned LoReg, HiReg;
5776 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5777 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5778 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
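 // When only the high half is needed (result 0 is dead), the MULX*Hrr/Hrm
 // forms selected below produce just that single result.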
5779 switch (NVT.SimpleTy) {
5780 default: llvm_unreachable("Unsupported VT!");
5781 case MVT::i32:
5782 Opc = UseMULXHi ? X86::MULX32Hrr
5783 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5784 : IsSigned ? X86::IMUL32r
5785 : X86::MUL32r;
5786 MOpc = UseMULXHi ? X86::MULX32Hrm
5787 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5788 : IsSigned ? X86::IMUL32m
5789 : X86::MUL32m;
5790 LoReg = UseMULX ? X86::EDX : X86::EAX;
5791 HiReg = X86::EDX;
5792 break;
5793 case MVT::i64:
5794 Opc = UseMULXHi ? X86::MULX64Hrr
5795 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5796 : IsSigned ? X86::IMUL64r
5797 : X86::MUL64r;
5798 MOpc = UseMULXHi ? X86::MULX64Hrm
5799 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5800 : IsSigned ? X86::IMUL64m
5801 : X86::MUL64m;
5802 LoReg = UseMULX ? X86::RDX : X86::RAX;
5803 HiReg = X86::RDX;
5804 break;
5805 }
5806
5807 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5808 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5809 // Multiply is commutative.
5810 if (!foldedLoad) {
5811 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5812 if (foldedLoad)
5813 std::swap(N0, N1);
5814 }
5815
5816 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5817 N0, SDValue()).getValue(1);
5818 SDValue ResHi, ResLo;
5819 if (foldedLoad) {
5820 SDValue Chain;
5821 MachineSDNode *CNode = nullptr;
5822 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5823 InGlue };
5824 if (UseMULXHi) {
5825 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5826 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5827 ResHi = SDValue(CNode, 0);
5828 Chain = SDValue(CNode, 1);
5829 } else if (UseMULX) {
5830 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5831 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5832 ResHi = SDValue(CNode, 0);
5833 ResLo = SDValue(CNode, 1);
5834 Chain = SDValue(CNode, 2);
5835 } else {
5836 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5837 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5838 Chain = SDValue(CNode, 0);
5839 InGlue = SDValue(CNode, 1);
5840 }
5841
5842 // Update the chain.
5843 ReplaceUses(N1.getValue(1), Chain);
5844 // Record the mem-refs
5845 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5846 } else {
5847 SDValue Ops[] = { N1, InGlue };
5848 if (UseMULXHi) {
5849 SDVTList VTs = CurDAG->getVTList(NVT);
5850 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5851 ResHi = SDValue(CNode, 0);
5852 } else if (UseMULX) {
5853 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5854 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5855 ResHi = SDValue(CNode, 0);
5856 ResLo = SDValue(CNode, 1);
5857 } else {
5858 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5859 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5860 InGlue = SDValue(CNode, 0);
5861 }
5862 }
5863
5864 // Copy the low half of the result, if it is needed.
5865 if (!SDValue(Node, 0).use_empty()) {
5866 if (!ResLo) {
5867 assert(LoReg && "Register for low half is not defined!");
5868 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5869 NVT, InGlue);
5870 InGlue = ResLo.getValue(2);
5871 }
5872 ReplaceUses(SDValue(Node, 0), ResLo);
5873 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5874 dbgs() << '\n');
5875 }
5876 // Copy the high half of the result, if it is needed.
5877 if (!SDValue(Node, 1).use_empty()) {
5878 if (!ResHi) {
5879 assert(HiReg && "Register for high half is not defined!");
5880 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5881 NVT, InGlue);
5882 InGlue = ResHi.getValue(2);
5883 }
5884 ReplaceUses(SDValue(Node, 1), ResHi);
5885 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5886 dbgs() << '\n');
5887 }
5888
5889 CurDAG->RemoveDeadNode(Node);
5890 return;
5891 }
5892
5893 case ISD::SDIVREM:
5894 case ISD::UDIVREM: {
5895 SDValue N0 = Node->getOperand(0);
5896 SDValue N1 = Node->getOperand(1);
5897
5898 unsigned ROpc, MOpc;
5899 bool isSigned = Opcode == ISD::SDIVREM;
5900 if (!isSigned) {
5901 switch (NVT.SimpleTy) {
5902 default: llvm_unreachable("Unsupported VT!");
5903 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5904 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5905 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5906 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5907 }
5908 } else {
5909 switch (NVT.SimpleTy) {
5910 default: llvm_unreachable("Unsupported VT!");
5911 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5912 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5913 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5914 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5915 }
5916 }
5917
5918 unsigned LoReg, HiReg, ClrReg;
5919 unsigned SExtOpcode;
5920 switch (NVT.SimpleTy) {
5921 default: llvm_unreachable("Unsupported VT!");
5922 case MVT::i8:
5923 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5924 SExtOpcode = 0; // Not used.
5925 break;
5926 case MVT::i16:
5927 LoReg = X86::AX; HiReg = X86::DX;
5928 ClrReg = X86::DX;
5929 SExtOpcode = X86::CWD;
5930 break;
5931 case MVT::i32:
5932 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5933 SExtOpcode = X86::CDQ;
5934 break;
5935 case MVT::i64:
5936 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5937 SExtOpcode = X86::CQO;
5938 break;
5939 }
5940
5941 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5942 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5943 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
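 // If the dividend's sign bit is known zero, a signed divide can skip the
 // CWD/CDQ/CQO sign extension below and simply zero the high register.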
5944
5945 SDValue InGlue;
5946 if (NVT == MVT::i8) {
5947 // Special case for div8, just use a move with zero extension to AX to
5948 // clear the upper 8 bits (AH).
5949 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5950 MachineSDNode *Move;
5951 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5952 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5953 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5954 : X86::MOVZX16rm8;
5955 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5956 Chain = SDValue(Move, 1);
5957 ReplaceUses(N0.getValue(1), Chain);
5958 // Record the mem-refs
5959 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5960 } else {
5961 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5962 : X86::MOVZX16rr8;
5963 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5964 Chain = CurDAG->getEntryNode();
5965 }
5966 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5967 SDValue());
5968 InGlue = Chain.getValue(1);
5969 } else {
5970 InGlue =
5971 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5972 LoReg, N0, SDValue()).getValue(1);
5973 if (isSigned && !signBitIsZero) {
5974 // Sign extend the low part into the high part.
5975 InGlue =
5976 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5977 } else {
5978 // Zero out the high part, effectively zero extending the input.
5979 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5980 SDValue ClrNode =
5981 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
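 // MOV32r0 is a pseudo that is later expanded to a 32-bit xor. For i16 the
 // value is narrowed with EXTRACT_SUBREG; for i64 it is wrapped in
 // SUBREG_TO_REG, relying on 32-bit ops implicitly zeroing the upper bits.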
5982 switch (NVT.SimpleTy) {
5983 case MVT::i16:
5984 ClrNode =
5985 SDValue(CurDAG->getMachineNode(
5986 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5987 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5988 MVT::i32)),
5989 0);
5990 break;
5991 case MVT::i32:
5992 break;
5993 case MVT::i64:
5994 ClrNode =
5995 SDValue(CurDAG->getMachineNode(
5996 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5997 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5998 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5999 MVT::i32)),
6000 0);
6001 break;
6002 default:
6003 llvm_unreachable("Unexpected division source");
6004 }
6005
6006 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
6007 ClrNode, InGlue).getValue(1);
6008 }
6009 }
6010
6011 if (foldedLoad) {
6012 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
6013 InGlue };
6014 MachineSDNode *CNode =
6015 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
6016 InGlue = SDValue(CNode, 1);
6017 // Update the chain.
6018 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
6019 // Record the mem-refs
6020 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6021 } else {
6022 InGlue =
6023 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6024 }
6025
6026 // Prevent use of AH in a REX instruction by explicitly copying it to
6027 // an ABCD_L register.
6028 //
6029 // The current assumption of the register allocator is that isel
6030 // won't generate explicit references to the GR8_ABCD_H registers. If
6031 // the allocator and/or the backend get enhanced to be more robust in
6032 // that regard, this can be, and should be, removed.
6033 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6034 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6035 unsigned AHExtOpcode =
6036 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6037
6038 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6039 MVT::Glue, AHCopy, InGlue);
6040 SDValue Result(RNode, 0);
6041 InGlue = SDValue(RNode, 1);
6042
6043 Result =
6044 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6045
6046 ReplaceUses(SDValue(Node, 1), Result);
6047 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6048 dbgs() << '\n');
6049 }
6050 // Copy the division (low) result, if it is needed.
6051 if (!SDValue(Node, 0).use_empty()) {
6052 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6053 LoReg, NVT, InGlue);
6054 InGlue = Result.getValue(2);
6055 ReplaceUses(SDValue(Node, 0), Result);
6056 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6057 dbgs() << '\n');
6058 }
6059 // Copy the remainder (high) result, if it is needed.
6060 if (!SDValue(Node, 1).use_empty()) {
6061 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6062 HiReg, NVT, InGlue);
6063 InGlue = Result.getValue(2);
6064 ReplaceUses(SDValue(Node, 1), Result);
6065 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6066 dbgs() << '\n');
6067 }
6068 CurDAG->RemoveDeadNode(Node);
6069 return;
6070 }
6071
6072 case X86ISD::FCMP:
6073 case X86ISD::STRICT_FCMP:
6074 case X86ISD::STRICT_FCMPS: {
6075 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6076 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6077 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6078 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6079
6080 // Save the original VT of the compare.
6081 MVT CmpVT = N0.getSimpleValueType();
6082
6083 // Floating point needs special handling if we don't have FCOMI.
6084 if (Subtarget->canUseCMOV())
6085 break;
6086
6087 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6088
6089 unsigned Opc;
6090 switch (CmpVT.SimpleTy) {
6091 default: llvm_unreachable("Unexpected type!");
6092 case MVT::f32:
6093 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6094 break;
6095 case MVT::f64:
6096 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6097 break;
6098 case MVT::f80:
6099 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6100 break;
6101 }
6102
6103 SDValue Chain =
6104 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6105 SDValue Glue;
6106 if (IsStrictCmp) {
6107 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6108 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6109 Glue = Chain.getValue(1);
6110 } else {
6111 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6112 }
6113
6114 // Move FPSW to AX.
6115 SDValue FNSTSW =
6116 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6117
6118 // Extract upper 8-bits of AX.
6119 SDValue Extract =
6120 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6121
6122 // Move AH into flags.
6123 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6124 assert(Subtarget->canUseLAHFSAHF() &&
6125 "Target doesn't support SAHF or FCOMI?");
6126 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6127 Chain = AH;
6128 SDValue SAHF = SDValue(
6129 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6130
6131 if (IsStrictCmp)
6132 ReplaceUses(SDValue(Node, 1), Chain);
6133
6134 ReplaceUses(SDValue(Node, 0), SAHF);
6135 CurDAG->RemoveDeadNode(Node);
6136 return;
6137 }
6138
6139 case X86ISD::CMP: {
6140 SDValue N0 = Node->getOperand(0);
6141 SDValue N1 = Node->getOperand(1);
6142
6143 // Optimizations for TEST compares.
6144 if (!isNullConstant(N1))
6145 break;
6146
6147 // Save the original VT of the compare.
6148 MVT CmpVT = N0.getSimpleValueType();
6149
6150 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6151 // by a test instruction. The test should be removed later by
6152 // analyzeCompare if we are using only the zero flag.
6153 // TODO: Should we check the users and use the BEXTR flags directly?
6154 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6155 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6156 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6157 : X86::TEST32rr;
6158 SDValue BEXTR = SDValue(NewNode, 0);
6159 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6160 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6161 CurDAG->RemoveDeadNode(Node);
6162 return;
6163 }
6164 }
6165
6166 // We can peek through truncates, but we need to be careful below.
6167 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6168 N0 = N0.getOperand(0);
6169
6170 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6171 // use a smaller encoding.
6172 // Look past the truncate if CMP is the only use of it.
6173 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6174 N0.getValueType() != MVT::i8) {
6175 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6176 if (!MaskC)
6177 break;
6178
6179 // We may have looked through a truncate so mask off any bits that
6180 // shouldn't be part of the compare.
6181 uint64_t Mask = MaskC->getZExtValue();
6182 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6183
6184 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6185 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6186 // zero flag.
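 // For example, with Mask == 0xff000000 and only ZF used, the AND+TEST pair
 // becomes a 64-bit SHR by 24 followed by a TEST8rr of the low byte of the
 // result, avoiding a movabsq of the immediate.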
6187 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6188 onlyUsesZeroFlag(SDValue(Node, 0))) {
6189 unsigned ShiftOpcode = ISD::DELETED_NODE;
6190 unsigned ShiftAmt;
6191 unsigned SubRegIdx;
6192 MVT SubRegVT;
6193 unsigned TestOpcode;
6194 unsigned LeadingZeros = llvm::countl_zero(Mask);
6195 unsigned TrailingZeros = llvm::countr_zero(Mask);
6196
6197 // With leading/trailing zeros, the transform is profitable if we can
6198 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6199 // incurring any extra register moves.
6200 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6201 if (LeadingZeros == 0 && SavesBytes) {
6202 // If the mask covers the most significant bit, then we can replace
6203 // TEST+AND with a SHR and check eflags.
6204 // This emits a redundant TEST which is subsequently eliminated.
6205 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6206 ShiftAmt = TrailingZeros;
6207 SubRegIdx = 0;
6208 TestOpcode = X86::TEST64rr;
6209 } else if (TrailingZeros == 0 && SavesBytes) {
6210 // If the mask covers the least significant bit, then we can replace
6211 // TEST+AND with a SHL and check eflags.
6212 // This emits a redundant TEST which is subsequently eliminated.
6213 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6214 ShiftAmt = LeadingZeros;
6215 SubRegIdx = 0;
6216 TestOpcode = X86::TEST64rr;
6217 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6218 // If the shifted mask extends into the high half and is 8/16/32 bits
6219 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6220 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6221 if (PopCount == 8) {
6222 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6223 ShiftAmt = TrailingZeros;
6224 SubRegIdx = X86::sub_8bit;
6225 SubRegVT = MVT::i8;
6226 TestOpcode = X86::TEST8rr;
6227 } else if (PopCount == 16) {
6228 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6229 ShiftAmt = TrailingZeros;
6230 SubRegIdx = X86::sub_16bit;
6231 SubRegVT = MVT::i16;
6232 TestOpcode = X86::TEST16rr;
6233 } else if (PopCount == 32) {
6234 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6235 ShiftAmt = TrailingZeros;
6236 SubRegIdx = X86::sub_32bit;
6237 SubRegVT = MVT::i32;
6238 TestOpcode = X86::TEST32rr;
6239 }
6240 }
6241 if (ShiftOpcode != ISD::DELETED_NODE) {
6242 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6243 SDValue Shift = SDValue(
6244 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6245 N0.getOperand(0), ShiftC),
6246 0);
6247 if (SubRegIdx != 0) {
6248 Shift =
6249 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6250 }
6251 MachineSDNode *Test =
6252 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6253 ReplaceNode(Node, Test);
6254 return;
6255 }
6256 }
6257
6258 MVT VT;
6259 int SubRegOp;
6260 unsigned ROpc, MOpc;
6261
6262 // For each of these checks we need to be careful if the sign flag is
6263 // being used. It is only safe to use the sign flag in two conditions,
6264 // either the sign bit in the shrunken mask is zero or the final test
6265 // size is equal to the original compare size.
6266
6267 if (isUInt<8>(Mask) &&
6268 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6269 hasNoSignFlagUses(SDValue(Node, 0)))) {
6270 // For example, convert "testl %eax, $8" to "testb %al, $8"
6271 VT = MVT::i8;
6272 SubRegOp = X86::sub_8bit;
6273 ROpc = X86::TEST8ri;
6274 MOpc = X86::TEST8mi;
6275 } else if (OptForMinSize && isUInt<16>(Mask) &&
6276 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6277 hasNoSignFlagUses(SDValue(Node, 0)))) {
6278 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6279 // NOTE: We only want to form TESTW instructions if optimizing for
6280 // min size. Otherwise we only save one byte and possibly get a
6281 // length-changing prefix penalty in the decoders.
6282 VT = MVT::i16;
6283 SubRegOp = X86::sub_16bit;
6284 ROpc = X86::TEST16ri;
6285 MOpc = X86::TEST16mi;
6286 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6287 ((!(Mask & 0x80000000) &&
6288 // Without minsize 16-bit Cmps can get here so we need to
6289 // be sure we calculate the correct sign flag if needed.
6290 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6291 CmpVT == MVT::i32 ||
6292 hasNoSignFlagUses(SDValue(Node, 0)))) {
6293 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6294 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6295 // Otherwise, we find ourselves in a position where we have to do
6296 // promotion. If previous passes did not promote the and, we assume
6297 // they had a good reason not to and do not promote here.
6298 VT = MVT::i32;
6299 SubRegOp = X86::sub_32bit;
6300 ROpc = X86::TEST32ri;
6301 MOpc = X86::TEST32mi;
6302 } else {
6303 // No eligible transformation was found.
6304 break;
6305 }
6306
6307 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6308 SDValue Reg = N0.getOperand(0);
6309
6310 // Emit a testl or testw.
6311 MachineSDNode *NewNode;
6312 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6313 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6314 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6315 if (!LoadN->isSimple()) {
6316 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6317 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6318 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6319 (MOpc == X86::TEST32mi && NumVolBits != 32))
6320 break;
6321 }
6322 }
6323 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6324 Reg.getOperand(0) };
6325 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6326 // Update the chain.
6327 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6328 // Record the mem-refs
6329 CurDAG->setNodeMemRefs(NewNode,
6330 {cast<LoadSDNode>(Reg)->getMemOperand()});
6331 } else {
6332 // Extract the subregister if necessary.
6333 if (N0.getValueType() != VT)
6334 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6335
6336 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6337 }
6338 // Replace CMP with TEST.
6339 ReplaceNode(Node, NewNode);
6340 return;
6341 }
6342 break;
6343 }
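// Editorial note (illustrative sketch, not part of the upstream source): the
// immediate-shrinking above rewrites a wide TEST against a small mask into a
// narrower encoding when the flag consumers allow it, e.g.
//   testl $8, %eax     ->  testb $8, %al      // always safe: bit 7 is clear
//   testl $0x80, %eax  ->  testb $0x80, %al   // only if SF is unused, since
//                                             // bit 7 becomes the sign bit
//                                             // of the 8-bit result
// TESTW is formed only under minsize because its 0x66 operand-size prefix can
// incur a length-changing-prefix stall in the instruction decoders.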
6344 case X86ISD::PCMPISTR: {
6345 if (!Subtarget->hasSSE42())
6346 break;
6347
6348 bool NeedIndex = !SDValue(Node, 0).use_empty();
6349 bool NeedMask = !SDValue(Node, 1).use_empty();
6350 // We can't fold a load if we are going to make two instructions.
6351 bool MayFoldLoad = !NeedIndex || !NeedMask;
6352
6353 MachineSDNode *CNode;
6354 if (NeedMask) {
6355 unsigned ROpc =
6356 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6357 unsigned MOpc =
6358 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6359 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6360 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6361 }
6362 if (NeedIndex || !NeedMask) {
6363 unsigned ROpc =
6364 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6365 unsigned MOpc =
6366 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6367 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6368 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6369 }
6370
6371 // Connect the flag usage to the last instruction created.
6372 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6373 CurDAG->RemoveDeadNode(Node);
6374 return;
6375 }
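// Editorial note (sketch, assuming the standard SSE4.2 string intrinsics): a
// single X86ISD::PCMPISTR node carries both the index result (ECX) and the
// mask result (XMM0), but the ISA splits them across PCMPISTRI and PCMPISTRM.
// IR along the lines of
//   %i = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a, <16 x i8> %b, i8 0)
//   %m = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a, <16 x i8> %b, i8 0)
// can end up as one node with both results live, and then two machine
// instructions must be emitted; MayFoldLoad is cleared in that case so a
// memory operand is not folded into (and re-read by) both of them.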
6376 case X86ISD::PCMPESTR: {
6377 if (!Subtarget->hasSSE42())
6378 break;
6379
6380 // Copy the two implicit register inputs.
6381 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6382 Node->getOperand(1),
6383 SDValue()).getValue(1);
6384 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6385 Node->getOperand(3), InGlue).getValue(1);
6386
6387 bool NeedIndex = !SDValue(Node, 0).use_empty();
6388 bool NeedMask = !SDValue(Node, 1).use_empty();
6389 // We can't fold a load if we are going to make two instructions.
6390 bool MayFoldLoad = !NeedIndex || !NeedMask;
6391
6392 MachineSDNode *CNode;
6393 if (NeedMask) {
6394 unsigned ROpc =
6395 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6396 unsigned MOpc =
6397 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6398 CNode =
6399 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6400 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6401 }
6402 if (NeedIndex || !NeedMask) {
6403 unsigned ROpc =
6404 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6405 unsigned MOpc =
6406 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6407 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6408 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6409 }
6410 // Connect the flag usage to the last instruction created.
6411 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6412 CurDAG->RemoveDeadNode(Node);
6413 return;
6414 }
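// Editorial note (sketch): PCMPESTRI/PCMPESTRM take the two explicit string
// lengths implicitly in EAX and EDX, which is why the code above glues two
// CopyToReg nodes onto the machine instruction. The selected sequence is
// roughly
//   movl <lenA>, %eax
//   movl <lenB>, %edx
//   pcmpestri $imm, %xmm1, %xmm0    // index -> ecx, flags -> eflags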
6415
6416 case ISD::SETCC: {
6417 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6418 return;
6419
6420 break;
6421 }
6422
6423 case ISD::STORE:
6424 if (foldLoadStoreIntoMemOperand(Node))
6425 return;
6426 break;
6427
6428 case X86ISD::SETCC_CARRY: {
6429 MVT VT = Node->getSimpleValueType(0);
6430 SDValue Result;
6431 if (Subtarget->hasSBBDepBreaking()) {
6432 // We have to do this manually because tblgen will put the eflags copy in
6433 // the wrong place if we use an extract_subreg in the pattern.
6434 // Copy flags to the EFLAGS register and glue it to next node.
6435 SDValue EFLAGS =
6436 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6437 Node->getOperand(1), SDValue());
6438
6439 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6440 // 32-bit version.
6441 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6442 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6443 Result = SDValue(
6444 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6445 0);
6446 } else {
6447 // The target does not recognize sbb with the same reg operand as a
6448 // no-source idiom, so we explicitly zero the input values.
6449 Result = getSBBZero(Node);
6450 }
6451
6452 // For less than 32-bits we need to extract from the 32-bit node.
6453 if (VT == MVT::i8 || VT == MVT::i16) {
6454 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6455 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6456 }
6457
6458 ReplaceUses(SDValue(Node, 0), Result);
6459 CurDAG->RemoveDeadNode(Node);
6460 return;
6461 }
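// Editorial note (sketch): SETCC_CARRY materializes 0 or all-ones from the
// carry flag. With SBB dependency breaking available this is the classic
// same-register idiom,
//   sbb %eax, %eax        // eax = CF ? -1 : 0  (SETB_C32r/SETB_C64r pseudo)
// otherwise getSBBZero() materializes an explicit zero and subtracts it from
// itself with borrow, so the result does not carry a false dependency on the
// stale register value.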
6462 case X86ISD::SBB: {
6463 if (isNullConstant(Node->getOperand(0)) &&
6464 isNullConstant(Node->getOperand(1))) {
6465 SDValue Result = getSBBZero(Node);
6466
6467 // Replace the flag use.
6468 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6469
6470 // Replace the result use.
6471 if (!SDValue(Node, 0).use_empty()) {
6472 // For less than 32-bits we need to extract from the 32-bit node.
6473 MVT VT = Node->getSimpleValueType(0);
6474 if (VT == MVT::i8 || VT == MVT::i16) {
6475 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6476 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6477 }
6478 ReplaceUses(SDValue(Node, 0), Result);
6479 }
6480
6481 CurDAG->RemoveDeadNode(Node);
6482 return;
6483 }
6484 break;
6485 }
6486 case X86ISD::MGATHER: {
6487 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6488 SDValue IndexOp = Mgt->getIndex();
6489 SDValue Mask = Mgt->getMask();
6490 MVT IndexVT = IndexOp.getSimpleValueType();
6491 MVT ValueVT = Node->getSimpleValueType(0);
6492 MVT MaskVT = Mask.getSimpleValueType();
6493
6494 // This is just to prevent crashes if the nodes are malformed somehow. We're
6495 // otherwise only doing loose type checking in here, based on what a type
6496 // constraint would say, just like table-based isel.
6497 if (!ValueVT.isVector() || !MaskVT.isVector())
6498 break;
6499
6500 unsigned NumElts = ValueVT.getVectorNumElements();
6501 MVT ValueSVT = ValueVT.getVectorElementType();
6502
6503 bool IsFP = ValueSVT.isFloatingPoint();
6504 unsigned EltSize = ValueSVT.getSizeInBits();
6505
6506 unsigned Opc = 0;
6507 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6508 if (AVX512Gather) {
6509 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6510 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6511 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6512 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6513 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6514 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6515 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6516 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6517 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6518 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6519 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6520 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6521 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6522 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6523 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6524 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6525 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6526 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6527 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6528 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6529 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6530 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6531 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6532 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6533 } else {
6534 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6535 "Unexpected mask VT!");
6536 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6537 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6538 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6539 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6540 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6541 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6542 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6543 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6544 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6545 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6546 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6547 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6548 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6549 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6550 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6551 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6552 }
6553
6554 if (!Opc)
6555 break;
6556
6557 SDValue Base, Scale, Index, Disp, Segment;
6558 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6559 Base, Scale, Index, Disp, Segment))
6560 break;
6561
6562 SDValue PassThru = Mgt->getPassThru();
6563 SDValue Chain = Mgt->getChain();
6564 // Gather instructions have a mask output not in the ISD node.
6565 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6566
6567 MachineSDNode *NewNode;
6568 if (AVX512Gather) {
6569 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6570 Index, Disp, Segment, Chain};
6571 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6572 } else {
6573 SDValue Ops[] = {PassThru, Base, Scale, Index,
6574 Disp, Segment, Mask, Chain};
6575 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6576 }
6577 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6578 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6579 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6580 CurDAG->RemoveDeadNode(Node);
6581 return;
6582 }
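// Editorial note (sketch): the gather opcode is chosen purely from the
// (index type, result element count, result element size) triple plus the
// mask kind. For example, a gather of 8 x float through a v8i32 index with a
// v8i1 mask selects the EVEX form VGATHERDPSZ256rm, while the same shape with
// an integer-vector (sign-bit) mask selects the VEX form VGATHERDPSYrm.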
6583 case X86ISD::MSCATTER: {
6584 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6585 SDValue Value = Sc->getValue();
6586 SDValue IndexOp = Sc->getIndex();
6587 MVT IndexVT = IndexOp.getSimpleValueType();
6588 MVT ValueVT = Value.getSimpleValueType();
6589
6590 // This is just to prevent crashes if the nodes are malformed somehow. We're
6591 // otherwise only doing loose type checking in here, based on what a type
6592 // constraint would say, just like table-based isel.
6593 if (!ValueVT.isVector())
6594 break;
6595
6596 unsigned NumElts = ValueVT.getVectorNumElements();
6597 MVT ValueSVT = ValueVT.getVectorElementType();
6598
6599 bool IsFP = ValueSVT.isFloatingPoint();
6600 unsigned EltSize = ValueSVT.getSizeInBits();
6601
6602 unsigned Opc;
6603 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6604 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6605 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6606 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6607 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6608 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6609 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6610 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6611 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6612 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6613 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6614 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6615 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6616 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6617 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6618 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6619 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6620 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6621 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6622 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6623 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6624 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6625 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6626 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6627 else
6628 break;
6629
6630 SDValue Base, Scale, Index, Disp, Segment;
6631 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6632 Base, Scale, Index, Disp, Segment))
6633 break;
6634
6635 SDValue Mask = Sc->getMask();
6636 SDValue Chain = Sc->getChain();
6637 // Scatter instructions have a mask output not in the ISD node.
6638 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6639 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6640
6641 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6642 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6643 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6644 CurDAG->RemoveDeadNode(Node);
6645 return;
6646 }
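// Editorial note (sketch): scatters exist only in EVEX form, so there is a
// single opcode table and the mask is always a vNi1 value. For example,
// scattering 16 x float through a v16i32 index selects VSCATTERDPSZmr; the
// mask register is both consumed and written back, which is why the VT list
// above includes a Mask-typed result in addition to the chain.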
6647 case ISD::PREALLOCATED_SETUP: {
6648 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6649 auto CallId = MFI->getPreallocatedIdForCallSite(
6650 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6651 SDValue Chain = Node->getOperand(0);
6652 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6653 MachineSDNode *New = CurDAG->getMachineNode(
6654 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6655 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6656 CurDAG->RemoveDeadNode(Node);
6657 return;
6658 }
6659 case ISD::PREALLOCATED_ARG: {
6660 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6661 auto CallId = MFI->getPreallocatedIdForCallSite(
6662 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6663 SDValue Chain = Node->getOperand(0);
6664 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6665 SDValue ArgIndex = Node->getOperand(2);
6666 SDValue Ops[3];
6667 Ops[0] = CallIdValue;
6668 Ops[1] = ArgIndex;
6669 Ops[2] = Chain;
6670 MachineSDNode *New = CurDAG->getMachineNode(
6671 TargetOpcode::PREALLOCATED_ARG, dl,
6672 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6673 MVT::Other),
6674 Ops);
6675 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6676 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6677 CurDAG->RemoveDeadNode(Node);
6678 return;
6679 }
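// Editorial note (sketch): PREALLOCATED_SETUP and PREALLOCATED_ARG implement
// the "preallocated" call-site ABI (the successor to inalloca on 32-bit x86
// Windows): the setup node reserves the argument block for a given call-site
// id, and each PREALLOCATED_ARG yields a pointer to one argument slot inside
// that block, as produced from the llvm.call.preallocated.setup/arg token
// intrinsics in IR.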
6680 case X86ISD::AESENCWIDE128KL:
6681 case X86ISD::AESDECWIDE128KL:
6682 case X86ISD::AESENCWIDE256KL:
6683 case X86ISD::AESDECWIDE256KL: {
6684 if (!Subtarget->hasWIDEKL())
6685 break;
6686
6687 unsigned Opcode;
6688 switch (Node->getOpcode()) {
6689 default:
6690 llvm_unreachable("Unexpected opcode!");
6691 case X86ISD::AESENCWIDE128KL:
6692 Opcode = X86::AESENCWIDE128KL;
6693 break;
6694 case X86ISD::AESDECWIDE128KL:
6695 Opcode = X86::AESDECWIDE128KL;
6696 break;
6697 case X86ISD::AESENCWIDE256KL:
6698 Opcode = X86::AESENCWIDE256KL;
6699 break;
6700 case X86ISD::AESDECWIDE256KL:
6701 Opcode = X86::AESDECWIDE256KL;
6702 break;
6703 }
6704
6705 SDValue Chain = Node->getOperand(0);
6706 SDValue Addr = Node->getOperand(1);
6707
6708 SDValue Base, Scale, Index, Disp, Segment;
6709 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6710 break;
6711
6712 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6713 SDValue());
6714 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6715 Chain.getValue(1));
6716 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6717 Chain.getValue(1));
6718 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6719 Chain.getValue(1));
6720 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6721 Chain.getValue(1));
6722 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6723 Chain.getValue(1));
6724 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6725 Chain.getValue(1));
6726 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6727 Chain.getValue(1));
6728
6729 MachineSDNode *Res = CurDAG->getMachineNode(
6730 Opcode, dl, Node->getVTList(),
6731 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6732 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6733 ReplaceNode(Node, Res);
6734 return;
6735 }
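// Editorial note (sketch): the AES*WIDE*KL Key Locker instructions process
// eight 128-bit blocks at once through an implicit register convention: the
// blocks live in XMM0..XMM7 (hence the eight glued CopyToReg nodes above) and
// the key handle is the memory operand, e.g.
//   aesencwide128kl (%rdi)     // xmm0..xmm7 encrypted in place with the
//                              // handle at (%rdi)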
6736 case X86ISD::POP_FROM_X87_REG: {
6737 SDValue Chain = Node->getOperand(0);
6738 Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6739 SDValue Glue;
6740 if (Node->getNumValues() == 3)
6741 Glue = Node->getOperand(2);
6742 SDValue Copy =
6743 CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6744 ReplaceNode(Node, Copy.getNode());
6745 return;
6746 }
6747 }
6748
6749 SelectCode(Node);
6750}
6751
6752bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6753 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6754 std::vector<SDValue> &OutOps) {
6755 SDValue Op0, Op1, Op2, Op3, Op4;
6756 switch (ConstraintID) {
6757 default:
6758 llvm_unreachable("Unexpected asm memory constraint");
6759 case InlineAsm::ConstraintCode::o: // offsetable ??
6760 case InlineAsm::ConstraintCode::v: // not offsetable ??
6761 case InlineAsm::ConstraintCode::m: // memory
6762 case InlineAsm::ConstraintCode::X:
6763 case InlineAsm::ConstraintCode::p: // address
6764 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6765 return true;
6766 break;
6767 }
6768
6769 OutOps.push_back(Op0);
6770 OutOps.push_back(Op1);
6771 OutOps.push_back(Op2);
6772 OutOps.push_back(Op3);
6773 OutOps.push_back(Op4);
6774 return false;
6775}
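// Editorial note (sketch): every inline-asm memory constraint handled above
// ("m", "o", "p", ...) expands into the canonical five x86 address operands
// (base, scale, index, displacement, segment). For example
//   asm volatile("incl %0" : "+m"(counter));
// contributes Op0..Op4 to the INLINEASM node rather than a single pointer.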
6776
6777X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6778 : SelectionDAGISelPass(
6779 std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6780
6781/// This pass converts a legalized DAG into an X86-specific DAG,
6782/// ready for instruction scheduling.
6783FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6784 CodeGenOptLevel OptLevel) {
6785 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6786}
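// Editorial note (sketch, based on the usual target setup rather than this
// file): the legacy pass is created from the X86 pass pipeline, roughly
//   addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
// in X86PassConfig::addInstSelector(), while the new pass manager path goes
// through X86ISelDAGToDAGPass, which wraps the same X86DAGToDAGISel selector.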