LLVM 22.0.0git
X86InstrInfo.cpp
Go to the documentation of this file.
1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Module.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
47#include <optional>
48
49using namespace llvm;
50
51#define DEBUG_TYPE "x86-instr-info"
52
53#define GET_INSTRINFO_CTOR_DTOR
54#include "X86GenInstrInfo.inc"
55
57
58static cl::opt<bool>
59 NoFusing("disable-spill-fusing",
60 cl::desc("Disable fusing of spill code into instructions"),
62static cl::opt<bool>
63 PrintFailedFusing("print-failed-fuse-candidates",
64 cl::desc("Print instructions that the allocator wants to"
65 " fuse, but the X86 backend currently can't"),
67static cl::opt<bool>
68 ReMatPICStubLoad("remat-pic-stub-load",
69 cl::desc("Re-materialize load from stub in PIC mode"),
70 cl::init(false), cl::Hidden);
72 PartialRegUpdateClearance("partial-reg-update-clearance",
73 cl::desc("Clearance between two register writes "
74 "for inserting XOR to avoid partial "
75 "register update"),
76 cl::init(64), cl::Hidden);
78 "undef-reg-clearance",
79 cl::desc("How many idle instructions we would like before "
80 "certain undef register reads"),
81 cl::init(128), cl::Hidden);
82
83// Pin the vtable to this file.
84void X86InstrInfo::anchor() {}
85
87 : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
88 : X86::ADJCALLSTACKDOWN32),
89 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
90 : X86::ADJCALLSTACKUP32),
91 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
92 Subtarget(STI), RI(STI.getTargetTriple()) {}
93
95X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
97 const MachineFunction &MF) const {
 // Start from the answer of the generic TargetInstrInfo implementation.
98 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI, MF);
99 // If the target does not have EGPR, then r16-r31 will be reserved for all
100 // instructions, so the generic class (or a null result) is returned as is.
101 if (!RC || !Subtarget.hasEGPR())
102 return RC;
103
105 return RC;
106
 // Otherwise return RC as narrowed by constrainRegClassToNonRex2().
107 const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
108 return RI->constrainRegClassToNonRex2(RC);
109}
110
112 Register &SrcReg, Register &DstReg,
113 unsigned &SubIdx) const {
114 switch (MI.getOpcode()) {
115 default:
116 break;
117 case X86::MOVSX16rr8:
118 case X86::MOVZX16rr8:
119 case X86::MOVSX32rr8:
120 case X86::MOVZX32rr8:
121 case X86::MOVSX64rr8:
122 if (!Subtarget.is64Bit())
123 // It's not always legal to reference the low 8-bit of the larger
124 // register in 32-bit mode.
125 return false;
126 [[fallthrough]];
127 case X86::MOVSX32rr16:
128 case X86::MOVZX32rr16:
129 case X86::MOVSX64rr16:
130 case X86::MOVSX64rr32: {
131 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
132 // Be conservative.
133 return false;
134 SrcReg = MI.getOperand(1).getReg();
135 DstReg = MI.getOperand(0).getReg();
136 switch (MI.getOpcode()) {
137 default:
138 llvm_unreachable("Unreachable!");
139 case X86::MOVSX16rr8:
140 case X86::MOVZX16rr8:
141 case X86::MOVSX32rr8:
142 case X86::MOVZX32rr8:
143 case X86::MOVSX64rr8:
144 SubIdx = X86::sub_8bit;
145 break;
146 case X86::MOVSX32rr16:
147 case X86::MOVZX32rr16:
148 case X86::MOVSX64rr16:
149 SubIdx = X86::sub_16bit;
150 break;
151 case X86::MOVSX64rr32:
152 SubIdx = X86::sub_32bit;
153 break;
154 }
155 return true;
156 }
157 }
158 return false;
159}
160
162 if (MI.mayLoad() || MI.mayStore())
163 return false;
164
165 // Some target-independent operations that trivially lower to data-invariant
166 // instructions.
167 if (MI.isCopyLike() || MI.isInsertSubreg())
168 return true;
169
170 unsigned Opcode = MI.getOpcode();
171 using namespace X86;
172 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
173 // However, they set flags and are perhaps the most surprisingly constant
174 // time operations so we call them out here separately.
175 if (isIMUL(Opcode))
176 return true;
177 // Bit scanning and counting instructions that are somewhat surprisingly
178 // constant time as they scan across bits and do other fairly complex
179 // operations like popcnt, but are believed to be constant time on x86.
180 // However, these set flags.
181 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
182 isTZCNT(Opcode))
183 return true;
184 // Bit manipulation instructions are effectively combinations of basic
185 // arithmetic ops, and should still execute in constant time. These also
186 // set flags.
187 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
188 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
189 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
190 isTZMSK(Opcode))
191 return true;
192 // Bit extracting and clearing instructions should execute in constant time,
193 // and set flags.
194 if (isBEXTR(Opcode) || isBZHI(Opcode))
195 return true;
196 // Shift and rotate.
197 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
198 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
199 return true;
200 // Basic arithmetic is constant time on the input but does set flags.
201 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
202 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
203 return true;
204 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
205 if (isANDN(Opcode))
206 return true;
207 // Unary arithmetic operations.
208 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
209 return true;
210 // Unlike other arithmetic, NOT doesn't set EFLAGS.
211 if (isNOT(Opcode))
212 return true;
213 // Various move instructions used to zero or sign extend things. Note that we
214 // intentionally don't support the _NOREX variants as we can't handle that
215 // register constraint anyways.
216 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
217 return true;
218 // Arithmetic instructions that are both constant time and don't set flags.
219 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
220 return true;
221 // LEA doesn't actually access memory, and its arithmetic is constant time.
222 if (isLEA(Opcode))
223 return true;
224 // By default, assume that the instruction is not data invariant.
225 return false;
226}
227
229 switch (MI.getOpcode()) {
230 default:
231 // By default, assume that the load will immediately leak.
232 return false;
233
234 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
235 // However, they set flags and are perhaps the most surprisingly constant
236 // time operations so we call them out here separately.
237 case X86::IMUL16rm:
238 case X86::IMUL16rmi:
239 case X86::IMUL32rm:
240 case X86::IMUL32rmi:
241 case X86::IMUL64rm:
242 case X86::IMUL64rmi32:
243
244 // Bit scanning and counting instructions that are somewhat surprisingly
245 // constant time as they scan across bits and do other fairly complex
246 // operations like popcnt, but are believed to be constant time on x86.
247 // However, these set flags.
248 case X86::BSF16rm:
249 case X86::BSF32rm:
250 case X86::BSF64rm:
251 case X86::BSR16rm:
252 case X86::BSR32rm:
253 case X86::BSR64rm:
254 case X86::LZCNT16rm:
255 case X86::LZCNT32rm:
256 case X86::LZCNT64rm:
257 case X86::POPCNT16rm:
258 case X86::POPCNT32rm:
259 case X86::POPCNT64rm:
260 case X86::TZCNT16rm:
261 case X86::TZCNT32rm:
262 case X86::TZCNT64rm:
263
264 // Bit manipulation instructions are effectively combinations of basic
265 // arithmetic ops, and should still execute in constant time. These also
266 // set flags.
267 case X86::BLCFILL32rm:
268 case X86::BLCFILL64rm:
269 case X86::BLCI32rm:
270 case X86::BLCI64rm:
271 case X86::BLCIC32rm:
272 case X86::BLCIC64rm:
273 case X86::BLCMSK32rm:
274 case X86::BLCMSK64rm:
275 case X86::BLCS32rm:
276 case X86::BLCS64rm:
277 case X86::BLSFILL32rm:
278 case X86::BLSFILL64rm:
279 case X86::BLSI32rm:
280 case X86::BLSI64rm:
281 case X86::BLSIC32rm:
282 case X86::BLSIC64rm:
283 case X86::BLSMSK32rm:
284 case X86::BLSMSK64rm:
285 case X86::BLSR32rm:
286 case X86::BLSR64rm:
287 case X86::TZMSK32rm:
288 case X86::TZMSK64rm:
289
290 // Bit extracting and clearing instructions should execute in constant time,
291 // and set flags.
292 case X86::BEXTR32rm:
293 case X86::BEXTR64rm:
294 case X86::BEXTRI32mi:
295 case X86::BEXTRI64mi:
296 case X86::BZHI32rm:
297 case X86::BZHI64rm:
298
299 // Basic arithmetic is constant time on the input but does set flags.
300 case X86::ADC8rm:
301 case X86::ADC16rm:
302 case X86::ADC32rm:
303 case X86::ADC64rm:
304 case X86::ADD8rm:
305 case X86::ADD16rm:
306 case X86::ADD32rm:
307 case X86::ADD64rm:
308 case X86::AND8rm:
309 case X86::AND16rm:
310 case X86::AND32rm:
311 case X86::AND64rm:
312 case X86::ANDN32rm:
313 case X86::ANDN64rm:
314 case X86::OR8rm:
315 case X86::OR16rm:
316 case X86::OR32rm:
317 case X86::OR64rm:
318 case X86::SBB8rm:
319 case X86::SBB16rm:
320 case X86::SBB32rm:
321 case X86::SBB64rm:
322 case X86::SUB8rm:
323 case X86::SUB16rm:
324 case X86::SUB32rm:
325 case X86::SUB64rm:
326 case X86::XOR8rm:
327 case X86::XOR16rm:
328 case X86::XOR32rm:
329 case X86::XOR64rm:
330
331 // Integer multiply w/o affecting flags is still believed to be constant
332 // time on x86. Called out separately as this is among the most surprising
333 // instructions to exhibit that behavior.
334 case X86::MULX32rm:
335 case X86::MULX64rm:
336
337 // Arithmetic instructions that are both constant time and don't set flags.
338 case X86::RORX32mi:
339 case X86::RORX64mi:
340 case X86::SARX32rm:
341 case X86::SARX64rm:
342 case X86::SHLX32rm:
343 case X86::SHLX64rm:
344 case X86::SHRX32rm:
345 case X86::SHRX64rm:
346
347 // Conversions are believed to be constant time and don't set flags.
348 case X86::CVTTSD2SI64rm:
349 case X86::VCVTTSD2SI64rm:
350 case X86::VCVTTSD2SI64Zrm:
351 case X86::CVTTSD2SIrm:
352 case X86::VCVTTSD2SIrm:
353 case X86::VCVTTSD2SIZrm:
354 case X86::CVTTSS2SI64rm:
355 case X86::VCVTTSS2SI64rm:
356 case X86::VCVTTSS2SI64Zrm:
357 case X86::CVTTSS2SIrm:
358 case X86::VCVTTSS2SIrm:
359 case X86::VCVTTSS2SIZrm:
360 case X86::CVTSI2SDrm:
361 case X86::VCVTSI2SDrm:
362 case X86::VCVTSI2SDZrm:
363 case X86::CVTSI2SSrm:
364 case X86::VCVTSI2SSrm:
365 case X86::VCVTSI2SSZrm:
366 case X86::CVTSI642SDrm:
367 case X86::VCVTSI642SDrm:
368 case X86::VCVTSI642SDZrm:
369 case X86::CVTSI642SSrm:
370 case X86::VCVTSI642SSrm:
371 case X86::VCVTSI642SSZrm:
372 case X86::CVTSS2SDrm:
373 case X86::VCVTSS2SDrm:
374 case X86::VCVTSS2SDZrm:
375 case X86::CVTSD2SSrm:
376 case X86::VCVTSD2SSrm:
377 case X86::VCVTSD2SSZrm:
378 // AVX512 added unsigned integer conversions.
379 case X86::VCVTTSD2USI64Zrm:
380 case X86::VCVTTSD2USIZrm:
381 case X86::VCVTTSS2USI64Zrm:
382 case X86::VCVTTSS2USIZrm:
383 case X86::VCVTUSI2SDZrm:
384 case X86::VCVTUSI642SDZrm:
385 case X86::VCVTUSI2SSZrm:
386 case X86::VCVTUSI642SSZrm:
387
388 // Loads to register don't set flags.
389 case X86::MOV8rm:
390 case X86::MOV8rm_NOREX:
391 case X86::MOV16rm:
392 case X86::MOV32rm:
393 case X86::MOV64rm:
394 case X86::MOVSX16rm8:
395 case X86::MOVSX32rm16:
396 case X86::MOVSX32rm8:
397 case X86::MOVSX32rm8_NOREX:
398 case X86::MOVSX64rm16:
399 case X86::MOVSX64rm32:
400 case X86::MOVSX64rm8:
401 case X86::MOVZX16rm8:
402 case X86::MOVZX32rm16:
403 case X86::MOVZX32rm8:
404 case X86::MOVZX32rm8_NOREX:
405 case X86::MOVZX64rm16:
406 case X86::MOVZX64rm8:
407 return true;
408 }
409}
410
412 const MachineFunction *MF = MI.getParent()->getParent();
414
415 if (isFrameInstr(MI)) {
416 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
417 SPAdj -= getFrameAdjustment(MI);
418 if (!isFrameSetup(MI))
419 SPAdj = -SPAdj;
420 return SPAdj;
421 }
422
423 // To know whether a call adjusts the stack, we need information
424 // that is bound to the following ADJCALLSTACKUP pseudo.
425 // Look for the next ADJCALLSTACKUP that follows the call.
426 if (MI.isCall()) {
427 const MachineBasicBlock *MBB = MI.getParent();
429 for (auto E = MBB->end(); I != E; ++I) {
430 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
431 break;
432 }
433
434 // If we could not find a frame destroy opcode, then it has already
435 // been simplified, so we don't care.
436 if (I->getOpcode() != getCallFrameDestroyOpcode())
437 return 0;
438
439 return -(I->getOperand(1).getImm());
440 }
441
442 // Currently handle only PUSHes we can reasonably expect to see
443 // in call sequences
444 switch (MI.getOpcode()) {
445 default:
446 return 0;
447 case X86::PUSH32r:
448 case X86::PUSH32rmm:
449 case X86::PUSH32rmr:
450 case X86::PUSH32i:
451 return 4;
452 case X86::PUSH64r:
453 case X86::PUSH64rmm:
454 case X86::PUSH64rmr:
455 case X86::PUSH64i32:
456 return 8;
457 }
458}
459
460/// Return true and the FrameIndex if the specified
461/// operand and follow operands form a reference to the stack frame.
462bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
463 int &FrameIndex) const {
464 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
465 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
466 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
467 MI.getOperand(Op + X86::AddrDisp).isImm() &&
468 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
469 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
470 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
471 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
472 return true;
473 }
474 return false;
475}
476
477static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
478 switch (Opcode) {
479 default:
480 return false;
481 case X86::MOV8rm:
482 case X86::KMOVBkm:
483 case X86::KMOVBkm_EVEX:
484 MemBytes = TypeSize::getFixed(1);
485 return true;
486 case X86::MOV16rm:
487 case X86::KMOVWkm:
488 case X86::KMOVWkm_EVEX:
489 case X86::VMOVSHZrm:
490 case X86::VMOVSHZrm_alt:
491 MemBytes = TypeSize::getFixed(2);
492 return true;
493 case X86::MOV32rm:
494 case X86::MOVSSrm:
495 case X86::MOVSSrm_alt:
496 case X86::VMOVSSrm:
497 case X86::VMOVSSrm_alt:
498 case X86::VMOVSSZrm:
499 case X86::VMOVSSZrm_alt:
500 case X86::KMOVDkm:
501 case X86::KMOVDkm_EVEX:
502 MemBytes = TypeSize::getFixed(4);
503 return true;
504 case X86::MOV64rm:
505 case X86::LD_Fp64m:
506 case X86::MOVSDrm:
507 case X86::MOVSDrm_alt:
508 case X86::VMOVSDrm:
509 case X86::VMOVSDrm_alt:
510 case X86::VMOVSDZrm:
511 case X86::VMOVSDZrm_alt:
512 case X86::MMX_MOVD64rm:
513 case X86::MMX_MOVQ64rm:
514 case X86::KMOVQkm:
515 case X86::KMOVQkm_EVEX:
516 MemBytes = TypeSize::getFixed(8);
517 return true;
518 case X86::MOVAPSrm:
519 case X86::MOVUPSrm:
520 case X86::MOVAPDrm:
521 case X86::MOVUPDrm:
522 case X86::MOVDQArm:
523 case X86::MOVDQUrm:
524 case X86::VMOVAPSrm:
525 case X86::VMOVUPSrm:
526 case X86::VMOVAPDrm:
527 case X86::VMOVUPDrm:
528 case X86::VMOVDQArm:
529 case X86::VMOVDQUrm:
530 case X86::VMOVAPSZ128rm:
531 case X86::VMOVUPSZ128rm:
532 case X86::VMOVAPSZ128rm_NOVLX:
533 case X86::VMOVUPSZ128rm_NOVLX:
534 case X86::VMOVAPDZ128rm:
535 case X86::VMOVUPDZ128rm:
536 case X86::VMOVDQU8Z128rm:
537 case X86::VMOVDQU16Z128rm:
538 case X86::VMOVDQA32Z128rm:
539 case X86::VMOVDQU32Z128rm:
540 case X86::VMOVDQA64Z128rm:
541 case X86::VMOVDQU64Z128rm:
542 MemBytes = TypeSize::getFixed(16);
543 return true;
544 case X86::VMOVAPSYrm:
545 case X86::VMOVUPSYrm:
546 case X86::VMOVAPDYrm:
547 case X86::VMOVUPDYrm:
548 case X86::VMOVDQAYrm:
549 case X86::VMOVDQUYrm:
550 case X86::VMOVAPSZ256rm:
551 case X86::VMOVUPSZ256rm:
552 case X86::VMOVAPSZ256rm_NOVLX:
553 case X86::VMOVUPSZ256rm_NOVLX:
554 case X86::VMOVAPDZ256rm:
555 case X86::VMOVUPDZ256rm:
556 case X86::VMOVDQU8Z256rm:
557 case X86::VMOVDQU16Z256rm:
558 case X86::VMOVDQA32Z256rm:
559 case X86::VMOVDQU32Z256rm:
560 case X86::VMOVDQA64Z256rm:
561 case X86::VMOVDQU64Z256rm:
562 MemBytes = TypeSize::getFixed(32);
563 return true;
564 case X86::VMOVAPSZrm:
565 case X86::VMOVUPSZrm:
566 case X86::VMOVAPDZrm:
567 case X86::VMOVUPDZrm:
568 case X86::VMOVDQU8Zrm:
569 case X86::VMOVDQU16Zrm:
570 case X86::VMOVDQA32Zrm:
571 case X86::VMOVDQU32Zrm:
572 case X86::VMOVDQA64Zrm:
573 case X86::VMOVDQU64Zrm:
574 MemBytes = TypeSize::getFixed(64);
575 return true;
576 }
577}
578
579static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes) {
580 switch (Opcode) {
581 default:
582 return false;
583 case X86::MOV8mr:
584 case X86::KMOVBmk:
585 case X86::KMOVBmk_EVEX:
586 MemBytes = TypeSize::getFixed(1);
587 return true;
588 case X86::MOV16mr:
589 case X86::KMOVWmk:
590 case X86::KMOVWmk_EVEX:
591 case X86::VMOVSHZmr:
592 MemBytes = TypeSize::getFixed(2);
593 return true;
594 case X86::MOV32mr:
595 case X86::MOVSSmr:
596 case X86::VMOVSSmr:
597 case X86::VMOVSSZmr:
598 case X86::KMOVDmk:
599 case X86::KMOVDmk_EVEX:
600 MemBytes = TypeSize::getFixed(4);
601 return true;
602 case X86::MOV64mr:
603 case X86::ST_FpP64m:
604 case X86::MOVSDmr:
605 case X86::VMOVSDmr:
606 case X86::VMOVSDZmr:
607 case X86::MMX_MOVD64mr:
608 case X86::MMX_MOVQ64mr:
609 case X86::MMX_MOVNTQmr:
610 case X86::KMOVQmk:
611 case X86::KMOVQmk_EVEX:
612 MemBytes = TypeSize::getFixed(8);
613 return true;
614 case X86::MOVAPSmr:
615 case X86::MOVUPSmr:
616 case X86::MOVAPDmr:
617 case X86::MOVUPDmr:
618 case X86::MOVDQAmr:
619 case X86::MOVDQUmr:
620 case X86::VMOVAPSmr:
621 case X86::VMOVUPSmr:
622 case X86::VMOVAPDmr:
623 case X86::VMOVUPDmr:
624 case X86::VMOVDQAmr:
625 case X86::VMOVDQUmr:
626 case X86::VMOVUPSZ128mr:
627 case X86::VMOVAPSZ128mr:
628 case X86::VMOVUPSZ128mr_NOVLX:
629 case X86::VMOVAPSZ128mr_NOVLX:
630 case X86::VMOVUPDZ128mr:
631 case X86::VMOVAPDZ128mr:
632 case X86::VMOVDQA32Z128mr:
633 case X86::VMOVDQU32Z128mr:
634 case X86::VMOVDQA64Z128mr:
635 case X86::VMOVDQU64Z128mr:
636 case X86::VMOVDQU8Z128mr:
637 case X86::VMOVDQU16Z128mr:
638 MemBytes = TypeSize::getFixed(16);
639 return true;
640 case X86::VMOVUPSYmr:
641 case X86::VMOVAPSYmr:
642 case X86::VMOVUPDYmr:
643 case X86::VMOVAPDYmr:
644 case X86::VMOVDQUYmr:
645 case X86::VMOVDQAYmr:
646 case X86::VMOVUPSZ256mr:
647 case X86::VMOVAPSZ256mr:
648 case X86::VMOVUPSZ256mr_NOVLX:
649 case X86::VMOVAPSZ256mr_NOVLX:
650 case X86::VMOVUPDZ256mr:
651 case X86::VMOVAPDZ256mr:
652 case X86::VMOVDQU8Z256mr:
653 case X86::VMOVDQU16Z256mr:
654 case X86::VMOVDQA32Z256mr:
655 case X86::VMOVDQU32Z256mr:
656 case X86::VMOVDQA64Z256mr:
657 case X86::VMOVDQU64Z256mr:
658 MemBytes = TypeSize::getFixed(32);
659 return true;
660 case X86::VMOVUPSZmr:
661 case X86::VMOVAPSZmr:
662 case X86::VMOVUPDZmr:
663 case X86::VMOVAPDZmr:
664 case X86::VMOVDQU8Zmr:
665 case X86::VMOVDQU16Zmr:
666 case X86::VMOVDQA32Zmr:
667 case X86::VMOVDQU32Zmr:
668 case X86::VMOVDQA64Zmr:
669 case X86::VMOVDQU64Zmr:
670 MemBytes = TypeSize::getFixed(64);
671 return true;
672 }
673 return false;
674}
675
677 int &FrameIndex) const {
678 TypeSize Dummy = TypeSize::getZero();
679 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
680}
681
683 int &FrameIndex,
684 TypeSize &MemBytes) const {
685 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
686 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
687 return MI.getOperand(0).getReg();
688 return Register();
689}
690
692 int &FrameIndex) const {
693 TypeSize Dummy = TypeSize::getZero();
694 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
695 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
696 return Reg;
697 // Check for post-frame index elimination operations
699 if (hasLoadFromStackSlot(MI, Accesses)) {
700 FrameIndex =
701 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
702 ->getFrameIndex();
703 return MI.getOperand(0).getReg();
704 }
705 }
706 return Register();
707}
708
710 int &FrameIndex) const {
711 TypeSize Dummy = TypeSize::getZero();
712 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
713}
714
716 int &FrameIndex,
717 TypeSize &MemBytes) const {
718 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
719 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
720 isFrameOperand(MI, 0, FrameIndex))
721 return MI.getOperand(X86::AddrNumOperands).getReg();
722 return Register();
723}
724
726 int &FrameIndex) const {
727 TypeSize Dummy = TypeSize::getZero();
728 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
729 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
730 return Reg;
731 // Check for post-frame index elimination operations
733 if (hasStoreToStackSlot(MI, Accesses)) {
734 FrameIndex =
735 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
736 ->getFrameIndex();
737 return MI.getOperand(X86::AddrNumOperands).getReg();
738 }
739 }
740 return Register();
741}
742
743/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
744static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
745 // Don't waste compile time scanning use-def chains of physregs.
746 if (!BaseReg.isVirtual())
747 return false;
748 bool isPICBase = false;
749 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
750 if (DefMI.getOpcode() != X86::MOVPC32r)
751 return false;
752 assert(!isPICBase && "More than one PIC base?");
753 isPICBase = true;
754 }
755 return isPICBase;
756}
757
759 const MachineInstr &MI) const {
760 switch (MI.getOpcode()) {
761 default:
762 // This function should only be called for opcodes with the ReMaterializable
763 // flag set.
764 llvm_unreachable("Unknown rematerializable operation!");
765 break;
766 case X86::IMPLICIT_DEF:
767 // Defer to generic logic.
768 break;
769 case X86::LOAD_STACK_GUARD:
770 case X86::LD_Fp032:
771 case X86::LD_Fp064:
772 case X86::LD_Fp080:
773 case X86::LD_Fp132:
774 case X86::LD_Fp164:
775 case X86::LD_Fp180:
776 case X86::AVX1_SETALLONES:
777 case X86::AVX2_SETALLONES:
778 case X86::AVX512_128_SET0:
779 case X86::AVX512_256_SET0:
780 case X86::AVX512_512_SET0:
781 case X86::AVX512_512_SETALLONES:
782 case X86::AVX512_FsFLD0SD:
783 case X86::AVX512_FsFLD0SH:
784 case X86::AVX512_FsFLD0SS:
785 case X86::AVX512_FsFLD0F128:
786 case X86::AVX_SET0:
787 case X86::FsFLD0SD:
788 case X86::FsFLD0SS:
789 case X86::FsFLD0SH:
790 case X86::FsFLD0F128:
791 case X86::KSET0D:
792 case X86::KSET0Q:
793 case X86::KSET0W:
794 case X86::KSET1D:
795 case X86::KSET1Q:
796 case X86::KSET1W:
797 case X86::MMX_SET0:
798 case X86::MOV32ImmSExti8:
799 case X86::MOV32r0:
800 case X86::MOV32r1:
801 case X86::MOV32r_1:
802 case X86::MOV32ri64:
803 case X86::MOV64ImmSExti8:
804 case X86::V_SET0:
805 case X86::V_SETALLONES:
806 case X86::MOV16ri:
807 case X86::MOV32ri:
808 case X86::MOV64ri:
809 case X86::MOV64ri32:
810 case X86::MOV8ri:
811 case X86::PTILEZEROV:
812 return true;
813
814 case X86::MOV8rm:
815 case X86::MOV8rm_NOREX:
816 case X86::MOV16rm:
817 case X86::MOV32rm:
818 case X86::MOV64rm:
819 case X86::MOVSSrm:
820 case X86::MOVSSrm_alt:
821 case X86::MOVSDrm:
822 case X86::MOVSDrm_alt:
823 case X86::MOVAPSrm:
824 case X86::MOVUPSrm:
825 case X86::MOVAPDrm:
826 case X86::MOVUPDrm:
827 case X86::MOVDQArm:
828 case X86::MOVDQUrm:
829 case X86::VMOVSSrm:
830 case X86::VMOVSSrm_alt:
831 case X86::VMOVSDrm:
832 case X86::VMOVSDrm_alt:
833 case X86::VMOVAPSrm:
834 case X86::VMOVUPSrm:
835 case X86::VMOVAPDrm:
836 case X86::VMOVUPDrm:
837 case X86::VMOVDQArm:
838 case X86::VMOVDQUrm:
839 case X86::VMOVAPSYrm:
840 case X86::VMOVUPSYrm:
841 case X86::VMOVAPDYrm:
842 case X86::VMOVUPDYrm:
843 case X86::VMOVDQAYrm:
844 case X86::VMOVDQUYrm:
845 case X86::MMX_MOVD64rm:
846 case X86::MMX_MOVQ64rm:
847 case X86::VBROADCASTSSrm:
848 case X86::VBROADCASTSSYrm:
849 case X86::VBROADCASTSDYrm:
850 // AVX-512
851 case X86::VPBROADCASTBZ128rm:
852 case X86::VPBROADCASTBZ256rm:
853 case X86::VPBROADCASTBZrm:
854 case X86::VBROADCASTF32X2Z256rm:
855 case X86::VBROADCASTF32X2Zrm:
856 case X86::VBROADCASTI32X2Z128rm:
857 case X86::VBROADCASTI32X2Z256rm:
858 case X86::VBROADCASTI32X2Zrm:
859 case X86::VPBROADCASTWZ128rm:
860 case X86::VPBROADCASTWZ256rm:
861 case X86::VPBROADCASTWZrm:
862 case X86::VPBROADCASTDZ128rm:
863 case X86::VPBROADCASTDZ256rm:
864 case X86::VPBROADCASTDZrm:
865 case X86::VBROADCASTSSZ128rm:
866 case X86::VBROADCASTSSZ256rm:
867 case X86::VBROADCASTSSZrm:
868 case X86::VPBROADCASTQZ128rm:
869 case X86::VPBROADCASTQZ256rm:
870 case X86::VPBROADCASTQZrm:
871 case X86::VBROADCASTSDZ256rm:
872 case X86::VBROADCASTSDZrm:
873 case X86::VMOVSSZrm:
874 case X86::VMOVSSZrm_alt:
875 case X86::VMOVSDZrm:
876 case X86::VMOVSDZrm_alt:
877 case X86::VMOVSHZrm:
878 case X86::VMOVSHZrm_alt:
879 case X86::VMOVAPDZ128rm:
880 case X86::VMOVAPDZ256rm:
881 case X86::VMOVAPDZrm:
882 case X86::VMOVAPSZ128rm:
883 case X86::VMOVAPSZ256rm:
884 case X86::VMOVAPSZ128rm_NOVLX:
885 case X86::VMOVAPSZ256rm_NOVLX:
886 case X86::VMOVAPSZrm:
887 case X86::VMOVDQA32Z128rm:
888 case X86::VMOVDQA32Z256rm:
889 case X86::VMOVDQA32Zrm:
890 case X86::VMOVDQA64Z128rm:
891 case X86::VMOVDQA64Z256rm:
892 case X86::VMOVDQA64Zrm:
893 case X86::VMOVDQU16Z128rm:
894 case X86::VMOVDQU16Z256rm:
895 case X86::VMOVDQU16Zrm:
896 case X86::VMOVDQU32Z128rm:
897 case X86::VMOVDQU32Z256rm:
898 case X86::VMOVDQU32Zrm:
899 case X86::VMOVDQU64Z128rm:
900 case X86::VMOVDQU64Z256rm:
901 case X86::VMOVDQU64Zrm:
902 case X86::VMOVDQU8Z128rm:
903 case X86::VMOVDQU8Z256rm:
904 case X86::VMOVDQU8Zrm:
905 case X86::VMOVUPDZ128rm:
906 case X86::VMOVUPDZ256rm:
907 case X86::VMOVUPDZrm:
908 case X86::VMOVUPSZ128rm:
909 case X86::VMOVUPSZ256rm:
910 case X86::VMOVUPSZ128rm_NOVLX:
911 case X86::VMOVUPSZ256rm_NOVLX:
912 case X86::VMOVUPSZrm: {
913 // Loads from constant pools are trivially rematerializable.
914 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
915 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
916 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
917 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
918 MI.isDereferenceableInvariantLoad()) {
919 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
920 if (BaseReg == 0 || BaseReg == X86::RIP)
921 return true;
922 // Allow re-materialization of PIC load.
923 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
924 const MachineFunction &MF = *MI.getParent()->getParent();
925 const MachineRegisterInfo &MRI = MF.getRegInfo();
926 if (regIsPICBase(BaseReg, MRI))
927 return true;
928 }
929 }
930 break;
931 }
932
933 case X86::LEA32r:
934 case X86::LEA64r: {
935 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
936 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
937 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
938 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
939 // lea fi#, lea GV, etc. are all rematerializable.
940 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
941 return true;
942 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
943 if (BaseReg == 0)
944 return true;
945 // Allow re-materialization of lea PICBase + x.
946 const MachineFunction &MF = *MI.getParent()->getParent();
947 const MachineRegisterInfo &MRI = MF.getRegInfo();
948 if (regIsPICBase(BaseReg, MRI))
949 return true;
950 }
951 break;
952 }
953 }
955}
956
959 Register DestReg, unsigned SubIdx,
960 const MachineInstr &Orig,
961 const TargetRegisterInfo &TRI) const {
962 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
963 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
965 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
966 // effects.
967 int Value;
968 switch (Orig.getOpcode()) {
969 case X86::MOV32r0:
970 Value = 0;
971 break;
972 case X86::MOV32r1:
973 Value = 1;
974 break;
975 case X86::MOV32r_1:
976 Value = -1;
977 break;
978 default:
979 llvm_unreachable("Unexpected instruction!");
980 }
981
982 const DebugLoc &DL = Orig.getDebugLoc();
983 BuildMI(MBB, I, DL, get(X86::MOV32ri))
984 .add(Orig.getOperand(0))
985 .addImm(Value);
986 } else {
988 MBB.insert(I, MI);
989 }
990
991 MachineInstr &NewMI = *std::prev(I);
992 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
993}
994
995/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
997 for (const MachineOperand &MO : MI.operands()) {
998 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
999 !MO.isDead()) {
1000 return true;
1001 }
1002 }
1003 return false;
1004}
1005
1006/// Check whether the shift count for a machine operand is non-zero.
1007inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1008 unsigned ShiftAmtOperandIdx) {
1009 // The shift count is six bits with the REX.W prefix and five bits without.
1010 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1011 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1012 return Imm & ShiftCountMask;
1013}
1014
/// Check whether the given (already truncated) shift count can be represented
/// by a LEA instruction.
///
/// \param ShAmt  Shift amount to test.
/// \returns true only for shift amounts 1, 2 and 3.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  // Left shift instructions can be transformed into load-effective-address
  // instructions if we can encode them appropriately.
  // A LEA instruction utilizes a SIB byte to encode its scale factor.
  // The SIB.scale field is two bits wide which means that we can encode any
  // shift amount less than 4. A shift amount of 0 is rejected as well.
  return ShAmt < 4 && ShAmt > 0;
}
1025
// Checks whether `CmpInstr` (a TEST64rr whose operand is defined by a
// SUBREG_TO_REG, or a TEST16rr whose operand is defined by a COPY) merely
// re-tests a value produced by an earlier AND in the same basic block, with no
// instruction between that AND and `CmpValDefInstr` modifying EFLAGS.
//
// Returns true when such an AND is found; in that case `*AndInstr` is set to
// it and both `NoSignFlag` and `ClearsOverflowFlag` are set to true. On every
// `return false` path the three out-parameters are left untouched.
1026static bool
1028 const MachineRegisterInfo *MRI, MachineInstr **AndInstr,
1029 const TargetRegisterInfo *TRI, const X86Subtarget &ST,
1030 bool &NoSignFlag, bool &ClearsOverflowFlag) {
// Only two (def, compare) pairings are handled:
// SUBREG_TO_REG + TEST64rr, and COPY + TEST16rr.
1031 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
1032 CmpInstr.getOpcode() == X86::TEST64rr) &&
1033 !(CmpValDefInstr.getOpcode() == X86::COPY &&
1034 CmpInstr.getOpcode() == X86::TEST16rr))
1035 return false;
1036
1037 // CmpInstr is a TEST16rr/TEST64rr instruction, and
1038 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if the
1039 // two register operands are identical.
1040 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
1041 "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
1042 "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
1043 "same.");
1044
1045 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
1046 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
1047 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
1048 // redundant.
1049 assert(
1050 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
1051 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
1052 "is a user of COPY sub16bit.");
// VregDefInstr will hold the instruction defining the register that
// `CmpValDefInstr` reads (operand 1 of the COPY, operand 2 of SUBREG_TO_REG).
1053 MachineInstr *VregDefInstr = nullptr;
1054 if (CmpInstr.getOpcode() == X86::TEST16rr) {
1055 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
1056 return false;
1057 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1058 if (!VregDefInstr)
1059 return false;
1060 // We can only remove the test when the def is an AND32ri or AND64ri32 whose
1061 // immediate fits in 16 bits; other 32/64-bit ops would test higher bits
1062 // that TEST16rr does not look at.
1063 if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
1064 VregDefInstr->getOpcode() == X86::AND64ri32) &&
1065 isUInt<16>(VregDefInstr->getOperand(2).getImm())))
1066 return false;
1067 }
1068
1069 if (CmpInstr.getOpcode() == X86::TEST64rr) {
1070 // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is
1071 // typically 0.
1072 if (CmpValDefInstr.getOperand(1).getImm() != 0)
1073 return false;
1074
1075 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1076 // sub_32bit or sub_xmm.
1077 if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1078 return false;
1079
1080 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1081 }
1082
1083 assert(VregDefInstr && "Must have a definition (SSA)");
1084
1085 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1086 // to simplify the subsequent analysis.
1087 //
1088 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1089 // `CmpValDefInstr.getParent()`, this could be handled.
1090 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1091 return false;
1092
// When the subtarget reports hasNF(), additionally require that this
// particular AND actually writes EFLAGS.
1093 if (X86::isAND(VregDefInstr->getOpcode()) &&
1094 (!ST.hasNF() || VregDefInstr->modifiesRegister(X86::EFLAGS, TRI))) {
1095 // Get a sequence of instructions like
1096 // %reg = and* ... // Set EFLAGS
1097 // ... // EFLAGS not changed
1098 // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1099 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1100 // or
1101 // %reg = and32* ...
1102 // ... // EFLAGS not changed.
1103 // %src_reg = copy %reg.sub_16bit:gr32
1104 // test16rr %src_reg, %src_reg, implicit-def $eflags
1105 //
1106 // If subsequent readers use a subset of bits that don't change
1107 // after `and*` instructions, it's likely that the test64rr could
1108 // be optimized away.
1109 for (const MachineInstr &Instr :
1110 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1111 MachineBasicBlock::iterator(CmpValDefInstr))) {
1112 // Bail out if any instruction between `VregDefInstr` and
1113 // `CmpValDefInstr` modifies EFLAGS.
1114 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1115 return false;
1116 }
1117
1118 *AndInstr = VregDefInstr;
1119
1120 // AND instruction will essentially update SF and clear OF, so
1121 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1122 //
1123 // However, the implementation artificially sets `NoSignFlag` to true
1124 // to poison the SF bit; that is to say, if SF is looked at later, the
1125 // optimization (to erase TEST64rr) will be disabled.
1126 //
1127 // The reason to poison the SF bit is that its value could differ between
1128 // the `AND` and `TEST` operations; the sign bit is not known for `AND`,
1129 // and is known to be 0 as a result of `TEST64rr`.
1130 //
1131 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1132 // the AND instruction and using the static information to guide peephole
1133 // optimization if possible. For example, it's possible to fold a
1134 // conditional move into a copy if the relevant EFLAG bits could be deduced
1135 // from an immediate operand of and operation.
1136 //
1137 NoSignFlag = true;
1138 // ClearsOverflowFlag is true for AND operation (no surprise).
1139 ClearsOverflowFlag = true;
1140 return true;
1141 }
// The defining instruction is not a usable AND.
1142 return false;
1143}
1144
1146 unsigned Opc, bool AllowSP, Register &NewSrc,
1147 unsigned &NewSrcSubReg, bool &isKill,
1148 MachineOperand &ImplicitOp, LiveVariables *LV,
1149 LiveIntervals *LIS) const {
// Picks the register operand to feed an LEA of opcode `Opc` that replaces
// `MI`, starting from the original source operand `Src`.
//
// Outputs:
//   NewSrc / NewSrcSubReg - register and sub-register index to use in the LEA.
//   isKill                - kill flag to put on that LEA operand.
//   ImplicitOp            - for a physical source with LEA64_32r, a copy of
//                           `Src` marked implicit; not written otherwise.
// `AllowSP` selects between the plain and the _NOSP register classes.
// `LV` / `LIS`, when non-null, are updated for the COPY that may be inserted.
//
// Returns false only when a virtual source register cannot be constrained to
// the required register class (non-LEA64_32r path); true otherwise.
1150 MachineFunction &MF = *MI.getParent()->getParent();
1151 const TargetRegisterClass *RC;
// LEA32r uses the 32-bit class; every other opcode uses the 64-bit class.
1152 if (AllowSP) {
1153 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1154 } else {
1155 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1156 }
1157 Register SrcReg = Src.getReg();
1158 unsigned SubReg = Src.getSubReg();
1159 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);
1160
1161 NewSrcSubReg = X86::NoSubRegister;
1162
1163 // For both LEA64 and LEA32 the register already has essentially the right
1164 // type (32-bit or 64-bit); we may just need to forbid SP.
1165 if (Opc != X86::LEA64_32r) {
1166 NewSrc = SrcReg;
1167 NewSrcSubReg = SubReg;
1168 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1169
// Physical registers are passed through unchanged; only virtual registers
// are constrained to RC.
1170 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1171 return false;
1172
1173 return true;
1174 }
1175
1176 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1177 // another we need to add 64-bit registers to the final MI.
1178 if (SrcReg.isPhysical()) {
// Keep the original operand as an implicit operand and use the 64-bit
// register returned by getX86SubSuperRegister as the explicit LEA operand.
1179 ImplicitOp = Src;
1180 ImplicitOp.setImplicit();
1181
1182 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1183 assert(!SubReg && "no superregister for source");
1184 assert(NewSrc.isValid() && "Invalid Operand");
1185 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1186 } else {
1187 // Virtual register of the wrong class, we have to create a temporary 64-bit
1188 // vreg to feed into the LEA.
1189 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1190 NewSrcSubReg = X86::NoSubRegister;
// The COPY is inserted before MI and defines only the sub_32bit part of the
// new register (the def is flagged Undef).
1191 MachineInstr *Copy =
1192 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1193 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1194 .addReg(SrcReg, getKillRegState(isKill), SubReg);
1195
1196 // Which is obviously going to be dead after we're done with it.
1197 isKill = true;
1198
// The kill of SrcReg, if any, now belongs to the COPY instead of MI.
1199 if (LV)
1200 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1201
1202 if (LIS) {
1203 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1205 LiveInterval &LI = LIS->getInterval(SrcReg);
// If the segment ended at MI, end it at the COPY instead.
1207 if (S->end.getBaseIndex() == Idx)
1208 S->end = CopyIdx.getRegSlot();
1209 }
1210 }
1211
1212 // We've set all the parameters without issue.
1213 return true;
1214}
1215
// Rewrites the 8-bit or 16-bit two-address instruction `MI` (opcode `MIOpc`)
// into an LEA64_32r-based sequence:
//   IMPLICIT_DEF + COPY of the source into a sub-register of a 64-bit vreg,
//   an LEA64_32r computing the result into a 32-bit vreg, and
//   a COPY that reads the 8/16-bit sub-register of that result.
// `Is8BitOp` selects sub_8bit versus sub_16bit for those copies.
// `LV` and `LIS`, when non-null, are updated to reflect the new instructions.
//
// Returns the final extracting COPY, or nullptr when the subtarget is not
// 64-bit.
1216MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1218 LiveVariables *LV,
1219 LiveIntervals *LIS,
1220 bool Is8BitOp) const {
1221 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1222 MachineBasicBlock &MBB = *MI.getParent();
1224 assert((Is8BitOp ||
1226 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1227 "Unexpected type for LEA transform");
1228
1229 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1230 // something like this:
1231 // Opcode = X86::LEA32r;
1232 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1233 // OutRegLEA =
1234 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1235 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1236 if (!Subtarget.is64Bit())
1237 return nullptr;
1238
1239 unsigned Opcode = X86::LEA64_32r;
1240 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1241 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
// Second LEA input; only created for reg+reg adds with distinct sources.
1242 Register InRegLEA2;
1243
1244 // Build and insert into an implicit UNDEF value. This is OK because
1245 // we will be shifting and then extracting the lower 8/16-bits.
1246 // This has the potential to cause partial register stall. e.g.
1247 // movw (%rbp,%rcx,2), %dx
1248 // leal -65(%rdx), %esi
1249 // But testing has shown this *does* help performance in 64-bit mode (at
1250 // least on modern x86 machines).
1251 MachineBasicBlock::iterator MBBI = MI.getIterator();
1252 Register Dest = MI.getOperand(0).getReg();
1253 Register Src = MI.getOperand(1).getReg();
1254 unsigned SrcSubReg = MI.getOperand(1).getSubReg();
1255 Register Src2;
1256 unsigned Src2SubReg;
1257 bool IsDead = MI.getOperand(0).isDead();
1258 bool IsKill = MI.getOperand(1).isKill();
1259 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1260 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1261 MachineInstr *ImpDef =
1262 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1263 MachineInstr *InsMI =
1264 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1265 .addReg(InRegLEA, RegState::Define, SubReg)
1266 .addReg(Src, getKillRegState(IsKill), SrcSubReg);
1267 MachineInstr *ImpDef2 = nullptr;
1268 MachineInstr *InsMI2 = nullptr;
1269
1271 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1272#define CASE_NF(OP) \
1273 case X86::OP: \
1274 case X86::OP##_NF:
// Fill in the LEA address operands according to the original opcode.
1275 switch (MIOpc) {
1276 default:
1277 llvm_unreachable("Unreachable!");
1278 CASE_NF(SHL8ri)
1279 CASE_NF(SHL16ri) {
// Shift by an immediate: no base register, scale 1 << ShAmt, InRegLEA as
// the index register, zero displacement.
1280 unsigned ShAmt = MI.getOperand(2).getImm();
1281 MIB.addReg(0)
1282 .addImm(1LL << ShAmt)
1283 .addReg(InRegLEA, RegState::Kill)
1284 .addImm(0)
1285 .addReg(0);
1286 break;
1287 }
1288 CASE_NF(INC8r)
1289 CASE_NF(INC16r)
1290 addRegOffset(MIB, InRegLEA, true, 1);
1291 break;
1292 CASE_NF(DEC8r)
1293 CASE_NF(DEC16r)
1294 addRegOffset(MIB, InRegLEA, true, -1);
1295 break;
1296 CASE_NF(ADD8ri)
1297 CASE_NF(ADD16ri)
1298 case X86::ADD8ri_DB:
1299 case X86::ADD16ri_DB:
1300 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1301 break;
1302 CASE_NF(ADD8rr)
1303 CASE_NF(ADD16rr)
1304 case X86::ADD8rr_DB:
1305 case X86::ADD16rr_DB: {
1306 Src2 = MI.getOperand(2).getReg();
1307 Src2SubReg = MI.getOperand(2).getSubReg();
1308 bool IsKill2 = MI.getOperand(2).isKill();
1309 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1310 if (Src == Src2) {
1311 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1312 // just a single insert_subreg.
1313 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false,
1314 X86::NoSubRegister);
1315 } else {
// NOTE(review): the function already returned nullptr above when
// !Subtarget.is64Bit(), so the else-arm below appears unreachable.
1316 if (Subtarget.is64Bit())
1317 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1318 else
1319 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1320 // Build and insert into an implicit UNDEF value. This is OK because
1321 // we will be shifting and then extracting the lower 8/16-bits.
1322 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1323 InRegLEA2);
1324 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1325 .addReg(InRegLEA2, RegState::Define, SubReg)
1326 .addReg(Src2, getKillRegState(IsKill2), Src2SubReg);
1327 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true,
1328 X86::NoSubRegister);
1329 }
// The kill of Src2, if any, moves from MI to the second inserting COPY.
1330 if (LV && IsKill2 && InsMI2)
1331 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1332 break;
1333 }
1334 }
1335
1336 MachineInstr *NewMI = MIB;
// Extract the 8/16-bit result from the sub-register of the LEA output.
1337 MachineInstr *ExtMI =
1338 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1340 .addReg(OutRegLEA, RegState::Kill, SubReg);
1341
1342 if (LV) {
1343 // Update live variables.
1344 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1345 if (InRegLEA2)
1346 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1347 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1348 if (IsKill)
1349 LV->replaceKillInstruction(Src, MI, *InsMI);
1350 if (IsDead)
1351 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1352 }
1353
1354 if (LIS) {
// Register the new instructions in the slot-index maps; NewMI takes over
// MI's slot.
1355 LIS->InsertMachineInstrInMaps(*ImpDef);
1356 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1357 if (ImpDef2)
1358 LIS->InsertMachineInstrInMaps(*ImpDef2);
1359 SlotIndex Ins2Idx;
1360 if (InsMI2)
1361 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1362 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1363 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
// Results are discarded; presumably these calls exist to create the
// intervals for the new virtual registers - TODO confirm.
1364 LIS->getInterval(InRegLEA);
1365 LIS->getInterval(OutRegLEA);
1366 if (InRegLEA2)
1367 LIS->getInterval(InRegLEA2);
1368
1369 // Move the use of Src up to InsMI.
1370 LiveInterval &SrcLI = LIS->getInterval(Src);
1371 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1372 if (SrcSeg->end == NewIdx.getRegSlot())
1373 SrcSeg->end = InsIdx.getRegSlot();
1374
1375 if (InsMI2) {
1376 // Move the use of Src2 up to InsMI2.
1377 LiveInterval &Src2LI = LIS->getInterval(Src2);
1378 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1379 if (Src2Seg->end == NewIdx.getRegSlot())
1380 Src2Seg->end = Ins2Idx.getRegSlot();
1381 }
1382
1383 // Move the definition of Dest down to ExtMI.
1384 LiveInterval &DestLI = LIS->getInterval(Dest);
1385 LiveRange::Segment *DestSeg =
1386 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1387 assert(DestSeg->start == NewIdx.getRegSlot() &&
1388 DestSeg->valno->def == NewIdx.getRegSlot());
1389 DestSeg->start = ExtIdx.getRegSlot();
1390 DestSeg->valno->def = ExtIdx.getRegSlot();
1391 }
1392
1393 return ExtMI;
1394}
1395
1396/// This method must be implemented by targets that
1397/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1398/// may be able to convert a two-address instruction into a true
1399/// three-address instruction on demand. This allows the X86 target (for
1400/// example) to convert ADD and SHL instructions into LEA instructions if they
1401/// would require register copies due to two-addressness.
1402///
1403/// This method returns a null pointer if the transformation cannot be
1404/// performed, otherwise it returns the new instruction.
1405///
1407 LiveVariables *LV,
1408 LiveIntervals *LIS) const {
1409 // The following opcodes also set the condition code register(s). Only
1410 // convert them to equivalent lea if the condition code register def's
1411 // are dead!
1413 return nullptr;
1414
1415 MachineFunction &MF = *MI.getParent()->getParent();
1416 // All instructions input are two-addr instructions. Get the known operands.
1417 const MachineOperand &Dest = MI.getOperand(0);
1418 const MachineOperand &Src = MI.getOperand(1);
1419
1420 // Ideally, operations with undef should be folded before we get here, but we
1421 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1422 // Without this, we have to forward undef state to new register operands to
1423 // avoid machine verifier errors.
1424 if (Src.isUndef())
1425 return nullptr;
1426 if (MI.getNumOperands() > 2)
1427 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1428 return nullptr;
1429
1430 MachineInstr *NewMI = nullptr;
1431 Register SrcReg, SrcReg2;
1432 unsigned SrcSubReg, SrcSubReg2;
1433 bool Is64Bit = Subtarget.is64Bit();
1434
1435 bool Is8BitOp = false;
// Number of leading operands of MI inspected for dead/kill flags when
// updating LiveVariables after the switch.
1436 unsigned NumRegOperands = 2;
1437 unsigned MIOpc = MI.getOpcode();
// Each case either builds NewMI (not yet inserted) and breaks, or returns
// directly (nullptr, or the result of convertToThreeAddressWithLEA for the
// 8/16-bit forms).
1438 switch (MIOpc) {
1439 default:
1440 llvm_unreachable("Unreachable!");
1441 CASE_NF(SHL64ri) {
1442 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1443 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1444 if (!isTruncatedShiftCountForLEA(ShAmt))
1445 return nullptr;
1446
1447 // LEA can't handle RSP.
1448 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1449 Src.getReg(), &X86::GR64_NOSPRegClass))
1450 return nullptr;
1451
// Shift becomes an LEA with no base, scale 1 << ShAmt, Src as index and a
// zero displacement.
1452 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1453 .add(Dest)
1454 .addReg(0)
1455 .addImm(1LL << ShAmt)
1456 .add(Src)
1457 .addImm(0)
1458 .addReg(0);
1459 break;
1460 }
1461 CASE_NF(SHL32ri) {
1462 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1463 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1464 if (!isTruncatedShiftCountForLEA(ShAmt))
1465 return nullptr;
1466
1467 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1468
1469 // LEA can't handle ESP.
1470 bool isKill;
1471 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1472 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1473 isKill, ImplicitOp, LV, LIS))
1474 return nullptr;
1475
1477 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1478 .add(Dest)
1479 .addReg(0)
1480 .addImm(1LL << ShAmt)
1481 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
1482 .addImm(0)
1483 .addReg(0);
// A non-zero register here means classifyLEAReg supplied an implicit
// operand that must be attached to the LEA.
1484 if (ImplicitOp.getReg() != 0)
1485 MIB.add(ImplicitOp);
1486 NewMI = MIB;
1487
1488 // Add kills if classifyLEAReg created a new register.
1489 if (LV && SrcReg != Src.getReg())
1490 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1491 break;
1492 }
1493 CASE_NF(SHL8ri)
1494 Is8BitOp = true;
1495 [[fallthrough]];
1496 CASE_NF(SHL16ri) {
1497 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1498 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1499 if (!isTruncatedShiftCountForLEA(ShAmt))
1500 return nullptr;
1501 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1502 }
1503 CASE_NF(INC64r)
1504 CASE_NF(INC32r) {
1505 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1506 unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
1507 ? X86::LEA64r
1508 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1509 bool isKill;
1510 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1511 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1512 isKill, ImplicitOp, LV, LIS))
1513 return nullptr;
1514
1515 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1516 .add(Dest)
1517 .addReg(SrcReg, getKillRegState(isKill));
1518 if (ImplicitOp.getReg() != 0)
1519 MIB.add(ImplicitOp);
1520
// Increment is expressed as an LEA with offset +1.
1521 NewMI = addOffset(MIB, 1);
1522
1523 // Add kills if classifyLEAReg created a new register.
1524 if (LV && SrcReg != Src.getReg())
1525 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1526 break;
1527 }
1528 CASE_NF(DEC64r)
1529 CASE_NF(DEC32r) {
1530 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1531 unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
1532 ? X86::LEA64r
1533 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1534
1535 bool isKill;
1536 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1537 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1538 isKill, ImplicitOp, LV, LIS))
1539 return nullptr;
1540
1541 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1542 .add(Dest)
1543 .addReg(SrcReg, getKillRegState(isKill));
1544 if (ImplicitOp.getReg() != 0)
1545 MIB.add(ImplicitOp);
1546
// Decrement is expressed as an LEA with offset -1.
1547 NewMI = addOffset(MIB, -1);
1548
1549 // Add kills if classifyLEAReg created a new register.
1550 if (LV && SrcReg != Src.getReg())
1551 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1552 break;
1553 }
1554 CASE_NF(DEC8r)
1555 CASE_NF(INC8r)
1556 Is8BitOp = true;
1557 [[fallthrough]];
1558 CASE_NF(DEC16r)
1559 CASE_NF(INC16r)
1560 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1561 CASE_NF(ADD64rr)
1562 CASE_NF(ADD32rr)
1563 case X86::ADD64rr_DB:
1564 case X86::ADD32rr_DB: {
1565 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1566 unsigned Opc;
1567 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_NF ||
1568 MIOpc == X86::ADD64rr_DB)
1569 Opc = X86::LEA64r;
1570 else
1571 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1572
// The second source is classified first, with SP disallowed; the first
// source is classified afterwards with SP allowed.
1573 const MachineOperand &Src2 = MI.getOperand(2);
1574 bool isKill2;
1575 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1576 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2,
1577 isKill2, ImplicitOp2, LV, LIS))
1578 return nullptr;
1579
1580 bool isKill;
1581 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1582 if (Src.getReg() == Src2.getReg()) {
1583 // Don't call classifyLEAReg a second time on the same register, in case
1584 // the first call inserted a COPY from Src2 and marked it as killed.
1585 isKill = isKill2;
1586 SrcReg = SrcReg2;
1587 SrcSubReg = SrcSubReg2;
1588 } else {
1589 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1590 isKill, ImplicitOp, LV, LIS))
1591 return nullptr;
1592 }
1593
1594 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1595 if (ImplicitOp.getReg() != 0)
1596 MIB.add(ImplicitOp);
1597 if (ImplicitOp2.getReg() != 0)
1598 MIB.add(ImplicitOp2);
1599
1600 NewMI =
1601 addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2);
1602
1603 // Add kills if classifyLEAReg created a new register.
1604 if (LV) {
1605 if (SrcReg2 != Src2.getReg())
1606 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1607 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1608 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1609 }
// Operand 2 is a register here, so include it in the LiveVariables update.
1610 NumRegOperands = 3;
1611 break;
1612 }
1613 CASE_NF(ADD8rr)
1614 case X86::ADD8rr_DB:
1615 Is8BitOp = true;
1616 [[fallthrough]];
1617 CASE_NF(ADD16rr)
1618 case X86::ADD16rr_DB:
1619 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1620 CASE_NF(ADD64ri32)
1621 case X86::ADD64ri32_DB:
1622 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
// The immediate operand of the add is passed to addOffset as the LEA offset.
1623 NewMI = addOffset(
1624 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1625 MI.getOperand(2));
1626 break;
1627 CASE_NF(ADD32ri)
1628 case X86::ADD32ri_DB: {
1629 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1630 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1631
1632 bool isKill;
1633 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1634 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1635 isKill, ImplicitOp, LV, LIS))
1636 return nullptr;
1637
1639 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1640 .add(Dest)
1641 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1642 if (ImplicitOp.getReg() != 0)
1643 MIB.add(ImplicitOp);
1644
1645 NewMI = addOffset(MIB, MI.getOperand(2));
1646
1647 // Add kills if classifyLEAReg created a new register.
1648 if (LV && SrcReg != Src.getReg())
1649 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1650 break;
1651 }
1652 CASE_NF(ADD8ri)
1653 case X86::ADD8ri_DB:
1654 Is8BitOp = true;
1655 [[fallthrough]];
1656 CASE_NF(ADD16ri)
1657 case X86::ADD16ri_DB:
1658 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1659 CASE_NF(SUB8ri)
1660 CASE_NF(SUB16ri)
1661 /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1662 return nullptr;
1663 CASE_NF(SUB32ri) {
1664 if (!MI.getOperand(2).isImm())
1665 return nullptr;
// The subtraction is emitted as an LEA with offset -Imm, so -Imm must fit
// in a signed 32-bit value.
1666 int64_t Imm = MI.getOperand(2).getImm();
1667 if (!isInt<32>(-Imm))
1668 return nullptr;
1669
1670 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1671 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1672
1673 bool isKill;
1674 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1675 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1676 isKill, ImplicitOp, LV, LIS))
1677 return nullptr;
1678
1680 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1681 .add(Dest)
1682 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1683 if (ImplicitOp.getReg() != 0)
1684 MIB.add(ImplicitOp);
1685
1686 NewMI = addOffset(MIB, -Imm);
1687
1688 // Add kills if classifyLEAReg created a new register.
1689 if (LV && SrcReg != Src.getReg())
1690 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1691 break;
1692 }
1693
1694 CASE_NF(SUB64ri32) {
1695 if (!MI.getOperand(2).isImm())
1696 return nullptr;
// Same scheme as the 32-bit subtract: LEA64r with offset -Imm.
1697 int64_t Imm = MI.getOperand(2).getImm();
1698 if (!isInt<32>(-Imm))
1699 return nullptr;
1700
1701 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1702
1704 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1705 NewMI = addOffset(MIB, -Imm);
1706 break;
1707 }
1708
// Masked loads / broadcast loads (rmk forms): each is rewritten to the blend
// opcode of matching element type and vector length chosen below.
1709 case X86::VMOVDQU8Z128rmk:
1710 case X86::VMOVDQU8Z256rmk:
1711 case X86::VMOVDQU8Zrmk:
1712 case X86::VMOVDQU16Z128rmk:
1713 case X86::VMOVDQU16Z256rmk:
1714 case X86::VMOVDQU16Zrmk:
1715 case X86::VMOVDQU32Z128rmk:
1716 case X86::VMOVDQA32Z128rmk:
1717 case X86::VMOVDQU32Z256rmk:
1718 case X86::VMOVDQA32Z256rmk:
1719 case X86::VMOVDQU32Zrmk:
1720 case X86::VMOVDQA32Zrmk:
1721 case X86::VMOVDQU64Z128rmk:
1722 case X86::VMOVDQA64Z128rmk:
1723 case X86::VMOVDQU64Z256rmk:
1724 case X86::VMOVDQA64Z256rmk:
1725 case X86::VMOVDQU64Zrmk:
1726 case X86::VMOVDQA64Zrmk:
1727 case X86::VMOVUPDZ128rmk:
1728 case X86::VMOVAPDZ128rmk:
1729 case X86::VMOVUPDZ256rmk:
1730 case X86::VMOVAPDZ256rmk:
1731 case X86::VMOVUPDZrmk:
1732 case X86::VMOVAPDZrmk:
1733 case X86::VMOVUPSZ128rmk:
1734 case X86::VMOVAPSZ128rmk:
1735 case X86::VMOVUPSZ256rmk:
1736 case X86::VMOVAPSZ256rmk:
1737 case X86::VMOVUPSZrmk:
1738 case X86::VMOVAPSZrmk:
1739 case X86::VBROADCASTSDZ256rmk:
1740 case X86::VBROADCASTSDZrmk:
1741 case X86::VBROADCASTSSZ128rmk:
1742 case X86::VBROADCASTSSZ256rmk:
1743 case X86::VBROADCASTSSZrmk:
1744 case X86::VPBROADCASTDZ128rmk:
1745 case X86::VPBROADCASTDZ256rmk:
1746 case X86::VPBROADCASTDZrmk:
1747 case X86::VPBROADCASTQZ128rmk:
1748 case X86::VPBROADCASTQZ256rmk:
1749 case X86::VPBROADCASTQZrmk: {
1750 unsigned Opc;
// Aligned and unaligned moves map to the same blend opcode; broadcasts map
// to the rmbk blend forms.
1751 switch (MIOpc) {
1752 default:
1753 llvm_unreachable("Unreachable!");
1754 case X86::VMOVDQU8Z128rmk:
1755 Opc = X86::VPBLENDMBZ128rmk;
1756 break;
1757 case X86::VMOVDQU8Z256rmk:
1758 Opc = X86::VPBLENDMBZ256rmk;
1759 break;
1760 case X86::VMOVDQU8Zrmk:
1761 Opc = X86::VPBLENDMBZrmk;
1762 break;
1763 case X86::VMOVDQU16Z128rmk:
1764 Opc = X86::VPBLENDMWZ128rmk;
1765 break;
1766 case X86::VMOVDQU16Z256rmk:
1767 Opc = X86::VPBLENDMWZ256rmk;
1768 break;
1769 case X86::VMOVDQU16Zrmk:
1770 Opc = X86::VPBLENDMWZrmk;
1771 break;
1772 case X86::VMOVDQU32Z128rmk:
1773 Opc = X86::VPBLENDMDZ128rmk;
1774 break;
1775 case X86::VMOVDQU32Z256rmk:
1776 Opc = X86::VPBLENDMDZ256rmk;
1777 break;
1778 case X86::VMOVDQU32Zrmk:
1779 Opc = X86::VPBLENDMDZrmk;
1780 break;
1781 case X86::VMOVDQU64Z128rmk:
1782 Opc = X86::VPBLENDMQZ128rmk;
1783 break;
1784 case X86::VMOVDQU64Z256rmk:
1785 Opc = X86::VPBLENDMQZ256rmk;
1786 break;
1787 case X86::VMOVDQU64Zrmk:
1788 Opc = X86::VPBLENDMQZrmk;
1789 break;
1790 case X86::VMOVUPDZ128rmk:
1791 Opc = X86::VBLENDMPDZ128rmk;
1792 break;
1793 case X86::VMOVUPDZ256rmk:
1794 Opc = X86::VBLENDMPDZ256rmk;
1795 break;
1796 case X86::VMOVUPDZrmk:
1797 Opc = X86::VBLENDMPDZrmk;
1798 break;
1799 case X86::VMOVUPSZ128rmk:
1800 Opc = X86::VBLENDMPSZ128rmk;
1801 break;
1802 case X86::VMOVUPSZ256rmk:
1803 Opc = X86::VBLENDMPSZ256rmk;
1804 break;
1805 case X86::VMOVUPSZrmk:
1806 Opc = X86::VBLENDMPSZrmk;
1807 break;
1808 case X86::VMOVDQA32Z128rmk:
1809 Opc = X86::VPBLENDMDZ128rmk;
1810 break;
1811 case X86::VMOVDQA32Z256rmk:
1812 Opc = X86::VPBLENDMDZ256rmk;
1813 break;
1814 case X86::VMOVDQA32Zrmk:
1815 Opc = X86::VPBLENDMDZrmk;
1816 break;
1817 case X86::VMOVDQA64Z128rmk:
1818 Opc = X86::VPBLENDMQZ128rmk;
1819 break;
1820 case X86::VMOVDQA64Z256rmk:
1821 Opc = X86::VPBLENDMQZ256rmk;
1822 break;
1823 case X86::VMOVDQA64Zrmk:
1824 Opc = X86::VPBLENDMQZrmk;
1825 break;
1826 case X86::VMOVAPDZ128rmk:
1827 Opc = X86::VBLENDMPDZ128rmk;
1828 break;
1829 case X86::VMOVAPDZ256rmk:
1830 Opc = X86::VBLENDMPDZ256rmk;
1831 break;
1832 case X86::VMOVAPDZrmk:
1833 Opc = X86::VBLENDMPDZrmk;
1834 break;
1835 case X86::VMOVAPSZ128rmk:
1836 Opc = X86::VBLENDMPSZ128rmk;
1837 break;
1838 case X86::VMOVAPSZ256rmk:
1839 Opc = X86::VBLENDMPSZ256rmk;
1840 break;
1841 case X86::VMOVAPSZrmk:
1842 Opc = X86::VBLENDMPSZrmk;
1843 break;
1844 case X86::VBROADCASTSDZ256rmk:
1845 Opc = X86::VBLENDMPDZ256rmbk;
1846 break;
1847 case X86::VBROADCASTSDZrmk:
1848 Opc = X86::VBLENDMPDZrmbk;
1849 break;
1850 case X86::VBROADCASTSSZ128rmk:
1851 Opc = X86::VBLENDMPSZ128rmbk;
1852 break;
1853 case X86::VBROADCASTSSZ256rmk:
1854 Opc = X86::VBLENDMPSZ256rmbk;
1855 break;
1856 case X86::VBROADCASTSSZrmk:
1857 Opc = X86::VBLENDMPSZrmbk;
1858 break;
1859 case X86::VPBROADCASTDZ128rmk:
1860 Opc = X86::VPBLENDMDZ128rmbk;
1861 break;
1862 case X86::VPBROADCASTDZ256rmk:
1863 Opc = X86::VPBLENDMDZ256rmbk;
1864 break;
1865 case X86::VPBROADCASTDZrmk:
1866 Opc = X86::VPBLENDMDZrmbk;
1867 break;
1868 case X86::VPBROADCASTQZ128rmk:
1869 Opc = X86::VPBLENDMQZ128rmbk;
1870 break;
1871 case X86::VPBROADCASTQZ256rmk:
1872 Opc = X86::VPBLENDMQZ256rmbk;
1873 break;
1874 case X86::VPBROADCASTQZrmk:
1875 Opc = X86::VPBLENDMQZrmbk;
1876 break;
1877 }
1878
// Operand order for the blend: Dest, original operand 2, then Src (original
// operand 1), followed by original operands 3 through 7.
1879 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1880 .add(Dest)
1881 .add(MI.getOperand(2))
1882 .add(Src)
1883 .add(MI.getOperand(3))
1884 .add(MI.getOperand(4))
1885 .add(MI.getOperand(5))
1886 .add(MI.getOperand(6))
1887 .add(MI.getOperand(7));
1888 NumRegOperands = 4;
1889 break;
1890 }
1891
// Masked register-to-register moves (rrk forms): rewritten to the rrk blend
// opcode of matching element type and vector length.
1892 case X86::VMOVDQU8Z128rrk:
1893 case X86::VMOVDQU8Z256rrk:
1894 case X86::VMOVDQU8Zrrk:
1895 case X86::VMOVDQU16Z128rrk:
1896 case X86::VMOVDQU16Z256rrk:
1897 case X86::VMOVDQU16Zrrk:
1898 case X86::VMOVDQU32Z128rrk:
1899 case X86::VMOVDQA32Z128rrk:
1900 case X86::VMOVDQU32Z256rrk:
1901 case X86::VMOVDQA32Z256rrk:
1902 case X86::VMOVDQU32Zrrk:
1903 case X86::VMOVDQA32Zrrk:
1904 case X86::VMOVDQU64Z128rrk:
1905 case X86::VMOVDQA64Z128rrk:
1906 case X86::VMOVDQU64Z256rrk:
1907 case X86::VMOVDQA64Z256rrk:
1908 case X86::VMOVDQU64Zrrk:
1909 case X86::VMOVDQA64Zrrk:
1910 case X86::VMOVUPDZ128rrk:
1911 case X86::VMOVAPDZ128rrk:
1912 case X86::VMOVUPDZ256rrk:
1913 case X86::VMOVAPDZ256rrk:
1914 case X86::VMOVUPDZrrk:
1915 case X86::VMOVAPDZrrk:
1916 case X86::VMOVUPSZ128rrk:
1917 case X86::VMOVAPSZ128rrk:
1918 case X86::VMOVUPSZ256rrk:
1919 case X86::VMOVAPSZ256rrk:
1920 case X86::VMOVUPSZrrk:
1921 case X86::VMOVAPSZrrk: {
1922 unsigned Opc;
1923 switch (MIOpc) {
1924 default:
1925 llvm_unreachable("Unreachable!");
1926 case X86::VMOVDQU8Z128rrk:
1927 Opc = X86::VPBLENDMBZ128rrk;
1928 break;
1929 case X86::VMOVDQU8Z256rrk:
1930 Opc = X86::VPBLENDMBZ256rrk;
1931 break;
1932 case X86::VMOVDQU8Zrrk:
1933 Opc = X86::VPBLENDMBZrrk;
1934 break;
1935 case X86::VMOVDQU16Z128rrk:
1936 Opc = X86::VPBLENDMWZ128rrk;
1937 break;
1938 case X86::VMOVDQU16Z256rrk:
1939 Opc = X86::VPBLENDMWZ256rrk;
1940 break;
1941 case X86::VMOVDQU16Zrrk:
1942 Opc = X86::VPBLENDMWZrrk;
1943 break;
1944 case X86::VMOVDQU32Z128rrk:
1945 Opc = X86::VPBLENDMDZ128rrk;
1946 break;
1947 case X86::VMOVDQU32Z256rrk:
1948 Opc = X86::VPBLENDMDZ256rrk;
1949 break;
1950 case X86::VMOVDQU32Zrrk:
1951 Opc = X86::VPBLENDMDZrrk;
1952 break;
1953 case X86::VMOVDQU64Z128rrk:
1954 Opc = X86::VPBLENDMQZ128rrk;
1955 break;
1956 case X86::VMOVDQU64Z256rrk:
1957 Opc = X86::VPBLENDMQZ256rrk;
1958 break;
1959 case X86::VMOVDQU64Zrrk:
1960 Opc = X86::VPBLENDMQZrrk;
1961 break;
1962 case X86::VMOVUPDZ128rrk:
1963 Opc = X86::VBLENDMPDZ128rrk;
1964 break;
1965 case X86::VMOVUPDZ256rrk:
1966 Opc = X86::VBLENDMPDZ256rrk;
1967 break;
1968 case X86::VMOVUPDZrrk:
1969 Opc = X86::VBLENDMPDZrrk;
1970 break;
1971 case X86::VMOVUPSZ128rrk:
1972 Opc = X86::VBLENDMPSZ128rrk;
1973 break;
1974 case X86::VMOVUPSZ256rrk:
1975 Opc = X86::VBLENDMPSZ256rrk;
1976 break;
1977 case X86::VMOVUPSZrrk:
1978 Opc = X86::VBLENDMPSZrrk;
1979 break;
1980 case X86::VMOVDQA32Z128rrk:
1981 Opc = X86::VPBLENDMDZ128rrk;
1982 break;
1983 case X86::VMOVDQA32Z256rrk:
1984 Opc = X86::VPBLENDMDZ256rrk;
1985 break;
1986 case X86::VMOVDQA32Zrrk:
1987 Opc = X86::VPBLENDMDZrrk;
1988 break;
1989 case X86::VMOVDQA64Z128rrk:
1990 Opc = X86::VPBLENDMQZ128rrk;
1991 break;
1992 case X86::VMOVDQA64Z256rrk:
1993 Opc = X86::VPBLENDMQZ256rrk;
1994 break;
1995 case X86::VMOVDQA64Zrrk:
1996 Opc = X86::VPBLENDMQZrrk;
1997 break;
1998 case X86::VMOVAPDZ128rrk:
1999 Opc = X86::VBLENDMPDZ128rrk;
2000 break;
2001 case X86::VMOVAPDZ256rrk:
2002 Opc = X86::VBLENDMPDZ256rrk;
2003 break;
2004 case X86::VMOVAPDZrrk:
2005 Opc = X86::VBLENDMPDZrrk;
2006 break;
2007 case X86::VMOVAPSZ128rrk:
2008 Opc = X86::VBLENDMPSZ128rrk;
2009 break;
2010 case X86::VMOVAPSZ256rrk:
2011 Opc = X86::VBLENDMPSZ256rrk;
2012 break;
2013 case X86::VMOVAPSZrrk:
2014 Opc = X86::VBLENDMPSZrrk;
2015 break;
2016 }
2017
// Operand order for the blend: Dest, original operand 2, Src (original
// operand 1), original operand 3.
2018 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2019 .add(Dest)
2020 .add(MI.getOperand(2))
2021 .add(Src)
2022 .add(MI.getOperand(3));
2023 NumRegOperands = 4;
2024 break;
2025 }
2026 }
2027#undef CASE_NF
2028
2029 if (!NewMI)
2030 return nullptr;
2031
2032 if (LV) { // Update live variables
// Transfer dead/kill bookkeeping for the first NumRegOperands operands from
// MI to NewMI.
2033 for (unsigned I = 0; I < NumRegOperands; ++I) {
2034 MachineOperand &Op = MI.getOperand(I);
2035 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2036 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2037 }
2038 }
2039
2040 MachineBasicBlock &MBB = *MI.getParent();
2041 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2042
2043 if (LIS) {
2044 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
// SrcReg / SrcReg2 are only set by the cases that call classifyLEAReg.
// The results are discarded; presumably the calls exist to create the
// intervals for any newly created registers - TODO confirm.
2045 if (SrcReg)
2046 LIS->getInterval(SrcReg);
2047 if (SrcReg2)
2048 LIS->getInterval(SrcReg2);
2049 }
2050
2051 return NewMI;
2052}
2053
2054/// This determines which of three possible cases of a three source commute
2055/// the source indexes correspond to taking into account any mask operands.
2056/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
2057/// possible.
2058/// Case 0 - Possible to commute the first and second operands.
2059/// Case 1 - Possible to commute the first and third operands.
2060/// Case 2 - Possible to commute the second and third operands.
2061static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2062 unsigned SrcOpIdx2) {
2063 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2064 if (SrcOpIdx1 > SrcOpIdx2)
2065 std::swap(SrcOpIdx1, SrcOpIdx2);
2066
2067 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2068 if (X86II::isKMasked(TSFlags)) {
2069 Op2++;
2070 Op3++;
2071 }
2072
2073 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2074 return 0;
2075 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2076 return 1;
2077 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2078 return 2;
2079 llvm_unreachable("Unknown three src commute case.");
2080}
2081
2083 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2084 const X86InstrFMA3Group &FMA3Group) const {
2085
2086 unsigned Opc = MI.getOpcode();
2087
2088 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
2089 // analysis. The commute optimization is legal only if all users of FMA*_Int
2090 // use only the lowest element of the FMA*_Int instruction. Such analysis are
2091 // not implemented yet. So, just return 0 in that case.
2092 // When such analysis are available this place will be the right place for
2093 // calling it.
2094 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2095 "Intrinsic instructions can't commute operand 1");
2096
2097 // Determine which case this commute is or if it can't be done.
2098 unsigned Case =
2099 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2100 assert(Case < 3 && "Unexpected case number!");
2101
2102 // Define the FMA forms mapping array that helps to map input FMA form
2103 // to output FMA form to preserve the operation semantics after
2104 // commuting the operands.
2105 const unsigned Form132Index = 0;
2106 const unsigned Form213Index = 1;
2107 const unsigned Form231Index = 2;
2108 static const unsigned FormMapping[][3] = {
2109 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2110 // FMA132 A, C, b; ==> FMA231 C, A, b;
2111 // FMA213 B, A, c; ==> FMA213 A, B, c;
2112 // FMA231 C, A, b; ==> FMA132 A, C, b;
2113 {Form231Index, Form213Index, Form132Index},
2114 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2115 // FMA132 A, c, B; ==> FMA132 B, c, A;
2116 // FMA213 B, a, C; ==> FMA231 C, a, B;
2117 // FMA231 C, a, B; ==> FMA213 B, a, C;
2118 {Form132Index, Form231Index, Form213Index},
2119 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2120 // FMA132 a, C, B; ==> FMA213 a, B, C;
2121 // FMA213 b, A, C; ==> FMA132 b, C, A;
2122 // FMA231 c, A, B; ==> FMA231 c, B, A;
2123 {Form213Index, Form132Index, Form231Index}};
2124
2125 unsigned FMAForms[3];
2126 FMAForms[0] = FMA3Group.get132Opcode();
2127 FMAForms[1] = FMA3Group.get213Opcode();
2128 FMAForms[2] = FMA3Group.get231Opcode();
2129
2130 // Everything is ready, just adjust the FMA opcode and return it.
2131 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2132 if (Opc == FMAForms[FormIndex])
2133 return FMAForms[FormMapping[Case][FormIndex]];
2134
2135 llvm_unreachable("Illegal FMA3 format");
2136}
2137
2138static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2139 unsigned SrcOpIdx2) {
2140 // Determine which case this commute is or if it can't be done.
2141 unsigned Case =
2142 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2143 assert(Case < 3 && "Unexpected case value!");
2144
2145 // For each case we need to swap two pairs of bits in the final immediate.
2146 static const uint8_t SwapMasks[3][4] = {
2147 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2148 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2149 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2150 };
2151
2152 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2153 // Clear out the bits we are swapping.
2154 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2155 SwapMasks[Case][2] | SwapMasks[Case][3]);
2156 // If the immediate had a bit of the pair set, then set the opposite bit.
2157 if (Imm & SwapMasks[Case][0])
2158 NewImm |= SwapMasks[Case][1];
2159 if (Imm & SwapMasks[Case][1])
2160 NewImm |= SwapMasks[Case][0];
2161 if (Imm & SwapMasks[Case][2])
2162 NewImm |= SwapMasks[Case][3];
2163 if (Imm & SwapMasks[Case][3])
2164 NewImm |= SwapMasks[Case][2];
2165 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2166}
2167
2168// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2169// commuted.
2170static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2171#define VPERM_CASES(Suffix) \
2172 case X86::VPERMI2##Suffix##Z128rr: \
2173 case X86::VPERMT2##Suffix##Z128rr: \
2174 case X86::VPERMI2##Suffix##Z256rr: \
2175 case X86::VPERMT2##Suffix##Z256rr: \
2176 case X86::VPERMI2##Suffix##Zrr: \
2177 case X86::VPERMT2##Suffix##Zrr: \
2178 case X86::VPERMI2##Suffix##Z128rm: \
2179 case X86::VPERMT2##Suffix##Z128rm: \
2180 case X86::VPERMI2##Suffix##Z256rm: \
2181 case X86::VPERMT2##Suffix##Z256rm: \
2182 case X86::VPERMI2##Suffix##Zrm: \
2183 case X86::VPERMT2##Suffix##Zrm: \
2184 case X86::VPERMI2##Suffix##Z128rrkz: \
2185 case X86::VPERMT2##Suffix##Z128rrkz: \
2186 case X86::VPERMI2##Suffix##Z256rrkz: \
2187 case X86::VPERMT2##Suffix##Z256rrkz: \
2188 case X86::VPERMI2##Suffix##Zrrkz: \
2189 case X86::VPERMT2##Suffix##Zrrkz: \
2190 case X86::VPERMI2##Suffix##Z128rmkz: \
2191 case X86::VPERMT2##Suffix##Z128rmkz: \
2192 case X86::VPERMI2##Suffix##Z256rmkz: \
2193 case X86::VPERMT2##Suffix##Z256rmkz: \
2194 case X86::VPERMI2##Suffix##Zrmkz: \
2195 case X86::VPERMT2##Suffix##Zrmkz:
2196
2197#define VPERM_CASES_BROADCAST(Suffix) \
2198 VPERM_CASES(Suffix) \
2199 case X86::VPERMI2##Suffix##Z128rmb: \
2200 case X86::VPERMT2##Suffix##Z128rmb: \
2201 case X86::VPERMI2##Suffix##Z256rmb: \
2202 case X86::VPERMT2##Suffix##Z256rmb: \
2203 case X86::VPERMI2##Suffix##Zrmb: \
2204 case X86::VPERMT2##Suffix##Zrmb: \
2205 case X86::VPERMI2##Suffix##Z128rmbkz: \
2206 case X86::VPERMT2##Suffix##Z128rmbkz: \
2207 case X86::VPERMI2##Suffix##Z256rmbkz: \
2208 case X86::VPERMT2##Suffix##Z256rmbkz: \
2209 case X86::VPERMI2##Suffix##Zrmbkz: \
2210 case X86::VPERMT2##Suffix##Zrmbkz:
2211
2212 switch (Opcode) {
2213 default:
2214 return false;
2215 VPERM_CASES(B)
2220 VPERM_CASES(W)
2221 return true;
2222 }
2223#undef VPERM_CASES_BROADCAST
2224#undef VPERM_CASES
2225}
2226
2227// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2228// from the I opcode to the T opcode and vice versa.
2229static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2230#define VPERM_CASES(Orig, New) \
2231 case X86::Orig##Z128rr: \
2232 return X86::New##Z128rr; \
2233 case X86::Orig##Z128rrkz: \
2234 return X86::New##Z128rrkz; \
2235 case X86::Orig##Z128rm: \
2236 return X86::New##Z128rm; \
2237 case X86::Orig##Z128rmkz: \
2238 return X86::New##Z128rmkz; \
2239 case X86::Orig##Z256rr: \
2240 return X86::New##Z256rr; \
2241 case X86::Orig##Z256rrkz: \
2242 return X86::New##Z256rrkz; \
2243 case X86::Orig##Z256rm: \
2244 return X86::New##Z256rm; \
2245 case X86::Orig##Z256rmkz: \
2246 return X86::New##Z256rmkz; \
2247 case X86::Orig##Zrr: \
2248 return X86::New##Zrr; \
2249 case X86::Orig##Zrrkz: \
2250 return X86::New##Zrrkz; \
2251 case X86::Orig##Zrm: \
2252 return X86::New##Zrm; \
2253 case X86::Orig##Zrmkz: \
2254 return X86::New##Zrmkz;
2255
2256#define VPERM_CASES_BROADCAST(Orig, New) \
2257 VPERM_CASES(Orig, New) \
2258 case X86::Orig##Z128rmb: \
2259 return X86::New##Z128rmb; \
2260 case X86::Orig##Z128rmbkz: \
2261 return X86::New##Z128rmbkz; \
2262 case X86::Orig##Z256rmb: \
2263 return X86::New##Z256rmb; \
2264 case X86::Orig##Z256rmbkz: \
2265 return X86::New##Z256rmbkz; \
2266 case X86::Orig##Zrmb: \
2267 return X86::New##Zrmb; \
2268 case X86::Orig##Zrmbkz: \
2269 return X86::New##Zrmbkz;
2270
2271 switch (Opcode) {
2272 VPERM_CASES(VPERMI2B, VPERMT2B)
2273 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2274 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2275 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2276 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2277 VPERM_CASES(VPERMI2W, VPERMT2W)
2278 VPERM_CASES(VPERMT2B, VPERMI2B)
2279 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2280 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2281 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2282 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2283 VPERM_CASES(VPERMT2W, VPERMI2W)
2284 }
2285
2286 llvm_unreachable("Unreachable!");
2287#undef VPERM_CASES_BROADCAST
2288#undef VPERM_CASES
2289}
2290
2292 unsigned OpIdx1,
2293 unsigned OpIdx2) const {
2294 auto CloneIfNew = [&](MachineInstr &MI) {
2295 return std::exchange(NewMI, false)
2296 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2297 : &MI;
2298 };
2299 MachineInstr *WorkingMI = nullptr;
2300 unsigned Opc = MI.getOpcode();
2301
2302#define CASE_ND(OP) \
2303 case X86::OP: \
2304 case X86::OP##_ND:
2305
2306 switch (Opc) {
2307 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2308 CASE_ND(SHRD16rri8)
2309 CASE_ND(SHLD16rri8)
2310 CASE_ND(SHRD32rri8)
2311 CASE_ND(SHLD32rri8)
2312 CASE_ND(SHRD64rri8)
2313 CASE_ND(SHLD64rri8) {
2314 unsigned Size;
2315 switch (Opc) {
2316 default:
2317 llvm_unreachable("Unreachable!");
2318#define FROM_TO_SIZE(A, B, S) \
2319 case X86::A: \
2320 Opc = X86::B; \
2321 Size = S; \
2322 break; \
2323 case X86::A##_ND: \
2324 Opc = X86::B##_ND; \
2325 Size = S; \
2326 break; \
2327 case X86::B: \
2328 Opc = X86::A; \
2329 Size = S; \
2330 break; \
2331 case X86::B##_ND: \
2332 Opc = X86::A##_ND; \
2333 Size = S; \
2334 break;
2335
2336 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2337 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2338 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2339#undef FROM_TO_SIZE
2340 }
2341 WorkingMI = CloneIfNew(MI);
2342 WorkingMI->setDesc(get(Opc));
2343 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2344 break;
2345 }
2346 case X86::PFSUBrr:
2347 case X86::PFSUBRrr:
2348 // PFSUB x, y: x = x - y
2349 // PFSUBR x, y: x = y - x
2350 WorkingMI = CloneIfNew(MI);
2351 WorkingMI->setDesc(
2352 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2353 break;
2354 case X86::BLENDPDrri:
2355 case X86::BLENDPSrri:
2356 case X86::PBLENDWrri:
2357 case X86::VBLENDPDrri:
2358 case X86::VBLENDPSrri:
2359 case X86::VBLENDPDYrri:
2360 case X86::VBLENDPSYrri:
2361 case X86::VPBLENDDrri:
2362 case X86::VPBLENDWrri:
2363 case X86::VPBLENDDYrri:
2364 case X86::VPBLENDWYrri: {
2365 int8_t Mask;
2366 switch (Opc) {
2367 default:
2368 llvm_unreachable("Unreachable!");
2369 case X86::BLENDPDrri:
2370 Mask = (int8_t)0x03;
2371 break;
2372 case X86::BLENDPSrri:
2373 Mask = (int8_t)0x0F;
2374 break;
2375 case X86::PBLENDWrri:
2376 Mask = (int8_t)0xFF;
2377 break;
2378 case X86::VBLENDPDrri:
2379 Mask = (int8_t)0x03;
2380 break;
2381 case X86::VBLENDPSrri:
2382 Mask = (int8_t)0x0F;
2383 break;
2384 case X86::VBLENDPDYrri:
2385 Mask = (int8_t)0x0F;
2386 break;
2387 case X86::VBLENDPSYrri:
2388 Mask = (int8_t)0xFF;
2389 break;
2390 case X86::VPBLENDDrri:
2391 Mask = (int8_t)0x0F;
2392 break;
2393 case X86::VPBLENDWrri:
2394 Mask = (int8_t)0xFF;
2395 break;
2396 case X86::VPBLENDDYrri:
2397 Mask = (int8_t)0xFF;
2398 break;
2399 case X86::VPBLENDWYrri:
2400 Mask = (int8_t)0xFF;
2401 break;
2402 }
2403 // Only the least significant bits of Imm are used.
2404 // Using int8_t to ensure it will be sign extended to the int64_t that
2405 // setImm takes in order to match isel behavior.
2406 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2407 WorkingMI = CloneIfNew(MI);
2408 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2409 break;
2410 }
2411 case X86::INSERTPSrri:
2412 case X86::VINSERTPSrri:
2413 case X86::VINSERTPSZrri: {
2414 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2415 unsigned ZMask = Imm & 15;
2416 unsigned DstIdx = (Imm >> 4) & 3;
2417 unsigned SrcIdx = (Imm >> 6) & 3;
2418
2419 // We can commute insertps if we zero 2 of the elements, the insertion is
2420 // "inline" and we don't override the insertion with a zero.
2421 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2422 llvm::popcount(ZMask) == 2) {
2423 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2424 assert(AltIdx < 4 && "Illegal insertion index");
2425 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2426 WorkingMI = CloneIfNew(MI);
2427 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2428 break;
2429 }
2430 return nullptr;
2431 }
2432 case X86::MOVSDrr:
2433 case X86::MOVSSrr:
2434 case X86::VMOVSDrr:
2435 case X86::VMOVSSrr: {
2436 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2437 if (Subtarget.hasSSE41()) {
2438 unsigned Mask;
2439 switch (Opc) {
2440 default:
2441 llvm_unreachable("Unreachable!");
2442 case X86::MOVSDrr:
2443 Opc = X86::BLENDPDrri;
2444 Mask = 0x02;
2445 break;
2446 case X86::MOVSSrr:
2447 Opc = X86::BLENDPSrri;
2448 Mask = 0x0E;
2449 break;
2450 case X86::VMOVSDrr:
2451 Opc = X86::VBLENDPDrri;
2452 Mask = 0x02;
2453 break;
2454 case X86::VMOVSSrr:
2455 Opc = X86::VBLENDPSrri;
2456 Mask = 0x0E;
2457 break;
2458 }
2459
2460 WorkingMI = CloneIfNew(MI);
2461 WorkingMI->setDesc(get(Opc));
2462 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2463 break;
2464 }
2465
2466 assert(Opc == X86::MOVSDrr && "Only MOVSD can commute to SHUFPD");
2467 WorkingMI = CloneIfNew(MI);
2468 WorkingMI->setDesc(get(X86::SHUFPDrri));
2469 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2470 break;
2471 }
2472 case X86::SHUFPDrri: {
2473 // Commute to MOVSD.
2474 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2475 WorkingMI = CloneIfNew(MI);
2476 WorkingMI->setDesc(get(X86::MOVSDrr));
2477 WorkingMI->removeOperand(3);
2478 break;
2479 }
2480 case X86::PCLMULQDQrri:
2481 case X86::VPCLMULQDQrri:
2482 case X86::VPCLMULQDQYrri:
2483 case X86::VPCLMULQDQZrri:
2484 case X86::VPCLMULQDQZ128rri:
2485 case X86::VPCLMULQDQZ256rri: {
2486 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2487 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2488 unsigned Imm = MI.getOperand(3).getImm();
2489 unsigned Src1Hi = Imm & 0x01;
2490 unsigned Src2Hi = Imm & 0x10;
2491 WorkingMI = CloneIfNew(MI);
2492 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2493 break;
2494 }
2495 case X86::VPCMPBZ128rri:
2496 case X86::VPCMPUBZ128rri:
2497 case X86::VPCMPBZ256rri:
2498 case X86::VPCMPUBZ256rri:
2499 case X86::VPCMPBZrri:
2500 case X86::VPCMPUBZrri:
2501 case X86::VPCMPDZ128rri:
2502 case X86::VPCMPUDZ128rri:
2503 case X86::VPCMPDZ256rri:
2504 case X86::VPCMPUDZ256rri:
2505 case X86::VPCMPDZrri:
2506 case X86::VPCMPUDZrri:
2507 case X86::VPCMPQZ128rri:
2508 case X86::VPCMPUQZ128rri:
2509 case X86::VPCMPQZ256rri:
2510 case X86::VPCMPUQZ256rri:
2511 case X86::VPCMPQZrri:
2512 case X86::VPCMPUQZrri:
2513 case X86::VPCMPWZ128rri:
2514 case X86::VPCMPUWZ128rri:
2515 case X86::VPCMPWZ256rri:
2516 case X86::VPCMPUWZ256rri:
2517 case X86::VPCMPWZrri:
2518 case X86::VPCMPUWZrri:
2519 case X86::VPCMPBZ128rrik:
2520 case X86::VPCMPUBZ128rrik:
2521 case X86::VPCMPBZ256rrik:
2522 case X86::VPCMPUBZ256rrik:
2523 case X86::VPCMPBZrrik:
2524 case X86::VPCMPUBZrrik:
2525 case X86::VPCMPDZ128rrik:
2526 case X86::VPCMPUDZ128rrik:
2527 case X86::VPCMPDZ256rrik:
2528 case X86::VPCMPUDZ256rrik:
2529 case X86::VPCMPDZrrik:
2530 case X86::VPCMPUDZrrik:
2531 case X86::VPCMPQZ128rrik:
2532 case X86::VPCMPUQZ128rrik:
2533 case X86::VPCMPQZ256rrik:
2534 case X86::VPCMPUQZ256rrik:
2535 case X86::VPCMPQZrrik:
2536 case X86::VPCMPUQZrrik:
2537 case X86::VPCMPWZ128rrik:
2538 case X86::VPCMPUWZ128rrik:
2539 case X86::VPCMPWZ256rrik:
2540 case X86::VPCMPUWZ256rrik:
2541 case X86::VPCMPWZrrik:
2542 case X86::VPCMPUWZrrik:
2543 WorkingMI = CloneIfNew(MI);
2544 // Flip comparison mode immediate (if necessary).
2545 WorkingMI->getOperand(MI.getNumOperands() - 1)
2547 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2548 break;
2549 case X86::VPCOMBri:
2550 case X86::VPCOMUBri:
2551 case X86::VPCOMDri:
2552 case X86::VPCOMUDri:
2553 case X86::VPCOMQri:
2554 case X86::VPCOMUQri:
2555 case X86::VPCOMWri:
2556 case X86::VPCOMUWri:
2557 WorkingMI = CloneIfNew(MI);
2558 // Flip comparison mode immediate (if necessary).
2559 WorkingMI->getOperand(3).setImm(
2560 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2561 break;
2562 case X86::VCMPSDZrri:
2563 case X86::VCMPSSZrri:
2564 case X86::VCMPPDZrri:
2565 case X86::VCMPPSZrri:
2566 case X86::VCMPSHZrri:
2567 case X86::VCMPPHZrri:
2568 case X86::VCMPPHZ128rri:
2569 case X86::VCMPPHZ256rri:
2570 case X86::VCMPPDZ128rri:
2571 case X86::VCMPPSZ128rri:
2572 case X86::VCMPPDZ256rri:
2573 case X86::VCMPPSZ256rri:
2574 case X86::VCMPPDZrrik:
2575 case X86::VCMPPSZrrik:
2576 case X86::VCMPPDZ128rrik:
2577 case X86::VCMPPSZ128rrik:
2578 case X86::VCMPPDZ256rrik:
2579 case X86::VCMPPSZ256rrik:
2580 WorkingMI = CloneIfNew(MI);
2581 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2583 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2584 break;
2585 case X86::VPERM2F128rri:
2586 case X86::VPERM2I128rri:
2587 // Flip permute source immediate.
2588 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2589 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2590 WorkingMI = CloneIfNew(MI);
2591 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2592 break;
2593 case X86::MOVHLPSrr:
2594 case X86::UNPCKHPDrr:
2595 case X86::VMOVHLPSrr:
2596 case X86::VUNPCKHPDrr:
2597 case X86::VMOVHLPSZrr:
2598 case X86::VUNPCKHPDZ128rr:
2599 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2600
2601 switch (Opc) {
2602 default:
2603 llvm_unreachable("Unreachable!");
2604 case X86::MOVHLPSrr:
2605 Opc = X86::UNPCKHPDrr;
2606 break;
2607 case X86::UNPCKHPDrr:
2608 Opc = X86::MOVHLPSrr;
2609 break;
2610 case X86::VMOVHLPSrr:
2611 Opc = X86::VUNPCKHPDrr;
2612 break;
2613 case X86::VUNPCKHPDrr:
2614 Opc = X86::VMOVHLPSrr;
2615 break;
2616 case X86::VMOVHLPSZrr:
2617 Opc = X86::VUNPCKHPDZ128rr;
2618 break;
2619 case X86::VUNPCKHPDZ128rr:
2620 Opc = X86::VMOVHLPSZrr;
2621 break;
2622 }
2623 WorkingMI = CloneIfNew(MI);
2624 WorkingMI->setDesc(get(Opc));
2625 break;
2626 CASE_ND(CMOV16rr)
2627 CASE_ND(CMOV32rr)
2628 CASE_ND(CMOV64rr) {
2629 WorkingMI = CloneIfNew(MI);
2630 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2631 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2633 break;
2634 }
2635 case X86::VPTERNLOGDZrri:
2636 case X86::VPTERNLOGDZrmi:
2637 case X86::VPTERNLOGDZ128rri:
2638 case X86::VPTERNLOGDZ128rmi:
2639 case X86::VPTERNLOGDZ256rri:
2640 case X86::VPTERNLOGDZ256rmi:
2641 case X86::VPTERNLOGQZrri:
2642 case X86::VPTERNLOGQZrmi:
2643 case X86::VPTERNLOGQZ128rri:
2644 case X86::VPTERNLOGQZ128rmi:
2645 case X86::VPTERNLOGQZ256rri:
2646 case X86::VPTERNLOGQZ256rmi:
2647 case X86::VPTERNLOGDZrrik:
2648 case X86::VPTERNLOGDZ128rrik:
2649 case X86::VPTERNLOGDZ256rrik:
2650 case X86::VPTERNLOGQZrrik:
2651 case X86::VPTERNLOGQZ128rrik:
2652 case X86::VPTERNLOGQZ256rrik:
2653 case X86::VPTERNLOGDZrrikz:
2654 case X86::VPTERNLOGDZrmikz:
2655 case X86::VPTERNLOGDZ128rrikz:
2656 case X86::VPTERNLOGDZ128rmikz:
2657 case X86::VPTERNLOGDZ256rrikz:
2658 case X86::VPTERNLOGDZ256rmikz:
2659 case X86::VPTERNLOGQZrrikz:
2660 case X86::VPTERNLOGQZrmikz:
2661 case X86::VPTERNLOGQZ128rrikz:
2662 case X86::VPTERNLOGQZ128rmikz:
2663 case X86::VPTERNLOGQZ256rrikz:
2664 case X86::VPTERNLOGQZ256rmikz:
2665 case X86::VPTERNLOGDZ128rmbi:
2666 case X86::VPTERNLOGDZ256rmbi:
2667 case X86::VPTERNLOGDZrmbi:
2668 case X86::VPTERNLOGQZ128rmbi:
2669 case X86::VPTERNLOGQZ256rmbi:
2670 case X86::VPTERNLOGQZrmbi:
2671 case X86::VPTERNLOGDZ128rmbikz:
2672 case X86::VPTERNLOGDZ256rmbikz:
2673 case X86::VPTERNLOGDZrmbikz:
2674 case X86::VPTERNLOGQZ128rmbikz:
2675 case X86::VPTERNLOGQZ256rmbikz:
2676 case X86::VPTERNLOGQZrmbikz: {
2677 WorkingMI = CloneIfNew(MI);
2678 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2679 break;
2680 }
2681 default:
2683 WorkingMI = CloneIfNew(MI);
2685 break;
2686 }
2687
2688 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2689 WorkingMI = CloneIfNew(MI);
2690 WorkingMI->setDesc(
2691 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2692 break;
2693 }
2694 }
2695 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2696}
2697
2698bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2699 unsigned &SrcOpIdx1,
2700 unsigned &SrcOpIdx2,
2701 bool IsIntrinsic) const {
2702 uint64_t TSFlags = MI.getDesc().TSFlags;
2703
2704 unsigned FirstCommutableVecOp = 1;
2705 unsigned LastCommutableVecOp = 3;
2706 unsigned KMaskOp = -1U;
2707 if (X86II::isKMasked(TSFlags)) {
2708 // For k-zero-masked operations it is Ok to commute the first vector
2709 // operand. Unless this is an intrinsic instruction.
2710 // For regular k-masked operations a conservative choice is done as the
2711 // elements of the first vector operand, for which the corresponding bit
2712 // in the k-mask operand is set to 0, are copied to the result of the
2713 // instruction.
2714 // TODO/FIXME: The commute still may be legal if it is known that the
2715 // k-mask operand is set to either all ones or all zeroes.
2716 // It is also Ok to commute the 1st operand if all users of MI use only
2717 // the elements enabled by the k-mask operand. For example,
2718 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2719 // : v1[i];
2720 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2721 // // Ok, to commute v1 in FMADD213PSZrk.
2722
2723 // The k-mask operand has index = 2 for masked and zero-masked operations.
2724 KMaskOp = 2;
2725
2726 // The operand with index = 1 is used as a source for those elements for
2727 // which the corresponding bit in the k-mask is set to 0.
2728 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2729 FirstCommutableVecOp = 3;
2730
2731 LastCommutableVecOp++;
2732 } else if (IsIntrinsic) {
2733 // Commuting the first operand of an intrinsic instruction isn't possible
2734 // unless we can prove that only the lowest element of the result is used.
2735 FirstCommutableVecOp = 2;
2736 }
2737
2738 if (isMem(MI, LastCommutableVecOp))
2739 LastCommutableVecOp--;
2740
2741 // Only the first RegOpsNum operands are commutable.
2742 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2743 // that the operand is not specified/fixed.
2744 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2745 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2746 SrcOpIdx1 == KMaskOp))
2747 return false;
2748 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2749 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2750 SrcOpIdx2 == KMaskOp))
2751 return false;
2752
2753 // Look for two different register operands assumed to be commutable
2754 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2755 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2756 SrcOpIdx2 == CommuteAnyOperandIndex) {
2757 unsigned CommutableOpIdx2 = SrcOpIdx2;
2758
2759 // At least one of operands to be commuted is not specified and
2760 // this method is free to choose appropriate commutable operands.
2761 if (SrcOpIdx1 == SrcOpIdx2)
2762 // Both of operands are not fixed. By default set one of commutable
2763 // operands to the last register operand of the instruction.
2764 CommutableOpIdx2 = LastCommutableVecOp;
2765 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2766 // Only one of operands is not fixed.
2767 CommutableOpIdx2 = SrcOpIdx1;
2768
2769 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2770 // operand and assign its index to CommutableOpIdx1.
2771 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2772
2773 unsigned CommutableOpIdx1;
2774 for (CommutableOpIdx1 = LastCommutableVecOp;
2775 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2776 // Just ignore and skip the k-mask operand.
2777 if (CommutableOpIdx1 == KMaskOp)
2778 continue;
2779
2780 // The commuted operands must have different registers.
2781 // Otherwise, the commute transformation does not change anything and
2782 // is useless then.
2783 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2784 break;
2785 }
2786
2787 // No appropriate commutable operands were found.
2788 if (CommutableOpIdx1 < FirstCommutableVecOp)
2789 return false;
2790
2791 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2792 // to return those values.
2793 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2794 CommutableOpIdx2))
2795 return false;
2796 }
2797
2798 return true;
2799}
2800
2802 unsigned &SrcOpIdx1,
2803 unsigned &SrcOpIdx2) const {
2804 const MCInstrDesc &Desc = MI.getDesc();
2805 if (!Desc.isCommutable())
2806 return false;
2807
2808 switch (MI.getOpcode()) {
2809 case X86::CMPSDrri:
2810 case X86::CMPSSrri:
2811 case X86::CMPPDrri:
2812 case X86::CMPPSrri:
2813 case X86::VCMPSDrri:
2814 case X86::VCMPSSrri:
2815 case X86::VCMPPDrri:
2816 case X86::VCMPPSrri:
2817 case X86::VCMPPDYrri:
2818 case X86::VCMPPSYrri:
2819 case X86::VCMPSDZrri:
2820 case X86::VCMPSSZrri:
2821 case X86::VCMPPDZrri:
2822 case X86::VCMPPSZrri:
2823 case X86::VCMPSHZrri:
2824 case X86::VCMPPHZrri:
2825 case X86::VCMPPHZ128rri:
2826 case X86::VCMPPHZ256rri:
2827 case X86::VCMPPDZ128rri:
2828 case X86::VCMPPSZ128rri:
2829 case X86::VCMPPDZ256rri:
2830 case X86::VCMPPSZ256rri:
2831 case X86::VCMPPDZrrik:
2832 case X86::VCMPPSZrrik:
2833 case X86::VCMPPDZ128rrik:
2834 case X86::VCMPPSZ128rrik:
2835 case X86::VCMPPDZ256rrik:
2836 case X86::VCMPPSZ256rrik: {
2837 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2838
2839 // Float comparison can be safely commuted for
2840 // Ordered/Unordered/Equal/NotEqual tests
2841 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2842 switch (Imm) {
2843 default:
2844 // EVEX versions can be commuted.
2845 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2846 break;
2847 return false;
2848 case 0x00: // EQUAL
2849 case 0x03: // UNORDERED
2850 case 0x04: // NOT EQUAL
2851 case 0x07: // ORDERED
2852 break;
2853 }
2854
2855 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2856 // when masked).
2857 // Assign them to the returned operand indices here.
2858 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2859 2 + OpOffset);
2860 }
2861 case X86::MOVSSrr:
2862 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2863 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2864 // AVX implies sse4.1.
2865 if (Subtarget.hasSSE41())
2866 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2867 return false;
2868 case X86::SHUFPDrri:
2869 // We can commute this to MOVSD.
2870 if (MI.getOperand(3).getImm() == 0x02)
2871 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2872 return false;
2873 case X86::MOVHLPSrr:
2874 case X86::UNPCKHPDrr:
2875 case X86::VMOVHLPSrr:
2876 case X86::VUNPCKHPDrr:
2877 case X86::VMOVHLPSZrr:
2878 case X86::VUNPCKHPDZ128rr:
2879 if (Subtarget.hasSSE2())
2880 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2881 return false;
2882 case X86::VPTERNLOGDZrri:
2883 case X86::VPTERNLOGDZrmi:
2884 case X86::VPTERNLOGDZ128rri:
2885 case X86::VPTERNLOGDZ128rmi:
2886 case X86::VPTERNLOGDZ256rri:
2887 case X86::VPTERNLOGDZ256rmi:
2888 case X86::VPTERNLOGQZrri:
2889 case X86::VPTERNLOGQZrmi:
2890 case X86::VPTERNLOGQZ128rri:
2891 case X86::VPTERNLOGQZ128rmi:
2892 case X86::VPTERNLOGQZ256rri:
2893 case X86::VPTERNLOGQZ256rmi:
2894 case X86::VPTERNLOGDZrrik:
2895 case X86::VPTERNLOGDZ128rrik:
2896 case X86::VPTERNLOGDZ256rrik:
2897 case X86::VPTERNLOGQZrrik:
2898 case X86::VPTERNLOGQZ128rrik:
2899 case X86::VPTERNLOGQZ256rrik:
2900 case X86::VPTERNLOGDZrrikz:
2901 case X86::VPTERNLOGDZrmikz:
2902 case X86::VPTERNLOGDZ128rrikz:
2903 case X86::VPTERNLOGDZ128rmikz:
2904 case X86::VPTERNLOGDZ256rrikz:
2905 case X86::VPTERNLOGDZ256rmikz:
2906 case X86::VPTERNLOGQZrrikz:
2907 case X86::VPTERNLOGQZrmikz:
2908 case X86::VPTERNLOGQZ128rrikz:
2909 case X86::VPTERNLOGQZ128rmikz:
2910 case X86::VPTERNLOGQZ256rrikz:
2911 case X86::VPTERNLOGQZ256rmikz:
2912 case X86::VPTERNLOGDZ128rmbi:
2913 case X86::VPTERNLOGDZ256rmbi:
2914 case X86::VPTERNLOGDZrmbi:
2915 case X86::VPTERNLOGQZ128rmbi:
2916 case X86::VPTERNLOGQZ256rmbi:
2917 case X86::VPTERNLOGQZrmbi:
2918 case X86::VPTERNLOGDZ128rmbikz:
2919 case X86::VPTERNLOGDZ256rmbikz:
2920 case X86::VPTERNLOGDZrmbikz:
2921 case X86::VPTERNLOGQZ128rmbikz:
2922 case X86::VPTERNLOGQZ256rmbikz:
2923 case X86::VPTERNLOGQZrmbikz:
2924 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2925 case X86::VPDPWSSDYrr:
2926 case X86::VPDPWSSDrr:
2927 case X86::VPDPWSSDSYrr:
2928 case X86::VPDPWSSDSrr:
2929 case X86::VPDPWUUDrr:
2930 case X86::VPDPWUUDYrr:
2931 case X86::VPDPWUUDSrr:
2932 case X86::VPDPWUUDSYrr:
2933 case X86::VPDPBSSDSrr:
2934 case X86::VPDPBSSDSYrr:
2935 case X86::VPDPBSSDrr:
2936 case X86::VPDPBSSDYrr:
2937 case X86::VPDPBUUDSrr:
2938 case X86::VPDPBUUDSYrr:
2939 case X86::VPDPBUUDrr:
2940 case X86::VPDPBUUDYrr:
2941 case X86::VPDPBSSDSZ128r:
2942 case X86::VPDPBSSDSZ128rk:
2943 case X86::VPDPBSSDSZ128rkz:
2944 case X86::VPDPBSSDSZ256r:
2945 case X86::VPDPBSSDSZ256rk:
2946 case X86::VPDPBSSDSZ256rkz:
2947 case X86::VPDPBSSDSZr:
2948 case X86::VPDPBSSDSZrk:
2949 case X86::VPDPBSSDSZrkz:
2950 case X86::VPDPBSSDZ128r:
2951 case X86::VPDPBSSDZ128rk:
2952 case X86::VPDPBSSDZ128rkz:
2953 case X86::VPDPBSSDZ256r:
2954 case X86::VPDPBSSDZ256rk:
2955 case X86::VPDPBSSDZ256rkz:
2956 case X86::VPDPBSSDZr:
2957 case X86::VPDPBSSDZrk:
2958 case X86::VPDPBSSDZrkz:
2959 case X86::VPDPBUUDSZ128r:
2960 case X86::VPDPBUUDSZ128rk:
2961 case X86::VPDPBUUDSZ128rkz:
2962 case X86::VPDPBUUDSZ256r:
2963 case X86::VPDPBUUDSZ256rk:
2964 case X86::VPDPBUUDSZ256rkz:
2965 case X86::VPDPBUUDSZr:
2966 case X86::VPDPBUUDSZrk:
2967 case X86::VPDPBUUDSZrkz:
2968 case X86::VPDPBUUDZ128r:
2969 case X86::VPDPBUUDZ128rk:
2970 case X86::VPDPBUUDZ128rkz:
2971 case X86::VPDPBUUDZ256r:
2972 case X86::VPDPBUUDZ256rk:
2973 case X86::VPDPBUUDZ256rkz:
2974 case X86::VPDPBUUDZr:
2975 case X86::VPDPBUUDZrk:
2976 case X86::VPDPBUUDZrkz:
2977 case X86::VPDPWSSDZ128r:
2978 case X86::VPDPWSSDZ128rk:
2979 case X86::VPDPWSSDZ128rkz:
2980 case X86::VPDPWSSDZ256r:
2981 case X86::VPDPWSSDZ256rk:
2982 case X86::VPDPWSSDZ256rkz:
2983 case X86::VPDPWSSDZr:
2984 case X86::VPDPWSSDZrk:
2985 case X86::VPDPWSSDZrkz:
2986 case X86::VPDPWSSDSZ128r:
2987 case X86::VPDPWSSDSZ128rk:
2988 case X86::VPDPWSSDSZ128rkz:
2989 case X86::VPDPWSSDSZ256r:
2990 case X86::VPDPWSSDSZ256rk:
2991 case X86::VPDPWSSDSZ256rkz:
2992 case X86::VPDPWSSDSZr:
2993 case X86::VPDPWSSDSZrk:
2994 case X86::VPDPWSSDSZrkz:
2995 case X86::VPDPWUUDZ128r:
2996 case X86::VPDPWUUDZ128rk:
2997 case X86::VPDPWUUDZ128rkz:
2998 case X86::VPDPWUUDZ256r:
2999 case X86::VPDPWUUDZ256rk:
3000 case X86::VPDPWUUDZ256rkz:
3001 case X86::VPDPWUUDZr:
3002 case X86::VPDPWUUDZrk:
3003 case X86::VPDPWUUDZrkz:
3004 case X86::VPDPWUUDSZ128r:
3005 case X86::VPDPWUUDSZ128rk:
3006 case X86::VPDPWUUDSZ128rkz:
3007 case X86::VPDPWUUDSZ256r:
3008 case X86::VPDPWUUDSZ256rk:
3009 case X86::VPDPWUUDSZ256rkz:
3010 case X86::VPDPWUUDSZr:
3011 case X86::VPDPWUUDSZrk:
3012 case X86::VPDPWUUDSZrkz:
3013 case X86::VPMADD52HUQrr:
3014 case X86::VPMADD52HUQYrr:
3015 case X86::VPMADD52HUQZ128r:
3016 case X86::VPMADD52HUQZ128rk:
3017 case X86::VPMADD52HUQZ128rkz:
3018 case X86::VPMADD52HUQZ256r:
3019 case X86::VPMADD52HUQZ256rk:
3020 case X86::VPMADD52HUQZ256rkz:
3021 case X86::VPMADD52HUQZr:
3022 case X86::VPMADD52HUQZrk:
3023 case X86::VPMADD52HUQZrkz:
3024 case X86::VPMADD52LUQrr:
3025 case X86::VPMADD52LUQYrr:
3026 case X86::VPMADD52LUQZ128r:
3027 case X86::VPMADD52LUQZ128rk:
3028 case X86::VPMADD52LUQZ128rkz:
3029 case X86::VPMADD52LUQZ256r:
3030 case X86::VPMADD52LUQZ256rk:
3031 case X86::VPMADD52LUQZ256rkz:
3032 case X86::VPMADD52LUQZr:
3033 case X86::VPMADD52LUQZrk:
3034 case X86::VPMADD52LUQZrkz:
3035 case X86::VFMADDCPHZr:
3036 case X86::VFMADDCPHZrk:
3037 case X86::VFMADDCPHZrkz:
3038 case X86::VFMADDCPHZ128r:
3039 case X86::VFMADDCPHZ128rk:
3040 case X86::VFMADDCPHZ128rkz:
3041 case X86::VFMADDCPHZ256r:
3042 case X86::VFMADDCPHZ256rk:
3043 case X86::VFMADDCPHZ256rkz:
3044 case X86::VFMADDCSHZr:
3045 case X86::VFMADDCSHZrk:
3046 case X86::VFMADDCSHZrkz: {
3047 unsigned CommutableOpIdx1 = 2;
3048 unsigned CommutableOpIdx2 = 3;
3049 if (X86II::isKMasked(Desc.TSFlags)) {
3050 // Skip the mask register.
3051 ++CommutableOpIdx1;
3052 ++CommutableOpIdx2;
3053 }
3054 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3055 CommutableOpIdx2))
3056 return false;
3057 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3058 // No idea.
3059 return false;
3060 return true;
3061 }
3062
3063 default:
3064 const X86InstrFMA3Group *FMA3Group =
3065 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3066 if (FMA3Group)
3067 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3068 FMA3Group->isIntrinsic());
3069
3070 // Handled masked instructions since we need to skip over the mask input
3071 // and the preserved input.
3072 if (X86II::isKMasked(Desc.TSFlags)) {
3073 // First assume that the first input is the mask operand and skip past it.
3074 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3075 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3076 // Check if the first input is tied. If there isn't one then we only
3077 // need to skip the mask operand which we did above.
3078 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3079 MCOI::TIED_TO) != -1)) {
3080 // If this is zero masking instruction with a tied operand, we need to
3081 // move the first index back to the first input since this must
3082 // be a 3 input instruction and we want the first two non-mask inputs.
3083 // Otherwise this is a 2 input instruction with a preserved input and
3084 // mask, so we need to move the indices to skip one more input.
3085 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3086 ++CommutableOpIdx1;
3087 ++CommutableOpIdx2;
3088 } else {
3089 --CommutableOpIdx1;
3090 }
3091 }
3092
3093 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3094 CommutableOpIdx2))
3095 return false;
3096
3097 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3098 !MI.getOperand(SrcOpIdx2).isReg())
3099 // No idea.
3100 return false;
3101 return true;
3102 }
3103
3104 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3105 }
3106 return false;
3107}
3108
// Only the LEA32r, LEA64r and LEA64_32r opcodes are considered; any other
// opcode is rejected immediately.
3110 unsigned Opcode = MI->getOpcode();
3111 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3112 Opcode != X86::LEA64_32r)
3113 return false;
3114
// The address operands are read at offset 1, i.e. after operand 0.
3115 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3116 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3117 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3118
// Reject the LEA if it has a segment register, a displacement that is not the
// immediate 0, or a scale greater than 1.
3119 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3120 Scale.getImm() > 1)
3121 return false;
3122
3123 return true;
3124}
3125
3127 // Currently we're interested in following sequence only.
3128 // r3 = lea r1, r2
3129 // r5 = add r3, r4
3130 // Both r3 and r4 are killed in add, we hope the add instruction has the
3131 // operand order
3132 // r5 = add r4, r3
3133 // So later in X86FixupLEAs the lea instruction can be rewritten as add.
//
// On a match this returns true and sets Commute: true when operand 1 is
// defined by a convertible LEA in the same block, false when operand 2 is.
// If neither operand matches (or the opcode is not ADD32rr/ADD64rr) it
// returns false and leaves Commute untouched.
3134 unsigned Opcode = MI.getOpcode();
3135 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3136 return false;
3137
3138 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3139 Register Reg1 = MI.getOperand(1).getReg();
3140 Register Reg2 = MI.getOperand(2).getReg();
3141
3142 // Check if Reg1 comes from LEA in the same MBB.
// Operand 1 is tested first, so it wins when both operands qualify.
3143 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3144 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3145 Commute = true;
3146 return true;
3147 }
3148 }
3149
3150 // Check if Reg2 comes from LEA in the same MBB.
3151 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3152 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3153 Commute = false;
3154 return true;
3155 }
3156 }
3157
3158 return false;
3159}
3160
// Returns the position of the condition-code operand counted among the use
// operands only (defs excluded), or -1 when the opcode is none of
// JCC/SETCC/SETZUCC/CMOVCC/CFCMOVCC/CCMPCC/CTESTCC.
3162 unsigned Opcode = MCID.getOpcode();
3163 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3164 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3165 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3166 return -1;
3167 // Assume that condition code is always the last use operand.
3168 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3169 return NumUses - 1;
3170}
3171
// Reads the condition-code immediate of MI, or yields COND_INVALID when the
// descriptor has no condition operand (negative index).
3173 const MCInstrDesc &MCID = MI.getDesc();
3174 int CondNo = getCondSrcNoFromDesc(MCID);
3175 if (CondNo < 0)
3176 return X86::COND_INVALID;
// CondNo counts use operands only; skip the defs to get the operand index.
3177 CondNo += MCID.getNumDefs();
3178 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3179}
3180
3182 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3184}
3185
3187 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3190}
3191
3193 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3195}
3196
3198 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3200}
3201
3203 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3206}
3207
3209 // CCMP/CTEST has two conditional operands:
3210 // - SCC: source conditional code (same as CMOV)
3211 // - DCF: destination conditional flags, which has 4 valid bits
3212 //
3213 // +----+----+----+----+
3214 // | OF | SF | ZF | CF |
3215 // +----+----+----+----+
3216 //
3217 // If SCC (source conditional code) evaluates to false, CCMP/CTEST updates
3218 // the conditional flags as follows:
3219 //
3220 // OF = DCF.OF
3221 // SF = DCF.SF
3222 // ZF = DCF.ZF
3223 // CF = DCF.CF
3224 // PF = DCF.CF
3225 // AF = 0 (Auxiliary Carry Flag)
3226 //
3227 // Otherwise, the CMP or TEST is executed and it updates the
3228 // CSPAZO flags normally.
3229 //
3230 // NOTE:
3231 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3232 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
3233
// Bit values follow the DCF layout drawn above (CF is the lowest bit). PF is
// an alias of CF because, per the table above, PF is loaded from DCF.CF.
3234 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF };
3235
// Each condition maps to the single DCF bit returned below, or to 0 for the
// first group of cases; any other code hits llvm_unreachable.
3236 switch (CC) {
3237 default:
3238 llvm_unreachable("Illegal condition code!");
3239 case X86::COND_NO:
3240 case X86::COND_NE:
3241 case X86::COND_GE:
3242 case X86::COND_G:
3243 case X86::COND_AE:
3244 case X86::COND_A:
3245 case X86::COND_NS:
3246 case X86::COND_NP:
3247 return 0;
3248 case X86::COND_O:
3249 return OF;
3250 case X86::COND_B:
3251 case X86::COND_BE:
3252 return CF;
// NOTE(review): the break below can never execute; it follows a return.
3253 break;
3254 case X86::COND_E:
3255 case X86::COND_LE:
3256 return ZF;
3257 case X86::COND_S:
3258 case X86::COND_L:
3259 return SF;
3260 case X86::COND_P:
3261 return PF;
3262 }
3263}
3264
3265#define GET_X86_NF_TRANSFORM_TABLE
3266#define GET_X86_ND2NONND_TABLE
3267#include "X86GenInstrMapping.inc"
3268
3270 unsigned Opc) {
// Find the first entry not ordered before Opc; if that entry's OldOpc is not
// exactly Opc (or there is no such entry) report 0, otherwise its NewOpc.
// NOTE(review): lower_bound presumably relies on Table being sorted by
// OldOpc - the callers assert sortedness under EXPENSIVE_CHECKS.
3271 const auto I = llvm::lower_bound(Table, Opc);
3272 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3273}
3274unsigned X86::getNFVariant(unsigned Opc) {
3275#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3276 // Make sure the tables are sorted.
3277 static std::atomic<bool> NFTableChecked(false);
3278 if (!NFTableChecked.load(std::memory_order_relaxed)) {
3279 assert(llvm::is_sorted(X86NFTransformTable) &&
3280 "X86NFTransformTable is not sorted!");
3281 NFTableChecked.store(true, std::memory_order_relaxed);
3282 }
3283#endif
3284 return getNewOpcFromTable(X86NFTransformTable, Opc);
3285}
3286
3287unsigned X86::getNonNDVariant(unsigned Opc) {
3288#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3289 // Make sure the tables are sorted.
3290 static std::atomic<bool> NDTableChecked(false);
3291 if (!NDTableChecked.load(std::memory_order_relaxed)) {
3292 assert(llvm::is_sorted(X86ND2NonNDTable) &&
3293 "X86ND2NonNDTableis not sorted!");
3294 NDTableChecked.store(true, std::memory_order_relaxed);
3295 }
3296#endif
3297 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3298}
3299
3300/// Return the inverse of the specified condition,
3301/// e.g. turning COND_E to COND_NE.
// Every listed code is paired with exactly one other code, so applying the
// mapping twice gives back the input. Unlisted codes hit llvm_unreachable.
3303 switch (CC) {
3304 default:
3305 llvm_unreachable("Illegal condition code!");
3306 case X86::COND_E:
3307 return X86::COND_NE;
3308 case X86::COND_NE:
3309 return X86::COND_E;
3310 case X86::COND_L:
3311 return X86::COND_GE;
3312 case X86::COND_LE:
3313 return X86::COND_G;
3314 case X86::COND_G:
3315 return X86::COND_LE;
3316 case X86::COND_GE:
3317 return X86::COND_L;
3318 case X86::COND_B:
3319 return X86::COND_AE;
3320 case X86::COND_BE:
3321 return X86::COND_A;
3322 case X86::COND_A:
3323 return X86::COND_BE;
3324 case X86::COND_AE:
3325 return X86::COND_B;
3326 case X86::COND_S:
3327 return X86::COND_NS;
3328 case X86::COND_NS:
3329 return X86::COND_S;
3330 case X86::COND_P:
3331 return X86::COND_NP;
3332 case X86::COND_NP:
3333 return X86::COND_P;
3334 case X86::COND_O:
3335 return X86::COND_NO;
3336 case X86::COND_NO:
3337 return X86::COND_O;
// The two composite codes are each other's inverse.
3338 case X86::COND_NE_OR_P:
3339 return X86::COND_E_AND_NP;
3340 case X86::COND_E_AND_NP:
3341 return X86::COND_NE_OR_P;
3342 }
3343}
3344
3345/// Assuming the flags are set by MI(a,b), return the condition code if we
3346/// modify the instructions such that flags are set by MI(b,a).
// Equality codes map to themselves, ordering codes map to their mirrored
// counterpart, and any code not listed yields COND_INVALID.
3348 switch (CC) {
3349 default:
3350 return X86::COND_INVALID;
3351 case X86::COND_E:
3352 return X86::COND_E;
3353 case X86::COND_NE:
3354 return X86::COND_NE;
3355 case X86::COND_L:
3356 return X86::COND_G;
3357 case X86::COND_LE:
3358 return X86::COND_GE;
3359 case X86::COND_G:
3360 return X86::COND_L;
3361 case X86::COND_GE:
3362 return X86::COND_LE;
3363 case X86::COND_B:
3364 return X86::COND_A;
3365 case X86::COND_BE:
3366 return X86::COND_AE;
3367 case X86::COND_A:
3368 return X86::COND_B;
3369 case X86::COND_AE:
3370 return X86::COND_BE;
3371 }
3372}
3373
3374std::pair<X86::CondCode, bool>
3377 bool NeedSwap = false;
// The result pairs the chosen condition code with NeedSwap. NeedSwap is set
// for the floating-point predicates that fall through to the case of their
// mirrored predicate (OLT->OGT, OLE->OGE, UGT->ULT, UGE->ULE).
// NOTE(review): presumably the caller must then swap the compare operands.
3378 switch (Predicate) {
3379 default:
3380 break;
3381 // Floating-point Predicates
3382 case CmpInst::FCMP_UEQ:
3383 CC = X86::COND_E;
3384 break;
3385 case CmpInst::FCMP_OLT:
3386 NeedSwap = true;
3387 [[fallthrough]];
3388 case CmpInst::FCMP_OGT:
3389 CC = X86::COND_A;
3390 break;
3391 case CmpInst::FCMP_OLE:
3392 NeedSwap = true;
3393 [[fallthrough]];
3394 case CmpInst::FCMP_OGE:
3395 CC = X86::COND_AE;
3396 break;
3397 case CmpInst::FCMP_UGT:
3398 NeedSwap = true;
3399 [[fallthrough]];
3400 case CmpInst::FCMP_ULT:
3401 CC = X86::COND_B;
3402 break;
3403 case CmpInst::FCMP_UGE:
3404 NeedSwap = true;
3405 [[fallthrough]];
3406 case CmpInst::FCMP_ULE:
3407 CC = X86::COND_BE;
3408 break;
3409 case CmpInst::FCMP_ONE:
3410 CC = X86::COND_NE;
3411 break;
3412 case CmpInst::FCMP_UNO:
3413 CC = X86::COND_P;
3414 break;
3415 case CmpInst::FCMP_ORD:
3416 CC = X86::COND_NP;
3417 break;
// OEQ and UNE are explicitly mapped to COND_INVALID: no single condition code
// is assigned to them here.
3418 case CmpInst::FCMP_OEQ:
3419 [[fallthrough]];
3420 case CmpInst::FCMP_UNE:
3421 CC = X86::COND_INVALID;
3422 break;
3423
3424 // Integer Predicates
// None of the integer predicates sets NeedSwap.
3425 case CmpInst::ICMP_EQ:
3426 CC = X86::COND_E;
3427 break;
3428 case CmpInst::ICMP_NE:
3429 CC = X86::COND_NE;
3430 break;
3431 case CmpInst::ICMP_UGT:
3432 CC = X86::COND_A;
3433 break;
3434 case CmpInst::ICMP_UGE:
3435 CC = X86::COND_AE;
3436 break;
3437 case CmpInst::ICMP_ULT:
3438 CC = X86::COND_B;
3439 break;
3440 case CmpInst::ICMP_ULE:
3441 CC = X86::COND_BE;
3442 break;
3443 case CmpInst::ICMP_SGT:
3444 CC = X86::COND_G;
3445 break;
3446 case CmpInst::ICMP_SGE:
3447 CC = X86::COND_GE;
3448 break;
3449 case CmpInst::ICMP_SLT:
3450 CC = X86::COND_L;
3451 break;
3452 case CmpInst::ICMP_SLE:
3453 CC = X86::COND_LE;
3454 break;
3455 }
3456
3457 return std::make_pair(CC, NeedSwap);
3458}
3459
3460/// Return a cmov opcode for the given register size in bytes, and operand type.
3461unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3462 bool HasNDD) {
3463 switch (RegBytes) {
3464 default:
3465 llvm_unreachable("Illegal register size!");
3466#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3467 case 2:
3468 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3469 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3470 case 4:
3471 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3472 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3473 case 8:
3474 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3475 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3476 }
3477}
3478
3479/// Get the VPCMP immediate for the given condition.
// The signed and unsigned form of each ordering share one immediate; only
// the conditions listed below are accepted, anything else is unreachable.
3481 switch (CC) {
3482 default:
3483 llvm_unreachable("Unexpected SETCC condition");
3484 case ISD::SETNE:
3485 return 4;
3486 case ISD::SETEQ:
3487 return 0;
3488 case ISD::SETULT:
3489 case ISD::SETLT:
3490 return 1;
3491 case ISD::SETUGT:
3492 case ISD::SETGT:
3493 return 6;
3494 case ISD::SETUGE:
3495 case ISD::SETGE:
3496 return 5;
3497 case ISD::SETULE:
3498 case ISD::SETLE:
3499 return 2;
3500 }
3501}
3502
3503/// Get the VPCMP immediate if the operands are swapped.
3504unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3505 switch (Imm) {
3506 default:
3507 llvm_unreachable("Unreachable!");
3508 case 0x01:
3509 Imm = 0x06;
3510 break; // LT -> NLE
3511 case 0x02:
3512 Imm = 0x05;
3513 break; // LE -> NLT
3514 case 0x05:
3515 Imm = 0x02;
3516 break; // NLT -> LE
3517 case 0x06:
3518 Imm = 0x01;
3519 break; // NLE -> LT
3520 case 0x00: // EQ
3521 case 0x03: // FALSE
3522 case 0x04: // NE
3523 case 0x07: // TRUE
3524 break;
3525 }
3526
3527 return Imm;
3528}
3529
3530/// Get the VPCOM immediate if the operands are swapped.
3531unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3532 switch (Imm) {
3533 default:
3534 llvm_unreachable("Unreachable!");
3535 case 0x00:
3536 Imm = 0x02;
3537 break; // LT -> GT
3538 case 0x01:
3539 Imm = 0x03;
3540 break; // LE -> GE
3541 case 0x02:
3542 Imm = 0x00;
3543 break; // GT -> LT
3544 case 0x03:
3545 Imm = 0x01;
3546 break; // GE -> LE
3547 case 0x04: // EQ
3548 case 0x05: // NE
3549 case 0x06: // FALSE
3550 case 0x07: // TRUE
3551 break;
3552 }
3553
3554 return Imm;
3555}
3556
3557/// Get the VCMP immediate if the operands are swapped.
3558unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3559 // Only need the lower 2 bits to distinquish.
3560 switch (Imm & 0x3) {
3561 default:
3562 llvm_unreachable("Unreachable!");
3563 case 0x00:
3564 case 0x03:
3565 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3566 break;
3567 case 0x01:
3568 case 0x02:
3569 // Need to toggle bits 3:0. Bit 4 stays the same.
3570 Imm ^= 0xf;
3571 break;
3572 }
3573
3574 return Imm;
3575}
3576
// Translate the operand's register class ID into a width: VR128/VR128X give
// 128, VR256/VR256X give 256 and VR512 gives 512. Any other class is treated
// as unreachable.
3578 if (Info.RegClass == X86::VR128RegClassID ||
3579 Info.RegClass == X86::VR128XRegClassID)
3580 return 128;
3581 if (Info.RegClass == X86::VR256RegClassID ||
3582 Info.RegClass == X86::VR256XRegClassID)
3583 return 256;
3584 if (Info.RegClass == X86::VR512RegClassID)
3585 return 512;
3586 llvm_unreachable("Unknown register class!");
3587}
3588
3589/// Return true if the Reg is X87 register.
3590static bool isX87Reg(Register Reg) {
3591 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3592 (Reg >= X86::ST0 && Reg <= X86::ST7));
3593}
3594
3595/// Check whether the instruction is an X87 instruction, i.e. whether any of
/// its register operands is an X87 register.
3597 // Calls and inline asm define X87 registers, so we special-case them here;
3598 // otherwise they would be incorrectly flagged as x87 instructions
3599 // as a result.
3600 if (MI.isCall() || MI.isInlineAsm())
3601 return false;
// MI.operands() is scanned, so every register operand it yields counts.
3602 for (const MachineOperand &MO : MI.operands()) {
3603 if (!MO.isReg())
3604 continue;
3605 if (isX87Reg(MO.getReg()))
3606 return true;
3607 }
3608 return false;
3609}
3610
// Returns the index of the first operand of the memory reference, or -1 when
// none is found.
3612 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3613 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3614 };
3615
3616 const MCInstrDesc &Desc = MI.getDesc();
3617
3618 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3619 // instructions (fast case).
3620 if (!X86II::isPseudo(Desc.TSFlags)) {
3621 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
// A non-negative result still has the operand bias added before returning.
3622 if (MemRefIdx >= 0)
3623 return MemRefIdx + X86II::getOperandBias(Desc);
3624#ifdef EXPENSIVE_CHECKS
3625 assert(none_of(Desc.operands(), IsMemOp) &&
3626 "Got false negative from X86II::getMemoryOperandNo()!");
3627#endif
3628 return -1;
3629 }
3630
3631 // Otherwise, handle pseudo instructions by examining the type of their
3632 // operands (slow case). An instruction cannot have a memory reference if it
3633 // has fewer than AddrNumOperands (= 5) explicit operands.
3634 unsigned NumOps = Desc.getNumOperands();
3635 if (NumOps < X86::AddrNumOperands) {
3636#ifdef EXPENSIVE_CHECKS
3637 assert(none_of(Desc.operands(), IsMemOp) &&
3638 "Expected no operands to have OPERAND_MEMORY type!");
3639#endif
3640 return -1;
3641 }
3642
3643 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3644 // reference. We expect the following AddrNumOperand-1 operands to also have
3645 // OPERAND_MEMORY type.
// NOTE(review): the loop stops at I == NumOps - AddrNumOperands (exclusive),
// so a memory reference occupying the very last AddrNumOperands operands is
// not examined - confirm that this bound is intended.
3646 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3647 if (IsMemOp(Desc.operands()[I])) {
3648#ifdef EXPENSIVE_CHECKS
3649 assert(std::all_of(Desc.operands().begin() + I,
3650 Desc.operands().begin() + I + X86::AddrNumOperands,
3651 IsMemOp) &&
3652 "Expected all five operands in the memory reference to have "
3653 "OPERAND_MEMORY type!");
3654#endif
3655 return I;
3656 }
3657 }
3658
3659 return -1;
3660}
3661
3663 unsigned OpNo) {
// OpNo is the index of the first operand of the memory reference; the
// reference must fit inside MI's operand list.
3664 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3665 "Unexpected number of operands!");
3666
// Only addresses without an index register are accepted.
3667 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3668 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3669 return nullptr;
3670
// The displacement must be a constant-pool index with a zero offset.
3671 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3672 if (!Disp.isCPI() || Disp.getOffset() != 0)
3673 return nullptr;
3674
3676 MI.getParent()->getParent()->getConstantPool()->getConstants();
3677 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3678
3679 // Bail if this is a machine constant pool entry, we won't be able to dig out
3680 // anything useful.
3681 if (ConstantEntry.isMachineConstantPoolEntry())
3682 return nullptr;
3683
3684 return ConstantEntry.Val.ConstVal;
3685}
3686
// True exactly for the TCRETURN* opcodes listed below; every other opcode
// yields false.
3688 switch (MI.getOpcode()) {
3689 case X86::TCRETURNdi:
3690 case X86::TCRETURNri:
3691 case X86::TCRETURNmi:
3692 case X86::TCRETURNdi64:
3693 case X86::TCRETURNri64:
3694 case X86::TCRETURNri64_ImpCall:
3695 case X86::TCRETURNmi64:
3696 return true;
3697 default:
3698 return false;
3699 }
3700}
3701
3704 const MachineInstr &TailCall) const {
3705
// Each check below rejects one situation in which the tail call must stay
// unconditional; true is returned only when all of them pass.
3706 const MachineFunction *MF = TailCall.getMF();
3707
3708 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3709 // Kernel patches thunk calls in runtime, these should never be conditional.
3710 const MachineOperand &Target = TailCall.getOperand(0);
3711 if (Target.isSymbol()) {
3712 StringRef Symbol(Target.getSymbolName());
3713 // this is currently only relevant to r11/kernel indirect thunk.
3714 if (Symbol == "__x86_indirect_thunk_r11")
3715 return false;
3716 }
3717 }
3718
3719 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3720 TailCall.getOpcode() != X86::TCRETURNdi64) {
3721 // Only direct calls can be done with a conditional branch.
3722 return false;
3723 }
3724
3725 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3726 // Conditional tail calls confuse the Win64 unwinder.
3727 return false;
3728 }
3729
3730 assert(BranchCond.size() == 1);
3731 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3732 // Can't make a conditional tail call with this condition.
3733 return false;
3734 }
3735
// Operand 1 of the tail call is its stack adjustment; it and the
// return-address delta must both be zero.
3737 if (X86FI->getTCReturnAddrDelta() != 0 ||
3738 TailCall.getOperand(1).getImm() != 0) {
3739 // A conditional tail call cannot do any stack adjustment.
3740 return false;
3741 }
3742
3743 return true;
3744}
3745
3748 const MachineInstr &TailCall) const {
3749 assert(canMakeTailCallConditional(BranchCond, TailCall));
3750
// Walk backwards over the block, skipping debug instructions, until the
// branch whose condition equals BranchCond[0] is found.
3752 while (I != MBB.begin()) {
3753 --I;
3754 if (I->isDebugInstr())
3755 continue;
// NOTE(review): this assert compiles away in NDEBUG builds, in which case
// execution simply continues with the non-branch instruction.
3756 if (!I->isBranch())
3757 assert(0 && "Can't find the branch to replace!");
3758
3760 assert(BranchCond.size() == 1);
3761 if (CC != BranchCond[0].getImm())
3762 continue;
3763
3764 break;
3765 }
3766
// Pick the conditional counterpart of the (direct) tail call opcode.
3767 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3768 : X86::TCRETURNdi64cc;
3769
// The new instruction is inserted before the branch at I.
3770 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3771 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3772 MIB.addImm(0); // Stack offset (not used).
3773 MIB->addOperand(BranchCond[0]); // Condition.
3774 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3775
3776 // Add implicit uses and defs of all live regs potentially clobbered by the
3777 // call. This way they still appear live across the call.
3778 LivePhysRegs LiveRegs(getRegisterInfo());
3779 LiveRegs.addLiveOuts(MBB);
3781 LiveRegs.stepForward(*MIB, Clobbers);
3782 for (const auto &C : Clobbers) {
3783 MIB.addReg(C.first, RegState::Implicit);
3785 }
3786
// Finally drop the branch that the conditional tail call replaces.
3787 I->eraseFromParent();
3788}
3789
3790// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3791// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3792// fallthrough MBB cannot be identified.
3795 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3796 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3797 // and fallthrough MBB. If we find more than one, we cannot identify the
3798 // fallthrough MBB and should return nullptr.
3799 MachineBasicBlock *FallthroughBB = nullptr;
3800 for (MachineBasicBlock *Succ : MBB->successors()) {
// TBB is only skipped once some candidate has been recorded; if TBB is seen
// first it is recorded provisionally and may be replaced by a later
// successor (the check below tolerates FallthroughBB == TBB).
3801 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3802 continue;
3803 // Return a nullptr if we found more than one fallthrough successor.
3804 if (FallthroughBB && FallthroughBB != TBB)
3805 return nullptr;
3806 FallthroughBB = Succ;
3807 }
3808 return FallthroughBB;
3809}
3810
3811bool X86InstrInfo::analyzeBranchImpl(
3814 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3815
// Returns true when the terminators cannot be analyzed and false on success.
// On success TBB, FBB and Cond describe the branches that were found, and
// CondBranches holds each conditional branch folded into Cond, in the order
// visited (bottom-up); a branch that merely repeats the recorded condition
// and target is not added.
3816 // Start from the bottom of the block and work up, examining the
3817 // terminator instructions.
3819 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3820 while (I != MBB.begin()) {
3821 --I;
3822 if (I->isDebugInstr())
3823 continue;
3824
3825 // Working from the bottom, when we see a non-terminator instruction, we're
3826 // done.
3827 if (!isUnpredicatedTerminator(*I))
3828 break;
3829
3830 // A terminator that isn't a branch can't easily be handled by this
3831 // analysis.
3832 if (!I->isBranch())
3833 return true;
3834
3835 // Handle unconditional branches.
3836 if (I->getOpcode() == X86::JMP_1) {
3837 UnCondBrIter = I;
3838
// Without permission to modify the block, just record the target.
3839 if (!AllowModify) {
3840 TBB = I->getOperand(0).getMBB();
3841 continue;
3842 }
3843
3844 // If the block has any instructions after a JMP, delete them.
3845 MBB.erase(std::next(I), MBB.end());
3846
// Anything recorded from the (now deleted) instructions below is stale.
3847 Cond.clear();
3848 FBB = nullptr;
3849
3850 // Delete the JMP if it's equivalent to a fall-through.
3851 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3852 TBB = nullptr;
3853 I->eraseFromParent();
// Restart the scan from the end of the block after the erase.
3854 I = MBB.end();
3855 UnCondBrIter = MBB.end();
3856 continue;
3857 }
3858
3859 // TBB is used to indicate the unconditional destination.
3860 TBB = I->getOperand(0).getMBB();
3861 continue;
3862 }
3863
3864 // Handle conditional branches.
3865 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3866 if (BranchCode == X86::COND_INVALID)
3867 return true; // Can't handle indirect branch.
3868
3869 // In practice we should never have an undef eflags operand, if we do
3870 // abort here as we are not prepared to preserve the flag.
3871 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3872 return true;
3873
3874 // Working from the bottom, handle the first conditional branch.
3875 if (Cond.empty()) {
// Whatever unconditional target was seen below becomes the false target.
3876 FBB = TBB;
3877 TBB = I->getOperand(0).getMBB();
3878 Cond.push_back(MachineOperand::CreateImm(BranchCode));
3879 CondBranches.push_back(&*I);
3880 continue;
3881 }
3882
3883 // Handle subsequent conditional branches. Only handle the case where all
3884 // conditional branches branch to the same destination and their condition
3885 // opcodes fit one of the special multi-branch idioms.
3886 assert(Cond.size() == 1);
3887 assert(TBB);
3888
3889 // If the conditions are the same, we can leave them alone.
3890 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3891 auto NewTBB = I->getOperand(0).getMBB();
3892 if (OldBranchCode == BranchCode && TBB == NewTBB)
3893 continue;
3894
3895 // If they differ, see if they fit one of the known patterns. Theoretically,
3896 // we could handle more patterns here, but we shouldn't expect to see them
3897 // if instruction selection has done a reasonable job.
3898 if (TBB == NewTBB &&
3899 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3900 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3901 BranchCode = X86::COND_NE_OR_P;
3902 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3903 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3904 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3905 return true;
3906
3907 // X86::COND_E_AND_NP usually has two different branch destinations.
3908 //
3909 // JP B1
3910 // JE B2
3911 // JMP B1
3912 // B1:
3913 // B2:
3914 //
3915 // Here this condition branches to B2 only if NP && E. It has another
3916 // equivalent form:
3917 //
3918 // JNE B1
3919 // JNP B2
3920 // JMP B1
3921 // B1:
3922 // B2:
3923 //
3924 // Similarly it branches to B2 only if E && NP. That is why this condition
3925 // is named with COND_E_AND_NP.
3926 BranchCode = X86::COND_E_AND_NP;
3927 } else
3928 return true;
3929
3930 // Update the MachineOperand.
3931 Cond[0].setImm(BranchCode);
3932 CondBranches.push_back(&*I);
3933 }
3934
3935 return false;
3936}
3937
3940 MachineBasicBlock *&FBB,
3942 bool AllowModify) const {
// Forwards to analyzeBranchImpl; the list of conditional branches it collects
// is a local and is discarded on return.
3943 SmallVector<MachineInstr *, 4> CondBranches;
3944 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3945}
3946
// Returns the jump-table index held in the displacement of MI's memory
// operand, or -1 if the displacement is not a jump-table index. MI is
// required (by assertion) to have a memory operand.
3948 const MCInstrDesc &Desc = MI.getDesc();
3949 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3950 assert(MemRefBegin >= 0 && "instr should have memory operand");
3951 MemRefBegin += X86II::getOperandBias(Desc);
3952
3953 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3954 if (!MO.isJTI())
3955 return -1;
3956
3957 return MO.getIndex();
3958}
3959
3961 Register Reg) {
// -1 is returned unless Reg is a virtual register with a unique definition
// that is an LEA64r or LEA32r.
3962 if (!Reg.isVirtual())
3963 return -1;
3964 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3965 if (MI == nullptr)
3966 return -1;
3967 unsigned Opcode = MI->getOpcode();
3968 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3969 return -1;
3971}
3972
// Returns a jump-table index for MI when one of the two patterns below is
// recognized, and -1 otherwise.
3974 unsigned Opcode = MI.getOpcode();
3975 // Switch-jump pattern for non-PIC code looks like:
3976 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3977 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3979 }
3980 // The pattern for PIC code looks like:
3981 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3982 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3983 // %2 = ADD64rr %1, %0
3984 // JMP64r %2
3985 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3986 Register Reg = MI.getOperand(0).getReg();
3987 if (!Reg.isVirtual())
3988 return -1;
3989 const MachineFunction &MF = *MI.getParent()->getParent();
3990 const MachineRegisterInfo &MRI = MF.getRegInfo();
3991 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3992 if (Add == nullptr)
3993 return -1;
3994 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
3995 return -1;
// Either addend may be the LEA of the jump table; operand 1 is tried first.
3996 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
3997 if (JTI1 >= 0)
3998 return JTI1;
3999 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
4000 if (JTI2 >= 0)
4001 return JTI2;
4002 }
4003 return -1;
4004}
4005
4007 MachineBranchPredicate &MBP,
4008 bool AllowModify) const {
// Returns false only when MBP has been completely filled in (the TEST
// pattern at the end); every other path returns true. Note that
// MBP.TrueDest/FalseDest, and later ConditionDef/SingleUseCondition, may
// already have been written when true is returned.
4009 using namespace std::placeholders;
4010
4012 SmallVector<MachineInstr *, 4> CondBranches;
4013 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4014 AllowModify))
4015 return true;
4016
4017 if (Cond.size() != 1)
4018 return true;
4019
4020 assert(MBP.TrueDest && "expected!");
4021
// A missing false destination means fall-through to the next block.
4022 if (!MBP.FalseDest)
4023 MBP.FalseDest = MBB.getNextNode();
4024
4026
4027 MachineInstr *ConditionDef = nullptr;
4028 bool SingleUseCondition = true;
4029
// Look for the instruction that modifies EFLAGS; any EFLAGS reader seen
// before it means the condition has more than one use.
4031 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4032 ConditionDef = &MI;
4033 break;
4034 }
4035
4036 if (MI.readsRegister(X86::EFLAGS, TRI))
4037 SingleUseCondition = false;
4038 }
4039
4040 if (!ConditionDef)
4041 return true;
4042
// EFLAGS live into a successor also counts as an additional use.
4043 if (SingleUseCondition) {
4044 for (auto *Succ : MBB.successors())
4045 if (Succ->isLiveIn(X86::EFLAGS))
4046 SingleUseCondition = false;
4047 }
4048
4049 MBP.ConditionDef = ConditionDef;
4050 MBP.SingleUseCondition = SingleUseCondition;
4051
4052 // Currently we only recognize the simple pattern:
4053 //
4054 // test %reg, %reg
4055 // je %label
4056 //
4057 const unsigned TestOpcode =
4058 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4059
// Besides je, jne is accepted as well (COND_NE below).
4060 if (ConditionDef->getOpcode() == TestOpcode &&
4061 ConditionDef->getNumOperands() == 3 &&
4062 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4063 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4064 MBP.LHS = ConditionDef->getOperand(0);
4065 MBP.RHS = MachineOperand::CreateImm(0);
4066 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4067 ? MachineBranchPredicate::PRED_NE
4068 : MachineBranchPredicate::PRED_EQ;
4069 return false;
4070 }
4071
4072 return true;
4073}
4074
4076 int *BytesRemoved) const {
// Byte accounting is not implemented, so callers must pass a null pointer.
4077 assert(!BytesRemoved && "code size not handled");
4078
// Count is the number of branch instructions erased; it is the return value.
4080 unsigned Count = 0;
4081
4082 while (I != MBB.begin()) {
4083 --I;
4084 if (I->isDebugInstr())
4085 continue;
4086 if (I->getOpcode() != X86::JMP_1 &&
4088 break;
4089 // Remove the branch.
4090 I->eraseFromParent();
// Restart from the end of the block after the erase.
4091 I = MBB.end();
4092 ++Count;
4093 }
4094
4095 return Count;
4096}
4097
4100 MachineBasicBlock *FBB,
4102 const DebugLoc &DL, int *BytesAdded) const {
// Appends branch instructions to the end of MBB and returns how many were
// added.
4103 // Shouldn't be a fall through.
4104 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4105 assert((Cond.size() == 1 || Cond.size() == 0) &&
4106 "X86 branch conditions have one component!");
4107 assert(!BytesAdded && "code size not handled");
4108
4109 if (Cond.empty()) {
4110 // Unconditional branch?
4111 assert(!FBB && "Unconditional branch with multiple successors!");
4112 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4113 return 1;
4114 }
4115
4116 // If FBB is null, it is implied to be a fall-through block.
// FallThru is captured here because the COND_E_AND_NP case below may assign
// FBB; the trailing JMP decision must use the caller's original value.
4117 bool FallThru = FBB == nullptr;
4118
4119 // Conditional branch.
4120 unsigned Count = 0;
4122 switch (CC) {
4123 case X86::COND_NE_OR_P:
4124 // Synthesize NE_OR_P with two branches.
4125 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4126 ++Count;
4127 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4128 ++Count;
4129 break;
4130 case X86::COND_E_AND_NP:
4131 // Use the next block of MBB as FBB if it is null.
4132 if (FBB == nullptr) {
4133 FBB = getFallThroughMBB(&MBB, TBB);
4134 assert(FBB && "MBB cannot be the last block in function when the false "
4135 "body is a fall-through.");
4136 }
4137 // Synthesize COND_E_AND_NP with two branches.
// The first branch leaves for FBB when NE holds; only if it is not taken can
// the NP branch reach TBB, giving E && NP overall.
4138 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4139 ++Count;
4140 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4141 ++Count;
4142 break;
4143 default: {
4144 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4145 ++Count;
4146 }
4147 }
4148 if (!FallThru) {
4149 // Two-way Conditional branch. Insert the second branch.
4150 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4151 ++Count;
4152 }
4153 return Count;
4154}
4155
// Reports whether a select between TrueReg and FalseReg can be emitted as a
// CMOV; on success the three cycle out-parameters are all set to 2.
// NOTE(review): the listing's embedded numbering skips 4156-4157, 4167 and
// 4171, so the function name, its leading parameters, the condition guarding
// the "composite conditions" return, and the definition of MRI are not visible
// in this excerpt.
4158 Register DstReg, Register TrueReg,
4159 Register FalseReg, int &CondCycles,
4160 int &TrueCycles, int &FalseCycles) const {
4161 // Not all subtargets have cmov instructions.
4162 if (!Subtarget.canUseCMOV())
4163 return false;
4164 if (Cond.size() != 1)
4165 return false;
4166 // We cannot do the composite conditions, at least not in SSA form.
4168 return false;
4169
4170 // Check register classes.
// Both inputs must share a common subclass; a null result means they do not.
4172 const TargetRegisterClass *RC =
4173 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4174 if (!RC)
4175 return false;
4176
4177 // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
4178 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4179 X86::GR32RegClass.hasSubClassEq(RC) ||
4180 X86::GR64RegClass.hasSubClassEq(RC)) {
4181 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4182 // Bridge. Probably Ivy Bridge as well.
4183 CondCycles = 2;
4184 TrueCycles = 2;
4185 FalseCycles = 2;
4186 return true;
4187 }
4188
4189 // Can't do vectors.
// The cycle out-parameters are left untouched on every false return.
4190 return false;
4191}
4192
// Emits a single CMOV into MBB before I that defines DstReg from FalseReg,
// TrueReg and the condition immediate in Cond[0].
// NOTE(review): the listing's embedded numbering skips 4193-4194, 4196 and
// 4198, so the function name, several parameters and the definition of MRI are
// not visible in this excerpt.
4195 const DebugLoc &DL, Register DstReg,
4197 Register FalseReg) const {
4199 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4200 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4201 assert(Cond.size() == 1 && "Invalid Cond array");
// The opcode is chosen from DstReg's class size in bytes (bits / 8), the
// register-only form, and whether the subtarget has NDD.
4202 unsigned Opc =
4203 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4204 false /*HasMemoryOperand*/, Subtarget.hasNDD());
// FalseReg is added before TrueReg. Presumably the first source is the value
// kept when the condition fails - confirm against the CMOV definition.
4205 BuildMI(MBB, I, DL, get(Opc), DstReg)
4206 .addReg(FalseReg)
4207 .addReg(TrueReg)
4208 .addImm(Cond[0].getImm());
4209}
4210
4211/// Test if the given register is a physical h register.
4212static bool isHReg(Register Reg) {
4213 return X86::GR8_ABCD_HRegClass.contains(Reg);
4214}
4215
4216// Try and copy between VR128/VR64 and GR64 registers.
4217static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
4218 const X86Subtarget &Subtarget) {
4219 bool HasAVX = Subtarget.hasAVX();
4220 bool HasAVX512 = Subtarget.hasAVX512();
4221 bool HasEGPR = Subtarget.hasEGPR();
4222
4223 // SrcReg(MaskReg) -> DestReg(GR64)
4224 // SrcReg(MaskReg) -> DestReg(GR32)
4225
4226 // All KMASK RegClasses hold the same k registers, can be tested against
4227 // anyone.
4228 if (X86::VK16RegClass.contains(SrcReg)) {
4229 if (X86::GR64RegClass.contains(DestReg)) {
4230 assert(Subtarget.hasBWI());
4231 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4232 }
4233 if (X86::GR32RegClass.contains(DestReg))
4234 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4235 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4236 }
4237
4238 // SrcReg(GR64) -> DestReg(MaskReg)
4239 // SrcReg(GR32) -> DestReg(MaskReg)
4240
4241 // All KMASK RegClasses hold the same k registers, can be tested against
4242 // anyone.
4243 if (X86::VK16RegClass.contains(DestReg)) {
4244 if (X86::GR64RegClass.contains(SrcReg)) {
4245 assert(Subtarget.hasBWI());
4246 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4247 }
4248 if (X86::GR32RegClass.contains(SrcReg))
4249 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4250 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4251 }
4252
4253 // SrcReg(VR128) -> DestReg(GR64)
4254 // SrcReg(VR64) -> DestReg(GR64)
4255 // SrcReg(GR64) -> DestReg(VR128)
4256 // SrcReg(GR64) -> DestReg(VR64)
4257
4258 if (X86::GR64RegClass.contains(DestReg)) {
4259 if (X86::VR128XRegClass.contains(SrcReg))
4260 // Copy from a VR128 register to a GR64 register.
4261 return HasAVX512 ? X86::VMOVPQIto64Zrr
4262 : HasAVX ? X86::VMOVPQIto64rr
4263 : X86::MOVPQIto64rr;
4264 if (X86::VR64RegClass.contains(SrcReg))
4265 // Copy from a VR64 register to a GR64 register.
4266 return X86::MMX_MOVD64from64rr;
4267 } else if (X86::GR64RegClass.contains(SrcReg)) {
4268 // Copy from a GR64 register to a VR128 register.
4269 if (X86::VR128XRegClass.contains(DestReg))
4270 return HasAVX512 ? X86::VMOV64toPQIZrr
4271 : HasAVX ? X86::VMOV64toPQIrr
4272 : X86::MOV64toPQIrr;
4273 // Copy from a GR64 register to a VR64 register.
4274 if (X86::VR64RegClass.contains(DestReg))
4275 return X86::MMX_MOVD64to64rr;
4276 }
4277
4278 // SrcReg(VR128) -> DestReg(GR32)
4279 // SrcReg(GR32) -> DestReg(VR128)
4280
4281 if (X86::GR32RegClass.contains(DestReg) &&
4282 X86::VR128XRegClass.contains(SrcReg))
4283 // Copy from a VR128 register to a GR32 register.
4284 return HasAVX512 ? X86::VMOVPDI2DIZrr
4285 : HasAVX ? X86::VMOVPDI2DIrr
4286 : X86::MOVPDI2DIrr;
4287
4288 if (X86::VR128XRegClass.contains(DestReg) &&
4289 X86::GR32RegClass.contains(SrcReg))
4290 // Copy from a VR128 register to a VR128 register.
4291 return HasAVX512 ? X86::VMOVDI2PDIZrr
4292 : HasAVX ? X86::VMOVDI2PDIrr
4293 : X86::MOVDI2PDIrr;
4294 return 0;
4295}
4296
// Emits one physical-register copy from SrcReg to DestReg before MI in MBB,
// or aborts via report_fatal_error when no suitable move opcode is found.
// NOTE(review): the listing's embedded numbering skips 4297-4298, 4334 and
// 4349, so the function name, its leading parameters and the definitions of
// the TRI used below are not visible in this excerpt.
4299 const DebugLoc &DL, Register DestReg,
4300 Register SrcReg, bool KillSrc,
4301 bool RenamableDest, bool RenamableSrc) const {
4302 // First deal with the normal symmetric copies.
4303 bool HasAVX = Subtarget.hasAVX();
4304 bool HasVLX = Subtarget.hasVLX();
4305 bool HasEGPR = Subtarget.hasEGPR();
// Opc == 0 means "no opcode chosen yet".
4306 unsigned Opc = 0;
4307 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4308 Opc = X86::MOV64rr;
4309 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4310 Opc = X86::MOV32rr;
4311 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4312 Opc = X86::MOV16rr;
4313 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4314 // Copying to or from a physical H register on x86-64 requires a NOREX
4315 // move. Otherwise use a normal move.
4316 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4317 Opc = X86::MOV8rr_NOREX;
4318 // Both operands must be encodable without an REX prefix.
4319 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4320 "8-bit H register can not be copied outside GR8_NOREX");
4321 } else
4322 Opc = X86::MOV8rr;
4323 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4324 Opc = X86::MMX_MOVQ64rr;
4325 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4326 if (HasVLX)
4327 Opc = X86::VMOVAPSZ128rr;
4328 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4329 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4330 else {
4331 // If this an extended register and we don't have VLX we need to use a
4332 // 512-bit move.
// Both registers are replaced by their VR512 super-registers (via sub_xmm) so
// that the operands match the 512-bit opcode.
4333 Opc = X86::VMOVAPSZrr;
4335 DestReg =
4336 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4337 SrcReg =
4338 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4339 }
4340 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4341 if (HasVLX)
4342 Opc = X86::VMOVAPSZ256rr;
4343 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4344 Opc = X86::VMOVAPSYrr;
4345 else {
4346 // If this an extended register and we don't have VLX we need to use a
4347 // 512-bit move.
// Same widening as the 128-bit case, this time through sub_ymm.
4348 Opc = X86::VMOVAPSZrr;
4350 DestReg =
4351 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4352 SrcReg =
4353 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4354 }
4355 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4356 Opc = X86::VMOVAPSZrr;
4357 // All KMASK RegClasses hold the same k registers, can be tested against
4358 // anyone.
// NOTE(review): the non-BWI arm selects KMOVQkk_EVEX when HasEGPR, while its
// non-EGPR alternative is KMOVWkk and CopyToFromAsymmetricReg uses the
// KMOVW*_EVEX forms for its non-BWI arms. This looks like it may have been
// meant to be KMOVWkk_EVEX - confirm.
4359 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4360 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4361 : (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk);
// No symmetric match: fall back to the cross-class (mask/vector <-> GPR) table.
4362 if (!Opc)
4363 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4364
4365 if (Opc) {
4366 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4367 .addReg(SrcReg, getKillRegState(KillSrc));
4368 return;
4369 }
4370
4371 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4372 // FIXME: We use a fatal error here because historically LLVM has tried
4373 // lower some of these physreg copies and we want to ensure we get
4374 // reasonable bug reports if someone encounters a case no other testing
4375 // found. This path should be removed after the LLVM 7 release.
4376 report_fatal_error("Unable to copy EFLAGS physical register!");
4377 }
4378
// The register names are only printed under LLVM_DEBUG; the fatal error below
// is raised regardless.
4379 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4380 << RI.getName(DestReg) << '\n');
4381 report_fatal_error("Cannot emit physreg copy instruction");
4382}
4383
// Returns operands 0 and 1 of MI as a DestSourcePair when MI is a register
// move, and std::nullopt otherwise.
// NOTE(review): the listing's embedded numbering skips 4385, so the line
// carrying the function name and parameter list is not visible in this excerpt.
4384std::optional<DestSourcePair>
4386 if (MI.isMoveReg()) {
4387 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4388 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4389 // were asserted as 0 are now undef.
// An undef operand 0 that carries a sub-register index is not reported as a
// copy.
4390 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4391 return std::nullopt;
4392
4393 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4394 }
4395 return std::nullopt;
4396}
4397
4398static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4399 if (STI.hasFP16())
4400 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4401 if (Load)
4402 return X86::MOVSHPrm;
4403 return X86::MOVSHPmr;
4404}
4405
// Selects the load (Load == true) or store opcode used to move a register of
// class RC to or from memory, keyed on the spill size of RC in bytes.
// IsStackAligned chooses between the aligned (MOVAPS-style) and unaligned
// (MOVUPS-style) vector forms. Unrecognised sizes or classes hit
// llvm_unreachable or an assert.
// NOTE(review): the listing's embedded numbering skips 4406, so the first
// signature line is not visible; the name and the Reg parameter are taken from
// the getStoreRegOpcode/getLoadRegOpcode wrappers and the isHReg(Reg) use below.
4407 const TargetRegisterClass *RC,
4408 bool IsStackAligned,
4409 const X86Subtarget &STI, bool Load) {
4410 bool HasAVX = STI.hasAVX();
4411 bool HasAVX512 = STI.hasAVX512();
4412 bool HasVLX = STI.hasVLX();
4413 bool HasEGPR = STI.hasEGPR();
4414
4415 assert(RC != nullptr && "Invalid target register class");
4416 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4417 default:
4418 llvm_unreachable("Unknown spill size");
4419 case 1:
4420 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
// The unbraced outer `if` governs only the inner `if`; every other path falls
// through to the plain MOV8 return.
4421 if (STI.is64Bit())
4422 // Copying to or from a physical H register on x86-64 requires a NOREX
4423 // move. Otherwise use a normal move.
4424 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4425 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4426 return Load ? X86::MOV8rm : X86::MOV8mr;
4427 case 2:
// Mask classes are tested before the GR16 assert so they never trip it.
4428 if (X86::VK16RegClass.hasSubClassEq(RC))
4429 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4430 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4431 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4432 return Load ? X86::MOV16rm : X86::MOV16mr;
4433 case 4:
// 4-byte spill size covers GR32, FR32X, RFP32, VK32, the mask-pair classes
// and the FR16 classes.
4434 if (X86::GR32RegClass.hasSubClassEq(RC))
4435 return Load ? X86::MOV32rm : X86::MOV32mr;
4436 if (X86::FR32XRegClass.hasSubClassEq(RC))
4437 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4438 : HasAVX ? X86::VMOVSSrm_alt
4439 : X86::MOVSSrm_alt)
4440 : (HasAVX512 ? X86::VMOVSSZmr
4441 : HasAVX ? X86::VMOVSSmr
4442 : X86::MOVSSmr);
4443 if (X86::RFP32RegClass.hasSubClassEq(RC))
4444 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4445 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4446 assert(STI.hasBWI() && "KMOVD requires BWI");
4447 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4448 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4449 }
4450 // All of these mask pair classes have the same spill size, the same kind
4451 // of kmov instructions can be used with all of them.
4452 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4453 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4454 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4455 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4456 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4457 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4458 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4459 X86::FR16XRegClass.hasSubClassEq(RC))
4460 return getLoadStoreOpcodeForFP16(Load, STI);
4461 llvm_unreachable("Unknown 4-byte regclass");
4462 case 8:
4463 if (X86::GR64RegClass.hasSubClassEq(RC))
4464 return Load ? X86::MOV64rm : X86::MOV64mr;
4465 if (X86::FR64XRegClass.hasSubClassEq(RC))
4466 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4467 : HasAVX ? X86::VMOVSDrm_alt
4468 : X86::MOVSDrm_alt)
4469 : (HasAVX512 ? X86::VMOVSDZmr
4470 : HasAVX ? X86::VMOVSDmr
4471 : X86::MOVSDmr);
4472 if (X86::VR64RegClass.hasSubClassEq(RC))
4473 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4474 if (X86::RFP64RegClass.hasSubClassEq(RC))
4475 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4476 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4477 assert(STI.hasBWI() && "KMOVQ requires BWI");
4478 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4479 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4480 }
4481 llvm_unreachable("Unknown 8-byte regclass");
4482 case 10:
4483 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4484 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4485 case 16: {
4486 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4487 // If stack is realigned we can use aligned stores.
// Preference order within each arm: VLX, then AVX512 without VLX (_NOVLX),
// then AVX, then SSE.
4488 if (IsStackAligned)
4489 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4490 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4491 : HasAVX ? X86::VMOVAPSrm
4492 : X86::MOVAPSrm)
4493 : (HasVLX ? X86::VMOVAPSZ128mr
4494 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4495 : HasAVX ? X86::VMOVAPSmr
4496 : X86::MOVAPSmr);
4497 else
4498 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4499 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4500 : HasAVX ? X86::VMOVUPSrm
4501 : X86::MOVUPSrm)
4502 : (HasVLX ? X86::VMOVUPSZ128mr
4503 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4504 : HasAVX ? X86::VMOVUPSmr
4505 : X86::MOVUPSmr);
4506 }
4507 llvm_unreachable("Unknown 16-byte regclass");
4508 }
4509 case 32:
4510 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4511 // If stack is realigned we can use aligned stores.
// There is no SSE fallback here: the last alternative is the AVX Y form.
4512 if (IsStackAligned)
4513 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4514 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4515 : X86::VMOVAPSYrm)
4516 : (HasVLX ? X86::VMOVAPSZ256mr
4517 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4518 : X86::VMOVAPSYmr);
4519 else
4520 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4521 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4522 : X86::VMOVUPSYrm)
4523 : (HasVLX ? X86::VMOVUPSZ256mr
4524 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4525 : X86::VMOVUPSYmr);
4526 case 64:
4527 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4528 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4529 if (IsStackAligned)
4530 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4531 else
4532 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4533 case 1024:
4534 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4535 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
// Local helper macro, undefined again right after use: appends _EVEX to the
// opcode name when the subtarget has EGPR. It re-queries STI.hasEGPR() rather
// than using the HasEGPR local above.
4536#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4537 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4538 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4539#undef GET_EGPR_IF_ENABLED
4540 case 2048:
// NOTE(review): the switch key is a spill size in bytes, but the second assert
// message below says "2048-bit"; the wording may be off - confirm.
4541 assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) &&
4542 "Unknown 2048-byte regclass");
4543 assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE");
4544 return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
4545 }
4546}
4547
// Extracts the base register, scaled (index) register, scale and displacement
// of MemI's memory reference into an ExtAddrMode. Returns std::nullopt when
// MemI has no memory operand, when the base operand is not a register, or when
// the displacement is not an immediate.
// NOTE(review): the listing's embedded numbering skips 4549, so the line with
// the function name and first parameter is not visible; the name is taken from
// the call in the instruction verifier later in this file.
4548std::optional<ExtAddrMode>
4550 const TargetRegisterInfo *TRI) const {
4551 const MCInstrDesc &Desc = MemI.getDesc();
// A negative result means the instruction has no memory reference.
4552 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4553 if (MemRefBegin < 0)
4554 return std::nullopt;
4555
// Shift the index by the operand bias computed from the descriptor.
4556 MemRefBegin += X86II::getOperandBias(Desc);
4557
4558 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4559 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4560 return std::nullopt;
4561
4562 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4563 // Displacement can be symbolic
4564 if (!DispMO.isImm())
4565 return std::nullopt;
4566
4567 ExtAddrMode AM;
4568 AM.BaseReg = BaseOp.getReg();
4569 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4570 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4571 AM.Displacement = DispMO.getImm();
4572 return AM;
4573}
4574
// Checks the addressing mode of MI. Returns false and sets ErrInfo when a
// scaled register is present with a scale other than 1, 2, 4 or 8, or when the
// displacement does not fit a signed 32-bit integer. Returns true otherwise,
// including when no address mode can be extracted from MI.
// NOTE(review): the listing's embedded numbering skips 4575 and 4582, so the
// function name line and one statement after `AM` is initialised are not
// visible in this excerpt.
4576 StringRef &ErrInfo) const {
4577 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
4578 if (!AMOrNone)
4579 return true;
4580
4581 ExtAddrMode AM = *AMOrNone;
// The scale is only validated when a scaled register is actually present.
4583 if (AM.ScaledReg != X86::NoRegister) {
4584 switch (AM.Scale) {
4585 case 1:
4586 case 2:
4587 case 4:
4588 case 8:
4589 break;
4590 default:
4591 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4592 return false;
4593 }
4594 }
4595 if (!isInt<32>(AM.Displacement)) {
4596 ErrInfo = "Displacement in address must fit into 32-bit signed "
4597 "integer";
4598 return false;
4599 }
4600
4601 return true;
4602}
4603
// Tries to determine the constant that MI puts into Reg. On success stores it
// in ImmVal and returns true; returns false when MI (or, for SUBREG_TO_REG,
// the unique def of its source vreg) is not a recognised move-immediate.
// NOTE(review): the listing's embedded numbering skips 4604, so the line with
// the function name and the MI parameter is not visible in this excerpt.
4605 const Register Reg,
4606 int64_t &ImmVal) const {
4607 Register MovReg = Reg;
4608 const MachineInstr *MovMI = &MI;
4609
4610 // Follow use-def for SUBREG_TO_REG to find the real move immediate
4611 // instruction. It is quite common for x86-64.
4612 if (MI.isSubregToReg()) {
4613 // We use following pattern to setup 64b immediate.
4614 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4615 // %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
4616 if (!MI.getOperand(1).isImm())
4617 return false;
4618 unsigned FillBits = MI.getOperand(1).getImm();
4619 unsigned SubIdx = MI.getOperand(3).getImm();
4620 MovReg = MI.getOperand(2).getReg();
// Only the zero-filled sub_32bit form is followed.
4621 if (SubIdx != X86::sub_32bit || FillBits != 0)
4622 return false;
4623 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
// A null result from getUniqueVRegDef is treated as "no known constant".
4624 MovMI = MRI.getUniqueVRegDef(MovReg);
4625 if (!MovMI)
4626 return false;
4627 }
4628
// MOV32r0 carries no immediate operand; it is reported as the constant 0.
4629 if (MovMI->getOpcode() == X86::MOV32r0 &&
4630 MovMI->getOperand(0).getReg() == MovReg) {
4631 ImmVal = 0;
4632 return true;
4633 }
4634
4635 if (MovMI->getOpcode() != X86::MOV32ri &&
4636 MovMI->getOpcode() != X86::MOV64ri &&
4637 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4638 return false;
4639 // Mov Src can be a global address.
4640 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4641 return false;
4642 ImmVal = MovMI->getOperand(1).getImm();
4643 return true;
4644}
4645
// Returns true when MI leaves a null value held in NullValueReg intact: either
// MI does not modify the register at all, or it is one of the specific
// shift / MOV32rr forms accepted below.
// NOTE(review): the listing's embedded numbering skips 4646, so the line with
// the function name and return type is not visible in this excerpt.
4647 const MachineInstr *MI, const Register NullValueReg,
4648 const TargetRegisterInfo *TRI) const {
4649 if (!MI->modifiesRegister(NullValueReg, TRI))
4650 return true;
4651 switch (MI->getOpcode()) {
4652 // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax
4653 // X.
4654 case X86::SHR64ri:
4655 case X86::SHR32ri:
4656 case X86::SHL64ri:
4657 case X86::SHL32ri:
4658 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4659 "expected for shift opcode!");
// Accepted only when both the def and the shifted source are NullValueReg.
4660 return MI->getOperand(0).getReg() == NullValueReg &&
4661 MI->getOperand(1).getReg() == NullValueReg;
4662 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4663 // null value.
// getReg() is called on every operand, so this assumes all MOV32rr operands
// are registers - TODO confirm.
4664 case X86::MOV32rr:
4665 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4666 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4667 });
4668 default:
4669 return false;
4670 }
// Every switch path above returns, so this line is not reached.
4671 llvm_unreachable("Should be handled above!");
4672}
4673
// For a memory instruction with a register base, scale 1, no index register
// and an immediate displacement: stores the displacement in Offset, clears
// OffsetIsScalable, sets Width, appends the base operand to BaseOps and
// returns true. Returns false for every other addressing shape.
// NOTE(review): the listing's embedded numbering skips 4674-4675 and 4713, so
// the function name, its leading parameters and the second half of the Width
// expression are not visible in this excerpt.
4676 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4677 const TargetRegisterInfo *TRI) const {
4678 const MCInstrDesc &Desc = MemOp.getDesc();
4679 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4680 if (MemRefBegin < 0)
4681 return false;
4682
4683 MemRefBegin += X86II::getOperandBias(Desc);
4684
4685 const MachineOperand *BaseOp =
4686 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4687 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4688 return false;
4689
4690 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4691 return false;
4692
4693 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4694 X86::NoRegister)
4695 return false;
4696
4697 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4698
4699 // Displacement can be symbolic
4700 if (!DispMO.isImm())
4701 return false;
4702
4703 Offset = DispMO.getImm();
4704
// Repeats the isReg() test already made on BaseOp above, so it cannot fail
// here.
4705 if (!BaseOp->isReg())
4706 return false;
4707
4708 OffsetIsScalable = false;
4709 // FIXME: Relying on memoperands() may not be right thing to do here. Check
4710 // with X86 maintainers, and fix it accordingly. For now, it is ok, since
4711 // there is no use of `Width` for X86 back-end at the moment.
// When memoperands exist, Width comes from the first one. The alternative
// value is on a line missing from this listing.
4712 Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize()
4714 BaseOps.push_back(BaseOp);
4715 return true;
4716}
4717
4718static unsigned getStoreRegOpcode(Register SrcReg,
4719 const TargetRegisterClass *RC,
4720 bool IsStackAligned,
4721 const X86Subtarget &STI) {
4722 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4723}
4724
4725static unsigned getLoadRegOpcode(Register DestReg,
4726 const TargetRegisterClass *RC,
4727 bool IsStackAligned, const X86Subtarget &STI) {
4728 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4729}
4730
4731static bool isAMXOpcode(unsigned Opc) {
4732 switch (Opc) {
4733 default:
4734 return false;
4735 case X86::TILELOADD:
4736 case X86::TILESTORED:
4737 case X86::TILELOADD_EVEX:
4738 case X86::TILESTORED_EVEX:
4739 case X86::PTILEPAIRLOAD:
4740 case X86::PTILEPAIRSTORE:
4741 return true;
4742 }
4743}
4744
// Emits a tile store or tile load of Reg against stack slot FrameIdx for the
// given special opcode. Each case first materialises the constant 64 into a
// fresh GR64_NOSP virtual register with MOV64ri, then builds the memory
// instruction and rewrites one of its operands to that virtual register.
// NOTE(review): the listing's embedded numbering skips 4745-4746, 4756, 4762,
// 4771, 4774 and 4776, so the function name, its leading parameters, the
// definition of RegInfo, the definitions of MO and the load-side NewMI are not
// visible in this excerpt.
4747 unsigned Opc, Register Reg, int FrameIdx,
4748 bool isKill) const {
4749 switch (Opc) {
4750 default:
4751 llvm_unreachable("Unexpected special opcode!");
4752 case X86::TILESTORED:
4753 case X86::TILESTORED_EVEX:
4754 case X86::PTILEPAIRSTORE: {
4755 // tilestored %tmm, (%sp, %idx)
4757 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4758 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
// The frame reference is added first; the stored register follows with its
// kill state.
4759 MachineInstr *NewMI =
4760 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4761 .addReg(Reg, getKillRegState(isKill));
// NOTE(review): MO is presumably the index-register operand of NewMI (the
// "%idx" in the comment above); its definition is on a missing line - confirm.
4763 MO.setReg(VirtReg);
4764 MO.setIsKill(true);
4765 break;
4766 }
4767 case X86::TILELOADD:
4768 case X86::TILELOADD_EVEX:
4769 case X86::PTILEPAIRLOAD: {
4770 // tileloadd (%sp, %idx), %tmm
4772 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4773 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
// Here Reg is the instruction's destination. isKill is not used on this path.
4775 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4777 MO.setReg(VirtReg);
4778 MO.setIsKill(true);
4779 break;
4780 }
4781 }
4782}
4783
// Emits a store of SrcReg to stack slot FrameIdx before MI, choosing the
// opcode from RC and from whether the slot can be treated as aligned.
// NOTE(review): the listing's embedded numbering skips 4784-4785, so the
// function name and its leading parameters are not visible in this excerpt.
4786 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4787 const TargetRegisterInfo *TRI, Register VReg,
4788 MachineInstr::MIFlag Flags) const {
4789 const MachineFunction &MF = *MBB.getParent();
4790 const MachineFrameInfo &MFI = MF.getFrameInfo();
4791 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4792 "Stack slot too small for store");
4793
// Required alignment is the spill size, but never less than 16.
4794 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
// Treated as aligned when the stack alignment already suffices, or when the
// stack can be realigned and the slot is not a fixed object.
4795 bool isAligned =
4796 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4797 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4798
4799 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
// Tile opcodes take the dedicated helper; Flags is only applied on the
// ordinary path below.
4800 if (isAMXOpcode(Opc))
4801 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4802 else
4803 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4804 .addReg(SrcReg, getKillRegState(isKill))
4805 .setMIFlag(Flags);
4806}
4807
// Emits a load of DestReg from stack slot FrameIdx before MI, choosing the
// opcode from RC and from whether the slot can be treated as aligned.
// NOTE(review): the listing's embedded numbering skips 4808-4809, so the
// function name and its leading parameters are not visible in this excerpt.
4810 int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
4811 Register VReg, MachineInstr::MIFlag Flags) const {
4812 const MachineFunction &MF = *MBB.getParent();
4813 const MachineFrameInfo &MFI = MF.getFrameInfo();
4814 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4815 "Load size exceeds stack slot");
// Same alignment rule as the store path: spill size with a floor of 16,
// satisfied by the stack alignment or by realigning a non-fixed slot.
4816 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4817 bool isAligned =
4818 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4819 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4820
4821 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
// Tile opcodes take the dedicated helper, called here without an isKill
// argument; Flags is only applied on the ordinary path.
4822 if (isAMXOpcode(Opc))
4823 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4824 else
4825 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx)
4826 .setMIFlag(Flags);
4827}
4828
// Decomposes a CMP / SUB / TEST instruction into its compared register(s) and,
// where known, an immediate. Returns true and fills SrcReg, SrcReg2, CmpMask
// and CmpValue for the opcodes listed below; returns false for any other
// opcode and for TESTrr with two different registers.
// In the immediate forms CmpMask is ~0 with CmpValue set to the immediate when
// the operand is an immediate, and both are 0 otherwise.
// NOTE(review): the listing's embedded numbering skips 4829, so the line with
// the function name and the MI / SrcReg parameters is not visible; the name is
// taken from the calls in isRedundantFlagInstr later in this file.
4830 Register &SrcReg2, int64_t &CmpMask,
4831 int64_t &CmpValue) const {
4832 switch (MI.getOpcode()) {
4833 default:
4834 break;
4835 case X86::CMP64ri32:
4836 case X86::CMP32ri:
4837 case X86::CMP16ri:
4838 case X86::CMP8ri:
// CMP has no result operand: operand 0 is the register, operand 1 the
// immediate.
4839 SrcReg = MI.getOperand(0).getReg();
4840 SrcReg2 = 0;
4841 if (MI.getOperand(1).isImm()) {
4842 CmpMask = ~0;
4843 CmpValue = MI.getOperand(1).getImm();
4844 } else {
4845 CmpMask = CmpValue = 0;
4846 }
4847 return true;
4848 // A SUB can be used to perform comparison.
// For the SUB forms the first source is read from operand 1.
4849 CASE_ND(SUB64rm)
4850 CASE_ND(SUB32rm)
4851 CASE_ND(SUB16rm)
4852 CASE_ND(SUB8rm)
4853 SrcReg = MI.getOperand(1).getReg();
4854 SrcReg2 = 0;
4855 CmpMask = 0;
4856 CmpValue = 0;
4857 return true;
4858 CASE_ND(SUB64rr)
4859 CASE_ND(SUB32rr)
4860 CASE_ND(SUB16rr)
4861 CASE_ND(SUB8rr)
4862 SrcReg = MI.getOperand(1).getReg();
4863 SrcReg2 = MI.getOperand(2).getReg();
4864 CmpMask = 0;
4865 CmpValue = 0;
4866 return true;
4867 CASE_ND(SUB64ri32)
4868 CASE_ND(SUB32ri)
4869 CASE_ND(SUB16ri)
4870 CASE_ND(SUB8ri)
4871 SrcReg = MI.getOperand(1).getReg();
4872 SrcReg2 = 0;
4873 if (MI.getOperand(2).isImm()) {
4874 CmpMask = ~0;
4875 CmpValue = MI.getOperand(2).getImm();
4876 } else {
4877 CmpMask = CmpValue = 0;
4878 }
4879 return true;
4880 case X86::CMP64rr:
4881 case X86::CMP32rr:
4882 case X86::CMP16rr:
4883 case X86::CMP8rr:
4884 SrcReg = MI.getOperand(0).getReg();
4885 SrcReg2 = MI.getOperand(1).getReg();
4886 CmpMask = 0;
4887 CmpValue = 0;
4888 return true;
4889 case X86::TEST8rr:
4890 case X86::TEST16rr:
4891 case X86::TEST32rr:
4892 case X86::TEST64rr:
// SrcReg is written before the same-register check, so it is modified even
// when this case returns false.
4893 SrcReg = MI.getOperand(0).getReg();
4894 if (MI.getOperand(1).getReg() != SrcReg)
4895 return false;
4896 // Compare against zero.
4897 SrcReg2 = 0;
4898 CmpMask = ~0;
4899 CmpValue = 0;
4900 return true;
4901 case X86::TEST64ri32:
4902 case X86::TEST32ri:
4903 case X86::TEST16ri:
4904 case X86::TEST8ri:
4905 SrcReg = MI.getOperand(0).getReg();
4906 SrcReg2 = 0;
4907 // Force identical compare.
// With a zero mask, isRedundantFlagInstr falls back to
// FlagI.isIdenticalTo(OI) for these opcodes.
4908 CmpMask = 0;
4909 CmpValue = 0;
4910 return true;
4911 }
4912 return false;
4913}
4914
4915bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4916 Register SrcReg, Register SrcReg2,
4917 int64_t ImmMask, int64_t ImmValue,
4918 const MachineInstr &OI, bool *IsSwapped,
4919 int64_t *ImmDelta) const {
4920 switch (OI.getOpcode()) {
4921 case X86::CMP64rr:
4922 case X86::CMP32rr:
4923 case X86::CMP16rr:
4924 case X86::CMP8rr:
4925 CASE_ND(SUB64rr)
4926 CASE_ND(SUB32rr)
4927 CASE_ND(SUB16rr)
4928 CASE_ND(SUB8rr) {
4929 Register OISrcReg;
4930 Register OISrcReg2;
4931 int64_t OIMask;
4932 int64_t OIValue;
4933 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4934 OIMask != ImmMask || OIValue != ImmValue)
4935 return false;
4936 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4937 *IsSwapped = false;
4938 return true;
4939 }
4940 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4941 *IsSwapped = true;
4942 return true;
4943 }
4944 return false;
4945 }
4946 case X86::CMP64ri32:
4947 case X86::CMP32ri:
4948 case X86::CMP16ri:
4949 case X86::CMP8ri:
4950 case X86::TEST64ri32:
4951 case X86::TEST32ri:
4952 case X86::TEST16ri:
4953 case X86::TEST8ri:
4954 CASE_ND(SUB64ri32)
4955 CASE_ND(SUB32ri)
4956 CASE_ND(SUB16ri)
4957 CASE_ND(SUB8ri)
4958 case X86::TEST64rr:
4959 case X86::TEST32rr:
4960 case X86::TEST16rr:
4961 case X86::TEST8rr: {
4962 if (ImmMask != 0) {
4963 Register OISrcReg;
4964 Register OISrcReg2;
4965 int64_t OIMask;
4966 int64_t OIValue;
4967 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4968 SrcReg == OISrcReg && ImmMask == OIMask) {
4969 if (OIValue == ImmValue) {
4970 *ImmDelta = 0;
4971 return true;
4972 } else if (static_cast<uint64_t>(ImmValue) ==
4973 static_cast<uint64_t>(OIValue) - 1) {
4974 *ImmDelta = -1;
4975 return true;
4976 } else if (static_cast<uint64_t>(ImmValue) ==
4977 static_cast<uint64_t>(OIValue) + 1) {
4978 *ImmDelta = 1;
4979 return true;
4980 } else {
4981 return false;
4982 }
4983 }
4984 }
4985 return FlagI.isIdenticalTo(OI);
4986 }
4987 default:
4988 return false;
4989 }
4990}
4991
4992/// Check whether the definition can be converted
4993/// to remove a comparison against zero.
4994inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4995 bool &ClearsOverflowFlag) {
4996 NoSignFlag = false;
4997 ClearsOverflowFlag = false;
4998
4999 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
5000 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
5001 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
5002 // on the EFLAGS modification of ADD actually happening in the final binary.
5003 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
5004 unsigned Flags = MI.getOperand(5).getTargetFlags();
5005 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
5006 Flags == X86II::MO_GOTNTPOFF)
5007 return false;
5008 }
5009
5010 switch (MI.getOpcode()) {
5011 default:
5012 return false;
5013
5014 // The shift instructions only modify ZF if their shift count is non-zero.
5015 // N.B.: The processor truncates the shift count depending on the encoding.
5016 CASE_ND(SAR8ri)
5017 CASE_ND(SAR16ri)
5018 CASE_ND(SAR32ri)
5019 CASE_ND(SAR64ri)
5020 CASE_ND(SHR8ri)
5021 CASE_ND(SHR16ri)
5022 CASE_ND(SHR32ri)
5023 CASE_ND(SHR64ri)
5024 return getTruncatedShiftCount(MI, 2) != 0;
5025
5026 // Some left shift instructions can be turned into LEA instructions but only
5027 // if their flags aren't used. Avoid transforming such instructions.
5028 CASE_ND(SHL8ri)
5029 CASE_ND(SHL16ri)
5030 CASE_ND(SHL32ri)
5031 CASE_ND(SHL64ri) {
5032 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5033 if (isTruncatedShiftCountForLEA(ShAmt))
5034 return false;
5035 return ShAmt != 0;
5036 }
5037
5038 CASE_ND(SHRD16rri8)
5039 CASE_ND(SHRD32rri8)
5040 CASE_ND(SHRD64rri8)
5041 CASE_ND(SHLD16rri8)
5042 CASE_ND(SHLD32rri8)
5043 CASE_ND(SHLD64rri8)
5044 return getTruncatedShiftCount(MI, 3) != 0;
5045
5046 CASE_ND(SUB64ri32)
5047 CASE_ND(SUB32ri)
5048 CASE_ND(SUB16ri)
5049 CASE_ND(SUB8ri)
5050 CASE_ND(SUB64rr)
5051 CASE_ND(SUB32rr)
5052 CASE_ND(SUB16rr)
5053 CASE_ND(SUB8rr)
5054 CASE_ND(SUB64rm)
5055 CASE_ND(SUB32rm)
5056 CASE_ND(SUB16rm)
5057 CASE_ND(SUB8rm)
5058 CASE_ND(DEC64r)
5059 CASE_ND(DEC32r)
5060 CASE_ND(DEC16r)
5061 CASE_ND(DEC8r)
5062 CASE_ND(ADD64ri32)
5063 CASE_ND(ADD32ri)
5064 CASE_ND(ADD16ri)
5065 CASE_ND(ADD8ri)
5066 CASE_ND(ADD64rr)
5067 CASE_ND(ADD32rr)
5068 CASE_ND(ADD16rr)
5069 CASE_ND(ADD8rr)
5070 CASE_ND(ADD64rm)
5071 CASE_ND(ADD32rm)
5072 CASE_ND(ADD16rm)
5073 CASE_ND(ADD8rm)
5074 CASE_ND(INC64r)
5075 CASE_ND(INC32r)
5076 CASE_ND(INC16r)
5077 CASE_ND(INC8r)
5078 CASE_ND(ADC64ri32)
5079 CASE_ND(ADC32ri)
5080 CASE_ND(ADC16ri)
5081 CASE_ND(ADC8ri)
5082 CASE_ND(ADC64rr)
5083 CASE_ND(ADC32rr)
5084 CASE_ND(ADC16rr)
5085 CASE_ND(ADC8rr)
5086 CASE_ND(ADC64rm)
5087 CASE_ND(ADC32rm)
5088 CASE_ND(ADC16rm)
5089 CASE_ND(ADC8rm)
5090 CASE_ND(SBB64ri32)
5091 CASE_ND(SBB32ri)
5092 CASE_ND(SBB16ri)
5093 CASE_ND(SBB8ri)
5094 CASE_ND(SBB64rr)
5095 CASE_ND(SBB32rr)
5096 CASE_ND(SBB16rr)
5097 CASE_ND(SBB8rr)
5098 CASE_ND(SBB64rm)
5099 CASE_ND(SBB32rm)
5100 CASE_ND(SBB16rm)
5101 CASE_ND(SBB8rm)
5102 CASE_ND(NEG8r)
5103 CASE_ND(NEG16r)
5104 CASE_ND(NEG32r)
5105 CASE_ND(NEG64r)
5106 case X86::LZCNT16rr:
5107 case X86::LZCNT16rm:
5108 case X86::LZCNT32rr:
5109 case X86::LZCNT32rm:
5110 case X86::LZCNT64rr:
5111 case X86::LZCNT64rm:
5112 case X86::POPCNT16rr:
5113 case X86::POPCNT16rm:
5114 case X86::POPCNT32rr:
5115 case X86::POPCNT32rm:
5116 case X86::POPCNT64rr:
5117 case X86::POPCNT64rm:
5118 case X86::TZCNT16rr:
5119 case X86::TZCNT16rm:
5120 case X86::TZCNT32rr:
5121 case X86::TZCNT32rm:
5122 case X86::TZCNT64rr:
5123 case X86::TZCNT64rm:
5124 return true;
5125 CASE_ND(AND64ri32)
5126 CASE_ND(AND32ri)
5127 CASE_ND(AND16ri)
5128 CASE_ND(AND8ri)
5129 CASE_ND(AND64rr)
5130 CASE_ND(AND32rr)
5131 CASE_ND(AND16rr)
5132 CASE_ND(AND8rr)
5133 CASE_ND(AND64rm)
5134 CASE_ND(AND32rm)
5135 CASE_ND(AND16rm)
5136 CASE_ND(AND8rm)
5137 CASE_ND(XOR64ri32)
5138 CASE_ND(XOR32ri)
5139 CASE_ND(XOR16ri)
5140 CASE_ND(XOR8ri)
5141 CASE_ND(XOR64rr)
5142 CASE_ND(XOR32rr)
5143 CASE_ND(XOR16rr)
5144 CASE_ND(XOR8rr)
5145 CASE_ND(XOR64rm)
5146 CASE_ND(XOR32rm)
5147 CASE_ND(XOR16rm)
5148 CASE_ND(XOR8rm)
5149 CASE_ND(OR64ri32)
5150 CASE_ND(OR32ri)
5151 CASE_ND(OR16ri)
5152 CASE_ND(OR8ri)
5153 CASE_ND(OR64rr)
5154 CASE_ND(OR32rr)
5155 CASE_ND(OR16rr)
5156 CASE_ND(OR8rr)
5157 CASE_ND(OR64rm)
5158 CASE_ND(OR32rm)
5159 CASE_ND(OR16rm)
5160 CASE_ND(OR8rm)
5161 case X86::ANDN32rr:
5162 case X86::ANDN32rm:
5163 case X86::ANDN64rr:
5164 case X86::ANDN64rm:
5165 case X86::BLSI32rr:
5166 case X86::BLSI32rm:
5167 case X86::BLSI64rr:
5168 case X86::BLSI64rm:
5169 case X86::BLSMSK32rr:
5170 case X86::BLSMSK32rm:
5171 case X86::BLSMSK64rr:
5172 case X86::BLSMSK64rm:
5173 case X86::BLSR32rr:
5174 case X86::BLSR32rm:
5175 case X86::BLSR64rr:
5176 case X86::BLSR64rm:
5177 case X86::BLCFILL32rr:
5178 case X86::BLCFILL32rm:
5179 case X86::BLCFILL64rr:
5180 case X86::BLCFILL64rm:
5181 case X86::BLCI32rr:
5182 case X86::BLCI32rm:
5183 case X86::BLCI64rr:
5184 case X86::BLCI64rm:
5185 case X86::BLCIC32rr:
5186 case X86::BLCIC32rm:
5187 case X86::BLCIC64rr:
5188 case X86::BLCIC64rm:
5189 case X86::BLCMSK32rr:
5190 case X86::BLCMSK32rm:
5191 case X86::BLCMSK64rr:
5192 case X86::BLCMSK64rm:
5193 case X86::BLCS32rr:
5194 case X86::BLCS32rm:
5195 case X86::BLCS64rr:
5196 case X86::BLCS64rm:
5197 case X86::BLSFILL32rr:
5198 case X86::BLSFILL32rm:
5199 case X86::BLSFILL64rr:
5200 case X86::BLSFILL64rm:
5201 case X86::BLSIC32rr:
5202 case X86::BLSIC32rm:
5203 case X86::BLSIC64rr:
5204 case X86::BLSIC64rm:
5205 case X86::BZHI32rr:
5206 case X86::BZHI32rm:
5207 case X86::BZHI64rr:
5208 case X86::BZHI64rm:
5209 case X86::T1MSKC32rr:
5210 case X86::T1MSKC32rm:
5211 case X86::T1MSKC64rr:
5212 case X86::T1MSKC64rm:
5213 case X86::TZMSK32rr:
5214 case X86::TZMSK32rm:
5215 case X86::TZMSK64rr:
5216 case X86::TZMSK64rm:
5217 // These instructions clear the overflow flag just like TEST.
5218 // FIXME: These are not the only instructions in this switch that clear the
5219 // overflow flag.
5220 ClearsOverflowFlag = true;
5221 return true;
5222 case X86::BEXTR32rr:
5223 case X86::BEXTR64rr:
5224 case X86::BEXTR32rm:
5225 case X86::BEXTR64rm:
5226 case X86::BEXTRI32ri:
5227 case X86::BEXTRI32mi:
5228 case X86::BEXTRI64ri:
5229 case X86::BEXTRI64mi:
5230 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5231 // the overflow flag, but that's not useful without the sign flag.
5232 NoSignFlag = true;
5233 return true;
5234 }
5235}
5236
5237/// Check whether the use can be converted to remove a comparison against zero.
5238/// Returns the EFLAGS condition and the operand that we are comparing against zero.
5239static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
5240 switch (MI.getOpcode()) {
5241 default:
5242 return std::make_pair(X86::COND_INVALID, ~0U);
5243 CASE_ND(NEG8r)
5244 CASE_ND(NEG16r)
5245 CASE_ND(NEG32r)
5246 CASE_ND(NEG64r)
5247 return std::make_pair(X86::COND_AE, 1U);
5248 case X86::LZCNT16rr:
5249 case X86::LZCNT32rr:
5250 case X86::LZCNT64rr:
5251 return std::make_pair(X86::COND_B, 1U);
5252 case X86::POPCNT16rr:
5253 case X86::POPCNT32rr:
5254 case X86::POPCNT64rr:
5255 return std::make_pair(X86::COND_E, 1U);
5256 case X86::TZCNT16rr:
5257 case X86::TZCNT32rr:
5258 case X86::TZCNT64rr:
5259 return std::make_pair(X86::COND_B, 1U);
5260 case X86::BSF16rr:
5261 case X86::BSF32rr:
5262 case X86::BSF64rr:
5263 case X86::BSR16rr:
5264 case X86::BSR32rr:
5265 case X86::BSR64rr:
5266 return std::make_pair(X86::COND_E, 2U);
5267 case X86::BLSI32rr:
5268 case X86::BLSI64rr:
5269 return std::make_pair(X86::COND_AE, 1U);
5270 case X86::BLSR32rr:
5271 case X86::BLSR64rr:
5272 case X86::BLSMSK32rr:
5273 case X86::BLSMSK64rr:
5274 return std::make_pair(X86::COND_B, 1U);
5275 // TODO: TBM instructions.
5276 }
5277}
5278
5279/// Check if there exists an earlier instruction that
5280/// operates on the same source operands and sets flags in the same way as
5281/// Compare; remove Compare if possible.
///
/// Outline of the implementation below:
///  1. If CmpInstr is a SUB whose result register has no non-debug use, it is
///     rewritten in place into the matching CMP. For the memory (rm) forms the
///     function then returns false, although CmpInstr has already been mutated.
///  2. Walk backward from CmpInstr, continuing through single-predecessor
///     chains, looking for an instruction whose EFLAGS can stand in for the
///     compare (recorded in MI or Sub).
///  3. Walk forward from CmpInstr within its block and check that every EFLAGS
///     reader can use the replacement flags, recording the condition codes
///     that have to be rewritten.
///  4. Commit: relocate a pending MOV32r0, switch the recorded instructions to
///     their NF variants, mark the producer's EFLAGS def live, erase CmpInstr,
///     patch the recorded condition codes and add EFLAGS as a live-in to the
///     blocks between the producer and CmpInstr.
///
/// \returns true only if CmpInstr was erased.
///
/// NOTE(review): in this listing the line opening the signature (function name
/// and leading parameters) and the declarations of InstsToUpdate, NewCC, TRI,
/// From, OpsToUpdate and InsertI are not visible — confirm against the
/// original file.
5283 Register SrcReg2, int64_t CmpMask,
5284 int64_t CmpValue,
5285 const MachineRegisterInfo *MRI) const {
5286 // Check whether we can replace SUB with CMP.
5287 switch (CmpInstr.getOpcode()) {
5288 default:
5289 break;
5290 CASE_ND(SUB64ri32)
5291 CASE_ND(SUB32ri)
5292 CASE_ND(SUB16ri)
5293 CASE_ND(SUB8ri)
5294 CASE_ND(SUB64rm)
5295 CASE_ND(SUB32rm)
5296 CASE_ND(SUB16rm)
5297 CASE_ND(SUB8rm)
5298 CASE_ND(SUB64rr)
5299 CASE_ND(SUB32rr)
5300 CASE_ND(SUB16rr)
5301 CASE_ND(SUB8rr) {
5302 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5303 return false;
5304 // There is no use of the destination register, we can replace SUB with CMP.
5305 unsigned NewOpcode = 0;
5306#define FROM_TO(A, B) \
5307 CASE_ND(A) NewOpcode = X86::B; \
5308 break;
5309 switch (CmpInstr.getOpcode()) {
5310 default:
5311 llvm_unreachable("Unreachable!");
5312 FROM_TO(SUB64rm, CMP64rm)
5313 FROM_TO(SUB32rm, CMP32rm)
5314 FROM_TO(SUB16rm, CMP16rm)
5315 FROM_TO(SUB8rm, CMP8rm)
5316 FROM_TO(SUB64rr, CMP64rr)
5317 FROM_TO(SUB32rr, CMP32rr)
5318 FROM_TO(SUB16rr, CMP16rr)
5319 FROM_TO(SUB8rr, CMP8rr)
5320 FROM_TO(SUB64ri32, CMP64ri32)
5321 FROM_TO(SUB32ri, CMP32ri)
5322 FROM_TO(SUB16ri, CMP16ri)
5323 FROM_TO(SUB8ri, CMP8ri)
5324 }
5325#undef FROM_TO
     // Turn the SUB into a CMP in place: new descriptor, and drop the result
     // register (operand 0).
5326 CmpInstr.setDesc(get(NewOpcode));
5327 CmpInstr.removeOperand(0);
5328 // Mutating this instruction invalidates any debug data associated with it.
5329 CmpInstr.dropDebugNumber();
5330 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
     // The memory forms are not optimized any further; note that CmpInstr has
     // already been rewritten at this point even though false is returned.
5331 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5332 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5333 return false;
5334 }
5335 }
5336
5337 // The following code tries to remove the comparison by re-using EFLAGS
5338 // from earlier instructions.
5339
     // The compare is treated as a test against zero when its mask is non-zero
     // and its value is zero.
5340 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5341
5342 // Transformation currently requires SSA values.
5343 if (SrcReg2.isPhysical())
5344 return false;
5345 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5346 assert(SrcRegDef && "Must have a definition (SSA)");
5347
     // MI:  flag producer found through the compare-with-zero paths.
     // Sub: flag producer found by isRedundantFlagInstr().
     // At most one of the two is set (asserted further down).
5348 MachineInstr *MI = nullptr;
5349 MachineInstr *Sub = nullptr;
     // A MOV32r0 with a dead EFLAGS def seen during the backward walk; it is
     // moved above the flag producer when the transformation is committed.
5350 MachineInstr *Movr0Inst = nullptr;
     // NoSignFlag / ClearsOverflowFlag are filled in by isDefConvertible() or
     // findRedundantFlagInstr(); IsSwapped / ImmDelta by isRedundantFlagInstr();
     // OpNo by isUseDefConvertible().
5352 bool NoSignFlag = false;
5353 bool ClearsOverflowFlag = false;
5354 bool ShouldUpdateCC = false;
5355 bool IsSwapped = false;
5356 bool HasNF = Subtarget.hasNF();
5357 unsigned OpNo = 0;
5359 int64_t ImmDelta = 0;
5360
5361 // Search backward from CmpInstr for the next instruction defining EFLAGS.
5363 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5365 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5366 for (MachineBasicBlock *MBB = &CmpMBB;;) {
5367 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5368 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5369 // %eax = addl ...
5370 // ... // EFLAGS not changed
5371 // testl %eax, %eax // <-- can be removed
5372 if (&Inst == SrcRegDef) {
5373 if (IsCmpZero &&
5374 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5375 MI = &Inst;
5376 break;
5377 }
5378
5379 // Look back for the following pattern, in which case the
5380 // test16rr/test64rr instruction could be erased.
5381 //
5382 // Example for test16rr:
5383 // %reg = and32ri %in_reg, 5
5384 // ... // EFLAGS not changed.
5385 // %src_reg = copy %reg.sub_16bit:gr32
5386 // test16rr %src_reg, %src_reg, implicit-def $eflags
5387 // Example for test64rr:
5388 // %reg = and32ri %in_reg, 5
5389 // ... // EFLAGS not changed.
5390 // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
5391 // test64rr %src_reg, %src_reg, implicit-def $eflags
5392 MachineInstr *AndInstr = nullptr;
5393 if (IsCmpZero &&
5394 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5395 Subtarget, NoSignFlag, ClearsOverflowFlag)) {
5396 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5397 MI = AndInstr;
5398 break;
5399 }
5400 // Cannot find other candidates before definition of SrcReg.
5401 return false;
5402 }
5403
5404 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5405 // Try to use EFLAGS produced by an instruction reading %SrcReg.
5406 // Example:
5407 // %eax = ...
5408 // ...
5409 // popcntl %eax
5410 // ... // EFLAGS not changed
5411 // testl %eax, %eax // <-- can be removed
5412 if (IsCmpZero) {
5413 std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
5414 if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
5415 Inst.getOperand(OpNo).getReg() == SrcReg) {
5416 ShouldUpdateCC = true;
5417 MI = &Inst;
5418 break;
5419 }
5420 }
5421
5422 // Try to use EFLAGS from an instruction with similar flag results.
5423 // Example:
5424 // sub x, y or cmp x, y
5425 // ... // EFLAGS not changed
5426 // cmp x, y // <-- can be removed
5427 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5428 Inst, &IsSwapped, &ImmDelta)) {
5429 Sub = &Inst;
5430 break;
5431 }
5432
5433 // MOV32r0 is implemented with xor which clobbers condition code. It is
5434 // safe to move up, if the definition to EFLAGS is dead and earlier
5435 // instructions do not read or write EFLAGS.
5436 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5437 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5438 Movr0Inst = &Inst;
5439 continue;
5440 }
5441
5442 // For ADDrm/ADDmr instructions with relocation, skip the optimization
5443 // that replaces non-NF with NF. This keeps backward compatibility with
5444 // old versions of linkers that lack APX relocation type support on
5445 // Linux.
5446 bool IsWithReloc = X86EnableAPXForRelocation
5447 ? false
5449
5450 // Try to replace non-NF with NF instructions.
     // The replacement is only recorded here; it is applied once the whole
     // transformation is known to succeed.
5451 if (HasNF && Inst.registerDefIsDead(X86::EFLAGS, TRI) && !IsWithReloc) {
5452 unsigned NewOp = X86::getNFVariant(Inst.getOpcode());
5453 if (!NewOp)
5454 return false;
5455
5456 InstsToUpdate.push_back(std::make_pair(&Inst, NewOp));
5457 continue;
5458 }
5459
5460 // Cannot do anything for any other EFLAG changes.
5461 return false;
5462 }
5463 }
5464
5465 if (MI || Sub)
5466 break;
5467
5468 // Reached begin of basic block. Continue in predecessor if there is
5469 // exactly one.
5470 if (MBB->pred_size() != 1)
5471 return false;
5472 MBB = *MBB->pred_begin();
5473 From = MBB->rbegin();
5474 }
5475
5476 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5477 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5478 // If we are done with the basic block, we need to check whether EFLAGS is
5479 // live-out.
5480 bool FlagsMayLiveOut = true;
5482 MachineBasicBlock::iterator AfterCmpInstr =
5483 std::next(MachineBasicBlock::iterator(CmpInstr));
5484 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5485 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5486 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5487 // We should check the usage if this instruction uses and updates EFLAGS.
5488 if (!UseEFLAGS && ModifyEFLAGS) {
5489 // It is safe to remove CmpInstr if EFLAGS is updated again.
5490 FlagsMayLiveOut = false;
5491 break;
5492 }
5493 if (!UseEFLAGS && !ModifyEFLAGS)
5494 continue;
5495
5496 // EFLAGS is used by this instruction.
5497 X86::CondCode OldCC = X86::getCondFromMI(Instr);
     // When the replacement flags are not simply those of an identical compare
     // (MI path, swapped operands, or an adjusted immediate), a reader without
     // a recognizable condition code cannot be handled.
5498 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5499 return false;
5500
5501 X86::CondCode ReplacementCC = X86::COND_INVALID;
5502 if (MI) {
5503 switch (OldCC) {
5504 default:
5505 break;
5506 case X86::COND_A:
5507 case X86::COND_AE:
5508 case X86::COND_B:
5509 case X86::COND_BE:
5510 // CF is used, we can't perform this optimization.
5511 return false;
5512 case X86::COND_G:
5513 case X86::COND_GE:
5514 case X86::COND_L:
5515 case X86::COND_LE:
5516 // If SF is used, but the instruction doesn't update the SF, then we
5517 // can't do the optimization.
5518 if (NoSignFlag)
5519 return false;
5520 [[fallthrough]];
5521 case X86::COND_O:
5522 case X86::COND_NO:
5523 // If OF is used, the instruction needs to clear it like CmpZero does.
5524 if (!ClearsOverflowFlag)
5525 return false;
5526 break;
5527 case X86::COND_S:
5528 case X86::COND_NS:
5529 // If SF is used, but the instruction doesn't update the SF, then we
5530 // can't do the optimization.
5531 if (NoSignFlag)
5532 return false;
5533 break;
5534 }
5535
5536 // If we're updating the condition code check if we have to reverse the
5537 // condition.
     // Only equality tests can be rewritten: COND_E maps to NewCC and COND_NE
     // to its opposite; any other reader aborts the transformation.
5538 if (ShouldUpdateCC)
5539 switch (OldCC) {
5540 default:
5541 return false;
5542 case X86::COND_E:
5543 ReplacementCC = NewCC;
5544 break;
5545 case X86::COND_NE:
5546 ReplacementCC = GetOppositeBranchCondition(NewCC);
5547 break;
5548 }
5549 } else if (IsSwapped) {
5550 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5551 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5552 // We swap the condition code and synthesize the new opcode.
5553 ReplacementCC = getSwappedCondition(OldCC);
5554 if (ReplacementCC == X86::COND_INVALID)
5555 return false;
5556 ShouldUpdateCC = true;
5557 } else if (ImmDelta != 0) {
5558 unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
5559 // Shift amount for min/max constants to adjust for 8/16/32 instruction
5560 // sizes.
     // Each case below accepts exactly one ImmDelta (+1 or -1) and rejects the
     // boundary value of CmpValue for which the adjusted comparison would not
     // be equivalent.
5561 switch (OldCC) {
5562 case X86::COND_L: // x <s (C + 1) --> x <=s C
5563 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5564 return false;
5565 ReplacementCC = X86::COND_LE;
5566 break;
5567 case X86::COND_B: // x <u (C + 1) --> x <=u C
5568 if (ImmDelta != 1 || CmpValue == 0)
5569 return false;
5570 ReplacementCC = X86::COND_BE;
5571 break;
5572 case X86::COND_GE: // x >=s (C + 1) --> x >s C
5573 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5574 return false;
5575 ReplacementCC = X86::COND_G;
5576 break;
5577 case X86::COND_AE: // x >=u (C + 1) --> x >u C
5578 if (ImmDelta != 1 || CmpValue == 0)
5579 return false;
5580 ReplacementCC = X86::COND_A;
5581 break;
5582 case X86::COND_G: // x >s (C - 1) --> x >=s C
5583 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5584 return false;
5585 ReplacementCC = X86::COND_GE;
5586 break;
5587 case X86::COND_A: // x >u (C - 1) --> x >=u C
5588 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5589 return false;
5590 ReplacementCC = X86::COND_AE;
5591 break;
5592 case X86::COND_LE: // x <=s (C - 1) --> x <s C
5593 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5594 return false;
5595 ReplacementCC = X86::COND_L;
5596 break;
5597 case X86::COND_BE: // x <=u (C - 1) --> x <u C
5598 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5599 return false;
5600 ReplacementCC = X86::COND_B;
5601 break;
5602 default:
5603 return false;
5604 }
5605 ShouldUpdateCC = true;
5606 }
5607
5608 if (ShouldUpdateCC && ReplacementCC != OldCC) {
5609 // Push the MachineInstr to OpsToUpdate.
5610 // If it is safe to remove CmpInstr, the condition code of these
5611 // instructions will be modified.
5612 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5613 }
5614 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5615 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5616 FlagsMayLiveOut = false;
5617 break;
5618 }
5619 }
5620
5621 // If we have to update users but EFLAGS is live-out abort, since we cannot
5622 // easily find all of the users.
5623 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5624 for (MachineBasicBlock *Successor : CmpMBB.successors())
5625 if (Successor->isLiveIn(X86::EFLAGS))
5626 return false;
5627 }
5628
5629 // The instruction to be updated is either Sub or MI.
5630 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5631 Sub = MI != nullptr ? MI : Sub;
5632 MachineBasicBlock *SubBB = Sub->getParent();
5633 // Move Movr0Inst to the appropriate place before Sub.
5634 if (Movr0Inst) {
5635 // Only move within the same block so we don't accidentally move to a
5636 // block with higher execution frequency.
5637 if (&CmpMBB != SubBB)
5638 return false;
5639 // Look backwards until we find a def that doesn't use the current EFLAGS.
5641 InsertE = Sub->getParent()->rend();
5642 for (; InsertI != InsertE; ++InsertI) {
5643 MachineInstr *Instr = &*InsertI;
5644 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5645 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
     // Re-insert the MOV32r0 immediately before that instruction.
5646 Movr0Inst->getParent()->remove(Movr0Inst);
5647 Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5648 Movr0Inst);
5649 break;
5650 }
5651 }
5652 if (InsertI == InsertE)
5653 return false;
5654 }
5655
5656 // Replace non-NF with NF instructions.
     // The NF form carries no EFLAGS def, so that operand is removed as well.
5657 for (auto &Inst : InstsToUpdate) {
5658 Inst.first->setDesc(get(Inst.second));
5659 Inst.first->removeOperand(
5660 Inst.first->findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5661 }
5662
5663 // Make sure Sub instruction defines EFLAGS and mark the def live.
5664 MachineOperand *FlagDef =
5665 Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
5666 assert(FlagDef && "Unable to locate a def EFLAGS operand");
5667 FlagDef->setIsDead(false);
5668
5669 CmpInstr.eraseFromParent();
5670
5671 // Modify the condition code of instructions in OpsToUpdate.
     // The condition code is written to the last operand listed in the
     // instruction's descriptor.
5672 for (auto &Op : OpsToUpdate) {
5673 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5674 .setImm(Op.second);
5675 }
5676 // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
5677 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5678 MBB = *MBB->pred_begin()) {
5679 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5680 if (!MBB->isLiveIn(X86::EFLAGS))
5681 MBB->addLiveIn(X86::EFLAGS);
5682 }
5683 return true;
5684}
5685
5686/// \returns true if the instruction can be changed to COPY when imm is 0.
5687static bool canConvert2Copy(unsigned Opc) {
5688 switch (Opc) {
5689 default:
5690 return false;
5691 CASE_ND(ADD64ri32)
5692 CASE_ND(SUB64ri32)
5693 CASE_ND(OR64ri32)
5694 CASE_ND(XOR64ri32)
5695 CASE_ND(ADD32ri)
5696 CASE_ND(SUB32ri)
5697 CASE_ND(OR32ri)
5698 CASE_ND(XOR32ri)
5699 return true;
5700 }
5701}
5702
5703/// Convert an ALUrr opcode to corresponding ALUri opcode. Such as
5704/// ADD32rr ==> ADD32ri
5705static unsigned convertALUrr2ALUri(unsigned Opc) {
5706 switch (Opc) {
5707 default:
5708 return 0;
5709#define FROM_TO(FROM, TO) \
5710 case X86::FROM: \
5711 return X86::TO; \
5712 case X86::FROM##_ND: \
5713 return X86::TO##_ND;
5714 FROM_TO(ADD64rr, ADD64ri32)
5715 FROM_TO(ADC64rr, ADC64ri32)
5716 FROM_TO(SUB64rr, SUB64ri32)
5717 FROM_TO(SBB64rr, SBB64ri32)
5718 FROM_TO(AND64rr, AND64ri32)
5719 FROM_TO(OR64rr, OR64ri32)
5720 FROM_TO(XOR64rr, XOR64ri32)
5721 FROM_TO(SHR64rCL, SHR64ri)
5722 FROM_TO(SHL64rCL, SHL64ri)
5723 FROM_TO(SAR64rCL, SAR64ri)
5724 FROM_TO(ROL64rCL, ROL64ri)
5725 FROM_TO(ROR64rCL, ROR64ri)
5726 FROM_TO(RCL64rCL, RCL64ri)
5727 FROM_TO(RCR64rCL, RCR64ri)
5728 FROM_TO(ADD32rr, ADD32ri)
5729 FROM_TO(ADC32rr, ADC32ri)
5730 FROM_TO(SUB32rr, SUB32ri)
5731 FROM_TO(SBB32rr, SBB32ri)
5732 FROM_TO(AND32rr, AND32ri)
5733 FROM_TO(OR32rr, OR32ri)
5734 FROM_TO(XOR32rr, XOR32ri)
5735 FROM_TO(SHR32rCL, SHR32ri)
5736 FROM_TO(SHL32rCL, SHL32ri)
5737 FROM_TO(SAR32rCL, SAR32ri)
5738 FROM_TO(ROL32rCL, ROL32ri)
5739 FROM_TO(ROR32rCL, ROR32ri)
5740 FROM_TO(RCL32rCL, RCL32ri)
5741 FROM_TO(RCR32rCL, RCR32ri)
5742#undef FROM_TO
5743#define FROM_TO(FROM, TO) \
5744 case X86::FROM: \
5745 return X86::TO;
5746 FROM_TO(TEST64rr, TEST64ri32)
5747 FROM_TO(CTEST64rr, CTEST64ri32)
5748 FROM_TO(CMP64rr, CMP64ri32)
5749 FROM_TO(CCMP64rr, CCMP64ri32)
5750 FROM_TO(TEST32rr, TEST32ri)
5751 FROM_TO(CTEST32rr, CTEST32ri)
5752 FROM_TO(CMP32rr, CMP32ri)
5753 FROM_TO(CCMP32rr, CCMP32ri)
5754#undef FROM_TO
5755 }
5756}
5757
5758/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5759/// If MakeChange is true, this function tries to replace Reg by ImmVal in
5760/// UseMI. If MakeChange is false, just check if folding is possible.
5761//
5762/// \returns true if folding is successful or possible.
///
/// Depending on UseMI's opcode one of the following rewrites is used:
///  - a COPY becomes a move-immediate (MOV32ri64 or MOV64ri for a 64-bit
///    destination, MOV32ri for 32-bit, MOV8ri for 8-bit), or MOV32r0 when a
///    zero is copied into a 32-bit register and EFLAGS is dead at UseMI;
///  - shifts and rotates by $cl take the count as an 8-bit immediate;
///  - everything else is mapped through convertALUrr2ALUri(), commuting UseMI
///    first when that is needed to bring Reg into the immediate position. An
///    ADD/SUB/OR/XOR with immediate 0 and a dead EFLAGS def becomes a COPY.
///
/// NOTE(review): in this listing the line declaring the MRI parameter, the
/// declaration of TRI, the statements following untieRegOperand(0), and the
/// statement guarded by the final `if` are not visible — confirm against the
/// original file.
5763bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5764 Register Reg, int64_t ImmVal,
5766 bool MakeChange) const {
     // Set when the MOV32r0 path has already rewritten UseMI.
5767 bool Modified = false;
5768
5769 // 64 bit operations accept sign extended 32 bit immediates.
5770 // 32 bit operations accept all 32 bit immediates, so we don't need to check
5771 // them.
5772 const TargetRegisterClass *RC = nullptr;
5773 if (Reg.isVirtual())
5774 RC = MRI->getRegClass(Reg);
5775 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5776 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5777 if (!isInt<32>(ImmVal))
5778 return false;
5779 }
5780
     // Uses of Reg through a sub-register index are not folded.
5781 if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg())
5782 return false;
5783 // Immediate has larger code size than register. So avoid folding the
5784 // immediate if it has more than 1 use and we are optimizing for size.
5785 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5786 !MRI->hasOneNonDBGUse(Reg))
5787 return false;
5788
5789 unsigned Opc = UseMI.getOpcode();
5790 unsigned NewOpc;
5791 if (Opc == TargetOpcode::COPY) {
     // Classify the copy's destination by register class to pick the
     // move-immediate opcode.
5792 Register ToReg = UseMI.getOperand(0).getReg();
5793 const TargetRegisterClass *RC = nullptr;
5794 if (ToReg.isVirtual())
5795 RC = MRI->getRegClass(ToReg);
5796 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5797 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5798 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5799 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5800 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5801 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5802
5803 if (ImmVal == 0) {
5804 // We have MOV32r0 only.
5805 if (!GR32Reg)
5806 return false;
5807 }
5808
5809 if (GR64Reg) {
5810 if (isUInt<32>(ImmVal))
5811 NewOpc = X86::MOV32ri64;
5812 else
5813 NewOpc = X86::MOV64ri;
5814 } else if (GR32Reg) {
5815 NewOpc = X86::MOV32ri;
5816 if (ImmVal == 0) {
5817 // MOV32r0 clobbers EFLAGS.
5819 if (UseMI.getParent()->computeRegisterLiveness(
5820 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5821 return false;
5822
5823 // MOV32r0 is different than other cases because it doesn't encode the
5824 // immediate in the instruction. So we directly modify it here.
5825 if (!MakeChange)
5826 return true;
5827 UseMI.setDesc(get(X86::MOV32r0));
5828 UseMI.removeOperand(
5829 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5830 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5831 /*isImp=*/true,
5832 /*isKill=*/false,
5833 /*isDead=*/true));
5834 Modified = true;
5835 }
5836 } else if (GR8Reg)
5837 NewOpc = X86::MOV8ri;
5838 else
5839 return false;
5840 } else
5841 NewOpc = convertALUrr2ALUri(Opc);
5842
     // convertALUrr2ALUri() returns 0 for opcodes it has no mapping for.
5843 if (!NewOpc)
5844 return false;
5845
5846 // For SUB instructions the immediate can only be the second source operand.
5847 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5848 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
5849 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
5850 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
5851 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2)
5852 return false;
5853 // For CMP instructions the immediate can only be at index 1.
5854 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
5855 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
5856 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1)
5857 return false;
5858
5859 using namespace X86;
5860 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
5861 isRCL(Opc) || isRCR(Opc)) {
     // Only a use of Reg at operand index 2 or later is folded, and the value
     // must fit in a signed 8-bit immediate.
5862 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr);
5863 if (RegIdx < 2)
5864 return false;
5865 if (!isInt<8>(ImmVal))
5866 return false;
5867 assert(Reg == X86::CL);
5868
5869 if (!MakeChange)
5870 return true;
5871 UseMI.setDesc(get(NewOpc));
5872 UseMI.removeOperand(RegIdx);
5873 UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
5874 // Reg is physical register $cl, so we don't know if DefMI is dead through
5875 // MRI. Let the caller handle it, or pass dead-mi-elimination can delete
5876 // the dead physical register define instruction.
5877 return true;
5878 }
5879
5880 if (!MakeChange)
5881 return true;
5882
5883 if (!Modified) {
5884 // Modify the instruction.
5885 if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
5886 UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) {
5887 // %100 = add %101, 0
5888 // ==>
5889 // %100 = COPY %101
5890 UseMI.setDesc(get(TargetOpcode::COPY));
5891 UseMI.removeOperand(
5892 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5893 UseMI.removeOperand(
5894 UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5895 UseMI.untieRegOperand(0);
5898 } else {
     // Default operand layout: a def at index 0, first source at index 1 and
     // the operand to be replaced by the immediate at index 2.
5899 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5900 unsigned ImmOpNum = 2;
5901 if (!UseMI.getOperand(0).isDef()) {
5902 Op1 = 0; // TEST, CMP, CTEST, CCMP
5903 ImmOpNum = 1;
5904 }
5905 if (Opc == TargetOpcode::COPY)
5906 ImmOpNum = 1;
     // If Reg sits in the first source position and UseMI is commutable,
     // commute so that Reg ends up in the immediate position.
5907 if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5908 UseMI.getOperand(Op1).getReg() == Reg)
5909 commuteInstruction(UseMI);
5910
5911 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5912 UseMI.setDesc(get(NewOpc));
5913 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5914 }
5915 }
5916
5917 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
5919
5920 return true;
5921}
5922
5923/// foldImmediate - 'Reg' is known to be defined by a move immediate
5924/// instruction, try to fold the immediate into the use instruction.
///
/// The constant is obtained with getConstValDefinedInReg(); if that fails the
/// function returns false. Otherwise the work is delegated to
/// foldImmediateImpl() with MakeChange set to true, and its result is returned.
///
/// NOTE(review): the line opening this signature (function name and leading
/// parameters) is not visible in this listing — confirm against the original
/// file.
5926 Register Reg, MachineRegisterInfo *MRI) const {
5927 int64_t ImmVal;
5928 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5929 return false;
5930
5931 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5932}
5933
5934/// Expand a single-def pseudo instruction to a two-addr
5935/// instruction with two undef reads of the register being defined.
5936/// This is used for mapping:
5937/// %xmm4 = V_SET0
5938/// to:
5939/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5940///
/// Desc must describe an instruction with exactly three operands (asserted).
/// Always returns true.
///
/// NOTE(review): the line opening this signature and the statement that adds
/// the two register operands are not visible in this listing — confirm against
/// the original file.
5942 const MCInstrDesc &Desc) {
5943 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5944 Register Reg = MIB.getReg(0);
5945 MIB->setDesc(Desc);
5946
5947 // MachineInstr::addOperand() will insert explicit operands before any
5948 // implicit operands.
5950 // But we don't trust that.
5951 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5952 return true;
5953}
5954
5955/// Expand a single-def pseudo instruction to a two-addr
5956/// instruction with two %k0 reads.
5957/// This is used for mapping:
5958/// %k4 = K_SET1
5959/// to:
5960/// %k4 = KXNORrr %k0, %k0
///
/// Desc must describe an instruction with exactly three operands (asserted).
/// Always returns true.
///
/// NOTE(review): the line opening this signature and the statement that adds
/// the two source operands are not visible in this listing — confirm against
/// the original file.
5962 Register Reg) {
5963 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5964 MIB->setDesc(Desc);
5966 return true;
5967}
5968
// Expand the pseudo held by MIB into two instructions: an XOR32rr of the
// destination register with itself (both reads marked undef), inserted before
// the pseudo, followed by the pseudo itself rewritten into DEC32r when MinusOne
// is true or INC32r otherwise. Always returns true.
//
// NOTE(review): the line opening this signature (function name and leading
// parameters) is not visible in this listing — confirm against the original
// file.
5970 bool MinusOne) {
5971 MachineBasicBlock &MBB = *MIB->getParent();
5972 const DebugLoc &DL = MIB->getDebugLoc();
5973 Register Reg = MIB.getReg(0);
5974
5975 // Insert the XOR.
5976 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5977 .addReg(Reg, RegState::Undef)
5978 .addReg(Reg, RegState::Undef);
5979
5980 // Turn the pseudo into an INC or DEC.
     // The added register operand is the source read by the INC/DEC.
5981 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5982 MIB.addReg(Reg);
5983
5984 return true;
5985}
5986
// Expand a MOV32ImmSExti8 / MOV64ImmSExti8 pseudo into a push of the immediate
// followed by a pop into the destination register. In 64-bit mode, when the
// function uses the red zone, the pseudo is instead turned into a plain
// MOV32ri / MOV64ri. When the function has no frame pointer and needs DWARF
// frame moves, CFA-offset adjustments are emitted around the pop. Always
// returns true.
//
// NOTE(review): the line opening this signature, the declaration of the
// iterator I, the initializer of X86FI and the statement following
// removeOperand(1) are not visible in this listing — confirm against the
// original file.
5988 const TargetInstrInfo &TII,
5989 const X86Subtarget &Subtarget) {
5990 MachineBasicBlock &MBB = *MIB->getParent();
5991 const DebugLoc &DL = MIB->getDebugLoc();
5992 int64_t Imm = MIB->getOperand(1).getImm();
5993 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
5995
     // Number of bytes pushed and popped; used for the CFI adjustments below.
5996 int StackAdjustment;
5997
5998 if (Subtarget.is64Bit()) {
5999 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
6000 MIB->getOpcode() == X86::MOV32ImmSExti8);
6001
6002 // Can't use push/pop lowering if the function might write to the red zone.
6003 X86MachineFunctionInfo *X86FI =
6005 if (X86FI->getUsesRedZone()) {
6006 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
6007 ? X86::MOV32ri
6008 : X86::MOV64ri));
6009 return true;
6010 }
6011
6012 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
6013 // widen the register if necessary.
6014 StackAdjustment = 8;
6015 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
6016 MIB->setDesc(TII.get(X86::POP64r));
6017 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
6018 } else {
6019 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
6020 StackAdjustment = 4;
6021 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
6022 MIB->setDesc(TII.get(X86::POP32r));
6023 }
     // The immediate now lives in the push; drop it from the pop.
6024 MIB->removeOperand(1);
6026
6027 // Build CFI if necessary.
6028 MachineFunction &MF = *MBB.getParent();
6029 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
6030 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
6031 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
6032 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
6033 if (EmitCFI) {
     // +StackAdjustment is inserted at I and -StackAdjustment at the position
     // after I.
6034 TFL->BuildCFI(
6035 MBB, I, DL,
6036 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
6037 TFL->BuildCFI(
6038 MBB, std::next(I), DL,
6039 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
6040 }
6041
6042 return true;
6043}
6044
6045// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
6046// code sequence is needed for other targets.
//
// Two MOV64rm loads are produced: a newly built RIP-relative load into Reg
// carrying the memory operand MMO, followed by the pseudo itself rewritten
// into a MOV64rm that loads through Reg (marked killed) back into Reg.
//
// NOTE(review): the line opening this signature, the construction of the
// memory operand (Flags / MMO), the declaration of the iterator I and one
// operand of the first load are not visible in this listing — confirm against
// the original file.
6048 const TargetInstrInfo &TII) {
6049 MachineBasicBlock &MBB = *MIB->getParent();
6050 const DebugLoc &DL = MIB->getDebugLoc();
6051 Register Reg = MIB.getReg(0);
     // The global to load comes from the pseudo's first memory operand.
6052 const GlobalValue *GV =
6053 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
6054 auto Flags = MachineMemOperand::MOLoad |
6058 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
6060
6061 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
6062 .addReg(X86::RIP)
6063 .addImm(1)
6064 .addReg(0)
6066 .addReg(0)
6067 .addMemOperand(MMO);
6068 MIB->setDebugLoc(DL);
6069 MIB->setDesc(TII.get(X86::MOV64rm));
6070 MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
6071}
6072
// Rewrites the pseudo held by MIB into XOR64rr (when its opcode is XOR64_FP)
// or XOR32rr (otherwise), and appends the function's frame register as an
// undef register operand. Always returns true.
//
// NOTE(review): the signature line of this function is not visible in this
// listing — confirm against the original file.
6074 MachineBasicBlock &MBB = *MIB->getParent();
6075 MachineFunction &MF = *MBB.getParent();
6076 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
6077 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
6078 unsigned XorOp =
6079 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
6080 MIB->setDesc(TII.get(XorOp));
6081 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
6082 return true;
6083}
6084
6085// This is used to handle spills for 128/256-bit registers when we have AVX512,
6086// but not VLX. If it uses an extended register we need to use an instruction
6087// that loads the lower 128/256-bit, but is available with only AVX512F.
//
// LoadDesc is used when the destination's encoding value is below 16;
// otherwise BroadcastDesc is used and the destination is replaced by the
// VR512 super-register that contains it at sub-register index SubIdx.
// Always returns true.
//
// NOTE(review): the line opening this signature is not visible in this
// listing — confirm against the original file.
6089 const TargetRegisterInfo *TRI,
6090 const MCInstrDesc &LoadDesc,
6091 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
6092 Register DestReg = MIB.getReg(0);
6093 // Check if DestReg is XMM16-31 or YMM16-31.
6094 if (TRI->getEncodingValue(DestReg) < 16) {
6095 // We can use a normal VEX encoded load.
6096 MIB->setDesc(LoadDesc);
6097 } else {
6098 // Use a 128/256-bit VBROADCAST instruction.
6099 MIB->setDesc(BroadcastDesc);
6100 // Change the destination to a 512-bit register.
6101 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6102 MIB->getOperand(0).setReg(DestReg);
6103 }
6104 return true;
6105}
6106
6107// This is used to handle spills for 128/256-bit registers when we have AVX512,
6108// but not VLX. If it uses an extended register we need to use an instruction
6109// that stores the lower 128/256-bit, but is available with only AVX512F.
//
// StoreDesc is used when the source's encoding value is below 16; otherwise
// ExtractDesc is used with the VR512 super-register that contains the source
// at sub-register index SubIdx, and an immediate 0 is appended to select the
// lowest lane. Always returns true.
//
// NOTE(review): the line opening this signature and the statement that
// installs the widened SrcReg on the instruction are not visible in this
// listing — confirm against the original file.
6111 const TargetRegisterInfo *TRI,
6112 const MCInstrDesc &StoreDesc,
6113 const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
     // The stored register follows the X86::AddrNumOperands address operands.
6114 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
6115 // Check if SrcReg is XMM16-31 or YMM16-31.
6116 if (TRI->getEncodingValue(SrcReg) < 16) {
6117 // We can use a normal VEX encoded store.
6118 MIB->setDesc(StoreDesc);
6119 } else {
6120 // Use a VEXTRACTF instruction.
6121 MIB->setDesc(ExtractDesc);
6122 // Change the source to a 512-bit register.
6123 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
6125 MIB.addImm(0x0); // Append immediate to extract from the lower bits.
6126 }
6127
6128 return true;
6129}
6130
// Body of the SHLDROT / SHRDROT pseudo expansion (called as expandSHXDROT from
// the pseudo-expansion switch): the instruction is re-described with Desc and
// operand 1 is duplicated as a second register source ahead of the shift
// amount. Always returns true.
6132 MIB->setDesc(Desc);
// Save the shift amount (operand 2) so it can be re-appended last.
6133 int64_t ShiftAmt = MIB->getOperand(2).getImm();
6134 // Temporarily remove the immediate so we can add another source register.
6135 MIB->removeOperand(2);
6136 // Add the register. Don't copy the kill flag if there is one.
// Only the undef state of operand 1 is carried over to the new operand.
6137 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
6138 // Add back the immediate.
6139 MIB.addImm(ShiftAmt);
6140 return true;
6141}
6142
// Expands a MOVSHPrm / MOVSHPmr pseudo by re-describing it as a scalar
// single-precision move. HasAVX selects between the VMOVSS* and MOVSS*
// opcodes; a register that compares greater than X86::XMM15 forces the
// VMOVSSZ* opcode instead. Always returns true.
6144 const TargetInstrInfo &TII, bool HasAVX) {
6145 unsigned NewOpc;
6146 if (MI.getOpcode() == X86::MOVSHPrm) {
// MOVSHPrm: the register inspected is operand 0.
6147 NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
6148 Register Reg = MI.getOperand(0).getReg();
6149 if (Reg > X86::XMM15)
6150 NewOpc = X86::VMOVSSZrm;
6151 } else {
// Any other opcode is handled as the mr form: the register inspected is
// operand 5.
6152 NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
6153 Register Reg = MI.getOperand(5).getReg();
6154 if (Reg > X86::XMM15)
6155 NewOpc = X86::VMOVSSZmr;
6156 }
6157
6158 MIB->setDesc(TII.get(NewOpc));
6159 return true;
6160}
6161
// Pseudo-instruction expansion switch. MI is rewritten in place (or replaced)
// according to its opcode. Cases that perform a full expansion return true;
// the ADD*_DB cases only swap the descriptor and then fall out of the switch,
// so they, like any opcode not listed, reach the final `return false`.
// NOTE(review): the signature line is not visible here; presumably this is
// X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) -- confirm upstream.
// NOTE(review): several cases use `TRI` without a visible declaration; the
// declaring lines appear to be missing from this listing.
6163 bool HasAVX = Subtarget.hasAVX();
6164 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
6165 switch (MI.getOpcode()) {
6166 case X86::MOV32r0:
6167 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
6168 case X86::MOV32r1:
6169 return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
6170 case X86::MOV32r_1:
6171 return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6172 case X86::MOV32ImmSExti8:
6173 case X86::MOV64ImmSExti8:
6174 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6175 case X86::SETB_C32r:
6176 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6177 case X86::SETB_C64r:
6178 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6179 case X86::MMX_SET0:
6180 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6181 case X86::V_SET0:
6182 case X86::FsFLD0SS:
6183 case X86::FsFLD0SD:
6184 case X86::FsFLD0SH:
6185 case X86::FsFLD0F128:
6186 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6187 case X86::AVX_SET0: {
6188 assert(HasAVX && "AVX not supported");
// Operate on the sub_xmm sub-register and mark the original register as an
// implicit def of the resulting VXORPSrr.
6190 Register SrcReg = MIB.getReg(0);
6191 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6192 MIB->getOperand(0).setReg(XReg);
6193 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6194 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6195 return true;
6196 }
6197 case X86::AVX512_128_SET0:
6198 case X86::AVX512_FsFLD0SH:
6199 case X86::AVX512_FsFLD0SS:
6200 case X86::AVX512_FsFLD0SD:
6201 case X86::AVX512_FsFLD0F128: {
6202 bool HasVLX = Subtarget.hasVLX();
6203 Register SrcReg = MIB.getReg(0);
// With VLX, or when the register's encoding value is below 16, a 128-bit XOR
// is used directly.
6205 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6206 return Expand2AddrUndef(MIB,
6207 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6208 // Extended register without VLX. Use a larger XOR.
6209 SrcReg =
6210 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6211 MIB->getOperand(0).setReg(SrcReg);
6212 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6213 }
6214 case X86::AVX512_256_SET0:
6215 case X86::AVX512_512_SET0: {
6216 bool HasVLX = Subtarget.hasVLX();
6217 Register SrcReg = MIB.getReg(0);
// Preferred form: XOR on the sub_xmm sub-register, with the original register
// added as an implicit def.
6219 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
6220 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6221 MIB->getOperand(0).setReg(XReg);
6222 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6223 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6224 return true;
6225 }
6226 if (MI.getOpcode() == X86::AVX512_256_SET0) {
6227 // No VLX so we must reference a zmm.
6228 MCRegister ZReg =
6229 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6230 MIB->getOperand(0).setReg(ZReg);
6231 }
6232 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6233 }
6234 case X86::MOVSHPmr:
6235 case X86::MOVSHPrm:
6236 return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
6237 case X86::V_SETALLONES:
6238 return Expand2AddrUndef(MIB,
6239 get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6240 case X86::AVX2_SETALLONES:
6241 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6242 case X86::AVX1_SETALLONES: {
6243 Register Reg = MIB.getReg(0);
6244 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6245 MIB->setDesc(get(X86::VCMPPSYrri));
6246 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6247 return true;
6248 }
6249 case X86::AVX512_512_SETALLONES: {
6250 Register Reg = MIB.getReg(0);
6251 MIB->setDesc(get(X86::VPTERNLOGDZrri));
6252 // VPTERNLOGD needs 3 register inputs and an immediate.
6253 // 0xff will return 1s for any input.
6254 MIB.addReg(Reg, RegState::Undef)
6255 .addReg(Reg, RegState::Undef)
6256 .addReg(Reg, RegState::Undef)
6257 .addImm(0xff);
6258 return true;
6259 }
6260 case X86::AVX512_512_SEXT_MASK_32:
6261 case X86::AVX512_512_SEXT_MASK_64: {
6262 Register Reg = MIB.getReg(0);
// Capture the mask register and its flags before operand 1 is removed, so
// the operand can be re-added in its new position below.
6263 Register MaskReg = MIB.getReg(1);
6264 unsigned MaskState = getRegState(MIB->getOperand(1));
6265 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6266 ? X86::VPTERNLOGQZrrikz
6267 : X86::VPTERNLOGDZrrikz;
6268 MI.removeOperand(1);
6269 MIB->setDesc(get(Opc));
6270 // VPTERNLOG needs 3 register inputs and an immediate.
6271 // 0xff will return 1s for any input.
6272 MIB.addReg(Reg, RegState::Undef)
6273 .addReg(MaskReg, MaskState)
6274 .addReg(Reg, RegState::Undef)
6275 .addReg(Reg, RegState::Undef)
6276 .addImm(0xff);
6277 return true;
6278 }
// *_NOVLX load/store pseudos: each passes the non-Z opcode plus a 512-bit
// broadcast (loads) or extract (stores) opcode to the NOVLX helper.
6279 case X86::VMOVAPSZ128rm_NOVLX:
6280 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6281 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6282 case X86::VMOVUPSZ128rm_NOVLX:
6283 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6284 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6285 case X86::VMOVAPSZ256rm_NOVLX:
6286 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6287 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6288 case X86::VMOVUPSZ256rm_NOVLX:
6289 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6290 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6291 case X86::VMOVAPSZ128mr_NOVLX:
6292 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6293 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6294 case X86::VMOVUPSZ128mr_NOVLX:
6295 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6296 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6297 case X86::VMOVAPSZ256mr_NOVLX:
6298 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6299 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6300 case X86::VMOVUPSZ256mr_NOVLX:
6301 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6302 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6303 case X86::MOV32ri64: {
// Re-describe as MOV32ri writing the sub_32bit sub-register of operand 0.
6304 Register Reg = MIB.getReg(0);
6305 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6306 MI.setDesc(get(X86::MOV32ri));
6307 MIB->getOperand(0).setReg(Reg32);
6309 return true;
6310 }
6311
6312 case X86::RDFLAGS32:
6313 case X86::RDFLAGS64: {
// Build a PUSHF at MI's position, then turn MI itself into the matching POP.
6314 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6315 MachineBasicBlock &MBB = *MIB->getParent();
6316
6317 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6318 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6319 .getInstr();
6320
6321 // Permit reads of the EFLAGS and DF registers without them being defined.
6322 // This intrinsic exists to read external processor state in flags, such as
6323 // the trap flag, interrupt flag, and direction flag, none of which are
6324 // modeled by the backend.
6325 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6326 "Unexpected register in operand! Should be EFLAGS.");
6327 NewMI->getOperand(2).setIsUndef();
6328 assert(NewMI->getOperand(3).getReg() == X86::DF &&
6329 "Unexpected register in operand! Should be DF.");
6330 NewMI->getOperand(3).setIsUndef();
6331
6332 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6333 return true;
6334 }
6335
6336 case X86::WRFLAGS32:
6337 case X86::WRFLAGS64: {
// Build a PUSH of operand 0's register and a POPF at MI's position, then
// erase the pseudo itself.
6338 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6339 MachineBasicBlock &MBB = *MIB->getParent();
6340
6341 BuildMI(MBB, MI, MIB->getDebugLoc(),
6342 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6343 .addReg(MI.getOperand(0).getReg());
6344 BuildMI(MBB, MI, MIB->getDebugLoc(),
6345 get(Is64Bit ? X86::POPF64 : X86::POPF32));
6346 MI.eraseFromParent();
6347 return true;
6348 }
6349
6350 // KNL does not recognize dependency-breaking idioms for mask registers,
6351 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6352 // Using %k0 as the undef input register is a performance heuristic based
6353 // on the assumption that %k0 is used less frequently than the other mask
6354 // registers, since it is not usable as a write mask.
6355 // FIXME: A more advanced approach would be to choose the best input mask
6356 // register based on context.
6357 case X86::KSET0W:
6358 return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
6359 case X86::KSET0D:
6360 return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
6361 case X86::KSET0Q:
6362 return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
6363 case X86::KSET1W:
6364 return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
6365 case X86::KSET1D:
6366 return Expand2AddrKreg(MIB, get(X86::KXNORDkk), X86::K0);
6367 case X86::KSET1Q:
6368 return Expand2AddrKreg(MIB, get(X86::KXNORQkk), X86::K0);
6369 case TargetOpcode::LOAD_STACK_GUARD:
6370 expandLoadStackGuard(MIB, *this);
6371 return true;
6372 case X86::XOR64_FP:
6373 case X86::XOR32_FP:
6374 return expandXorFP(MIB, *this);
6375 case X86::SHLDROT32ri:
6376 return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6377 case X86::SHLDROT64ri:
6378 return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6379 case X86::SHRDROT32ri:
6380 return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6381 case X86::SHRDROT64ri:
6382 return expandSHXDROT(MIB, get(X86::SHRD64rri8));
// ADD*_DB pseudos are re-described as the OR of the same width and form.
// These cases break rather than return, so the function returns false for
// them.
6383 case X86::ADD8rr_DB:
6384 MIB->setDesc(get(X86::OR8rr));
6385 break;
6386 case X86::ADD16rr_DB:
6387 MIB->setDesc(get(X86::OR16rr));
6388 break;
6389 case X86::ADD32rr_DB:
6390 MIB->setDesc(get(X86::OR32rr));
6391 break;
6392 case X86::ADD64rr_DB:
6393 MIB->setDesc(get(X86::OR64rr));
6394 break;
6395 case X86::ADD8ri_DB:
6396 MIB->setDesc(get(X86::OR8ri));
6397 break;
6398 case X86::ADD16ri_DB:
6399 MIB->setDesc(get(X86::OR16ri));
6400 break;
6401 case X86::ADD32ri_DB:
6402 MIB->setDesc(get(X86::OR32ri));
6403 break;
6404 case X86::ADD64ri32_DB:
6405 MIB->setDesc(get(X86::OR64ri32));
6406 break;
6407 }
6408 return false;
6409}
6410
6411/// Return true for all instructions that only update
6412/// the first 32 or 64-bits of the destination register and leave the rest
6413/// unmodified. This can be used to avoid folding loads if the instructions
6414/// only update part of the destination register, and the non-updated part is
6415/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6416/// instructions breaks the partial register dependency and it can improve
6417/// performance. e.g.:
6418///
6419/// movss (%rdi), %xmm0
6420/// cvtss2sd %xmm0, %xmm0
6421///
6422/// Instead of
6423/// cvtss2sd (%rdi), %xmm0
6424///
6425/// FIXME: This should be turned into a TSFlags.
6426///
/// \param Opcode      Opcode to classify.
/// \param Subtarget   Queried for the per-family has*FalseDeps() flags that
///                    gate several of the opcode groups below.
/// \param ForLoadFold When true, the CVTSI2SS/CVTSI2SD group reports false
///                    instead of true; no other group consults it.
/// \returns false for any opcode not listed in the switch.
6427static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6428 bool ForLoadFold = false) {
6429 switch (Opcode) {
6430 case X86::CVTSI2SSrr:
6431 case X86::CVTSI2SSrm:
6432 case X86::CVTSI642SSrr:
6433 case X86::CVTSI642SSrm:
6434 case X86::CVTSI2SDrr:
6435 case X86::CVTSI2SDrm:
6436 case X86::CVTSI642SDrr:
6437 case X86::CVTSI642SDrm:
6438 // Load folding won't affect the undef register update since the input is
6439 // a GPR.
6440 return !ForLoadFold;
6441 case X86::CVTSD2SSrr:
6442 case X86::CVTSD2SSrm:
6443 case X86::CVTSS2SDrr:
6444 case X86::CVTSS2SDrm:
6445 case X86::MOVHPDrm:
6446 case X86::MOVHPSrm:
6447 case X86::MOVLPDrm:
6448 case X86::MOVLPSrm:
6449 case X86::RCPSSr:
6450 case X86::RCPSSm:
6451 case X86::RCPSSr_Int:
6452 case X86::RCPSSm_Int:
6453 case X86::ROUNDSDri:
6454 case X86::ROUNDSDmi:
6455 case X86::ROUNDSSri:
6456 case X86::ROUNDSSmi:
6457 case X86::RSQRTSSr:
6458 case X86::RSQRTSSm:
6459 case X86::RSQRTSSr_Int:
6460 case X86::RSQRTSSm_Int:
6461 case X86::SQRTSSr:
6462 case X86::SQRTSSm:
6463 case X86::SQRTSSr_Int:
6464 case X86::SQRTSSm_Int:
6465 case X86::SQRTSDr:
6466 case X86::SQRTSDm:
6467 case X86::SQRTSDr_Int:
6468 case X86::SQRTSDm_Int:
6469 return true;
// VFCMULC* / VFMULC* opcodes: reported only when the subtarget sets
// hasMULCFalseDeps().
6470 case X86::VFCMULCPHZ128rm:
6471 case X86::VFCMULCPHZ128rmb:
6472 case X86::VFCMULCPHZ128rmbkz:
6473 case X86::VFCMULCPHZ128rmkz:
6474 case X86::VFCMULCPHZ128rr:
6475 case X86::VFCMULCPHZ128rrkz:
6476 case X86::VFCMULCPHZ256rm:
6477 case X86::VFCMULCPHZ256rmb:
6478 case X86::VFCMULCPHZ256rmbkz:
6479 case X86::VFCMULCPHZ256rmkz:
6480 case X86::VFCMULCPHZ256rr:
6481 case X86::VFCMULCPHZ256rrkz:
6482 case X86::VFCMULCPHZrm:
6483 case X86::VFCMULCPHZrmb:
6484 case X86::VFCMULCPHZrmbkz:
6485 case X86::VFCMULCPHZrmkz:
6486 case X86::VFCMULCPHZrr:
6487 case X86::VFCMULCPHZrrb:
6488 case X86::VFCMULCPHZrrbkz:
6489 case X86::VFCMULCPHZrrkz:
6490 case X86::VFMULCPHZ128rm:
6491 case X86::VFMULCPHZ128rmb:
6492 case X86::VFMULCPHZ128rmbkz:
6493 case X86::VFMULCPHZ128rmkz:
6494 case X86::VFMULCPHZ128rr:
6495 case X86::VFMULCPHZ128rrkz:
6496 case X86::VFMULCPHZ256rm:
6497 case X86::VFMULCPHZ256rmb:
6498 case X86::VFMULCPHZ256rmbkz:
6499 case X86::VFMULCPHZ256rmkz:
6500 case X86::VFMULCPHZ256rr:
6501 case X86::VFMULCPHZ256rrkz:
6502 case X86::VFMULCPHZrm:
6503 case X86::VFMULCPHZrmb:
6504 case X86::VFMULCPHZrmbkz:
6505 case X86::VFMULCPHZrmkz:
6506 case X86::VFMULCPHZrr:
6507 case X86::VFMULCPHZrrb:
6508 case X86::VFMULCPHZrrbkz:
6509 case X86::VFMULCPHZrrkz:
6510 case X86::VFCMULCSHZrm:
6511 case X86::VFCMULCSHZrmkz:
6512 case X86::VFCMULCSHZrr:
6513 case X86::VFCMULCSHZrrb:
6514 case X86::VFCMULCSHZrrbkz:
6515 case X86::VFCMULCSHZrrkz:
6516 case X86::VFMULCSHZrm:
6517 case X86::VFMULCSHZrmkz:
6518 case X86::VFMULCSHZrr:
6519 case X86::VFMULCSHZrrb:
6520 case X86::VFMULCSHZrrbkz:
6521 case X86::VFMULCSHZrrkz:
6522 return Subtarget.hasMULCFalseDeps();
// VPERMD / VPERMQ / VPERMPS / VPERMPD opcodes: reported only when the
// subtarget sets hasPERMFalseDeps().
6523 case X86::VPERMDYrm:
6524 case X86::VPERMDYrr:
6525 case X86::VPERMQYmi:
6526 case X86::VPERMQYri:
6527 case X86::VPERMPSYrm:
6528 case X86::VPERMPSYrr:
6529 case X86::VPERMPDYmi:
6530 case X86::VPERMPDYri:
6531 case X86::VPERMDZ256rm:
6532 case X86::VPERMDZ256rmb:
6533 case X86::VPERMDZ256rmbkz:
6534 case X86::VPERMDZ256rmkz:
6535 case X86::VPERMDZ256rr:
6536 case X86::VPERMDZ256rrkz:
6537 case X86::VPERMDZrm:
6538 case X86::VPERMDZrmb:
6539 case X86::VPERMDZrmbkz:
6540 case X86::VPERMDZrmkz:
6541 case X86::VPERMDZrr:
6542 case X86::VPERMDZrrkz:
6543 case X86::VPERMQZ256mbi:
6544 case X86::VPERMQZ256mbikz:
6545 case X86::VPERMQZ256mi:
6546 case X86::VPERMQZ256mikz:
6547 case X86::VPERMQZ256ri:
6548 case X86::VPERMQZ256rikz:
6549 case X86::VPERMQZ256rm:
6550 case X86::VPERMQZ256rmb:
6551 case X86::VPERMQZ256rmbkz:
6552 case X86::VPERMQZ256rmkz:
6553 case X86::VPERMQZ256rr:
6554 case X86::VPERMQZ256rrkz:
6555 case X86::VPERMQZmbi:
6556 case X86::VPERMQZmbikz:
6557 case X86::VPERMQZmi:
6558 case X86::VPERMQZmikz:
6559 case X86::VPERMQZri:
6560 case X86::VPERMQZrikz:
6561 case X86::VPERMQZrm:
6562 case X86::VPERMQZrmb:
6563 case X86::VPERMQZrmbkz:
6564 case X86::VPERMQZrmkz:
6565 case X86::VPERMQZrr:
6566 case X86::VPERMQZrrkz:
6567 case X86::VPERMPSZ256rm:
6568 case X86::VPERMPSZ256rmb:
6569 case X86::VPERMPSZ256rmbkz:
6570 case X86::VPERMPSZ256rmkz:
6571 case X86::VPERMPSZ256rr:
6572 case X86::VPERMPSZ256rrkz:
6573 case X86::VPERMPSZrm:
6574 case X86::VPERMPSZrmb:
6575 case X86::VPERMPSZrmbkz:
6576 case X86::VPERMPSZrmkz:
6577 case X86::VPERMPSZrr:
6578 case X86::VPERMPSZrrkz:
6579 case X86::VPERMPDZ256mbi:
6580 case X86::VPERMPDZ256mbikz:
6581 case X86::VPERMPDZ256mi:
6582 case X86::VPERMPDZ256mikz:
6583 case X86::VPERMPDZ256ri:
6584 case X86::VPERMPDZ256rikz:
6585 case X86::VPERMPDZ256rm:
6586 case X86::VPERMPDZ256rmb:
6587 case X86::VPERMPDZ256rmbkz:
6588 case X86::VPERMPDZ256rmkz:
6589 case X86::VPERMPDZ256rr:
6590 case X86::VPERMPDZ256rrkz:
6591 case X86::VPERMPDZmbi:
6592 case X86::VPERMPDZmbikz:
6593 case X86::VPERMPDZmi:
6594 case X86::VPERMPDZmikz:
6595 case X86::VPERMPDZri:
6596 case X86::VPERMPDZrikz:
6597 case X86::VPERMPDZrm:
6598 case X86::VPERMPDZrmb:
6599 case X86::VPERMPDZrmbkz:
6600 case X86::VPERMPDZrmkz:
6601 case X86::VPERMPDZrr:
6602 case X86::VPERMPDZrrkz:
6603 return Subtarget.hasPERMFalseDeps();
// VRANGE* opcodes: reported only when the subtarget sets
// hasRANGEFalseDeps().
6604 case X86::VRANGEPDZ128rmbi:
6605 case X86::VRANGEPDZ128rmbikz:
6606 case X86::VRANGEPDZ128rmi:
6607 case X86::VRANGEPDZ128rmikz:
6608 case X86::VRANGEPDZ128rri:
6609 case X86::VRANGEPDZ128rrikz:
6610 case X86::VRANGEPDZ256rmbi:
6611 case X86::VRANGEPDZ256rmbikz:
6612 case X86::VRANGEPDZ256rmi:
6613 case X86::VRANGEPDZ256rmikz:
6614 case X86::VRANGEPDZ256rri:
6615 case X86::VRANGEPDZ256rrikz:
6616 case X86::VRANGEPDZrmbi:
6617 case X86::VRANGEPDZrmbikz:
6618 case X86::VRANGEPDZrmi:
6619 case X86::VRANGEPDZrmikz:
6620 case X86::VRANGEPDZrri:
6621 case X86::VRANGEPDZrrib:
6622 case X86::VRANGEPDZrribkz:
6623 case X86::VRANGEPDZrrikz:
6624 case X86::VRANGEPSZ128rmbi:
6625 case X86::VRANGEPSZ128rmbikz:
6626 case X86::VRANGEPSZ128rmi:
6627 case X86::VRANGEPSZ128rmikz:
6628 case X86::VRANGEPSZ128rri:
6629 case X86::VRANGEPSZ128rrikz:
6630 case X86::VRANGEPSZ256rmbi:
6631 case X86::VRANGEPSZ256rmbikz:
6632 case X86::VRANGEPSZ256rmi:
6633 case X86::VRANGEPSZ256rmikz:
6634 case X86::VRANGEPSZ256rri:
6635 case X86::VRANGEPSZ256rrikz:
6636 case X86::VRANGEPSZrmbi:
6637 case X86::VRANGEPSZrmbikz:
6638 case X86::VRANGEPSZrmi:
6639 case X86::VRANGEPSZrmikz:
6640 case X86::VRANGEPSZrri:
6641 case X86::VRANGEPSZrrib:
6642 case X86::VRANGEPSZrribkz:
6643 case X86::VRANGEPSZrrikz:
6644 case X86::VRANGESDZrmi:
6645 case X86::VRANGESDZrmikz:
6646 case X86::VRANGESDZrri:
6647 case X86::VRANGESDZrrib:
6648 case X86::VRANGESDZrribkz:
6649 case X86::VRANGESDZrrikz:
6650 case X86::VRANGESSZrmi:
6651 case X86::VRANGESSZrmikz:
6652 case X86::VRANGESSZrri:
6653 case X86::VRANGESSZrrib:
6654 case X86::VRANGESSZrribkz:
6655 case X86::VRANGESSZrrikz:
6656 return Subtarget.hasRANGEFalseDeps();
// VGETMANT* opcodes: reported only when the subtarget sets
// hasGETMANTFalseDeps().
6657 case X86::VGETMANTSSZrmi:
6658 case X86::VGETMANTSSZrmikz:
6659 case X86::VGETMANTSSZrri:
6660 case X86::VGETMANTSSZrrib:
6661 case X86::VGETMANTSSZrribkz:
6662 case X86::VGETMANTSSZrrikz:
6663 case X86::VGETMANTSDZrmi:
6664 case X86::VGETMANTSDZrmikz:
6665 case X86::VGETMANTSDZrri:
6666 case X86::VGETMANTSDZrrib:
6667 case X86::VGETMANTSDZrribkz:
6668 case X86::VGETMANTSDZrrikz:
6669 case X86::VGETMANTSHZrmi:
6670 case X86::VGETMANTSHZrmikz:
6671 case X86::VGETMANTSHZrri:
6672 case X86::VGETMANTSHZrrib:
6673 case X86::VGETMANTSHZrribkz:
6674 case X86::VGETMANTSHZrrikz:
6675 case X86::VGETMANTPSZ128rmbi:
6676 case X86::VGETMANTPSZ128rmbikz:
6677 case X86::VGETMANTPSZ128rmi:
6678 case X86::VGETMANTPSZ128rmikz:
6679 case X86::VGETMANTPSZ256rmbi:
6680 case X86::VGETMANTPSZ256rmbikz:
6681 case X86::VGETMANTPSZ256rmi:
6682 case X86::VGETMANTPSZ256rmikz:
6683 case X86::VGETMANTPSZrmbi:
6684 case X86::VGETMANTPSZrmbikz:
6685 case X86::VGETMANTPSZrmi:
6686 case X86::VGETMANTPSZrmikz:
6687 case X86::VGETMANTPDZ128rmbi:
6688 case X86::VGETMANTPDZ128rmbikz:
6689 case X86::VGETMANTPDZ128rmi:
6690 case X86::VGETMANTPDZ128rmikz:
6691 case X86::VGETMANTPDZ256rmbi:
6692 case X86::VGETMANTPDZ256rmbikz:
6693 case X86::VGETMANTPDZ256rmi:
6694 case X86::VGETMANTPDZ256rmikz:
6695 case X86::VGETMANTPDZrmbi:
6696 case X86::VGETMANTPDZrmbikz:
6697 case X86::VGETMANTPDZrmi:
6698 case X86::VGETMANTPDZrmikz:
6699 return Subtarget.hasGETMANTFalseDeps();
// VPMULLQ opcodes: reported only when the subtarget sets
// hasMULLQFalseDeps().
6700 case X86::VPMULLQZ128rm:
6701 case X86::VPMULLQZ128rmb:
6702 case X86::VPMULLQZ128rmbkz:
6703 case X86::VPMULLQZ128rmkz:
6704 case X86::VPMULLQZ128rr:
6705 case X86::VPMULLQZ128rrkz:
6706 case X86::VPMULLQZ256rm:
6707 case X86::VPMULLQZ256rmb:
6708 case X86::VPMULLQZ256rmbkz:
6709 case X86::VPMULLQZ256rmkz:
6710 case X86::VPMULLQZ256rr:
6711 case X86::VPMULLQZ256rrkz:
6712 case X86::VPMULLQZrm:
6713 case X86::VPMULLQZrmb:
6714 case X86::VPMULLQZrmbkz:
6715 case X86::VPMULLQZrmkz:
6716 case X86::VPMULLQZrr:
6717 case X86::VPMULLQZrrkz:
6718 return Subtarget.hasMULLQFalseDeps();
6719 // GPR
6720 case X86::POPCNT32rm:
6721 case X86::POPCNT32rr:
6722 case X86::POPCNT64rm:
6723 case X86::POPCNT64rr:
6724 return Subtarget.hasPOPCNTFalseDeps();
// LZCNT and TZCNT share the hasLZCNTFalseDeps() flag.
6725 case X86::LZCNT32rm:
6726 case X86::LZCNT32rr:
6727 case X86::LZCNT64rm:
6728 case X86::LZCNT64rr:
6729 case X86::TZCNT32rm:
6730 case X86::TZCNT32rr:
6731 case X86::TZCNT64rm:
6732 case X86::TZCNT64rr:
6733 return Subtarget.hasLZCNTFalseDeps();
6734 }
6735
6736 return false;
6737}
6738
6739/// Inform the BreakFalseDeps pass how many idle
6740/// instructions we would like before a partial register update.
///
/// Returns 0 when OpNum is not 0, when MI is neither an NDD partial write nor
/// an opcode accepted by hasPartialRegUpdate, or when the "result register is
/// also read" check below does not match the NDD case.
/// NOTE(review): the signature line and the final return statement are not
/// visible in this listing; presumably the function ends by returning the
/// PartialRegUpdateClearance option -- confirm against the upstream file.
6742 const MachineInstr &MI, unsigned OpNum,
6743 const TargetRegisterInfo *TRI) const {
6744
// Only operand 0 is considered.
6745 if (OpNum != 0)
6746 return 0;
6747
6748 // NDD ops with 8/16b results may appear to be partial register
6749 // updates after register allocation.
6750 bool HasNDDPartialWrite = false;
6751 if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6752 Register Reg = MI.getOperand(0).getReg();
// The register-class test is applied to non-virtual registers only.
6753 if (!Reg.isVirtual())
6754 HasNDDPartialWrite =
6755 X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6756 }
6757
6758 if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
6759 return 0;
6760
6761 // Check if the result register is also used as a source.
6762 // For non-NDD ops, this means a partial update is wanted, hence we return 0.
6763 // For NDD ops, this means it is possible to compress the instruction
6764 // to a legacy form in CompressEVEX, which would create an unwanted partial
6765 // update, so we return the clearance.
6766 const MachineOperand &MO = MI.getOperand(0);
6767 Register Reg = MO.getReg();
6768 bool ReadsReg = false;
6769 if (Reg.isVirtual())
6770 ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6771 else
6772 ReadsReg = MI.readsRegister(Reg, TRI);
// Continue only if (non-NDD and not read) or (NDD partial write and read).
6773 if (ReadsReg != HasNDDPartialWrite)
6774 return 0;
6775
6776 // If any instructions in the clearance range are reading Reg, insert a
6777 // dependency breaking instruction, which is inexpensive and is likely to
6778 // be hidden in other instruction's cycles.
6780}
6781
6782// Return true for any instruction that copies the high bits of the first source
6783// operand into the unused high bits of the destination operand.
6784// Also returns true for instructions that have two inputs where one may
6785// be undef and we want it to use the same register as the other input.
//
// Opcode is the opcode to classify and OpNum the operand index being asked
// about; each group below accepts only specific operand indices. When
// ForLoadFold is true, every group except the one ending in `return OpNum == 1;`
// reports false. Opcodes not listed report false.
6786static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6787 bool ForLoadFold = false) {
6788 // Set the OpNum parameter to the first source operand.
6789 switch (Opcode) {
6790 case X86::MMX_PUNPCKHBWrr:
6791 case X86::MMX_PUNPCKHWDrr:
6792 case X86::MMX_PUNPCKHDQrr:
6793 case X86::MMX_PUNPCKLBWrr:
6794 case X86::MMX_PUNPCKLWDrr:
6795 case X86::MMX_PUNPCKLDQrr:
6796 case X86::MOVHLPSrr:
6797 case X86::PACKSSWBrr:
6798 case X86::PACKUSWBrr:
6799 case X86::PACKSSDWrr:
6800 case X86::PACKUSDWrr:
6801 case X86::PUNPCKHBWrr:
6802 case X86::PUNPCKLBWrr:
6803 case X86::PUNPCKHWDrr:
6804 case X86::PUNPCKLWDrr:
6805 case X86::PUNPCKHDQrr:
6806 case X86::PUNPCKLDQrr:
6807 case X86::PUNPCKHQDQrr:
6808 case X86::PUNPCKLQDQrr:
6809 case X86::SHUFPDrri:
6810 case X86::SHUFPSrri:
6811 // These instructions are sometimes used with an undef first or second
6812 // source. Return true here so BreakFalseDeps will assign this source to the
6813 // same register as the first source to avoid a false dependency.
6814 // Operand 1 of these instructions is tied so they're separate from their
6815 // VEX counterparts.
6816 return OpNum == 2 && !ForLoadFold;
6817
6818 case X86::VMOVLHPSrr:
6819 case X86::VMOVLHPSZrr:
6820 case X86::VPACKSSWBrr:
6821 case X86::VPACKUSWBrr:
6822 case X86::VPACKSSDWrr:
6823 case X86::VPACKUSDWrr:
6824 case X86::VPACKSSWBZ128rr:
6825 case X86::VPACKUSWBZ128rr:
6826 case X86::VPACKSSDWZ128rr:
6827 case X86::VPACKUSDWZ128rr:
6828 case X86::VPERM2F128rri:
6829 case X86::VPERM2I128rri:
6830 case X86::VSHUFF32X4Z256rri:
6831 case X86::VSHUFF32X4Zrri:
6832 case X86::VSHUFF64X2Z256rri:
6833 case X86::VSHUFF64X2Zrri:
6834 case X86::VSHUFI32X4Z256rri:
6835 case X86::VSHUFI32X4Zrri:
6836 case X86::VSHUFI64X2Z256rri:
6837 case X86::VSHUFI64X2Zrri:
6838 case X86::VPUNPCKHBWrr:
6839 case X86::VPUNPCKLBWrr:
6840 case X86::VPUNPCKHBWYrr:
6841 case X86::VPUNPCKLBWYrr:
6842 case X86::VPUNPCKHBWZ128rr:
6843 case X86::VPUNPCKLBWZ128rr:
6844 case X86::VPUNPCKHBWZ256rr:
6845 case X86::VPUNPCKLBWZ256rr:
6846 case X86::VPUNPCKHBWZrr:
6847 case X86::VPUNPCKLBWZrr:
6848 case X86::VPUNPCKHWDrr:
6849 case X86::VPUNPCKLWDrr:
6850 case X86::VPUNPCKHWDYrr:
6851 case X86::VPUNPCKLWDYrr:
6852 case X86::VPUNPCKHWDZ128rr:
6853 case X86::VPUNPCKLWDZ128rr:
6854 case X86::VPUNPCKHWDZ256rr:
6855 case X86::VPUNPCKLWDZ256rr:
6856 case X86::VPUNPCKHWDZrr:
6857 case X86::VPUNPCKLWDZrr:
6858 case X86::VPUNPCKHDQrr:
6859 case X86::VPUNPCKLDQrr:
6860 case X86::VPUNPCKHDQYrr:
6861 case X86::VPUNPCKLDQYrr:
6862 case X86::VPUNPCKHDQZ128rr:
6863 case X86::VPUNPCKLDQZ128rr:
6864 case X86::VPUNPCKHDQZ256rr:
6865 case X86::VPUNPCKLDQZ256rr:
6866 case X86::VPUNPCKHDQZrr:
6867 case X86::VPUNPCKLDQZrr:
6868 case X86::VPUNPCKHQDQrr:
6869 case X86::VPUNPCKLQDQrr:
6870 case X86::VPUNPCKHQDQYrr:
6871 case X86::VPUNPCKLQDQYrr:
6872 case X86::VPUNPCKHQDQZ128rr:
6873 case X86::VPUNPCKLQDQZ128rr:
6874 case X86::VPUNPCKHQDQZ256rr:
6875 case X86::VPUNPCKLQDQZ256rr:
6876 case X86::VPUNPCKHQDQZrr:
6877 case X86::VPUNPCKLQDQZrr:
6878 // These instructions are sometimes used with an undef first or second
6879 // source. Return true here so BreakFalseDeps will assign this source to the
6880 // same register as the first source to avoid a false dependency.
6881 return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6882
6883 case X86::VCVTSI2SSrr:
6884 case X86::VCVTSI2SSrm:
6885 case X86::VCVTSI2SSrr_Int:
6886 case X86::VCVTSI2SSrm_Int:
6887 case X86::VCVTSI642SSrr:
6888 case X86::VCVTSI642SSrm:
6889 case X86::VCVTSI642SSrr_Int:
6890 case X86::VCVTSI642SSrm_Int:
6891 case X86::VCVTSI2SDrr:
6892 case X86::VCVTSI2SDrm:
6893 case X86::VCVTSI2SDrr_Int:
6894 case X86::VCVTSI2SDrm_Int:
6895 case X86::VCVTSI642SDrr:
6896 case X86::VCVTSI642SDrm:
6897 case X86::VCVTSI642SDrr_Int:
6898 case X86::VCVTSI642SDrm_Int:
6899 // AVX-512
6900 case X86::VCVTSI2SSZrr:
6901 case X86::VCVTSI2SSZrm:
6902 case X86::VCVTSI2SSZrr_Int:
6903 case X86::VCVTSI2SSZrrb_Int:
6904 case X86::VCVTSI2SSZrm_Int:
6905 case X86::VCVTSI642SSZrr:
6906 case X86::VCVTSI642SSZrm:
6907 case X86::VCVTSI642SSZrr_Int:
6908 case X86::VCVTSI642SSZrrb_Int:
6909 case X86::VCVTSI642SSZrm_Int:
6910 case X86::VCVTSI2SDZrr:
6911 case X86::VCVTSI2SDZrm:
6912 case X86::VCVTSI2SDZrr_Int:
6913 case X86::VCVTSI2SDZrm_Int:
6914 case X86::VCVTSI642SDZrr:
6915 case X86::VCVTSI642SDZrm:
6916 case X86::VCVTSI642SDZrr_Int:
6917 case X86::VCVTSI642SDZrrb_Int:
6918 case X86::VCVTSI642SDZrm_Int:
6919 case X86::VCVTUSI2SSZrr:
6920 case X86::VCVTUSI2SSZrm:
6921 case X86::VCVTUSI2SSZrr_Int:
6922 case X86::VCVTUSI2SSZrrb_Int:
6923 case X86::VCVTUSI2SSZrm_Int:
6924 case X86::VCVTUSI642SSZrr:
6925 case X86::VCVTUSI642SSZrm:
6926 case X86::VCVTUSI642SSZrr_Int:
6927 case X86::VCVTUSI642SSZrrb_Int:
6928 case X86::VCVTUSI642SSZrm_Int:
6929 case X86::VCVTUSI2SDZrr:
6930 case X86::VCVTUSI2SDZrm:
6931 case X86::VCVTUSI2SDZrr_Int:
6932 case X86::VCVTUSI2SDZrm_Int:
6933 case X86::VCVTUSI642SDZrr:
6934 case X86::VCVTUSI642SDZrm:
6935 case X86::VCVTUSI642SDZrr_Int:
6936 case X86::VCVTUSI642SDZrrb_Int:
6937 case X86::VCVTUSI642SDZrm_Int:
6938 case X86::VCVTSI2SHZrr:
6939 case X86::VCVTSI2SHZrm:
6940 case X86::VCVTSI2SHZrr_Int:
6941 case X86::VCVTSI2SHZrrb_Int:
6942 case X86::VCVTSI2SHZrm_Int:
6943 case X86::VCVTSI642SHZrr:
6944 case X86::VCVTSI642SHZrm:
6945 case X86::VCVTSI642SHZrr_Int:
6946 case X86::VCVTSI642SHZrrb_Int:
6947 case X86::VCVTSI642SHZrm_Int:
6948 case X86::VCVTUSI2SHZrr:
6949 case X86::VCVTUSI2SHZrm:
6950 case X86::VCVTUSI2SHZrr_Int:
6951 case X86::VCVTUSI2SHZrrb_Int:
6952 case X86::VCVTUSI2SHZrm_Int:
6953 case X86::VCVTUSI642SHZrr:
6954 case X86::VCVTUSI642SHZrm:
6955 case X86::VCVTUSI642SHZrr_Int:
6956 case X86::VCVTUSI642SHZrrb_Int:
6957 case X86::VCVTUSI642SHZrm_Int:
6958 // Load folding won't affect the undef register update since the input is
6959 // a GPR.
6960 return OpNum == 1 && !ForLoadFold;
6961 case X86::VCVTSD2SSrr:
6962 case X86::VCVTSD2SSrm:
6963 case X86::VCVTSD2SSrr_Int:
6964 case X86::VCVTSD2SSrm_Int:
6965 case X86::VCVTSS2SDrr:
6966 case X86::VCVTSS2SDrm:
6967 case X86::VCVTSS2SDrr_Int:
6968 case X86::VCVTSS2SDrm_Int:
6969 case X86::VRCPSSr:
6970 case X86::VRCPSSr_Int:
6971 case X86::VRCPSSm:
6972 case X86::VRCPSSm_Int:
6973 case X86::VROUNDSDri:
6974 case X86::VROUNDSDmi:
6975 case X86::VROUNDSDri_Int:
6976 case X86::VROUNDSDmi_Int:
6977 case X86::VROUNDSSri:
6978 case X86::VROUNDSSmi:
6979 case X86::VROUNDSSri_Int:
6980 case X86::VROUNDSSmi_Int:
6981 case X86::VRSQRTSSr:
6982 case X86::VRSQRTSSr_Int:
6983 case X86::VRSQRTSSm:
6984 case X86::VRSQRTSSm_Int:
6985 case X86::VSQRTSSr:
6986 case X86::VSQRTSSr_Int:
6987 case X86::VSQRTSSm:
6988 case X86::VSQRTSSm_Int:
6989 case X86::VSQRTSDr:
6990 case X86::VSQRTSDr_Int:
6991 case X86::VSQRTSDm:
6992 case X86::VSQRTSDm_Int:
6993 // AVX-512
6994 case X86::VCVTSD2SSZrr:
6995 case X86::VCVTSD2SSZrr_Int:
6996 case X86::VCVTSD2SSZrrb_Int:
6997 case X86::VCVTSD2SSZrm:
6998 case X86::VCVTSD2SSZrm_Int:
6999 case X86::VCVTSS2SDZrr:
7000 case X86::VCVTSS2SDZrr_Int:
7001 case X86::VCVTSS2SDZrrb_Int:
7002 case X86::VCVTSS2SDZrm:
7003 case X86::VCVTSS2SDZrm_Int:
7004 case X86::VGETEXPSDZr:
7005 case X86::VGETEXPSDZrb:
7006 case X86::VGETEXPSDZm:
7007 case X86::VGETEXPSSZr:
7008 case X86::VGETEXPSSZrb:
7009 case X86::VGETEXPSSZm:
7010 case X86::VGETMANTSDZrri:
7011 case X86::VGETMANTSDZrrib:
7012 case X86::VGETMANTSDZrmi:
7013 case X86::VGETMANTSSZrri:
7014 case X86::VGETMANTSSZrrib:
7015 case X86::VGETMANTSSZrmi:
7016 case X86::VRNDSCALESDZrri:
7017 case X86::VRNDSCALESDZrri_Int:
7018 case X86::VRNDSCALESDZrrib_Int:
7019 case X86::VRNDSCALESDZrmi:
7020 case X86::VRNDSCALESDZrmi_Int:
7021 case X86::VRNDSCALESSZrri:
7022 case X86::VRNDSCALESSZrri_Int:
7023 case X86::VRNDSCALESSZrrib_Int:
7024 case X86::VRNDSCALESSZrmi:
7025 case X86::VRNDSCALESSZrmi_Int:
7026 case X86::VRCP14SDZrr:
7027 case X86::VRCP14SDZrm:
7028 case X86::VRCP14SSZrr:
7029 case X86::VRCP14SSZrm:
7030 case X86::VRCPSHZrr:
7031 case X86::VRCPSHZrm:
7032 case X86::VRSQRTSHZrr:
7033 case X86::VRSQRTSHZrm:
7034 case X86::VREDUCESHZrmi:
7035 case X86::VREDUCESHZrri:
7036 case X86::VREDUCESHZrrib:
7037 case X86::VGETEXPSHZr:
7038 case X86::VGETEXPSHZrb:
7039 case X86::VGETEXPSHZm:
7040 case X86::VGETMANTSHZrri:
7041 case X86::VGETMANTSHZrrib:
7042 case X86::VGETMANTSHZrmi:
7043 case X86::VRNDSCALESHZrri:
7044 case X86::VRNDSCALESHZrri_Int:
7045 case X86::VRNDSCALESHZrrib_Int:
7046 case X86::VRNDSCALESHZrmi:
7047 case X86::VRNDSCALESHZrmi_Int:
7048 case X86::VSQRTSHZr:
7049 case X86::VSQRTSHZr_Int:
7050 case X86::VSQRTSHZrb_Int:
7051 case X86::VSQRTSHZm:
7052 case X86::VSQRTSHZm_Int:
7053 case X86::VRCP28SDZr:
7054 case X86::VRCP28SDZrb:
7055 case X86::VRCP28SDZm:
7056 case X86::VRCP28SSZr:
7057 case X86::VRCP28SSZrb:
7058 case X86::VRCP28SSZm:
7059 case X86::VREDUCESSZrmi:
7060 case X86::VREDUCESSZrri:
7061 case X86::VREDUCESSZrrib:
7062 case X86::VRSQRT14SDZrr:
7063 case X86::VRSQRT14SDZrm:
7064 case X86::VRSQRT14SSZrr:
7065 case X86::VRSQRT14SSZrm:
7066 case X86::VRSQRT28SDZr:
7067 case X86::VRSQRT28SDZrb:
7068 case X86::VRSQRT28SDZm:
7069 case X86::VRSQRT28SSZr:
7070 case X86::VRSQRT28SSZrb:
7071 case X86::VRSQRT28SSZm:
7072 case X86::VSQRTSSZr:
7073 case X86::VSQRTSSZr_Int:
7074 case X86::VSQRTSSZrb_Int:
7075 case X86::VSQRTSSZm:
7076 case X86::VSQRTSSZm_Int:
7077 case X86::VSQRTSDZr:
7078 case X86::VSQRTSDZr_Int:
7079 case X86::VSQRTSDZrb_Int:
7080 case X86::VSQRTSDZm:
7081 case X86::VSQRTSDZm_Int:
7082 case X86::VCVTSD2SHZrr:
7083 case X86::VCVTSD2SHZrr_Int:
7084 case X86::VCVTSD2SHZrrb_Int:
7085 case X86::VCVTSD2SHZrm:
7086 case X86::VCVTSD2SHZrm_Int:
7087 case X86::VCVTSS2SHZrr:
7088 case X86::VCVTSS2SHZrr_Int:
7089 case X86::VCVTSS2SHZrrb_Int:
7090 case X86::VCVTSS2SHZrm:
7091 case X86::VCVTSS2SHZrm_Int:
7092 case X86::VCVTSH2SDZrr:
7093 case X86::VCVTSH2SDZrr_Int:
7094 case X86::VCVTSH2SDZrrb_Int:
7095 case X86::VCVTSH2SDZrm:
7096 case X86::VCVTSH2SDZrm_Int:
7097 case X86::VCVTSH2SSZrr:
7098 case X86::VCVTSH2SSZrr_Int:
7099 case X86::VCVTSH2SSZrrb_Int:
7100 case X86::VCVTSH2SSZrm:
7101 case X86::VCVTSH2SSZrm_Int:
// Unlike the other groups, this one does not consult ForLoadFold.
7102 return OpNum == 1;
// Masked scalar moves: operand 3 for the rrk forms, operand 2 for the rrkz
// forms.
7103 case X86::VMOVSSZrrk:
7104 case X86::VMOVSDZrrk:
7105 return OpNum == 3 && !ForLoadFold;
7106 case X86::VMOVSSZrrkz:
7107 case X86::VMOVSDZrrkz:
7108 return OpNum == 2 && !ForLoadFold;
7109 }
7110
7111 return false;
7112}
7113
7114/// Inform the BreakFalseDeps pass how many idle instructions we would like
7115/// before certain undef register reads.
7116///
7117/// This catches the VCVTSI2SD family of instructions:
7118///
7119/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
7120///
7121/// We should be careful *not* to catch VXOR idioms which are presumably
7122/// handled specially in the pipeline:
7123///
7124/// vxorps undef %xmm1, undef %xmm1, %xmm1
7125///
7126/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
7127/// high bits that are passed-through are not live.
///
/// \returns UndefRegClearance when operand \p OpNum of \p MI holds a physical
/// register and hasUndefRegUpdate() accepts the opcode/operand pair;
/// 0 otherwise. \p TRI is not consulted by the visible body.
7128unsigned
7130 const TargetRegisterInfo *TRI) const {
7131 const MachineOperand &MO = MI.getOperand(OpNum);
// Both conditions must hold; a non-physical register always yields 0.
7132 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
7133 return UndefRegClearance;
7134
7135 return 0;
7136}
7137
7139 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
// Emits, at MI's position in its parent block, an xor of the register in
// operand OpNum with itself (both sources flagged undef) and then records the
// register as killed by MI. The xor opcode and the register actually written
// depend on which register class contains the register.
// NOTE(review): several listing lines are absent from this body (embedded
// numbers 7160, 7182, 7191 and 7208 are skipped), so some builder chains and
// the last `if` appear unterminated here.
7140 Register Reg = MI.getOperand(OpNum).getReg();
7141 // If MI kills this register, the false dependence is already broken.
7142 if (MI.killsRegister(Reg, TRI))
7143 return;
7144
7145 if (X86::VR128RegClass.contains(Reg)) {
7146 // These instructions are all floating point domain, so xorps is the best
7147 // choice.
7148 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7149 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7150 .addReg(Reg, RegState::Undef)
7151 .addReg(Reg, RegState::Undef);
7152 MI.addRegisterKilled(Reg, TRI, true);
7153 } else if (X86::VR256RegClass.contains(Reg)) {
7154 // Use vxorps to clear the full ymm register.
7155 // It wants to read and write the xmm sub-register.
7156 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7157 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7158 .addReg(XReg, RegState::Undef)
7159 .addReg(XReg, RegState::Undef)
7161 MI.addRegisterKilled(Reg, TRI, true);
7162 } else if (X86::VR128XRegClass.contains(Reg)) {
7163 // Only handle VLX targets.
7164 if (!Subtarget.hasVLX())
7165 return;
7166 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7167 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7168 .addReg(Reg, RegState::Undef)
7169 .addReg(Reg, RegState::Undef);
7170 MI.addRegisterKilled(Reg, TRI, true);
7171 } else if (X86::VR256XRegClass.contains(Reg) ||
7172 X86::VR512RegClass.contains(Reg)) {
7173 // Only handle VLX targets.
7174 if (!Subtarget.hasVLX())
7175 return;
7176 // Use vpxord to clear the full ymm/zmm register.
7177 // It wants to read and write the xmm sub-register.
7178 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7179 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7180 .addReg(XReg, RegState::Undef)
7181 .addReg(XReg, RegState::Undef)
7183 MI.addRegisterKilled(Reg, TRI, true);
7184 } else if (X86::GR64RegClass.contains(Reg)) {
7185 // Using XOR32rr because it has shorter encoding and zeros up the upper bits
7186 // as well.
7187 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7188 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7189 .addReg(XReg, RegState::Undef)
7190 .addReg(XReg, RegState::Undef)
7192 MI.addRegisterKilled(Reg, TRI, true);
7193 } else if (X86::GR32RegClass.contains(Reg)) {
7194 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7195 .addReg(Reg, RegState::Undef)
7196 .addReg(Reg, RegState::Undef);
7197 MI.addRegisterKilled(Reg, TRI, true);
7198 } else if ((X86::GR16RegClass.contains(Reg) ||
7199 X86::GR8RegClass.contains(Reg)) &&
7200 X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7201 // This case is only expected for NDD ops which appear to be partial
7202 // writes, but are not due to the zeroing of the upper part. Here
7203 // we add an implicit def of the super-register, which prevents
7204 // CompressEVEX from converting this to a legacy form.
7205 Register SuperReg = getX86SubSuperRegister(Reg, 64);
// Note: this local builder is named BuildMI; it wraps the existing MI rather
// than creating a new instruction.
7206 MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7207 if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7209 }
7210}
7211
7213 int PtrOffset = 0) {
// Appends the address operands in MOs to MIB, applying PtrOffset as an extra
// displacement. Lists with fewer than four entries get a trailing offset via
// addOffset(); otherwise the list must have exactly five entries and the
// offset is merged into entry 3.
7214 unsigned NumAddrOps = MOs.size();
7215
7216 if (NumAddrOps < 4) {
7217 // FrameIndex only - add an immediate offset (whether it's zero or not).
7218 for (unsigned i = 0; i != NumAddrOps; ++i)
7219 MIB.add(MOs[i]);
7220 addOffset(MIB, PtrOffset);
7221 } else {
7222 // General Memory Addressing - we need to add any offset to an existing
7223 // offset.
7224 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7225 for (unsigned i = 0; i != NumAddrOps; ++i) {
7226 const MachineOperand &MO = MOs[i];
// Entry 3 is the one passed to addDisp(); it is copied unchanged when
// PtrOffset is zero.
7227 if (i == 3 && PtrOffset != 0) {
7228 MIB.addDisp(MO, PtrOffset);
7229 } else {
7230 MIB.add(MO);
7231 }
7232 }
7233 }
7234}
7235
7237 MachineInstr &NewMI,
7238 const TargetInstrInfo &TII) {
// Walks every operand of NewMI and, for each virtual-register operand,
// constrains that register to the class TII reports for the operand slot.
// A failed constraint is only reported through LLVM_DEBUG; it is not treated
// as an error here.
// NOTE(review): the declaration of MRI is not visible in this listing
// (embedded line 7239 is skipped).
7240 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
7241
7242 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7243 MachineOperand &MO = NewMI.getOperand(Idx);
7244 // We only need to update constraints on virtual register operands.
7245 if (!MO.isReg())
7246 continue;
7247 Register Reg = MO.getReg();
7248 if (!Reg.isVirtual())
7249 continue;
7250
7251 auto *NewRC = MRI.constrainRegClass(
7252 Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
// A null result means the register could not be constrained.
7253 if (!NewRC) {
7254 LLVM_DEBUG(
7255 dbgs() << "WARNING: Unable to update register constraint for operand "
7256 << Idx << " of instruction:\n";
7257 NewMI.dump(); dbgs() << "\n");
7258 }
7259 }
7260}
7261
/// Build the memory form of \p MI when the fold replaces its first two
/// operands: the new instruction (opcode \p Opcode) starts with the address
/// operands in MOs, followed by MI's operands from index 2 onwards. The result
/// has its virtual-register classes re-constrained and is inserted before
/// InsertPt.
/// NOTE(review): part of the parameter list is not visible in this listing
/// (embedded lines 7263-7265 are skipped).
7262static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7266 const TargetInstrInfo &TII) {
7267 // Create the base instruction with the memory operand as the first part.
7268 // Omit the implicit operands, something BuildMI can't do.
7269 MachineInstr *NewMI =
7270 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7271 MachineInstrBuilder MIB(MF, NewMI);
7272 addOperands(MIB, MOs);
7273
7274 // Loop over the rest of the ri operands, converting them over.
7275 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7276 for (unsigned i = 0; i != NumOps; ++i) {
7277 MachineOperand &MO = MI.getOperand(i + 2);
7278 MIB.add(MO);
7279 }
// Copy whatever operands MI carries beyond the count in its descriptor.
7280 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7281 MIB.add(MO);
7282
7283 updateOperandRegConstraints(MF, *NewMI, TII);
7284
7285 MachineBasicBlock *MBB = InsertPt->getParent();
7286 MBB->insert(InsertPt, NewMI);
7287
7288 return MIB;
7289}
7290
/// Build a copy of \p MI with opcode \p Opcode in which operand \p OpNo is
/// replaced by the address operands in MOs (displaced by \p PtrOffset); every
/// other operand is copied as-is. The result has its virtual-register classes
/// re-constrained and is inserted before InsertPt.
/// NOTE(review): part of the parameter list and the statement that follows
/// the "NoFPExcept" comment are not visible in this listing (embedded lines
/// 7293, 7294, 7314 and 7315 are skipped).
7291static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
7292 unsigned OpNo, ArrayRef<MachineOperand> MOs,
7295 int PtrOffset = 0) {
7296 // Omit the implicit operands, something BuildMI can't do.
7297 MachineInstr *NewMI =
7298 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7299 MachineInstrBuilder MIB(MF, NewMI);
7300
7301 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7302 MachineOperand &MO = MI.getOperand(i);
7303 if (i == OpNo) {
// The folded operand must be a register; it is replaced by the address.
7304 assert(MO.isReg() && "Expected to fold into reg operand!");
7305 addOperands(MIB, MOs, PtrOffset);
7306 } else {
7307 MIB.add(MO);
7308 }
7309 }
7310
7311 updateOperandRegConstraints(MF, *NewMI, TII);
7312
7313 // Copy the NoFPExcept flag from the instruction we're fusing.
7316
7317 MachineBasicBlock *MBB = InsertPt->getParent();
7318 MBB->insert(InsertPt, NewMI);
7319
7320 return MIB;
7321}
7322
/// Build, at InsertPt, an instruction with opcode \p Opcode whose operands are
/// the address operands in MOs followed by an immediate 0. \p MI supplies only
/// the debug location.
/// NOTE(review): part of the parameter list is not visible in this listing
/// (embedded lines 7324 and 7325 are skipped).
7323static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
7326 MachineInstr &MI) {
7327 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7328 MI.getDebugLoc(), TII.get(Opcode));
7329 addOperands(MIB, MOs);
7330 return MIB.addImm(0);
7331}
7332
/// Handle the folds that need opcode-specific treatment. Dispatches on MI's
/// opcode and returns the newly built memory-form instruction, or nullptr
/// when no special case applies or its preconditions (operand index, object
/// size, register-class size, alignment) are not met.
/// NOTE(review): one parameter line and the TRI declarations used below are
/// not visible in this listing (embedded lines 7335, 7349, 7374 and 7393 are
/// skipped).
7333MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7334 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7336 unsigned Size, Align Alignment) const {
7337 switch (MI.getOpcode()) {
7338 case X86::INSERTPSrri:
7339 case X86::VINSERTPSrri:
7340 case X86::VINSERTPSZrri:
7341 // Attempt to convert the load of inserted vector into a fold load
7342 // of a single float.
7343 if (OpNum == 2) {
// The immediate is the last operand: bits 0-3 are taken as ZMask, bits 4-5 as
// DstIdx and bits 6-7 as SrcIdx.
7344 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7345 unsigned ZMask = Imm & 15;
7346 unsigned DstIdx = (Imm >> 4) & 3;
7347 unsigned SrcIdx = (Imm >> 6) & 3;
7348
7350 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7351 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7352 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7353 (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
// The source element is selected by offsetting the address (4 bytes per
// element), so the rewritten immediate keeps only DstIdx and ZMask.
7354 int PtrOffset = SrcIdx * 4;
7355 unsigned NewImm = (DstIdx << 4) | ZMask;
7356 unsigned NewOpCode =
7357 (MI.getOpcode() == X86::VINSERTPSZrri) ? X86::VINSERTPSZrmi
7358 : (MI.getOpcode() == X86::VINSERTPSrri) ? X86::VINSERTPSrmi
7359 : X86::INSERTPSrmi;
7360 MachineInstr *NewMI =
7361 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7362 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7363 return NewMI;
7364 }
7365 }
7366 break;
7367 case X86::MOVHLPSrr:
7368 case X86::VMOVHLPSrr:
7369 case X86::VMOVHLPSZrr:
7370 // Move the upper 64-bits of the second operand to the lower 64-bits.
7371 // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
7372 // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
7373 if (OpNum == 2) {
7375 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7376 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7377 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7378 unsigned NewOpCode =
7379 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7380 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7381 : X86::MOVLPSrm;
// The trailing 8 is the pointer offset to the upper 64 bits.
7382 MachineInstr *NewMI =
7383 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7384 return NewMI;
7385 }
7386 }
7387 break;
7388 case X86::UNPCKLPDrr:
7389 // If we won't be able to fold this to the memory form of UNPCKL, use
7390 // MOVHPD instead. Done as custom because we can't have this in the load
7391 // table twice.
7392 if (OpNum == 2) {
7394 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7395 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
// Only taken when the slot is aligned to less than 16 bytes.
7396 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7397 MachineInstr *NewMI =
7398 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7399 return NewMI;
7400 }
7401 }
7402 break;
7403 case X86::MOV32r0:
// Becomes a store of immediate 0; a Size of 4 selects MOV32mi, anything else
// MOV64mi32.
7404 if (auto *NewMI =
7405 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7406 InsertPt, MI))
7407 return NewMI;
7408 break;
7409 }
7410
7411 return nullptr;
7412}
7413
7415 MachineInstr &MI) {
// Returns true when operand 1 of MI is a register for which
// hasUndefRegUpdate() reports an undef-register update (load-fold mode) and
// that register is undefined: either flagged undef, or produced by a unique
// implicit-def instruction.
// NOTE(review): the declaration of RegInfo is not visible in this listing
// (embedded line 7428 is skipped).
7416 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7417 !MI.getOperand(1).isReg())
7418 return false;
7419
7420 // There are two cases we need to handle depending on where in the pipeline
7421 // the folding attempt is being made.
7422 // -Register has the undef flag set.
7423 // -Register is produced by the IMPLICIT_DEF instruction.
7424
7425 if (MI.getOperand(1).isUndef())
7426 return true;
7427
7429 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7430 return VRegDef && VRegDef->isImplicitDef();
7431}
7432
7433unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7434 unsigned Idx1) const {
7435 unsigned Idx2 = CommuteAnyOperandIndex;
7436 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7437 return Idx1;
7438
7439 bool HasDef = MI.getDesc().getNumDefs();
7440 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7441 Register Reg1 = MI.getOperand(Idx1).getReg();
7442 Register Reg2 = MI.getOperand(Idx2).getReg();
7443 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7444 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7445
7446 // If either of the commutable operands are tied to the destination
7447 // then we can not commute + fold.
7448 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7449 return Idx1;
7450
7451 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7452}
7453
7454static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7455 if (PrintFailedFusing && !MI.isCopy())
7456 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7457}
7458
7460 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7462 unsigned Size, Align Alignment, bool AllowCommute) const {
// Tries to fold the memory reference described by MOs into operand OpNum of
// MI. Order of attempts: early rejections, opcode-specific custom folds, a
// fold-table lookup, and finally (when AllowCommute is set) one retry on the
// commuted instruction. Returns the new instruction, or nullptr on failure.
// NOTE(review): one parameter line, the tail of the partial/undef-update
// condition, and the TRI declaration used below are not visible in this
// listing (embedded lines 7459, 7461, 7478 and 7528 are skipped).
7463 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7464 unsigned Opc = MI.getOpcode();
7465
7466 // For CPUs that favor the register form of a call or push,
7467 // do not fold loads into calls or pushes, unless optimizing for size
7468 // aggressively.
7469 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7470 (Opc == X86::CALL32r || Opc == X86::CALL64r ||
7471 Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
7472 Opc == X86::PUSH32r || Opc == X86::PUSH64r))
7473 return nullptr;
7474
7475 // Avoid partial and undef register update stalls unless optimizing for size.
7476 if (!MF.getFunction().hasOptSize() &&
7477 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) ||
7479 return nullptr;
7480
// Treated as a two-address fold when operands 0 and 1 are the same register
// and the fold targets one of them.
7481 unsigned NumOps = MI.getDesc().getNumOperands();
7482 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() &&
7483 MI.getOperand(1).isReg() &&
7484 MI.getOperand(0).getReg() == MI.getOperand(1).getReg();
7485
7486 // FIXME: AsmPrinter doesn't know how to handle
7487 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7488 if (Opc == X86::ADD32ri &&
7489 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7490 return nullptr;
7491
7492 // GOTTPOFF relocation loads can only be folded into add instructions.
7493 // FIXME: Need to exclude other relocations that only support specific
7494 // instructions.
7495 if (MOs.size() == X86::AddrNumOperands &&
7496 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7497 Opc != X86::ADD64rr)
7498 return nullptr;
7499
7500 // Don't fold loads into indirect calls that need a KCFI check as we'll
7501 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7502 if (MI.isCall() && MI.getCFIType())
7503 return nullptr;
7504
7505 // Attempt to fold any custom cases we have.
7506 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt,
7507 Size, Alignment))
7508 return CustomMI;
7509
7510 // Folding a memory location into the two-address part of a two-address
7511 // instruction is different than folding it other places. It requires
7512 // replacing the *two* registers with the memory location.
7513 //
7514 // Utilize the mapping NonNDD -> RMW for the NDD variant.
7515 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
7516 const X86FoldTableEntry *I =
7517 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
7518 : lookupFoldTable(Opc, OpNum);
7519
7520 MachineInstr *NewMI = nullptr;
7521 if (I) {
7522 unsigned Opcode = I->DstOp;
// Reject the fold when the slot is less aligned than the power-of-two
// alignment encoded in the table entry's flags.
7523 if (Alignment <
7524 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7525 return nullptr;
7526 bool NarrowToMOV32rm = false;
// The size checks below are skipped entirely when Size is 0.
7527 if (Size) {
7529 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7530 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7531 // Check if it's safe to fold the load. If the size of the object is
7532 // narrower than the load width, then it's not.
7533 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7534 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7535 // If this is a 64-bit load, but the spill slot is 32, then we can do
7536 // a 32-bit load which is implicitly zero-extended. This likely is
7537 // due to live interval analysis remat'ing a load from stack slot.
7538 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7539 return nullptr;
7540 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7541 return nullptr;
7542 Opcode = X86::MOV32rm;
7543 NarrowToMOV32rm = true;
7544 }
7545 // For stores, make sure the size of the object is equal to the size of
7546 // the store. If the object is larger, the extra bits would be garbage. If
7547 // the object is smaller we might overwrite another object or fault.
7548 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7549 return nullptr;
7550 }
7551
7552 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7553 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7554
7555 if (NarrowToMOV32rm) {
7556 // If this is the special case where we use a MOV32rm to load a 32-bit
7557 // value and zero-extend the top bits. Change the destination register
7558 // to a 32-bit one.
7559 Register DstReg = NewMI->getOperand(0).getReg();
7560 if (DstReg.isPhysical())
7561 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7562 else
7563 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7564 }
7565 return NewMI;
7566 }
7567
7568 if (AllowCommute) {
7569 // If the instruction and target operand are commutable, commute the
7570 // instruction and try again.
7571 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
// Getting OpNum back means no commute took place.
7572 if (CommuteOpIdx2 == OpNum) {
7573 printFailMsgforFold(MI, OpNum);
7574 return nullptr;
7575 }
7576 // Attempt to fold with the commuted version of the instruction.
7577 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7578 Alignment, /*AllowCommute=*/false);
7579 if (NewMI)
7580 return NewMI;
7581 // Folding failed again - undo the commute before returning.
7582 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7583 }
7584
7585 printFailMsgforFold(MI, OpNum);
7586 return nullptr;
7587}
7588
7591 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7592 VirtRegMap *VRM) const {
// Folds the stack slot FrameIndex into MI at the operand indices in Ops.
// Handles one operand in general; for the pair {0, 1} it handles NDD opcodes
// with a non-NDD variant and TESTrr, which is first rewritten to CMPri 0.
// Returns nullptr when fusing is disabled or the fold is rejected.
// LIS and VRM are not used by the visible body.
// NOTE(review): the start of the signature and the tail of the
// partial/undef-update condition are not visible in this listing (embedded
// lines 7589, 7590 and 7600 are skipped).
7593 // Check switch flag
7594 if (NoFusing)
7595 return nullptr;
7596
7597 // Avoid partial and undef register update stalls unless optimizing for size.
7598 if (!MF.getFunction().hasOptSize() &&
7599 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7601 return nullptr;
7602
7603 // Don't fold subreg spills, or reloads that use a high subreg.
7604 for (auto Op : Ops) {
7605 MachineOperand &MO = MI.getOperand(Op);
7606 auto SubReg = MO.getSubReg();
7607 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7608 // (See patterns for MOV32r0 in TD files).
7609 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
7610 continue;
7611 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7612 return nullptr;
7613 }
7614
7615 const MachineFrameInfo &MFI = MF.getFrameInfo();
7616 unsigned Size = MFI.getObjectSize(FrameIndex);
7617 Align Alignment = MFI.getObjectAlign(FrameIndex);
7618 // If the function stack isn't realigned we don't want to fold instructions
7619 // that need increased alignment.
7620 if (!RI.hasStackRealignment(MF))
7621 Alignment =
7622 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7623
// Shared tail: fold the frame index into operand Ops[0], allowing one commute.
7624 auto Impl = [&]() {
7625 return foldMemoryOperandImpl(MF, MI, Ops[0],
7626 MachineOperand::CreateFI(FrameIndex), InsertPt,
7627 Size, Alignment, /*AllowCommute=*/true);
7628 };
7629 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7630 unsigned NewOpc = 0;
7631 unsigned RCSize = 0;
7632 unsigned Opc = MI.getOpcode();
7633 switch (Opc) {
7634 default:
7635 // NDD can be folded into RMW though its Op0 and Op1 are not tied.
7636 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
7637 : nullptr;
// Each TESTrr maps to the CMPri of the same width; RCSize is that width in
// bytes.
7638 case X86::TEST8rr:
7639 NewOpc = X86::CMP8ri;
7640 RCSize = 1;
7641 break;
7642 case X86::TEST16rr:
7643 NewOpc = X86::CMP16ri;
7644 RCSize = 2;
7645 break;
7646 case X86::TEST32rr:
7647 NewOpc = X86::CMP32ri;
7648 RCSize = 4;
7649 break;
7650 case X86::TEST64rr:
7651 NewOpc = X86::CMP64ri32;
7652 RCSize = 8;
7653 break;
7654 }
7655 // Check if it's safe to fold the load. If the size of the object is
7656 // narrower than the load width, then it's not.
7657 if (Size < RCSize)
7658 return nullptr;
7659 // Change to CMPXXri r, 0 first.
// Note: MI is modified in place here, before the fold is attempted.
7660 MI.setDesc(get(NewOpc));
7661 MI.getOperand(1).ChangeToImmediate(0);
7662 } else if (Ops.size() != 1)
7663 return nullptr;
7664
7665 return Impl();
7666}
7667
7668/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7669/// because the latter uses contents that wouldn't be defined in the folded
7670/// version. For instance, this transformation isn't legal:
7671/// movss (%rdi), %xmm0
7672/// addps %xmm0, %xmm0
7673/// ->
7674/// addps (%rdi), %xmm0
7675///
7676/// But this one is:
7677/// movss (%rdi), %xmm0
7678/// addss %xmm0, %xmm0
7679/// ->
7680/// addss (%rdi), %xmm0
7681///
7683 const MachineInstr &UserMI,
7684 const MachineFunction &MF) {
7685 unsigned Opc = LoadMI.getOpcode();
7686 unsigned UserOpc = UserMI.getOpcode();
7688 const TargetRegisterClass *RC =
7689 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7690 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7691
7692 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7693 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7694 Opc == X86::VMOVSSZrm_alt) &&
7695 RegSize > 32) {
7696 // These instructions only load 32 bits, we can't fold them if the
7697 // destination register is wider than 32 bits (4 bytes), and its user
7698 // instruction isn't scalar (SS).
7699 switch (UserOpc) {
7700 case X86::CVTSS2SDrr_Int:
7701 case X86::VCVTSS2SDrr_Int:
7702 case X86::VCVTSS2SDZrr_Int:
7703 case X86::VCVTSS2SDZrrk_Int:
7704 case X86::VCVTSS2SDZrrkz_Int:
7705 case X86::CVTSS2SIrr_Int:
7706 case X86::CVTSS2SI64rr_Int:
7707 case X86::VCVTSS2SIrr_Int:
7708 case X86::VCVTSS2SI64rr_Int:
7709 case X86::VCVTSS2SIZrr_Int:
7710 case X86::VCVTSS2SI64Zrr_Int:
7711 case X86::CVTTSS2SIrr_Int:
7712 case X86::CVTTSS2SI64rr_Int:
7713 case X86::VCVTTSS2SIrr_Int:
7714 case X86::VCVTTSS2SI64rr_Int:
7715 case X86::VCVTTSS2SIZrr_Int:
7716 case X86::VCVTTSS2SI64Zrr_Int:
7717 case X86::VCVTSS2USIZrr_Int:
7718 case X86::VCVTSS2USI64Zrr_Int:
7719 case X86::VCVTTSS2USIZrr_Int:
7720 case X86::VCVTTSS2USI64Zrr_Int:
7721 case X86::RCPSSr_Int:
7722 case X86::VRCPSSr_Int:
7723 case X86::RSQRTSSr_Int:
7724 case X86::VRSQRTSSr_Int:
7725 case X86::ROUNDSSri_Int:
7726 case X86::VROUNDSSri_Int:
7727 case X86::COMISSrr_Int:
7728 case X86::VCOMISSrr_Int:
7729 case X86::VCOMISSZrr_Int:
7730 case X86::UCOMISSrr_Int:
7731 case X86::VUCOMISSrr_Int:
7732 case X86::VUCOMISSZrr_Int:
7733 case X86::ADDSSrr_Int:
7734 case X86::VADDSSrr_Int:
7735 case X86::VADDSSZrr_Int:
7736 case X86::CMPSSrri_Int:
7737 case X86::VCMPSSrri_Int:
7738 case X86::VCMPSSZrri_Int:
7739 case X86::DIVSSrr_Int:
7740 case X86::VDIVSSrr_Int:
7741 case X86::VDIVSSZrr_Int:
7742 case X86::MAXSSrr_Int:
7743 case X86::VMAXSSrr_Int:
7744 case X86::VMAXSSZrr_Int:
7745 case X86::MINSSrr_Int:
7746 case X86::VMINSSrr_Int:
7747 case X86::VMINSSZrr_Int:
7748 case X86::MULSSrr_Int:
7749 case X86::VMULSSrr_Int:
7750 case X86::VMULSSZrr_Int:
7751 case X86::SQRTSSr_Int:
7752 case X86::VSQRTSSr_Int:
7753 case X86::VSQRTSSZr_Int:
7754 case X86::SUBSSrr_Int:
7755 case X86::VSUBSSrr_Int:
7756 case X86::VSUBSSZrr_Int:
7757 case X86::VADDSSZrrk_Int:
7758 case X86::VADDSSZrrkz_Int:
7759 case X86::VCMPSSZrrik_Int:
7760 case X86::VDIVSSZrrk_Int:
7761 case X86::VDIVSSZrrkz_Int:
7762 case X86::VMAXSSZrrk_Int:
7763 case X86::VMAXSSZrrkz_Int:
7764 case X86::VMINSSZrrk_Int:
7765 case X86::VMINSSZrrkz_Int:
7766 case X86::VMULSSZrrk_Int:
7767 case X86::VMULSSZrrkz_Int:
7768 case X86::VSQRTSSZrk_Int:
7769 case X86::VSQRTSSZrkz_Int:
7770 case X86::VSUBSSZrrk_Int:
7771 case X86::VSUBSSZrrkz_Int:
7772 case X86::VFMADDSS4rr_Int:
7773 case X86::VFNMADDSS4rr_Int:
7774 case X86::VFMSUBSS4rr_Int:
7775 case X86::VFNMSUBSS4rr_Int:
7776 case X86::VFMADD132SSr_Int:
7777 case X86::VFNMADD132SSr_Int:
7778 case X86::VFMADD213SSr_Int:
7779 case X86::VFNMADD213SSr_Int:
7780 case X86::VFMADD231SSr_Int:
7781 case X86::VFNMADD231SSr_Int:
7782 case X86::VFMSUB132SSr_Int:
7783 case X86::VFNMSUB132SSr_Int:
7784 case X86::VFMSUB213SSr_Int:
7785 case X86::VFNMSUB213SSr_Int:
7786 case X86::VFMSUB231SSr_Int:
7787 case X86::VFNMSUB231SSr_Int:
7788 case X86::VFMADD132SSZr_Int:
7789 case X86::VFNMADD132SSZr_Int:
7790 case X86::VFMADD213SSZr_Int:
7791 case X86::VFNMADD213SSZr_Int:
7792 case X86::VFMADD231SSZr_Int:
7793 case X86::VFNMADD231SSZr_Int:
7794 case X86::VFMSUB132SSZr_Int:
7795 case X86::VFNMSUB132SSZr_Int:
7796 case X86::VFMSUB213SSZr_Int:
7797 case X86::VFNMSUB213SSZr_Int:
7798 case X86::VFMSUB231SSZr_Int:
7799 case X86::VFNMSUB231SSZr_Int:
7800 case X86::VFMADD132SSZrk_Int:
7801 case X86::VFNMADD132SSZrk_Int:
7802 case X86::VFMADD213SSZrk_Int:
7803 case X86::VFNMADD213SSZrk_Int:
7804 case X86::VFMADD231SSZrk_Int:
7805 case X86::VFNMADD231SSZrk_Int:
7806 case X86::VFMSUB132SSZrk_Int:
7807 case X86::VFNMSUB132SSZrk_Int:
7808 case X86::VFMSUB213SSZrk_Int:
7809 case X86::VFNMSUB213SSZrk_Int:
7810 case X86::VFMSUB231SSZrk_Int:
7811 case X86::VFNMSUB231SSZrk_Int:
7812 case X86::VFMADD132SSZrkz_Int:
7813 case X86::VFNMADD132SSZrkz_Int:
7814 case X86::VFMADD213SSZrkz_Int:
7815 case X86::VFNMADD213SSZrkz_Int:
7816 case X86::VFMADD231SSZrkz_Int:
7817 case X86::VFNMADD231SSZrkz_Int:
7818 case X86::VFMSUB132SSZrkz_Int:
7819 case X86::VFNMSUB132SSZrkz_Int:
7820 case X86::VFMSUB213SSZrkz_Int:
7821 case X86::VFNMSUB213SSZrkz_Int:
7822 case X86::VFMSUB231SSZrkz_Int:
7823 case X86::VFNMSUB231SSZrkz_Int:
7824 case X86::VFIXUPIMMSSZrri:
7825 case X86::VFIXUPIMMSSZrrik:
7826 case X86::VFIXUPIMMSSZrrikz:
7827 case X86::VFPCLASSSSZri:
7828 case X86::VFPCLASSSSZrik:
7829 case X86::VGETEXPSSZr:
7830 case X86::VGETEXPSSZrk:
7831 case X86::VGETEXPSSZrkz:
7832 case X86::VGETMANTSSZrri:
7833 case X86::VGETMANTSSZrrik:
7834 case X86::VGETMANTSSZrrikz:
7835 case X86::VRANGESSZrri:
7836 case X86::VRANGESSZrrik:
7837 case X86::VRANGESSZrrikz:
7838 case X86::VRCP14SSZrr:
7839 case X86::VRCP14SSZrrk:
7840 case X86::VRCP14SSZrrkz:
7841 case X86::VRCP28SSZr:
7842 case X86::VRCP28SSZrk:
7843 case X86::VRCP28SSZrkz:
7844 case X86::VREDUCESSZrri:
7845 case X86::VREDUCESSZrrik:
7846 case X86::VREDUCESSZrrikz:
7847 case X86::VRNDSCALESSZrri_Int:
7848 case X86::VRNDSCALESSZrrik_Int:
7849 case X86::VRNDSCALESSZrrikz_Int:
7850 case X86::VRSQRT14SSZrr:
7851 case X86::VRSQRT14SSZrrk:
7852 case X86::VRSQRT14SSZrrkz:
7853 case X86::VRSQRT28SSZr:
7854 case X86::VRSQRT28SSZrk:
7855 case X86::VRSQRT28SSZrkz:
7856 case X86::VSCALEFSSZrr:
7857 case X86::VSCALEFSSZrrk:
7858 case X86::VSCALEFSSZrrkz:
7859 return false;
7860 default:
7861 return true;
7862 }
7863 }
7864
7865 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7866 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7867 Opc == X86::VMOVSDZrm_alt) &&
7868 RegSize > 64) {
7869 // These instructions only load 64 bits, we can't fold them if the
7870 // destination register is wider than 64 bits (8 bytes), and its user
7871 // instruction isn't scalar (SD).
7872 switch (UserOpc) {
7873 case X86::CVTSD2SSrr_Int:
7874 case X86::VCVTSD2SSrr_Int:
7875 case X86::VCVTSD2SSZrr_Int:
7876 case X86::VCVTSD2SSZrrk_Int:
7877 case X86::VCVTSD2SSZrrkz_Int:
7878 case X86::CVTSD2SIrr_Int:
7879 case X86::CVTSD2SI64rr_Int:
7880 case X86::VCVTSD2SIrr_Int:
7881 case X86::VCVTSD2SI64rr_Int:
7882 case X86::VCVTSD2SIZrr_Int:
7883 case X86::VCVTSD2SI64Zrr_Int:
7884 case X86::CVTTSD2SIrr_Int:
7885 case X86::CVTTSD2SI64rr_Int:
7886 case X86::VCVTTSD2SIrr_Int:
7887 case X86::VCVTTSD2SI64rr_Int:
7888 case X86::VCVTTSD2SIZrr_Int:
7889 case X86::VCVTTSD2SI64Zrr_Int:
7890 case X86::VCVTSD2USIZrr_Int:
7891 case X86::VCVTSD2USI64Zrr_Int:
7892 case X86::VCVTTSD2USIZrr_Int:
7893 case X86::VCVTTSD2USI64Zrr_Int:
7894 case X86::ROUNDSDri_Int:
7895 case X86::VROUNDSDri_Int:
7896 case X86::COMISDrr_Int:
7897 case X86::VCOMISDrr_Int:
7898 case X86::VCOMISDZrr_Int:
7899 case X86::UCOMISDrr_Int:
7900 case X86::VUCOMISDrr_Int:
7901 case X86::VUCOMISDZrr_Int:
7902 case X86::ADDSDrr_Int:
7903 case X86::VADDSDrr_Int:
7904 case X86::VADDSDZrr_Int:
7905 case X86::CMPSDrri_Int:
7906 case X86::VCMPSDrri_Int:
7907 case X86::VCMPSDZrri_Int:
7908 case X86::DIVSDrr_Int:
7909 case X86::VDIVSDrr_Int:
7910 case X86::VDIVSDZrr_Int:
7911 case X86::MAXSDrr_Int:
7912 case X86::VMAXSDrr_Int:
7913 case X86::VMAXSDZrr_Int:
7914 case X86::MINSDrr_Int:
7915 case X86::VMINSDrr_Int:
7916 case X86::VMINSDZrr_Int:
7917 case X86::MULSDrr_Int:
7918 case X86::VMULSDrr_Int:
7919 case X86::VMULSDZrr_Int:
7920 case X86::SQRTSDr_Int:
7921 case X86::VSQRTSDr_Int:
7922 case X86::VSQRTSDZr_Int:
7923 case X86::SUBSDrr_Int:
7924 case X86::VSUBSDrr_Int:
7925 case X86::VSUBSDZrr_Int:
7926 case X86::VADDSDZrrk_Int:
7927 case X86::VADDSDZrrkz_Int:
7928 case X86::VCMPSDZrrik_Int:
7929 case X86::VDIVSDZrrk_Int:
7930 case X86::VDIVSDZrrkz_Int:
7931 case X86::VMAXSDZrrk_Int:
7932 case X86::VMAXSDZrrkz_Int:
7933 case X86::VMINSDZrrk_Int:
7934 case X86::VMINSDZrrkz_Int:
7935 case X86::VMULSDZrrk_Int:
7936 case X86::VMULSDZrrkz_Int:
7937 case X86::VSQRTSDZrk_Int:
7938 case X86::VSQRTSDZrkz_Int:
7939 case X86::VSUBSDZrrk_Int:
7940 case X86::VSUBSDZrrkz_Int:
7941 case X86::VFMADDSD4rr_Int:
7942 case X86::VFNMADDSD4rr_Int:
7943 case X86::VFMSUBSD4rr_Int:
7944 case X86::VFNMSUBSD4rr_Int:
7945 case X86::VFMADD132SDr_Int:
7946 case X86::VFNMADD132SDr_Int:
7947 case X86::VFMADD213SDr_Int:
7948 case X86::VFNMADD213SDr_Int:
7949 case X86::VFMADD231SDr_Int:
7950 case X86::VFNMADD231SDr_Int:
7951 case X86::VFMSUB132SDr_Int:
7952 case X86::VFNMSUB132SDr_Int:
7953 case X86::VFMSUB213SDr_Int:
7954 case X86::VFNMSUB213SDr_Int:
7955 case X86::VFMSUB231SDr_Int:
7956 case X86::VFNMSUB231SDr_Int:
7957 case X86::VFMADD132SDZr_Int:
7958 case X86::VFNMADD132SDZr_Int:
7959 case X86::VFMADD213SDZr_Int:
7960 case X86::VFNMADD213SDZr_Int:
7961 case X86::VFMADD231SDZr_Int:
7962 case X86::VFNMADD231SDZr_Int:
7963 case X86::VFMSUB132SDZr_Int:
7964 case X86::VFNMSUB132SDZr_Int:
7965 case X86::VFMSUB213SDZr_Int:
7966 case X86::VFNMSUB213SDZr_Int:
7967 case X86::VFMSUB231SDZr_Int:
7968 case X86::VFNMSUB231SDZr_Int:
7969 case X86::VFMADD132SDZrk_Int:
7970 case X86::VFNMADD132SDZrk_Int:
7971 case X86::VFMADD213SDZrk_Int:
7972 case X86::VFNMADD213SDZrk_Int:
7973 case X86::VFMADD231SDZrk_Int:
7974 case X86::VFNMADD231SDZrk_Int:
7975 case X86::VFMSUB132SDZrk_Int:
7976 case X86::VFNMSUB132SDZrk_Int:
7977 case X86::VFMSUB213SDZrk_Int:
7978 case X86::VFNMSUB213SDZrk_Int:
7979 case X86::VFMSUB231SDZrk_Int:
7980 case X86::VFNMSUB231SDZrk_Int:
7981 case X86::VFMADD132SDZrkz_Int:
7982 case X86::VFNMADD132SDZrkz_Int:
7983 case X86::VFMADD213SDZrkz_Int:
7984 case X86::VFNMADD213SDZrkz_Int:
7985 case X86::VFMADD231SDZrkz_Int:
7986 case X86::VFNMADD231SDZrkz_Int:
7987 case X86::VFMSUB132SDZrkz_Int:
7988 case X86::VFNMSUB132SDZrkz_Int:
7989 case X86::VFMSUB213SDZrkz_Int:
7990 case X86::VFNMSUB213SDZrkz_Int:
7991 case X86::VFMSUB231SDZrkz_Int:
7992 case X86::VFNMSUB231SDZrkz_Int:
7993 case X86::VFIXUPIMMSDZrri:
7994 case X86::VFIXUPIMMSDZrrik:
7995 case X86::VFIXUPIMMSDZrrikz:
7996 case X86::VFPCLASSSDZri:
7997 case X86::VFPCLASSSDZrik:
7998 case X86::VGETEXPSDZr:
7999 case X86::VGETEXPSDZrk:
8000 case X86::VGETEXPSDZrkz:
8001 case X86::VGETMANTSDZrri:
8002 case X86::VGETMANTSDZrrik:
8003 case X86::VGETMANTSDZrrikz:
8004 case X86::VRANGESDZrri:
8005 case X86::VRANGESDZrrik:
8006 case X86::VRANGESDZrrikz:
8007 case X86::VRCP14SDZrr:
8008 case X86::VRCP14SDZrrk:
8009 case X86::VRCP14SDZrrkz:
8010 case X86::VRCP28SDZr:
8011 case X86::VRCP28SDZrk:
8012 case X86::VRCP28SDZrkz:
8013 case X86::VREDUCESDZrri:
8014 case X86::VREDUCESDZrrik:
8015 case X86::VREDUCESDZrrikz:
8016 case X86::VRNDSCALESDZrri_Int:
8017 case X86::VRNDSCALESDZrrik_Int:
8018 case X86::VRNDSCALESDZrrikz_Int:
8019 case X86::VRSQRT14SDZrr:
8020 case X86::VRSQRT14SDZrrk:
8021 case X86::VRSQRT14SDZrrkz:
8022 case X86::VRSQRT28SDZr:
8023 case X86::VRSQRT28SDZrk:
8024 case X86::VRSQRT28SDZrkz:
8025 case X86::VSCALEFSDZrr:
8026 case X86::VSCALEFSDZrrk:
8027 case X86::VSCALEFSDZrrkz:
8028 return false;
8029 default:
8030 return true;
8031 }
8032 }
8033
8034 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
8035 // These instructions only load 16 bits, we can't fold them if the
8036 // destination register is wider than 16 bits (2 bytes), and its user
8037 // instruction isn't scalar (SH).
8038 switch (UserOpc) {
8039 case X86::VADDSHZrr_Int:
8040 case X86::VCMPSHZrri_Int:
8041 case X86::VDIVSHZrr_Int:
8042 case X86::VMAXSHZrr_Int:
8043 case X86::VMINSHZrr_Int:
8044 case X86::VMULSHZrr_Int:
8045 case X86::VSUBSHZrr_Int:
8046 case X86::VADDSHZrrk_Int:
8047 case X86::VADDSHZrrkz_Int:
8048 case X86::VCMPSHZrrik_Int:
8049 case X86::VDIVSHZrrk_Int:
8050 case X86::VDIVSHZrrkz_Int:
8051 case X86::VMAXSHZrrk_Int:
8052 case X86::VMAXSHZrrkz_Int:
8053 case X86::VMINSHZrrk_Int:
8054 case X86::VMINSHZrrkz_Int:
8055 case X86::VMULSHZrrk_Int:
8056 case X86::VMULSHZrrkz_Int:
8057 case X86::VSUBSHZrrk_Int:
8058 case X86::VSUBSHZrrkz_Int:
8059 case X86::VFMADD132SHZr_Int:
8060 case X86::VFNMADD132SHZr_Int:
8061 case X86::VFMADD213SHZr_Int:
8062 case X86::VFNMADD213SHZr_Int:
8063 case X86::VFMADD231SHZr_Int:
8064 case X86::VFNMADD231SHZr_Int:
8065 case X86::VFMSUB132SHZr_Int:
8066 case X86::VFNMSUB132SHZr_Int:
8067 case X86::VFMSUB213SHZr_Int:
8068 case X86::VFNMSUB213SHZr_Int:
8069 case X86::VFMSUB231SHZr_Int:
8070 case X86::VFNMSUB231SHZr_Int:
8071 case X86::VFMADD132SHZrk_Int:
8072 case X86::VFNMADD132SHZrk_Int:
8073 case X86::VFMADD213SHZrk_Int:
8074 case X86::VFNMADD213SHZrk_Int:
8075 case X86::VFMADD231SHZrk_Int:
8076 case X86::VFNMADD231SHZrk_Int:
8077 case X86::VFMSUB132SHZrk_Int:
8078 case X86::VFNMSUB132SHZrk_Int:
8079 case X86::VFMSUB213SHZrk_Int:
8080 case X86::VFNMSUB213SHZrk_Int:
8081 case X86::VFMSUB231SHZrk_Int:
8082 case X86::VFNMSUB231SHZrk_Int:
8083 case X86::VFMADD132SHZrkz_Int:
8084 case X86::VFNMADD132SHZrkz_Int:
8085 case X86::VFMADD213SHZrkz_Int:
8086 case X86::VFNMADD213SHZrkz_Int:
8087 case X86::VFMADD231SHZrkz_Int:
8088 case X86::VFNMADD231SHZrkz_Int:
8089 case X86::VFMSUB132SHZrkz_Int:
8090 case X86::VFNMSUB132SHZrkz_Int:
8091 case X86::VFMSUB213SHZrkz_Int:
8092 case X86::VFNMSUB213SHZrkz_Int:
8093 case X86::VFMSUB231SHZrkz_Int:
8094 case X86::VFNMSUB231SHZrkz_Int:
8095 return false;
8096 default:
8097 return true;
8098 }
8099 }
8100
8101 return false;
8102}
8103
8106 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
8107 LiveIntervals *LIS) const {
8108
     // Try to fold the load performed by LoadMI into the operands of MI that
     // are listed in Ops. Every rejection path in this body returns nullptr;
     // on the remaining paths the result is whatever the frame-index overload
     // of foldMemoryOperandImpl, foldMemoryBroadcast, or the operand-list
     // overload of foldMemoryOperandImpl returns.
8109 // TODO: Support the case where LoadMI loads a wide register, but MI
8110 // only uses a subreg.
8111 for (auto Op : Ops) {
8112 if (MI.getOperand(Op).getSubReg())
8113 return nullptr;
8114 }
8115
8116 // If loading from a FrameIndex, fold directly from the FrameIndex.
8117 unsigned NumOps = LoadMI.getDesc().getNumOperands();
8118 int FrameIndex;
8119 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
8120 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8121 return nullptr;
8122 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
8123 }
8124
8125 // Honor the -disable-spill-fusing command-line switch.
8126 if (NoFusing)
8127 return nullptr;
8128
8129 // Avoid partial and undef register update stalls unless optimizing for size.
8130 if (!MF.getFunction().hasOptSize() &&
8131 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8133 return nullptr;
8134
8135 // Do not fold an NDD instruction with a memory instruction that carries a
8136 // relocation, to avoid emitting an APX relocation when the flag is disabled
8137 // for backward compatibility.
8138 uint64_t TSFlags = MI.getDesc().TSFlags;
8140 X86II::hasNewDataDest(TSFlags))
8141 return nullptr;
8142
8143 // Determine the alignment of the load.
8144 Align Alignment;
8145 unsigned LoadOpc = LoadMI.getOpcode();
8146 if (LoadMI.hasOneMemOperand())
8147 Alignment = (*LoadMI.memoperands_begin())->getAlign();
8148 else
     // LoadMI does not have exactly one memory operand: only the zero /
     // all-ones materialization pseudos below are accepted, each with a fixed
     // alignment; any other opcode is rejected.
8149 switch (LoadOpc) {
8150 case X86::AVX512_512_SET0:
8151 case X86::AVX512_512_SETALLONES:
8152 Alignment = Align(64);
8153 break;
8154 case X86::AVX2_SETALLONES:
8155 case X86::AVX1_SETALLONES:
8156 case X86::AVX_SET0:
8157 case X86::AVX512_256_SET0:
8158 Alignment = Align(32);
8159 break;
8160 case X86::V_SET0:
8161 case X86::V_SETALLONES:
8162 case X86::AVX512_128_SET0:
8163 case X86::FsFLD0F128:
8164 case X86::AVX512_FsFLD0F128:
8165 Alignment = Align(16);
8166 break;
8167 case X86::MMX_SET0:
8168 case X86::FsFLD0SD:
8169 case X86::AVX512_FsFLD0SD:
8170 Alignment = Align(8);
8171 break;
8172 case X86::FsFLD0SS:
8173 case X86::AVX512_FsFLD0SS:
8174 Alignment = Align(4);
8175 break;
8176 case X86::FsFLD0SH:
8177 case X86::AVX512_FsFLD0SH:
8178 Alignment = Align(2);
8179 break;
8180 default:
8181 return nullptr;
8182 }
     // Folding into both operand 0 and operand 1 is only handled for TESTrr,
     // which is rewritten in place to the CMPri-with-zero form so that a
     // single operand remains to be folded. Any other operand list must name
     // exactly one operand.
     // NOTE(review): MI is modified here before the remaining checks run; the
     // later nullptr returns in this function do not visibly restore it --
     // confirm that callers tolerate this.
8183 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8184 unsigned NewOpc = 0;
8185 switch (MI.getOpcode()) {
8186 default:
8187 return nullptr;
8188 case X86::TEST8rr:
8189 NewOpc = X86::CMP8ri;
8190 break;
8191 case X86::TEST16rr:
8192 NewOpc = X86::CMP16ri;
8193 break;
8194 case X86::TEST32rr:
8195 NewOpc = X86::CMP32ri;
8196 break;
8197 case X86::TEST64rr:
8198 NewOpc = X86::CMP64ri32;
8199 break;
8200 }
8201 // Change to CMPXXri r, 0 first.
8202 MI.setDesc(get(NewOpc));
8203 MI.getOperand(1).ChangeToImmediate(0);
8204 } else if (Ops.size() != 1)
8205 return nullptr;
8206
8207 // Make sure the subregisters match.
8208 // Otherwise we risk changing the size of the load.
8209 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8210 return nullptr;
8211
     // Build the memory operands (MOs) according to the kind of LoadMI:
     // constant-materialization pseudo, broadcast load, or ordinary load.
8213 switch (LoadOpc) {
8214 case X86::MMX_SET0:
8215 case X86::V_SET0:
8216 case X86::V_SETALLONES:
8217 case X86::AVX2_SETALLONES:
8218 case X86::AVX1_SETALLONES:
8219 case X86::AVX_SET0:
8220 case X86::AVX512_128_SET0:
8221 case X86::AVX512_256_SET0:
8222 case X86::AVX512_512_SET0:
8223 case X86::AVX512_512_SETALLONES:
8224 case X86::FsFLD0SH:
8225 case X86::AVX512_FsFLD0SH:
8226 case X86::FsFLD0SD:
8227 case X86::AVX512_FsFLD0SD:
8228 case X86::FsFLD0SS:
8229 case X86::AVX512_FsFLD0SS:
8230 case X86::FsFLD0F128:
8231 case X86::AVX512_FsFLD0F128: {
8232 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8233 // Create a constant-pool entry and operands to load from it.
8234
8235 // Large code model can't fold loads this way.
8237 return nullptr;
8238
8239 // x86-32 PIC requires a PIC base register for constant pools.
8240 unsigned PICBase = 0;
8241 // Since we're using Small or Kernel code model, we can always use
8242 // RIP-relative addressing for a smaller encoding.
8243 if (Subtarget.is64Bit()) {
8244 PICBase = X86::RIP;
8245 } else if (MF.getTarget().isPositionIndependent()) {
8246 // FIXME: PICBase = getGlobalBaseReg(&MF);
8247 // This doesn't work for several reasons.
8248 // 1. GlobalBaseReg may have been spilled.
8249 // 2. It may not be live at MI.
8250 return nullptr;
8251 }
8252
8253 // Create a constant-pool entry.
8255 Type *Ty;
8256 bool IsAllOnes = false;
     // Select the constant's type from the pseudo opcode; the *_SETALLONES
     // opcodes additionally set IsAllOnes before falling through to the
     // matching zero case.
8257 switch (LoadOpc) {
8258 case X86::FsFLD0SS:
8259 case X86::AVX512_FsFLD0SS:
8261 break;
8262 case X86::FsFLD0SD:
8263 case X86::AVX512_FsFLD0SD:
8265 break;
8266 case X86::FsFLD0F128:
8267 case X86::AVX512_FsFLD0F128:
8269 break;
8270 case X86::FsFLD0SH:
8271 case X86::AVX512_FsFLD0SH:
8273 break;
8274 case X86::AVX512_512_SETALLONES:
8275 IsAllOnes = true;
8276 [[fallthrough]];
8277 case X86::AVX512_512_SET0:
8279 16);
8280 break;
8281 case X86::AVX1_SETALLONES:
8282 case X86::AVX2_SETALLONES:
8283 IsAllOnes = true;
8284 [[fallthrough]];
8285 case X86::AVX512_256_SET0:
8286 case X86::AVX_SET0:
8288 8);
8289
8290 break;
8291 case X86::MMX_SET0:
8293 2);
8294 break;
8295 case X86::V_SETALLONES:
8296 IsAllOnes = true;
8297 [[fallthrough]];
8298 case X86::V_SET0:
8299 case X86::AVX512_128_SET0:
8301 4);
8302 break;
8303 }
8304
8305 const Constant *C =
8307 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8308
8309 // Create operands to load from the constant pool entry.
8310 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8312 MOs.push_back(MachineOperand::CreateReg(0, false));
8314 MOs.push_back(MachineOperand::CreateReg(0, false));
8315 break;
8316 }
8317 case X86::VPBROADCASTBZ128rm:
8318 case X86::VPBROADCASTBZ256rm:
8319 case X86::VPBROADCASTBZrm:
8320 case X86::VBROADCASTF32X2Z256rm:
8321 case X86::VBROADCASTF32X2Zrm:
8322 case X86::VBROADCASTI32X2Z128rm:
8323 case X86::VBROADCASTI32X2Z256rm:
8324 case X86::VBROADCASTI32X2Zrm:
8325 // No instructions currently fuse with 8bits or 32bits x 2.
8326 return nullptr;
8327
     // FOLD_BROADCAST(SIZE): append LoadMI's trailing X86::AddrNumOperands
     // operands to MOs and return the result of foldMemoryBroadcast, passing
     // SIZE as the broadcast element size in bits.
8328#define FOLD_BROADCAST(SIZE) \
8329 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8330 LoadMI.operands_begin() + NumOps); \
8331 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8332 /*AllowCommute=*/true);
8333 case X86::VPBROADCASTWZ128rm:
8334 case X86::VPBROADCASTWZ256rm:
8335 case X86::VPBROADCASTWZrm:
8336 FOLD_BROADCAST(16);
8337 case X86::VPBROADCASTDZ128rm:
8338 case X86::VPBROADCASTDZ256rm:
8339 case X86::VPBROADCASTDZrm:
8340 case X86::VBROADCASTSSZ128rm:
8341 case X86::VBROADCASTSSZ256rm:
8342 case X86::VBROADCASTSSZrm:
8343 FOLD_BROADCAST(32);
8344 case X86::VPBROADCASTQZ128rm:
8345 case X86::VPBROADCASTQZ256rm:
8346 case X86::VPBROADCASTQZrm:
8347 case X86::VBROADCASTSDZ256rm:
8348 case X86::VBROADCASTSDZrm:
8349 FOLD_BROADCAST(64);
8350 default: {
8351 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8352 return nullptr;
8353
8354 // Folding a normal load. Just copy the load's address operands.
8355 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
8356 LoadMI.operands_begin() + NumOps);
8357 break;
8358 }
8359 }
     // Constant-pool and ordinary loads end up here; the broadcast cases have
     // already returned through FOLD_BROADCAST.
8360 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8361 /*Size=*/0, Alignment, /*AllowCommute=*/true);
8362}
8363
8365X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8366 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8368 unsigned BitsSize, bool AllowCommute) const {
8369
     // Fold a broadcast load, described by the address operands in MOs, into
     // operand OpNum of MI. Returns the instruction produced by fuseInst, or
     // nullptr when no fold is possible.
     //
     // If the broadcast fold table has an entry for (opcode, OpNum), that
     // entry decides the outcome: it is used only when matchBroadcastSize
     // accepts BitsSize, and no commuting is attempted otherwise.
8370 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8371 return matchBroadcastSize(*I, BitsSize)
8372 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8373 : nullptr;
8374
8375 if (AllowCommute) {
8376 // If the instruction and target operand are commutable, commute the
8377 // instruction and try again.
8378 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
     // Getting OpNum back is treated as "could not commute".
8379 if (CommuteOpIdx2 == OpNum) {
8380 printFailMsgforFold(MI, OpNum);
8381 return nullptr;
8382 }
     // Retry exactly once on the commuted operand; AllowCommute=false keeps
     // the recursion from commuting again.
8383 MachineInstr *NewMI =
8384 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8385 /*AllowCommute=*/false);
8386 if (NewMI)
8387 return NewMI;
8388 // Folding failed again - undo the commute before returning.
8389 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8390 }
8391
8392 printFailMsgforFold(MI, OpNum);
8393 return nullptr;
8394}
8395
8399
     // Collect the load side of each memory operand in MMOs into LoadMMOs:
     //  - operands that are not loads are skipped;
     //  - load-only operands are reused as-is;
     //  - operands that are both load and store are cloned through
     //    MF.getMachineMemOperand with the MOStore flag cleared.
8400 for (MachineMemOperand *MMO : MMOs) {
8401 if (!MMO->isLoad())
8402 continue;
8403
8404 if (!MMO->isStore()) {
8405 // Reuse the MMO.
8406 LoadMMOs.push_back(MMO);
8407 } else {
8408 // Clone the MMO and unset the store flag.
8409 LoadMMOs.push_back(MF.getMachineMemOperand(
8410 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8411 }
8412 }
8413
8414 return LoadMMOs;
8415}
8416
8420
     // Collect the store side of each memory operand in MMOs into StoreMMOs:
     //  - operands that are not stores are skipped;
     //  - store-only operands are reused as-is;
     //  - operands that are both load and store are cloned through
     //    MF.getMachineMemOperand with the MOLoad flag cleared.
8421 for (MachineMemOperand *MMO : MMOs) {
8422 if (!MMO->isStore())
8423 continue;
8424
8425 if (!MMO->isLoad()) {
8426 // Reuse the MMO.
8427 StoreMMOs.push_back(MMO);
8428 } else {
8429 // Clone the MMO and unset the load flag.
8430 StoreMMOs.push_back(MF.getMachineMemOperand(
8431 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8432 }
8433 }
8434
8435 return StoreMMOs;
8436}
8437
8439 const TargetRegisterClass *RC,
8440 const X86Subtarget &STI) {
     // Pick the broadcast-load opcode for a fold-table entry. The broadcast
     // element kind comes from I->Flags & TB_BCAST_MASK and the vector width
     // from the spill size of RC (16, 32 or 64 bytes). Any other spill size or
     // broadcast kind hits llvm_unreachable.
8441 assert(STI.hasAVX512() && "Expected at least AVX512!");
8442 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8443 assert((SpillSize == 64 || STI.hasVLX()) &&
8444 "Can't broadcast less than 64 bytes without AVX512VL!");
8445
     // CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64): emit `case TYPE:` that
     // returns X86::OP16 / OP32 / OP64 for a 16 / 32 / 64 byte spill size.
8446#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8447 case TYPE: \
8448 switch (SpillSize) { \
8449 default: \
8450 llvm_unreachable("Unknown spill size"); \
8451 case 16: \
8452 return X86::OP16; \
8453 case 32: \
8454 return X86::OP32; \
8455 case 64: \
8456 return X86::OP64; \
8457 } \
8458 break;
8459
     // Note that TB_BCAST_SH maps to the same VPBROADCASTW opcodes as
     // TB_BCAST_W, and that the 16-byte TB_BCAST_SD case uses VMOVDDUPZ128rm
     // rather than a VBROADCASTSD opcode.
8460 switch (I->Flags & TB_BCAST_MASK) {
8461 default:
8462 llvm_unreachable("Unexpected broadcast type!");
8463 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8464 VPBROADCASTWZrm)
8465 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8466 VPBROADCASTDZrm)
8467 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8468 VPBROADCASTQZrm)
8469 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8470 VPBROADCASTWZrm)
8471 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8472 VBROADCASTSSZrm)
8473 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8474 VBROADCASTSDZrm)
8475 }
8476}
8477
8479 MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad,
8480 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
     // Split MI, whose opcode must have an entry in the unfold table, into a
     // register-form instruction (opcode I->DstOp) plus a separate load and/or
     // store that go through Reg. New instructions are appended to NewMIs in
     // the order: load (if UnfoldLoad), data instruction, store (if
     // UnfoldStore).
     //
     // Returns false, before creating any instruction, when there is no table
     // entry, when the requested load/store was not folded into MI, or in the
     // slow-unaligned VR128 case below. Returns true otherwise.
8481 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8482 if (I == nullptr)
8483 return false;
8484 unsigned Opc = I->DstOp;
8485 unsigned Index = I->Flags & TB_INDEX_MASK;
8486 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8487 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8488 if (UnfoldLoad && !FoldedLoad)
8489 return false;
8490 UnfoldLoad &= FoldedLoad;
8491 if (UnfoldStore && !FoldedStore)
8492 return false;
8493 UnfoldStore &= FoldedStore;
8494
8495 const MCInstrDesc &MCID = get(Opc);
8496
8497 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
8499 // TODO: Check if 32-byte or greater accesses are slow too?
8500 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8501 Subtarget.isUnalignedMem16Slow())
8502 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8503 // conservatively assume the address is unaligned. That's bad for
8504 // performance.
8505 return false;
     // Partition MI's operands around the folded memory reference: the
     // X86::AddrNumOperands operands starting at Index form the address,
     // implicit register operands are kept separately, and the remaining
     // operands are split into those before and those after Index.
8510 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8511 MachineOperand &Op = MI.getOperand(i);
8512 if (i >= Index && i < Index + X86::AddrNumOperands)
8513 AddrOps.push_back(Op);
8514 else if (Op.isReg() && Op.isImplicit())
8515 ImpOps.push_back(Op);
8516 else if (i < Index)
8517 BeforeOps.push_back(Op);
8518 else if (i > Index)
8519 AfterOps.push_back(Op);
8520 }
8521
8522 // Emit the load or broadcast instruction.
8523 if (UnfoldLoad) {
8524 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8525
8526 unsigned Opc;
8527 if (I->Flags & TB_BCAST_MASK) {
8528 Opc = getBroadcastOpcode(I, RC, Subtarget);
8529 } else {
     // The load is treated as aligned only if the first load memory operand
     // is aligned to at least max(spill size of RC, 16) bytes.
8530 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8531 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8532 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8533 }
8534
8535 DebugLoc DL;
8536 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8537 for (const MachineOperand &AddrOp : AddrOps)
8538 MIB.add(AddrOp);
8539 MIB.setMemRefs(MMOs);
8540 NewMIs.push_back(MIB);
8541
8542 if (UnfoldStore) {
8543 // Address operands cannot be marked isKill.
     // (The same address operands are added again to the store emitted below.)
8544 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8545 MachineOperand &MO = NewMIs[0]->getOperand(i);
8546 if (MO.isReg())
8547 MO.setIsKill(false);
8548 }
8549 }
8550 }
8551
8552 // Emit the data processing instruction.
8553 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8554 MachineInstrBuilder MIB(MF, DataMI);
8555
     // Reg stands in for the memory reference: it is added as a def when a
     // store was folded, and as a use between BeforeOps and AfterOps when a
     // load was folded.
8556 if (FoldedStore)
8557 MIB.addReg(Reg, RegState::Define);
8558 for (MachineOperand &BeforeOp : BeforeOps)
8559 MIB.add(BeforeOp);
8560 if (FoldedLoad)
8561 MIB.addReg(Reg);
8562 for (MachineOperand &AfterOp : AfterOps)
8563 MIB.add(AfterOp);
     // Re-add the implicit register operands, carrying over their
     // def/kill/dead/undef state.
8564 for (MachineOperand &ImpOp : ImpOps) {
8565 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8567 getKillRegState(ImpOp.isKill()) |
8568 getDeadRegState(ImpOp.isDead()) |
8569 getUndefRegState(ImpOp.isUndef()));
8570 }
8571 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8572 switch (DataMI->getOpcode()) {
8573 default:
8574 break;
8575 case X86::CMP64ri32:
8576 case X86::CMP32ri:
8577 case X86::CMP16ri:
8578 case X86::CMP8ri: {
8579 MachineOperand &MO0 = DataMI->getOperand(0);
8580 MachineOperand &MO1 = DataMI->getOperand(1);
8581 if (MO1.isImm() && MO1.getImm() == 0) {
8582 unsigned NewOpc;
8583 switch (DataMI->getOpcode()) {
8584 default:
8585 llvm_unreachable("Unreachable!");
8586 case X86::CMP64ri32:
8587 NewOpc = X86::TEST64rr;
8588 break;
8589 case X86::CMP32ri:
8590 NewOpc = X86::TEST32rr;
8591 break;
8592 case X86::CMP16ri:
8593 NewOpc = X86::TEST16rr;
8594 break;
8595 case X86::CMP8ri:
8596 NewOpc = X86::TEST8rr;
8597 break;
8598 }
8599 DataMI->setDesc(get(NewOpc));
8600 MO1.ChangeToRegister(MO0.getReg(), false);
8601 }
8602 }
8603 }
8604 NewMIs.push_back(DataMI);
8605
8606 // Emit the store instruction.
8607 if (UnfoldStore) {
     // The store's register class is that of operand 0 of the unfolded
     // opcode; alignment is judged the same way as for the load above.
8608 const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
8609 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8610 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8611 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8612 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8613 DebugLoc DL;
8614 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8615 for (const MachineOperand &AddrOp : AddrOps)
8616 MIB.add(AddrOp);
8617 MIB.addReg(Reg, RegState::Kill);
8618 MIB.setMemRefs(MMOs);
8619 NewMIs.push_back(MIB);
8620 }
8621
8622 return true;
8623}
8624
8626 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
     // SelectionDAG counterpart of unfolding: replace machine node N, whose
     // opcode must have an unfold-table entry, with a load node (if a load
     // was folded), a register-form node using opcode I->DstOp, and a store
     // node (if a store was folded). The new nodes are appended to NewNodes
     // in that order. Returns false when N is not a machine node, has no
     // table entry, or would require a slow unaligned 16-byte access.
8627 if (!N->isMachineOpcode())
8628 return false;
8629
8630 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8631 if (I == nullptr)
8632 return false;
8633 unsigned Opc = I->DstOp;
8634 unsigned Index = I->Flags & TB_INDEX_MASK;
8635 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8636 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8637 const MCInstrDesc &MCID = get(Opc);
8640 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
8641 unsigned NumDefs = MCID.NumDefs;
8642 std::vector<SDValue> AddrOps;
8643 std::vector<SDValue> BeforeOps;
8644 std::vector<SDValue> AfterOps;
8645 SDLoc dl(N);
8646 unsigned NumOps = N->getNumOperands();
     // Partition N's operands, excluding the last one (taken as the chain
     // below). The table's Index is shifted down by NumDefs to get the
     // position of the address within N's operand list.
8647 for (unsigned i = 0; i != NumOps - 1; ++i) {
8648 SDValue Op = N->getOperand(i);
8649 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8650 AddrOps.push_back(Op);
8651 else if (i < Index - NumDefs)
8652 BeforeOps.push_back(Op);
8653 else if (i > Index - NumDefs)
8654 AfterOps.push_back(Op);
8655 }
8656 SDValue Chain = N->getOperand(NumOps - 1);
8657 AddrOps.push_back(Chain);
8658
8659 // Emit the load instruction.
8660 SDNode *Load = nullptr;
8661 if (FoldedLoad) {
8662 EVT VT = *TRI.legalclasstypes_begin(*RC);
8663 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8664 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8665 Subtarget.isUnalignedMem16Slow())
8666 // Do not introduce a slow unaligned load.
8667 return false;
8668 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8669 // memory access is slow above.
8670
8671 unsigned Opc;
8672 if (I->Flags & TB_BCAST_MASK) {
8673 Opc = getBroadcastOpcode(I, RC, Subtarget);
8674 } else {
8675 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8676 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8677 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8678 }
8679
8680 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8681 NewNodes.push_back(Load);
8682
8683 // Preserve memory reference information.
8684 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8685 }
8686
8687 // Emit the data processing instruction.
     // Result types: the type of the unfolded opcode's first def (if it has
     // one), followed by N's non-chain result types at positions at or beyond
     // the unfolded opcode's def count.
8688 std::vector<EVT> VTs;
8689 const TargetRegisterClass *DstRC = nullptr;
8690 if (MCID.getNumDefs() > 0) {
8691 DstRC = getRegClass(MCID, 0, &RI, MF);
8692 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8693 }
8694 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8695 EVT VT = N->getValueType(i);
8696 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8697 VTs.push_back(VT);
8698 }
     // The loaded value takes the place of the memory reference, between the
     // operands that preceded it and those that followed it.
8699 if (Load)
8700 BeforeOps.push_back(SDValue(Load, 0));
8701 llvm::append_range(BeforeOps, AfterOps);
8702 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8703 switch (Opc) {
8704 default:
8705 break;
8706 case X86::CMP64ri32:
8707 case X86::CMP32ri:
8708 case X86::CMP16ri:
8709 case X86::CMP8ri:
8710 if (isNullConstant(BeforeOps[1])) {
8711 switch (Opc) {
8712 default:
8713 llvm_unreachable("Unreachable!");
8714 case X86::CMP64ri32:
8715 Opc = X86::TEST64rr;
8716 break;
8717 case X86::CMP32ri:
8718 Opc = X86::TEST32rr;
8719 break;
8720 case X86::CMP16ri:
8721 Opc = X86::TEST16rr;
8722 break;
8723 case X86::CMP8ri:
8724 Opc = X86::TEST8rr;
8725 break;
8726 }
8727 BeforeOps[1] = BeforeOps[0];
8728 }
8729 }
8730 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8731 NewNodes.push_back(NewNode);
8732
8733 // Emit the store instruction.
8734 if (FoldedStore) {
     // Reuse the address operands: drop the trailing chain, append the value
     // to store (result 0 of NewNode), then put the chain back.
8735 AddrOps.pop_back();
8736 AddrOps.push_back(SDValue(NewNode, 0));
8737 AddrOps.push_back(Chain);
8738 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8739 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8740 Subtarget.isUnalignedMem16Slow())
8741 // Do not introduce a slow unaligned store.
8742 return false;
8743 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8744 // memory access is slow above.
     // NOTE(review): the alignment threshold (and the check above) use RC,
     // the class of operand Index, while the store opcode is selected with
     // DstRC -- confirm that mixing the two is intended.
8745 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8746 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8747 SDNode *Store =
8748 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8749 dl, MVT::Other, AddrOps);
8750 NewNodes.push_back(Store);
8751
8752 // Preserve memory reference information.
8753 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8754 }
8755
8756 return true;
8757}
8758
8759unsigned
8761 bool UnfoldStore,
8762 unsigned *LoadRegIndex) const {
     // Return the opcode (I->DstOp) that the fold-table entry I maps to, or 0
     // when there is no entry or when the entry does not fold the kind of
     // access the caller asked about (UnfoldLoad / UnfoldStore). If
     // LoadRegIndex is non-null it receives the entry's operand index
     // (I->Flags & TB_INDEX_MASK).
8764 if (I == nullptr)
8765 return 0;
8766 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8767 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8768 if (UnfoldLoad && !FoldedLoad)
8769 return 0;
8770 if (UnfoldStore && !FoldedStore)
8771 return 0;
8772 if (LoadRegIndex)
8773 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8774 return I->DstOp;
8775}
8776
8778 int64_t &Offset1,
8779 int64_t &Offset2) const {
     // Returns true, and sets Offset1 / Offset2 to the sign-extended
     // displacements of Load1 / Load2, when both nodes are machine loads from
     // the opcode list below, agree on every address component other than the
     // displacement, share operand 5 (the chain), and have constant
     // displacements. Returns false otherwise; the offsets are only written
     // on the success path.
8780 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8781 return false;
8782
     // Opcodes accepted as plain loads for this query.
8783 auto IsLoadOpcode = [&](unsigned Opcode) {
8784 switch (Opcode) {
8785 default:
8786 return false;
8787 case X86::MOV8rm:
8788 case X86::MOV16rm:
8789 case X86::MOV32rm:
8790 case X86::MOV64rm:
8791 case X86::LD_Fp32m:
8792 case X86::LD_Fp64m:
8793 case X86::LD_Fp80m:
8794 case X86::MOVSSrm:
8795 case X86::MOVSSrm_alt:
8796 case X86::MOVSDrm:
8797 case X86::MOVSDrm_alt:
8798 case X86::MMX_MOVD64rm:
8799 case X86::MMX_MOVQ64rm:
8800 case X86::MOVAPSrm:
8801 case X86::MOVUPSrm:
8802 case X86::MOVAPDrm:
8803 case X86::MOVUPDrm:
8804 case X86::MOVDQArm:
8805 case X86::MOVDQUrm:
8806 // AVX load instructions
8807 case X86::VMOVSSrm:
8808 case X86::VMOVSSrm_alt:
8809 case X86::VMOVSDrm:
8810 case X86::VMOVSDrm_alt:
8811 case X86::VMOVAPSrm:
8812 case X86::VMOVUPSrm:
8813 case X86::VMOVAPDrm:
8814 case X86::VMOVUPDrm:
8815 case X86::VMOVDQArm:
8816 case X86::VMOVDQUrm:
8817 case X86::VMOVAPSYrm:
8818 case X86::VMOVUPSYrm:
8819 case X86::VMOVAPDYrm:
8820 case X86::VMOVUPDYrm:
8821 case X86::VMOVDQAYrm:
8822 case X86::VMOVDQUYrm:
8823 // AVX512 load instructions
8824 case X86::VMOVSSZrm:
8825 case X86::VMOVSSZrm_alt:
8826 case X86::VMOVSDZrm:
8827 case X86::VMOVSDZrm_alt:
8828 case X86::VMOVAPSZ128rm:
8829 case X86::VMOVUPSZ128rm:
8830 case X86::VMOVAPSZ128rm_NOVLX:
8831 case X86::VMOVUPSZ128rm_NOVLX:
8832 case X86::VMOVAPDZ128rm:
8833 case X86::VMOVUPDZ128rm:
8834 case X86::VMOVDQU8Z128rm:
8835 case X86::VMOVDQU16Z128rm:
8836 case X86::VMOVDQA32Z128rm:
8837 case X86::VMOVDQU32Z128rm:
8838 case X86::VMOVDQA64Z128rm:
8839 case X86::VMOVDQU64Z128rm:
8840 case X86::VMOVAPSZ256rm:
8841 case X86::VMOVUPSZ256rm:
8842 case X86::VMOVAPSZ256rm_NOVLX:
8843 case X86::VMOVUPSZ256rm_NOVLX:
8844 case X86::VMOVAPDZ256rm:
8845 case X86::VMOVUPDZ256rm:
8846 case X86::VMOVDQU8Z256rm:
8847 case X86::VMOVDQU16Z256rm:
8848 case X86::VMOVDQA32Z256rm:
8849 case X86::VMOVDQU32Z256rm:
8850 case X86::VMOVDQA64Z256rm:
8851 case X86::VMOVDQU64Z256rm:
8852 case X86::VMOVAPSZrm:
8853 case X86::VMOVUPSZrm:
8854 case X86::VMOVAPDZrm:
8855 case X86::VMOVUPDZrm:
8856 case X86::VMOVDQU8Zrm:
8857 case X86::VMOVDQU16Zrm:
8858 case X86::VMOVDQA32Zrm:
8859 case X86::VMOVDQU32Zrm:
8860 case X86::VMOVDQA64Zrm:
8861 case X86::VMOVDQU64Zrm:
8862 case X86::KMOVBkm:
8863 case X86::KMOVBkm_EVEX:
8864 case X86::KMOVWkm:
8865 case X86::KMOVWkm_EVEX:
8866 case X86::KMOVDkm:
8867 case X86::KMOVDkm_EVEX:
8868 case X86::KMOVQkm:
8869 case X86::KMOVQkm_EVEX:
8870 return true;
8871 }
8872 };
8873
8874 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8875 !IsLoadOpcode(Load2->getMachineOpcode()))
8876 return false;
8877
8878 // Lambda to check if both the loads have the same value for an operand index.
8879 auto HasSameOp = [&](int I) {
8880 return Load1->getOperand(I) == Load2->getOperand(I);
8881 };
8882
8883 // All operands except the displacement should match.
8884 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8885 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8886 return false;
8887
8888 // Chain Operand must be the same.
     // (Operand 5 is addressed by a literal index here rather than a named
     // constant.)
8889 if (!HasSameOp(5))
8890 return false;
8891
8892 // Now let's examine if the displacements are constants.
8893 auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
8894 auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
8895 if (!Disp1 || !Disp2)
8896 return false;
8897
8898 Offset1 = Disp1->getSExtValue();
8899 Offset2 = Disp2->getSExtValue();
8900 return true;
8901}
8902
8904 int64_t Offset1, int64_t Offset2,
8905 unsigned NumLoads) const {
     // Heuristic deciding whether Load1 and Load2 should be scheduled close
     // together. Requires Offset2 > Offset1 (asserted). Returns false when
     // (Offset2 - Offset1) / 8 exceeds 64, when the two machine opcodes
     // differ, for the x87 / MMX load opcodes listed below, or when NumLoads
     // has reached the limit for the loaded value type; returns true
     // otherwise.
8906 assert(Offset2 > Offset1);
8907 if ((Offset2 - Offset1) / 8 > 64)
8908 return false;
8909
8910 unsigned Opc1 = Load1->getMachineOpcode();
8911 unsigned Opc2 = Load2->getMachineOpcode();
8912 if (Opc1 != Opc2)
8913 return false; // FIXME: overly conservative?
8914
     // x87 and MMX loads are never clustered.
8915 switch (Opc1) {
8916 default:
8917 break;
8918 case X86::LD_Fp32m:
8919 case X86::LD_Fp64m:
8920 case X86::LD_Fp80m:
8921 case X86::MMX_MOVD64rm:
8922 case X86::MMX_MOVQ64rm:
8923 return false;
8924 }
8925
     // Limit on NumLoads by value type: the scalar types listed below (and
     // every type on non-64-bit targets) accept only NumLoads == 0; all other
     // types accept NumLoads below 3 on 64-bit targets.
8926 EVT VT = Load1->getValueType(0);
8927 switch (VT.getSimpleVT().SimpleTy) {
8928 default:
8929 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
8930 // have 16 of them to play with.
8931 if (Subtarget.is64Bit()) {
8932 if (NumLoads >= 3)
8933 return false;
8934 } else if (NumLoads) {
8935 return false;
8936 }
8937 break;
8938 case MVT::i8:
8939 case MVT::i16:
8940 case MVT::i32:
8941 case MVT::i64:
8942 case MVT::f32:
8943 case MVT::f64:
8944 if (NumLoads)
8945 return false;
8946 break;
8947 }
8948
8949 return true;
8950}
8951
8953 const MachineBasicBlock *MBB,
8954 const MachineFunction &MF) const {
8955
     // Returns true for instructions that must act as a scheduling boundary:
     // the opcodes checked first, then frame setup / destroy instructions.
8956 // ENDBR and PLDTILECFGV instructions should not be scheduled around.
8957 unsigned Opcode = MI.getOpcode();
8958 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
8959 Opcode == X86::PLDTILECFGV)
8960 return true;
8961
8962 // Frame setup and destroy can't be scheduled around.
8963 if (MI.getFlag(MachineInstr::FrameSetup) ||
8965 return true;
8966
8968}
8969
8972 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
     // Replace the single condition-code immediate in Cond with its opposite,
     // in place. This implementation always returns false.
8973 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
8974 Cond[0].setImm(GetOppositeBranchCondition(CC));
8975 return false;
8976}
8977
8979 const TargetRegisterClass *RC) const {
8980 // FIXME: Return false for x87 stack register classes for now. We can't
8981 // allow any loads of these registers before FpGet_ST0_80.
     // The CCR and DFCCR register classes are rejected as well; every other
     // register class yields true.
8982 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
8983 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
8984 RC == &X86::RFP80RegClass);
8985}
8986
8987/// Return a virtual register initialized with the
8988/// global base register value. Output instructions required to
8989/// initialize the register in the function entry block, if necessary.
8990///
8991/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
8992///
     // Reuse the register already recorded in the function info, if any.
8995 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
8996 if (GlobalBaseReg)
8997 return GlobalBaseReg;
8998
8999 // Create the register. The code to initialize it is inserted
9000 // later, by the CGBR pass (below).
     // The register class is GR64_NOSP on 64-bit subtargets and GR32_NOSP
     // otherwise; the new register is recorded so that later calls return it.
9001 MachineRegisterInfo &RegInfo = MF->getRegInfo();
9002 GlobalBaseReg = RegInfo.createVirtualRegister(
9003 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
9004 X86FI->setGlobalBaseReg(GlobalBaseReg);
9005 return GlobalBaseReg;
9006}
9007
9008// FIXME: Some shuffle and unpack instructions have equivalents in different
9009// domains, but they require a bit more work than just switching opcodes.
9010
9011static const uint16_t *lookup(unsigned opcode, unsigned domain,
9012 ArrayRef<uint16_t[3]> Table) {
9013 for (const uint16_t(&Row)[3] : Table)
9014 if (Row[domain - 1] == opcode)
9015 return Row;
9016 return nullptr;
9017}
9018
9019static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9020 ArrayRef<uint16_t[4]> Table) {
9021 // If this is the integer domain make sure to check both integer columns.
9022 for (const uint16_t(&Row)[4] : Table)
9023 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
9024 return Row;
9025 return nullptr;
9026}
9027
/// Helper to attempt to widen/narrow blend masks.
///
/// Rescales \p OldMask, which holds one bit per element for \p OldWidth
/// elements, to a mask holding one bit per element for \p NewWidth elements.
/// Both widths must be non-zero and one must be a multiple of the other.
///
/// - Narrowing (OldWidth is a multiple of NewWidth): each group of
///   `OldWidth / NewWidth` old bits collapses into one new bit. The group must
///   be all ones or all zeros; a mixed group makes the conversion fail.
/// - Widening: every set old bit expands into `NewWidth / OldWidth`
///   consecutive set bits. This direction always succeeds.
///
/// \param pNewMask if non-null, receives the converted mask on success; it is
///        left untouched on failure.
/// \returns true if the mask can be expressed at the new width.
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
                            unsigned NewWidth, unsigned *pNewMask = nullptr) {
  // A zero width would divide by zero in the checks below.
  assert(OldWidth != 0 && NewWidth != 0 && "Blend mask width must be non-zero");
  assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
         "Illegal blend mask scale");
  unsigned NewMask = 0;

  if ((OldWidth % NewWidth) == 0) {
    // Narrow: every group of Scale old bits must be uniform.
    const unsigned Scale = OldWidth / NewWidth;
    const unsigned SubMask = (1u << Scale) - 1;
    for (unsigned i = 0; i != NewWidth; ++i) {
      const unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
      if (Sub == SubMask)
        NewMask |= (1u << i);
      else if (Sub != 0x0)
        return false;
    }
  } else {
    // Widen: replicate each set bit Scale times. All shifts are done on
    // unsigned operands, matching the narrowing path.
    const unsigned Scale = NewWidth / OldWidth;
    const unsigned SubMask = (1u << Scale) - 1;
    for (unsigned i = 0; i != OldWidth; ++i) {
      if (OldMask & (1u << i))
        NewMask |= (SubMask << (i * Scale));
    }
  }

  if (pNewMask)
    *pNewMask = NewMask;
  return true;
}
9059
9061 unsigned Opcode = MI.getOpcode();
9062 unsigned NumOperands = MI.getDesc().getNumOperands();
9063
9064 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
9065 uint16_t validDomains = 0;
9066 if (MI.getOperand(NumOperands - 1).isImm()) {
9067 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
9068 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
9069 validDomains |= 0x2; // PackedSingle
9070 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
9071 validDomains |= 0x4; // PackedDouble
9072 if (!Is256 || Subtarget.hasAVX2())
9073 validDomains |= 0x8; // PackedInt
9074 }
9075 return validDomains;
9076 };
9077
9078 switch (Opcode) {
9079 case X86::BLENDPDrmi:
9080 case X86::BLENDPDrri:
9081 case X86::VBLENDPDrmi:
9082 case X86::VBLENDPDrri:
9083 return GetBlendDomains(2, false);
9084 case X86::VBLENDPDYrmi:
9085 case X86::VBLENDPDYrri:
9086 return GetBlendDomains(4, true);
9087 case X86::BLENDPSrmi:
9088 case X86::BLENDPSrri:
9089 case X86::VBLENDPSrmi:
9090 case X86::VBLENDPSrri:
9091 case X86::VPBLENDDrmi:
9092 case X86::VPBLENDDrri:
9093 return GetBlendDomains(4, false);
9094 case X86::VBLENDPSYrmi:
9095 case X86::VBLENDPSYrri:
9096 case X86::VPBLENDDYrmi:
9097 case X86::VPBLENDDYrri:
9098 return GetBlendDomains(8, true);
9099 case X86::PBLENDWrmi:
9100 case X86::PBLENDWrri:
9101 case X86::VPBLENDWrmi:
9102 case X86::VPBLENDWrri:
9103 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
9104 case X86::VPBLENDWYrmi:
9105 case X86::VPBLENDWYrri:
9106 return GetBlendDomains(8, false);
9107 case X86::VPANDDZ128rr:
9108 case X86::VPANDDZ128rm:
9109 case X86::VPANDDZ256rr:
9110 case X86::VPANDDZ256rm:
9111 case X86::VPANDQZ128rr:
9112 case X86::VPANDQZ128rm:
9113 case X86::VPANDQZ256rr:
9114 case X86::VPANDQZ256rm:
9115 case X86::VPANDNDZ128rr:
9116 case X86::VPANDNDZ128rm:
9117 case X86::VPANDNDZ256rr:
9118 case X86::VPANDNDZ256rm:
9119 case X86::VPANDNQZ128rr:
9120 case X86::VPANDNQZ128rm:
9121 case X86::VPANDNQZ256rr:
9122 case X86::VPANDNQZ256rm:
9123 case X86::VPORDZ128rr:
9124 case X86::VPORDZ128rm:
9125 case X86::VPORDZ256rr:
9126 case X86::VPORDZ256rm:
9127 case X86::VPORQZ128rr:
9128 case X86::VPORQZ128rm:
9129 case X86::VPORQZ256rr:
9130 case X86::VPORQZ256rm:
9131 case X86::VPXORDZ128rr:
9132 case X86::VPXORDZ128rm:
9133 case X86::VPXORDZ256rr:
9134 case X86::VPXORDZ256rm:
9135 case X86::VPXORQZ128rr:
9136 case X86::VPXORQZ128rm:
9137 case X86::VPXORQZ256rr:
9138 case X86::VPXORQZ256rm:
9139 // If we don't have DQI see if we can still switch from an EVEX integer
9140 // instruction to a VEX floating point instruction.
9141 if (Subtarget.hasDQI())
9142 return 0;
9143
9144 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
9145 return 0;
9146 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
9147 return 0;
9148 // Register forms will have 3 operands. Memory form will have more.
9149 if (NumOperands == 3 &&
9150 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
9151 return 0;
9152
9153 // All domains are valid.
9154 return 0xe;
9155 case X86::MOVHLPSrr:
9156 // We can swap domains when both inputs are the same register.
9157 // FIXME: This doesn't catch all the cases we would like. If the input
9158 // register isn't KILLed by the instruction, the two address instruction
9159 // pass puts a COPY on one input. The other input uses the original
9160 // register. This prevents the same physical register from being used by
9161 // both inputs.
9162 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9163 MI.getOperand(0).getSubReg() == 0 &&
9164 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
9165 return 0x6;
9166 return 0;
9167 case X86::SHUFPDrri:
9168 return 0x6;
9169 }
9170 return 0;
9171}
9172
9173#include "X86ReplaceableInstrs.def"
9174
9176 unsigned Domain) const {
9177 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9178 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9179 assert(dom && "Not an SSE instruction");
9180
9181 unsigned Opcode = MI.getOpcode();
9182 unsigned NumOperands = MI.getDesc().getNumOperands();
9183
9184 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
9185 if (MI.getOperand(NumOperands - 1).isImm()) {
9186 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
9187 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
9188 unsigned NewImm = Imm;
9189
9190 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
9191 if (!table)
9192 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9193
9194 if (Domain == 1) { // PackedSingle
9195 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9196 } else if (Domain == 2) { // PackedDouble
9197 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
9198 } else if (Domain == 3) { // PackedInt
9199 if (Subtarget.hasAVX2()) {
9200 // If we are already VPBLENDW use that, else use VPBLENDD.
9201 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
9202 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9203 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9204 }
9205 } else {
9206 assert(!Is256 && "128-bit vector expected");
9207 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
9208 }
9209 }
9210
9211 assert(table && table[Domain - 1] && "Unknown domain op");
9212 MI.setDesc(get(table[Domain - 1]));
9213 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9214 }
9215 return true;
9216 };
9217
9218 switch (Opcode) {
9219 case X86::BLENDPDrmi:
9220 case X86::BLENDPDrri:
9221 case X86::VBLENDPDrmi:
9222 case X86::VBLENDPDrri:
9223 return SetBlendDomain(2, false);
9224 case X86::VBLENDPDYrmi:
9225 case X86::VBLENDPDYrri:
9226 return SetBlendDomain(4, true);
9227 case X86::BLENDPSrmi:
9228 case X86::BLENDPSrri:
9229 case X86::VBLENDPSrmi:
9230 case X86::VBLENDPSrri:
9231 case X86::VPBLENDDrmi:
9232 case X86::VPBLENDDrri:
9233 return SetBlendDomain(4, false);
9234 case X86::VBLENDPSYrmi:
9235 case X86::VBLENDPSYrri:
9236 case X86::VPBLENDDYrmi:
9237 case X86::VPBLENDDYrri:
9238 return SetBlendDomain(8, true);
9239 case X86::PBLENDWrmi:
9240 case X86::PBLENDWrri:
9241 case X86::VPBLENDWrmi:
9242 case X86::VPBLENDWrri:
9243 return SetBlendDomain(8, false);
9244 case X86::VPBLENDWYrmi:
9245 case X86::VPBLENDWYrri:
9246 return SetBlendDomain(16, true);
9247 case X86::VPANDDZ128rr:
9248 case X86::VPANDDZ128rm:
9249 case X86::VPANDDZ256rr:
9250 case X86::VPANDDZ256rm:
9251 case X86::VPANDQZ128rr:
9252 case X86::VPANDQZ128rm:
9253 case X86::VPANDQZ256rr:
9254 case X86::VPANDQZ256rm:
9255 case X86::VPANDNDZ128rr:
9256 case X86::VPANDNDZ128rm:
9257 case X86::VPANDNDZ256rr:
9258 case X86::VPANDNDZ256rm:
9259 case X86::VPANDNQZ128rr:
9260 case X86::VPANDNQZ128rm:
9261 case X86::VPANDNQZ256rr:
9262 case X86::VPANDNQZ256rm:
9263 case X86::VPORDZ128rr:
9264 case X86::VPORDZ128rm:
9265 case X86::VPORDZ256rr:
9266 case X86::VPORDZ256rm:
9267 case X86::VPORQZ128rr:
9268 case X86::VPORQZ128rm:
9269 case X86::VPORQZ256rr:
9270 case X86::VPORQZ256rm:
9271 case X86::VPXORDZ128rr:
9272 case X86::VPXORDZ128rm:
9273 case X86::VPXORDZ256rr:
9274 case X86::VPXORDZ256rm:
9275 case X86::VPXORQZ128rr:
9276 case X86::VPXORQZ128rm:
9277 case X86::VPXORQZ256rr:
9278 case X86::VPXORQZ256rm: {
9279 // Without DQI, convert EVEX instructions to VEX instructions.
9280 if (Subtarget.hasDQI())
9281 return false;
9282
9283 const uint16_t *table =
9284 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9285 assert(table && "Instruction not found in table?");
9286 // Don't change integer Q instructions to D instructions and
9287 // use D instructions if we started with a PS instruction.
9288 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9289 Domain = 4;
9290 MI.setDesc(get(table[Domain - 1]));
9291 return true;
9292 }
9293 case X86::UNPCKHPDrr:
9294 case X86::MOVHLPSrr:
9295 // We just need to commute the instruction which will switch the domains.
9296 if (Domain != dom && Domain != 3 &&
9297 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9298 MI.getOperand(0).getSubReg() == 0 &&
9299 MI.getOperand(1).getSubReg() == 0 &&
9300 MI.getOperand(2).getSubReg() == 0) {
9301 commuteInstruction(MI, false);
9302 return true;
9303 }
9304 // We must always return true for MOVHLPSrr.
9305 if (Opcode == X86::MOVHLPSrr)
9306 return true;
9307 break;
9308 case X86::SHUFPDrri: {
9309 if (Domain == 1) {
9310 unsigned Imm = MI.getOperand(3).getImm();
9311 unsigned NewImm = 0x44;
9312 if (Imm & 1)
9313 NewImm |= 0x0a;
9314 if (Imm & 2)
9315 NewImm |= 0xa0;
9316 MI.getOperand(3).setImm(NewImm);
9317 MI.setDesc(get(X86::SHUFPSrri));
9318 }
9319 return true;
9320 }
9321 }
9322 return false;
9323}
9324
9325std::pair<uint16_t, uint16_t>
9327 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9328 unsigned opcode = MI.getOpcode();
9329 uint16_t validDomains = 0;
9330 if (domain) {
9331 // Attempt to match for custom instructions.
9332 validDomains = getExecutionDomainCustom(MI);
9333 if (validDomains)
9334 return std::make_pair(domain, validDomains);
9335
9336 if (lookup(opcode, domain, ReplaceableInstrs)) {
9337 validDomains = 0xe;
9338 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9339 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9340 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9341 validDomains = 0x6;
9342 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9343 // Insert/extract instructions should only affect domain if AVX2
9344 // is enabled.
9345 if (!Subtarget.hasAVX2())
9346 return std::make_pair(0, 0);
9347 validDomains = 0xe;
9348 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9349 validDomains = 0xe;
9350 } else if (Subtarget.hasDQI() &&
9351 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9352 validDomains = 0xe;
9353 } else if (Subtarget.hasDQI()) {
9354 if (const uint16_t *table =
9355 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9356 if (domain == 1 || (domain == 3 && table[3] == opcode))
9357 validDomains = 0xa;
9358 else
9359 validDomains = 0xc;
9360 }
9361 }
9362 }
9363 return std::make_pair(domain, validDomains);
9364}
9365
9367 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9368 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9369 assert(dom && "Not an SSE instruction");
9370
9371 // Attempt to match for custom instructions.
9373 return;
9374
9375 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9376 if (!table) { // try the other table
9377 assert((Subtarget.hasAVX2() || Domain < 3) &&
9378 "256-bit vector operations only available in AVX2");
9379 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9380 }
9381 if (!table) { // try the FP table
9382 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9383 assert((!table || Domain < 3) &&
9384 "Can only select PackedSingle or PackedDouble");
9385 }
9386 if (!table) { // try the other table
9387 assert(Subtarget.hasAVX2() &&
9388 "256-bit insert/extract only available in AVX2");
9389 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9390 }
9391 if (!table) { // try the AVX512 table
9392 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9393 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9394 // Don't change integer Q instructions to D instructions.
9395 if (table && Domain == 3 && table[3] == MI.getOpcode())
9396 Domain = 4;
9397 }
9398 if (!table) { // try the AVX512DQ table
9399 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9400 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9401 // Don't change integer Q instructions to D instructions and
9402 // use D instructions if we started with a PS instruction.
9403 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9404 Domain = 4;
9405 }
9406 if (!table) { // try the AVX512DQMasked table
9407 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9408 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9409 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9410 Domain = 4;
9411 }
9412 assert(table && "Cannot change domain");
9413 MI.setDesc(get(table[Domain - 1]));
9414}
9415
9418 DebugLoc DL;
9419 BuildMI(MBB, MI, DL, get(X86::NOOP));
9420}
9421
9422/// Return the noop instruction to use for a noop.
9424 MCInst Nop;
9425 Nop.setOpcode(X86::NOOP);
9426 return Nop;
9427}
9428
9430 switch (opc) {
9431 default:
9432 return false;
9433 case X86::DIVPDrm:
9434 case X86::DIVPDrr:
9435 case X86::DIVPSrm:
9436 case X86::DIVPSrr:
9437 case X86::DIVSDrm:
9438 case X86::DIVSDrm_Int:
9439 case X86::DIVSDrr:
9440 case X86::DIVSDrr_Int:
9441 case X86::DIVSSrm:
9442 case X86::DIVSSrm_Int:
9443 case X86::DIVSSrr:
9444 case X86::DIVSSrr_Int:
9445 case X86::SQRTPDm:
9446 case X86::SQRTPDr:
9447 case X86::SQRTPSm:
9448 case X86::SQRTPSr:
9449 case X86::SQRTSDm:
9450 case X86::SQRTSDm_Int:
9451 case X86::SQRTSDr:
9452 case X86::SQRTSDr_Int:
9453 case X86::SQRTSSm:
9454 case X86::SQRTSSm_Int:
9455 case X86::SQRTSSr:
9456 case X86::SQRTSSr_Int:
9457 // AVX instructions with high latency
9458 case X86::VDIVPDrm:
9459 case X86::VDIVPDrr:
9460 case X86::VDIVPDYrm:
9461 case X86::VDIVPDYrr:
9462 case X86::VDIVPSrm:
9463 case X86::VDIVPSrr:
9464 case X86::VDIVPSYrm:
9465 case X86::VDIVPSYrr:
9466 case X86::VDIVSDrm:
9467 case X86::VDIVSDrm_Int:
9468 case X86::VDIVSDrr:
9469 case X86::VDIVSDrr_Int:
9470 case X86::VDIVSSrm:
9471 case X86::VDIVSSrm_Int:
9472 case X86::VDIVSSrr:
9473 case X86::VDIVSSrr_Int:
9474 case X86::VSQRTPDm:
9475 case X86::VSQRTPDr:
9476 case X86::VSQRTPDYm:
9477 case X86::VSQRTPDYr:
9478 case X86::VSQRTPSm:
9479 case X86::VSQRTPSr:
9480 case X86::VSQRTPSYm:
9481 case X86::VSQRTPSYr:
9482 case X86::VSQRTSDm:
9483 case X86::VSQRTSDm_Int:
9484 case X86::VSQRTSDr:
9485 case X86::VSQRTSDr_Int:
9486 case X86::VSQRTSSm:
9487 case X86::VSQRTSSm_Int:
9488 case X86::VSQRTSSr:
9489 case X86::VSQRTSSr_Int:
9490 // AVX512 instructions with high latency
9491 case X86::VDIVPDZ128rm:
9492 case X86::VDIVPDZ128rmb:
9493 case X86::VDIVPDZ128rmbk:
9494 case X86::VDIVPDZ128rmbkz:
9495 case X86::VDIVPDZ128rmk:
9496 case X86::VDIVPDZ128rmkz:
9497 case X86::VDIVPDZ128rr:
9498 case X86::VDIVPDZ128rrk:
9499 case X86::VDIVPDZ128rrkz:
9500 case X86::VDIVPDZ256rm:
9501 case X86::VDIVPDZ256rmb:
9502 case X86::VDIVPDZ256rmbk:
9503 case X86::VDIVPDZ256rmbkz:
9504 case X86::VDIVPDZ256rmk:
9505 case X86::VDIVPDZ256rmkz:
9506 case X86::VDIVPDZ256rr:
9507 case X86::VDIVPDZ256rrk:
9508 case X86::VDIVPDZ256rrkz:
9509 case X86::VDIVPDZrrb:
9510 case X86::VDIVPDZrrbk:
9511 case X86::VDIVPDZrrbkz:
9512 case X86::VDIVPDZrm:
9513 case X86::VDIVPDZrmb:
9514 case X86::VDIVPDZrmbk:
9515 case X86::VDIVPDZrmbkz:
9516 case X86::VDIVPDZrmk:
9517 case X86::VDIVPDZrmkz:
9518 case X86::VDIVPDZrr:
9519 case X86::VDIVPDZrrk:
9520 case X86::VDIVPDZrrkz:
9521 case X86::VDIVPSZ128rm:
9522 case X86::VDIVPSZ128rmb:
9523 case X86::VDIVPSZ128rmbk:
9524 case X86::VDIVPSZ128rmbkz:
9525 case X86::VDIVPSZ128rmk:
9526 case X86::VDIVPSZ128rmkz:
9527 case X86::VDIVPSZ128rr:
9528 case X86::VDIVPSZ128rrk:
9529 case X86::VDIVPSZ128rrkz:
9530 case X86::VDIVPSZ256rm:
9531 case X86::VDIVPSZ256rmb:
9532 case X86::VDIVPSZ256rmbk:
9533 case X86::VDIVPSZ256rmbkz:
9534 case X86::VDIVPSZ256rmk:
9535 case X86::VDIVPSZ256rmkz:
9536 case X86::VDIVPSZ256rr:
9537 case X86::VDIVPSZ256rrk:
9538 case X86::VDIVPSZ256rrkz:
9539 case X86::VDIVPSZrrb:
9540 case X86::VDIVPSZrrbk:
9541 case X86::VDIVPSZrrbkz:
9542 case X86::VDIVPSZrm:
9543 case X86::VDIVPSZrmb:
9544 case X86::VDIVPSZrmbk:
9545 case X86::VDIVPSZrmbkz:
9546 case X86::VDIVPSZrmk:
9547 case X86::VDIVPSZrmkz:
9548 case X86::VDIVPSZrr:
9549 case X86::VDIVPSZrrk:
9550 case X86::VDIVPSZrrkz:
9551 case X86::VDIVSDZrm:
9552 case X86::VDIVSDZrr:
9553 case X86::VDIVSDZrm_Int:
9554 case X86::VDIVSDZrmk_Int:
9555 case X86::VDIVSDZrmkz_Int:
9556 case X86::VDIVSDZrr_Int:
9557 case X86::VDIVSDZrrk_Int:
9558 case X86::VDIVSDZrrkz_Int:
9559 case X86::VDIVSDZrrb_Int:
9560 case X86::VDIVSDZrrbk_Int:
9561 case X86::VDIVSDZrrbkz_Int:
9562 case X86::VDIVSSZrm:
9563 case X86::VDIVSSZrr:
9564 case X86::VDIVSSZrm_Int:
9565 case X86::VDIVSSZrmk_Int:
9566 case X86::VDIVSSZrmkz_Int:
9567 case X86::VDIVSSZrr_Int:
9568 case X86::VDIVSSZrrk_Int:
9569 case X86::VDIVSSZrrkz_Int:
9570 case X86::VDIVSSZrrb_Int:
9571 case X86::VDIVSSZrrbk_Int:
9572 case X86::VDIVSSZrrbkz_Int:
9573 case X86::VSQRTPDZ128m:
9574 case X86::VSQRTPDZ128mb:
9575 case X86::VSQRTPDZ128mbk:
9576 case X86::VSQRTPDZ128mbkz:
9577 case X86::VSQRTPDZ128mk:
9578 case X86::VSQRTPDZ128mkz:
9579 case X86::VSQRTPDZ128r:
9580 case X86::VSQRTPDZ128rk:
9581 case X86::VSQRTPDZ128rkz:
9582 case X86::VSQRTPDZ256m:
9583 case X86::VSQRTPDZ256mb:
9584 case X86::VSQRTPDZ256mbk:
9585 case X86::VSQRTPDZ256mbkz:
9586 case X86::VSQRTPDZ256mk:
9587 case X86::VSQRTPDZ256mkz:
9588 case X86::VSQRTPDZ256r:
9589 case X86::VSQRTPDZ256rk:
9590 case X86::VSQRTPDZ256rkz:
9591 case X86::VSQRTPDZm:
9592 case X86::VSQRTPDZmb:
9593 case X86::VSQRTPDZmbk:
9594 case X86::VSQRTPDZmbkz:
9595 case X86::VSQRTPDZmk:
9596 case X86::VSQRTPDZmkz:
9597 case X86::VSQRTPDZr:
9598 case X86::VSQRTPDZrb:
9599 case X86::VSQRTPDZrbk:
9600 case X86::VSQRTPDZrbkz:
9601 case X86::VSQRTPDZrk:
9602 case X86::VSQRTPDZrkz:
9603 case X86::VSQRTPSZ128m:
9604 case X86::VSQRTPSZ128mb:
9605 case X86::VSQRTPSZ128mbk:
9606 case X86::VSQRTPSZ128mbkz:
9607 case X86::VSQRTPSZ128mk:
9608 case X86::VSQRTPSZ128mkz:
9609 case X86::VSQRTPSZ128r:
9610 case X86::VSQRTPSZ128rk:
9611 case X86::VSQRTPSZ128rkz:
9612 case X86::VSQRTPSZ256m:
9613 case X86::VSQRTPSZ256mb:
9614 case X86::VSQRTPSZ256mbk:
9615 case X86::VSQRTPSZ256mbkz:
9616 case X86::VSQRTPSZ256mk:
9617 case X86::VSQRTPSZ256mkz:
9618 case X86::VSQRTPSZ256r:
9619 case X86::VSQRTPSZ256rk:
9620 case X86::VSQRTPSZ256rkz:
9621 case X86::VSQRTPSZm:
9622 case X86::VSQRTPSZmb:
9623 case X86::VSQRTPSZmbk:
9624 case X86::VSQRTPSZmbkz:
9625 case X86::VSQRTPSZmk:
9626 case X86::VSQRTPSZmkz:
9627 case X86::VSQRTPSZr:
9628 case X86::VSQRTPSZrb:
9629 case X86::VSQRTPSZrbk:
9630 case X86::VSQRTPSZrbkz:
9631 case X86::VSQRTPSZrk:
9632 case X86::VSQRTPSZrkz:
9633 case X86::VSQRTSDZm:
9634 case X86::VSQRTSDZm_Int:
9635 case X86::VSQRTSDZmk_Int:
9636 case X86::VSQRTSDZmkz_Int:
9637 case X86::VSQRTSDZr:
9638 case X86::VSQRTSDZr_Int:
9639 case X86::VSQRTSDZrk_Int:
9640 case X86::VSQRTSDZrkz_Int:
9641 case X86::VSQRTSDZrb_Int:
9642 case X86::VSQRTSDZrbk_Int:
9643 case X86::VSQRTSDZrbkz_Int:
9644 case X86::VSQRTSSZm:
9645 case X86::VSQRTSSZm_Int:
9646 case X86::VSQRTSSZmk_Int:
9647 case X86::VSQRTSSZmkz_Int:
9648 case X86::VSQRTSSZr:
9649 case X86::VSQRTSSZr_Int:
9650 case X86::VSQRTSSZrk_Int:
9651 case X86::VSQRTSSZrkz_Int:
9652 case X86::VSQRTSSZrb_Int:
9653 case X86::VSQRTSSZrbk_Int:
9654 case X86::VSQRTSSZrbkz_Int:
9655
9656 case X86::VGATHERDPDYrm:
9657 case X86::VGATHERDPDZ128rm:
9658 case X86::VGATHERDPDZ256rm:
9659 case X86::VGATHERDPDZrm:
9660 case X86::VGATHERDPDrm:
9661 case X86::VGATHERDPSYrm:
9662 case X86::VGATHERDPSZ128rm:
9663 case X86::VGATHERDPSZ256rm:
9664 case X86::VGATHERDPSZrm:
9665 case X86::VGATHERDPSrm:
9666 case X86::VGATHERPF0DPDm:
9667 case X86::VGATHERPF0DPSm:
9668 case X86::VGATHERPF0QPDm:
9669 case X86::VGATHERPF0QPSm:
9670 case X86::VGATHERPF1DPDm:
9671 case X86::VGATHERPF1DPSm:
9672 case X86::VGATHERPF1QPDm:
9673 case X86::VGATHERPF1QPSm:
9674 case X86::VGATHERQPDYrm:
9675 case X86::VGATHERQPDZ128rm:
9676 case X86::VGATHERQPDZ256rm:
9677 case X86::VGATHERQPDZrm:
9678 case X86::VGATHERQPDrm:
9679 case X86::VGATHERQPSYrm:
9680 case X86::VGATHERQPSZ128rm:
9681 case X86::VGATHERQPSZ256rm:
9682 case X86::VGATHERQPSZrm:
9683 case X86::VGATHERQPSrm:
9684 case X86::VPGATHERDDYrm:
9685 case X86::VPGATHERDDZ128rm:
9686 case X86::VPGATHERDDZ256rm:
9687 case X86::VPGATHERDDZrm:
9688 case X86::VPGATHERDDrm:
9689 case X86::VPGATHERDQYrm:
9690 case X86::VPGATHERDQZ128rm:
9691 case X86::VPGATHERDQZ256rm:
9692 case X86::VPGATHERDQZrm:
9693 case X86::VPGATHERDQrm:
9694 case X86::VPGATHERQDYrm:
9695 case X86::VPGATHERQDZ128rm:
9696 case X86::VPGATHERQDZ256rm:
9697 case X86::VPGATHERQDZrm:
9698 case X86::VPGATHERQDrm:
9699 case X86::VPGATHERQQYrm:
9700 case X86::VPGATHERQQZ128rm:
9701 case X86::VPGATHERQQZ256rm:
9702 case X86::VPGATHERQQZrm:
9703 case X86::VPGATHERQQrm:
9704 case X86::VSCATTERDPDZ128mr:
9705 case X86::VSCATTERDPDZ256mr:
9706 case X86::VSCATTERDPDZmr:
9707 case X86::VSCATTERDPSZ128mr:
9708 case X86::VSCATTERDPSZ256mr:
9709 case X86::VSCATTERDPSZmr:
9710 case X86::VSCATTERPF0DPDm:
9711 case X86::VSCATTERPF0DPSm:
9712 case X86::VSCATTERPF0QPDm:
9713 case X86::VSCATTERPF0QPSm:
9714 case X86::VSCATTERPF1DPDm:
9715 case X86::VSCATTERPF1DPSm:
9716 case X86::VSCATTERPF1QPDm:
9717 case X86::VSCATTERPF1QPSm:
9718 case X86::VSCATTERQPDZ128mr:
9719 case X86::VSCATTERQPDZ256mr:
9720 case X86::VSCATTERQPDZmr:
9721 case X86::VSCATTERQPSZ128mr:
9722 case X86::VSCATTERQPSZ256mr:
9723 case X86::VSCATTERQPSZmr:
9724 case X86::VPSCATTERDDZ128mr:
9725 case X86::VPSCATTERDDZ256mr:
9726 case X86::VPSCATTERDDZmr:
9727 case X86::VPSCATTERDQZ128mr:
9728 case X86::VPSCATTERDQZ256mr:
9729 case X86::VPSCATTERDQZmr:
9730 case X86::VPSCATTERQDZ128mr:
9731 case X86::VPSCATTERQDZ256mr:
9732 case X86::VPSCATTERQDZmr:
9733 case X86::VPSCATTERQQZ128mr:
9734 case X86::VPSCATTERQQZ256mr:
9735 case X86::VPSCATTERQQZmr:
9736 return true;
9737 }
9738}
9739
9741 const MachineRegisterInfo *MRI,
9742 const MachineInstr &DefMI,
9743 unsigned DefIdx,
9744 const MachineInstr &UseMI,
9745 unsigned UseIdx) const {
9746 return isHighLatencyDef(DefMI.getOpcode());
9747}
9748
9750 const MachineBasicBlock *MBB) const {
9751 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9752 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9753
9754 // Integer binary math/logic instructions have a third source operand:
9755 // the EFLAGS register. That operand must be both defined here and never
9756 // used; i.e., it must be dead. If the EFLAGS operand is live, then we can
9757 // not change anything because rearranging the operands could affect other
9758 // instructions that depend on the exact status flags (zero, sign, etc.)
9759 // that are set by using these particular operands with this operation.
9760 const MachineOperand *FlagDef =
9761 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
9762 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9763 if (FlagDef && !FlagDef->isDead())
9764 return false;
9765
9767}
9768
9769// TODO: There are many more machine instruction opcodes to match:
9770// 1. Other data types (integer, vectors)
9771// 2. Other math / logic operations (xor, or)
9772// 3. Other forms of the same operation (intrinsics and other variants)
9774 bool Invert) const {
9775 if (Invert)
9776 return false;
9777 switch (Inst.getOpcode()) {
9778 CASE_ND(ADD8rr)
9779 CASE_ND(ADD16rr)
9780 CASE_ND(ADD32rr)
9781 CASE_ND(ADD64rr)
9782 CASE_ND(AND8rr)
9783 CASE_ND(AND16rr)
9784 CASE_ND(AND32rr)
9785 CASE_ND(AND64rr)
9786 CASE_ND(OR8rr)
9787 CASE_ND(OR16rr)
9788 CASE_ND(OR32rr)
9789 CASE_ND(OR64rr)
9790 CASE_ND(XOR8rr)
9791 CASE_ND(XOR16rr)
9792 CASE_ND(XOR32rr)
9793 CASE_ND(XOR64rr)
9794 CASE_ND(IMUL16rr)
9795 CASE_ND(IMUL32rr)
9796 CASE_ND(IMUL64rr)
9797 case X86::PANDrr:
9798 case X86::PORrr:
9799 case X86::PXORrr:
9800 case X86::ANDPDrr:
9801 case X86::ANDPSrr:
9802 case X86::ORPDrr:
9803 case X86::ORPSrr:
9804 case X86::XORPDrr:
9805 case X86::XORPSrr:
9806 case X86::PADDBrr:
9807 case X86::PADDWrr:
9808 case X86::PADDDrr:
9809 case X86::PADDQrr:
9810 case X86::PMULLWrr:
9811 case X86::PMULLDrr:
9812 case X86::PMAXSBrr:
9813 case X86::PMAXSDrr:
9814 case X86::PMAXSWrr:
9815 case X86::PMAXUBrr:
9816 case X86::PMAXUDrr:
9817 case X86::PMAXUWrr:
9818 case X86::PMINSBrr:
9819 case X86::PMINSDrr:
9820 case X86::PMINSWrr:
9821 case X86::PMINUBrr:
9822 case X86::PMINUDrr:
9823 case X86::PMINUWrr:
9824 case X86::VPANDrr:
9825 case X86::VPANDYrr:
9826 case X86::VPANDDZ128rr:
9827 case X86::VPANDDZ256rr:
9828 case X86::VPANDDZrr:
9829 case X86::VPANDQZ128rr:
9830 case X86::VPANDQZ256rr:
9831 case X86::VPANDQZrr:
9832 case X86::VPORrr:
9833 case X86::VPORYrr:
9834 case X86::VPORDZ128rr:
9835 case X86::VPORDZ256rr:
9836 case X86::VPORDZrr:
9837 case X86::VPORQZ128rr:
9838 case X86::VPORQZ256rr:
9839 case X86::VPORQZrr:
9840 case X86::VPXORrr:
9841 case X86::VPXORYrr:
9842 case X86::VPXORDZ128rr:
9843 case X86::VPXORDZ256rr:
9844 case X86::VPXORDZrr:
9845 case X86::VPXORQZ128rr:
9846 case X86::VPXORQZ256rr:
9847 case X86::VPXORQZrr:
9848 case X86::VANDPDrr:
9849 case X86::VANDPSrr:
9850 case X86::VANDPDYrr:
9851 case X86::VANDPSYrr:
9852 case X86::VANDPDZ128rr:
9853 case X86::VANDPSZ128rr:
9854 case X86::VANDPDZ256rr:
9855 case X86::VANDPSZ256rr:
9856 case X86::VANDPDZrr:
9857 case X86::VANDPSZrr:
9858 case X86::VORPDrr:
9859 case X86::VORPSrr:
9860 case X86::VORPDYrr:
9861 case X86::VORPSYrr:
9862 case X86::VORPDZ128rr:
9863 case X86::VORPSZ128rr:
9864 case X86::VORPDZ256rr:
9865 case X86::VORPSZ256rr:
9866 case X86::VORPDZrr:
9867 case X86::VORPSZrr:
9868 case X86::VXORPDrr:
9869 case X86::VXORPSrr:
9870 case X86::VXORPDYrr:
9871 case X86::VXORPSYrr:
9872 case X86::VXORPDZ128rr:
9873 case X86::VXORPSZ128rr:
9874 case X86::VXORPDZ256rr:
9875 case X86::VXORPSZ256rr:
9876 case X86::VXORPDZrr:
9877 case X86::VXORPSZrr:
9878 case X86::KADDBkk:
9879 case X86::KADDWkk:
9880 case X86::KADDDkk:
9881 case X86::KADDQkk:
9882 case X86::KANDBkk:
9883 case X86::KANDWkk:
9884 case X86::KANDDkk:
9885 case X86::KANDQkk:
9886 case X86::KORBkk:
9887 case X86::KORWkk:
9888 case X86::KORDkk:
9889 case X86::KORQkk:
9890 case X86::KXORBkk:
9891 case X86::KXORWkk:
9892 case X86::KXORDkk:
9893 case X86::KXORQkk:
9894 case X86::VPADDBrr:
9895 case X86::VPADDWrr:
9896 case X86::VPADDDrr:
9897 case X86::VPADDQrr:
9898 case X86::VPADDBYrr:
9899 case X86::VPADDWYrr:
9900 case X86::VPADDDYrr:
9901 case X86::VPADDQYrr:
9902 case X86::VPADDBZ128rr:
9903 case X86::VPADDWZ128rr:
9904 case X86::VPADDDZ128rr:
9905 case X86::VPADDQZ128rr:
9906 case X86::VPADDBZ256rr:
9907 case X86::VPADDWZ256rr:
9908 case X86::VPADDDZ256rr:
9909 case X86::VPADDQZ256rr:
9910 case X86::VPADDBZrr:
9911 case X86::VPADDWZrr:
9912 case X86::VPADDDZrr:
9913 case X86::VPADDQZrr:
9914 case X86::VPMULLWrr:
9915 case X86::VPMULLWYrr:
9916 case X86::VPMULLWZ128rr:
9917 case X86::VPMULLWZ256rr:
9918 case X86::VPMULLWZrr:
9919 case X86::VPMULLDrr:
9920 case X86::VPMULLDYrr:
9921 case X86::VPMULLDZ128rr:
9922 case X86::VPMULLDZ256rr:
9923 case X86::VPMULLDZrr:
9924 case X86::VPMULLQZ128rr:
9925 case X86::VPMULLQZ256rr:
9926 case X86::VPMULLQZrr:
9927 case X86::VPMAXSBrr:
9928 case X86::VPMAXSBYrr:
9929 case X86::VPMAXSBZ128rr:
9930 case X86::VPMAXSBZ256rr:
9931 case X86::VPMAXSBZrr:
9932 case X86::VPMAXSDrr:
9933 case X86::VPMAXSDYrr:
9934 case X86::VPMAXSDZ128rr:
9935 case X86::VPMAXSDZ256rr:
9936 case X86::VPMAXSDZrr:
9937 case X86::VPMAXSQZ128rr:
9938 case X86::VPMAXSQZ256rr:
9939 case X86::VPMAXSQZrr:
9940 case X86::VPMAXSWrr:
9941 case X86::VPMAXSWYrr:
9942 case X86::VPMAXSWZ128rr:
9943 case X86::VPMAXSWZ256rr:
9944 case X86::VPMAXSWZrr:
9945 case X86::VPMAXUBrr:
9946 case X86::VPMAXUBYrr:
9947 case X86::VPMAXUBZ128rr:
9948 case X86::VPMAXUBZ256rr:
9949 case X86::VPMAXUBZrr:
9950 case X86::VPMAXUDrr:
9951 case X86::VPMAXUDYrr:
9952 case X86::VPMAXUDZ128rr:
9953 case X86::VPMAXUDZ256rr:
9954 case X86::VPMAXUDZrr:
9955 case X86::VPMAXUQZ128rr:
9956 case X86::VPMAXUQZ256rr:
9957 case X86::VPMAXUQZrr:
9958 case X86::VPMAXUWrr:
9959 case X86::VPMAXUWYrr:
9960 case X86::VPMAXUWZ128rr:
9961 case X86::VPMAXUWZ256rr:
9962 case X86::VPMAXUWZrr:
9963 case X86::VPMINSBrr:
9964 case X86::VPMINSBYrr:
9965 case X86::VPMINSBZ128rr:
9966 case X86::VPMINSBZ256rr:
9967 case X86::VPMINSBZrr:
9968 case X86::VPMINSDrr:
9969 case X86::VPMINSDYrr:
9970 case X86::VPMINSDZ128rr:
9971 case X86::VPMINSDZ256rr:
9972 case X86::VPMINSDZrr:
9973 case X86::VPMINSQZ128rr:
9974 case X86::VPMINSQZ256rr:
9975 case X86::VPMINSQZrr:
9976 case X86::VPMINSWrr:
9977 case X86::VPMINSWYrr:
9978 case X86::VPMINSWZ128rr:
9979 case X86::VPMINSWZ256rr:
9980 case X86::VPMINSWZrr:
9981 case X86::VPMINUBrr:
9982 case X86::VPMINUBYrr:
9983 case X86::VPMINUBZ128rr:
9984 case X86::VPMINUBZ256rr:
9985 case X86::VPMINUBZrr:
9986 case X86::VPMINUDrr:
9987 case X86::VPMINUDYrr:
9988 case X86::VPMINUDZ128rr:
9989 case X86::VPMINUDZ256rr:
9990 case X86::VPMINUDZrr:
9991 case X86::VPMINUQZ128rr:
9992 case X86::VPMINUQZ256rr:
9993 case X86::VPMINUQZrr:
9994 case X86::VPMINUWrr:
9995 case X86::VPMINUWYrr:
9996 case X86::VPMINUWZ128rr:
9997 case X86::VPMINUWZ256rr:
9998 case X86::VPMINUWZrr:
9999 // Normal min/max instructions are not commutative because of NaN and signed
10000 // zero semantics, but these are. Thus, there's no need to check for global
10001 // relaxed math; the instructions themselves have the properties we need.
10002 case X86::MAXCPDrr:
10003 case X86::MAXCPSrr:
10004 case X86::MAXCSDrr:
10005 case X86::MAXCSSrr:
10006 case X86::MINCPDrr:
10007 case X86::MINCPSrr:
10008 case X86::MINCSDrr:
10009 case X86::MINCSSrr:
10010 case X86::VMAXCPDrr:
10011 case X86::VMAXCPSrr:
10012 case X86::VMAXCPDYrr:
10013 case X86::VMAXCPSYrr:
10014 case X86::VMAXCPDZ128rr:
10015 case X86::VMAXCPSZ128rr:
10016 case X86::VMAXCPDZ256rr:
10017 case X86::VMAXCPSZ256rr:
10018 case X86::VMAXCPDZrr:
10019 case X86::VMAXCPSZrr:
10020 case X86::VMAXCSDrr:
10021 case X86::VMAXCSSrr:
10022 case X86::VMAXCSDZrr:
10023 case X86::VMAXCSSZrr:
10024 case X86::VMINCPDrr:
10025 case X86::VMINCPSrr:
10026 case X86::VMINCPDYrr:
10027 case X86::VMINCPSYrr:
10028 case X86::VMINCPDZ128rr:
10029 case X86::VMINCPSZ128rr:
10030 case X86::VMINCPDZ256rr:
10031 case X86::VMINCPSZ256rr:
10032 case X86::VMINCPDZrr:
10033 case X86::VMINCPSZrr:
10034 case X86::VMINCSDrr:
10035 case X86::VMINCSSrr:
10036 case X86::VMINCSDZrr:
10037 case X86::VMINCSSZrr:
10038 case X86::VMAXCPHZ128rr:
10039 case X86::VMAXCPHZ256rr:
10040 case X86::VMAXCPHZrr:
10041 case X86::VMAXCSHZrr:
10042 case X86::VMINCPHZ128rr:
10043 case X86::VMINCPHZ256rr:
10044 case X86::VMINCPHZrr:
10045 case X86::VMINCSHZrr:
10046 return true;
10047 case X86::ADDPDrr:
10048 case X86::ADDPSrr:
10049 case X86::ADDSDrr:
10050 case X86::ADDSSrr:
10051 case X86::MULPDrr:
10052 case X86::MULPSrr:
10053 case X86::MULSDrr:
10054 case X86::MULSSrr:
10055 case X86::VADDPDrr:
10056 case X86::VADDPSrr:
10057 case X86::VADDPDYrr:
10058 case X86::VADDPSYrr:
10059 case X86::VADDPDZ128rr:
10060 case X86::VADDPSZ128rr:
10061 case X86::VADDPDZ256rr:
10062 case X86::VADDPSZ256rr:
10063 case X86::VADDPDZrr:
10064 case X86::VADDPSZrr:
10065 case X86::VADDSDrr:
10066 case X86::VADDSSrr:
10067 case X86::VADDSDZrr:
10068 case X86::VADDSSZrr:
10069 case X86::VMULPDrr:
10070 case X86::VMULPSrr:
10071 case X86::VMULPDYrr:
10072 case X86::VMULPSYrr:
10073 case X86::VMULPDZ128rr:
10074 case X86::VMULPSZ128rr:
10075 case X86::VMULPDZ256rr:
10076 case X86::VMULPSZ256rr:
10077 case X86::VMULPDZrr:
10078 case X86::VMULPSZrr:
10079 case X86::VMULSDrr:
10080 case X86::VMULSSrr:
10081 case X86::VMULSDZrr:
10082 case X86::VMULSSZrr:
10083 case X86::VADDPHZ128rr:
10084 case X86::VADDPHZ256rr:
10085 case X86::VADDPHZrr:
10086 case X86::VADDSHZrr:
10087 case X86::VMULPHZ128rr:
10088 case X86::VMULPHZ256rr:
10089 case X86::VMULPHZrr:
10090 case X86::VMULSHZrr:
10093 default:
10094 return false;
10095 }
10096}
10097
10098/// If \p DescribedReg overlaps with the MOVrr instruction's destination
10099/// register then, if possible, describe the value in terms of the source
10100/// register.
///
/// \param MI a register-to-register move; operand 0 is read as the destination
/// and operand 1 as the source.
/// \param DescribedReg the register whose value should be described.
/// \param TRI register info used for the sub-/super-register queries.
/// \returns the source register (or its corresponding sub-register) together
/// with an empty DIExpression, or std::nullopt if the value cannot be
/// described.
10101static std::optional<ParamLoadedValue>
10103 const TargetRegisterInfo *TRI) {
10104 Register DestReg = MI.getOperand(0).getReg();
10105 Register SrcReg = MI.getOperand(1).getReg();
10106
// Every successful result below uses this empty expression: the described
// value is the returned register itself.
10107 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10108
10109 // If the described register is the destination, just return the source.
10110 if (DestReg == DescribedReg)
10111 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10112
10113 // If the described register is a sub-register of the destination register,
10114 // then pick out the source register's corresponding sub-register.
10115 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
10116 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
10117 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10118 }
10119
10120 // The remaining case to consider is when the described register is a
10121 // super-register of the destination register. MOV8rr and MOV16rr do not
10122 // write to any of the other bytes in the register, meaning that we'd have to
10123 // describe the value using a combination of the source register and the
10124 // non-overlapping bits in the described register, which is not currently
10125 // possible.
10126 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
10127 !TRI->isSuperRegister(DestReg, DescribedReg))
10128 return std::nullopt;
10129
// MOV8rr and MOV16rr were rejected above; the assert records that MOV32rr is
// the only opcode expected to reach the super-register case.
10130 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
10131 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10132}
10133
/// Describe, when possible, the value that \p MI places in \p Reg as an
/// operand plus a DIExpression. Handles the LEA, MOV-immediate, MOV
/// register-to-register, XOR32rr and MOVSX64rr32 opcodes listed in the switch
/// below; returns std::nullopt for the situations each case rejects.
// NOTE(review): the signature line and the declarations of TRI and Ops are not
// visible in this listing.
10134std::optional<ParamLoadedValue>
10136 const MachineOperand *Op = nullptr;
10137 DIExpression *Expr = nullptr;
10138
10140
10141 switch (MI.getOpcode()) {
10142 case X86::LEA32r:
10143 case X86::LEA64r:
10144 case X86::LEA64_32r: {
10145 // We may need to describe a 64-bit parameter with a 32-bit LEA.
10146 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10147 return std::nullopt;
10148
10149 // Operand 4 could be a global address. For now we do not support
10150 // such a situation. Operand 2 must likewise be an immediate.
10151 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
10152 return std::nullopt;
10153
10154 const MachineOperand &Op1 = MI.getOperand(1);
10155 const MachineOperand &Op2 = MI.getOperand(3);
10156 assert(Op2.isReg() &&
10157 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
10158
10159 // Omit situations like:
10160 // %rsi = lea %rsi, 4, ...
// Also bail out when either address register merely overlaps the
// destination.
10161 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
10162 Op2.getReg() == MI.getOperand(0).getReg())
10163 return std::nullopt;
10164 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
10165 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
10166 (Op2.getReg() != X86::NoRegister &&
10167 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
10168 return std::nullopt;
10169
// Both operands were verified to be immediates above.
10170 int64_t Coef = MI.getOperand(2).getImm();
10171 int64_t Offset = MI.getOperand(4).getImm();
10173
// Prefer operand 1 (a real register or a frame index) as the operand the
// expression is built on.
10174 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
10175 Op = &Op1;
10176 } else if (Op1.isFI())
10177 Op = &Op1;
10178
// Operand 1 and operand 3 are the same register: Reg + Reg * Coef is
// emitted as Reg * (Coef + 1).
10179 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
10180 Ops.push_back(dwarf::DW_OP_constu);
10181 Ops.push_back(Coef + 1);
10182 Ops.push_back(dwarf::DW_OP_mul);
10183 } else {
10184 if (Op && Op2.getReg() != X86::NoRegister) {
// Operand 3 is pushed as a DWARF register-relative value with offset 0.
// Register numbers below 32 use the DW_OP_breg0 + N encoding; larger
// ones use DW_OP_bregx with the number as an explicit operand.
10185 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
10186 if (dwarfReg < 0)
10187 return std::nullopt;
10188 else if (dwarfReg < 32) {
10189 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
10190 Ops.push_back(0);
10191 } else {
10192 Ops.push_back(dwarf::DW_OP_bregx);
10193 Ops.push_back(dwarfReg);
10194 Ops.push_back(0);
10195 }
10196 } else if (!Op) {
// Operand 1 is unusable, so operand 3 becomes the described operand.
10197 assert(Op2.getReg() != X86::NoRegister);
10198 Op = &Op2;
10199 }
10200
// Scale the operand 3 value by the coefficient when it is not 1.
10201 if (Coef > 1) {
10202 assert(Op2.getReg() != X86::NoRegister);
10203 Ops.push_back(dwarf::DW_OP_constu);
10204 Ops.push_back(Coef);
10205 Ops.push_back(dwarf::DW_OP_mul);
10206 }
10207
// Operand 1 and operand 3 are both present: add the two values.
10208 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
10209 Op2.getReg() != X86::NoRegister) {
10210 Ops.push_back(dwarf::DW_OP_plus);
10211 }
10212 }
10213
// NOTE(review): Offset is not consumed by any statement visible in this
// listing; presumably the line missing just above applies it to Ops.
10215 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10216
10217 return ParamLoadedValue(*Op, Expr);
10218 }
10219 case X86::MOV8ri:
10220 case X86::MOV16ri:
10221 // TODO: Handle MOV8ri and MOV16ri.
10222 return std::nullopt;
10223 case X86::MOV32ri:
10224 case X86::MOV64ri:
10225 case X86::MOV64ri32:
10226 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10227 // 64-bit parameters, so we need to consider super-registers.
10228 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10229 return std::nullopt;
// Expr is still null here; the immediate operand is returned as-is.
10230 return ParamLoadedValue(MI.getOperand(1), Expr);
10231 case X86::MOV8rr:
10232 case X86::MOV16rr:
10233 case X86::MOV32rr:
10234 case X86::MOV64rr:
10235 return describeMOVrrLoadedValue(MI, Reg, TRI);
10236 case X86::XOR32rr: {
10237 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10238 // super-registers.
10239 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10240 return std::nullopt;
// NOTE(review): the statement controlled by this condition (both source
// operands being the same register) is not visible in this listing.
10241 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10243 return std::nullopt;
10244 }
10245 case X86::MOVSX64rr32: {
10246 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10247 // cases like this:
10248 //
10249 // $ebx = [...]
10250 // $rdi = MOVSX64rr32 $ebx
10251 // $esi = MOV32rr $edi
10252 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10253 return std::nullopt;
10254
10255 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10256
10257 // If the described register is the destination register we need to
10258 // sign-extend the source register from 32 bits. The other case we handle
10259 // is when the described register is the 32-bit sub-register of the
10260 // destination register, in which case we just need to return the source
10261 // register.
10262 if (Reg == MI.getOperand(0).getReg())
10263 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10264 else
10265 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10266 "Unhandled sub-register case for MOVSX64rr32");
10267
10268 return ParamLoadedValue(MI.getOperand(1), Expr);
10269 }
10270 default:
// NOTE(review): the statement following this assert is not visible in this
// listing.
10271 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
10273 }
10274}
10275
10276/// This is an architecture-specific helper function of reassociateOps.
10277/// Set special operand attributes for new instructions after reassociation.
///
/// When the two original instructions define EFLAGS, the EFLAGS definitions of
/// both new instructions are marked dead; otherwise nothing is changed.
10279 MachineInstr &OldMI2,
10280 MachineInstr &NewMI1,
10281 MachineInstr &NewMI2) const {
10282 // Integer instructions may define an implicit EFLAGS dest register operand.
10283 MachineOperand *OldFlagDef1 =
10284 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10285 MachineOperand *OldFlagDef2 =
10286 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10287
// Either both original instructions define EFLAGS or neither does.
10288 assert(!OldFlagDef1 == !OldFlagDef2 &&
10289 "Unexpected instruction type for reassociation");
10290
// No EFLAGS definition means there is no attribute to transfer.
10291 if (!OldFlagDef1 || !OldFlagDef2)
10292 return;
10293
10294 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10295 "Must have dead EFLAGS operand in reassociable instruction");
10296
10297 MachineOperand *NewFlagDef1 =
10298 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10299 MachineOperand *NewFlagDef2 =
10300 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10301
10302 assert(NewFlagDef1 && NewFlagDef2 &&
10303 "Unexpected operand in reassociable instruction");
10304
10305 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10306 // of this pass or other passes. The EFLAGS operands must be dead in these new
10307 // instructions because the EFLAGS operands in the original instructions must
10308 // be dead in order for reassociation to occur.
10309 NewFlagDef1->setIsDead();
10310 NewFlagDef2->setIsDead();
10311}
10312
/// Returns the flag value TF unchanged as the first element of the pair, with
/// 0 as the second element.
10313std::pair<unsigned, unsigned>
10315 return std::make_pair(TF, 0u);
10316}
10317
// Returns a view of a function-local static table that pairs each X86II
// machine-operand target flag with a textual name.
// NOTE(review): presumably these are the names used when the flags are
// serialized; confirm against the caller.
10320 using namespace X86II;
10321 static const std::pair<unsigned, const char *> TargetFlags[] = {
10322 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10323 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10324 {MO_GOT, "x86-got"},
10325 {MO_GOTOFF, "x86-gotoff"},
10326 {MO_GOTPCREL, "x86-gotpcrel"},
10327 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10328 {MO_PLT, "x86-plt"},
10329 {MO_TLSGD, "x86-tlsgd"},
10330 {MO_TLSLD, "x86-tlsld"},
10331 {MO_TLSLDM, "x86-tlsldm"},
10332 {MO_GOTTPOFF, "x86-gottpoff"},
10333 {MO_INDNTPOFF, "x86-indntpoff"},
10334 {MO_TPOFF, "x86-tpoff"},
10335 {MO_DTPOFF, "x86-dtpoff"},
10336 {MO_NTPOFF, "x86-ntpoff"},
10337 {MO_GOTNTPOFF, "x86-gotntpoff"},
10338 {MO_DLLIMPORT, "x86-dllimport"},
10339 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10340 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10341 {MO_TLVP, "x86-tlvp"},
10342 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10343 {MO_SECREL, "x86-secrel"},
10344 {MO_COFFSTUB, "x86-coffstub"}};
10345 return ArrayRef(TargetFlags);
10346}
10347
10348namespace {
10349/// Create Global Base Reg pass. This initializes the PIC
10350/// global base register for x86-32.
///
/// runOnMachineFunction below contains both a 64-bit and a 32-bit code path;
/// in every case the initializing code is inserted into the first basic block
/// of the function.
10351struct CGBR : public MachineFunctionPass {
10352 static char ID;
10353 CGBR() : MachineFunctionPass(ID) {}
10354
// Returns true when initialization code was inserted, and false when the
// target is not position independent or no global base register was
// requested for this function.
// NOTE(review): the declarations of X86FI, MBBI and RegInfo are not visible
// in this listing.
10355 bool runOnMachineFunction(MachineFunction &MF) override {
10356 const X86TargetMachine *TM =
10357 static_cast<const X86TargetMachine *>(&MF.getTarget());
10358 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
10359
10360 // Only emit a global base reg in PIC mode.
10361 if (!TM->isPositionIndependent())
10362 return false;
10363
10365 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
10366
10367 // If we didn't need a GlobalBaseReg, don't insert code.
10368 if (GlobalBaseReg == 0)
10369 return false;
10370
10371 // Insert the set of GlobalBaseReg into the first MBB of the function.
10372 MachineBasicBlock &FirstMBB = MF.front();
10374 DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
10376 const X86InstrInfo *TII = STI.getInstrInfo();
10377
// With GOT-style PIC the first instruction writes a fresh 32-bit virtual
// register, and GlobalBaseReg is derived from it by the ADD32ri further
// down; otherwise the result is written directly into GlobalBaseReg.
10378 Register PC;
10379 if (STI.isPICStyleGOT())
10380 PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
10381 else
10382 PC = GlobalBaseReg;
10383
10384 if (STI.is64Bit()) {
10385 if (TM->getCodeModel() == CodeModel::Large) {
10386 // In the large code model, we are aiming for this code, though the
10387 // register allocation may vary:
10388 // leaq .LN$pb(%rip), %rax
10389 // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
10390 // addq %rcx, %rax
10391 // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
10392 Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10393 Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10394 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
10395 .addReg(X86::RIP)
10396 .addImm(0)
10397 .addReg(0)
10399 .addReg(0);
// Attach the PIC base symbol to the LEA that was just inserted.
10400 std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
10401 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
10402 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10404 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
10405 .addReg(PBReg, RegState::Kill)
10406 .addReg(GOTReg, RegState::Kill);
10407 } else {
10408 // In other code models, use a RIP-relative LEA to materialize the
10409 // GOT.
10410 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
10411 .addReg(X86::RIP)
10412 .addImm(0)
10413 .addReg(0)
10414 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
10415 .addReg(0);
10416 }
10417 } else {
10418 // Operand of MovePCtoStack is completely ignored by asm printer. It's
10419 // only used in JIT code emission as displacement to pc.
10420 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
10421
10422 // If we're using vanilla 'GOT' PIC style, we should use relative
10423 // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
10424 if (STI.isPICStyleGOT()) {
10425 // Generate addl $_GLOBAL_OFFSET_TABLE_ + [.-piclabel],
10426 // %some_register
10427 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
10428 .addReg(PC)
10429 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10431 }
10432 }
10433
10434 return true;
10435 }
10436
10437 StringRef getPassName() const override {
10438 return "X86 PIC Global Base Reg Initialization";
10439 }
10440
// The pass only inserts instructions into an existing block, so it declares
// the CFG preserved.
10441 void getAnalysisUsage(AnalysisUsage &AU) const override {
10442 AU.setPreservesCFG();
10444 }
10445};
10446} // namespace
10447
10448char CGBR::ID = 0;
10450
10451namespace {
/// Pass that removes redundant TLS_base_addr32 / TLS_base_addr64 instructions.
/// Walking the dominator tree in pre-order, the first such instruction met on
/// a path has its result copied from RAX/EAX into a new virtual register; each
/// later one reached with that register available is replaced by a copy from
/// the virtual register back into RAX/EAX.
// NOTE(review): a few local declarations (MFI, DT, Copy, RegInfo) are not
// visible in this listing.
10452struct LDTLSCleanup : public MachineFunctionPass {
10453 static char ID;
10454 LDTLSCleanup() : MachineFunctionPass(ID) {}
10455
// Returns true if any instruction was inserted or replaced.
10456 bool runOnMachineFunction(MachineFunction &MF) override {
10457 if (skipFunction(MF.getFunction()))
10458 return false;
10459
10461 if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
10462 // No point folding accesses if there aren't at least two.
10463 return false;
10464 }
10465
10467 &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
// Start at the dominator-tree root with no base-address register yet.
10468 return VisitNode(DT->getRootNode(), Register());
10469 }
10470
10471 // Visit the dominator subtree rooted at Node in pre-order.
10472 // If TLSBaseAddrReg is non-null, then use that to replace any
10473 // TLS_base_addr instructions. Otherwise, create the register
10474 // when the first such instruction is seen, and then use it
10475 // as we encounter more instructions.
//
// TLSBaseAddrReg is passed by value, so a register created while scanning
// this block is seen by the rest of this block and by its dominator-tree
// children, but not by the caller.
10476 bool VisitNode(MachineDomTreeNode *Node, Register TLSBaseAddrReg) {
10477 MachineBasicBlock *BB = Node->getBlock();
10478 bool Changed = false;
10479
10480 // Traverse the current block.
10481 for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
10482 ++I) {
10483 switch (I->getOpcode()) {
10484 case X86::TLS_base_addr32:
10485 case X86::TLS_base_addr64:
// Both helpers return the instruction they inserted; iteration resumes
// from it.
10486 if (TLSBaseAddrReg)
10487 I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
10488 else
10489 I = SetRegister(*I, &TLSBaseAddrReg);
10490 Changed = true;
10491 break;
10492 default:
10493 break;
10494 }
10495 }
10496
10497 // Visit the children of this block in the dominator tree.
10498 for (auto &I : *Node) {
10499 Changed |= VisitNode(I, TLSBaseAddrReg);
10500 }
10501
10502 return Changed;
10503 }
10504
10505 // Replace the TLS_base_addr instruction I with a copy from
10506 // TLSBaseAddrReg, returning the new instruction.
10507 MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
10508 Register TLSBaseAddrReg) {
10509 MachineFunction *MF = I.getParent()->getParent();
10510 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10511 const bool is64Bit = STI.is64Bit();
10512 const X86InstrInfo *TII = STI.getInstrInfo();
10513
10514 // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
10516 BuildMI(*I.getParent(), I, I.getDebugLoc(),
10517 TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
10518 .addReg(TLSBaseAddrReg);
10519
10520 // Erase the TLS_base_addr instruction.
10521 I.eraseFromParent();
10522
10523 return Copy;
10524 }
10525
10526 // Create a virtual register in *TLSBaseAddrReg, and populate it by
10527 // inserting a copy instruction after I. Returns the new instruction.
10528 MachineInstr *SetRegister(MachineInstr &I, Register *TLSBaseAddrReg) {
10529 MachineFunction *MF = I.getParent()->getParent();
10530 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10531 const bool is64Bit = STI.is64Bit();
10532 const X86InstrInfo *TII = STI.getInstrInfo();
10533
10534 // Create a virtual register for the TLS base address.
10536 *TLSBaseAddrReg = RegInfo.createVirtualRegister(
10537 is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass);
10538
10539 // Insert a copy from RAX/EAX to TLSBaseAddrReg.
// The copy is placed before the instruction that follows I, i.e.
// immediately after I.
10540 MachineInstr *Next = I.getNextNode();
10541 MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(),
10542 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
10543 .addReg(is64Bit ? X86::RAX : X86::EAX);
10544
10545 return Copy;
10546 }
10547
10548 StringRef getPassName() const override {
10549 return "Local Dynamic TLS Access Clean-up";
10550 }
10551
10552 void getAnalysisUsage(AnalysisUsage &AU) const override {
10553 AU.setPreservesCFG();
10556 }
10557};
10558} // namespace
10559
10560char LDTLSCleanup::ID = 0;
10562 return new LDTLSCleanup();
10563}
10564
10565/// Constants defining how certain sequences should be outlined.
10566///
10567/// \p MachineOutlinerDefault implies that the function is called with a call
10568/// instruction, and a return must be emitted for the outlined function frame.
10569///
10570/// That is,
10571///
10572/// I1 OUTLINED_FUNCTION:
10573/// I2 --> call OUTLINED_FUNCTION I1
10574/// I3 I2
10575/// I3
10576/// ret
10577///
10578/// * Call construction overhead: 1 (call instruction)
10579/// * Frame construction overhead: 1 (return instruction)
10580///
10581/// \p MachineOutlinerTailCall implies that the function is being tail called.
10582/// A jump is emitted instead of a call, and the return is already present in
10583/// the outlined sequence. That is,
10584///
10585/// I1 OUTLINED_FUNCTION:
10586/// I2 --> jmp OUTLINED_FUNCTION I1
10587/// ret I2
10588/// ret
10589///
10590/// * Call construction overhead: 1 (jump instruction)
10591/// * Frame construction overhead: 0 (don't need to return)
10592///
10594
/// Decide how the repeated sequences in \p RepeatedSequenceLocs would be
/// outlined and build the matching OutlinedFunction.
///
/// Sequences whose last instruction is a terminator are outlined as tail
/// calls (frame overhead 0); all others use the default call/return form
/// (frame overhead 1). Every candidate is given a call overhead of 1.
///
/// \returns std::nullopt when the sequence contains CFI instructions but not
/// all of a candidate function's CFI instructions, or when it contains CFI
/// instructions and is not a tail-call sequence.
// NOTE(review): MinRepeats and MMI are not used by any line visible in this
// listing.
10595std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10597 const MachineModuleInfo &MMI,
10598 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10599 unsigned MinRepeats) const {
10600 unsigned SequenceSize = 0;
10601 for (auto &MI : RepeatedSequenceLocs[0]) {
10602 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10603 // we can't tell the cost. Just assume each instruction
10604 // is one byte.
// Debug and KILL instructions are not counted.
10605 if (MI.isDebugInstr() || MI.isKill())
10606 continue;
10607 SequenceSize += 1;
10608 }
10609
10610 // We check to see if CFI Instructions are present, and if they are
10611 // we find the number of CFI Instructions in the candidates.
10612 unsigned CFICount = 0;
10613 for (auto &I : RepeatedSequenceLocs[0]) {
10614 if (I.isCFIInstruction())
10615 CFICount++;
10616 }
10617
10618 // We compare the number of found CFI Instructions to the number of CFI
10619 // instructions in the parent function for each candidate. We must check this
10620 // since if we outline one of the CFI instructions in a function, we have to
10621 // outline them all for correctness. If we do not, the address offsets will be
10622 // incorrect between the two sections of the program.
10623 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10624 std::vector<MCCFIInstruction> CFIInstructions =
10625 C.getMF()->getFrameInstructions();
10626
10627 if (CFICount > 0 && CFICount != CFIInstructions.size())
10628 return std::nullopt;
10629 }
10630
10631 // FIXME: Use real size in bytes for call and ret instructions.
10632 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10633 for (outliner::Candidate &C : RepeatedSequenceLocs)
10634 C.setCallInfo(MachineOutlinerTailCall, 1);
10635
10636 return std::make_unique<outliner::OutlinedFunction>(
10637 RepeatedSequenceLocs, SequenceSize,
10638 0, // Number of bytes to emit frame.
10639 MachineOutlinerTailCall // Type of frame.
10640 );
10641 }
10642
// CFI instructions are only accepted in tail-call sequences.
10643 if (CFICount > 0)
10644 return std::nullopt;
10645
10646 for (outliner::Candidate &C : RepeatedSequenceLocs)
10647 C.setCallInfo(MachineOutlinerDefault, 1);
10648
10649 return std::make_unique<outliner::OutlinedFunction>(
10650 RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault);
10651}
10652
10654 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
// Returns true when MF may be outlined from. MF is rejected if it may use a
// red zone, or if it has linkonce_odr linkage and OutlineFromLinkOnceODRs is
// false.
// NOTE(review): the declaration of X86FI is not visible in this listing.
10655 const Function &F = MF.getFunction();
10656
10657 // Does the function use a red zone? If it does, then we can't risk messing
10658 // with the stack.
10659 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10660 // It could have a red zone. If it does, then we don't want to touch it.
// A missing X86FI is treated the same as a used red zone.
10662 if (!X86FI || X86FI->getUsesRedZone())
10663 return false;
10664 }
10665
10666 // If we *don't* want to outline from things that could potentially be deduped
10667 // then return false.
10668 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10669 return false;
10670
10671 // This function is viable for outlining, so return true.
10672 return true;
10673}
10674
10678 unsigned Flags) const {
// Classifies the instruction at MIT for outlining by testing, in order:
// terminators, stack-pointer users, instruction-pointer users and CFI
// instructions.
// NOTE(review): the statement guarded by each condition below, and the final
// fall-through result, are not visible in this listing.
10679 MachineInstr &MI = *MIT;
10680
10681 // Is this a terminator for a basic block?
10682 if (MI.isTerminator())
10683 // TargetInstrInfo::getOutliningType has already filtered out anything
10684 // that would break this, so we can allow it here.
10686
10687 // Don't outline anything that modifies or reads from the stack pointer.
10688 //
10689 // FIXME: There are instructions which are being manually built without
10690 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10691 // able to remove the extra checks once those are fixed up. For example,
10692 // sometimes we might get something like %rax = POP64r 1. This won't be
10693 // caught by modifiesRegister or readsRegister even though the instruction
10694 // really ought to be formed so that modifiesRegister/readsRegister would
10695 // catch it.
10696 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10697 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10698 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10700
10701 // Outlined calls change the instruction pointer, so don't read from it.
// Implicit definitions of RIP in the instruction description are checked
// as well.
10702 if (MI.readsRegister(X86::RIP, &RI) ||
10703 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10704 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10706
10707 // Don't outline CFI instructions.
10708 if (MI.isCFIInstruction())
10710
10712}
10713
10716 const outliner::OutlinedFunction &OF) const {
// Completes the body of an outlined function: a RET64 is appended to the
// end of MBB unless the function is a tail-call frame.
10717 // If we're a tail call, we already have a return, so don't do anything.
10718 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10719 return;
10720
10721 // We're a normal call, so our sequence doesn't have a return instruction.
10722 // Add it in.
10723 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10724 MBB.insert(MBB.end(), retq);
10725}
10726
// Inserts the transfer to the outlined function at It and returns an
// iterator to the inserted instruction. The target is the global in module M
// whose name equals MF.getName().
10730 // Is it a tail call?
10731 if (C.CallConstructionID == MachineOutlinerTailCall) {
10732 // Yes, just insert a JMP.
10733 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10734 .addGlobalAddress(M.getNamedValue(MF.getName())));
10735 } else {
10736 // No, insert a call.
10737 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10738 .addGlobalAddress(M.getNamedValue(MF.getName())));
10739 }
10740
10741 return It;
10742}
10743
10746 DebugLoc &DL,
10747 bool AllowSideEffects) const {
// Emits, before Iter in MBB, one instruction that zeroes Reg. Nothing is
// emitted for MMX registers (when the subtarget has MMX), for register
// classes not listed below, or when the subtarget lacks the feature checked
// in the matching branch.
// NOTE(review): the declaration of TRI is not visible in this listing.
10748 const MachineFunction &MF = *MBB.getParent();
10749 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10751
10752 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10753 // FIXME: Should we ignore MMX registers?
10754 return;
10755
10756 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10757 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10758 // upper bits of a 64-bit register automagically.
10759 Reg = getX86SubSuperRegister(Reg, 32);
10760
10761 if (!AllowSideEffects)
10762 // XOR affects flags, so use a MOV instead.
10763 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10764 else
// Both sources are marked undef: the previous value of Reg is irrelevant.
10765 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10766 .addReg(Reg, RegState::Undef)
10767 .addReg(Reg, RegState::Undef);
10768 } else if (X86::VR128RegClass.contains(Reg)) {
10769 // XMM#
10770 if (!ST.hasSSE1())
10771 return;
10772
10773 // PXOR is safe to use because it doesn't affect flags.
10774 BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
10775 .addReg(Reg, RegState::Undef)
10776 .addReg(Reg, RegState::Undef);
10777 } else if (X86::VR256RegClass.contains(Reg)) {
10778 // YMM#
10779 if (!ST.hasAVX())
10780 return;
10781
10782 // VPXOR is safe to use because it doesn't affect flags.
10783 BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
10784 .addReg(Reg, RegState::Undef)
10785 .addReg(Reg, RegState::Undef);
10786 } else if (X86::VR512RegClass.contains(Reg)) {
10787 // ZMM#
10788 if (!ST.hasAVX512())
10789 return;
10790
10791 // VPXORY is safe to use because it doesn't affect flags.
// NOTE(review): VPXORYrr is emitted here with a VR512 register; confirm
// that this opcode is the intended one for ZMM registers.
10792 BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
10793 .addReg(Reg, RegState::Undef)
10794 .addReg(Reg, RegState::Undef);
10795 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10796 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10797 X86::VK16RegClass.contains(Reg)) {
10798 if (!ST.hasVLX())
10799 return;
10800
10801 // KXOR is safe to use because it doesn't affect flags.
// The 64-bit form is chosen when the subtarget has BWI, otherwise the
// 16-bit form.
10802 unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
10803 BuildMI(MBB, Iter, DL, get(Op), Reg)
10804 .addReg(Reg, RegState::Undef)
10805 .addReg(Reg, RegState::Undef);
10806 }
10807}
10808
10810 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10811 bool DoRegPressureReduce) const {
// Handles the VPDPWSSD opcodes specially when the subtarget does not report
// a fast DPWSSD (the AVX-512 forms additionally require BWI); in that case
// the function returns true.
// NOTE(review): the statement executed just before each `return true`, and
// the head of the final fallback call, are not visible in this listing.
10812 unsigned Opc = Root.getOpcode();
10813 switch (Opc) {
10814 case X86::VPDPWSSDrr:
10815 case X86::VPDPWSSDrm:
10816 case X86::VPDPWSSDYrr:
10817 case X86::VPDPWSSDYrm: {
10818 if (!Subtarget.hasFastDPWSSD()) {
10820 return true;
10821 }
10822 break;
10823 }
10824 case X86::VPDPWSSDZ128r:
10825 case X86::VPDPWSSDZ128m:
10826 case X86::VPDPWSSDZ256r:
10827 case X86::VPDPWSSDZ256m:
10828 case X86::VPDPWSSDZr:
10829 case X86::VPDPWSSDZm: {
10830 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10832 return true;
10833 }
10834 break;
10835 }
10836 }
10838 Patterns, DoRegPressureReduce);
10839}
10840
/// Build the replacement for a VPDPWSSD instruction \p Root: a VPMADDWD that
/// writes a new virtual register, followed by a VPADDD that adds that register
/// to Root's operand 1 and writes Root's destination.
///
/// The two new instructions are appended to InsInstrs (multiply first), Root
/// is appended to DelInstrs, and the new virtual register is recorded in
/// \p InstrIdxForVirtReg with index 0.
// NOTE(review): part of the parameter list and the declaration of RegInfo are
// not visible in this listing.
10841static void
10845 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
10846 MachineFunction *MF = Root.getMF();
10848
10849 unsigned Opc = Root.getOpcode();
10850 unsigned AddOpc = 0;
10851 unsigned MaddOpc = 0;
// Pick the multiply/add opcode pair matching Root's vector width and
// encoding. Memory forms keep a memory-operand multiply; the add is always
// register-register.
10852 switch (Opc) {
10853 default:
10854 assert(false && "It should not reach here");
10855 break;
10856 // vpdpwssd xmm2,xmm3,xmm1
10857 // -->
10858 // vpmaddwd xmm3,xmm3,xmm1
10859 // vpaddd xmm2,xmm2,xmm3
10860 case X86::VPDPWSSDrr:
10861 MaddOpc = X86::VPMADDWDrr;
10862 AddOpc = X86::VPADDDrr;
10863 break;
10864 case X86::VPDPWSSDrm:
10865 MaddOpc = X86::VPMADDWDrm;
10866 AddOpc = X86::VPADDDrr;
10867 break;
10868 case X86::VPDPWSSDZ128r:
10869 MaddOpc = X86::VPMADDWDZ128rr;
10870 AddOpc = X86::VPADDDZ128rr;
10871 break;
10872 case X86::VPDPWSSDZ128m:
10873 MaddOpc = X86::VPMADDWDZ128rm;
10874 AddOpc = X86::VPADDDZ128rr;
10875 break;
10876 // vpdpwssd ymm2,ymm3,ymm1
10877 // -->
10878 // vpmaddwd ymm3,ymm3,ymm1
10879 // vpaddd ymm2,ymm2,ymm3
10880 case X86::VPDPWSSDYrr:
10881 MaddOpc = X86::VPMADDWDYrr;
10882 AddOpc = X86::VPADDDYrr;
10883 break;
10884 case X86::VPDPWSSDYrm:
10885 MaddOpc = X86::VPMADDWDYrm;
10886 AddOpc = X86::VPADDDYrr;
10887 break;
10888 case X86::VPDPWSSDZ256r:
10889 MaddOpc = X86::VPMADDWDZ256rr;
10890 AddOpc = X86::VPADDDZ256rr;
10891 break;
10892 case X86::VPDPWSSDZ256m:
10893 MaddOpc = X86::VPMADDWDZ256rm;
10894 AddOpc = X86::VPADDDZ256rr;
10895 break;
10896 // vpdpwssd zmm2,zmm3,zmm1
10897 // -->
10898 // vpmaddwd zmm3,zmm3,zmm1
10899 // vpaddd zmm2,zmm2,zmm3
10900 case X86::VPDPWSSDZr:
10901 MaddOpc = X86::VPMADDWDZrr;
10902 AddOpc = X86::VPADDDZrr;
10903 break;
10904 case X86::VPDPWSSDZm:
10905 MaddOpc = X86::VPMADDWDZrm;
10906 AddOpc = X86::VPADDDZrr;
10907 break;
10908 }
10909 // Create vpmaddwd.
// It starts as a clone of Root with the opcode swapped; operand 1 is untied
// and removed so the remaining sources feed the multiply, and the result is
// redirected into a fresh virtual register of the destination's class.
10910 const TargetRegisterClass *RC =
10911 RegInfo.getRegClass(Root.getOperand(0).getReg());
10912 Register NewReg = RegInfo.createVirtualRegister(RC);
10913 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10914 Madd->setDesc(TII.get(MaddOpc));
10915 Madd->untieRegOperand(1);
10916 Madd->removeOperand(1);
10917 Madd->getOperand(0).setReg(NewReg);
10918 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10919 // Create vpaddd.
// Root's operand 1 keeps its original kill state; the multiply result is
// always killed here.
10920 Register DstReg = Root.getOperand(0).getReg();
10921 bool IsKill = Root.getOperand(1).isKill();
10922 MachineInstr *Add =
10923 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10924 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10925 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10926 InsInstrs.push_back(Madd);
10927 InsInstrs.push_back(Add);
10928 DelInstrs.push_back(&Root);
10929}
10930
10932 MachineInstr &Root, unsigned Pattern,
10935 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
// Dispatches on Pattern: one pattern is expanded by
// genAlternativeDpCodeSequence, everything else takes the default
// (reassociation) path.
// NOTE(review): the case label for the genAlternativeDpCodeSequence branch
// and the head of the default-path call are not visible in this listing.
10936 switch (Pattern) {
10937 default:
10938 // Reassociate instructions.
10940 DelInstrs, InstrIdxForVirtReg);
10941 return;
10943 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10944 InstrIdxForVirtReg);
10945 return;
10946 }
10947}
10948
// Builds an address whose base is the frame index FI and has its full operand
// list added to Ops via getFullAddress.
// NOTE(review): the declaration of M is not visible in this listing.
10949// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
10951 int FI) const {
10953 M.BaseType = X86AddressMode::FrameIndexBase;
10954 M.Base.FrameIndex = FI;
10955 M.getFullAddress(Ops);
10956}
10957
10958#define GET_INSTRINFO_HELPERS
10959#include "X86GenInstrInfo.inc"
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerDefault
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
DXIL Forward Handle Accesses
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
Definition: InlineInfo.cpp:108
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
Expand a single-def pseudo instruction to a two-addr instruction with two undef reads of the register...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
Register const TargetRegisterInfo * TRI
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
Provides some synthesis utilities to produce sequences of values.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
#define LLVM_DEBUG(...)
Definition: Debug.h:119
#define FROM_TO(FROM, TO)
cl::opt< bool > X86EnableAPXForRelocation
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static bool isLEA(unsigned Opcode)
static void addOperands(MachineInstrBuilder &MIB, ArrayRef< MachineOperand > MOs, int PtrOffset=0)
static std::optional< ParamLoadedValue > describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetRegisterInfo *TRI)
If DescribedReg overlaps with the MOVrr instruction's destination register then, if possible,...
static cl::opt< unsigned > PartialRegUpdateClearance("partial-reg-update-clearance", cl::desc("Clearance between two register writes " "for inserting XOR to avoid partial " "register update"), cl::init(64), cl::Hidden)
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI)
static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg, const X86Subtarget &Subtarget)
static bool isConvertibleLEA(MachineInstr *MI)
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, const X86Subtarget &Subtarget)
static bool isAMXOpcode(unsigned Opc)
static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI, Register Reg)
static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII)
static int getJumpTableIndexFromAddr(const MachineInstr &MI)
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, unsigned NewWidth, unsigned *pNewMask=nullptr)
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne)
static unsigned getNewOpcFromTable(ArrayRef< X86TableEntry > Table, unsigned Opc)
static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
#define FOLD_BROADCAST(SIZE)
static cl::opt< unsigned > UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), cl::init(128), cl::Hidden)
#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64)
static bool isTruncatedShiftCountForLEA(unsigned ShAmt)
Check whether the given shift count is appropriate, i.e. can be represented by a LEA instruction.
static cl::opt< bool > ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden)
static SmallVector< MachineMemOperand *, 2 > extractLoadMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static MachineInstr * fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII)
static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx)
static bool canConvert2Copy(unsigned Opc)
static cl::opt< bool > NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions"), cl::Hidden)
static bool expandNOVLXStore(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx)
static bool isX87Reg(Register Reg)
Return true if the Reg is X87 register.
static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg)
Expand a single-def pseudo instruction to a two-addr instruction with two k0 reads.
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes)
#define VPERM_CASES_BROADCAST(Suffix)
static std::pair< X86::CondCode, unsigned > isUseDefConvertible(const MachineInstr &MI)
Check whether the use can be converted to remove a comparison against zero.
static bool findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr, const MachineRegisterInfo *MRI, MachineInstr **AndInstr, const TargetRegisterInfo *TRI, const X86Subtarget &ST, bool &NoSignFlag, bool &ClearsOverflowFlag)
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold=false)
static MachineInstr * makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI)
#define GET_ND_IF_ENABLED(OPC)
static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, const TargetInstrInfo &TII, bool HasAVX)
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold=false)
Return true for all instructions that only update the first 32 or 64-bits of the destination register...
#define CASE_NF(OP)
static const uint16_t * lookupAVX512(unsigned opcode, unsigned domain, ArrayRef< uint16_t[4]> Table)
static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool Load)
#define VPERM_CASES(Suffix)
#define FROM_TO_SIZE(A, B, S)
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, bool &ClearsOverflowFlag)
Check whether the definition can be converted to remove a comparison against zero.
static MachineInstr * fuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset=0)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode)
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static MachineBasicBlock * getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB)
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF)
Check if LoadMI is a partial register load that we can't fold into MI because the latter uses content...
cl::opt< bool > X86EnableAPXForRelocation
static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI)
static bool isHReg(Register Reg)
Test if the given register is a physical h register.
static cl::opt< bool > PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden)
static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx)
static void genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
#define CASE_ND(OP)
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
This determines which of three possible cases of a three source commute the source indexes correspond...
static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes)
static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx)
Check whether the shift count for a machine operand is non-zero.
static SmallVector< MachineMemOperand *, 2 > extractStoreMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI)
static unsigned convertALUrr2ALUri(unsigned Opc)
Convert an ALUrr opcode to corresponding ALUri opcode.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI)
Return true if register is PIC base; i.e. defined by X86::MOVPC32r.
static bool isCommutableVPERMV3Instruction(unsigned Opcode)
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:206
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:681
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:684
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:693
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:682
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:683
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:692
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:689
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:690
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:687
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:694
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:691
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DWARF expression.
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
static LLVM_ABI DIExpression * appendExt(const DIExpression *Expr, unsigned FromSize, unsigned ToSize, bool Signed)
Append a zero- or sign-extension to Expr.
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:124
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
Base class for the actual dominator tree node.
DomTreeNodeBase< NodeT > * getRootNode()
getRootNode - This returns the entry node for the CFG of the function.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:314
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:703
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:690
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
A set of physical registers with utility functions to track liveness when walking backward/forward th...
Definition: LivePhysRegs.h:52
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:410
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
static LocationSize precise(uint64_t Value)
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:652
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment, SMLoc Loc={})
.cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but Offset is a relative value that is added/subt...
Definition: MCDwarf.h:608
Instances of this class represent a single low-level machine instruction.
Definition: MCInst.h:188
void setOpcode(unsigned Op)
Definition: MCInst.h:201
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:238
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:249
unsigned getOpcode() const
Return the opcode number for this descriptor.
Definition: MCInstrDesc.h:231
unsigned char NumDefs
Definition: MCInstrDesc.h:208
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:86
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1565
Set of metadata that should be preserved when using BuildMI().
SimpleValueType SimpleTy
unsigned pred_size() const
MachineInstrBundleIterator< const MachineInstr > const_iterator
reverse_iterator rend()
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
reverse_iterator rbegin()
@ LQR_Dead
Register is known to be fully dead.
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
This class is a data container for one entry in a MachineConstantPool.
union llvm::MachineConstantPoolEntry::@205 Val
The constant itself.
bool isMachineConstantPoolEntry() const
isMachineConstantPoolEntry - Return true if the MachineConstantPoolEntry is indeed a target specific ...
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MCSymbol * getPICBaseSymbol() const
getPICBaseSymbol - Return a function-local symbol to represent the PIC base.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineInstr * CreateMachineInstr(const MCInstrDesc &MCID, DebugLoc DL, bool NoImplicit=false)
CreateMachineInstr - Allocate a new MachineInstr.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
Definition: MachineInstr.h:72
mop_iterator operands_begin()
Definition: MachineInstr.h:687
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
bool isImplicitDef() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
void dropDebugNumber()
Drop any variable location debugging information associated with this instruction.
Definition: MachineInstr.h:568
LLVM_ABI void setPreInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just prior to the instruction itself.
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
Definition: MachineInstr.h:409
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:590
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:584
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:813
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:798
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
void setFlag(MIFlag Flag)
Set a MI flag.
Definition: MachineInstr.h:416
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:511
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
unsigned getNumDefs() const
Returns the total number of definitions.
Definition: MachineInstr.h:637
void setDebugLoc(DebugLoc DL)
Replace current source information with new such.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImplicit(bool Val=true)
void setImm(int64_t immVal)
int64_t getImm() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isCPI() const
isCPI - Tests if this is a MO_ConstantPoolIndex operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
bool isJTI() const
isJTI - Tests if this is a MO_JumpTableIndex operand.
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateCPI(unsigned Idx, int Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:66
SlotIndex getBaseIndex() const
Returns the base index associated with this index.
Definition: SlotIndexes.h:225
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:238
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
virtual bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const
Return true when Inst has reassociable operands in the same MBB.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
bool isPositionIndependent() const
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TypeSize getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Provide an instruction scheduling machine model to CodeGen passes.
virtual const TargetFrameLowering * getFrameLowering() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getZero()
Definition: TypeSize.h:352
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
static LLVM_ABI Type * getFP128Ty(LLVMContext &C)
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
SlotIndex def
The index of the defining instruction.
Definition: LiveInterval.h:62
LLVM Value Representation.
Definition: Value.h:75
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag Flag=MachineInstr::NoFlags) const
Wraps up getting a CFI index and building a MachineInstr for it.
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void getFrameIndexOperands(SmallVectorImpl< MachineOperand > &Ops, int FI) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Check if there exists an earlier instruction that operates on the same source operands and sets eflag...
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Overrides the isSchedulingBoundary from Codegen/TargetInstrInfo.cpp to make it capable of identifying...
MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
void replaceBranchWithTailCall(MachineBasicBlock &MBB, SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex=nullptr) const override
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override
Returns true iff the routine could find two commutable operands in the given machine instruction.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override
static bool isDataInvariantLoad(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value l...
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override
bool isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const override
const X86RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
Definition: X86InstrInfo.h:258
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override
Returns true if we have preference on the operands order in MI, the commute decision is returned in C...
bool hasLiveCondCodeDef(MachineInstr &MI) const
True if MI has a condition code def, e.g.
std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const override
bool canMakeTailCallConditional(SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl< MachineInstr * > &NewMIs) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
convertToThreeAddress - This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_AD...
X86InstrInfo(X86Subtarget &STI)
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool expandPostRAPseudo(MachineInstr &MI) const override
bool isAssociativeAndCommutative(const MachineInstr &Inst, bool Invert) const override
MCInst getNop() const override
Return the noop instruction to use for a noop.
outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI, MachineBasicBlock::iterator &MIT, unsigned Flags) const override
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override
This is used by the pre-regalloc scheduler to determine (in conjunction with areLoadsFromSameBasePt...
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Fold a load or store of the specified stack slot into the specified machine instruction for the speci...
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isStoreToStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
bool isUnconditionalTailCall(const MachineInstr &MI) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
std::optional< std::unique_ptr< outliner::OutlinedFunction > > getOutliningCandidateInfo(const MachineModuleInfo &MMI, std::vector< outliner::Candidate > &RepeatedSequenceLocs, unsigned MinRepeats) const override
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, unsigned &NewSrcSubReg, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV, LiveIntervals *LIS) const
Given an operand within a MachineInstr, insert preceding code to put it into the right format for a p...
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isLoadFromStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const
int getSPAdjust(const MachineInstr &MI) const override
getSPAdjust - This returns the stack pointer adjustment made by this instruction.
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
int getJumpTableIndex(const MachineInstr &MI) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override
This is an architecture-specific helper function of reassociateOps.
std::pair< uint16_t, uint16_t > getExecutionDomain(const MachineInstr &MI) const override
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
isCoalescableExtInstr - Return true if the instruction is a "coalescable" extension instruction.
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Opc, Register Reg, int FrameIdx, bool isKill=false) const
void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg) const override
When getMachineCombinerPatterns() finds potential patterns, this function generates the instructions ...
bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify=false) const override
static bool isDataInvariant(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value o...
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before certain undef register...
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
void buildClearRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator Iter, DebugLoc &DL, bool AllowSideEffects=true) const override
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
int64_t getFrameAdjustment(const MachineInstr &I) const
Returns the stack pointer adjustment that happens inside the frame setup..destroy sequence (e....
Definition: X86InstrInfo.h:262
bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const override
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const
bool isHighLatencyDef(int opc) const override
void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override
foldImmediate - 'Reg' is known to be defined by a move immediate instruction, try to fold the immedia...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const
Returns an adjusted FMA opcode that must be used in FMA instruction that performs the same computatio...
bool preservesZeroValueInReg(const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const override
unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before a partial register upd...
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
unsigned getNumLocalDynamicTLSAccesses() const
bool canRealignStack(const MachineFunction &MF) const override
const TargetRegisterClass * constrainRegClassToNonRex2(const TargetRegisterClass *RC) const
bool isPICStyleGOT() const
Definition: X86Subtarget.h:333
bool canUseCMOV() const
Definition: X86Subtarget.h:188
bool isTargetWin64() const
Definition: X86Subtarget.h:329
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool hasAVX512() const
Definition: X86Subtarget.h:197
bool hasSSE41() const
Definition: X86Subtarget.h:193
bool hasSSE2() const
Definition: X86Subtarget.h:190
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:195
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool hasAVX2() const
Definition: X86Subtarget.h:196
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1685
@ OPERAND_MEMORY
Definition: MCInstrDesc.h:63
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ X86
Windows x64, Windows Itanium (IA-64)
Reg
All possible values of the reg field in the ModR/M byte.
bool isKMergeMasked(uint64_t TSFlags)
Definition: X86BaseInfo.h:1319
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
Definition: X86BaseInfo.h:825
@ SSEDomainShift
Execution domain for SSE instructions.
Definition: X86BaseInfo.h:811
bool hasNewDataDest(uint64_t TSFlags)
Definition: X86BaseInfo.h:1001
@ MO_GOT_ABSOLUTE_ADDRESS
MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a relocation of: SYMBOL_LABEL + [.
Definition: X86BaseInfo.h:367
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_PIC_BASE_OFFSET
MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the immediate should get the value of th...
Definition: X86BaseInfo.h:371
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
bool canUseApxExtendedReg(const MCInstrDesc &Desc)
Definition: X86BaseInfo.h:1260
bool isPseudo(uint64_t TSFlags)
Definition: X86BaseInfo.h:887
bool isKMasked(uint64_t TSFlags)
Definition: X86BaseInfo.h:1314
int getMemoryOperandNo(uint64_t TSFlags)
Definition: X86BaseInfo.h:1011
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Definition: X86BaseInfo.h:968
CondCode getCondFromBranch(const MachineInstr &MI)
CondCode getCondFromCFCMov(const MachineInstr &MI)
@ LAST_VALID_COND
Definition: X86BaseInfo.h:94
CondCode getCondFromMI(const MachineInstr &MI)
Return the condition code of the instruction.
int getFirstAddrOperandIdx(const MachineInstr &MI)
Return the index of the instruction's first address operand, if it has a memory reference,...
unsigned getSwappedVCMPImm(unsigned Imm)
Get the VCMP immediate if the opcodes are swapped.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
unsigned getSwappedVPCOMImm(unsigned Imm)
Get the VPCOM immediate if the opcodes are swapped.
bool isX87Instruction(MachineInstr &MI)
Check if the instruction is X87 instruction.
unsigned getNonNDVariant(unsigned Opc)
unsigned getVPCMPImmForCond(ISD::CondCode CC)
Get the VPCMP immediate for the given condition.
std::pair< CondCode, bool > getX86ConditionCode(CmpInst::Predicate Predicate)
Return a pair of condition code for the given predicate and whether the instruction operands should b...
CondCode getCondFromSETCC(const MachineInstr &MI)
unsigned getSwappedVPCMPImm(unsigned Imm)
Get the VPCMP immediate if the opcodes are swapped.
CondCode getCondFromCCMP(const MachineInstr &MI)
int getCCMPCondFlagsFromCondCode(CondCode CC)
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
const Constant * getConstantFromPool(const MachineInstr &MI, unsigned OpNo)
Find any constant pool entry associated with a specific instruction operand.
@ AddrScaleAmt
Definition: X86BaseInfo.h:30
@ AddrSegmentReg
Definition: X86BaseInfo.h:34
@ AddrIndexReg
Definition: X86BaseInfo.h:31
@ AddrNumOperands
Definition: X86BaseInfo.h:36
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand=false, bool HasNDD=false)
Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getNFVariant(unsigned Opc)
unsigned getVectorRegisterWidth(const MCOperandInfo &Info)
Get the width of the vector register operand.
CondCode getCondFromCMov(const MachineInstr &MI)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
InstrType
Represents how an instruction should be mapped by the outliner.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
MaybeAlign getAlign(const CallInst &I, unsigned Index)
static bool isAddMemInstrWithRelocation(const MachineInstr &MI)
Definition: X86InstrInfo.h:177
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
static bool isMem(const MachineInstr &MI, unsigned Op)
Definition: X86InstrInfo.h:170
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition: Alignment.h:145
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createX86GlobalBaseRegPass()
This pass initializes a global base register for PIC on x86-32.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
static const MachineInstrBuilder & addRegReg(const MachineInstrBuilder &MIB, Register Reg1, bool isKill1, unsigned SubReg1, Register Reg2, bool isKill2, unsigned SubReg2)
addRegReg - This function is used to add a memory reference of the form: [Reg + Reg].
unsigned getDeadRegState(bool B)
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
FunctionPass * createCleanupLocalDynamicTLSPass()
This pass combines multiple accesses to local-dynamic TLS variables so that the TLS base address for ...
const X86FoldTableEntry * lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:157
const X86InstrFMA3Group * getFMA3Group(unsigned Opcode, uint64_t TSFlags)
Returns a reference to a group of FMA3 opcodes to where the given Opcode is included.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition: SPIRVUtils.cpp:976
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
const X86FoldTableEntry * lookupTwoAddrFoldTable(unsigned RegOp)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition: STLExtras.h:1939
static bool isMemInstrWithGOTPCREL(const MachineInstr &MI)
Definition: X86InstrInfo.h:190
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
unsigned getUndefRegState(bool B)
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
unsigned getDefRegState(bool B)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2013
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
@ DPWSSD
Definition: X86InstrInfo.h:32
const X86FoldTableEntry * lookupUnfoldTable(unsigned MemOp)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits)
const X86FoldTableEntry * lookupFoldTable(unsigned RegOp, unsigned OpNum)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Description of the encoding of one expression Op.
Extended Value Type.
Definition: ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:163
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...
Definition: LiveVariables.h:89
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
X86AddressMode - This struct holds a generalized full x86 address mode.
This class is used to group {132, 213, 231} forms of FMA opcodes together.
unsigned get213Opcode() const
Returns the 213 form of FMA opcode.
unsigned get231Opcode() const
Returns the 231 form of FMA opcode.
bool isIntrinsic() const
Returns true iff the group of FMA opcodes holds intrinsic opcodes.
unsigned get132Opcode() const
Returns the 132 form of FMA opcode.
An individual sequence of instructions to be replaced with a call to an outlined function.
The information necessary to create an outlined function for some class of candidate.