1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
86#include "llvm/IR/Type.h"
87#include "llvm/IR/User.h"
88#include "llvm/IR/Value.h"
89#include "llvm/MC/MCInstrDesc.h"
91#include "llvm/MC/MCSchedule.h"
98#include "llvm/Support/Debug.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <limits>
112#include <optional>
113#include <tuple>
114#include <utility>
115#include <vector>
116
117using namespace llvm;
118
119#define DEBUG_TYPE "arm-isel"
120
121STATISTIC(NumTailCalls, "Number of tail calls");
122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124STATISTIC(NumConstpoolPromoted,
125 "Number of constants with their storage promoted into constant pools");
126
127static cl::opt<bool>
128ARMInterworking("arm-interworking", cl::Hidden,
129 cl::desc("Enable / disable ARM interworking (for debugging only)"),
130 cl::init(true));
131
132static cl::opt<bool> EnableConstpoolPromotion(
133 "arm-promote-constant", cl::Hidden,
134 cl::desc("Enable / disable promotion of unnamed_addr constants into "
135 "constant pools"),
136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137static cl::opt<unsigned> ConstpoolPromotionMaxSize(
138 "arm-promote-constant-max-size", cl::Hidden,
139 cl::desc("Maximum size of constant to promote into a constant pool"),
140 cl::init(64));
141static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
142 "arm-promote-constant-max-total", cl::Hidden,
143 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144 cl::init(128));
145
146static cl::opt<unsigned>
147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149 cl::init(2));
150
152 "arm-max-base-updates-to-check", cl::Hidden,
153 cl::desc("Maximum number of base-updates to check generating postindex."),
154 cl::init(64));
155
156/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
157constexpr MVT FlagsVT = MVT::i32;
158
159// The APCS parameter registers.
160static const MCPhysReg GPRArgRegs[] = {
161 ARM::R0, ARM::R1, ARM::R2, ARM::R3
162};
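// Illustrative note: under both APCS and AAPCS, r0-r3 carry the first 16 bytes
// of integer arguments; anything that does not fit in these registers is
// passed on the stack (see the calling-convention handling further below).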
163
164static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,
165 SelectionDAG &DAG, const SDLoc &DL) {
167 assert(Arg.ArgVT.bitsLT(MVT::i32));
168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
169 SDValue Ext =
170 DAG.getNode(Arg.Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
171 MVT::i32, Trunc);
172 return Ext;
173}
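// Illustrative example: for a CMSE non-secure call whose result is an i8, the
// value arriving in r0 is truncated back to i8 and then re-extended to i32
// according to the argument's sign/zero-extension flag, so the secure caller
// never relies on an extension performed by the non-secure callee.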
174
175void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
176 if (VT != PromotedLdStVT) {
177 setOperationAction(ISD::LOAD, VT, Promote);
178 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
179
180 setOperationAction(ISD::STORE, VT, Promote);
181 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
182 }
183
184 MVT ElemTy = VT.getVectorElementType();
185 if (ElemTy != MVT::f64)
189 if (ElemTy == MVT::i32) {
194 } else {
199 }
208 if (VT.isInteger()) {
212 }
213
214 // Neon does not support vector divide/remainder operations.
223
224 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
225 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
227 setOperationAction(Opcode, VT, Legal);
228 if (!VT.isFloatingPoint())
229 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
230 setOperationAction(Opcode, VT, Legal);
231}
232
233void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPRRegClass);
235 addTypeForNEON(VT, MVT::f64);
236}
237
238void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
239 addRegisterClass(VT, &ARM::DPairRegClass);
240 addTypeForNEON(VT, MVT::v2f64);
241}
242
243void ARMTargetLowering::setAllExpand(MVT VT) {
244 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
246
247 // We support these really simple operations even on types where all
248 // the actual arithmetic has to be broken down into simpler
249 // operations or turned into library calls.
250 setOperationAction(ISD::BITCAST, VT, Legal);
251 setOperationAction(ISD::LOAD, VT, Legal);
252 setOperationAction(ISD::STORE, VT, Legal);
254}
255
256void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
257 LegalizeAction Action) {
258 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
260 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
261}
262
263void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
264 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
265
266 for (auto VT : IntTypes) {
267 addRegisterClass(VT, &ARM::MQPRRegClass);
281 setOperationAction(ISD::MLOAD, VT, Custom);
282 setOperationAction(ISD::MSTORE, VT, Legal);
297
298 // No native support for these.
308
309 // Vector reductions
310 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
311 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
312 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
313 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
314 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
315 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
316 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
317 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
318 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre- and post-increment are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
332 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
337 }
338 }
339
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether or not we have MVE.fp
355 setOperationAction(ISD::MLOAD, VT, Custom);
356 setOperationAction(ISD::MSTORE, VT, Legal);
359
360 // Pre- and post-increment are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
362 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
367 }
368
369 if (HasMVEFP) {
370 setOperationAction(ISD::FMINNUM, VT, Legal);
371 setOperationAction(ISD::FMAXNUM, VT, Legal);
372 setOperationAction(ISD::FROUND, VT, Legal);
373 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
374 setOperationAction(ISD::FRINT, VT, Legal);
375 setOperationAction(ISD::FTRUNC, VT, Legal);
376 setOperationAction(ISD::FFLOOR, VT, Legal);
377 setOperationAction(ISD::FCEIL, VT, Legal);
378 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
379 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
380 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
381 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
382
383 // No native support for these.
386 setOperationAction(ISD::FSQRT, VT, Expand);
387 setOperationAction(ISD::FSIN, VT, Expand);
388 setOperationAction(ISD::FCOS, VT, Expand);
389 setOperationAction(ISD::FTAN, VT, Expand);
390 setOperationAction(ISD::FPOW, VT, Expand);
391 setOperationAction(ISD::FLOG, VT, Expand);
392 setOperationAction(ISD::FLOG2, VT, Expand);
393 setOperationAction(ISD::FLOG10, VT, Expand);
394 setOperationAction(ISD::FEXP, VT, Expand);
395 setOperationAction(ISD::FEXP2, VT, Expand);
396 setOperationAction(ISD::FEXP10, VT, Expand);
397 setOperationAction(ISD::FNEARBYINT, VT, Expand);
398 }
399 }
400
401 // Custom-expand vector reductions that are smaller than legal, to prevent
402 // spurious zero elements from being added when the vector is widened.
403 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
404 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
405 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
406 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
407 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
408 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
409 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
410 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
413 // MVE integer-only / float support. Only FP data processing on the FP
414 // vector types is inhibited at the integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre- and post-increment on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
451 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
472 setOperationAction(ISD::LOAD, VT, Custom);
473 setOperationAction(ISD::STORE, VT, Custom);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (TT.isOSBinFormatMachO()) {
523 // Uses VFP for Thumb libfuncs if available.
524 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
525 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
526 // clang-format off
527 static const struct {
528 const RTLIB::Libcall Op;
529 const RTLIB::LibcallImpl Impl;
530 } LibraryCalls[] = {
531 // Single-precision floating-point arithmetic.
532 { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
533 { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
534 { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
535 { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
536
537 // Double-precision floating-point arithmetic.
538 { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
539 { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
540 { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
541 { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
542
543 // Single-precision comparisons.
544 { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
545 { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
546 { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
547 { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
548 { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
549 { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
550 { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
551
552 // Double-precision comparisons.
553 { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
554 { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
555 { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
556 { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
557 { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
558 { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
559 { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
560
561 // Floating-point to integer conversions.
562 // i64 conversions are done via library routines even when generating VFP
563 // instructions, so use the same ones.
564 { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
565 { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
566 { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
567 { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
568
569 // Conversions between floating types.
570 { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
571 { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
572
573 // Integer to floating-point conversions.
574 // i64 conversions are done via library routines even when generating VFP
575 // instructions, so use the same ones.
576 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
577 // e.g., __floatunsidf vs. __floatunssidfvfp.
578 { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
579 { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
580 { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
581 { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
582 };
583 // clang-format on
584
585 for (const auto &LC : LibraryCalls)
586 setLibcallImpl(LC.Op, LC.Impl);
587 }
588 }
589
590 if (Subtarget->isThumb1Only())
591 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
592 else
593 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
594
595 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
596 Subtarget->hasFPRegs()) {
597 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
598 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
599
604
605 if (!Subtarget->hasVFP2Base())
606 setAllExpand(MVT::f32);
607 if (!Subtarget->hasFP64())
608 setAllExpand(MVT::f64);
609 }
610
611 if (Subtarget->hasFullFP16()) {
612 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
613 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
614 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
615
616 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
617 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
618 }
619
620 if (Subtarget->hasBF16()) {
621 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
622 setAllExpand(MVT::bf16);
623 if (!Subtarget->hasFullFP16())
624 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
625 } else {
626 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
627 setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
628 setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
629 setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
630 }
631
633 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
634 setTruncStoreAction(VT, InnerVT, Expand);
635 addAllExtLoads(VT, InnerVT, Expand);
636 }
637
640
642 }
643
644 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
646
647 if (!Subtarget->hasV8_1MMainlineOps())
649
650 if (!Subtarget->isThumb1Only())
652
655
658
659 if (Subtarget->hasMVEIntegerOps())
660 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
661
662 // Combine low-overhead loop intrinsics so that we can lower i1 types.
663 if (Subtarget->hasLOB()) {
664 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
665 }
666
667 if (Subtarget->hasNEON()) {
668 addDRTypeForNEON(MVT::v2f32);
669 addDRTypeForNEON(MVT::v8i8);
670 addDRTypeForNEON(MVT::v4i16);
671 addDRTypeForNEON(MVT::v2i32);
672 addDRTypeForNEON(MVT::v1i64);
673
674 addQRTypeForNEON(MVT::v4f32);
675 addQRTypeForNEON(MVT::v2f64);
676 addQRTypeForNEON(MVT::v16i8);
677 addQRTypeForNEON(MVT::v8i16);
678 addQRTypeForNEON(MVT::v4i32);
679 addQRTypeForNEON(MVT::v2i64);
680
681 if (Subtarget->hasFullFP16()) {
682 addQRTypeForNEON(MVT::v8f16);
683 addDRTypeForNEON(MVT::v4f16);
684 }
685
686 if (Subtarget->hasBF16()) {
687 addQRTypeForNEON(MVT::v8bf16);
688 addDRTypeForNEON(MVT::v4bf16);
689 }
690 }
691
692 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
693 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
694 // none of Neon, MVE or VFP supports any arithmetic operations on it.
695 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
696 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
697 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
698 // FIXME: Code duplication: FDIV and FREM are expanded always, see
699 // ARMTargetLowering::addTypeForNEON method for details.
700 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
701 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
702 // FIXME: Create unittest.
703 // In other words, find a way to handle the case when "copysign" appears
704 // in the DAG with vector operands.
706 // FIXME: Code duplication: SETCC has custom operation action, see
707 // ARMTargetLowering::addTypeForNEON method for details.
709 // FIXME: Create unittest for FNEG and for FABS.
710 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
711 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
712 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
713 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
714 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
715 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
716 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
717 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
718 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
719 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
720 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
721 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
722 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
723 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
724 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
725 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
726 setOperationAction(ISD::FROUNDEVEN, MVT::v2f64, Expand);
727 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
728 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
729 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
730 }
731
732 if (Subtarget->hasNEON()) {
733 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
734 // natively supported for v4f32.
735 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
736 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
737 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
738 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
739 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
740 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
741 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
742 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
743 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
744 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
745 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
746 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
747 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
748 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
749 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Expand);
750 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
751 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
752
753 // Mark v2f32 intrinsics.
754 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
755 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
756 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
757 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
758 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
759 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
760 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
761 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
762 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
763 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
764 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
765 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
766 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
767 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
768 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Expand);
769 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
770 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
771
772 for (ISD::NodeType Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
773 ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN}) {
774 setOperationAction(Op, MVT::v4f16, Expand);
775 setOperationAction(Op, MVT::v8f16, Expand);
776 }
777
778 // Neon does not support some operations on v1i64 and v2i64 types.
779 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
780 // Custom handling for some quad-vector types to detect VMULL.
781 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
782 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
783 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
784 // Custom handling for some vector types to avoid expensive expansions
785 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
787 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
789 // Neon does not have a single-instruction SINT_TO_FP or UINT_TO_FP with
790 // a destination type that is wider than the source, nor does it have a
791 // FP_TO_[SU]INT instruction with a destination narrower than the source.
801
803 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
804
805 // NEON does not have a single-instruction CTPOP for vectors with element
806 // types wider than 8 bits. However, custom lowering can leverage the
807 // v8i8/v16i8 vcnt instruction.
814
815 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
816 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
817
818 // NEON does not have a single-instruction CTTZ for vectors.
820 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
821 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
822 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
823
824 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
825 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
826 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
827 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
828
833
838
842 }
843
844 // NEON only has FMA instructions as of VFP4.
845 if (!Subtarget->hasVFP4Base()) {
846 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
847 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
848 }
849
851 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
852
853 // It is legal to extload from v4i8 to v4i16 or v4i32.
854 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
855 MVT::v2i32}) {
860 }
861 }
862
863 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
864 MVT::v4i32}) {
865 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
866 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
867 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
868 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
869 }
870 }
871
872 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
878 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
879 }
880 if (Subtarget->hasMVEIntegerOps()) {
882 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
883 ISD::SETCC});
884 }
885 if (Subtarget->hasMVEFloatOps()) {
887 }
888
889 if (!Subtarget->hasFP64()) {
890 // When targeting a floating-point unit with only single-precision
891 // operations, f64 is legal for the few double-precision instructions which
892 // are present. However, no double-precision operations other than moves,
893 // loads and stores are provided by the hardware.
902 setOperationAction(ISD::FNEG, MVT::f64, Expand);
903 setOperationAction(ISD::FABS, MVT::f64, Expand);
904 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
905 setOperationAction(ISD::FSIN, MVT::f64, Expand);
906 setOperationAction(ISD::FCOS, MVT::f64, Expand);
907 setOperationAction(ISD::FPOW, MVT::f64, Expand);
908 setOperationAction(ISD::FLOG, MVT::f64, Expand);
909 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
910 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
911 setOperationAction(ISD::FEXP, MVT::f64, Expand);
912 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
913 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
914 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
915 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
916 setOperationAction(ISD::FRINT, MVT::f64, Expand);
917 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Expand);
918 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
919 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
932 }
933
934 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
935 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
937 if (Subtarget->hasFullFP16()) {
940 }
941 }
942
943 if (!Subtarget->hasFP16()) {
944 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
946 }
947
948 computeRegisterProperties(Subtarget->getRegisterInfo());
949
950 // ARM does not have floating-point extending loads.
951 for (MVT VT : MVT::fp_valuetypes()) {
952 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
953 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
954 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
955 }
956
957 // ... or truncating stores
958 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
959 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
960 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
961 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
962 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
963
964 // ARM does not have i1 sign-extending loads.
965 for (MVT VT : MVT::integer_valuetypes())
967
968 // ARM supports all 4 flavors of integer indexed load / store.
969 if (!Subtarget->isThumb1Only()) {
970 for (unsigned im = (unsigned)ISD::PRE_INC;
972 setIndexedLoadAction(im, MVT::i1, Legal);
973 setIndexedLoadAction(im, MVT::i8, Legal);
974 setIndexedLoadAction(im, MVT::i16, Legal);
975 setIndexedLoadAction(im, MVT::i32, Legal);
976 setIndexedStoreAction(im, MVT::i1, Legal);
977 setIndexedStoreAction(im, MVT::i8, Legal);
978 setIndexedStoreAction(im, MVT::i16, Legal);
979 setIndexedStoreAction(im, MVT::i32, Legal);
980 }
981 } else {
982 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
985 }
986
991
994 if (Subtarget->hasDSP()) {
1003 }
1004 if (Subtarget->hasBaseDSP()) {
1007 }
1008
1009 // i64 operation support.
1012 if (Subtarget->isThumb1Only()) {
1015 }
1016 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1017 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1019
1027 setOperationAction(ISD::LOAD, MVT::i64, Custom);
1028 setOperationAction(ISD::STORE, MVT::i64, Custom);
1029
1030 // MVE lowers 64-bit shifts to lsll and lsrl,
1031 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1032 if (Subtarget->hasMVEIntegerOps())
1034
1035 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1036 if (Subtarget->isThumb1Only()) {
1040 }
1041
1042 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1044
1045 // ARM does not have ROTL.
1050 }
1052 // TODO: These two should be set to LibCall, but this currently breaks
1053 // the Linux kernel build. See #101786.
1056 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1059 }
1060
1061 // @llvm.readcyclecounter requires the Performance Monitors extension.
1062 // Default to the 0 expansion on unsupported platforms.
1063 // FIXME: Technically there are older ARM CPUs that have
1064 // implementation-specific ways of obtaining this information.
1065 if (Subtarget->hasPerfMon())
1066 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1067
1068 // Only ARMv6 has BSWAP.
1069 if (!Subtarget->hasV6Ops())
1071
1072 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1073 : Subtarget->hasDivideInARMMode();
1074 if (!hasDivide) {
1075 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1078 }
1079
1080 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1083
1086 }
1087
1090
1091 // Register based DivRem for AEABI (RTABI 4.2)
1092 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1093 TT.isTargetMuslAEABI() || TT.isOSWindows()) {
1096 HasStandaloneRem = false;
1097
1102 } else {
1105 }
1106
1111
1112 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1113 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1114
1115 // Use the default implementation.
1116 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1117 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1118 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1119 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1120 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1121 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1122
1123 if (TT.isOSWindows())
1124 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1125 else
1126 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1127
1128 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1129 // the default expansion.
1130 InsertFencesForAtomic = false;
1131 if (Subtarget->hasAnyDataBarrier() &&
1132 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1133 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1134 // to ldrex/strex loops already.
1135 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1136 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1137 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1138
1139 // On v8, we have particularly efficient implementations of atomic fences
1140 // if they can be combined with nearby atomic loads and stores.
1141 if (!Subtarget->hasAcquireRelease() ||
1142 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1143 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1144 InsertFencesForAtomic = true;
1145 }
1146 } else {
1147 // If there's anything we can use as a barrier, go through custom lowering
1148 // for ATOMIC_FENCE.
1149 // If the target has DMB in Thumb mode, fences can be inserted.
1150 if (Subtarget->hasDataBarrier())
1151 InsertFencesForAtomic = true;
1152
1153 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1154 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1155
1156 // Set them all for libcall, which will force libcalls.
1157 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1158 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1159 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1160 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1161 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1162 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1163 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1164 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1165 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1166 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1167 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1168 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1169 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1170 // Unordered/Monotonic case.
1171 if (!InsertFencesForAtomic) {
1172 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1173 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1174 }
1175 }
1176
1177 // Compute supported atomic widths.
1178 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1179 // For targets where __sync_* routines are reliably available, we use them
1180 // if necessary.
1181 //
1182 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1183 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1184 //
1185 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1186 // such targets should provide __sync_* routines, which use the ARM mode
1187 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1188 // encoding; see ARMISD::MEMBARRIER_MCR.)
1190 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1191 Subtarget->hasForced32BitAtomics()) {
1192 // Cortex-M cores (other than Cortex-M0) have 32-bit atomics.
1194 } else {
1195 // We can't assume anything about other targets; just use libatomic
1196 // routines.
1198 }
1199
1201
1202 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1203
1204 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1205 if (!Subtarget->hasV6Ops()) {
1208 }
1210
1211 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1212 !Subtarget->isThumb1Only()) {
1213 // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
1214 // iff the target supports VFP2.
1215 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1217 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1218 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1219 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1220 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1221 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1222 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1223 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1224 }
1225
1226 // We want to custom lower some of our intrinsics.
1231
1241 if (Subtarget->hasFullFP16()) {
1245 }
1246
1248
1249 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1250 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1251 if (Subtarget->hasFullFP16())
1252 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1253 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1254 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1255 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1256
1257 // We don't support sin/cos/fmod/copysign/pow
1258 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1259 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1260 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1261 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1262 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1263 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1266 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1267 !Subtarget->isThumb1Only()) {
1270 }
1271 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1272 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1273
1274 if (!Subtarget->hasVFP4Base()) {
1277 }
1278
1279 // Various VFP goodness
1280 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1281 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1282 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1283 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1284 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1285 }
1286
1287 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1288 if (!Subtarget->hasFP16()) {
1289 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1290 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1291 }
1292
1293 // Strict floating-point comparisons need custom lowering.
1300 }
1301
1302 // Use __sincos_stret if available.
1303 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1304 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1305 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1306 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1307 }
1308
1309 // FP-ARMv8 implements a lot of rounding-like FP operations.
1310 if (Subtarget->hasFPARMv8Base()) {
1311 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1312 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1313 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1314 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1315 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1316 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1317 setOperationAction(ISD::FROUNDEVEN, MVT::f32, Legal);
1318 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1319 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1320 if (Subtarget->hasNEON()) {
1321 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1322 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1323 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1324 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1325 }
1326
1327 if (Subtarget->hasFP64()) {
1328 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1329 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1330 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1331 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1332 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1333 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1334 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
1335 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1336 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1337 }
1338 }
1339
1340 // FP16 operations often need to be promoted to call library functions.
1341 if (Subtarget->hasFullFP16()) {
1344 setOperationAction(ISD::FSIN, MVT::f16, Promote);
1345 setOperationAction(ISD::FCOS, MVT::f16, Promote);
1346 setOperationAction(ISD::FTAN, MVT::f16, Promote);
1347 setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1348 setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1349 setOperationAction(ISD::FPOW, MVT::f16, Promote);
1350 setOperationAction(ISD::FEXP, MVT::f16, Promote);
1351 setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1352 setOperationAction(ISD::FEXP10, MVT::f16, Promote);
1353 setOperationAction(ISD::FLOG, MVT::f16, Promote);
1354 setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1355 setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1356
1357 setOperationAction(ISD::FROUND, MVT::f16, Legal);
1358 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
1359 setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
1360 setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
1361 setOperationAction(ISD::FRINT, MVT::f16, Legal);
1362 setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
1363 setOperationAction(ISD::FCEIL, MVT::f16, Legal);
1364 }
1365
1366 if (Subtarget->hasNEON()) {
1367 // vmin and vmax aren't available in a scalar form, so we can use
1368 // a NEON instruction with an undef lane instead.
1369 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1370 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1371 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1372 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1373 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1374 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1375 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1376 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1377
1378 if (Subtarget->hasV8Ops()) {
1379 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1380 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1381 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1382 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1383 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Legal);
1384 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Legal);
1385 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1386 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1387 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1388 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1389 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1390 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1391 }
1392
1393 if (Subtarget->hasFullFP16()) {
1394 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1395 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1396 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1397 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1398
1399 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1400 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1401 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1402 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1403
1404 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1405 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1406 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1407 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1408 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Legal);
1409 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal);
1410 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1411 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1412 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1413 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1414 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1415 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1416 }
1417 }
1418
1419 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1420 // it, but it's just a wrapper around ldexp.
1421 if (TT.isOSWindows()) {
1422 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1423 if (isOperationExpand(Op, MVT::f32))
1424 setOperationAction(Op, MVT::f32, Promote);
1425 }
1426
1427 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1428 // isn't legal.
1429 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1430 if (isOperationExpand(Op, MVT::f16))
1431 setOperationAction(Op, MVT::f16, Promote);
1432
1433 // We have target-specific dag combine patterns for the following nodes:
1434 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1437
1438 if (Subtarget->hasMVEIntegerOps())
1440
1441 if (Subtarget->hasV6Ops())
1443 if (Subtarget->isThumb1Only())
1445 // Attempt to lower smin/smax to ssat/usat
1446 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1447 Subtarget->isThumb2()) {
1449 }
1450
1452
1453 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1454 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1456 else
1458
1459 //// temporary - rewrite interface to use type
1462 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1464 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1466
1467 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1468 // are at least 4 bytes aligned.
1470
1471 // Prefer likely predicted branches to selects on out-of-order cores.
1472 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1473
1474 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1476 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1477
1478 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1479}
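// Illustrative note on the constructor above: "Legal" means the operation maps
// directly onto an instruction pattern, "Expand" asks the legalizer to rewrite
// it generically (often as a libcall, e.g. FSIN on f64), "Promote" widens the
// operation to a larger type, and "Custom" routes the node to one of the
// Lower* hooks defined later in this file.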
1480
1482 return Subtarget->useSoftFloat();
1483}
1484
1486 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1487}
1488
1489// FIXME: It might make sense to define the representative register class as the
1490// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1491// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1492// SPR's representative would be DPR_VFP2. This should work well if register
1493// pressure tracking were modified such that a register use would increment the
1494// pressure of the register class's representative and all of its super
1495// classes' representatives transitively. We have not implemented this because
1496// of the difficulty prior to coalescing of modeling operand register classes
1497// due to the common occurrence of cross-class copies and subregister insertions
1498// and extractions.
1499std::pair<const TargetRegisterClass *, uint8_t>
1501 MVT VT) const {
1502 const TargetRegisterClass *RRC = nullptr;
1503 uint8_t Cost = 1;
1504 switch (VT.SimpleTy) {
1505 default:
1507 // Use DPR as representative register class for all floating point
1508 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1509 // the cost is 1 for both f32 and f64.
1510 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1511 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1512 RRC = &ARM::DPRRegClass;
1513 // When NEON is used for SP, only half of the register file is available
1514 // because operations that define both SP and DP results will be constrained
1515 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1516 // coalescing by double-counting the SP regs. See the FIXME above.
1517 if (Subtarget->useNEONForSinglePrecisionFP())
1518 Cost = 2;
1519 break;
1520 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1521 case MVT::v4f32: case MVT::v2f64:
1522 RRC = &ARM::DPRRegClass;
1523 Cost = 2;
1524 break;
1525 case MVT::v4i64:
1526 RRC = &ARM::DPRRegClass;
1527 Cost = 4;
1528 break;
1529 case MVT::v8i64:
1530 RRC = &ARM::DPRRegClass;
1531 Cost = 8;
1532 break;
1533 }
1534 return std::make_pair(RRC, Cost);
1535}
1536
1537const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1538#define MAKE_CASE(V) \
1539 case V: \
1540 return #V;
1541 switch ((ARMISD::NodeType)Opcode) {
1543 break;
1746#undef MAKE_CASE
1747 }
1748 return nullptr;
1749}
1750
1752 EVT VT) const {
1753 if (!VT.isVector())
1754 return getPointerTy(DL);
1755
1756 // MVE has a predicate register.
1757 if ((Subtarget->hasMVEIntegerOps() &&
1758 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1759 VT == MVT::v16i8)) ||
1760 (Subtarget->hasMVEFloatOps() &&
1761 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1762 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1764}
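// Illustrative example: with MVE enabled, a SETCC comparing two v4i32 vectors
// yields a v4i1 predicate (held in the VCCR predicate class) rather than a
// v4i32 mask; scalar compares fall through to the pointer-sized integer type.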
1765
1766/// getRegClassFor - Return the register class that should be used for the
1767/// specified value type.
1768const TargetRegisterClass *
1769ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1770 (void)isDivergent;
1771 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1772 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1773 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1774 // MVE Q registers.
1775 if (Subtarget->hasNEON()) {
1776 if (VT == MVT::v4i64)
1777 return &ARM::QQPRRegClass;
1778 if (VT == MVT::v8i64)
1779 return &ARM::QQQQPRRegClass;
1780 }
1781 if (Subtarget->hasMVEIntegerOps()) {
1782 if (VT == MVT::v4i64)
1783 return &ARM::MQQPRRegClass;
1784 if (VT == MVT::v8i64)
1785 return &ARM::MQQQQPRRegClass;
1786 }
1788}
1789
1790// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1791// source/dest is aligned and the copy size is large enough. We therefore want
1792// to align such objects passed to memory intrinsics.
1794 Align &PrefAlign) const {
1795 if (!isa<MemIntrinsic>(CI))
1796 return false;
1797 MinSize = 8;
1798 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1799 // cycle faster than 4-byte aligned LDM.
1800 PrefAlign =
1801 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1802 return true;
1803}
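// Illustrative reading: for a memory intrinsic such as llvm.memcpy, objects of
// at least MinSize (8) bytes get a preferred alignment of 8 on v6+ non-M-class
// cores and 4 elsewhere, so the expansion can use aligned LDM/STM.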
1804
1805// Create a fast isel object.
1806FastISel *
1808 const TargetLibraryInfo *libInfo) const {
1809 return ARM::createFastISel(funcInfo, libInfo);
1810}
1811
1813 unsigned NumVals = N->getNumValues();
1814 if (!NumVals)
1815 return Sched::RegPressure;
1816
1817 for (unsigned i = 0; i != NumVals; ++i) {
1818 EVT VT = N->getValueType(i);
1819 if (VT == MVT::Glue || VT == MVT::Other)
1820 continue;
1821 if (VT.isFloatingPoint() || VT.isVector())
1822 return Sched::ILP;
1823 }
1824
1825 if (!N->isMachineOpcode())
1826 return Sched::RegPressure;
1827
1828 // Loads are scheduled for latency even if the instruction itinerary
1829 // is not available.
1830 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1831 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1832
1833 if (MCID.getNumDefs() == 0)
1834 return Sched::RegPressure;
1835 if (!Itins->isEmpty() &&
1836 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1837 return Sched::ILP;
1838
1839 return Sched::RegPressure;
1840}
1841
1842//===----------------------------------------------------------------------===//
1843// Lowering Code
1844//===----------------------------------------------------------------------===//
1845
1846static bool isSRL16(const SDValue &Op) {
1847 if (Op.getOpcode() != ISD::SRL)
1848 return false;
1849 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1850 return Const->getZExtValue() == 16;
1851 return false;
1852}
1853
1854static bool isSRA16(const SDValue &Op) {
1855 if (Op.getOpcode() != ISD::SRA)
1856 return false;
1857 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1858 return Const->getZExtValue() == 16;
1859 return false;
1860}
1861
1862static bool isSHL16(const SDValue &Op) {
1863 if (Op.getOpcode() != ISD::SHL)
1864 return false;
1865 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1866 return Const->getZExtValue() == 16;
1867 return false;
1868}
1869
1870// Check for a signed 16-bit value. We special-case SRA because it keeps
1871// things simpler when also looking for SRAs that aren't sign-extending a
1872// smaller value. Without the check, we'd need to take extra care with
1873// checking order for some operations.
1874static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1875 if (isSRA16(Op))
1876 return isSHL16(Op.getOperand(0));
1877 return DAG.ComputeNumSignBits(Op) == 17;
1878}
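// Illustrative example: (sra (shl x, 16), 16) is accepted directly; any other
// value counts as a signed 16-bit quantity only if the DAG can prove it has
// exactly 17 sign bits, i.e. the top 17 bits are all copies of bit 15.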
1879
1880/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1882 switch (CC) {
1883 default: llvm_unreachable("Unknown condition code!");
1884 case ISD::SETNE: return ARMCC::NE;
1885 case ISD::SETEQ: return ARMCC::EQ;
1886 case ISD::SETGT: return ARMCC::GT;
1887 case ISD::SETGE: return ARMCC::GE;
1888 case ISD::SETLT: return ARMCC::LT;
1889 case ISD::SETLE: return ARMCC::LE;
1890 case ISD::SETUGT: return ARMCC::HI;
1891 case ISD::SETUGE: return ARMCC::HS;
1892 case ISD::SETULT: return ARMCC::LO;
1893 case ISD::SETULE: return ARMCC::LS;
1894 }
1895}
1896
1897/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1899 ARMCC::CondCodes &CondCode2) {
1900 CondCode2 = ARMCC::AL;
1901 switch (CC) {
1902 default: llvm_unreachable("Unknown FP condition!");
1903 case ISD::SETEQ:
1904 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1905 case ISD::SETGT:
1906 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1907 case ISD::SETGE:
1908 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1909 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1910 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1911 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1912 case ISD::SETO: CondCode = ARMCC::VC; break;
1913 case ISD::SETUO: CondCode = ARMCC::VS; break;
1914 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1915 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1916 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1917 case ISD::SETLT:
1918 case ISD::SETULT: CondCode = ARMCC::LT; break;
1919 case ISD::SETLE:
1920 case ISD::SETULE: CondCode = ARMCC::LE; break;
1921 case ISD::SETNE:
1922 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1923 }
1924}
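// Illustrative example: an ordered not-equal (SETONE) has no single ARM
// condition code, so it is split into two predicated checks, MI followed by
// GT; similarly SETUEQ becomes EQ followed by VS. CondCode2 remains AL
// whenever a single condition code suffices.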
1925
1926//===----------------------------------------------------------------------===//
1927// Calling Convention Implementation
1928//===----------------------------------------------------------------------===//
1929
1930/// getEffectiveCallingConv - Get the effective calling convention, taking into
1931/// account presence of floating point hardware and calling convention
1932/// limitations, such as support for variadic functions.
1934ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1935 bool isVarArg) const {
1936 switch (CC) {
1937 default:
1938 report_fatal_error("Unsupported calling convention");
1941 case CallingConv::GHC:
1943 return CC;
1949 case CallingConv::Swift:
1952 case CallingConv::C:
1953 case CallingConv::Tail:
1954 if (!getTM().isAAPCS_ABI())
1955 return CallingConv::ARM_APCS;
1956 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1957 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1958 !isVarArg)
1960 else
1962 case CallingConv::Fast:
1964 if (!getTM().isAAPCS_ABI()) {
1965 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1966 return CallingConv::Fast;
1967 return CallingConv::ARM_APCS;
1968 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1969 !isVarArg)
1971 else
1973 }
1974}
1975
1977 bool isVarArg) const {
1978 return CCAssignFnForNode(CC, false, isVarArg);
1979}
1980
1982 bool isVarArg) const {
1983 return CCAssignFnForNode(CC, true, isVarArg);
1984}
1985
1986/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1987/// CallingConvention.
1988CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1989 bool Return,
1990 bool isVarArg) const {
1991 switch (getEffectiveCallingConv(CC, isVarArg)) {
1992 default:
1993 report_fatal_error("Unsupported calling convention");
1995 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1997 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1999 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2000 case CallingConv::Fast:
2001 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2002 case CallingConv::GHC:
2003 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2005 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2007 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2009 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2010 }
2011}
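// Illustrative example: a non-variadic C function built for hard-float AAPCS
// resolves to CC_ARM_AAPCS_VFP above, so FP arguments travel in s0-s15/d0-d7;
// the variadic case falls back to the integer AAPCS convention.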
2012
2013SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2014 MVT LocVT, MVT ValVT, SDValue Val) const {
2015 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2016 Val);
2017 if (Subtarget->hasFullFP16()) {
2018 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2019 } else {
2020 Val = DAG.getNode(ISD::TRUNCATE, dl,
2021 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2022 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2023 }
2024 return Val;
2025}
2026
2027SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2028 MVT LocVT, MVT ValVT,
2029 SDValue Val) const {
2030 if (Subtarget->hasFullFP16()) {
2031 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2032 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2033 } else {
2034 Val = DAG.getNode(ISD::BITCAST, dl,
2035 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2036 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2037 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2038 }
2039 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2040}
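// Illustrative note: MoveToHPR/MoveFromHPR implement the rule that f16/bf16
// values are passed in the low 16 bits of a 32-bit location. With full fp16
// support they use the VMOVhr/VMOVrh nodes; otherwise they bitcast through an
// integer of the location's width and truncate or zero-extend as required.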
2041
2042/// LowerCallResult - Lower the result values of a call into the
2043/// appropriate copies out of appropriate physical registers.
2044SDValue ARMTargetLowering::LowerCallResult(
2045 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2046 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2047 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2048 SDValue ThisVal, bool isCmseNSCall) const {
2049 // Assign locations to each value returned by this call.
2051 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2052 *DAG.getContext());
2053 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2054
2055 // Copy all of the result registers out of their specified physreg.
2056 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2057 CCValAssign VA = RVLocs[i];
2058
2059 // Pass 'this' value directly from the argument to return value, to avoid
2060 // reg unit interference
2061 if (i == 0 && isThisReturn) {
2062 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2063 "unexpected return calling convention register assignment");
2064 InVals.push_back(ThisVal);
2065 continue;
2066 }
2067
2068 SDValue Val;
2069 if (VA.needsCustom() &&
2070 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2071 // Handle f64 or half of a v2f64.
2072 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2073 InGlue);
2074 Chain = Lo.getValue(1);
2075 InGlue = Lo.getValue(2);
2076 VA = RVLocs[++i]; // skip ahead to next loc
2077 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2078 InGlue);
2079 Chain = Hi.getValue(1);
2080 InGlue = Hi.getValue(2);
2081 if (!Subtarget->isLittle())
2082 std::swap (Lo, Hi);
2083 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2084
2085 if (VA.getLocVT() == MVT::v2f64) {
2086 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2087 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2088 DAG.getConstant(0, dl, MVT::i32));
2089
2090 VA = RVLocs[++i]; // skip ahead to next loc
2091 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2092 Chain = Lo.getValue(1);
2093 InGlue = Lo.getValue(2);
2094 VA = RVLocs[++i]; // skip ahead to next loc
2095 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2096 Chain = Hi.getValue(1);
2097 InGlue = Hi.getValue(2);
2098 if (!Subtarget->isLittle())
2099 std::swap (Lo, Hi);
2100 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2101 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2102 DAG.getConstant(1, dl, MVT::i32));
2103 }
2104 } else {
2105 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2106 InGlue);
2107 Chain = Val.getValue(1);
2108 InGlue = Val.getValue(2);
2109 }
2110
2111 switch (VA.getLocInfo()) {
2112 default: llvm_unreachable("Unknown loc info!");
2113 case CCValAssign::Full: break;
2114 case CCValAssign::BCvt:
2115 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2116 break;
2117 }
2118
2119 // f16 arguments have their size extended to 4 bytes and are passed as if
2120 // they had been copied to the LSBs of a 32-bit register.
2121 // To do so, the value is passed extended to i32 (soft ABI) or to f32 (hard ABI).
2122 if (VA.needsCustom() &&
2123 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2124 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2125
2126 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2127 // is less than 32 bits must be sign- or zero-extended after the call for
2128 // security reasons. Although the ABI mandates an extension done by the
2129 // callee, the latter cannot be trusted to follow the rules of the ABI.
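    // For example (illustrative): an i8 result returned by a non-secure callee
    // is re-extended here (e.g. via a zero- or sign-extension that selects to
    // uxtb/sxtb) rather than trusting the callee to have done the extension.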
2130 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2131 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2132 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2133 Val = handleCMSEValue(Val, Arg, DAG, dl);
2134
2135 InVals.push_back(Val);
2136 }
2137
2138 return Chain;
2139}
2140
2141std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2142 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2143 bool IsTailCall, int SPDiff) const {
2144 SDValue DstAddr;
2145 MachinePointerInfo DstInfo;
2146 int32_t Offset = VA.getLocMemOffset();
2147 MachineFunction &MF = DAG.getMachineFunction();
2148
2149 if (IsTailCall) {
2150 Offset += SPDiff;
2151 auto PtrVT = getPointerTy(DAG.getDataLayout());
2152 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2153 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2154 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2155 DstInfo =
2157 } else {
2158 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2159 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2160 StackPtr, PtrOff);
2161 DstInfo =
2163 }
2164
2165 return std::make_pair(DstAddr, DstInfo);
2166}
2167
2168// Returns the type of copying which is required to set up a byval argument to
2169// a tail-called function. This isn't needed for non-tail calls, because they
2170 // always need the equivalent of CopyOnce, but tail-calls sometimes need two copies to
2171// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2172// optimised to zero copies when forwarding an argument from the caller's
2173// caller (NoCopy).
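// For example (illustrative): if f(struct S s) receives s on the stack and
// tail-calls g(s) so that s lands at the same stack offset, no copy is needed
// (NoCopy). If s instead comes from a global or from f's local frame, a single
// copy into the argument area suffices (CopyOnce). If the source overlaps the
// outgoing argument area at a different offset, it is first copied to a local
// temporary and only then into place (CopyViaTemp).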
2174ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2175 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2176 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2177 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2178
2179 // Globals are always safe to copy from.
2181 return CopyOnce;
2182
2183 // We can only analyse frame index nodes; conservatively assume we need a
2184 // temporary.
2185 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2186 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2187 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2188 return CopyViaTemp;
2189
2190 int SrcFI = SrcFrameIdxNode->getIndex();
2191 int DstFI = DstFrameIdxNode->getIndex();
2192 assert(MFI.isFixedObjectIndex(DstFI) &&
2193 "byval passed in non-fixed stack slot");
2194
2195 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2196 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2197
2198 // If the source is in the local frame, then the copy to the argument memory
2199 // is always valid.
2200 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2201 if (!FixedSrc ||
2202 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2203 return CopyOnce;
2204
2205 // In the case of byval arguments split between registers and the stack,
2206 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2207 // stack portion, but the Src SDValue will refer to the full value, including
2208 // the local stack memory that the register portion gets stored into. We only
2209 // need to compare them for equality, so normalise on the full value version.
2210 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2211 DstOffset -= RegSize;
2212
2213 // If the value is already in the correct location, then no copying is
2214 // needed. If not, then we need to copy via a temporary.
2215 if (SrcOffset == DstOffset)
2216 return NoCopy;
2217 else
2218 return CopyViaTemp;
2219}
2220
2221void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2222 SDValue Chain, SDValue &Arg,
2223 RegsToPassVector &RegsToPass,
2224 CCValAssign &VA, CCValAssign &NextVA,
2225 SDValue &StackPtr,
2226 SmallVectorImpl<SDValue> &MemOpChains,
2227 bool IsTailCall,
2228 int SPDiff) const {
2229 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2230 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2231 unsigned id = Subtarget->isLittle() ? 0 : 1;
2232 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2233
2234 if (NextVA.isRegLoc())
2235 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2236 else {
2237 assert(NextVA.isMemLoc());
2238 if (!StackPtr.getNode())
2239 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2241
2242 SDValue DstAddr;
2243 MachinePointerInfo DstInfo;
2244 std::tie(DstAddr, DstInfo) =
2245 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2246 MemOpChains.push_back(
2247 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2248 }
2249}
2250
2251static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2252 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2254}
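// For example (illustrative): under -tailcallopt a fastcc caller/callee pair
// uses guaranteed tail-call lowering, so the callee pops its own stack
// arguments; calls using the tail/swifttail conventions are expected to behave
// the same way regardless of the option.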
2255
2256/// LowerCall - Lowering a call into a callseq_start <-
2257/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2258/// nodes.
2259SDValue
2260ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2261 SmallVectorImpl<SDValue> &InVals) const {
2262 SelectionDAG &DAG = CLI.DAG;
2263 SDLoc &dl = CLI.DL;
2264 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2265 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2266 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2267 SDValue Chain = CLI.Chain;
2268 SDValue Callee = CLI.Callee;
2269 bool &isTailCall = CLI.IsTailCall;
2270 CallingConv::ID CallConv = CLI.CallConv;
2271 bool doesNotRet = CLI.DoesNotReturn;
2272 bool isVarArg = CLI.IsVarArg;
2273 const CallBase *CB = CLI.CB;
2274
2275 MachineFunction &MF = DAG.getMachineFunction();
2276 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2277 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2278 MachineFunction::CallSiteInfo CSInfo;
2279 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2280 bool isThisReturn = false;
2281 bool isCmseNSCall = false;
2282 bool isSibCall = false;
2283 bool PreferIndirect = false;
2284 bool GuardWithBTI = false;
2285
2286 // Analyze operands of the call, assigning locations to each operand.
2288 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2289 *DAG.getContext());
2290 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2291
2292 // Lower 'returns_twice' calls to a pseudo-instruction.
2293 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2294 !Subtarget->noBTIAtReturnTwice())
2295 GuardWithBTI = AFI->branchTargetEnforcement();
2296
2297 // Set type id for call site info.
2298 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2299 CSInfo = MachineFunction::CallSiteInfo(*CB);
2300
2301 // Determine whether this is a non-secure function call.
2302 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2303 isCmseNSCall = true;
2304
2305 // Disable tail calls if they're not supported.
2306 if (!Subtarget->supportsTailCall())
2307 isTailCall = false;
2308
2309 // For both the non-secure calls and the returns from a CMSE entry function,
2310 // the function needs to do some extra work after the call, or before the
2311 // return, respectively, thus it cannot end with a tail call
2312 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2313 isTailCall = false;
2314
2315 if (isa<GlobalAddressSDNode>(Callee)) {
2316 // If we're optimizing for minimum size and the function is called three or
2317 // more times in this block, we can improve codesize by calling indirectly
2318 // as BLXr has a 16-bit encoding.
2319 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2320 if (CLI.CB) {
2321 auto *BB = CLI.CB->getParent();
2322 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2323 count_if(GV->users(), [&BB](const User *U) {
2324 return isa<Instruction>(U) &&
2325 cast<Instruction>(U)->getParent() == BB;
2326 }) > 2;
2327 }
2328 }
2329 if (isTailCall) {
2330 // Check if it's really possible to do a tail call.
2331 isTailCall =
2332 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2333
2334 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2335 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2336 isSibCall = true;
2337
2338 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2339 // detected sibcalls.
2340 if (isTailCall)
2341 ++NumTailCalls;
2342 }
2343
2344 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2345 report_fatal_error("failed to perform tail call elimination on a call "
2346 "site marked musttail");
2347
2348 // Get a count of how many bytes are to be pushed on the stack.
2349 unsigned NumBytes = CCInfo.getStackSize();
2350
2351 // SPDiff is the byte offset of the call's argument area from the callee's.
2352 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2353 // by this amount for a tail call. In a sibling call it must be 0 because the
2354 // caller will deallocate the entire stack and the callee still expects its
2355 // arguments to begin at SP+0. Completely unused for non-tail calls.
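  // For example (illustrative): if the caller received 8 bytes of stack
  // arguments but this tail call needs 16 (after 16-byte alignment), SPDiff is
  // 8 - 16 = -8, the outgoing argument stores land 8 bytes below the incoming
  // argument area, and FrameLowering is asked to reserve those extra 8 bytes.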
2356 int SPDiff = 0;
2357
2358 if (isTailCall && !isSibCall) {
2359 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2360 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2361
2362 // Since callee will pop argument stack as a tail call, we must keep the
2363 // popped size 16-byte aligned.
2364 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2365 assert(StackAlign && "data layout string is missing stack alignment");
2366 NumBytes = alignTo(NumBytes, *StackAlign);
2367
2368 // SPDiff will be negative if this tail call requires more space than we
2369 // would automatically have in our incoming argument space. Positive if we
2370 // can actually shrink the stack.
2371 SPDiff = NumReusableBytes - NumBytes;
2372
2373 // If this call requires more stack than we have available from
2374 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2375 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2376 AFI->setArgRegsSaveSize(-SPDiff);
2377 }
2378
2379 if (isSibCall) {
2380 // For sibling tail calls, memory operands are available in our caller's stack.
2381 NumBytes = 0;
2382 } else {
2383 // Adjust the stack pointer for the new arguments...
2384 // These operations are automatically eliminated by the prolog/epilog pass
2385 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2386 }
2387
2389 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2390
2391 RegsToPassVector RegsToPass;
2392 SmallVector<SDValue, 8> MemOpChains;
2393
2394 // If we are doing a tail-call, any byval arguments will be written to stack
2395 // space which was used for incoming arguments. If any of the values being used
2396 // are incoming byval arguments to this function, then they might be
2397 // overwritten by the stores of the outgoing arguments. To avoid this, we
2398 // need to make a temporary copy of them in local stack space, then copy back
2399 // to the argument area.
2400 DenseMap<unsigned, SDValue> ByValTemporaries;
2401 SDValue ByValTempChain;
2402 if (isTailCall) {
2403 SmallVector<SDValue, 8> ByValCopyChains;
2404 for (const CCValAssign &VA : ArgLocs) {
2405 unsigned ArgIdx = VA.getValNo();
2406 SDValue Src = OutVals[ArgIdx];
2407 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2408
2409 if (!Flags.isByVal())
2410 continue;
2411
2412 SDValue Dst;
2413 MachinePointerInfo DstInfo;
2414 std::tie(Dst, DstInfo) =
2415 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2416 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2417
2418 if (Copy == NoCopy) {
2419 // If the argument is already at the correct offset on the stack
2420 // (because we are forwarding a byval argument from our caller), we
2421 // don't need any copying.
2422 continue;
2423 } else if (Copy == CopyOnce) {
2424 // If the argument is in our local stack frame, no other argument
2425 // preparation can clobber it, so we can copy it to the final location
2426 // later.
2427 ByValTemporaries[ArgIdx] = Src;
2428 } else {
2429 assert(Copy == CopyViaTemp && "unexpected enum value");
2430 // If we might be copying this argument from the outgoing argument
2431 // stack area, we need to copy via a temporary in the local stack
2432 // frame.
2433 int TempFrameIdx = MFI.CreateStackObject(
2434 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2435 SDValue Temp =
2436 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2437
2438 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2439 SDValue AlignNode =
2440 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2441
2442 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2443 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2444 ByValCopyChains.push_back(
2445 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2446 ByValTemporaries[ArgIdx] = Temp;
2447 }
2448 }
2449 if (!ByValCopyChains.empty())
2450 ByValTempChain =
2451 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2452 }
2453
2454 // During a tail call, stores to the argument area must happen after all of
2455 // the function's incoming arguments have been loaded because they may alias.
2456 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2457 // there's no point in doing so repeatedly so this tracks whether that's
2458 // happened yet.
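  // For example (illustrative): if a tail call forwards one of this function's
  // own stack arguments to a different stack offset, the load of the incoming
  // value must be ordered before the store that overwrites its old slot; the
  // TokenFactor folded in below provides that ordering.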
2459 bool AfterFormalArgLoads = false;
2460
2461 // Walk the register/memloc assignments, inserting copies/loads. In the case
2462 // of tail call optimization, arguments are handled later.
2463 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2464 i != e;
2465 ++i, ++realArgIdx) {
2466 CCValAssign &VA = ArgLocs[i];
2467 SDValue Arg = OutVals[realArgIdx];
2468 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2469 bool isByVal = Flags.isByVal();
2470
2471 // Promote the value if needed.
2472 switch (VA.getLocInfo()) {
2473 default: llvm_unreachable("Unknown loc info!");
2474 case CCValAssign::Full: break;
2475 case CCValAssign::SExt:
2476 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2477 break;
2478 case CCValAssign::ZExt:
2479 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2480 break;
2481 case CCValAssign::AExt:
2482 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2483 break;
2484 case CCValAssign::BCvt:
2485 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2486 break;
2487 }
2488
2489 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2490 Chain = DAG.getStackArgumentTokenFactor(Chain);
2491 if (ByValTempChain)
2492 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2493 ByValTempChain);
2494 AfterFormalArgLoads = true;
2495 }
2496
2497 // f16 arguments have their size extended to 4 bytes and are passed as if
2498 // they had been copied to the LSBs of a 32-bit register.
2499 // To do so, the value is passed extended to i32 (soft ABI) or to f32 (hard ABI).
2500 if (VA.needsCustom() &&
2501 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2502 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2503 } else {
2504 // f16 arguments could have been extended prior to argument lowering.
2505 // Mask these arguments if this is a CMSE nonsecure call.
2506 auto ArgVT = Outs[realArgIdx].ArgVT;
2507 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2508 auto LocBits = VA.getLocVT().getSizeInBits();
2509 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2510 SDValue Mask =
2511 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2512 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2513 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2514 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2515 }
2516 }
2517
2518 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2519 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2520 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2521 DAG.getConstant(0, dl, MVT::i32));
2522 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2523 DAG.getConstant(1, dl, MVT::i32));
2524
2525 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2526 StackPtr, MemOpChains, isTailCall, SPDiff);
2527
2528 VA = ArgLocs[++i]; // skip ahead to next loc
2529 if (VA.isRegLoc()) {
2530 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2531 StackPtr, MemOpChains, isTailCall, SPDiff);
2532 } else {
2533 assert(VA.isMemLoc());
2534 SDValue DstAddr;
2535 MachinePointerInfo DstInfo;
2536 std::tie(DstAddr, DstInfo) =
2537 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2538 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2539 }
2540 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2541 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2542 StackPtr, MemOpChains, isTailCall, SPDiff);
2543 } else if (VA.isRegLoc()) {
2544 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2545 Outs[0].VT == MVT::i32) {
2546 assert(VA.getLocVT() == MVT::i32 &&
2547 "unexpected calling convention register assignment");
2548 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2549 "unexpected use of 'returned'");
2550 isThisReturn = true;
2551 }
2552 const TargetOptions &Options = DAG.getTarget().Options;
2553 if (Options.EmitCallSiteInfo)
2554 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2555 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2556 } else if (isByVal) {
2557 assert(VA.isMemLoc());
2558 unsigned offset = 0;
2559
2560 // True if this byval aggregate will be split between registers
2561 // and memory.
2562 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2563 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2564
2565 SDValue ByValSrc;
2566 bool NeedsStackCopy;
2567 if (auto It = ByValTemporaries.find(realArgIdx);
2568 It != ByValTemporaries.end()) {
2569 ByValSrc = It->second;
2570 NeedsStackCopy = true;
2571 } else {
2572 ByValSrc = Arg;
2573 NeedsStackCopy = !isTailCall;
2574 }
2575
2576 // If part of the argument is in registers, load them.
2577 if (CurByValIdx < ByValArgsCount) {
2578 unsigned RegBegin, RegEnd;
2579 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2580
2581 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2582 unsigned int i, j;
2583 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2584 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2585 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2586 SDValue Load =
2587 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2588 DAG.InferPtrAlign(AddArg));
2589 MemOpChains.push_back(Load.getValue(1));
2590 RegsToPass.push_back(std::make_pair(j, Load));
2591 }
2592
2593 // If the parameter size exceeds the register area, the "offset" value
2594 // helps us calculate the stack slot for the remaining part properly.
2595 offset = RegEnd - RegBegin;
2596
2597 CCInfo.nextInRegsParam();
2598 }
2599
2600 // If the memory part of the argument isn't already in the correct place
2601 // (which can happen with tail calls), copy it into the argument area.
2602 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2603 auto PtrVT = getPointerTy(DAG.getDataLayout());
2604 SDValue Dst;
2605 MachinePointerInfo DstInfo;
2606 std::tie(Dst, DstInfo) =
2607 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2608 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2609 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2610 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2611 MVT::i32);
2612 SDValue AlignNode =
2613 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2614
2615 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2616 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2617 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2618 Ops));
2619 }
2620 } else {
2621 assert(VA.isMemLoc());
2622 SDValue DstAddr;
2623 MachinePointerInfo DstInfo;
2624 std::tie(DstAddr, DstInfo) =
2625 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2626
2627 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2628 MemOpChains.push_back(Store);
2629 }
2630 }
2631
2632 if (!MemOpChains.empty())
2633 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2634
2635 // Build a sequence of copy-to-reg nodes chained together with token chain
2636 // and flag operands which copy the outgoing args into the appropriate regs.
2637 SDValue InGlue;
2638 for (const auto &[Reg, N] : RegsToPass) {
2639 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2640 InGlue = Chain.getValue(1);
2641 }
2642
2643 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2644 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2645 // node so that legalize doesn't hack it.
2646 bool isDirect = false;
2647
2648 const TargetMachine &TM = getTargetMachine();
2649 const GlobalValue *GVal = nullptr;
2650 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2651 GVal = G->getGlobal();
2652 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2653
2654 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2655 bool isLocalARMFunc = false;
2656 auto PtrVt = getPointerTy(DAG.getDataLayout());
2657
2658 if (Subtarget->genLongCalls()) {
2659 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2660 "long-calls codegen is not position independent!");
2661 // Handle a global address or an external symbol. If it's not one of
2662 // those, the target's already in a register, so we don't need to do
2663 // anything extra.
2664 if (isa<GlobalAddressSDNode>(Callee)) {
2665 if (Subtarget->genExecuteOnly()) {
2666 if (Subtarget->useMovt())
2667 ++NumMovwMovt;
2668 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2669 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2670 } else {
2671 // Create a constant pool entry for the callee address
2672 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2673 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2674 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2675
2676 // Get the address of the callee into a register
2677 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2678 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2679 Callee = DAG.getLoad(
2680 PtrVt, dl, DAG.getEntryNode(), Addr,
2682 }
2683 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2684 const char *Sym = S->getSymbol();
2685
2686 if (Subtarget->genExecuteOnly()) {
2687 if (Subtarget->useMovt())
2688 ++NumMovwMovt;
2689 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2690 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2691 } else {
2692 // Create a constant pool entry for the callee address
2693 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2694 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2695 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2696
2697 // Get the address of the callee into a register
2698 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2699 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2700 Callee = DAG.getLoad(
2701 PtrVt, dl, DAG.getEntryNode(), Addr,
2703 }
2704 }
2705 } else if (isa<GlobalAddressSDNode>(Callee)) {
2706 if (!PreferIndirect) {
2707 isDirect = true;
2708 bool isDef = GVal->isStrongDefinitionForLinker();
2709
2710 // ARM call to a local ARM function is predicable.
2711 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2712 // tBX takes a register source operand.
2713 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2714 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2715 Callee = DAG.getNode(
2716 ARMISD::WrapperPIC, dl, PtrVt,
2717 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2718 Callee = DAG.getLoad(
2719 PtrVt, dl, DAG.getEntryNode(), Callee,
2723 } else if (Subtarget->isTargetCOFF()) {
2724 assert(Subtarget->isTargetWindows() &&
2725 "Windows is the only supported COFF target");
2726 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2727 if (GVal->hasDLLImportStorageClass())
2728 TargetFlags = ARMII::MO_DLLIMPORT;
2729 else if (!TM.shouldAssumeDSOLocal(GVal))
2730 TargetFlags = ARMII::MO_COFFSTUB;
2731 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2732 TargetFlags);
2733 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2734 Callee =
2735 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2736 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2738 } else {
2739 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2740 }
2741 }
2742 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2743 isDirect = true;
2744 // tBX takes a register source operand.
2745 const char *Sym = S->getSymbol();
2746 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2747 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2748 ARMConstantPoolValue *CPV =
2750 ARMPCLabelIndex, 4);
2751 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2752 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2753 Callee = DAG.getLoad(
2754 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2756 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2757 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2758 } else {
2759 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2760 }
2761 }
2762
2763 if (isCmseNSCall) {
2764 assert(!isARMFunc && !isDirect &&
2765 "Cannot handle call to ARM function or direct call");
2766 if (NumBytes > 0) {
2767 DAG.getContext()->diagnose(
2768 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2769 "call to non-secure function would require "
2770 "passing arguments on stack",
2771 dl.getDebugLoc()));
2772 }
2773 if (isStructRet) {
2774 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2776 "call to non-secure function would return value through pointer",
2777 dl.getDebugLoc()));
2778 }
2779 }
2780
2781 // FIXME: handle tail calls differently.
2782 unsigned CallOpc;
2783 if (Subtarget->isThumb()) {
2784 if (GuardWithBTI)
2785 CallOpc = ARMISD::t2CALL_BTI;
2786 else if (isCmseNSCall)
2787 CallOpc = ARMISD::tSECALL;
2788 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2789 CallOpc = ARMISD::CALL_NOLINK;
2790 else
2791 CallOpc = ARMISD::CALL;
2792 } else {
2793 if (!isDirect && !Subtarget->hasV5TOps())
2794 CallOpc = ARMISD::CALL_NOLINK;
2795 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2796 // Emit regular call when code size is the priority
2797 !Subtarget->hasMinSize())
2798 // "mov lr, pc; b _foo" to avoid confusing the return stack predictor (RSP)
2799 CallOpc = ARMISD::CALL_NOLINK;
2800 else
2801 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2802 }
2803
2804 // We don't usually want to end the call-sequence here because we would tidy
2805 // the frame up *after* the call; however, in the ABI-changing tail-call case
2806 // we've carefully laid out the parameters so that when sp is reset they'll be
2807 // in the correct location.
2808 if (isTailCall && !isSibCall) {
2809 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2810 InGlue = Chain.getValue(1);
2811 }
2812
2813 std::vector<SDValue> Ops;
2814 Ops.push_back(Chain);
2815 Ops.push_back(Callee);
2816
2817 if (isTailCall) {
2818 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2819 }
2820
2821 // Add argument registers to the end of the list so that they are known live
2822 // into the call.
2823 for (const auto &[Reg, N] : RegsToPass)
2824 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2825
2826 // Add a register mask operand representing the call-preserved registers.
2827 const uint32_t *Mask;
2828 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2829 if (isThisReturn) {
2830 // For 'this' returns, use the R0-preserving mask if applicable
2831 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2832 if (!Mask) {
2833 // Set isThisReturn to false if the calling convention is not one that
2834 // allows 'returned' to be modeled in this way, so LowerCallResult does
2835 // not try to pass 'this' straight through
2836 isThisReturn = false;
2837 Mask = ARI->getCallPreservedMask(MF, CallConv);
2838 }
2839 } else
2840 Mask = ARI->getCallPreservedMask(MF, CallConv);
2841
2842 assert(Mask && "Missing call preserved mask for calling convention");
2843 Ops.push_back(DAG.getRegisterMask(Mask));
2844
2845 if (InGlue.getNode())
2846 Ops.push_back(InGlue);
2847
2848 if (isTailCall) {
2850 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2851 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2852 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2853 return Ret;
2854 }
2855
2856 // Returns a chain and a flag for retval copy to use.
2857 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2858 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2859 InGlue = Chain.getValue(1);
2860 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2861
2862 // If we're guaranteeing that tail calls will be honoured, the callee must
2863 // pop its own argument stack on return. But this call is *not* a tail call, so
2864 // we need to undo that after it returns to restore the status quo.
2865 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2866 uint64_t CalleePopBytes =
2867 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2868
2869 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2870 if (!Ins.empty())
2871 InGlue = Chain.getValue(1);
2872
2873 // Handle result values, copying them out of physregs into vregs that we
2874 // return.
2875 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2876 InVals, isThisReturn,
2877 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2878}
2879
2880/// HandleByVal - Every parameter *after* a byval parameter is passed
2881/// on the stack. Remember the next parameter register to allocate,
2882 /// and then confiscate the rest of the parameter registers to ensure
2883/// this.
2884void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2885 Align Alignment) const {
2886 // Byval (as with any stack) slots are always at least 4 byte aligned.
2887 Alignment = std::max(Alignment, Align(4));
2888
2889 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2890 if (!Reg)
2891 return;
2892
2893 unsigned AlignInRegs = Alignment.value() / 4;
2894 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2895 for (unsigned i = 0; i < Waste; ++i)
2896 Reg = State->AllocateReg(GPRArgRegs);
2897
2898 if (!Reg)
2899 return;
2900
2901 unsigned Excess = 4 * (ARM::R4 - Reg);
2902
2903 // Special case: when NSAA != SP and the parameter size is greater than the
2904 // size of all remaining GPR registers, we can't split the parameter; we must
2905 // send it to the stack. We must also set the NCRN to R4, wasting all
2906 // remaining registers.
2907 const unsigned NSAAOffset = State->getStackSize();
2908 if (NSAAOffset != 0 && Size > Excess) {
2909 while (State->AllocateReg(GPRArgRegs))
2910 ;
2911 return;
2912 }
2913
2914 // The first register for the byval parameter is the first register that
2915 // wasn't allocated before this method call, i.e. "Reg".
2916 // If the parameter is small enough to be saved entirely in the range
2917 // [Reg, r4), then the end (one past the last) register is
2918 // Reg + param-size-in-regs; otherwise the parameter is split between
2919 // registers and the stack, and the end register is r4.
2920 unsigned ByValRegBegin = Reg;
2921 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2922 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2923 // Note that the first register has already been allocated at the beginning of
2924 // this function; allocate the remaining number of registers we need.
2925 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2926 State->AllocateReg(GPRArgRegs);
2927 // A byval parameter that is split between registers and memory needs its
2928 // size truncated here.
2929 // In the case where the entire structure fits in registers, we set the
2930 // size in memory to zero.
2931 Size = std::max<int>(Size - Excess, 0);
2932}
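// For example (illustrative): for an AAPCS call f(int, struct S16 s) where s
// is a 16-byte, 4-byte-aligned byval, the int takes r0, the byval is split
// across r1-r3 plus 4 bytes of stack, and HandleByVal shrinks the recorded
// memory Size to those remaining 4 bytes.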
2933
2934/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2935/// for tail call optimization. Targets which want to do tail call
2936/// optimization should implement this function. Note that this function also
2937/// processes musttail calls, so when this function returns false on a valid
2938/// musttail call, a fatal backend error occurs.
2939bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2941 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2942 CallingConv::ID CalleeCC = CLI.CallConv;
2943 SDValue Callee = CLI.Callee;
2944 bool isVarArg = CLI.IsVarArg;
2945 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2946 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2947 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2948 const SelectionDAG &DAG = CLI.DAG;
2949 MachineFunction &MF = DAG.getMachineFunction();
2950 const Function &CallerF = MF.getFunction();
2951 CallingConv::ID CallerCC = CallerF.getCallingConv();
2952
2953 assert(Subtarget->supportsTailCall());
2954
2955 // Indirect tail-calls require a register to hold the target address. That
2956 // register must be:
2957 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2958 // * Not callee-saved, so must be one of r0-r3 or r12.
2959 // * Not used to hold an argument to the tail-called function, which might be
2960 // in r0-r3.
2961 // * Not used to hold the return address authentication code, which is in r12
2962 // if enabled.
2963 // Sometimes, no register matches all of these conditions, so we can't do a
2964 // tail-call.
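  // For example (illustrative): an indirect tail call whose arguments occupy
  // r0-r3 while return-address signing keeps r12 reserved leaves no register
  // free to hold the target address, so the tail call is rejected here.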
2965 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2966 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2967 ARM::R3};
2968 if (!(Subtarget->isThumb1Only() ||
2969 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2970 AddressRegisters.insert(ARM::R12);
2971 for (const CCValAssign &AL : ArgLocs)
2972 if (AL.isRegLoc())
2973 AddressRegisters.erase(AL.getLocReg());
2974 if (AddressRegisters.empty()) {
2975 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2976 return false;
2977 }
2978 }
2979
2980 // Look for obvious safe cases to perform tail call optimization that do not
2981 // require ABI changes. This is what gcc calls sibcall.
2982
2983 // Exception-handling functions need a special set of instructions to indicate
2984 // a return to the hardware. Tail-calling another function would probably
2985 // break this.
2986 if (CallerF.hasFnAttribute("interrupt")) {
2987 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2988 return false;
2989 }
2990
2991 if (canGuaranteeTCO(CalleeCC,
2992 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2993 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2994 << " (guaranteed tail-call CC)\n");
2995 return CalleeCC == CallerCC;
2996 }
2997
2998 // Also avoid sibcall optimization if either caller or callee uses struct
2999 // return semantics.
3000 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3001 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3002 if (isCalleeStructRet != isCallerStructRet) {
3003 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
3004 return false;
3005 }
3006
3007 // Externally-defined functions with weak linkage should not be
3008 // tail-called on ARM when the OS does not support dynamic
3009 // pre-emption of symbols, as the AAELF spec requires normal calls
3010 // to undefined weak functions to be replaced with a NOP or jump to the
3011 // next instruction. The behaviour of branch instructions in this
3012 // situation (as used for tail calls) is implementation-defined, so we
3013 // cannot rely on the linker replacing the tail call with a return.
3014 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3015 const GlobalValue *GV = G->getGlobal();
3016 const Triple &TT = getTargetMachine().getTargetTriple();
3017 if (GV->hasExternalWeakLinkage() &&
3018 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
3019 TT.isOSBinFormatMachO())) {
3020 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
3021 return false;
3022 }
3023 }
3024
3025 // Check that the call results are passed in the same way.
3026 LLVMContext &C = *DAG.getContext();
3028 getEffectiveCallingConv(CalleeCC, isVarArg),
3029 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3030 CCAssignFnForReturn(CalleeCC, isVarArg),
3031 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
3032 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
3033 return false;
3034 }
3035 // The callee has to preserve all registers the caller needs to preserve.
3036 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3037 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3038 if (CalleeCC != CallerCC) {
3039 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3040 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
3041 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
3042 return false;
3043 }
3044 }
3045
3046 // If the caller's vararg arguments have been split between registers and the
3047 // stack, do not perform a tail call, since part of the argument is in the
3048 // caller's local frame.
3049 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3050 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
3051 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
3052 return false;
3053 }
3054
3055 // If the callee takes no arguments then go on to check the results of the
3056 // call.
3057 const MachineRegisterInfo &MRI = MF.getRegInfo();
3058 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3059 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3060 return false;
3061 }
3062
3063 // If the stack arguments for this call do not fit into our own save area then
3064 // the call cannot be made tail.
3065 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3066 return false;
3067
3068 LLVM_DEBUG(dbgs() << "true\n");
3069 return true;
3070}
3071
3072bool
3073ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3074 MachineFunction &MF, bool isVarArg,
3076 LLVMContext &Context, const Type *RetTy) const {
3078 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3079 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3080}
3081
3083 const SDLoc &DL, SelectionDAG &DAG) {
3084 const MachineFunction &MF = DAG.getMachineFunction();
3085 const Function &F = MF.getFunction();
3086
3087 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3088
3089 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3090 // version of the "preferred return address". These offsets affect the return
3091 // instruction if this is a return from PL1 without hypervisor extensions.
3092 // IRQ/FIQ: +4 "subs pc, lr, #4"
3093 // SWI: 0 "subs pc, lr, #0"
3094 // ABORT: +4 "subs pc, lr, #4"
3095 // UNDEF: +4/+2 "subs pc, lr, #0"
3096 // UNDEF varies depending on where the exception came from ARM or Thumb
3097 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
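  // For example (illustrative): a handler marked
  // __attribute__((interrupt("IRQ"))) therefore returns with "subs pc, lr, #4"
  // on A- and R-class cores, undoing the +4 applied on exception entry.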
3098
3099 int64_t LROffset;
3100 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3101 IntKind == "ABORT")
3102 LROffset = 4;
3103 else if (IntKind == "SWI" || IntKind == "UNDEF")
3104 LROffset = 0;
3105 else
3106 report_fatal_error("Unsupported interrupt attribute. If present, value "
3107 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3108
3109 RetOps.insert(RetOps.begin() + 1,
3110 DAG.getConstant(LROffset, DL, MVT::i32, false));
3111
3112 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3113}
3114
3115SDValue
3116ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3117 bool isVarArg,
3119 const SmallVectorImpl<SDValue> &OutVals,
3120 const SDLoc &dl, SelectionDAG &DAG) const {
3121 // CCValAssign - represent the assignment of the return value to a location.
3123
3124 // CCState - Info about the registers and stack slots.
3125 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3126 *DAG.getContext());
3127
3128 // Analyze outgoing return values.
3129 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3130
3131 SDValue Glue;
3133 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3134 bool isLittleEndian = Subtarget->isLittle();
3135
3136 MachineFunction &MF = DAG.getMachineFunction();
3137 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3138 AFI->setReturnRegsCount(RVLocs.size());
3139
3140 // Report error if cmse entry function returns structure through first ptr arg.
3141 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3142 // Note: using an empty SDLoc(), as the first line of the function is a
3143 // better place to report than the last line.
3144 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3146 "secure entry function would return value through pointer",
3147 SDLoc().getDebugLoc()));
3148 }
3149
3150 // Copy the result values into the output registers.
3151 for (unsigned i = 0, realRVLocIdx = 0;
3152 i != RVLocs.size();
3153 ++i, ++realRVLocIdx) {
3154 CCValAssign &VA = RVLocs[i];
3155 assert(VA.isRegLoc() && "Can only return in registers!");
3156
3157 SDValue Arg = OutVals[realRVLocIdx];
3158 bool ReturnF16 = false;
3159
3160 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3161 // Half-precision return values can be returned like this:
3162 //
3163 // t11 f16 = fadd ...
3164 // t12: i16 = bitcast t11
3165 // t13: i32 = zero_extend t12
3166 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3167 //
3168 // to avoid code generation for bitcasts, we simply set Arg to the node
3169 // that produces the f16 value, t11 in this case.
3170 //
3171 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3172 SDValue ZE = Arg.getOperand(0);
3173 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3174 SDValue BC = ZE.getOperand(0);
3175 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3176 Arg = BC.getOperand(0);
3177 ReturnF16 = true;
3178 }
3179 }
3180 }
3181 }
3182
3183 switch (VA.getLocInfo()) {
3184 default: llvm_unreachable("Unknown loc info!");
3185 case CCValAssign::Full: break;
3186 case CCValAssign::BCvt:
3187 if (!ReturnF16)
3188 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3189 break;
3190 }
3191
3192 // Mask f16 arguments if this is a CMSE nonsecure entry.
3193 auto RetVT = Outs[realRVLocIdx].ArgVT;
3194 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3195 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3196 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3197 } else {
3198 auto LocBits = VA.getLocVT().getSizeInBits();
3199 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3200 SDValue Mask =
3201 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3202 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3203 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3204 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3205 }
3206 }
3207
3208 if (VA.needsCustom() &&
3209 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3210 if (VA.getLocVT() == MVT::v2f64) {
3211 // Extract the first half and return it in two registers.
3212 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3213 DAG.getConstant(0, dl, MVT::i32));
3214 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3215 DAG.getVTList(MVT::i32, MVT::i32), Half);
3216
3217 Chain =
3218 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3219 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3220 Glue = Chain.getValue(1);
3221 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3222 VA = RVLocs[++i]; // skip ahead to next loc
3223 Chain =
3224 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3225 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3226 Glue = Chain.getValue(1);
3227 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3228 VA = RVLocs[++i]; // skip ahead to next loc
3229
3230 // Extract the 2nd half and fall through to handle it as an f64 value.
3231 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3232 DAG.getConstant(1, dl, MVT::i32));
3233 }
3234 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3235 // available.
3236 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3237 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3238 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3239 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3240 Glue = Chain.getValue(1);
3241 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3242 VA = RVLocs[++i]; // skip ahead to next loc
3243 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3244 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3245 } else
3246 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3247
3248 // Guarantee that all emitted copies are glued together so that nothing else
3249 // can be scheduled between them.
3250 Glue = Chain.getValue(1);
3251 RetOps.push_back(DAG.getRegister(
3252 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3253 }
3254 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3255 const MCPhysReg *I =
3256 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3257 if (I) {
3258 for (; *I; ++I) {
3259 if (ARM::GPRRegClass.contains(*I))
3260 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3261 else if (ARM::DPRRegClass.contains(*I))
3263 else
3264 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3265 }
3266 }
3267
3268 // Update chain and glue.
3269 RetOps[0] = Chain;
3270 if (Glue.getNode())
3271 RetOps.push_back(Glue);
3272
3273 // CPUs which aren't M-class use a special sequence to return from
3274 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3275 // though we use "subs pc, lr, #N").
3276 //
3277 // M-class CPUs actually use a normal return sequence with a special
3278 // (hardware-provided) value in LR, so the normal code path works.
3279 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3280 !Subtarget->isMClass()) {
3281 if (Subtarget->isThumb1Only())
3282 report_fatal_error("interrupt attribute is not supported in Thumb1");
3283 return LowerInterruptReturn(RetOps, dl, DAG);
3284 }
3285
3288 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3289}
3290
3291bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3292 if (N->getNumValues() != 1)
3293 return false;
3294 if (!N->hasNUsesOfValue(1, 0))
3295 return false;
3296
3297 SDValue TCChain = Chain;
3298 SDNode *Copy = *N->user_begin();
3299 if (Copy->getOpcode() == ISD::CopyToReg) {
3300 // If the copy has a glue operand, we conservatively assume it isn't safe to
3301 // perform a tail call.
3302 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3303 return false;
3304 TCChain = Copy->getOperand(0);
3305 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3306 SDNode *VMov = Copy;
3307 // f64 returned in a pair of GPRs.
3308 SmallPtrSet<SDNode*, 2> Copies;
3309 for (SDNode *U : VMov->users()) {
3310 if (U->getOpcode() != ISD::CopyToReg)
3311 return false;
3312 Copies.insert(U);
3313 }
3314 if (Copies.size() > 2)
3315 return false;
3316
3317 for (SDNode *U : VMov->users()) {
3318 SDValue UseChain = U->getOperand(0);
3319 if (Copies.count(UseChain.getNode()))
3320 // Second CopyToReg
3321 Copy = U;
3322 else {
3323 // We are at the top of this chain.
3324 // If the copy has a glue operand, we conservatively assume it
3325 // isn't safe to perform a tail call.
3326 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3327 return false;
3328 // First CopyToReg
3329 TCChain = UseChain;
3330 }
3331 }
3332 } else if (Copy->getOpcode() == ISD::BITCAST) {
3333 // f32 returned in a single GPR.
3334 if (!Copy->hasOneUse())
3335 return false;
3336 Copy = *Copy->user_begin();
3337 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3338 return false;
3339 // If the copy has a glue operand, we conservatively assume it isn't safe to
3340 // perform a tail call.
3341 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3342 return false;
3343 TCChain = Copy->getOperand(0);
3344 } else {
3345 return false;
3346 }
3347
3348 bool HasRet = false;
3349 for (const SDNode *U : Copy->users()) {
3350 if (U->getOpcode() != ARMISD::RET_GLUE &&
3351 U->getOpcode() != ARMISD::INTRET_GLUE)
3352 return false;
3353 HasRet = true;
3354 }
3355
3356 if (!HasRet)
3357 return false;
3358
3359 Chain = TCChain;
3360 return true;
3361}
3362
3363bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3364 if (!Subtarget->supportsTailCall())
3365 return false;
3366
3367 if (!CI->isTailCall())
3368 return false;
3369
3370 return true;
3371}
3372
3373 // We are writing a 64-bit value, so we need to split it into two 32-bit
3374 // values first and pass the low and high parts through.
3376 SDLoc DL(Op);
3377 SDValue WriteValue = Op->getOperand(2);
3378
3379 // This function is only supposed to be called for i64 type argument.
3380 assert(WriteValue.getValueType() == MVT::i64
3381 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3382
3383 SDValue Lo, Hi;
3384 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3385 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3386 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3387}
3388
3389// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3390// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3391// one of the above mentioned nodes. It has to be wrapped because otherwise
3392// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3393 // be used to form an addressing mode. These wrapped nodes will be selected
3394// into MOVi.
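// For example (illustrative): lowering a constant-pool reference produces
// roughly
//   t0: i32 = ARMISD::Wrapper TargetConstantPool:i32<...>
//   t1: f64,ch = load<(load from constant pool)> entry, t0
// and selection later materializes the wrapped address, e.g. with a
// pc-relative ADR/LDR or a movw/movt pair, depending on the subtarget.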
3395SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3396 SelectionDAG &DAG) const {
3397 EVT PtrVT = Op.getValueType();
3398 // FIXME there is no actual debug info here
3399 SDLoc dl(Op);
3400 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3401 SDValue Res;
3402
3403 // When generating execute-only code Constant Pools must be promoted to the
3404 // global data section. It's a bit ugly that we can't share them across basic
3405 // blocks, but this way we guarantee that execute-only behaves correctly with
3406 // position-independent addressing modes.
3407 if (Subtarget->genExecuteOnly()) {
3408 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3409 auto *T = CP->getType();
3410 auto C = const_cast<Constant*>(CP->getConstVal());
3411 auto M = DAG.getMachineFunction().getFunction().getParent();
3412 auto GV = new GlobalVariable(
3413 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3414 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3415 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3416 Twine(AFI->createPICLabelUId())
3417 );
3419 dl, PtrVT);
3420 return LowerGlobalAddress(GA, DAG);
3421 }
3422
3423 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3424 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3425 Align CPAlign = CP->getAlign();
3426 if (Subtarget->isThumb1Only())
3427 CPAlign = std::max(CPAlign, Align(4));
3428 if (CP->isMachineConstantPoolEntry())
3429 Res =
3430 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3431 else
3432 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3433 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3434}
3435
3437 // If we don't have a 32-bit pc-relative branch instruction then the jump
3438 // table consists of block addresses. Usually this is inline, but for
3439 // execute-only it must be placed out-of-line.
3440 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3443}
3444
3445SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3446 SelectionDAG &DAG) const {
3449 unsigned ARMPCLabelIndex = 0;
3450 SDLoc DL(Op);
3451 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3452 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3453 SDValue CPAddr;
3454 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3455 if (!IsPositionIndependent) {
3456 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3457 } else {
3458 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3459 ARMPCLabelIndex = AFI->createPICLabelUId();
3461 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3462 ARMCP::CPBlockAddress, PCAdj);
3463 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3464 }
3465 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3466 SDValue Result = DAG.getLoad(
3467 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3469 if (!IsPositionIndependent)
3470 return Result;
3471 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3472 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3473}
3474
3475/// Convert a TLS address reference into the correct sequence of loads
3476/// and calls to compute the variable's address for Darwin, and return an
3477/// SDValue containing the final node.
3478
3479/// Darwin only has one TLS scheme which must be capable of dealing with the
3480/// fully general situation, in the worst case. This means:
3481/// + "extern __thread" declaration.
3482/// + Defined in a possibly unknown dynamic library.
3483///
3484/// The general system is that each __thread variable has a [3 x i32] descriptor
3485/// which contains information used by the runtime to calculate the address. The
3486/// only part of this the compiler needs to know about is the first word, which
3487/// contains a function pointer that must be called with the address of the
3488/// entire descriptor in "r0".
3489///
3490/// Since this descriptor may be in a different unit, in general access must
3491/// proceed along the usual ARM rules. A common sequence to produce is:
3492///
3493/// movw rT1, :lower16:_var$non_lazy_ptr
3494/// movt rT1, :upper16:_var$non_lazy_ptr
3495/// ldr r0, [rT1]
3496/// ldr rT2, [r0]
3497/// blx rT2
3498/// [...address now in r0...]
3499SDValue
3500ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3501 SelectionDAG &DAG) const {
3502 assert(Subtarget->isTargetDarwin() &&
3503 "This function expects a Darwin target");
3504 SDLoc DL(Op);
3505
3506 // First step is to get the address of the actual global symbol. This is where
3507 // the TLS descriptor lives.
3508 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3509
3510 // The first entry in the descriptor is a function pointer that we must call
3511 // to obtain the address of the variable.
3512 SDValue Chain = DAG.getEntryNode();
3513 SDValue FuncTLVGet = DAG.getLoad(
3514 MVT::i32, DL, Chain, DescAddr,
3518 Chain = FuncTLVGet.getValue(1);
3519
3520 MachineFunction &F = DAG.getMachineFunction();
3521 MachineFrameInfo &MFI = F.getFrameInfo();
3522 MFI.setAdjustsStack(true);
3523
3524 // TLS calls preserve all registers except those that absolutely must be
3525 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3526 // silly).
3527 auto TRI =
3529 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3530 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3531
3532 // Finally, we can make the call. This is just a degenerate version of a
3533 // normal ARM call node: r0 takes the address of the descriptor, and
3534 // returns the address of the variable in this thread.
3535 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3536 Chain =
3537 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3538 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3539 DAG.getRegisterMask(Mask), Chain.getValue(1));
3540 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3541}
3542
3543SDValue
3544ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3545 SelectionDAG &DAG) const {
3546 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3547
3548 SDValue Chain = DAG.getEntryNode();
3549 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3550 SDLoc DL(Op);
3551
3552 // Load the current TEB (thread environment block)
3553 SDValue Ops[] = {Chain,
3554 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3555 DAG.getTargetConstant(15, DL, MVT::i32),
3556 DAG.getTargetConstant(0, DL, MVT::i32),
3557 DAG.getTargetConstant(13, DL, MVT::i32),
3558 DAG.getTargetConstant(0, DL, MVT::i32),
3559 DAG.getTargetConstant(2, DL, MVT::i32)};
3560 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3561 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3562
3563 SDValue TEB = CurrentTEB.getValue(0);
3564 Chain = CurrentTEB.getValue(1);
3565
3566 // Load the ThreadLocalStoragePointer from the TEB
3567 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3568 SDValue TLSArray =
3569 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3570 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3571
3572 // The pointer to the thread's TLS data area is found in the TLSArray at the
3573 // offset given by the TLS index scaled by 4.
3574
3575 // Load the TLS index from the C runtime
3576 SDValue TLSIndex =
3577 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3578 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3579 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3580
3581 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3582 DAG.getConstant(2, DL, MVT::i32));
3583 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3584 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3585 MachinePointerInfo());
3586
3587 // Get the offset of the start of the .tls section (section base)
3588 const auto *GA = cast<GlobalAddressSDNode>(Op);
3589 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3590 SDValue Offset = DAG.getLoad(
3591 PtrVT, DL, Chain,
3592 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3593 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3595
3596 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3597}
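// A sketch (not emitted verbatim; register names and exact relocation spellings
// are assumptions) of the access sequence the Windows lowering above produces:
//   mrc   p15, #0, rT, c13, c0, #2   ; rT = TEB
//   ldr   rA, [rT, #0x2c]            ; rA = ThreadLocalStoragePointer
//   ldr   rI, =_tls_index            ; rI = &_tls_index
//   ldr   rI, [rI]                   ; rI = _tls_index
//   ldr   rB, [rA, rI, lsl #2]       ; rB = base of this module's TLS block
//   ldr   rO, <constpool: var(secrel32)>
//   add   rD, rB, rO                 ; rD = address of the TLS variable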
3598
3599// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3600SDValue
3601ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3602 SelectionDAG &DAG) const {
3603 SDLoc dl(GA);
3604 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3605 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3606 MachineFunction &MF = DAG.getMachineFunction();
3607 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3608 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3609 ARMConstantPoolValue *CPV =
3610 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3611 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3612 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3613 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3614 Argument = DAG.getLoad(
3615 PtrVT, dl, DAG.getEntryNode(), Argument,
3617 SDValue Chain = Argument.getValue(1);
3618
3619 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3620 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3621
3622 // call __tls_get_addr.
3624 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3625
3626 // FIXME: is there useful debug info available here?
3627 TargetLowering::CallLoweringInfo CLI(DAG);
3628 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3630 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3631
3632 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3633 return CallResult.first;
3634}
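// Rough shape of the general-dynamic sequence built above (a sketch; label
// names and register allocation are assumptions):
//   ldr   r0, <constpool: var(tlsgd) pc-relative entry>
// .LPC:
//   add   r0, pc, r0                 ; ARMISD::PIC_ADD
//   bl    __tls_get_addr             ; returns the variable's address in r0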
3635
3636// Lower ISD::GlobalTLSAddress using the "initial exec" or
3637// "local exec" model.
3638SDValue
3639ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3640 SelectionDAG &DAG,
3641 TLSModel::Model model) const {
3642 const GlobalValue *GV = GA->getGlobal();
3643 SDLoc dl(GA);
3645 SDValue Chain = DAG.getEntryNode();
3646 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3647 // Get the Thread Pointer
3649
3650 if (model == TLSModel::InitialExec) {
3651 MachineFunction &MF = DAG.getMachineFunction();
3652 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3653 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3654 // Initial exec model.
3655 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3656 ARMConstantPoolValue *CPV =
3657 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3659 true);
3660 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3661 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3662 Offset = DAG.getLoad(
3663 PtrVT, dl, Chain, Offset,
3665 Chain = Offset.getValue(1);
3666
3667 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3668 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3669
3670 Offset = DAG.getLoad(
3671 PtrVT, dl, Chain, Offset,
3673 } else {
3674 // local exec model
3675 assert(model == TLSModel::LocalExec);
3676 ARMConstantPoolValue *CPV =
3678 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3679 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3680 Offset = DAG.getLoad(
3681 PtrVT, dl, Chain, Offset,
3683 }
3684
3685 // The address of the thread local variable is the add of the thread
3686 // pointer with the offset of the variable.
3687 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3688}
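// A sketch of the two sequences built above: for initial-exec the literal pool
// holds var(gottpoff); after the PIC add we load the thread-pointer offset
// through that GOT slot and add it to the thread pointer. For local-exec the
// literal pool holds var(tpoff) directly, so a single load plus the add to the
// thread pointer suffices.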
3689
3690SDValue
3691ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3692 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3693 if (DAG.getTarget().useEmulatedTLS())
3694 return LowerToTLSEmulatedModel(GA, DAG);
3695
3696 if (Subtarget->isTargetDarwin())
3697 return LowerGlobalTLSAddressDarwin(Op, DAG);
3698
3699 if (Subtarget->isTargetWindows())
3700 return LowerGlobalTLSAddressWindows(Op, DAG);
3701
3702 // TODO: implement the "local dynamic" model
3703 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3705
3706 switch (model) {
3709 return LowerToTLSGeneralDynamicModel(GA, DAG);
3712 return LowerToTLSExecModels(GA, DAG, model);
3713 }
3714 llvm_unreachable("bogus TLS model");
3715}
3716
3717/// Return true if all users of V are within function F, looking through
3718/// ConstantExprs.
3719static bool allUsersAreInFunction(const Value *V, const Function *F) {
3720 SmallVector<const User*,4> Worklist(V->users());
3721 while (!Worklist.empty()) {
3722 auto *U = Worklist.pop_back_val();
3723 if (isa<ConstantExpr>(U)) {
3724 append_range(Worklist, U->users());
3725 continue;
3726 }
3727
3728 auto *I = dyn_cast<Instruction>(U);
3729 if (!I || I->getParent()->getParent() != F)
3730 return false;
3731 }
3732 return true;
3733}
3734
3736 const GlobalValue *GV, SelectionDAG &DAG,
3737 EVT PtrVT, const SDLoc &dl) {
3738 // If we're creating a pool entry for a constant global with unnamed address,
3739 // and the global is small enough, we can emit it inline into the constant pool
3740 // to save ourselves an indirection.
3741 //
3742 // This is a win if the constant is only used in one function (so it doesn't
3743 // need to be duplicated) or duplicating the constant wouldn't increase code
3744 // size (implying the constant is no larger than 4 bytes).
3745 const Function &F = DAG.getMachineFunction().getFunction();
3746
3747 // We rely on this decision to inline being idempotent and unrelated to the
3748 // use-site. We know that if we inline a variable at one use site, we'll
3749 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3750 // doesn't know about this optimization, so bail out if it's enabled;
3751 // otherwise we could decide to inline here (and thus never emit the GV)
3752 // while fast-isel generated code would still require the GV.
3755 return SDValue();
3756
3757 auto *GVar = dyn_cast<GlobalVariable>(GV);
3758 if (!GVar || !GVar->hasInitializer() ||
3759 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3760 !GVar->hasLocalLinkage())
3761 return SDValue();
3762
3763 // If we inline a value that contains relocations, we move the relocations
3764 // from .data to .text. This is not allowed in position-independent code.
3765 auto *Init = GVar->getInitializer();
3766 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3767 Init->needsDynamicRelocation())
3768 return SDValue();
3769
3770 // The constant islands pass can only really deal with alignment requests
3771 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3772 // any type wanting greater alignment requirements than 4 bytes. We also
3773 // can only promote constants that are multiples of 4 bytes in size or
3774 // are paddable to a multiple of 4. Currently we only try to pad constants
3775 // that are strings for simplicity.
3776 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3777 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3778 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3779 unsigned RequiredPadding = 4 - (Size % 4);
3780 bool PaddingPossible =
3781 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3782 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3783 Size == 0)
3784 return SDValue();
3785
3786 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3788 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3789
3790 // We can't bloat the constant pool too much, else the ConstantIslands pass
3791 // may fail to converge. If we haven't promoted this global yet (it may have
3792 // multiple uses), and promoting it would increase the constant pool size (Sz
3793 // > 4), ensure we have space to do so up to MaxTotal.
3794 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3795 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3797 return SDValue();
3798
3799 // This is only valid if all users are in a single function; we can't clone
3800 // the constant in general. The LLVM IR unnamed_addr allows merging
3801 // constants, but not cloning them.
3802 //
3803 // We could potentially allow cloning if we could prove all uses of the
3804 // constant in the current function don't care about the address, like
3805 // printf format strings. But that isn't implemented for now.
3806 if (!allUsersAreInFunction(GVar, &F))
3807 return SDValue();
3808
3809 // We're going to inline this global. Pad it out if needed.
3810 if (RequiredPadding != 4) {
3811 StringRef S = CDAInit->getAsString();
3812
3814 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3815 while (RequiredPadding--)
3816 V.push_back(0);
3818 }
3819
3820 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3821 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3822 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3825 PaddedSize - 4);
3826 }
3827 ++NumConstpoolPromoted;
3828 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3829}
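// Worked example (a sketch): a local constant string "hello\0" has Size == 6,
// so RequiredPadding == 2 and two zero bytes are appended before the global is
// placed in the constant pool as an 8-byte entry. The constant-islands pass
// therefore only ever sees 4-byte-aligned entries whose size is a multiple
// of 4.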
3830
3832 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3833 if (!(GV = GA->getAliaseeObject()))
3834 return false;
3835 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3836 return V->isConstant();
3837 return isa<Function>(GV);
3838}
3839
3840SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3841 SelectionDAG &DAG) const {
3842 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3843 default: llvm_unreachable("unknown object format");
3844 case Triple::COFF:
3845 return LowerGlobalAddressWindows(Op, DAG);
3846 case Triple::ELF:
3847 return LowerGlobalAddressELF(Op, DAG);
3848 case Triple::MachO:
3849 return LowerGlobalAddressDarwin(Op, DAG);
3850 }
3851}
3852
3853SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3854 SelectionDAG &DAG) const {
3855 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3856 SDLoc dl(Op);
3857 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3858 bool IsRO = isReadOnly(GV);
3859
3860 // promoteToConstantPool only if not generating XO text section
3861 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3862 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3863 return V;
3864
3865 if (isPositionIndependent()) {
3867 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3868 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3869 if (!GV->isDSOLocal())
3870 Result =
3871 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3873 return Result;
3874 } else if (Subtarget->isROPI() && IsRO) {
3875 // PC-relative.
3876 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3877 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3878 return Result;
3879 } else if (Subtarget->isRWPI() && !IsRO) {
3880 // SB-relative.
3881 SDValue RelAddr;
3882 if (Subtarget->useMovt()) {
3883 ++NumMovwMovt;
3884 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3885 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3886 } else { // use literal pool for address constant
3887 ARMConstantPoolValue *CPV =
3889 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3890 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3891 RelAddr = DAG.getLoad(
3892 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3894 }
3895 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3896 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3897 return Result;
3898 }
3899
3900 // If we have T2 ops, we can materialize the address directly via movt/movw
3901 // pair. This is always cheaper. If we need to generate execute-only code and
3902 // only have Thumb1 available, we can't use a constant pool and are forced to
3903 // use immediate relocations.
3904 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3905 if (Subtarget->useMovt())
3906 ++NumMovwMovt;
3907 // FIXME: Once remat is capable of dealing with instructions with register
3908 // operands, expand this into two nodes.
3909 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3910 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3911 } else {
3912 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3913 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3914 return DAG.getLoad(
3915 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3917 }
3918}
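// Summary of the ELF cases handled above (a sketch of the resulting code):
//  * PIC, non-DSO-local: form the GOT slot address pc-relatively, then load
//    the global's address through it.
//  * ROPI + read-only:   a pc-relative address (WrapperPIC), no load needed.
//  * RWPI + writable:    an SB-relative offset (movw/movt or literal pool)
//    added to the static base register r9.
//  * Otherwise:          movw/movt of the absolute address, or a literal-pool
//    load when movt is unavailable.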
3919
3920SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3921 SelectionDAG &DAG) const {
3922 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3923 "ROPI/RWPI not currently supported for Darwin");
3924 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3925 SDLoc dl(Op);
3926 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3927
3928 if (Subtarget->useMovt())
3929 ++NumMovwMovt;
3930
3931 // FIXME: Once remat is capable of dealing with instructions with register
3932 // operands, expand this into multiple nodes
3933 unsigned Wrapper =
3935
3936 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3937 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3938
3939 if (Subtarget->isGVIndirectSymbol(GV))
3940 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3942 return Result;
3943}
3944
3945SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3946 SelectionDAG &DAG) const {
3947 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3948 assert(Subtarget->useMovt() &&
3949 "Windows on ARM expects to use movw/movt");
3950 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3951 "ROPI/RWPI not currently supported for Windows");
3952
3953 const TargetMachine &TM = getTargetMachine();
3954 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3955 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3956 if (GV->hasDLLImportStorageClass())
3957 TargetFlags = ARMII::MO_DLLIMPORT;
3958 else if (!TM.shouldAssumeDSOLocal(GV))
3959 TargetFlags = ARMII::MO_COFFSTUB;
3960 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3962 SDLoc DL(Op);
3963
3964 ++NumMovwMovt;
3965
3966 // FIXME: Once remat is capable of dealing with instructions with register
3967 // operands, expand this into two nodes.
3968 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3969 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3970 TargetFlags));
3971 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3972 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3974 return Result;
3975}
3976
3977SDValue
3978ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3979 SDLoc dl(Op);
3980 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3981 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3982 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3983 Op.getOperand(1), Val);
3984}
3985
3986SDValue
3987ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3988 SDLoc dl(Op);
3989 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3990 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3991}
3992
3993SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3994 SelectionDAG &DAG) const {
3995 SDLoc dl(Op);
3996 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3997 Op.getOperand(0));
3998}
3999
4000SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4001 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
4002 unsigned IntNo =
4003 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4004 switch (IntNo) {
4005 default:
4006 return SDValue(); // Don't custom lower most intrinsics.
4007 case Intrinsic::arm_gnu_eabi_mcount: {
4008 MachineFunction &MF = DAG.getMachineFunction();
4009 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4010 SDLoc dl(Op);
4011 SDValue Chain = Op.getOperand(0);
4012 // call "\01__gnu_mcount_nc"
4013 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4014 const uint32_t *Mask =
4016 assert(Mask && "Missing call preserved mask for calling convention");
4017 // Mark LR an implicit live-in.
4018 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4019 SDValue ReturnAddress =
4020 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4021 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4022 SDValue Callee =
4023 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4025 if (Subtarget->isThumb())
4026 return SDValue(
4027 DAG.getMachineNode(
4028 ARM::tBL_PUSHLR, dl, ResultTys,
4029 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4030 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4031 0);
4032 return SDValue(
4033 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4034 {ReturnAddress, Callee, RegisterMask, Chain}),
4035 0);
4036 }
4037 }
4038}
4039
4040SDValue
4041ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4042 const ARMSubtarget *Subtarget) const {
4043 unsigned IntNo = Op.getConstantOperandVal(0);
4044 SDLoc dl(Op);
4045 switch (IntNo) {
4046 default: return SDValue(); // Don't custom lower most intrinsics.
4047 case Intrinsic::thread_pointer: {
4048 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4049 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4050 }
4051 case Intrinsic::arm_cls: {
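    // cls(x) is expanded as ctlz(((x ^ (x >> 31)) << 1) | 1): the xor folds
    // away the redundant copies of the sign bit and the "| 1" keeps the result
    // in [0, 31]. Worked example (values assumed): x = 0xFFFFFFF0 gives
    // x ^ (x >> 31) = 0xF, (0xF << 1) | 1 = 0x1F, ctlz(0x1F) = 27, which is
    // cls(0xFFFFFFF0).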
4052 const SDValue &Operand = Op.getOperand(1);
4053 const EVT VTy = Op.getValueType();
4054 SDValue SRA =
4055 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4056 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4057 SDValue SHL =
4058 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4059 SDValue OR =
4060 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4061 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4062 return Result;
4063 }
4064 case Intrinsic::arm_cls64: {
4065 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4066 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4067 const SDValue &Operand = Op.getOperand(1);
4068 const EVT VTy = Op.getValueType();
4069 SDValue Lo, Hi;
4070 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4071 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4072 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4073 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4074 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4075 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4076 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4077 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4078 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4079 SDValue CheckLo =
4080 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4081 SDValue HiIsZero =
4082 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4083 SDValue AdjustedLo =
4084 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4085 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4086 SDValue Result =
4087 DAG.getSelect(dl, VTy, CheckLo,
4088 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4089 return Result;
4090 }
4091 case Intrinsic::eh_sjlj_lsda: {
4092 MachineFunction &MF = DAG.getMachineFunction();
4093 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4094 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4095 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4096 SDValue CPAddr;
4097 bool IsPositionIndependent = isPositionIndependent();
4098 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4099 ARMConstantPoolValue *CPV =
4100 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4101 ARMCP::CPLSDA, PCAdj);
4102 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4103 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4104 SDValue Result = DAG.getLoad(
4105 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4107
4108 if (IsPositionIndependent) {
4109 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4110 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4111 }
4112 return Result;
4113 }
4114 case Intrinsic::arm_neon_vabs:
4115 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4116 Op.getOperand(1));
4117 case Intrinsic::arm_neon_vabds:
4118 if (Op.getValueType().isInteger())
4119 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4120 Op.getOperand(1), Op.getOperand(2));
4121 return SDValue();
4122 case Intrinsic::arm_neon_vabdu:
4123 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4124 Op.getOperand(1), Op.getOperand(2));
4125 case Intrinsic::arm_neon_vmulls:
4126 case Intrinsic::arm_neon_vmullu: {
4127 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4128 ? ARMISD::VMULLs : ARMISD::VMULLu;
4129 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4130 Op.getOperand(1), Op.getOperand(2));
4131 }
4132 case Intrinsic::arm_neon_vminnm:
4133 case Intrinsic::arm_neon_vmaxnm: {
4134 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4135 ? ISD::FMINNUM : ISD::FMAXNUM;
4136 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4137 Op.getOperand(1), Op.getOperand(2));
4138 }
4139 case Intrinsic::arm_neon_vminu:
4140 case Intrinsic::arm_neon_vmaxu: {
4141 if (Op.getValueType().isFloatingPoint())
4142 return SDValue();
4143 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4144 ? ISD::UMIN : ISD::UMAX;
4145 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4146 Op.getOperand(1), Op.getOperand(2));
4147 }
4148 case Intrinsic::arm_neon_vmins:
4149 case Intrinsic::arm_neon_vmaxs: {
4150 // v{min,max}s is overloaded between signed integers and floats.
4151 if (!Op.getValueType().isFloatingPoint()) {
4152 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4153 ? ISD::SMIN : ISD::SMAX;
4154 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4155 Op.getOperand(1), Op.getOperand(2));
4156 }
4157 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4158 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4159 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4160 Op.getOperand(1), Op.getOperand(2));
4161 }
4162 case Intrinsic::arm_neon_vtbl1:
4163 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4164 Op.getOperand(1), Op.getOperand(2));
4165 case Intrinsic::arm_neon_vtbl2:
4166 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4167 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4168 case Intrinsic::arm_mve_pred_i2v:
4169 case Intrinsic::arm_mve_pred_v2i:
4170 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4171 Op.getOperand(1));
4172 case Intrinsic::arm_mve_vreinterpretq:
4173 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4174 Op.getOperand(1));
4175 case Intrinsic::arm_mve_lsll:
4176 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4177 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4178 case Intrinsic::arm_mve_asrl:
4179 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4180 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4181 }
4182}
4183
4185 const ARMSubtarget *Subtarget) {
4186 SDLoc dl(Op);
4187 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4188 if (SSID == SyncScope::SingleThread)
4189 return Op;
4190
4191 if (!Subtarget->hasDataBarrier()) {
4192 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4193 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4194 // here.
4195 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4196 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4197 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4198 DAG.getConstant(0, dl, MVT::i32));
4199 }
4200
4201 AtomicOrdering Ord =
4202 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4204 if (Subtarget->isMClass()) {
4205 // Only a full system barrier exists in the M-class architectures.
4207 } else if (Subtarget->preferISHSTBarriers() &&
4208 Ord == AtomicOrdering::Release) {
4209 // Swift happens to implement ISHST barriers in a way that's compatible with
4210 // Release semantics but weaker than ISH so we'd be fools not to use
4211 // it. Beware: other processors probably don't!
4213 }
4214
4215 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4216 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4217 DAG.getConstant(Domain, dl, MVT::i32));
4218}
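// A sketch of the result: the fence normally becomes "dmb ish"; M-class cores
// only provide the full-system domain ("dmb sy"), and cores that prefer ISHST
// barriers get "dmb ishst" for release fences, matching the Domain selection
// above.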
4219
4221 const ARMSubtarget *Subtarget) {
4222 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4223 if (!(Subtarget->isThumb2() ||
4224 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4225 // Just preserve the chain.
4226 return Op.getOperand(0);
4227
4228 SDLoc dl(Op);
4229 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4230 if (!isRead &&
4231 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4232 // ARMv7 with MP extension has PLDW.
4233 return Op.getOperand(0);
4234
4235 unsigned isData = Op.getConstantOperandVal(4);
4236 if (Subtarget->isThumb()) {
4237 // Invert the bits.
4238 isRead = ~isRead & 1;
4239 isData = ~isData & 1;
4240 }
4241
4242 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4243 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4244 DAG.getConstant(isData, dl, MVT::i32));
4245}
4246
4249 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4250
4251 // vastart just stores the address of the VarArgsFrameIndex slot into the
4252 // memory location argument.
4253 SDLoc dl(Op);
4255 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4256 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4257 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4258 MachinePointerInfo(SV));
4259}
4260
4261SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4262 CCValAssign &NextVA,
4263 SDValue &Root,
4264 SelectionDAG &DAG,
4265 const SDLoc &dl) const {
4266 MachineFunction &MF = DAG.getMachineFunction();
4267 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4268
4269 const TargetRegisterClass *RC;
4270 if (AFI->isThumb1OnlyFunction())
4271 RC = &ARM::tGPRRegClass;
4272 else
4273 RC = &ARM::GPRRegClass;
4274
4275 // Transform the arguments stored in physical registers into virtual ones.
4276 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4277 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4278
4279 SDValue ArgValue2;
4280 if (NextVA.isMemLoc()) {
4281 MachineFrameInfo &MFI = MF.getFrameInfo();
4282 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4283
4284 // Create load node to retrieve arguments from the stack.
4285 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4286 ArgValue2 = DAG.getLoad(
4287 MVT::i32, dl, Root, FIN,
4289 } else {
4290 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4291 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4292 }
4293 if (!Subtarget->isLittle())
4294 std::swap (ArgValue, ArgValue2);
4295 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4296}
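// Example (a sketch): an f64 argument split between r2 and r3, or between r3
// and a stack slot, is reassembled with ARMISD::VMOVDRR; on big-endian targets
// the two 32-bit halves are swapped first so the register order matches the
// in-memory layout.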
4297
4298// The remaining GPRs hold either the beginning of variable-argument
4299// data, or the beginning of an aggregate passed by value (usually
4300// byval). Either way, we allocate stack slots adjacent to the data
4301// provided by our caller, and store the unallocated registers there.
4302// If this is a variadic function, the va_list pointer will begin with
4303// these values; otherwise, this reassembles a (byval) structure that
4304// was split between registers and memory.
4305 // Return: the frame index that the registers were stored into.
4306int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4307 const SDLoc &dl, SDValue &Chain,
4308 const Value *OrigArg,
4309 unsigned InRegsParamRecordIdx,
4310 int ArgOffset, unsigned ArgSize) const {
4311 // Currently, two use-cases are possible:
4312 // Case #1. Non-var-args function, and we meet the first byval parameter.
4313 // Set up the first unallocated register as the first byval register;
4314 // eat all remaining registers
4315 // (these two actions are performed by the HandleByVal method).
4316 // Then, here, we initialize the stack frame with
4317 // "store-reg" instructions.
4318 // Case #2. Var-args function that doesn't contain byval parameters.
4319 // The same: eat all remaining unallocated registers,
4320 // then initialize the stack frame.
4321
4322 MachineFunction &MF = DAG.getMachineFunction();
4323 MachineFrameInfo &MFI = MF.getFrameInfo();
4324 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4325 unsigned RBegin, REnd;
4326 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4327 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4328 } else {
4329 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4330 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4331 REnd = ARM::R4;
4332 }
4333
4334 if (REnd != RBegin)
4335 ArgOffset = -4 * (ARM::R4 - RBegin);
4336
4337 auto PtrVT = getPointerTy(DAG.getDataLayout());
4338 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4339 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4340
4342 const TargetRegisterClass *RC =
4343 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4344
4345 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4346 Register VReg = MF.addLiveIn(Reg, RC);
4347 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4348 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4349 MachinePointerInfo(OrigArg, 4 * i));
4350 MemOps.push_back(Store);
4351 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4352 }
4353
4354 if (!MemOps.empty())
4355 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4356 return FrameIndex;
4357}
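// Worked example (a sketch): a 12-byte byval argument split as r2+r3 in
// registers plus 4 bytes on the stack gives RBegin = R2, REnd = R4, so
// ArgOffset becomes -4 * (R4 - R2) = -8. The 12-byte fixed object then starts
// 8 bytes below the first stack argument, the r2/r3 stores fill its first 8
// bytes, and the caller-provided 4 bytes line up immediately after them.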
4358
4359 // Set up the stack frame that the va_list pointer will start from.
4360void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4361 const SDLoc &dl, SDValue &Chain,
4362 unsigned ArgOffset,
4363 unsigned TotalArgRegsSaveSize,
4364 bool ForceMutable) const {
4365 MachineFunction &MF = DAG.getMachineFunction();
4366 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4367
4368 // Try to store any remaining integer argument regs
4369 // to their spots on the stack so that they may be loaded by dereferencing
4370 // the result of va_next.
4371 // If there are no regs to be stored, just point the address after the last
4372 // argument passed via the stack.
4373 int FrameIndex = StoreByValRegs(
4374 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4375 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4376 AFI->setVarArgsFrameIndex(FrameIndex);
4377}
4378
4379bool ARMTargetLowering::splitValueIntoRegisterParts(
4380 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4381 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4382 EVT ValueVT = Val.getValueType();
4383 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4384 unsigned ValueBits = ValueVT.getSizeInBits();
4385 unsigned PartBits = PartVT.getSizeInBits();
4386 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4387 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4388 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4389 Parts[0] = Val;
4390 return true;
4391 }
4392 return false;
4393}
4394
4395SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4396 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4397 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4398 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4399 unsigned ValueBits = ValueVT.getSizeInBits();
4400 unsigned PartBits = PartVT.getSizeInBits();
4401 SDValue Val = Parts[0];
4402
4403 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4404 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4405 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4406 return Val;
4407 }
4408 return SDValue();
4409}
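// Example (a sketch): an f16 value passed in an f32 register under the hard
// float ABI is recovered here by bitcasting f32 -> i32, truncating to i16 and
// bitcasting to f16; splitValueIntoRegisterParts above performs the inverse
// bitcast / any-extend / bitcast sequence.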
4410
4411SDValue ARMTargetLowering::LowerFormalArguments(
4412 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4413 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4414 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4415 MachineFunction &MF = DAG.getMachineFunction();
4416 MachineFrameInfo &MFI = MF.getFrameInfo();
4417
4418 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4419
4420 // Assign locations to all of the incoming arguments.
4422 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4423 *DAG.getContext());
4424 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4425
4427 unsigned CurArgIdx = 0;
4428
4429 // Initially ArgRegsSaveSize is zero.
4430 // Then we increase this value each time we meet a byval parameter.
4431 // We also increase this value in the case of a varargs function.
4432 AFI->setArgRegsSaveSize(0);
4433
4434 // Calculate the amount of stack space that we need to allocate to store
4435 // byval and variadic arguments that are passed in registers.
4436 // We need to know this before we allocate the first byval or variadic
4437 // argument, as they will be allocated a stack slot below the CFA (Canonical
4438 // Frame Address, the stack pointer at entry to the function).
4439 unsigned ArgRegBegin = ARM::R4;
4440 for (const CCValAssign &VA : ArgLocs) {
4441 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4442 break;
4443
4444 unsigned Index = VA.getValNo();
4445 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4446 if (!Flags.isByVal())
4447 continue;
4448
4449 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4450 unsigned RBegin, REnd;
4451 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4452 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4453
4454 CCInfo.nextInRegsParam();
4455 }
4456 CCInfo.rewindByValRegsInfo();
4457
4458 int lastInsIndex = -1;
4459 if (isVarArg && MFI.hasVAStart()) {
4460 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4461 if (RegIdx != std::size(GPRArgRegs))
4462 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4463 }
4464
4465 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4466 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4467 auto PtrVT = getPointerTy(DAG.getDataLayout());
4468
4469 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4470 CCValAssign &VA = ArgLocs[i];
4471 if (Ins[VA.getValNo()].isOrigArg()) {
4472 std::advance(CurOrigArg,
4473 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4474 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4475 }
4476 // Arguments stored in registers.
4477 if (VA.isRegLoc()) {
4478 EVT RegVT = VA.getLocVT();
4479 SDValue ArgValue;
4480
4481 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4482 // f64 and vector types are split up into multiple registers or
4483 // combinations of registers and stack slots.
4484 SDValue ArgValue1 =
4485 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4486 VA = ArgLocs[++i]; // skip ahead to next loc
4487 SDValue ArgValue2;
4488 if (VA.isMemLoc()) {
4489 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4490 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4491 ArgValue2 = DAG.getLoad(
4492 MVT::f64, dl, Chain, FIN,
4494 } else {
4495 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4496 }
4497 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4498 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4499 ArgValue1, DAG.getIntPtrConstant(0, dl));
4500 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4501 ArgValue2, DAG.getIntPtrConstant(1, dl));
4502 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4503 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4504 } else {
4505 const TargetRegisterClass *RC;
4506
4507 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4508 RC = &ARM::HPRRegClass;
4509 else if (RegVT == MVT::f32)
4510 RC = &ARM::SPRRegClass;
4511 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4512 RegVT == MVT::v4bf16)
4513 RC = &ARM::DPRRegClass;
4514 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4515 RegVT == MVT::v8bf16)
4516 RC = &ARM::QPRRegClass;
4517 else if (RegVT == MVT::i32)
4518 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4519 : &ARM::GPRRegClass;
4520 else
4521 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4522
4523 // Transform the arguments in physical registers into virtual ones.
4524 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4525 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4526
4527 // If this value is passed in r0 and has the returned attribute (e.g.
4528 // C++ 'structors), record this fact for later use.
4529 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4530 AFI->setPreservesR0();
4531 }
4532 }
4533
4534 // If this is an 8 or 16-bit value, it is really passed promoted
4535 // to 32 bits. Insert an assert[sz]ext to capture this, then
4536 // truncate to the right size.
4537 switch (VA.getLocInfo()) {
4538 default: llvm_unreachable("Unknown loc info!");
4539 case CCValAssign::Full: break;
4540 case CCValAssign::BCvt:
4541 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4542 break;
4543 }
4544
4545 // f16 arguments have their size extended to 4 bytes and are passed as if they
4546 // had been copied to the LSBs of a 32-bit register.
4547 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4548 if (VA.needsCustom() &&
4549 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4550 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4551
4552 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4553 // less than 32 bits must be sign- or zero-extended in the callee for
4554 // security reasons. Although the ABI mandates an extension done by the
4555 // caller, the latter cannot be trusted to follow the rules of the ABI.
4556 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4557 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4558 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4559 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4560
4561 InVals.push_back(ArgValue);
4562 } else { // VA.isRegLoc()
4563 // Only arguments passed on the stack should make it here.
4564 assert(VA.isMemLoc());
4565 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4566
4567 int index = VA.getValNo();
4568
4569 // Some Ins[] entries become multiple ArgLoc[] entries.
4570 // Process them only once.
4571 if (index != lastInsIndex)
4572 {
4573 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4574 // FIXME: For now, all byval parameter objects are marked mutable.
4575 // This can be changed with more analysis.
4576 // In case of tail call optimization, mark all arguments mutable, since
4577 // they could be overwritten by the lowering of arguments in case of
4578 // a tail call.
4579 if (Flags.isByVal()) {
4580 assert(Ins[index].isOrigArg() &&
4581 "Byval arguments cannot be implicit");
4582 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4583
4584 int FrameIndex = StoreByValRegs(
4585 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4586 VA.getLocMemOffset(), Flags.getByValSize());
4587 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4588 CCInfo.nextInRegsParam();
4589 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4590 VA.getValVT() == MVT::bf16)) {
4591 // f16 and bf16 values are passed in the least-significant half of
4592 // a 4 byte stack slot. This is done as if the extension had been done
4593 // in a 32-bit register, so the actual bytes used for the value
4594 // differ between little and big endian.
4595 assert(VA.getLocVT().getSizeInBits() == 32);
4596 unsigned FIOffset = VA.getLocMemOffset();
4597 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4598 FIOffset, true);
4599
4600 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4601 if (DAG.getDataLayout().isBigEndian())
4602 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4603
4604 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4606 DAG.getMachineFunction(), FI)));
4607
4608 } else {
4609 unsigned FIOffset = VA.getLocMemOffset();
4610 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4611 FIOffset, true);
4612
4613 // Create load nodes to retrieve arguments from the stack.
4614 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4615 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4617 DAG.getMachineFunction(), FI)));
4618 }
4619 lastInsIndex = index;
4620 }
4621 }
4622 }
4623
4624 // varargs
4625 if (isVarArg && MFI.hasVAStart()) {
4626 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4627 TotalArgRegsSaveSize);
4628 if (AFI->isCmseNSEntryFunction()) {
4629 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4631 "secure entry function must not be variadic", dl.getDebugLoc()));
4632 }
4633 }
4634
4635 unsigned StackArgSize = CCInfo.getStackSize();
4636 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4637 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4638 // The only way to guarantee a tail call is if the callee restores its
4639 // argument area, but it must also keep the stack aligned when doing so.
4640 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4641 assert(StackAlign && "data layout string is missing stack alignment");
4642 StackArgSize = alignTo(StackArgSize, *StackAlign);
4643
4644 AFI->setArgumentStackToRestore(StackArgSize);
4645 }
4646 AFI->setArgumentStackSize(StackArgSize);
4647
4648 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4649 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4651 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4652 }
4653
4654 return Chain;
4655}
4656
4657/// isFloatingPointZero - Return true if this is +0.0.
4660 return CFP->getValueAPF().isPosZero();
4661 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4662 // Maybe this has already been legalized into the constant pool?
4663 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4664 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4666 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4667 return CFP->getValueAPF().isPosZero();
4668 }
4669 } else if (Op->getOpcode() == ISD::BITCAST &&
4670 Op->getValueType(0) == MVT::f64) {
4671 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4672 // created by LowerConstantFP().
4673 SDValue BitcastOp = Op->getOperand(0);
4674 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4675 isNullConstant(BitcastOp->getOperand(0)))
4676 return true;
4677 }
4678 return false;
4679}
4680
4681 /// Returns the appropriate ARM CMP (cmp) and corresponding condition code for
4682/// the given operands.
4683SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4684 SDValue &ARMcc, SelectionDAG &DAG,
4685 const SDLoc &dl) const {
4686 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4687 unsigned C = RHSC->getZExtValue();
4688 if (!isLegalICmpImmediate((int32_t)C)) {
4689 // Constant does not fit, try adjusting it by one.
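        // For example (a sketch): "x <u 0x101" has an RHS that is not a valid
        // modified immediate, but rewriting it as "x <=u 0x100" (SETULE with
        // C - 1) yields an RHS the cmp instruction can encode directly.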
4690 switch (CC) {
4691 default: break;
4692 case ISD::SETLT:
4693 case ISD::SETGE:
4694 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4695 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4696 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4697 }
4698 break;
4699 case ISD::SETULT:
4700 case ISD::SETUGE:
4701 if (C != 0 && isLegalICmpImmediate(C-1)) {
4702 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4703 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4704 }
4705 break;
4706 case ISD::SETLE:
4707 case ISD::SETGT:
4708 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4709 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4710 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4711 }
4712 break;
4713 case ISD::SETULE:
4714 case ISD::SETUGT:
4715 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4716 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4717 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4718 }
4719 break;
4720 }
4721 }
4722 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4724 // In ARM and Thumb-2, the compare instructions can shift their second
4725 // operand.
4727 std::swap(LHS, RHS);
4728 }
4729
4730 // Thumb1 has very limited immediate modes, so turning an "and" into a
4731 // shift can save multiple instructions.
4732 //
4733 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4734 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4735 // own. If it's the operand to an unsigned comparison with an immediate,
4736 // we can eliminate one of the shifts: we transform
4737 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4738 //
4739 // We avoid transforming cases which aren't profitable due to encoding
4740 // details:
4741 //
4742 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4743 // would not; in that case, we're essentially trading one immediate load for
4744 // another.
4745 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4746 // 3. C2 is zero; we have other code for this special case.
4747 //
4748 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4749 // instruction, since the AND is always one instruction anyway, but we could
4750 // use narrow instructions in some cases.
4751 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4752 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4753 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4754 !isSignedIntSetCC(CC)) {
4755 unsigned Mask = LHS.getConstantOperandVal(1);
4756 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4757 uint64_t RHSV = RHSC->getZExtValue();
4758 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4759 unsigned ShiftBits = llvm::countl_zero(Mask);
4760 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4761 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4762 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4763 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4764 }
4765 }
4766 }
4767
4768 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4769 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4770 // way a cmp would.
4771 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4772 // some tweaks to the heuristics for the previous and->shift transform.
4773 // FIXME: Optimize cases where the LHS isn't a shift.
4774 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4775 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4776 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4777 LHS.getConstantOperandVal(1) < 31) {
4778 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4779 SDValue Shift =
4780 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4781 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4782 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4783 return Shift.getValue(1);
4784 }
4785
4787
4788 // If the RHS is a constant zero then the V (overflow) flag will never be
4789 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4790 // simpler for other passes (like the peephole optimiser) to deal with.
4791 if (isNullConstant(RHS)) {
4792 switch (CondCode) {
4793 default: break;
4794 case ARMCC::GE:
4796 break;
4797 case ARMCC::LT:
4799 break;
4800 }
4801 }
4802
4803 ARMISD::NodeType CompareType;
4804 switch (CondCode) {
4805 default:
4806 CompareType = ARMISD::CMP;
4807 break;
4808 case ARMCC::EQ:
4809 case ARMCC::NE:
4810 // Uses only Z Flag
4811 CompareType = ARMISD::CMPZ;
4812 break;
4813 }
4814 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4815 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4816}
4817
4818 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4819SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4820 SelectionDAG &DAG, const SDLoc &dl,
4821 bool Signaling) const {
4822 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4823 SDValue Flags;
4825 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4826 LHS, RHS);
4827 else
4828 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4829 FlagsVT, LHS);
4830 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4831}
4832
4833// This function returns three things: the arithmetic computation itself
4834// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4835// comparison and the condition code define the case in which the arithmetic
4836// computation *does not* overflow.
4837std::pair<SDValue, SDValue>
4838ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4839 SDValue &ARMcc) const {
4840 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4841
4842 SDValue Value, OverflowCmp;
4843 SDValue LHS = Op.getOperand(0);
4844 SDValue RHS = Op.getOperand(1);
4845 SDLoc dl(Op);
4846
4847 // FIXME: We are currently always generating CMPs because we don't support
4848 // generating CMN through the backend. This is not as good as the natural
4849 // CMP case because it causes a register dependency and cannot be folded
4850 // later.
4851
4852 switch (Op.getOpcode()) {
4853 default:
4854 llvm_unreachable("Unknown overflow instruction!");
4855 case ISD::SADDO:
4856 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4857 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4858 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4859 break;
4860 case ISD::UADDO:
4861 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4862 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4863 // We do not use it in the USUBO case as Value may not be used.
4864 Value = DAG.getNode(ARMISD::ADDC, dl,
4865 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4866 .getValue(0);
4867 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4868 break;
4869 case ISD::SSUBO:
4870 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4871 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4872 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4873 break;
4874 case ISD::USUBO:
4875 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4876 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4877 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4878 break;
4879 case ISD::UMULO:
4880 // We generate a UMUL_LOHI and then check if the high word is 0.
4881 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4882 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4883 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4884 LHS, RHS);
4885 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4886 DAG.getConstant(0, dl, MVT::i32));
4887 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4888 break;
4889 case ISD::SMULO:
4890 // We generate a SMUL_LOHI and then check if all the bits of the high word
4891 // are the same as the sign bit of the low word.
4892 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4893 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4894 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4895 LHS, RHS);
4896 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4897 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4898 Value.getValue(0),
4899 DAG.getConstant(31, dl, MVT::i32)));
4900 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4901 break;
4902 } // switch (...)
4903
4904 return std::make_pair(Value, OverflowCmp);
4905}
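// Example (a sketch) for ISD::SADDO: Value = (add LHS, RHS), OverflowCmp =
// (ARMISD::CMP Value, LHS) and ARMcc = VC, i.e. the addition did not overflow
// exactly when subtracting LHS back out of the sum leaves the V flag clear.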
4906
4907SDValue
4908ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4909 // Let legalize expand this if it isn't a legal type yet.
4910 if (!isTypeLegal(Op.getValueType()))
4911 return SDValue();
4912
4913 SDValue Value, OverflowCmp;
4914 SDValue ARMcc;
4915 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4916 SDLoc dl(Op);
4917 // We use 0 and 1 as false and true values.
4918 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4919 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4920 EVT VT = Op.getValueType();
4921
4922 SDValue Overflow =
4923 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4924
4925 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4926 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4927}
4928
4930 SelectionDAG &DAG) {
4931 SDLoc DL(BoolCarry);
4932 EVT CarryVT = BoolCarry.getValueType();
4933
4934 // This converts the boolean value carry into the carry flag by doing
4935 // ARMISD::SUBC Carry, 1
4936 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4937 DAG.getVTList(CarryVT, MVT::i32),
4938 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4939 return Carry.getValue(1);
4940}
4941
4942 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4943 SelectionDAG &DAG) {
4944 SDLoc DL(Flags);
4945
4946 // Now convert the carry flag into a boolean carry. We do this
4947 // using ARMISD:ADDE 0, 0, Carry
4948 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4949 DAG.getConstant(0, DL, MVT::i32),
4950 DAG.getConstant(0, DL, MVT::i32), Flags);
4951}
4952
4953SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4954 SelectionDAG &DAG) const {
4955 // Let legalize expand this if it isn't a legal type yet.
4956 if (!isTypeLegal(Op.getValueType()))
4957 return SDValue();
4958
4959 SDValue LHS = Op.getOperand(0);
4960 SDValue RHS = Op.getOperand(1);
4961 SDLoc dl(Op);
4962
4963 EVT VT = Op.getValueType();
4964 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4965 SDValue Value;
4966 SDValue Overflow;
4967 switch (Op.getOpcode()) {
4968 default:
4969 llvm_unreachable("Unknown overflow instruction!");
4970 case ISD::UADDO:
4971 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4972 // Convert the carry flag into a boolean value.
4973 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4974 break;
4975 case ISD::USUBO: {
4976 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4977 // Convert the carry flag into a boolean value.
4978 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4979 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4980 // value. So compute 1 - C.
4981 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4982 DAG.getConstant(1, dl, MVT::i32), Overflow);
4983 break;
4984 }
4985 }
4986
4987 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4988}
4989
4990 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4991 const ARMSubtarget *Subtarget) {
4992 EVT VT = Op.getValueType();
4993 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4994 return SDValue();
4995 if (!VT.isSimple())
4996 return SDValue();
4997
4998 unsigned NewOpcode;
4999 switch (VT.getSimpleVT().SimpleTy) {
5000 default:
5001 return SDValue();
5002 case MVT::i8:
5003 switch (Op->getOpcode()) {
5004 case ISD::UADDSAT:
5005 NewOpcode = ARMISD::UQADD8b;
5006 break;
5007 case ISD::SADDSAT:
5008 NewOpcode = ARMISD::QADD8b;
5009 break;
5010 case ISD::USUBSAT:
5011 NewOpcode = ARMISD::UQSUB8b;
5012 break;
5013 case ISD::SSUBSAT:
5014 NewOpcode = ARMISD::QSUB8b;
5015 break;
5016 }
5017 break;
5018 case MVT::i16:
5019 switch (Op->getOpcode()) {
5020 case ISD::UADDSAT:
5021 NewOpcode = ARMISD::UQADD16b;
5022 break;
5023 case ISD::SADDSAT:
5024 NewOpcode = ARMISD::QADD16b;
5025 break;
5026 case ISD::USUBSAT:
5027 NewOpcode = ARMISD::UQSUB16b;
5028 break;
5029 case ISD::SSUBSAT:
5030 NewOpcode = ARMISD::QSUB16b;
5031 break;
5032 }
5033 break;
5034 }
5035
5036 SDLoc dl(Op);
5037 SDValue Add =
5038 DAG.getNode(NewOpcode, dl, MVT::i32,
5039 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5040 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5041 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5042}
5043
5044SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5045 SDValue Cond = Op.getOperand(0);
5046 SDValue SelectTrue = Op.getOperand(1);
5047 SDValue SelectFalse = Op.getOperand(2);
5048 SDLoc dl(Op);
5049 unsigned Opc = Cond.getOpcode();
5050
5051 if (Cond.getResNo() == 1 &&
5052 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5053 Opc == ISD::USUBO)) {
5054 if (!isTypeLegal(Cond->getValueType(0)))
5055 return SDValue();
5056
5057 SDValue Value, OverflowCmp;
5058 SDValue ARMcc;
5059 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5060 EVT VT = Op.getValueType();
5061
5062 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
5063 }
5064
5065 // Convert:
5066 //
5067 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5068 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5069 //
5070 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5071 const ConstantSDNode *CMOVTrue =
5072 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5073 const ConstantSDNode *CMOVFalse =
5074 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5075
5076 if (CMOVTrue && CMOVFalse) {
5077 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5078 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5079
5080 SDValue True;
5081 SDValue False;
5082 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5083 True = SelectTrue;
5084 False = SelectFalse;
5085 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5086 True = SelectFalse;
5087 False = SelectTrue;
5088 }
5089
5090 if (True.getNode() && False.getNode())
5091 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
5092 Cond.getOperand(3), DAG);
5093 }
5094 }
5095
5096 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5097 // undefined bits before doing a full-word comparison with zero.
5098 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5099 DAG.getConstant(1, dl, Cond.getValueType()));
5100
5101 return DAG.getSelectCC(dl, Cond,
5102 DAG.getConstant(0, dl, Cond.getValueType()),
5103 SelectTrue, SelectFalse, ISD::SETNE);
5104}
5105
5106 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5107 bool &swpCmpOps, bool &swpVselOps) {
5108 // Start by selecting the GE condition code for opcodes that return true for
5109 // 'equality'
5110 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5111 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5112 CondCode = ARMCC::GE;
5113
5114 // and GT for opcodes that return false for 'equality'.
5115 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5116 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5117 CondCode = ARMCC::GT;
5118
5119 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5120 // to swap the compare operands.
5121 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5122 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5123 swpCmpOps = true;
5124
5125 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5126 // If we have an unordered opcode, we need to swap the operands to the VSEL
5127 // instruction (effectively negating the condition).
5128 //
5129 // This also has the effect of swapping which one of 'less' or 'greater'
5130 // returns true, so we also swap the compare operands. It also switches
5131 // whether we return true for 'equality', so we compensate by picking the
5132 // opposite condition code to our original choice.
5133 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5134 CC == ISD::SETUGT) {
5135 swpCmpOps = !swpCmpOps;
5136 swpVselOps = !swpVselOps;
5137 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5138 }
5139
5140 // 'ordered' is 'anything but unordered', so use the VS condition code and
5141 // swap the VSEL operands.
5142 if (CC == ISD::SETO) {
5143 CondCode = ARMCC::VS;
5144 swpVselOps = true;
5145 }
5146
5147 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5148 // code and swap the VSEL operands. Also do this if we don't care about the
5149 // unordered case.
5150 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5151 CondCode = ARMCC::EQ;
5152 swpVselOps = true;
5153 }
5154}
5155
5156SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5157 SDValue TrueVal, SDValue ARMcc,
5158 SDValue Flags, SelectionDAG &DAG) const {
5159 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5160 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5161 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5162 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5163 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5164
5165 SDValue TrueLow = TrueVal.getValue(0);
5166 SDValue TrueHigh = TrueVal.getValue(1);
5167 SDValue FalseLow = FalseVal.getValue(0);
5168 SDValue FalseHigh = FalseVal.getValue(1);
5169
5170 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5171 ARMcc, Flags);
5172 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5173 ARMcc, Flags);
5174
5175 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5176 }
5177 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5178}
5179
5180static bool isGTorGE(ISD::CondCode CC) {
5181 return CC == ISD::SETGT || CC == ISD::SETGE;
5182}
5183
5184static bool isLTorLE(ISD::CondCode CC) {
5185 return CC == ISD::SETLT || CC == ISD::SETLE;
5186}
5187
5188// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5189// All of these conditions (and their <= and >= counterparts) will do:
5190// x < k ? k : x
5191// x > k ? x : k
5192// k < x ? x : k
5193// k > x ? k : x
5194static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5195 const SDValue TrueVal, const SDValue FalseVal,
5196 const ISD::CondCode CC, const SDValue K) {
5197 return (isGTorGE(CC) &&
5198 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5199 (isLTorLE(CC) &&
5200 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5201}
5202
5203// Check if two chained conditionals could be converted into SSAT or USAT.
5204//
5205// SSAT can replace a set of two conditional selectors that bound a number to an
5206// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5207//
5208// x < -k ? -k : (x > k ? k : x)
5209// x < -k ? -k : (x < k ? x : k)
5210// x > -k ? (x > k ? k : x) : -k
5211// x < k ? (x < -k ? -k : x) : k
5212// etc.
5213//
5214// LLVM canonicalizes these to either a min(max()) or a max(min())
5215// pattern. This function tries to match one of these and will return a SSAT
5216// node if successful.
5217//
5218 // USAT works similarly to SSAT but bounds the value to the interval [0, k], where
5219 // k + 1 is a power of 2.
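// For example, clamping x to [-8, 7] matches the signed case (k == 7,
// k + 1 == 8 is a power of 2, and -8 == ~7), while clamping x to [0, 7]
// matches the unsigned case.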
5220 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5221 EVT VT = Op.getValueType();
5222 SDValue V1 = Op.getOperand(0);
5223 SDValue K1 = Op.getOperand(1);
5224 SDValue TrueVal1 = Op.getOperand(2);
5225 SDValue FalseVal1 = Op.getOperand(3);
5226 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5227
5228 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5229 if (Op2.getOpcode() != ISD::SELECT_CC)
5230 return SDValue();
5231
5232 SDValue V2 = Op2.getOperand(0);
5233 SDValue K2 = Op2.getOperand(1);
5234 SDValue TrueVal2 = Op2.getOperand(2);
5235 SDValue FalseVal2 = Op2.getOperand(3);
5236 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5237
5238 SDValue V1Tmp = V1;
5239 SDValue V2Tmp = V2;
5240
5241 // Check that the registers and the constants match a max(min()) or min(max())
5242 // pattern
5243 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5244 K2 != FalseVal2 ||
5245 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5246 return SDValue();
5247
5248 // Check that the constant in the lower-bound check is
5249 // the opposite of the constant in the upper-bound check
5250 // in 1's complement.
5251 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5252 return SDValue();
5253
5254 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5255 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5256 int64_t PosVal = std::max(Val1, Val2);
5257 int64_t NegVal = std::min(Val1, Val2);
5258
5259 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5260 !isPowerOf2_64(PosVal + 1))
5261 return SDValue();
5262
5263 // Handle the difference between USAT (unsigned) and SSAT (signed)
5264 // saturation
5265 // At this point, PosVal is guaranteed to be positive
5266 uint64_t K = PosVal;
5267 SDLoc dl(Op);
5268 if (Val1 == ~Val2)
5269 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5270 DAG.getConstant(llvm::countr_one(K), dl, VT));
5271 if (NegVal == 0)
5272 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5273 DAG.getConstant(llvm::countr_one(K), dl, VT));
5274
5275 return SDValue();
5276}
5277
5278// Check if a condition of the type x < k ? k : x can be converted into a
5279// bit operation instead of conditional moves.
5280// Currently this is allowed given:
5281// - The conditions and values match up
5282// - k is 0 or -1 (all ones)
5283 // This function will not check the last condition, that's up to the caller.
5284// It returns true if the transformation can be made, and in such case
5285// returns x in V, and k in SatK.
5286 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5287 SDValue &SatK)
5288{
5289 SDValue LHS = Op.getOperand(0);
5290 SDValue RHS = Op.getOperand(1);
5291 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5292 SDValue TrueVal = Op.getOperand(2);
5293 SDValue FalseVal = Op.getOperand(3);
5294
5295 const SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5296 ? &RHS
5297 : nullptr;
5298
5299 // No constant operand in the comparison, early out
5300 if (!K)
5301 return false;
5302
5303 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5304 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5305 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5306
5307 // If the constants on the left and right sides, or the variables on the left and
5308 // right, do not match, early out
5309 if (*K != KTmp || V != VTmp)
5310 return false;
5311
5312 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5313 SatK = *K;
5314 return true;
5315 }
5316
5317 return false;
5318}
5319
5320bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5321 if (VT == MVT::f32)
5322 return !Subtarget->hasVFP2Base();
5323 if (VT == MVT::f64)
5324 return !Subtarget->hasFP64();
5325 if (VT == MVT::f16)
5326 return !Subtarget->hasFullFP16();
5327 return false;
5328}
5329
5330SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5331 EVT VT = Op.getValueType();
5332 SDLoc dl(Op);
5333
5334 // Try to convert two saturating conditional selects into a single SSAT
5335 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5336 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5337 return SatValue;
5338
5339 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5340 // into more efficient bit operations, which is possible when k is 0 or -1
5341 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5342 // single instructions. On Thumb the shift and the bit operation will be two
5343 // instructions.
5344 // Only allow this transformation on full-width (32-bit) operations
5345 SDValue LowerSatConstant;
5346 SDValue SatValue;
5347 if (VT == MVT::i32 &&
5348 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5349 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5350 DAG.getConstant(31, dl, VT));
5351 if (isNullConstant(LowerSatConstant)) {
5352 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5353 DAG.getAllOnesConstant(dl, VT));
5354 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5355 } else if (isAllOnesConstant(LowerSatConstant))
5356 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5357 }
5358
5359 SDValue LHS = Op.getOperand(0);
5360 SDValue RHS = Op.getOperand(1);
5361 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5362 SDValue TrueVal = Op.getOperand(2);
5363 SDValue FalseVal = Op.getOperand(3);
5364 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5365 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5366 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5367 if (Op.getValueType().isInteger()) {
5368
5369 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5370 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5371 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5372 // Both require less instructions than compare and conditional select.
5373 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5374 RHSC->isZero() && CFVal && CFVal->isZero() &&
5375 LHS.getValueType() == RHS.getValueType()) {
5376 EVT VT = LHS.getValueType();
5377 SDValue Shift =
5378 DAG.getNode(ISD::SRA, dl, VT, LHS,
5379 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5380
5381 if (CC == ISD::SETGT)
5382 Shift = DAG.getNOT(dl, Shift, VT);
5383
5384 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5385 }
5386 }
5387
5388 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5389 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5390 unsigned TVal = CTVal->getZExtValue();
5391 unsigned FVal = CFVal->getZExtValue();
5392 unsigned Opcode = 0;
5393
5394 if (TVal == ~FVal) {
5395 Opcode = ARMISD::CSINV;
5396 } else if (TVal == ~FVal + 1) {
5397 Opcode = ARMISD::CSNEG;
5398 } else if (TVal + 1 == FVal) {
5399 Opcode = ARMISD::CSINC;
5400 } else if (TVal == FVal + 1) {
5401 Opcode = ARMISD::CSINC;
5402 std::swap(TrueVal, FalseVal);
5403 std::swap(TVal, FVal);
5404 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5405 }
5406
5407 if (Opcode) {
5408 // If one of the constants is cheaper than another, materialise the
5409 // cheaper one and let the csel generate the other.
5410 if (Opcode != ARMISD::CSINC &&
5411 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5412 std::swap(TrueVal, FalseVal);
5413 std::swap(TVal, FVal);
5414 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5415 }
5416
5417 // Attempt to use ZR for TVal, possibly inverting the condition to get there.
5418 // CSINC is not invertible like the other two (~(~a) == a and -(-a) == a,
5419 // but (a+1)+1 != a).
5420 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5421 std::swap(TrueVal, FalseVal);
5422 std::swap(TVal, FVal);
5423 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5424 }
5425
5426 // Drops F's value because we can get it by inverting/negating TVal.
5427 FalseVal = TrueVal;
5428
5429 SDValue ARMcc;
5430 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5431 EVT VT = TrueVal.getValueType();
5432 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5433 }
5434 }
5435
5436 if (isUnsupportedFloatingType(LHS.getValueType())) {
5437 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5438
5439 // If softenSetCCOperands only returned one value, we should compare it to
5440 // zero.
5441 if (!RHS.getNode()) {
5442 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5443 CC = ISD::SETNE;
5444 }
5445 }
5446
5447 if (LHS.getValueType() == MVT::i32) {
5448 // Try to generate VSEL on ARMv8.
5449 // The VSEL instruction can't use all the usual ARM condition
5450 // codes: it only has two bits to select the condition code, so it's
5451 // constrained to use only GE, GT, VS and EQ.
5452 //
5453 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5454 // swap the operands of the previous compare instruction (effectively
5455 // inverting the compare condition, swapping 'less' and 'greater') and
5456 // sometimes need to swap the operands to the VSEL (which inverts the
5457 // condition in the sense of firing whenever the previous condition didn't)
5458 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5459 TrueVal.getValueType() == MVT::f32 ||
5460 TrueVal.getValueType() == MVT::f64)) {
5461 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5462 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5463 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5464 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5465 std::swap(TrueVal, FalseVal);
5466 }
5467 }
5468
5469 SDValue ARMcc;
5470 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5471 // Choose GE over PL, which vsel does not support
5472 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5473 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5474 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5475 }
5476
5477 ARMCC::CondCodes CondCode, CondCode2;
5478 FPCCToARMCC(CC, CondCode, CondCode2);
5479
5480 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5481 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5482 // must use VSEL (limited condition codes), due to not having conditional f16
5483 // moves.
5484 if (Subtarget->hasFPARMv8Base() &&
5485 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5486 (TrueVal.getValueType() == MVT::f16 ||
5487 TrueVal.getValueType() == MVT::f32 ||
5488 TrueVal.getValueType() == MVT::f64)) {
5489 bool swpCmpOps = false;
5490 bool swpVselOps = false;
5491 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5492
5493 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5494 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5495 if (swpCmpOps)
5496 std::swap(LHS, RHS);
5497 if (swpVselOps)
5498 std::swap(TrueVal, FalseVal);
5499 }
5500 }
5501
5502 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5503 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5504 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5505 if (CondCode2 != ARMCC::AL) {
5506 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5507 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5508 }
5509 return Result;
5510}
5511
5512/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5513/// to morph to an integer compare sequence.
5514static bool canChangeToInt(SDValue Op, bool &SeenZero,
5515 const ARMSubtarget *Subtarget) {
5516 SDNode *N = Op.getNode();
5517 if (!N->hasOneUse())
5518 // Otherwise it requires moving the value from fp to integer registers.
5519 return false;
5520 if (!N->getNumValues())
5521 return false;
5522 EVT VT = Op.getValueType();
5523 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5524 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5525 // vmrs are very slow, e.g. cortex-a8.
5526 return false;
5527
5528 if (isFloatingPointZero(Op)) {
5529 SeenZero = true;
5530 return true;
5531 }
5532 return ISD::isNormalLoad(N);
5533}
5534
5535 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5536 if (isFloatingPointZero(Op))
5537 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5538
5539 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5540 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5541 Ld->getPointerInfo(), Ld->getAlign(),
5542 Ld->getMemOperand()->getFlags());
5543
5544 llvm_unreachable("Unknown VFP cmp argument!");
5545}
5546
5547 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5548 SDValue &RetVal1, SDValue &RetVal2) {
5549 SDLoc dl(Op);
5550
5551 if (isFloatingPointZero(Op)) {
5552 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5553 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5554 return;
5555 }
5556
5557 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5558 SDValue Ptr = Ld->getBasePtr();
5559 RetVal1 =
5560 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5561 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5562
5563 EVT PtrType = Ptr.getValueType();
5564 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5565 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5566 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5567 Ld->getPointerInfo().getWithOffset(4),
5568 commonAlignment(Ld->getAlign(), 4),
5569 Ld->getMemOperand()->getFlags());
5570 return;
5571 }
5572
5573 llvm_unreachable("Unknown VFP cmp argument!");
5574}
5575
5576/// OptimizeVFPBrcond - With nnan, it's legal to optimize some
5577/// f32 and even f64 comparisons to integer ones.
5578SDValue
5579ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5580 SDValue Chain = Op.getOperand(0);
5581 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5582 SDValue LHS = Op.getOperand(2);
5583 SDValue RHS = Op.getOperand(3);
5584 SDValue Dest = Op.getOperand(4);
5585 SDLoc dl(Op);
5586
5587 bool LHSSeenZero = false;
5588 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5589 bool RHSSeenZero = false;
5590 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5591 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5592 // If unsafe fp math optimization is enabled and there are no other uses of
5593 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5594 // to an integer comparison.
5595 if (CC == ISD::SETOEQ)
5596 CC = ISD::SETEQ;
5597 else if (CC == ISD::SETUNE)
5598 CC = ISD::SETNE;
5599
5600 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5601 SDValue ARMcc;
5602 if (LHS.getValueType() == MVT::f32) {
5603 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5604 bitcastf32Toi32(LHS, DAG), Mask);
5605 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5606 bitcastf32Toi32(RHS, DAG), Mask);
5607 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5608 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5609 Cmp);
5610 }
5611
5612 SDValue LHS1, LHS2;
5613 SDValue RHS1, RHS2;
5614 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5615 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5616 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5617 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5618 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5619 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5620 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5621 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5622 }
5623
5624 return SDValue();
5625}
5626
5627// Generate CMP + CMOV for integer abs.
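// abs(x) is x < 0 ? -x : x: compare x against 0, then select the negated
// value under the MI (negative) condition.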
5628SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5629 SDLoc DL(Op);
5630
5631 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5632
5633 // Generate CMP & CMOV.
5634 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5635 DAG.getConstant(0, DL, MVT::i32));
5636 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5637 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5638}
5639
5640SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5641 SDValue Chain = Op.getOperand(0);
5642 SDValue Cond = Op.getOperand(1);
5643 SDValue Dest = Op.getOperand(2);
5644 SDLoc dl(Op);
5645
5646 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5647 // instruction.
5648 unsigned Opc = Cond.getOpcode();
5649 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5650 !Subtarget->isThumb1Only();
5651 if (Cond.getResNo() == 1 &&
5652 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5653 Opc == ISD::USUBO || OptimizeMul)) {
5654 // Only lower legal XALUO ops.
5655 if (!isTypeLegal(Cond->getValueType(0)))
5656 return SDValue();
5657
5658 // The actual operation with overflow check.
5659 SDValue Value, OverflowCmp;
5660 SDValue ARMcc;
5661 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5662
5663 // Reverse the condition code.
5664 ARMCC::CondCodes CondCode =
5665 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5666 CondCode = ARMCC::getOppositeCondition(CondCode);
5667 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5668
5669 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5670 OverflowCmp);
5671 }
5672
5673 return SDValue();
5674}
5675
5676SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5677 SDValue Chain = Op.getOperand(0);
5678 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5679 SDValue LHS = Op.getOperand(2);
5680 SDValue RHS = Op.getOperand(3);
5681 SDValue Dest = Op.getOperand(4);
5682 SDLoc dl(Op);
5683
5684 if (isUnsupportedFloatingType(LHS.getValueType())) {
5685 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5686
5687 // If softenSetCCOperands only returned one value, we should compare it to
5688 // zero.
5689 if (!RHS.getNode()) {
5690 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5691 CC = ISD::SETNE;
5692 }
5693 }
5694
5695 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5696 // instruction.
5697 unsigned Opc = LHS.getOpcode();
5698 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5699 !Subtarget->isThumb1Only();
5700 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5701 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5702 Opc == ISD::USUBO || OptimizeMul) &&
5703 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5704 // Only lower legal XALUO ops.
5705 if (!isTypeLegal(LHS->getValueType(0)))
5706 return SDValue();
5707
5708 // The actual operation with overflow check.
5709 SDValue Value, OverflowCmp;
5710 SDValue ARMcc;
5711 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5712
5713 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5714 // Reverse the condition code.
5715 ARMCC::CondCodes CondCode =
5716 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5717 CondCode = ARMCC::getOppositeCondition(CondCode);
5718 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5719 }
5720
5721 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5722 OverflowCmp);
5723 }
5724
5725 if (LHS.getValueType() == MVT::i32) {
5726 SDValue ARMcc;
5727 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5728 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5729 }
5730
5731 SDNodeFlags Flags = Op->getFlags();
5732 if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) &&
5733 (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5734 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) &&
5735 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5736 CC == ISD::SETUNE)) {
5737 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5738 return Result;
5739 }
5740
5741 ARMCC::CondCodes CondCode, CondCode2;
5742 FPCCToARMCC(CC, CondCode, CondCode2);
5743
5744 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5745 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5746 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5747 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5748 if (CondCode2 != ARMCC::AL) {
5749 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5750 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5751 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5752 }
5753 return Res;
5754}
5755
5756SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5757 SDValue Chain = Op.getOperand(0);
5758 SDValue Table = Op.getOperand(1);
5759 SDValue Index = Op.getOperand(2);
5760 SDLoc dl(Op);
5761
5762 EVT PTy = getPointerTy(DAG.getDataLayout());
5763 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5764 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5765 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5766 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5767 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5768 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5769 // Thumb2 and ARMv8-M use a two-level jump. That is, the branch jumps into the
5770 // jump table, which does another jump to the destination. This also makes it
5771 // easier to translate it to TBB / TBH later (Thumb2 only).
5772 // FIXME: This might not work if the function is extremely large.
5773 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5774 Addr, Op.getOperand(2), JTI);
5775 }
5776 if (isPositionIndependent() || Subtarget->isROPI()) {
5777 Addr =
5778 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5779 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5780 Chain = Addr.getValue(1);
5781 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5782 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5783 } else {
5784 Addr =
5785 DAG.getLoad(PTy, dl, Chain, Addr,
5786 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5787 Chain = Addr.getValue(1);
5788 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5789 }
5790}
5791
5792 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5793 EVT VT = Op.getValueType();
5794 SDLoc dl(Op);
5795
5796 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5797 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5798 return Op;
5799 return DAG.UnrollVectorOp(Op.getNode());
5800 }
5801
5802 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5803
5804 EVT NewTy;
5805 const EVT OpTy = Op.getOperand(0).getValueType();
5806 if (OpTy == MVT::v4f32)
5807 NewTy = MVT::v4i32;
5808 else if (OpTy == MVT::v4f16 && HasFullFP16)
5809 NewTy = MVT::v4i16;
5810 else if (OpTy == MVT::v8f16 && HasFullFP16)
5811 NewTy = MVT::v8i16;
5812 else
5813 llvm_unreachable("Invalid type for custom lowering!");
5814
5815 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5816 return DAG.UnrollVectorOp(Op.getNode());
5817
5818 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5819 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5820}
5821
5822SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5823 EVT VT = Op.getValueType();
5824 if (VT.isVector())
5825 return LowerVectorFP_TO_INT(Op, DAG);
5826
5827 bool IsStrict = Op->isStrictFPOpcode();
5828 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5829
5830 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5831 RTLIB::Libcall LC;
5832 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5833 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5834 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5835 Op.getValueType());
5836 else
5837 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5838 Op.getValueType());
5839 SDLoc Loc(Op);
5840 MakeLibCallOptions CallOptions;
5841 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5842 SDValue Result;
5843 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5844 CallOptions, Loc, Chain);
5845 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5846 }
5847
5848 // FIXME: Remove this when we have strict fp instruction selection patterns
5849 if (IsStrict) {
5850 SDLoc Loc(Op);
5851 SDValue Result =
5852 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5853 : ISD::FP_TO_UINT,
5854 Loc, Op.getValueType(), SrcVal);
5855 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5856 }
5857
5858 return Op;
5859}
5860
5861 static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5862 const ARMSubtarget *Subtarget) {
5863 EVT VT = Op.getValueType();
5864 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5865 EVT FromVT = Op.getOperand(0).getValueType();
5866
5867 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5868 return Op;
5869 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5870 Subtarget->hasFP64())
5871 return Op;
5872 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5873 Subtarget->hasFullFP16())
5874 return Op;
5875 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5876 Subtarget->hasMVEFloatOps())
5877 return Op;
5878 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5879 Subtarget->hasMVEFloatOps())
5880 return Op;
5881
5882 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5883 return SDValue();
5884
5885 SDLoc DL(Op);
5886 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5887 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
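// For example, saturating to i8: the signed case uses BW == 7 and clamps to
// [-128, 127]; the unsigned case uses BW == 8 and clamps to [0, 255].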
5888 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5889 DAG.getValueType(VT.getScalarType()));
5890 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5891 DAG.getConstant((1 << BW) - 1, DL, VT));
5892 if (IsSigned)
5893 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5894 DAG.getSignedConstant(-(1 << BW), DL, VT));
5895 return Max;
5896}
5897
5898 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5899 EVT VT = Op.getValueType();
5900 SDLoc dl(Op);
5901
5902 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5903 if (VT.getVectorElementType() == MVT::f32)
5904 return Op;
5905 return DAG.UnrollVectorOp(Op.getNode());
5906 }
5907
5908 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5909 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5910 "Invalid type for custom lowering!");
5911
5912 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5913
5914 EVT DestVecType;
5915 if (VT == MVT::v4f32)
5916 DestVecType = MVT::v4i32;
5917 else if (VT == MVT::v4f16 && HasFullFP16)
5918 DestVecType = MVT::v4i16;
5919 else if (VT == MVT::v8f16 && HasFullFP16)
5920 DestVecType = MVT::v8i16;
5921 else
5922 return DAG.UnrollVectorOp(Op.getNode());
5923
5924 unsigned CastOpc;
5925 unsigned Opc;
5926 switch (Op.getOpcode()) {
5927 default: llvm_unreachable("Invalid opcode!");
5928 case ISD::SINT_TO_FP:
5929 CastOpc = ISD::SIGN_EXTEND;
5930 Opc = ISD::SINT_TO_FP;
5931 break;
5932 case ISD::UINT_TO_FP:
5933 CastOpc = ISD::ZERO_EXTEND;
5934 Opc = ISD::UINT_TO_FP;
5935 break;
5936 }
5937
5938 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5939 return DAG.getNode(Opc, dl, VT, Op);
5940}
5941
5942SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5943 EVT VT = Op.getValueType();
5944 if (VT.isVector())
5945 return LowerVectorINT_TO_FP(Op, DAG);
5946 if (isUnsupportedFloatingType(VT)) {
5947 RTLIB::Libcall LC;
5948 if (Op.getOpcode() == ISD::SINT_TO_FP)
5949 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5950 Op.getValueType());
5951 else
5952 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5953 Op.getValueType());
5954 MakeLibCallOptions CallOptions;
5955 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5956 CallOptions, SDLoc(Op)).first;
5957 }
5958
5959 return Op;
5960}
5961
5962SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5963 // Implement fcopysign with a fabs and a conditional fneg.
5964 SDValue Tmp0 = Op.getOperand(0);
5965 SDValue Tmp1 = Op.getOperand(1);
5966 SDLoc dl(Op);
5967 EVT VT = Op.getValueType();
5968 EVT SrcVT = Tmp1.getValueType();
5969 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5970 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5971 bool UseNEON = !InGPR && Subtarget->hasNEON();
5972
5973 if (UseNEON) {
5974 // Use VBSL to copy the sign bit.
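// The DAG built below computes Res = (Tmp1 & Mask) | (Tmp0 & ~Mask), where
// Mask holds only the sign bit; instruction selection can match this as VBSL.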
5975 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5976 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5977 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5978 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5979 if (VT == MVT::f64)
5980 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5981 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5982 DAG.getConstant(32, dl, MVT::i32));
5983 else /*if (VT == MVT::f32)*/
5984 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5985 if (SrcVT == MVT::f32) {
5986 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5987 if (VT == MVT::f64)
5988 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5989 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5990 DAG.getConstant(32, dl, MVT::i32));
5991 } else if (VT == MVT::f32)
5992 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5993 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5994 DAG.getConstant(32, dl, MVT::i32));
5995 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5996 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5997
5998 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5999 dl, MVT::i32);
6000 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6001 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
6002 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
6003
6004 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
6005 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
6006 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
6007 if (VT == MVT::f32) {
6008 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6009 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6010 DAG.getConstant(0, dl, MVT::i32));
6011 } else {
6012 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6013 }
6014
6015 return Res;
6016 }
6017
6018 // Bitcast operand 1 to i32.
6019 if (SrcVT == MVT::f64)
6020 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6021 Tmp1).getValue(1);
6022 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6023
6024 // Or in the signbit with integer operations.
6025 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6026 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6027 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6028 if (VT == MVT::f32) {
6029 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6030 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6031 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6032 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6033 }
6034
6035 // f64: Or the high part with signbit and then combine two parts.
6036 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6037 Tmp0);
6038 SDValue Lo = Tmp0.getValue(0);
6039 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6040 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6041 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6042}
6043
6044SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6045 MachineFunction &MF = DAG.getMachineFunction();
6046 MachineFrameInfo &MFI = MF.getFrameInfo();
6047 MFI.setReturnAddressIsTaken(true);
6048
6049 EVT VT = Op.getValueType();
6050 SDLoc dl(Op);
6051 unsigned Depth = Op.getConstantOperandVal(0);
6052 if (Depth) {
6053 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6054 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6055 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6056 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6057 MachinePointerInfo());
6058 }
6059
6060 // Return LR, which contains the return address. Mark it an implicit live-in.
6061 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6062 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6063}
6064
6065SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6066 const ARMBaseRegisterInfo &ARI =
6067 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6068 MachineFunction &MF = DAG.getMachineFunction();
6069 MachineFrameInfo &MFI = MF.getFrameInfo();
6070 MFI.setFrameAddressIsTaken(true);
6071
6072 EVT VT = Op.getValueType();
6073 SDLoc dl(Op); // FIXME probably not meaningful
6074 unsigned Depth = Op.getConstantOperandVal(0);
6075 Register FrameReg = ARI.getFrameRegister(MF);
6076 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6077 while (Depth--)
6078 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6079 MachinePointerInfo());
6080 return FrameAddr;
6081}
6082
6083// FIXME? Maybe this could be a TableGen attribute on some registers and
6084// this table could be generated automatically from RegInfo.
6085Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6086 const MachineFunction &MF) const {
6087 return StringSwitch<Register>(RegName)
6088 .Case("sp", ARM::SP)
6089 .Default(Register());
6090}
6091
6092// Result is 64 bit value so split into two 32 bit values and return as a
6093// pair of values.
6094 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6095 SelectionDAG &DAG) {
6096 SDLoc DL(N);
6097
6098 // This function is only supposed to be called for i64 type destination.
6099 assert(N->getValueType(0) == MVT::i64
6100 && "ExpandREAD_REGISTER called for non-i64 type result.");
6101
6102 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6103 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6104 N->getOperand(0),
6105 N->getOperand(1));
6106
6107 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6108 Read.getValue(1)));
6109 Results.push_back(Read.getValue(2)); // Chain
6110}
6111
6112/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6113/// When \p DstVT, the destination type of \p BC, is on the vector
6114/// register bank and the source of bitcast, \p Op, operates on the same bank,
6115/// it might be possible to combine them, such that everything stays on the
6116/// vector register bank.
6117/// \p return The node that would replace \p BT, if the combine
6118/// is possible.
6119 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6120 SelectionDAG &DAG) {
6121 SDValue Op = BC->getOperand(0);
6122 EVT DstVT = BC->getValueType(0);
6123
6124 // The only vector instruction that can produce a scalar (remember,
6125 // since the bitcast was about to be turned into VMOVDRR, the source
6126 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6127 // Moreover, we can do this combine only if there is one use.
6128 // Finally, if the destination type is not a vector, there is not
6129 // much point in forcing everything onto the vector bank.
6130 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6131 !Op.hasOneUse())
6132 return SDValue();
6133
6134 // If the index is not constant, we will introduce an additional
6135 // multiply that will stick.
6136 // Give up in that case.
6137 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6138 if (!Index)
6139 return SDValue();
6140 unsigned DstNumElt = DstVT.getVectorNumElements();
6141
6142 // Compute the new index.
6143 const APInt &APIntIndex = Index->getAPIntValue();
6144 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6145 NewIndex *= APIntIndex;
6146 // Check if the new constant index fits into i32.
6147 if (NewIndex.getBitWidth() > 32)
6148 return SDValue();
6149
6150 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6151 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6152 SDLoc dl(Op);
6153 SDValue ExtractSrc = Op.getOperand(0);
6154 EVT VecVT = EVT::getVectorVT(
6155 *DAG.getContext(), DstVT.getScalarType(),
6156 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6157 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6158 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6159 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6160}
6161
6162/// ExpandBITCAST - If the target supports VFP, this function is called to
6163/// expand a bit convert where either the source or destination type is i64 to
6164/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6165/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6166/// vectors), since the legalizer won't know what to do with that.
6167SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6168 const ARMSubtarget *Subtarget) const {
6169 SDLoc dl(N);
6170 SDValue Op = N->getOperand(0);
6171
6172 // This function is only supposed to be called for i16 and i64 types, either
6173 // as the source or destination of the bit convert.
6174 EVT SrcVT = Op.getValueType();
6175 EVT DstVT = N->getValueType(0);
6176
6177 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6178 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6179 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6180 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6181
6182 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6183 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6184 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6185 Op = DAG.getBitcast(MVT::f16, Op);
6186 return DAG.getNode(
6187 ISD::TRUNCATE, SDLoc(N), DstVT,
6188 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6189 }
6190
6191 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6192 return SDValue();
6193
6194 // Turn i64->f64 into VMOVDRR.
6195 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6196 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6197 // if we can combine the bitcast with its source.
6198 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6199 return Val;
6200 SDValue Lo, Hi;
6201 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6202 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6203 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6204 }
6205
6206 // Turn f64->i64 into VMOVRRD.
6207 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6208 SDValue Cvt;
6209 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6210 SrcVT.getVectorNumElements() > 1)
6211 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6212 DAG.getVTList(MVT::i32, MVT::i32),
6213 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6214 else
6215 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6216 DAG.getVTList(MVT::i32, MVT::i32), Op);
6217 // Merge the pieces into a single i64 value.
6218 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6219 }
6220
6221 return SDValue();
6222}
6223
6224/// getZeroVector - Returns a vector of specified type with all zero elements.
6225/// Zero vectors are used to represent vector negation and in those cases
6226/// will be implemented with the NEON VNEG instruction. However, VNEG does
6227/// not support i64 elements, so sometimes the zero vectors will need to be
6228/// explicitly constructed. Regardless, use a canonical VMOV to create the
6229/// zero vector.
6230static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6231 assert(VT.isVector() && "Expected a vector type");
6232 // The canonical modified immediate encoding of a zero vector is....0!
6233 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6234 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6235 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6236 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6237}
6238
6239/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6240/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6241SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6242 SelectionDAG &DAG) const {
6243 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6244 EVT VT = Op.getValueType();
6245 unsigned VTBits = VT.getSizeInBits();
6246 SDLoc dl(Op);
6247 SDValue ShOpLo = Op.getOperand(0);
6248 SDValue ShOpHi = Op.getOperand(1);
6249 SDValue ShAmt = Op.getOperand(2);
6250 SDValue ARMcc;
6251 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6252
6253 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6254
6255 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6256 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6257 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6258 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6259 DAG.getConstant(VTBits, dl, MVT::i32));
6260 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6261 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6262 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6263 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6264 ISD::SETGE, ARMcc, DAG, dl);
6265 SDValue Lo =
6266 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6267
6268 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6269 SDValue HiBigShift = Opc == ISD::SRA
6270 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6271 DAG.getConstant(VTBits - 1, dl, VT))
6272 : DAG.getConstant(0, dl, VT);
6273 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6274 ISD::SETGE, ARMcc, DAG, dl);
6275 SDValue Hi =
6276 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6277
6278 SDValue Ops[2] = { Lo, Hi };
6279 return DAG.getMergeValues(Ops, dl);
6280}
6281
6282/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6283/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6284SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6285 SelectionDAG &DAG) const {
6286 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6287 EVT VT = Op.getValueType();
6288 unsigned VTBits = VT.getSizeInBits();
6289 SDLoc dl(Op);
6290 SDValue ShOpLo = Op.getOperand(0);
6291 SDValue ShOpHi = Op.getOperand(1);
6292 SDValue ShAmt = Op.getOperand(2);
6293 SDValue ARMcc;
6294
6295 assert(Op.getOpcode() == ISD::SHL_PARTS);
6296 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6297 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6298 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6299 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6300 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6301
6302 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6303 DAG.getConstant(VTBits, dl, MVT::i32));
6304 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6305 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6306 ISD::SETGE, ARMcc, DAG, dl);
6307 SDValue Hi =
6308 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6309
6310 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6311 ISD::SETGE, ARMcc, DAG, dl);
6312 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6313 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6314 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6315
6316 SDValue Ops[2] = { Lo, Hi };
6317 return DAG.getMergeValues(Ops, dl);
6318}
6319
6320SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6321 SelectionDAG &DAG) const {
6322 // The rounding mode is in bits 23:22 of the FPSCR.
6323 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6324 // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
6325 // so that the shift + and get folded into a bitfield extract.
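// For example, an FPSCR rounding-mode field of 3 yields ((3 + 1) & 3) == 0,
// the 3->0 entry of the mapping above.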
6326 SDLoc dl(Op);
6327 SDValue Chain = Op.getOperand(0);
6328 SDValue Ops[] = {Chain,
6329 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6330
6331 SDValue FPSCR =
6332 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6333 Chain = FPSCR.getValue(1);
6334 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6335 DAG.getConstant(1U << 22, dl, MVT::i32));
6336 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6337 DAG.getConstant(22, dl, MVT::i32));
6338 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6339 DAG.getConstant(3, dl, MVT::i32));
6340 return DAG.getMergeValues({And, Chain}, dl);
6341}
6342
6343SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6344 SelectionDAG &DAG) const {
6345 SDLoc DL(Op);
6346 SDValue Chain = Op->getOperand(0);
6347 SDValue RMValue = Op->getOperand(1);
6348
6349 // The rounding mode is in bits 23:22 of the FPSCR.
6350 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6351 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6352 // (((arg - 1) & 3) << 22).
6353 //
6354 // It is expected that the argument of llvm.set.rounding is within the
6355 // range [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6356 // responsibility of the code that generates llvm.set.rounding to ensure this
6357 // condition.
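// For example, an argument of 0 becomes ((0 - 1) & 3) == 3 in FPSCR[23:22],
// the 0->3 entry of the mapping above.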
6358
6359 // Calculate new value of FPSCR[23:22].
6360 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6361 DAG.getConstant(1, DL, MVT::i32));
6362 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6363 DAG.getConstant(0x3, DL, MVT::i32));
6364 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6365 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6366
6367 // Get current value of FPSCR.
6368 SDValue Ops[] = {Chain,
6369 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6370 SDValue FPSCR =
6371 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6372 Chain = FPSCR.getValue(1);
6373 FPSCR = FPSCR.getValue(0);
6374
6375 // Put new rounding mode into FPSCR[23:22].
6376 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6377 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6378 DAG.getConstant(RMMask, DL, MVT::i32));
6379 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6380 SDValue Ops2[] = {
6381 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6382 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6383}
6384
6385SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6386 SelectionDAG &DAG) const {
6387 SDLoc DL(Op);
6388 SDValue Chain = Op->getOperand(0);
6389 SDValue Mode = Op->getOperand(1);
6390
6391 // Generate nodes to build:
6392 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6393 SDValue Ops[] = {Chain,
6394 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6395 SDValue FPSCR =
6396 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6397 Chain = FPSCR.getValue(1);
6398 FPSCR = FPSCR.getValue(0);
6399
6400 SDValue FPSCRMasked =
6401 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6402 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6403 SDValue InputMasked =
6404 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6405 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6406 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6407
6408 SDValue Ops2[] = {
6409 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6410 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6411}
6412
6413SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6414 SelectionDAG &DAG) const {
6415 SDLoc DL(Op);
6416 SDValue Chain = Op->getOperand(0);
6417
6418 // To get the default FP mode all control bits are cleared:
6419 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6420 SDValue Ops[] = {Chain,
6421 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6422 SDValue FPSCR =
6423 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6424 Chain = FPSCR.getValue(1);
6425 FPSCR = FPSCR.getValue(0);
6426
6427 SDValue FPSCRMasked = DAG.getNode(
6428 ISD::AND, DL, MVT::i32, FPSCR,
6429 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6430 SDValue Ops2[] = {Chain,
6431 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6432 FPSCRMasked};
6433 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6434}
6435
6436 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6437 const ARMSubtarget *ST) {
6438 SDLoc dl(N);
6439 EVT VT = N->getValueType(0);
6440 if (VT.isVector() && ST->hasNEON()) {
6441
6442 // Compute the least significant set bit: LSB = X & -X
6443 SDValue X = N->getOperand(0);
6444 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6445 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
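// e.g. X = 0b0110100: LSB = 0b0000100, so cttz(X) == 2 (== ctpop(LSB - 1)).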
6446
6447 EVT ElemTy = VT.getVectorElementType();
6448
6449 if (ElemTy == MVT::i8) {
6450 // Compute with: cttz(x) = ctpop(lsb - 1)
6451 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6452 DAG.getTargetConstant(1, dl, ElemTy));
6453 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6454 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6455 }
6456
6457 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6458 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6459 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6460 unsigned NumBits = ElemTy.getSizeInBits();
6461 SDValue WidthMinus1 =
6462 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6463 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6464 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6465 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6466 }
6467
6468 // Compute with: cttz(x) = ctpop(lsb - 1)
6469
6470 // Compute LSB - 1.
6471 SDValue Bits;
6472 if (ElemTy == MVT::i64) {
6473 // Load constant 0xffff'ffff'ffff'ffff to register.
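// (0x1eff is the pre-encoded VMOV modified immediate with OpCmode 0x1e and
// Imm 0xff: per the 64-bit case of isVMOVModifiedImm below, every byte
// selected by Imm is 0xff, so this materializes the all-ones vector.)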
6474 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6475 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6476 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6477 } else {
6478 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6479 DAG.getTargetConstant(1, dl, ElemTy));
6480 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6481 }
6482 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6483 }
6484
6485 if (!ST->hasV6T2Ops())
6486 return SDValue();
6487
6488 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6489 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6490}
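// Illustrative example of the identities used above, for a single i8 lane
// x = 0b00101000: lsb = x & -x = 0b00001000 and lsb - 1 = 0b00000111, so
// ctpop(lsb - 1) = 3 = cttz(x). With the (width - 1) - ctlz(lsb) form used
// for nonzero i16/i32 lanes, an 8-bit width would give 7 - 4 = 3 as well.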
6491
6492 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6493 const ARMSubtarget *ST) {
6494 EVT VT = N->getValueType(0);
6495 SDLoc DL(N);
6496
6497 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6498 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6499 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6500 "Unexpected type for custom ctpop lowering");
6501
6502 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6503 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6504 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6505 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6506
6507 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6508 unsigned EltSize = 8;
6509 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6510 while (EltSize != VT.getScalarSizeInBits()) {
6511 SmallVector<SDValue, 8> Ops;
6512 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6513 TLI.getPointerTy(DAG.getDataLayout())));
6514 Ops.push_back(Res);
6515
6516 EltSize *= 2;
6517 NumElts /= 2;
6518 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6519 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6520 }
6521
6522 return Res;
6523}
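// Illustrative example: for a v4i32 ctpop of a 128-bit vector, the input is
// first counted as v16i8 and then widened with two rounds of the unsigned
// pairwise-add intrinsic: v16i8 counts -> v8i16 partial sums -> v4i32
// results, i.e. EltSize goes 8 -> 16 -> 32 while NumElts goes 16 -> 8 -> 4.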
6524
6525 /// getVShiftImm - Check if this is a valid build_vector for the immediate
6526/// operand of a vector shift operation, where all the elements of the
6527/// build_vector must have the same constant integer value.
6528static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6529 // Ignore bit_converts.
6530 while (Op.getOpcode() == ISD::BITCAST)
6531 Op = Op.getOperand(0);
6532 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6533 APInt SplatBits, SplatUndef;
6534 unsigned SplatBitSize;
6535 bool HasAnyUndefs;
6536 if (!BVN ||
6537 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6538 ElementBits) ||
6539 SplatBitSize > ElementBits)
6540 return false;
6541 Cnt = SplatBits.getSExtValue();
6542 return true;
6543}
6544
6545/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6546/// operand of a vector shift left operation. That value must be in the range:
6547/// 0 <= Value < ElementBits for a left shift; or
6548/// 0 <= Value <= ElementBits for a long left shift.
6549static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6550 assert(VT.isVector() && "vector shift count is not a vector type");
6551 int64_t ElementBits = VT.getScalarSizeInBits();
6552 if (!getVShiftImm(Op, ElementBits, Cnt))
6553 return false;
6554 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6555}
6556
6557/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6558 /// operand of a vector shift right operation. For a shift opcode, the value
6559 /// is positive, but for an intrinsic the value must be negative. The
6560/// absolute value must be in the range:
6561/// 1 <= |Value| <= ElementBits for a right shift; or
6562/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6563static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6564 int64_t &Cnt) {
6565 assert(VT.isVector() && "vector shift count is not a vector type");
6566 int64_t ElementBits = VT.getScalarSizeInBits();
6567 if (!getVShiftImm(Op, ElementBits, Cnt))
6568 return false;
6569 if (!isIntrinsic)
6570 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6571 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6572 Cnt = -Cnt;
6573 return true;
6574 }
6575 return false;
6576}
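// For example, with v4i32 operands (ElementBits = 32): a VSHL immediate must
// lie in [0,31] (or up to 32 for the long form), while a right-shift
// immediate must lie in [1,32], or [1,16] for the narrowing forms; intrinsics
// encode the right-shift count as a negative splat, which is negated here.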
6577
6578 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6579 const ARMSubtarget *ST) {
6580 EVT VT = N->getValueType(0);
6581 SDLoc dl(N);
6582 int64_t Cnt;
6583
6584 if (!VT.isVector())
6585 return SDValue();
6586
6587 // We essentially have two forms here. Shift by an immediate and shift by a
6588 // vector register (there is also a shift by a GPR, but that is just handled
6589 // with a tablegen pattern). We cannot easily match shift by an immediate in
6590 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6591 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6592 // signed or unsigned, and a negative shift indicates a shift right).
6593 if (N->getOpcode() == ISD::SHL) {
6594 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6595 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6596 DAG.getConstant(Cnt, dl, MVT::i32));
6597 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6598 N->getOperand(1));
6599 }
6600
6601 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6602 "unexpected vector shift opcode");
6603
6604 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6605 unsigned VShiftOpc =
6606 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6607 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6608 DAG.getConstant(Cnt, dl, MVT::i32));
6609 }
6610
6611 // Other right shifts we don't have operations for (we use a shift left by a
6612 // negative number).
6613 EVT ShiftVT = N->getOperand(1).getValueType();
6614 SDValue NegatedCount = DAG.getNode(
6615 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6616 unsigned VShiftOpc =
6617 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6618 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6619}
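// For example, an SRL of a v4i32 by a non-constant vector amount becomes
// VSHLu(x, 0 - amount): NEON/MVE only provide a register-operand shift left,
// so a right shift is expressed as a left shift by the negated count.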
6620
6621 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6622 const ARMSubtarget *ST) {
6623 EVT VT = N->getValueType(0);
6624 SDLoc dl(N);
6625
6626 // We can get here for a node like i32 = ISD::SHL i32, i64
6627 if (VT != MVT::i64)
6628 return SDValue();
6629
6630 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6631 N->getOpcode() == ISD::SHL) &&
6632 "Unknown shift to lower!");
6633
6634 unsigned ShOpc = N->getOpcode();
6635 if (ST->hasMVEIntegerOps()) {
6636 SDValue ShAmt = N->getOperand(1);
6637 unsigned ShPartsOpc = ARMISD::LSLL;
6638 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6639
6640 // If the shift amount is zero or at least 32, or is a non-constant amount
6641 // wider than 64 bits, then fall back to the default expansion.
6642 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6643 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6644 return SDValue();
6645
6646 // Extract the lower 32 bits of the shift amount if it's not an i32
6647 if (ShAmt->getValueType(0) != MVT::i32)
6648 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6649
6650 if (ShOpc == ISD::SRL) {
6651 if (!Con)
6652 // There is no t2LSRLr instruction so negate and perform an lsll if the
6653 // shift amount is in a register, emulating a right shift.
6654 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6655 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6656 else
6657 // Else generate an lsrl on the immediate shift amount
6658 ShPartsOpc = ARMISD::LSRL;
6659 } else if (ShOpc == ISD::SRA)
6660 ShPartsOpc = ARMISD::ASRL;
6661
6662 // Split Lower/Upper 32 bits of the destination/source
6663 SDValue Lo, Hi;
6664 std::tie(Lo, Hi) =
6665 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6666 // Generate the shift operation as computed above
6667 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6668 ShAmt);
6669 // The upper 32 bits come from the second return value of lsll
6670 Hi = SDValue(Lo.getNode(), 1);
6671 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6672 }
6673
6674 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6675 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6676 return SDValue();
6677
6678 // If we are in thumb mode, we don't have RRX.
6679 if (ST->isThumb1Only())
6680 return SDValue();
6681
6682 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6683 SDValue Lo, Hi;
6684 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6685
6686 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6687 // captures the shifted out bit into a carry flag.
6688 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6689 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6690
6691 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6692 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6693
6694 // Merge the pieces into a single i64 value.
6695 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6696}
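// Illustrative example: for x with Hi = 0x00000003 and Lo = 0x00000001,
// x >> 1 is computed as LSRS1(Hi) = 0x00000001 with the shifted-out bit (1)
// left in the carry flag, and RRX(Lo) rotates that carry into the top bit,
// giving Lo' = 0x80000000, i.e. the expected result 0x00000001'80000000.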
6697
6698 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6699 const ARMSubtarget *ST) {
6700 bool Invert = false;
6701 bool Swap = false;
6702 unsigned Opc = ARMCC::AL;
6703
6704 SDValue Op0 = Op.getOperand(0);
6705 SDValue Op1 = Op.getOperand(1);
6706 SDValue CC = Op.getOperand(2);
6707 EVT VT = Op.getValueType();
6708 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6709 SDLoc dl(Op);
6710
6711 EVT CmpVT;
6712 if (ST->hasNEON())
6713 CmpVT = VT.changeVectorElementTypeToInteger();
6714 else {
6715 assert(ST->hasMVEIntegerOps() &&
6716 "No hardware support for integer vector comparison!");
6717
6718 if (Op.getValueType().getVectorElementType() != MVT::i1)
6719 return SDValue();
6720
6721 // Make sure we expand floating point setcc to scalar if we do not have
6722 // mve.fp, so that we can handle them from there.
6723 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6724 return SDValue();
6725
6726 CmpVT = VT;
6727 }
6728
6729 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6730 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6731 // Special-case integer 64-bit equality comparisons. They aren't legal,
6732 // but they can be lowered with a few vector instructions.
6733 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6734 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6735 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6736 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6737 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6738 DAG.getCondCode(ISD::SETEQ));
6739 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6740 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6741 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6742 if (SetCCOpcode == ISD::SETNE)
6743 Merged = DAG.getNOT(dl, Merged, CmpVT);
6744 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6745 return Merged;
6746 }
6747
6748 if (CmpVT.getVectorElementType() == MVT::i64)
6749 // 64-bit comparisons are not legal in general.
6750 return SDValue();
6751
6752 if (Op1.getValueType().isFloatingPoint()) {
6753 switch (SetCCOpcode) {
6754 default: llvm_unreachable("Illegal FP comparison");
6755 case ISD::SETUNE:
6756 case ISD::SETNE:
6757 if (ST->hasMVEFloatOps()) {
6758 Opc = ARMCC::NE; break;
6759 } else {
6760 Invert = true; [[fallthrough]];
6761 }
6762 case ISD::SETOEQ:
6763 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6764 case ISD::SETOLT:
6765 case ISD::SETLT: Swap = true; [[fallthrough]];
6766 case ISD::SETOGT:
6767 case ISD::SETGT: Opc = ARMCC::GT; break;
6768 case ISD::SETOLE:
6769 case ISD::SETLE: Swap = true; [[fallthrough]];
6770 case ISD::SETOGE:
6771 case ISD::SETGE: Opc = ARMCC::GE; break;
6772 case ISD::SETUGE: Swap = true; [[fallthrough]];
6773 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6774 case ISD::SETUGT: Swap = true; [[fallthrough]];
6775 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6776 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6777 case ISD::SETONE: {
6778 // Expand this to (OLT | OGT).
6779 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6780 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6781 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6782 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6783 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6784 if (Invert)
6785 Result = DAG.getNOT(dl, Result, VT);
6786 return Result;
6787 }
6788 case ISD::SETUO: Invert = true; [[fallthrough]];
6789 case ISD::SETO: {
6790 // Expand this to (OLT | OGE).
6791 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6792 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6793 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6794 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6795 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6796 if (Invert)
6797 Result = DAG.getNOT(dl, Result, VT);
6798 return Result;
6799 }
6800 }
6801 } else {
6802 // Integer comparisons.
6803 switch (SetCCOpcode) {
6804 default: llvm_unreachable("Illegal integer comparison");
6805 case ISD::SETNE:
6806 if (ST->hasMVEIntegerOps()) {
6807 Opc = ARMCC::NE; break;
6808 } else {
6809 Invert = true; [[fallthrough]];
6810 }
6811 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6812 case ISD::SETLT: Swap = true; [[fallthrough]];
6813 case ISD::SETGT: Opc = ARMCC::GT; break;
6814 case ISD::SETLE: Swap = true; [[fallthrough]];
6815 case ISD::SETGE: Opc = ARMCC::GE; break;
6816 case ISD::SETULT: Swap = true; [[fallthrough]];
6817 case ISD::SETUGT: Opc = ARMCC::HI; break;
6818 case ISD::SETULE: Swap = true; [[fallthrough]];
6819 case ISD::SETUGE: Opc = ARMCC::HS; break;
6820 }
6821
6822 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6823 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6824 SDValue AndOp;
6825 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6826 AndOp = Op0;
6827 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6828 AndOp = Op1;
6829
6830 // Ignore bitconvert.
6831 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6832 AndOp = AndOp.getOperand(0);
6833
6834 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6835 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6836 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6837 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6838 if (!Invert)
6839 Result = DAG.getNOT(dl, Result, VT);
6840 return Result;
6841 }
6842 }
6843 }
6844
6845 if (Swap)
6846 std::swap(Op0, Op1);
6847
6848 // If one of the operands is a constant vector zero, attempt to fold the
6849 // comparison to a specialized compare-against-zero form.
6850 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6851 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6852 Opc == ARMCC::NE)) {
6853 if (Opc == ARMCC::GE)
6854 Opc = ARMCC::LE;
6855 else if (Opc == ARMCC::GT)
6856 Opc = ARMCC::LT;
6857 std::swap(Op0, Op1);
6858 }
6859
6860 SDValue Result;
6861 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6862 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6863 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6864 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6865 DAG.getConstant(Opc, dl, MVT::i32));
6866 else
6867 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6868 DAG.getConstant(Opc, dl, MVT::i32));
6869
6870 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6871
6872 if (Invert)
6873 Result = DAG.getNOT(dl, Result, VT);
6874
6875 return Result;
6876}
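// For example, "setcc zeroinitializer, x, setgt" reaches the folding above
// with the zero vector as Op0; the condition is rewritten from GT to LT and
// the operands swapped, so it is emitted as VCMPZ(x, LT) ("x < 0"), matching
// the dedicated compare-against-zero forms.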
6877
6878 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6879 SDValue LHS = Op.getOperand(0);
6880 SDValue RHS = Op.getOperand(1);
6881 SDValue Carry = Op.getOperand(2);
6882 SDValue Cond = Op.getOperand(3);
6883 SDLoc DL(Op);
6884
6885 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6886
6887 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6888 // have to invert the carry first.
6889 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6890 DAG.getConstant(1, DL, MVT::i32), Carry);
6891 // This converts the boolean value carry into the carry flag.
6892 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6893
6894 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6895 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6896
6897 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6898 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6899 SDValue ARMcc = DAG.getConstant(
6900 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6901 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6902 Cmp.getValue(1));
6903}
6904
6905/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6906/// valid vector constant for a NEON or MVE instruction with a "modified
6907/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6908static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6909 unsigned SplatBitSize, SelectionDAG &DAG,
6910 const SDLoc &dl, EVT &VT, EVT VectorVT,
6911 VMOVModImmType type) {
6912 unsigned OpCmode, Imm;
6913 bool is128Bits = VectorVT.is128BitVector();
6914
6915 // SplatBitSize is set to the smallest size that splats the vector, so a
6916 // zero vector will always have SplatBitSize == 8. However, NEON modified
6917 // immediate instructions other than VMOV do not support the 8-bit encoding
6918 // of a zero vector, and the default encoding of zero is supposed to be the
6919 // 32-bit version.
6920 if (SplatBits == 0)
6921 SplatBitSize = 32;
6922
6923 switch (SplatBitSize) {
6924 case 8:
6925 if (type != VMOVModImm)
6926 return SDValue();
6927 // Any 1-byte value is OK. Op=0, Cmode=1110.
6928 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6929 OpCmode = 0xe;
6930 Imm = SplatBits;
6931 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6932 break;
6933
6934 case 16:
6935 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6936 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6937 if ((SplatBits & ~0xff) == 0) {
6938 // Value = 0x00nn: Op=x, Cmode=100x.
6939 OpCmode = 0x8;
6940 Imm = SplatBits;
6941 break;
6942 }
6943 if ((SplatBits & ~0xff00) == 0) {
6944 // Value = 0xnn00: Op=x, Cmode=101x.
6945 OpCmode = 0xa;
6946 Imm = SplatBits >> 8;
6947 break;
6948 }
6949 return SDValue();
6950
6951 case 32:
6952 // NEON's 32-bit VMOV supports splat values where:
6953 // * only one byte is nonzero, or
6954 // * the least significant byte is 0xff and the second byte is nonzero, or
6955 // * the least significant 2 bytes are 0xff and the third is nonzero.
6956 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6957 if ((SplatBits & ~0xff) == 0) {
6958 // Value = 0x000000nn: Op=x, Cmode=000x.
6959 OpCmode = 0;
6960 Imm = SplatBits;
6961 break;
6962 }
6963 if ((SplatBits & ~0xff00) == 0) {
6964 // Value = 0x0000nn00: Op=x, Cmode=001x.
6965 OpCmode = 0x2;
6966 Imm = SplatBits >> 8;
6967 break;
6968 }
6969 if ((SplatBits & ~0xff0000) == 0) {
6970 // Value = 0x00nn0000: Op=x, Cmode=010x.
6971 OpCmode = 0x4;
6972 Imm = SplatBits >> 16;
6973 break;
6974 }
6975 if ((SplatBits & ~0xff000000) == 0) {
6976 // Value = 0xnn000000: Op=x, Cmode=011x.
6977 OpCmode = 0x6;
6978 Imm = SplatBits >> 24;
6979 break;
6980 }
6981
6982 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6983 if (type == OtherModImm) return SDValue();
6984
6985 if ((SplatBits & ~0xffff) == 0 &&
6986 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6987 // Value = 0x0000nnff: Op=x, Cmode=1100.
6988 OpCmode = 0xc;
6989 Imm = SplatBits >> 8;
6990 break;
6991 }
6992
6993 // cmode == 0b1101 is not supported for MVE VMVN
6994 if (type == MVEVMVNModImm)
6995 return SDValue();
6996
6997 if ((SplatBits & ~0xffffff) == 0 &&
6998 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6999 // Value = 0x00nnffff: Op=x, Cmode=1101.
7000 OpCmode = 0xd;
7001 Imm = SplatBits >> 16;
7002 break;
7003 }
7004
7005 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7006 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7007 // VMOV.I32. A (very) minor optimization would be to replicate the value
7008 // and fall through here to test for a valid 64-bit splat. But, then the
7009 // caller would also need to check and handle the change in size.
7010 return SDValue();
7011
7012 case 64: {
7013 if (type != VMOVModImm)
7014 return SDValue();
7015 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
7016 uint64_t BitMask = 0xff;
7017 unsigned ImmMask = 1;
7018 Imm = 0;
7019 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7020 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7021 Imm |= ImmMask;
7022 } else if ((SplatBits & BitMask) != 0) {
7023 return SDValue();
7024 }
7025 BitMask <<= 8;
7026 ImmMask <<= 1;
7027 }
7028
7029 // Op=1, Cmode=1110.
7030 OpCmode = 0x1e;
7031 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7032 break;
7033 }
7034
7035 default:
7036 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7037 }
7038
7039 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7040 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7041}
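// Worked example: a v8i16 splat of 0x4100 matches the "Value = 0xnn00" case,
// so OpCmode = 0xa and Imm = 0x41, and the returned target constant is
// ARM_AM::createVMOVModImm(0xa, 0x41). A 64-bit splat of 0x00ff00ff00ff00ff
// instead uses the byte-mask form: OpCmode = 0x1e, Imm = 0b01010101 = 0x55.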
7042
7043SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7044 const ARMSubtarget *ST) const {
7045 EVT VT = Op.getValueType();
7046 bool IsDouble = (VT == MVT::f64);
7047 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7048 const APFloat &FPVal = CFP->getValueAPF();
7049
7050 // Prevent floating-point constants from using literal loads
7051 // when execute-only is enabled.
7052 if (ST->genExecuteOnly()) {
7053 // We shouldn't trigger this for v6m execute-only
7054 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7055 "Unexpected architecture");
7056
7057 // If we can represent the constant as an immediate, don't lower it
7058 if (isFPImmLegal(FPVal, VT))
7059 return Op;
7060 // Otherwise, construct as integer, and move to float register
7061 APInt INTVal = FPVal.bitcastToAPInt();
7062 SDLoc DL(CFP);
7063 switch (VT.getSimpleVT().SimpleTy) {
7064 default:
7065 llvm_unreachable("Unknown floating point type!");
7066 break;
7067 case MVT::f64: {
7068 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7069 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7070 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7071 }
7072 case MVT::f32:
7073 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7074 DAG.getConstant(INTVal, DL, MVT::i32));
7075 }
7076 }
7077
7078 if (!ST->hasVFP3Base())
7079 return SDValue();
7080
7081 // Use the default (constant pool) lowering for double constants when we have
7082 // an SP-only FPU
7083 if (IsDouble && !Subtarget->hasFP64())
7084 return SDValue();
7085
7086 // Try splatting with a VMOV.f32...
7087 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7088
7089 if (ImmVal != -1) {
7090 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7091 // We have code in place to select a valid ConstantFP already, no need to
7092 // do any mangling.
7093 return Op;
7094 }
7095
7096 // It's a float and we are trying to use NEON operations where
7097 // possible. Lower it to a splat followed by an extract.
7098 SDLoc DL(Op);
7099 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7100 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7101 NewVal);
7102 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7103 DAG.getConstant(0, DL, MVT::i32));
7104 }
7105
7106 // The rest of our options are NEON only, make sure that's allowed before
7107 // proceeding..
7108 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7109 return SDValue();
7110
7111 EVT VMovVT;
7112 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7113
7114 // It wouldn't really be worth bothering for doubles except for one very
7115 // important value, which does happen to match: 0.0. So make sure we don't do
7116 // anything stupid.
7117 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7118 return SDValue();
7119
7120 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7121 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7122 VMovVT, VT, VMOVModImm);
7123 if (NewVal != SDValue()) {
7124 SDLoc DL(Op);
7125 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7126 NewVal);
7127 if (IsDouble)
7128 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7129
7130 // It's a float: cast and extract a vector element.
7131 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7132 VecConstant);
7133 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7134 DAG.getConstant(0, DL, MVT::i32));
7135 }
7136
7137 // Finally, try a VMVN.i32
7138 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7139 VT, VMVNModImm);
7140 if (NewVal != SDValue()) {
7141 SDLoc DL(Op);
7142 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7143
7144 if (IsDouble)
7145 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7146
7147 // It's a float: cast and extract a vector element.
7148 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7149 VecConstant);
7150 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7151 DAG.getConstant(0, DL, MVT::i32));
7152 }
7153
7154 return SDValue();
7155}
7156
7157 // Check if a VEXT instruction can handle the shuffle mask when the
7158 // vector sources of the shuffle are the same.
7159static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7160 unsigned NumElts = VT.getVectorNumElements();
7161
7162 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7163 if (M[0] < 0)
7164 return false;
7165
7166 Imm = M[0];
7167
7168 // If this is a VEXT shuffle, the immediate value is the index of the first
7169 // element. The other shuffle indices must be the successive elements after
7170 // the first one.
7171 unsigned ExpectedElt = Imm;
7172 for (unsigned i = 1; i < NumElts; ++i) {
7173 // Increment the expected index. If it wraps around, just follow it
7174 // back to index zero and keep going.
7175 ++ExpectedElt;
7176 if (ExpectedElt == NumElts)
7177 ExpectedElt = 0;
7178
7179 if (M[i] < 0) continue; // ignore UNDEF indices
7180 if (ExpectedElt != static_cast<unsigned>(M[i]))
7181 return false;
7182 }
7183
7184 return true;
7185}
7186
7187static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7188 bool &ReverseVEXT, unsigned &Imm) {
7189 unsigned NumElts = VT.getVectorNumElements();
7190 ReverseVEXT = false;
7191
7192 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7193 if (M[0] < 0)
7194 return false;
7195
7196 Imm = M[0];
7197
7198 // If this is a VEXT shuffle, the immediate value is the index of the first
7199 // element. The other shuffle indices must be the successive elements after
7200 // the first one.
7201 unsigned ExpectedElt = Imm;
7202 for (unsigned i = 1; i < NumElts; ++i) {
7203 // Increment the expected index. If it wraps around, it may still be
7204 // a VEXT but the source vectors must be swapped.
7205 ExpectedElt += 1;
7206 if (ExpectedElt == NumElts * 2) {
7207 ExpectedElt = 0;
7208 ReverseVEXT = true;
7209 }
7210
7211 if (M[i] < 0) continue; // ignore UNDEF indices
7212 if (ExpectedElt != static_cast<unsigned>(M[i]))
7213 return false;
7214 }
7215
7216 // Adjust the index value if the source operands will be swapped.
7217 if (ReverseVEXT)
7218 Imm -= NumElts;
7219
7220 return true;
7221}
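// Worked example for v8i8: the mask <6,7,8,9,10,11,12,13> yields Imm = 6 with
// no wrap, i.e. VEXT(v1, v2, #6). The mask <14,15,0,1,2,3,4,5> wraps past
// 2*NumElts, so ReverseVEXT is set and Imm is adjusted to 14 - 8 = 6, i.e.
// VEXT(v2, v1, #6) with the source operands swapped.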
7222
7223static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7224 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7225 // range, then 0 is placed into the resulting vector. So pretty much any mask
7226 // of 8 elements can work here.
7227 return VT == MVT::v8i8 && M.size() == 8;
7228}
7229
7230static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7231 unsigned Index) {
7232 if (Mask.size() == Elements * 2)
7233 return Index / Elements;
7234 return Mask[Index] == 0 ? 0 : 1;
7235}
7236
7237// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7238// checking that pairs of elements in the shuffle mask represent the same index
7239// in each vector, incrementing the expected index by 2 at each step.
7240// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7241// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7242// v2={e,f,g,h}
7243// WhichResult gives the offset for each element in the mask based on which
7244// of the two results it belongs to.
7245//
7246// The transpose can be represented either as:
7247// result1 = shufflevector v1, v2, result1_shuffle_mask
7248// result2 = shufflevector v1, v2, result2_shuffle_mask
7249// where v1/v2 and the shuffle masks have the same number of elements
7250// (here WhichResult (see below) indicates which result is being checked)
7251//
7252// or as:
7253// results = shufflevector v1, v2, shuffle_mask
7254// where both results are returned in one vector and the shuffle mask has twice
7255// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7256// want to check the low half and high half of the shuffle mask as if it were
7257// the other case
7258static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7259 unsigned EltSz = VT.getScalarSizeInBits();
7260 if (EltSz == 64)
7261 return false;
7262
7263 unsigned NumElts = VT.getVectorNumElements();
7264 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7265 return false;
7266
7267 // If the mask is twice as long as the input vector then we need to check the
7268 // upper and lower parts of the mask with a matching value for WhichResult
7269 // FIXME: A mask with only even values will be rejected in case the first
7270 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7271 // M[0] is used to determine WhichResult
7272 for (unsigned i = 0; i < M.size(); i += NumElts) {
7273 WhichResult = SelectPairHalf(NumElts, M, i);
7274 for (unsigned j = 0; j < NumElts; j += 2) {
7275 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7276 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7277 return false;
7278 }
7279 }
7280
7281 if (M.size() == NumElts*2)
7282 WhichResult = 0;
7283
7284 return true;
7285}
7286
7287/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7288/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7289/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7290static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7291 unsigned EltSz = VT.getScalarSizeInBits();
7292 if (EltSz == 64)
7293 return false;
7294
7295 unsigned NumElts = VT.getVectorNumElements();
7296 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7297 return false;
7298
7299 for (unsigned i = 0; i < M.size(); i += NumElts) {
7300 WhichResult = SelectPairHalf(NumElts, M, i);
7301 for (unsigned j = 0; j < NumElts; j += 2) {
7302 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7303 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7304 return false;
7305 }
7306 }
7307
7308 if (M.size() == NumElts*2)
7309 WhichResult = 0;
7310
7311 return true;
7312}
7313
7314// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7315// that the mask elements are either all even and in steps of size 2 or all odd
7316// and in steps of size 2.
7317// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7318// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7319// v2={e,f,g,h}
7320 // Requires similar checks to those of isVTRNMask with
7321 // respect to how the results are returned.
7322static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7323 unsigned EltSz = VT.getScalarSizeInBits();
7324 if (EltSz == 64)
7325 return false;
7326
7327 unsigned NumElts = VT.getVectorNumElements();
7328 if (M.size() != NumElts && M.size() != NumElts*2)
7329 return false;
7330
7331 for (unsigned i = 0; i < M.size(); i += NumElts) {
7332 WhichResult = SelectPairHalf(NumElts, M, i);
7333 for (unsigned j = 0; j < NumElts; ++j) {
7334 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7335 return false;
7336 }
7337 }
7338
7339 if (M.size() == NumElts*2)
7340 WhichResult = 0;
7341
7342 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7343 if (VT.is64BitVector() && EltSz == 32)
7344 return false;
7345
7346 return true;
7347}
7348
7349/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7350/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7351 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7352static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7353 unsigned EltSz = VT.getScalarSizeInBits();
7354 if (EltSz == 64)
7355 return false;
7356
7357 unsigned NumElts = VT.getVectorNumElements();
7358 if (M.size() != NumElts && M.size() != NumElts*2)
7359 return false;
7360
7361 unsigned Half = NumElts / 2;
7362 for (unsigned i = 0; i < M.size(); i += NumElts) {
7363 WhichResult = SelectPairHalf(NumElts, M, i);
7364 for (unsigned j = 0; j < NumElts; j += Half) {
7365 unsigned Idx = WhichResult;
7366 for (unsigned k = 0; k < Half; ++k) {
7367 int MIdx = M[i + j + k];
7368 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7369 return false;
7370 Idx += 2;
7371 }
7372 }
7373 }
7374
7375 if (M.size() == NumElts*2)
7376 WhichResult = 0;
7377
7378 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7379 if (VT.is64BitVector() && EltSz == 32)
7380 return false;
7381
7382 return true;
7383}
7384
7385// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7386// that pairs of elements of the shufflemask represent the same index in each
7387// vector incrementing sequentially through the vectors.
7388// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7389// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7390// v2={e,f,g,h}
7391 // Requires similar checks to those of isVTRNMask with respect to how the
7392 // results are returned.
7393static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7394 unsigned EltSz = VT.getScalarSizeInBits();
7395 if (EltSz == 64)
7396 return false;
7397
7398 unsigned NumElts = VT.getVectorNumElements();
7399 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7400 return false;
7401
7402 for (unsigned i = 0; i < M.size(); i += NumElts) {
7403 WhichResult = SelectPairHalf(NumElts, M, i);
7404 unsigned Idx = WhichResult * NumElts / 2;
7405 for (unsigned j = 0; j < NumElts; j += 2) {
7406 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7407 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7408 return false;
7409 Idx += 1;
7410 }
7411 }
7412
7413 if (M.size() == NumElts*2)
7414 WhichResult = 0;
7415
7416 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7417 if (VT.is64BitVector() && EltSz == 32)
7418 return false;
7419
7420 return true;
7421}
7422
7423/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7424/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7425/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7426static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7427 unsigned EltSz = VT.getScalarSizeInBits();
7428 if (EltSz == 64)
7429 return false;
7430
7431 unsigned NumElts = VT.getVectorNumElements();
7432 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7433 return false;
7434
7435 for (unsigned i = 0; i < M.size(); i += NumElts) {
7436 WhichResult = SelectPairHalf(NumElts, M, i);
7437 unsigned Idx = WhichResult * NumElts / 2;
7438 for (unsigned j = 0; j < NumElts; j += 2) {
7439 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7440 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7441 return false;
7442 Idx += 1;
7443 }
7444 }
7445
7446 if (M.size() == NumElts*2)
7447 WhichResult = 0;
7448
7449 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7450 if (VT.is64BitVector() && EltSz == 32)
7451 return false;
7452
7453 return true;
7454}
7455
7456/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7457/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7458static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7459 unsigned &WhichResult,
7460 bool &isV_UNDEF) {
7461 isV_UNDEF = false;
7462 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7463 return ARMISD::VTRN;
7464 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7465 return ARMISD::VUZP;
7466 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7467 return ARMISD::VZIP;
7468
7469 isV_UNDEF = true;
7470 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7471 return ARMISD::VTRN;
7472 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7473 return ARMISD::VUZP;
7474 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7475 return ARMISD::VZIP;
7476
7477 return 0;
7478}
7479
7480 /// \return true if this is a reverse operation on a vector.
7481static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7482 unsigned NumElts = VT.getVectorNumElements();
7483 // Make sure the mask has the right size.
7484 if (NumElts != M.size())
7485 return false;
7486
7487 // Look for <15, ..., 3, -1, 1, 0>.
7488 for (unsigned i = 0; i != NumElts; ++i)
7489 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7490 return false;
7491
7492 return true;
7493}
7494
7495static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7496 unsigned NumElts = VT.getVectorNumElements();
7497 // Make sure the mask has the right size.
7498 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7499 return false;
7500
7501 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7502 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7503 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7504 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7505 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7506 int Ofs = Top ? 1 : 0;
7507 int Upper = SingleSource ? 0 : NumElts;
7508 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7509 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7510 return false;
7511 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7512 return false;
7513 }
7514 return true;
7515}
7516
7517static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7518 unsigned NumElts = VT.getVectorNumElements();
7519 // Make sure the mask has the right size.
7520 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7521 return false;
7522
7523 // If Top
7524 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7525 // This inserts Input2 into Input1
7526 // else if not Top
7527 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7528 // This inserts Input1 into Input2
7529 unsigned Offset = Top ? 0 : 1;
7530 unsigned N = SingleSource ? 0 : NumElts;
7531 for (unsigned i = 0; i < NumElts; i += 2) {
7532 if (M[i] >= 0 && M[i] != (int)i)
7533 return false;
7534 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7535 return false;
7536 }
7537
7538 return true;
7539}
7540
7541static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7542 unsigned NumElts = ToVT.getVectorNumElements();
7543 if (NumElts != M.size())
7544 return false;
7545
7546 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7547 // looking for patterns of:
7548 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7549 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7550
7551 unsigned Off0 = rev ? NumElts / 2 : 0;
7552 unsigned Off1 = rev ? 0 : NumElts / 2;
7553 for (unsigned i = 0; i < NumElts; i += 2) {
7554 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7555 return false;
7556 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7557 return false;
7558 }
7559
7560 return true;
7561}
7562
7563// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7564// from a pair of inputs. For example:
7565// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7566// FP_ROUND(EXTRACT_ELT(Y, 0),
7567// FP_ROUND(EXTRACT_ELT(X, 1),
7568// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7569 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7570 const ARMSubtarget *ST) {
7571 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7572 if (!ST->hasMVEFloatOps())
7573 return SDValue();
7574
7575 SDLoc dl(BV);
7576 EVT VT = BV.getValueType();
7577 if (VT != MVT::v8f16)
7578 return SDValue();
7579
7580 // We are looking for a buildvector of fptrunc elements, where all the
7581 // elements are interleavingly extracted from two sources. Check the first two
7582 // items are valid enough and extract some info from them (they are checked
7583 // properly in the loop below).
7584 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7585 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7586 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7587 return SDValue();
7588 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7589 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7590 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7591 return SDValue();
7592 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7593 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7594 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7595 return SDValue();
7596
7597 // Check all the values in the BuildVector line up with our expectations.
7598 for (unsigned i = 1; i < 4; i++) {
7599 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7600 return Trunc.getOpcode() == ISD::FP_ROUND &&
7601 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7602 Trunc.getOperand(0).getOperand(0) == Op &&
7603 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7604 };
7605 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7606 return SDValue();
7607 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7608 return SDValue();
7609 }
7610
7611 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7612 DAG.getConstant(0, dl, MVT::i32));
7613 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7614 DAG.getConstant(1, dl, MVT::i32));
7615}
7616
7617// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7618// from a single input on alternating lanes. For example:
7619// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7620// FP_ROUND(EXTRACT_ELT(X, 2),
7621// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7622 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7623 const ARMSubtarget *ST) {
7624 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7625 if (!ST->hasMVEFloatOps())
7626 return SDValue();
7627
7628 SDLoc dl(BV);
7629 EVT VT = BV.getValueType();
7630 if (VT != MVT::v4f32)
7631 return SDValue();
7632
7633 // We are looking for a buildvector of fpext elements, where all the
7634 // elements are alternating lanes from a single source. For example <0,2,4,6>
7635 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7636 // info from them (they are checked properly in the loop below).
7637 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7638 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7639 return SDValue();
7640 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7641 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7642 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7643 return SDValue();
7644
7645 // Check all the values in the BuildVector line up with our expectations.
7646 for (unsigned i = 1; i < 4; i++) {
7647 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7648 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7649 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7650 Trunc.getOperand(0).getOperand(0) == Op &&
7651 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7652 };
7653 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7654 return SDValue();
7655 }
7656
7657 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7658 DAG.getConstant(Offset, dl, MVT::i32));
7659}
7660
7661// If N is an integer constant that can be moved into a register in one
7662// instruction, return an SDValue of such a constant (will become a MOV
7663// instruction). Otherwise return null.
7664 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7665 const ARMSubtarget *ST, const SDLoc &dl) {
7666 uint64_t Val;
7667 if (!isa<ConstantSDNode>(N))
7668 return SDValue();
7669 Val = N->getAsZExtVal();
7670
7671 if (ST->isThumb1Only()) {
7672 if (Val <= 255 || ~Val <= 255)
7673 return DAG.getConstant(Val, dl, MVT::i32);
7674 } else {
7675 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7676 return DAG.getConstant(Val, dl, MVT::i32);
7677 }
7678 return SDValue();
7679}
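// For example, 0x00ab0000 is a valid ARM modified immediate (0xab rotated by
// an even amount), so it is returned here and LowerBUILD_VECTOR can emit an
// ARMISD::VDUP of that constant; in Thumb1-only mode only constants that fit
// in 0..255, or whose bitwise complement does, qualify.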
7680
7681 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7682 const ARMSubtarget *ST) {
7683 SDLoc dl(Op);
7684 EVT VT = Op.getValueType();
7685
7686 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7687
7688 unsigned NumElts = VT.getVectorNumElements();
7689 unsigned BoolMask;
7690 unsigned BitsPerBool;
7691 if (NumElts == 2) {
7692 BitsPerBool = 8;
7693 BoolMask = 0xff;
7694 } else if (NumElts == 4) {
7695 BitsPerBool = 4;
7696 BoolMask = 0xf;
7697 } else if (NumElts == 8) {
7698 BitsPerBool = 2;
7699 BoolMask = 0x3;
7700 } else if (NumElts == 16) {
7701 BitsPerBool = 1;
7702 BoolMask = 0x1;
7703 } else
7704 return SDValue();
7705
7706 // If this is a single value copied into all lanes (a splat), we can just sign
7707 // extend that single value
7708 SDValue FirstOp = Op.getOperand(0);
7709 if (!isa<ConstantSDNode>(FirstOp) &&
7710 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7711 return U.get().isUndef() || U.get() == FirstOp;
7712 })) {
7713 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7714 DAG.getValueType(MVT::i1));
7715 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7716 }
7717
7718 // First create base with bits set where known
7719 unsigned Bits32 = 0;
7720 for (unsigned i = 0; i < NumElts; ++i) {
7721 SDValue V = Op.getOperand(i);
7722 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7723 continue;
7724 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7725 if (BitSet)
7726 Bits32 |= BoolMask << (i * BitsPerBool);
7727 }
7728
7729 // Add in unknown nodes
7730 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7731 DAG.getConstant(Bits32, dl, MVT::i32));
7732 for (unsigned i = 0; i < NumElts; ++i) {
7733 SDValue V = Op.getOperand(i);
7734 if (isa<ConstantSDNode>(V) || V.isUndef())
7735 continue;
7736 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7737 DAG.getConstant(i, dl, MVT::i32));
7738 }
7739
7740 return Base;
7741}
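// Worked example: for a v4i1 constant <1, 0, 1, 1>, each boolean occupies
// BitsPerBool = 4 bits of the 16-bit predicate, so Bits32 = 0xf | (0xf << 8) |
// (0xf << 12) = 0xff0f; any non-constant lanes are then inserted on top of
// that base with INSERT_VECTOR_ELT.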
7742
7743 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7744 const ARMSubtarget *ST) {
7745 if (!ST->hasMVEIntegerOps())
7746 return SDValue();
7747
7748 // We are looking for a buildvector where each element is Op[0] + i*N
7749 EVT VT = Op.getValueType();
7750 SDValue Op0 = Op.getOperand(0);
7751 unsigned NumElts = VT.getVectorNumElements();
7752
7753 // Get the increment value from operand 1
7754 SDValue Op1 = Op.getOperand(1);
7755 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7756 !isa<ConstantSDNode>(Op1.getOperand(1)))
7757 return SDValue();
7758 unsigned N = Op1.getConstantOperandVal(1);
7759 if (N != 1 && N != 2 && N != 4 && N != 8)
7760 return SDValue();
7761
7762 // Check that each other operand matches
7763 for (unsigned I = 2; I < NumElts; I++) {
7764 SDValue OpI = Op.getOperand(I);
7765 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7766 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7767 OpI.getConstantOperandVal(1) != I * N)
7768 return SDValue();
7769 }
7770
7771 SDLoc DL(Op);
7772 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7773 DAG.getConstant(N, DL, MVT::i32));
7774}
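// For example, the build_vector <x, x+4, x+8, x+12> (v4i32) is matched here
// with N = 4 and becomes ARMISD::VIDUP(x, 4), which corresponds to the MVE
// VIDUP instruction that generates the incrementing sequence directly from
// the scalar base.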
7775
7776// Returns true if the operation N can be treated as qr instruction variant at
7777// operand Op.
7778static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7779 switch (N->getOpcode()) {
7780 case ISD::ADD:
7781 case ISD::MUL:
7782 case ISD::SADDSAT:
7783 case ISD::UADDSAT:
7784 case ISD::AVGFLOORS:
7785 case ISD::AVGFLOORU:
7786 return true;
7787 case ISD::SUB:
7788 case ISD::SSUBSAT:
7789 case ISD::USUBSAT:
7790 return N->getOperand(1).getNode() == Op;
7791 case ISD::INTRINSIC_WO_CHAIN:
7792 switch (N->getConstantOperandVal(0)) {
7793 case Intrinsic::arm_mve_add_predicated:
7794 case Intrinsic::arm_mve_mul_predicated:
7795 case Intrinsic::arm_mve_qadd_predicated:
7796 case Intrinsic::arm_mve_vhadd:
7797 case Intrinsic::arm_mve_hadd_predicated:
7798 case Intrinsic::arm_mve_vqdmulh:
7799 case Intrinsic::arm_mve_qdmulh_predicated:
7800 case Intrinsic::arm_mve_vqrdmulh:
7801 case Intrinsic::arm_mve_qrdmulh_predicated:
7802 case Intrinsic::arm_mve_vqdmull:
7803 case Intrinsic::arm_mve_vqdmull_predicated:
7804 return true;
7805 case Intrinsic::arm_mve_sub_predicated:
7806 case Intrinsic::arm_mve_qsub_predicated:
7807 case Intrinsic::arm_mve_vhsub:
7808 case Intrinsic::arm_mve_hsub_predicated:
7809 return N->getOperand(2).getNode() == Op;
7810 default:
7811 return false;
7812 }
7813 default:
7814 return false;
7815 }
7816}
7817
7818// If this is a case we can't handle, return null and let the default
7819// expansion code take care of it.
7820SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7821 const ARMSubtarget *ST) const {
7822 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7823 SDLoc dl(Op);
7824 EVT VT = Op.getValueType();
7825
7826 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7827 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7828
7829 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7830 return R;
7831
7832 APInt SplatBits, SplatUndef;
7833 unsigned SplatBitSize;
7834 bool HasAnyUndefs;
7835 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7836 if (SplatUndef.isAllOnes())
7837 return DAG.getUNDEF(VT);
7838
7839 // If all the users of this constant splat are qr instruction variants,
7840 // generate a vdup of the constant.
7841 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7842 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7843 all_of(BVN->users(),
7844 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7845 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7846 : SplatBitSize == 16 ? MVT::v8i16
7847 : MVT::v16i8;
7848 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7849 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7850 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7851 }
7852
7853 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7854 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7855 // Check if an immediate VMOV works.
7856 EVT VmovVT;
7857 SDValue Val =
7858 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7859 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7860
7861 if (Val.getNode()) {
7862 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7863 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7864 }
7865
7866 // Try an immediate VMVN.
7867 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7868 Val = isVMOVModifiedImm(
7869 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7870 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7871 if (Val.getNode()) {
7872 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7873 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7874 }
7875
7876 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7877 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7878 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7879 if (ImmVal != -1) {
7880 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7881 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7882 }
7883 }
7884
7885 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7886 // type.
7887 if (ST->hasMVEIntegerOps() &&
7888 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7889 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7890 : SplatBitSize == 16 ? MVT::v8i16
7891 : MVT::v16i8;
7892 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7893 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7894 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7895 }
7896 }
7897 }
7898
7899 // Scan through the operands to see if only one value is used.
7900 //
7901 // As an optimisation, even if more than one value is used it may be more
7902 // profitable to splat with one value then change some lanes.
7903 //
7904 // Heuristically we decide to do this if the vector has a "dominant" value,
7905 // defined as splatted to more than half of the lanes.
7906 unsigned NumElts = VT.getVectorNumElements();
7907 bool isOnlyLowElement = true;
7908 bool usesOnlyOneValue = true;
7909 bool hasDominantValue = false;
7910 bool isConstant = true;
7911
7912 // Map of the number of times a particular SDValue appears in the
7913 // element list.
7914 DenseMap<SDValue, unsigned> ValueCounts;
7915 SDValue Value;
7916 for (unsigned i = 0; i < NumElts; ++i) {
7917 SDValue V = Op.getOperand(i);
7918 if (V.isUndef())
7919 continue;
7920 if (i > 0)
7921 isOnlyLowElement = false;
7922 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
7923 isConstant = false;
7924
7925 unsigned &Count = ValueCounts[V];
7926
7927 // Is this value dominant? (takes up more than half of the lanes)
7928 if (++Count > (NumElts / 2)) {
7929 hasDominantValue = true;
7930 Value = V;
7931 }
7932 }
7933 if (ValueCounts.size() != 1)
7934 usesOnlyOneValue = false;
7935 if (!Value.getNode() && !ValueCounts.empty())
7936 Value = ValueCounts.begin()->first;
7937
7938 if (ValueCounts.empty())
7939 return DAG.getUNDEF(VT);
7940
7941 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7942 // Keep going if we are hitting this case.
7943 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7944 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7945
7946 unsigned EltSize = VT.getScalarSizeInBits();
7947
7948 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7949 // i32 and try again.
7950 if (hasDominantValue && EltSize <= 32) {
7951 if (!isConstant) {
7952 SDValue N;
7953
7954 // If we are VDUPing a value that comes directly from a vector, that will
7955 // cause an unnecessary move to and from a GPR, where instead we could
7956 // just use VDUPLANE. We can only do this if the lane being extracted
7957 // is at a constant index, as the VDUP from lane instructions only have
7958 // constant-index forms.
7959 ConstantSDNode *constIndex;
7960 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7961 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7962 // We need to create a new undef vector to use for the VDUPLANE if the
7963 // size of the vector from which we get the value is different than the
7964 // size of the vector that we need to create. We will insert the element
7965 // such that the register coalescer will remove unnecessary copies.
7966 if (VT != Value->getOperand(0).getValueType()) {
7967 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7968 VT.getVectorNumElements();
7969 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7970 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7971 Value, DAG.getConstant(index, dl, MVT::i32)),
7972 DAG.getConstant(index, dl, MVT::i32));
7973 } else
7974 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7975 Value->getOperand(0), Value->getOperand(1));
7976 } else
7977 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7978
7979 if (!usesOnlyOneValue) {
7980 // The dominant value was splatted as 'N', but we now have to insert
7981 // all differing elements.
7982 for (unsigned I = 0; I < NumElts; ++I) {
7983 if (Op.getOperand(I) == Value)
7984 continue;
7985 SmallVector<SDValue, 3> Ops;
7986 Ops.push_back(N);
7987 Ops.push_back(Op.getOperand(I));
7988 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7989 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7990 }
7991 }
7992 return N;
7993 }
7994 if (VT.getVectorElementType().isFloatingPoint()) {
7995 SmallVector<SDValue, 8> Ops;
7996 MVT FVT = VT.getVectorElementType().getSimpleVT();
7997 assert(FVT == MVT::f32 || FVT == MVT::f16);
7998 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7999 for (unsigned i = 0; i < NumElts; ++i)
8000 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
8001 Op.getOperand(i)));
8002 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
8003 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8004 Val = LowerBUILD_VECTOR(Val, DAG, ST);
8005 if (Val.getNode())
8006 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8007 }
8008 if (usesOnlyOneValue) {
8009 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
8010 if (isConstant && Val.getNode())
8011 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
8012 }
8013 }
8014
8015 // If all elements are constants and the case above didn't get hit, fall back
8016 // to the default expansion, which will generate a load from the constant
8017 // pool.
8018 if (isConstant)
8019 return SDValue();
8020
8021 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8022 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8023 // length <= 2.
8024 if (NumElts >= 4)
8025 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8026 return shuffle;
8027
8028 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8029 // VCVT's
8030 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8031 return VCVT;
8032 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8033 return VCVT;
8034
8035 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8036 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8037 // into two 64-bit vectors; we might discover a better way to lower it.
8038 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8039 EVT ExtVT = VT.getVectorElementType();
8040 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8041 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8042 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8043 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8044 SDValue Upper =
8045 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8046 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8047 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8048 if (Lower && Upper)
8049 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8050 }
8051
8052 // Vectors with 32- or 64-bit elements can be built by directly assigning
8053 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8054 // will be legalized.
8055 if (EltSize >= 32) {
8056 // Do the expansion with floating-point types, since that is what the VFP
8057 // registers are defined to use, and since i64 is not legal.
8058 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8059 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8060 SmallVector<SDValue, 8> Ops;
8061 for (unsigned i = 0; i < NumElts; ++i)
8062 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8063 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8064 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8065 }
8066
8067 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8068 // know the default expansion would otherwise fall back on something even
8069 // worse. For a vector with one or two non-undef values, that's
8070 // scalar_to_vector for the elements followed by a shuffle (provided the
8071 // shuffle is valid for the target) and materialization element by element
8072 // on the stack followed by a load for everything else.
8073 if (!isConstant && !usesOnlyOneValue) {
8074 SDValue Vec = DAG.getUNDEF(VT);
8075 for (unsigned i = 0 ; i < NumElts; ++i) {
8076 SDValue V = Op.getOperand(i);
8077 if (V.isUndef())
8078 continue;
8079 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8080 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8081 }
8082 return Vec;
8083 }
8084
8085 return SDValue();
8086}
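// Standalone illustration (not part of this lowering code): the dominant-value
// strategy used above for non-constant BUILD_VECTORs boils down to "splat the
// most frequent lane value, then patch the differing lanes one by one". The
// helper below is a plain-C++ sketch of that choice; all names are
// illustrative only.
#include <map>
#include <utility>
#include <vector>

template <typename T>
static std::pair<T, std::vector<unsigned>>
pickDominantValue(const std::vector<T> &Lanes) {
  std::map<T, unsigned> Counts;
  for (const T &V : Lanes)
    ++Counts[V];
  T Dominant = Lanes.front();
  for (const auto &KV : Counts)
    if (KV.second > Counts[Dominant])
      Dominant = KV.first;
  // Lanes that differ from the splatted value still need an explicit insert,
  // mirroring the VDUP + INSERT_VECTOR_ELT sequence built above.
  std::vector<unsigned> FixUps;
  for (unsigned I = 0; I < Lanes.size(); ++I)
    if (Lanes[I] != Dominant)
      FixUps.push_back(I);
  return {Dominant, FixUps};
}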
8087
8088// Gather data to see if the operation can be modelled as a
8089// shuffle in combination with VEXTs.
8090SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8091 SelectionDAG &DAG) const {
8092 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8093 SDLoc dl(Op);
8094 EVT VT = Op.getValueType();
8095 unsigned NumElts = VT.getVectorNumElements();
8096
8097 struct ShuffleSourceInfo {
8098 SDValue Vec;
8099 unsigned MinElt = std::numeric_limits<unsigned>::max();
8100 unsigned MaxElt = 0;
8101
8102 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8103 // be compatible with the shuffle we intend to construct. As a result
8104 // ShuffleVec will be some sliding window into the original Vec.
8105 SDValue ShuffleVec;
8106
8107 // Code should guarantee that element i in Vec starts at element "WindowBase
8108 // + i * WindowScale" in ShuffleVec.
8109 int WindowBase = 0;
8110 int WindowScale = 1;
8111
8112 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8113
8114 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8115 };
8116
8117 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8118 // node.
8119 SmallVector<ShuffleSourceInfo, 2> Sources;
8120 for (unsigned i = 0; i < NumElts; ++i) {
8121 SDValue V = Op.getOperand(i);
8122 if (V.isUndef())
8123 continue;
8124 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8125 // A shuffle can only come from building a vector from various
8126 // elements of other vectors.
8127 return SDValue();
8128 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8129 // Furthermore, shuffles require a constant mask, whereas extractelts
8130 // accept variable indices.
8131 return SDValue();
8132 }
8133
8134 // Add this element source to the list if it's not already there.
8135 SDValue SourceVec = V.getOperand(0);
8136 auto Source = llvm::find(Sources, SourceVec);
8137 if (Source == Sources.end())
8138 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8139
8140 // Update the minimum and maximum lane number seen.
8141 unsigned EltNo = V.getConstantOperandVal(1);
8142 Source->MinElt = std::min(Source->MinElt, EltNo);
8143 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8144 }
8145
8146 // Currently only do something sane when at most two source vectors
8147 // are involved.
8148 if (Sources.size() > 2)
8149 return SDValue();
8150
8151 // Find out the smallest element size among result and two sources, and use
8152 // it as element size to build the shuffle_vector.
8153 EVT SmallestEltTy = VT.getVectorElementType();
8154 for (auto &Source : Sources) {
8155 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8156 if (SrcEltTy.bitsLT(SmallestEltTy))
8157 SmallestEltTy = SrcEltTy;
8158 }
8159 unsigned ResMultiplier =
8160 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8161 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8162 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8163
8164 // If the source vector is too wide or too narrow, we may nevertheless be able
8165 // to construct a compatible shuffle either by concatenating it with UNDEF or
8166 // extracting a suitable range of elements.
8167 for (auto &Src : Sources) {
8168 EVT SrcVT = Src.ShuffleVec.getValueType();
8169
8170 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8171 uint64_t VTSize = VT.getFixedSizeInBits();
8172 if (SrcVTSize == VTSize)
8173 continue;
8174
8175 // This stage of the search produces a source with the same element type as
8176 // the original, but with a total width matching the BUILD_VECTOR output.
8177 EVT EltVT = SrcVT.getVectorElementType();
8178 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8179 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8180
8181 if (SrcVTSize < VTSize) {
8182 if (2 * SrcVTSize != VTSize)
8183 return SDValue();
8184 // We can pad out the smaller vector for free, so if it's part of a
8185 // shuffle...
8186 Src.ShuffleVec =
8187 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8188 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8189 continue;
8190 }
8191
8192 if (SrcVTSize != 2 * VTSize)
8193 return SDValue();
8194
8195 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8196 // Span too large for a VEXT to cope
8197 return SDValue();
8198 }
8199
8200 if (Src.MinElt >= NumSrcElts) {
8201 // The extraction can just take the second half
8202 Src.ShuffleVec =
8203 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8204 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8205 Src.WindowBase = -NumSrcElts;
8206 } else if (Src.MaxElt < NumSrcElts) {
8207 // The extraction can just take the first half
8208 Src.ShuffleVec =
8209 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8210 DAG.getConstant(0, dl, MVT::i32));
8211 } else {
8212 // An actual VEXT is needed
8213 SDValue VEXTSrc1 =
8214 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8215 DAG.getConstant(0, dl, MVT::i32));
8216 SDValue VEXTSrc2 =
8217 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8218 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8219
8220 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8221 VEXTSrc2,
8222 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8223 Src.WindowBase = -Src.MinElt;
8224 }
8225 }
8226
8227 // Another possible incompatibility occurs from the vector element types. We
8228 // can fix this by bitcasting the source vectors to the same type we intend
8229 // for the shuffle.
8230 for (auto &Src : Sources) {
8231 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8232 if (SrcEltTy == SmallestEltTy)
8233 continue;
8234 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8235 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8236 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8237 Src.WindowBase *= Src.WindowScale;
8238 }
8239
8240 // Final check before we try to actually produce a shuffle.
8241 LLVM_DEBUG({
8242 for (auto Src : Sources)
8243 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8244 });
8245
8246 // The stars all align, our next step is to produce the mask for the shuffle.
8247 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8248 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8249 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8250 SDValue Entry = Op.getOperand(i);
8251 if (Entry.isUndef())
8252 continue;
8253
8254 auto Src = llvm::find(Sources, Entry.getOperand(0));
8255 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8256
8257 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8258 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8259 // segment.
8260 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8261 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8262 VT.getScalarSizeInBits());
8263 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8264
8265 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8266 // starting at the appropriate offset.
8267 int *LaneMask = &Mask[i * ResMultiplier];
8268
8269 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8270 ExtractBase += NumElts * (Src - Sources.begin());
8271 for (int j = 0; j < LanesDefined; ++j)
8272 LaneMask[j] = ExtractBase + j;
8273 }
8274
8275
8276 // We can't handle more than two sources. This should have already
8277 // been checked before this point.
8278 assert(Sources.size() <= 2 && "Too many sources!");
8279
8280 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8281 for (unsigned i = 0; i < Sources.size(); ++i)
8282 ShuffleOps[i] = Sources[i].ShuffleVec;
8283
8284 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8285 ShuffleOps[1], Mask, DAG);
8286 if (!Shuffle)
8287 return SDValue();
8288 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8289}
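// Worked example (illustrative, assuming two v4i32 sources A and B already as
// wide as the result): BUILD_VECTOR(extract(A,1), extract(A,2), extract(B,0),
// extract(B,1)) collects Sources = {A, B} with WindowScale = 1 and
// WindowBase = 0, so each lane maps to EltNo + NumElts * SourceIndex and the
// final mask is <1,2,4,5> over shuffle(A, B). The helper below reproduces just
// that mapping for the same-width case.
static void buildTwoSourceShuffleMask(const unsigned SourceIdx[4],
                                      const unsigned EltNo[4], unsigned NumElts,
                                      int Mask[4]) {
  // ExtractBase = EltNo * WindowScale + WindowBase + NumElts * SourceIndex,
  // with WindowScale == 1 and WindowBase == 0 when no VEXT/bitcast is needed.
  for (unsigned I = 0; I < 4; ++I)
    Mask[I] = static_cast<int>(EltNo[I] + NumElts * SourceIdx[I]);
}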
8290
8291enum ShuffleOpCodes {
8292 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8293 OP_VREV,
8294 OP_VDUP0,
8295 OP_VDUP1,
8296 OP_VDUP2,
8297 OP_VDUP3,
8298 OP_VEXT1,
8299 OP_VEXT2,
8300 OP_VEXT3,
8301 OP_VUZPL, // VUZP, left result
8302 OP_VUZPR, // VUZP, right result
8303 OP_VZIPL, // VZIP, left result
8304 OP_VZIPR, // VZIP, right result
8305 OP_VTRNL, // VTRN, left result
8306 OP_VTRNR // VTRN, right result
8307};
8308
8309static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8310 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8311 switch (OpNum) {
8312 case OP_COPY:
8313 case OP_VREV:
8314 case OP_VDUP0:
8315 case OP_VDUP1:
8316 case OP_VDUP2:
8317 case OP_VDUP3:
8318 return true;
8319 }
8320 return false;
8321}
8322
8323/// isShuffleMaskLegal - Targets can use this to indicate that they only
8324/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8325/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8326/// are assumed to be legal.
8327bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8328 if (VT.getVectorNumElements() == 4 &&
8329 (VT.is128BitVector() || VT.is64BitVector())) {
8330 unsigned PFIndexes[4];
8331 for (unsigned i = 0; i != 4; ++i) {
8332 if (M[i] < 0)
8333 PFIndexes[i] = 8;
8334 else
8335 PFIndexes[i] = M[i];
8336 }
8337
8338 // Compute the index in the perfect shuffle table.
8339 unsigned PFTableIndex =
8340 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8341 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8342 unsigned Cost = (PFEntry >> 30);
8343
8344 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8345 return true;
8346 }
8347
8348 bool ReverseVEXT, isV_UNDEF;
8349 unsigned Imm, WhichResult;
8350
8351 unsigned EltSize = VT.getScalarSizeInBits();
8352 if (EltSize >= 32 ||
8354 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8355 isVREVMask(M, VT, 64) ||
8356 isVREVMask(M, VT, 32) ||
8357 isVREVMask(M, VT, 16))
8358 return true;
8359 else if (Subtarget->hasNEON() &&
8360 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8361 isVTBLMask(M, VT) ||
8362 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8363 return true;
8364 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8365 isReverseMask(M, VT))
8366 return true;
8367 else if (Subtarget->hasMVEIntegerOps() &&
8368 (isVMOVNMask(M, VT, true, false) ||
8369 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8370 return true;
8371 else if (Subtarget->hasMVEIntegerOps() &&
8372 (isTruncMask(M, VT, false, false) ||
8373 isTruncMask(M, VT, false, true) ||
8374 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8375 return true;
8376 else
8377 return false;
8378}
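// Standalone sketch (not part of this lowering code) of the perfect-shuffle
// table lookup used above and below: a 4-element mask is packed into a base-9
// index (8 meaning "undef"), and each 32-bit table entry encodes a cost, an
// OP_* opcode and the indices of its two sub-shuffles.
#include <cstdint>

static unsigned packPerfectShuffleIndex(const int M[4]) {
  unsigned Index = 0;
  for (int I = 0; I != 4; ++I)
    Index = Index * 9 + (M[I] < 0 ? 8u : static_cast<unsigned>(M[I]));
  // e.g. the zip mask <0,4,1,5> packs to 0*729 + 4*81 + 1*9 + 5 = 338.
  return Index;
}

struct PerfectShuffleEntryFields {
  unsigned Cost;  // bits 31..30
  unsigned OpNum; // bits 29..26, one of the OP_* values above
  unsigned LHSID; // bits 25..13, table index of the left sub-shuffle
  unsigned RHSID; // bits 12..0, table index of the right sub-shuffle
};

static PerfectShuffleEntryFields decodePerfectShuffleEntry(uint32_t PFEntry) {
  return {PFEntry >> 30, (PFEntry >> 26) & 0x0F, (PFEntry >> 13) & 0x1FFF,
          PFEntry & 0x1FFF};
}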
8379
8380/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8381/// the specified operations to build the shuffle.
8382static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8383 SDValue RHS, SelectionDAG &DAG,
8384 const SDLoc &dl) {
8385 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8386 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8387 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8388
8389 if (OpNum == OP_COPY) {
8390 if (LHSID == (1*9+2)*9+3) return LHS;
8391 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8392 return RHS;
8393 }
8394
8395 SDValue OpLHS, OpRHS;
8396 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8397 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8398 EVT VT = OpLHS.getValueType();
8399
8400 switch (OpNum) {
8401 default: llvm_unreachable("Unknown shuffle opcode!");
8402 case OP_VREV:
8403 // VREV divides the vector in half and swaps within the half.
8404 if (VT.getScalarSizeInBits() == 32)
8405 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8406 // vrev <4 x i16> -> VREV32
8407 if (VT.getScalarSizeInBits() == 16)
8408 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8409 // vrev <4 x i8> -> VREV16
8410 assert(VT.getScalarSizeInBits() == 8);
8411 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8412 case OP_VDUP0:
8413 case OP_VDUP1:
8414 case OP_VDUP2:
8415 case OP_VDUP3:
8416 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8417 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8418 case OP_VEXT1:
8419 case OP_VEXT2:
8420 case OP_VEXT3:
8421 return DAG.getNode(ARMISD::VEXT, dl, VT,
8422 OpLHS, OpRHS,
8423 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8424 case OP_VUZPL:
8425 case OP_VUZPR:
8426 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8427 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8428 case OP_VZIPL:
8429 case OP_VZIPR:
8430 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8431 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8432 case OP_VTRNL:
8433 case OP_VTRNR:
8434 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8435 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8436 }
8437}
8438
8439static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8440 ArrayRef<int> ShuffleMask,
8441 SelectionDAG &DAG) {
8442 // Check to see if we can use the VTBL instruction.
8443 SDValue V1 = Op.getOperand(0);
8444 SDValue V2 = Op.getOperand(1);
8445 SDLoc DL(Op);
8446
8447 SmallVector<SDValue, 8> VTBLMask;
8448 for (int I : ShuffleMask)
8449 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8450
8451 if (V2.getNode()->isUndef())
8452 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8453 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8454
8455 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8456 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8457}
8458
8459static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8460 SDLoc DL(Op);
8461 EVT VT = Op.getValueType();
8462
8463 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8464 "Expect an v8i16/v16i8 type");
8465 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8466 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8467 // extract the first 8 bytes into the top double word and the last 8 bytes
8468 // into the bottom double word, through a new vector shuffle that will be
8469 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8470 std::vector<int> NewMask;
8471 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8472 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8473 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8474 NewMask.push_back(i);
8475 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8476}
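// Standalone walk-through (not part of this lowering code) of the v8i16 case
// above: VREV64 reverses the lanes inside each 64-bit half, and the
// half-swapping mask built above then yields the fully reversed vector.
#include <array>

static std::array<int, 8> reverseV8LanesExample(const std::array<int, 8> &In) {
  // Step 1: VREV64 reverses within each group of four 16-bit lanes.
  std::array<int, 8> AfterVREV64 = {In[3], In[2], In[1], In[0],
                                    In[7], In[6], In[5], In[4]};
  // Step 2: the NewMask built above, <4,5,6,7,0,1,2,3>, swaps the two halves.
  const int NewMask[8] = {4, 5, 6, 7, 0, 1, 2, 3};
  std::array<int, 8> Out{};
  for (int I = 0; I < 8; ++I)
    Out[I] = AfterVREV64[NewMask[I]];
  return Out; // for In = {0,1,...,7} this is {7,6,5,4,3,2,1,0}
}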
8477
8478static EVT getVectorTyFromPredicateVector(EVT VT) {
8479 switch (VT.getSimpleVT().SimpleTy) {
8480 case MVT::v2i1:
8481 return MVT::v2f64;
8482 case MVT::v4i1:
8483 return MVT::v4i32;
8484 case MVT::v8i1:
8485 return MVT::v8i16;
8486 case MVT::v16i1:
8487 return MVT::v16i8;
8488 default:
8489 llvm_unreachable("Unexpected vector predicate type");
8490 }
8491}
8492
8493static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8494 SelectionDAG &DAG) {
8495 // Converting from boolean predicates to integers involves creating a vector
8496 // of all ones or all zeroes and selecting the lanes based upon the real
8497 // predicate.
8498 SDValue AllOnes =
8499 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8500 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8501
8502 SDValue AllZeroes =
8503 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8504 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8505
8506 // Get full vector type from predicate type
8507 EVT NewVT = getVectorTyFromPredicateVector(VT);
8508
8509 SDValue RecastV1;
8510 // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
8511 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8512 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8513 // since we know in hardware the sizes are really the same.
8514 if (VT != MVT::v16i1)
8515 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8516 else
8517 RecastV1 = Pred;
8518
8519 // Select either all ones or zeroes depending upon the real predicate bits.
8520 SDValue PredAsVector =
8521 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8522
8523 // Recast our new predicate-as-integer v16i8 vector into something
8524 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8525 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8526}
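// Illustrative example (not additional lowering code): for a v4i1 predicate
// with lanes <1,0,1,1>, the PREDICATE_CAST above views it as v16i1 (each
// original lane covers four predicate bits), the VSELECT picks 0xff or 0x00
// per byte, and the final bitcast to v4i32 therefore yields
// <0xffffffff, 0x00000000, 0xffffffff, 0xffffffff>.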
8527
8528static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8529 const ARMSubtarget *ST) {
8530 EVT VT = Op.getValueType();
8531 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8532 ArrayRef<int> ShuffleMask = SVN->getMask();
8533
8534 assert(ST->hasMVEIntegerOps() &&
8535 "No support for vector shuffle of boolean predicates");
8536
8537 SDValue V1 = Op.getOperand(0);
8538 SDValue V2 = Op.getOperand(1);
8539 SDLoc dl(Op);
8540 if (isReverseMask(ShuffleMask, VT)) {
8541 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8542 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8543 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8544 DAG.getConstant(16, dl, MVT::i32));
8545 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8546 }
8547
8548 // Until we can come up with optimised cases for every single vector
8549 // shuffle in existence we have chosen the least painful strategy. This is
8550 // to essentially promote the boolean predicate to an 8-bit integer, where
8551 // each predicate represents a byte. Then we fall back on a normal integer
8552 // vector shuffle and convert the result back into a predicate vector. In
8553 // many cases the generated code might be even better than scalar code
8554 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8555 // fields in a register into 8 other arbitrary 2-bit fields!
8556 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8557 EVT NewVT = PredAsVector1.getValueType();
8558 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8559 : PromoteMVEPredVector(dl, V2, VT, DAG);
8560 assert(PredAsVector2.getValueType() == NewVT &&
8561 "Expected identical vector type in expanded i1 shuffle!");
8562
8563 // Do the shuffle!
8564 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8565 PredAsVector2, ShuffleMask);
8566
8567 // Now return the result of comparing the shuffled vector with zero,
8568 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8569 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8570 if (VT == MVT::v2i1) {
8571 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8572 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8573 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8574 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8575 }
8576 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8577 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8578}
8579
8580static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8581 ArrayRef<int> ShuffleMask,
8582 SelectionDAG &DAG) {
8583 // Attempt to lower the vector shuffle using as many whole register movs as
8584 // possible. This is useful for types smaller than 32 bits, which would
8585 // often otherwise become a series of GPR movs.
8586 SDLoc dl(Op);
8587 EVT VT = Op.getValueType();
8588 if (VT.getScalarSizeInBits() >= 32)
8589 return SDValue();
8590
8591 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8592 "Unexpected vector type");
8593 int NumElts = VT.getVectorNumElements();
8594 int QuarterSize = NumElts / 4;
8595 // The four final parts of the vector, as i32's
8596 SDValue Parts[4];
8597
8598 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8599 // <u,u,u,u>), returning the vmov lane index
8600 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8601 // Detect which mov lane this would be from the first non-undef element.
8602 int MovIdx = -1;
8603 for (int i = 0; i < Length; i++) {
8604 if (ShuffleMask[Start + i] >= 0) {
8605 if (ShuffleMask[Start + i] % Length != i)
8606 return -1;
8607 MovIdx = ShuffleMask[Start + i] / Length;
8608 break;
8609 }
8610 }
8611 // If all items are undef, leave this for other combines
8612 if (MovIdx == -1)
8613 return -1;
8614 // Check the remaining values are the correct part of the same mov
8615 for (int i = 1; i < Length; i++) {
8616 if (ShuffleMask[Start + i] >= 0 &&
8617 (ShuffleMask[Start + i] / Length != MovIdx ||
8618 ShuffleMask[Start + i] % Length != i))
8619 return -1;
8620 }
8621 return MovIdx;
8622 };
8623
8624 for (int Part = 0; Part < 4; ++Part) {
8625 // Does this part look like a mov
8626 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8627 if (Elt != -1) {
8628 SDValue Input = Op->getOperand(0);
8629 if (Elt >= 4) {
8630 Input = Op->getOperand(1);
8631 Elt -= 4;
8632 }
8633 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8634 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8635 DAG.getConstant(Elt, dl, MVT::i32));
8636 }
8637 }
8638
8639 // Nothing interesting found, just return
8640 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8641 return SDValue();
8642
8643 // The other parts need to be built with the old shuffle vector, cast to a
8644 // v4i32 and extract_vector_elts
8645 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8646 SmallVector<int, 16> NewShuffleMask;
8647 for (int Part = 0; Part < 4; ++Part)
8648 for (int i = 0; i < QuarterSize; i++)
8649 NewShuffleMask.push_back(
8650 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8651 SDValue NewShuffle = DAG.getVectorShuffle(
8652 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8653 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8654
8655 for (int Part = 0; Part < 4; ++Part)
8656 if (!Parts[Part])
8657 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8658 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8659 }
8660 // Build a vector out of the various parts and bitcast it back to the original
8661 // type.
8662 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8663 return DAG.getBitcast(VT, NewVec);
8664}
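// Standalone replica (not part of this lowering code) of the getMovIdx logic
// above, which detects whether one quarter of the mask copies a whole 32-bit
// lane from either (bitcast) input.
static int wholeLaneMoveIndex(const int *Mask, int Start, int Length) {
  int MovIdx = -1;
  for (int I = 0; I < Length; ++I) {
    if (Mask[Start + I] < 0)
      continue; // undef lanes do not constrain the choice
    if (Mask[Start + I] % Length != I)
      return -1; // wrong position within the 32-bit lane
    if (MovIdx == -1)
      MovIdx = Mask[Start + I] / Length;
    else if (Mask[Start + I] / Length != MovIdx)
      return -1; // mixes two different source lanes
  }
  return MovIdx;
}
// Example: for a v16i8 shuffle whose first mask quarter is <4,5,6,7>,
// wholeLaneMoveIndex(Mask, 0, 4) returns 1, i.e. i32 lane 1 of the first
// input, so that quarter becomes a single f32 extract above.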
8665
8666static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8667 ArrayRef<int> ShuffleMask,
8668 SelectionDAG &DAG) {
8669 SDValue V1 = Op.getOperand(0);
8670 SDValue V2 = Op.getOperand(1);
8671 EVT VT = Op.getValueType();
8672 unsigned NumElts = VT.getVectorNumElements();
8673
8674 // A one-off identity mask is one that is mostly an identity mask from a
8675 // single source but contains a single element out-of-place, either from a
8676 // different vector or from another position in the same vector. Instead of
8677 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8678 // pair directly.
8679 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8680 int &OffElement) {
8681 OffElement = -1;
8682 int NonUndef = 0;
8683 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8684 if (Mask[i] == -1)
8685 continue;
8686 NonUndef++;
8687 if (Mask[i] != i + BaseOffset) {
8688 if (OffElement == -1)
8689 OffElement = i;
8690 else
8691 return false;
8692 }
8693 }
8694 return NonUndef > 2 && OffElement != -1;
8695 };
8696 int OffElement;
8697 SDValue VInput;
8698 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8699 VInput = V1;
8700 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8701 VInput = V2;
8702 else
8703 return SDValue();
8704
8705 SDLoc dl(Op);
8706 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8707 ? MVT::i32
8708 : VT.getScalarType();
8709 SDValue Elt = DAG.getNode(
8710 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8711 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8712 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8713 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8714 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8715}
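// Standalone replica (not part of this lowering code) of the
// isOneOffIdentityMask check above, followed by a worked example.
static bool isOneOffIdentityMaskExample(const int *Mask, int NumMaskElts,
                                        int BaseOffset, int &OffElement) {
  OffElement = -1;
  int NonUndef = 0;
  for (int I = 0; I < NumMaskElts; ++I) {
    if (Mask[I] == -1)
      continue;
    ++NonUndef;
    if (Mask[I] != I + BaseOffset) {
      if (OffElement != -1)
        return false; // more than one element is out of place
      OffElement = I;
    }
  }
  return NonUndef > 2 && OffElement != -1;
}
// Example: for v4i32 inputs V1/V2 and mask <0,1,6,3>, the mask is an identity
// over V1 except lane 2, which is lane 6 - 4 = 2 of V2, so the lowering above
// emits insert_vector_elt(V1, extract_vector_elt(V2, 2), 2).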
8716
8717static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8718 const ARMSubtarget *ST) {
8719 SDValue V1 = Op.getOperand(0);
8720 SDValue V2 = Op.getOperand(1);
8721 SDLoc dl(Op);
8722 EVT VT = Op.getValueType();
8723 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8724 unsigned EltSize = VT.getScalarSizeInBits();
8725
8726 if (ST->hasMVEIntegerOps() && EltSize == 1)
8727 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8728
8729 // Convert shuffles that are directly supported on NEON to target-specific
8730 // DAG nodes, instead of keeping them as shuffles and matching them again
8731 // during code selection. This is more efficient and avoids the possibility
8732 // of inconsistencies between legalization and selection.
8733 // FIXME: floating-point vectors should be canonicalized to integer vectors
8734 // of the same time so that they get CSEd properly.
8735 ArrayRef<int> ShuffleMask = SVN->getMask();
8736
8737 if (EltSize <= 32) {
8738 if (SVN->isSplat()) {
8739 int Lane = SVN->getSplatIndex();
8740 // If this is undef splat, generate it via "just" vdup, if possible.
8741 if (Lane == -1) Lane = 0;
8742
8743 // Test if V1 is a SCALAR_TO_VECTOR.
8744 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8745 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8746 }
8747 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8748 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8749 // reaches it).
8750 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8751 !isa<ConstantSDNode>(V1.getOperand(0))) {
8752 bool IsScalarToVector = true;
8753 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8754 if (!V1.getOperand(i).isUndef()) {
8755 IsScalarToVector = false;
8756 break;
8757 }
8758 if (IsScalarToVector)
8759 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8760 }
8761 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8762 DAG.getConstant(Lane, dl, MVT::i32));
8763 }
8764
8765 bool ReverseVEXT = false;
8766 unsigned Imm = 0;
8767 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8768 if (ReverseVEXT)
8769 std::swap(V1, V2);
8770 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8771 DAG.getConstant(Imm, dl, MVT::i32));
8772 }
8773
8774 if (isVREVMask(ShuffleMask, VT, 64))
8775 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8776 if (isVREVMask(ShuffleMask, VT, 32))
8777 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8778 if (isVREVMask(ShuffleMask, VT, 16))
8779 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8780
8781 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8782 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8783 DAG.getConstant(Imm, dl, MVT::i32));
8784 }
8785
8786 // Check for Neon shuffles that modify both input vectors in place.
8787 // If both results are used, i.e., if there are two shuffles with the same
8788 // source operands and with masks corresponding to both results of one of
8789 // these operations, DAG memoization will ensure that a single node is
8790 // used for both shuffles.
8791 unsigned WhichResult = 0;
8792 bool isV_UNDEF = false;
8793 if (ST->hasNEON()) {
8794 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8795 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8796 if (isV_UNDEF)
8797 V2 = V1;
8798 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8799 .getValue(WhichResult);
8800 }
8801 }
8802 if (ST->hasMVEIntegerOps()) {
8803 if (isVMOVNMask(ShuffleMask, VT, false, false))
8804 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8805 DAG.getConstant(0, dl, MVT::i32));
8806 if (isVMOVNMask(ShuffleMask, VT, true, false))
8807 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8808 DAG.getConstant(1, dl, MVT::i32));
8809 if (isVMOVNMask(ShuffleMask, VT, true, true))
8810 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8811 DAG.getConstant(1, dl, MVT::i32));
8812 }
8813
8814 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8815 // shuffles that produce a result larger than their operands with:
8816 // shuffle(concat(v1, undef), concat(v2, undef))
8817 // ->
8818 // shuffle(concat(v1, v2), undef)
8819 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8820 //
8821 // This is useful in the general case, but there are special cases where
8822 // native shuffles produce larger results: the two-result ops.
8823 //
8824 // Look through the concat when lowering them:
8825 // shuffle(concat(v1, v2), undef)
8826 // ->
8827 // concat(VZIP(v1, v2):0, :1)
8828 //
8829 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8830 SDValue SubV1 = V1->getOperand(0);
8831 SDValue SubV2 = V1->getOperand(1);
8832 EVT SubVT = SubV1.getValueType();
8833
8834 // We expect these to have been canonicalized to -1.
8835 assert(llvm::all_of(ShuffleMask, [&](int i) {
8836 return i < (int)VT.getVectorNumElements();
8837 }) && "Unexpected shuffle index into UNDEF operand!");
8838
8839 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8840 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8841 if (isV_UNDEF)
8842 SubV2 = SubV1;
8843 assert((WhichResult == 0) &&
8844 "In-place shuffle of concat can only have one result!");
8845 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8846 SubV1, SubV2);
8847 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8848 Res.getValue(1));
8849 }
8850 }
8851 }
8852
8853 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8854 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8855 return V;
8856
8857 for (bool Top : {false, true}) {
8858 for (bool SingleSource : {false, true}) {
8859 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8860 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8861 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8862 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8863 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8864 SingleSource ? V1 : V2);
8865 if (Top) {
8866 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8867 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8868 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8869 }
8870 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8871 }
8872 }
8873 }
8874 }
8875
8876 // If the shuffle is not directly supported and it has 4 elements, use
8877 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8878 unsigned NumElts = VT.getVectorNumElements();
8879 if (NumElts == 4) {
8880 unsigned PFIndexes[4];
8881 for (unsigned i = 0; i != 4; ++i) {
8882 if (ShuffleMask[i] < 0)
8883 PFIndexes[i] = 8;
8884 else
8885 PFIndexes[i] = ShuffleMask[i];
8886 }
8887
8888 // Compute the index in the perfect shuffle table.
8889 unsigned PFTableIndex =
8890 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8891 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8892 unsigned Cost = (PFEntry >> 30);
8893
8894 if (Cost <= 4) {
8895 if (ST->hasNEON())
8896 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8897 else if (isLegalMVEShuffleOp(PFEntry)) {
8898 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8899 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8900 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8901 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8902 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8903 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8904 }
8905 }
8906 }
8907
8908 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8909 if (EltSize >= 32) {
8910 // Do the expansion with floating-point types, since that is what the VFP
8911 // registers are defined to use, and since i64 is not legal.
8912 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8913 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8914 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8915 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8916 SmallVector<SDValue, 8> Ops;
8917 for (unsigned i = 0; i < NumElts; ++i) {
8918 if (ShuffleMask[i] < 0)
8919 Ops.push_back(DAG.getUNDEF(EltVT));
8920 else
8921 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8922 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8923 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8924 dl, MVT::i32)));
8925 }
8926 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8927 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8928 }
8929
8930 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8931 isReverseMask(ShuffleMask, VT))
8932 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8933
8934 if (ST->hasNEON() && VT == MVT::v8i8)
8935 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8936 return NewOp;
8937
8938 if (ST->hasMVEIntegerOps())
8939 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8940 return NewOp;
8941
8942 return SDValue();
8943}
8944
8945static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8946 const ARMSubtarget *ST) {
8947 EVT VecVT = Op.getOperand(0).getValueType();
8948 SDLoc dl(Op);
8949
8950 assert(ST->hasMVEIntegerOps() &&
8951 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8952
8953 SDValue Conv =
8954 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8955 unsigned Lane = Op.getConstantOperandVal(2);
8956 unsigned LaneWidth =
8957 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8958 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8959 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8960 Op.getOperand(1), DAG.getValueType(MVT::i1));
8961 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8962 DAG.getConstant(~Mask, dl, MVT::i32));
8963 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8964}
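// Standalone illustration (not part of this lowering code) of the lane-mask
// arithmetic above. An MVE v8i1 predicate occupies 16 bits of the predicate
// register, i.e. LaneWidth = 16 / 8 = 2 bits per lane, so inserting into lane
// L rewrites exactly bits [2*L, 2*L+1].
#include <cstdint>

static uint32_t predicateLaneMask(unsigned Lane, unsigned LaneWidthBits) {
  return ((1u << LaneWidthBits) - 1) << (Lane * LaneWidthBits);
}
// predicateLaneMask(3, 2) == 0xC0: the BFI above preserves every other bit of
// the i32-cast predicate and writes the sign-extended element into those two.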
8965
8966SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8967 SelectionDAG &DAG) const {
8968 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8969 SDValue Lane = Op.getOperand(2);
8970 if (!isa<ConstantSDNode>(Lane))
8971 return SDValue();
8972
8973 SDValue Elt = Op.getOperand(1);
8974 EVT EltVT = Elt.getValueType();
8975
8976 if (Subtarget->hasMVEIntegerOps() &&
8977 Op.getValueType().getScalarSizeInBits() == 1)
8978 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8979
8980 if (getTypeAction(*DAG.getContext(), EltVT) ==
8981 TargetLowering::TypePromoteFloat) {
8982 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8983 // but the type system will try to do that if we don't intervene.
8984 // Reinterpret any such vector-element insertion as one with the
8985 // corresponding integer types.
8986
8987 SDLoc dl(Op);
8988
8989 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8990 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8991 TargetLowering::TypePromoteFloat);
8992
8993 SDValue VecIn = Op.getOperand(0);
8994 EVT VecVT = VecIn.getValueType();
8995 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8996 VecVT.getVectorNumElements());
8997
8998 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8999 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
9000 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
9001 IVecIn, IElt, Lane);
9002 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
9003 }
9004
9005 return Op;
9006}
9007
9008static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9009 const ARMSubtarget *ST) {
9010 EVT VecVT = Op.getOperand(0).getValueType();
9011 SDLoc dl(Op);
9012
9013 assert(ST->hasMVEIntegerOps() &&
9014 "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
9015
9016 SDValue Conv =
9017 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9018 unsigned Lane = Op.getConstantOperandVal(1);
9019 unsigned LaneWidth =
9020 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9021 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9022 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9023 return Shift;
9024}
9025
9026static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9027 const ARMSubtarget *ST) {
9028 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9029 SDValue Lane = Op.getOperand(1);
9030 if (!isa<ConstantSDNode>(Lane))
9031 return SDValue();
9032
9033 SDValue Vec = Op.getOperand(0);
9034 EVT VT = Vec.getValueType();
9035
9036 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9037 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9038
9039 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9040 SDLoc dl(Op);
9041 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9042 }
9043
9044 return Op;
9045}
9046
9047static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9048 const ARMSubtarget *ST) {
9049 SDLoc dl(Op);
9050 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9051 "Unexpected custom CONCAT_VECTORS lowering");
9052 assert(isPowerOf2_32(Op.getNumOperands()) &&
9053 "Unexpected custom CONCAT_VECTORS lowering");
9054 assert(ST->hasMVEIntegerOps() &&
9055 "CONCAT_VECTORS lowering only supported for MVE");
9056
9057 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9058 EVT Op1VT = V1.getValueType();
9059 EVT Op2VT = V2.getValueType();
9060 assert(Op1VT == Op2VT && "Operand types don't match!");
9061 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9062 "Unexpected i1 concat operations!");
9063 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9064
9065 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9066 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9067
9068 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9069 // promoted to v8i16, etc.
9070 MVT ElType =
9071 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9072 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9073
9074 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9075 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9076 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9077 // ConcatVT.
9078 SDValue ConVec =
9079 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9080 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9081 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9082 }
9083
9084 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9085 // to be the right size for the destination. For example, if Op1 is v4i1
9086 // then the promoted vector is v4i32. The result of concatenation gives a
9087 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9088 // needs truncating to i16 and inserting in the result.
9089 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9090 EVT NewVT = NewV.getValueType();
9091 EVT ConcatVT = ConVec.getValueType();
9092 unsigned ExtScale = 1;
9093 if (NewVT == MVT::v2f64) {
9094 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9095 ExtScale = 2;
9096 }
9097 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9098 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9099 DAG.getIntPtrConstant(i * ExtScale, dl));
9100 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9101 DAG.getConstant(j, dl, MVT::i32));
9102 }
9103 return ConVec;
9104 };
9105 unsigned j = 0;
9106 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9107 ConVec = ExtractInto(NewV1, ConVec, j);
9108 ConVec = ExtractInto(NewV2, ConVec, j);
9109
9110 // Now return the result of comparing the subvector with zero, which will
9111 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9112 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9113 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9114 };
9115
9116 // Concat each pair of subvectors and pack into the lower half of the array.
9117 SmallVector<SDValue> ConcatOps(Op->ops());
9118 while (ConcatOps.size() > 1) {
9119 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9120 SDValue V1 = ConcatOps[I];
9121 SDValue V2 = ConcatOps[I + 1];
9122 ConcatOps[I / 2] = ConcatPair(V1, V2);
9123 }
9124 ConcatOps.resize(ConcatOps.size() / 2);
9125 }
9126 return ConcatOps[0];
9127}
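// Illustrative example (not additional lowering code): for concat(v8i1 A,
// v8i1 B), both operands are promoted to v8i16 above, the MVETRUNC packs them
// into a single v16i8, and the VCMPZ-against-zero (NE) turns that byte vector
// back into the v16i1 result predicate.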
9128
9129static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9130 const ARMSubtarget *ST) {
9131 EVT VT = Op->getValueType(0);
9132 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9133 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9134
9135 // The only time a CONCAT_VECTORS operation can have legal types is when
9136 // two 64-bit vectors are concatenated to a 128-bit vector.
9137 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9138 "unexpected CONCAT_VECTORS");
9139 SDLoc dl(Op);
9140 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9141 SDValue Op0 = Op.getOperand(0);
9142 SDValue Op1 = Op.getOperand(1);
9143 if (!Op0.isUndef())
9144 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9145 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9146 DAG.getIntPtrConstant(0, dl));
9147 if (!Op1.isUndef())
9148 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9149 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9150 DAG.getIntPtrConstant(1, dl));
9151 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9152}
9153
9154static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9155 const ARMSubtarget *ST) {
9156 SDValue V1 = Op.getOperand(0);
9157 SDValue V2 = Op.getOperand(1);
9158 SDLoc dl(Op);
9159 EVT VT = Op.getValueType();
9160 EVT Op1VT = V1.getValueType();
9161 unsigned NumElts = VT.getVectorNumElements();
9162 unsigned Index = V2->getAsZExtVal();
9163
9164 assert(VT.getScalarSizeInBits() == 1 &&
9165 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9166 assert(ST->hasMVEIntegerOps() &&
9167 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9168
9169 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9170
9171 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9172 // promoted to v8i16, etc.
9173
9174 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9175
9176 if (NumElts == 2) {
9177 EVT SubVT = MVT::v4i32;
9178 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9179 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9180 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9181 DAG.getIntPtrConstant(i, dl));
9182 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9183 DAG.getConstant(j, dl, MVT::i32));
9184 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9185 DAG.getConstant(j + 1, dl, MVT::i32));
9186 }
9187 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9188 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9189 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9190 }
9191
9192 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9193 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9194 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9195 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9196 DAG.getIntPtrConstant(i, dl));
9197 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9198 DAG.getConstant(j, dl, MVT::i32));
9199 }
9200
9201 // Now return the result of comparing the subvector with zero,
9202 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9203 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9204 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9205}
9206
9207// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9208static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9209 const ARMSubtarget *ST) {
9210 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9211 EVT VT = N->getValueType(0);
9212 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9213 "Expected a vector i1 type!");
9214 SDValue Op = N->getOperand(0);
9215 EVT FromVT = Op.getValueType();
9216 SDLoc DL(N);
9217
9218 SDValue And =
9219 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9220 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9221 DAG.getCondCode(ISD::SETNE));
9222}
9223
9224static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9225 const ARMSubtarget *Subtarget) {
9226 if (!Subtarget->hasMVEIntegerOps())
9227 return SDValue();
9228
9229 EVT ToVT = N->getValueType(0);
9230 if (ToVT.getScalarType() == MVT::i1)
9231 return LowerTruncatei1(N, DAG, Subtarget);
9232
9233 // MVE does not have a single instruction to perform the truncation of a v4i32
9234 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9235 // Most of the instructions in MVE follow the 'Beats' system, where moving
9236 // values from different lanes is usually something that the instructions
9237 // avoid.
9238 //
9239 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9240 // which take the top/bottom half of a larger lane and extend it (or do the
9241 // opposite, truncating into the top/bottom lane from a larger lane). Note
9242 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9243 // bottom 16bits from each vector lane. This works really well with T/B
9244 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9245 // to be reordered.
9246 //
9247 // But truncates and sext/zext are always going to be fairly common from llvm.
9248 // We have several options for how to deal with them:
9249 // - Wherever possible combine them into an instruction that makes them
9250 // "free". This includes loads/stores, which can perform the trunc as part
9251 // of the memory operation. Or certain shuffles that can be turned into
9252 // VMOVN/VMOVL.
9253 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9254 // trunc(mul(sext(a), sext(b))) may become
9255 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9256 // this case can use VMULL). This is performed in the
9257 // MVELaneInterleavingPass.
9258 // - Otherwise we have an option. By default we would expand the
9259 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9260 // registers. One for each vector lane in the vector. This can obviously be
9261 // very expensive.
9262 // - The other option is to use the fact that loads/store can extend/truncate
9263 // to turn a trunc into two truncating stack stores and a stack reload. This
9264 // becomes 3 back-to-back memory operations, but at least that is less than
9265 // all the insert/extracts.
9266 //
9267 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9268 // are either optimized where they can be, or eventually lowered into stack
9269 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9270 // too early, where other instructions would be better, and stops us from
9271 // having to reconstruct multiple buildvector shuffles into loads/stores.
9272 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9273 return SDValue();
9274 EVT FromVT = N->getOperand(0).getValueType();
9275 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9276 return SDValue();
9277
9278 SDValue Lo, Hi;
9279 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9280 SDLoc DL(N);
9281 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9282}
9283
9284static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9285 const ARMSubtarget *Subtarget) {
9286 if (!Subtarget->hasMVEIntegerOps())
9287 return SDValue();
9288
9289 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9290
9291 EVT ToVT = N->getValueType(0);
9292 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9293 return SDValue();
9294 SDValue Op = N->getOperand(0);
9295 EVT FromVT = Op.getValueType();
9296 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9297 return SDValue();
9298
9299 SDLoc DL(N);
9300 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9301 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9302 ExtVT = MVT::v8i16;
9303
9304 unsigned Opcode =
9305 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9306 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9307 SDValue Ext1 = Ext.getValue(1);
9308
9309 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9310 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9311 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9312 }
9313
9314 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9315}
9316
9317/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9318/// element has been zero/sign-extended, depending on the isSigned parameter,
9319/// from an integer type half its size.
9320static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9321 bool isSigned) {
9322 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9323 EVT VT = N->getValueType(0);
9324 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9325 SDNode *BVN = N->getOperand(0).getNode();
9326 if (BVN->getValueType(0) != MVT::v4i32 ||
9327 BVN->getOpcode() != ISD::BUILD_VECTOR)
9328 return false;
9329 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9330 unsigned HiElt = 1 - LoElt;
9331 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9332 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9333 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9334 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9335 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9336 return false;
9337 if (isSigned) {
9338 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9339 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9340 return true;
9341 } else {
9342 if (Hi0->isZero() && Hi1->isZero())
9343 return true;
9344 }
9345 return false;
9346 }
9347
9348 if (N->getOpcode() != ISD::BUILD_VECTOR)
9349 return false;
9350
9351 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9352 SDNode *Elt = N->getOperand(i).getNode();
9353 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9354 unsigned EltSize = VT.getScalarSizeInBits();
9355 unsigned HalfSize = EltSize / 2;
9356 if (isSigned) {
9357 if (!isIntN(HalfSize, C->getSExtValue()))
9358 return false;
9359 } else {
9360 if (!isUIntN(HalfSize, C->getZExtValue()))
9361 return false;
9362 }
9363 continue;
9364 }
9365 return false;
9366 }
9367
9368 return true;
9369}
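// Standalone sketch (not part of this lowering code) of the per-element test
// above: a constant element matches the "extended from half width" pattern if
// it is representable in half the element bits, signed or unsigned depending
// on the extension being looked for.
#include <cstdint>

static bool fitsInHalfWidth(int64_t V, unsigned EltBits, bool IsSigned) {
  unsigned HalfBits = EltBits / 2;
  if (IsSigned) {
    int64_t Lo = -(INT64_C(1) << (HalfBits - 1));
    int64_t Hi = (INT64_C(1) << (HalfBits - 1)) - 1;
    return V >= Lo && V <= Hi;
  }
  return static_cast<uint64_t>(V) < (UINT64_C(1) << HalfBits);
}
// e.g. fitsInHalfWidth(-3, 16, /*IsSigned=*/true) is true (fits in i8), while
// fitsInHalfWidth(300, 16, /*IsSigned=*/false) is false (does not fit in u8).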
9370
9371/// isSignExtended - Check if a node is a vector value that is sign-extended
9372/// or a constant BUILD_VECTOR with sign-extended elements.
9373static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9374 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9375 return true;
9376 if (isExtendedBUILD_VECTOR(N, DAG, true))
9377 return true;
9378 return false;
9379}
9380
9381/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9382/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9383static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9384 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9385 ISD::isZEXTLoad(N))
9386 return true;
9387 if (isExtendedBUILD_VECTOR(N, DAG, false))
9388 return true;
9389 return false;
9390}
9391
9392static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9393 if (OrigVT.getSizeInBits() >= 64)
9394 return OrigVT;
9395
9396 assert(OrigVT.isSimple() && "Expecting a simple value type");
9397
9398 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9399 switch (OrigSimpleTy) {
9400 default: llvm_unreachable("Unexpected Vector Type");
9401 case MVT::v2i8:
9402 case MVT::v2i16:
9403 return MVT::v2i32;
9404 case MVT::v4i8:
9405 return MVT::v4i16;
9406 }
9407}
9408
9409/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9410/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9411/// We insert the required extension here to get the vector to fill a D register.
9412static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9413 const EVT &OrigTy,
9414 const EVT &ExtTy,
9415 unsigned ExtOpcode) {
9416 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9417 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9418 // 64-bits we need to insert a new extension so that it will be 64-bits.
9419 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9420 if (OrigTy.getSizeInBits() >= 64)
9421 return N;
9422
9423 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9424 EVT NewVT = getExtensionTo64Bits(OrigTy);
9425
9426 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9427}
9428
9429/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9430/// does not do any sign/zero extension. If the original vector is less
9431/// than 64 bits, an appropriate extension will be added after the load to
9432/// reach a total size of 64 bits. We have to add the extension separately
9433/// because ARM does not have a sign/zero extending load for vectors.
9434static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9435 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9436
9437 // The load already has the right type.
9438 if (ExtendedTy == LD->getMemoryVT())
9439 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9440 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9441 LD->getMemOperand()->getFlags());
9442
9443 // We need to create a zextload/sextload. We cannot just create a load
9444 // followed by a zext/zext node because LowerMUL is also run during normal
9445 // operation legalization where we can't create illegal types.
9446 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9447 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9448 LD->getMemoryVT(), LD->getAlign(),
9449 LD->getMemOperand()->getFlags());
9450}
9451
9452/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9453/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9454/// the unextended value. The unextended vector should be 64 bits so that it can
9455/// be used as an operand to a VMULL instruction. If the original vector size
9456/// before extension is less than 64 bits we add an extension to resize
9457/// the vector to 64 bits.
9458static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9459 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9460 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9461 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9462 N->getOperand(0)->getValueType(0),
9463 N->getValueType(0),
9464 N->getOpcode());
9465
9466 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9467 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9468 "Expected extending load");
9469
9470 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9471 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9472 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9473 SDValue extLoad =
9474 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9475 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9476
9477 return newLoad;
9478 }
9479
9480 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9481 // have been legalized as a BITCAST from v4i32.
9482 if (N->getOpcode() == ISD::BITCAST) {
9483 SDNode *BVN = N->getOperand(0).getNode();
9484 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9485 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9486 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9487 return DAG.getBuildVector(
9488 MVT::v2i32, SDLoc(N),
9489 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9490 }
9491 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9492 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9493 EVT VT = N->getValueType(0);
9494 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9495 unsigned NumElts = VT.getVectorNumElements();
9496 MVT TruncVT = MVT::getIntegerVT(EltSize);
9497 SmallVector<SDValue, 8> Ops;
9498 SDLoc dl(N);
9499 for (unsigned i = 0; i != NumElts; ++i) {
9500 const APInt &CInt = N->getConstantOperandAPInt(i);
9501 // Element types smaller than 32 bits are not legal, so use i32 elements.
9502 // The values are implicitly truncated so sext vs. zext doesn't matter.
9503 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9504 }
9505 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9506}
9507
9508static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9509 unsigned Opcode = N->getOpcode();
9510 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9511 SDNode *N0 = N->getOperand(0).getNode();
9512 SDNode *N1 = N->getOperand(1).getNode();
9513 return N0->hasOneUse() && N1->hasOneUse() &&
9514 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9515 }
9516 return false;
9517}
9518
9519static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9520 unsigned Opcode = N->getOpcode();
9521 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9522 SDNode *N0 = N->getOperand(0).getNode();
9523 SDNode *N1 = N->getOperand(1).getNode();
9524 return N0->hasOneUse() && N1->hasOneUse() &&
9525 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9526 }
9527 return false;
9528}
9529
9530static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9531 // Multiplications are only custom-lowered for 128-bit vectors so that
9532 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
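  // For example, (mul (sext v4i16), (sext v4i16)) producing a v4i32 maps onto
  // a single VMULL.S16, and the zero-extended form onto VMULL.U16; the checks
  // below detect those patterns.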
9533 EVT VT = Op.getValueType();
9534 assert(VT.is128BitVector() && VT.isInteger() &&
9535 "unexpected type for custom-lowering ISD::MUL");
9536 SDNode *N0 = Op.getOperand(0).getNode();
9537 SDNode *N1 = Op.getOperand(1).getNode();
9538 unsigned NewOpc = 0;
9539 bool isMLA = false;
9540 bool isN0SExt = isSignExtended(N0, DAG);
9541 bool isN1SExt = isSignExtended(N1, DAG);
9542 if (isN0SExt && isN1SExt)
9543 NewOpc = ARMISD::VMULLs;
9544 else {
9545 bool isN0ZExt = isZeroExtended(N0, DAG);
9546 bool isN1ZExt = isZeroExtended(N1, DAG);
9547 if (isN0ZExt && isN1ZExt)
9548 NewOpc = ARMISD::VMULLu;
9549 else if (isN1SExt || isN1ZExt) {
9550 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9551 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9552 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9553 NewOpc = ARMISD::VMULLs;
9554 isMLA = true;
9555 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9556 NewOpc = ARMISD::VMULLu;
9557 isMLA = true;
9558 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9559 std::swap(N0, N1);
9560 NewOpc = ARMISD::VMULLu;
9561 isMLA = true;
9562 }
9563 }
9564
9565 if (!NewOpc) {
9566 if (VT == MVT::v2i64)
9567 // Fall through to expand this. It is not legal.
9568 return SDValue();
9569 else
9570 // Other vector multiplications are legal.
9571 return Op;
9572 }
9573 }
9574
9575 // Legalize to a VMULL instruction.
9576 SDLoc DL(Op);
9577 SDValue Op0;
9578 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9579 if (!isMLA) {
9580 Op0 = SkipExtensionForVMULL(N0, DAG);
9581 assert(Op0.getValueType().is64BitVector() &&
9582 Op1.getValueType().is64BitVector() &&
9583 "unexpected types for extended operands to VMULL");
9584 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9585 }
9586
9587 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9588 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9589 // vmull q0, d4, d6
9590 // vmlal q0, d5, d6
9591 // is faster than
9592 // vaddl q0, d4, d5
9593 // vmovl q1, d6
9594 // vmul q0, q0, q1
9595 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9596 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9597 EVT Op1VT = Op1.getValueType();
9598 return DAG.getNode(N0->getOpcode(), DL, VT,
9599 DAG.getNode(NewOpc, DL, VT,
9600 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9601 DAG.getNode(NewOpc, DL, VT,
9602 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9603}
9604
9605static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9606 SelectionDAG &DAG) {
9607 // TODO: Should this propagate fast-math-flags?
9608
9609 // Convert to float
9610 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9611 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9612 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9613 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9614 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9615 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9616 // Get reciprocal estimate.
9617 // float4 recip = vrecpeq_f32(yf);
9618 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9619 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9620 Y);
9621 // Because char has a smaller range than uchar, we can actually get away
9622 // without any newton steps. This requires that we use a weird bias
9623 // of 0xb000, however (again, this has been exhaustively tested).
9624 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
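  // In other words: the product of x with the raw reciprocal estimate can read
  // slightly low, and adding 0xb000 to the float's bit pattern nudges it up by
  // a small relative amount before the truncating convert, which (per the note
  // above) was validated exhaustively for the i8/i8 input range.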
9625 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9626 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9627 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9628 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9629 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9630 // Convert back to short.
9631 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9632 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9633 return X;
9634}
9635
9636static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9637 SelectionDAG &DAG) {
9638 // TODO: Should this propagate fast-math-flags?
9639
9640 SDValue N2;
9641 // Convert to float.
9642 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9643 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9644 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9645 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9646 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9647 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9648
9649 // Use reciprocal estimate and one refinement step.
9650 // float4 recip = vrecpeq_f32(yf);
9651 // recip *= vrecpsq_f32(yf, recip);
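  // VRECPS(d, x) computes 2 - d*x, so "recip *= vrecps(d, recip)" is one
  // Newton-Raphson step x' = x * (2 - d*x) toward 1/d.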
9652 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9653 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9654 N1);
9655 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9656 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9657 N1, N2);
9658 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9659 // Because short has a smaller range than ushort, we can actually get away
9660 // with only a single newton step. This requires that we use a weird bias
9661 // of 0x89, however (again, this has been exhaustively tested).
9662 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9663 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9664 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9665 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9666 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9667 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9668 // Convert back to integer and return.
9669 // return vmovn_s32(vcvt_s32_f32(result));
9670 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9671 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9672 return N0;
9673}
9674
9675static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9676 const ARMSubtarget *ST) {
9677 EVT VT = Op.getValueType();
9678 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9679 "unexpected type for custom-lowering ISD::SDIV");
9680
9681 SDLoc dl(Op);
9682 SDValue N0 = Op.getOperand(0);
9683 SDValue N1 = Op.getOperand(1);
9684 SDValue N2, N3;
9685
9686 if (VT == MVT::v8i8) {
9687 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9688 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9689
9690 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9691 DAG.getIntPtrConstant(4, dl));
9692 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9693 DAG.getIntPtrConstant(4, dl));
9694 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9695 DAG.getIntPtrConstant(0, dl));
9696 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9697 DAG.getIntPtrConstant(0, dl));
9698
9699 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9700 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9701
9702 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9703 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9704
9705 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9706 return N0;
9707 }
9708 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9709}
9710
9711static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9712 const ARMSubtarget *ST) {
9713 // TODO: Should this propagate fast-math-flags?
9714 EVT VT = Op.getValueType();
9715 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9716 "unexpected type for custom-lowering ISD::UDIV");
9717
9718 SDLoc dl(Op);
9719 SDValue N0 = Op.getOperand(0);
9720 SDValue N1 = Op.getOperand(1);
9721 SDValue N2, N3;
9722
9723 if (VT == MVT::v8i8) {
9724 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9725 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9726
9727 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9728 DAG.getIntPtrConstant(4, dl));
9729 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9730 DAG.getIntPtrConstant(4, dl));
9731 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9732 DAG.getIntPtrConstant(0, dl));
9733 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9734 DAG.getIntPtrConstant(0, dl));
9735
9736 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9737 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9738
9739 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9740 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9741
9742 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9743 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9744 MVT::i32),
9745 N0);
9746 return N0;
9747 }
9748
9749 // v4i16 udiv ... Convert to float.
9750 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9751 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9752 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9753 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9754 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9755 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9756
9757 // Use reciprocal estimate and two refinement steps.
9758 // float4 recip = vrecpeq_f32(yf);
9759 // recip *= vrecpsq_f32(yf, recip);
9760 // recip *= vrecpsq_f32(yf, recip);
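  // Each VRECPS step is a Newton-Raphson iteration x' = x * (2 - d*x), which
  // roughly doubles the number of accurate bits, so two steps on top of the
  // VRECPE estimate (plus the +2 ulp bias below) cover the 16-bit quotients.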
9761 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9762 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9763 BN1);
9764 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9765 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9766 BN1, N2);
9767 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9768 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9769 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9770 BN1, N2);
9771 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9772 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9773 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9774 // and that it will never cause us to return an answer too large).
9775 // float4 result = as_float4(as_int4(xf*recip) + 2);
9776 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9777 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9778 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9779 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9780 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9781 // Convert back to integer and return.
9782 // return vmovn_u32(vcvt_s32_f32(result));
9783 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9784 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9785 return N0;
9786}
9787
9788static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9789 SDNode *N = Op.getNode();
9790 EVT VT = N->getValueType(0);
9791 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9792
9793 SDValue Carry = Op.getOperand(2);
9794
9795 SDLoc DL(Op);
9796
9797 SDValue Result;
9798 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9799 // This converts the boolean value carry into the carry flag.
9800 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9801
9802 // Do the addition proper using the carry flag we wanted.
9803 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9804 Op.getOperand(1), Carry);
9805
9806 // Now convert the carry flag into a boolean value.
9807 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9808 } else {
9809 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9810 // have to invert the carry first.
9811 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9812 DAG.getConstant(1, DL, MVT::i32), Carry);
9813 // This converts the boolean value carry into the carry flag.
9814 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9815
9816 // Do the subtraction proper using the carry flag we wanted.
9817 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9818 Op.getOperand(1), Carry);
9819
9820 // Now convert the carry flag into a boolean value.
9821 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9822 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9823 // by ISD::USUBO_CARRY, so compute 1 - C.
9824 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9825 DAG.getConstant(1, DL, MVT::i32), Carry);
9826 }
9827
9828 // Return both values.
9829 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9830}
9831
9832SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9833 assert(Subtarget->isTargetDarwin());
9834
9835 // For iOS, we want to call an alternative entry point: __sincos_stret,
9836 // return values are passed via sret.
9837 SDLoc dl(Op);
9838 SDValue Arg = Op.getOperand(0);
9839 EVT ArgVT = Arg.getValueType();
9840 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9841 auto PtrVT = getPointerTy(DAG.getDataLayout());
9842
9843 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9844
9845 // Pair of floats / doubles used to pass the result.
9846 Type *RetTy = StructType::get(ArgTy, ArgTy);
9847 auto &DL = DAG.getDataLayout();
9848
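  // Only the APCS path below passes the result through a hidden sret stack
  // slot; otherwise the {sin, cos} pair comes back as the call's own return
  // value and is used directly.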
9849 ArgListTy Args;
9850 bool ShouldUseSRet = getTM().isAPCS_ABI();
9851 SDValue SRet;
9852 if (ShouldUseSRet) {
9853 // Create stack object for sret.
9854 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9855 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9856 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9857 SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9858
9860 Entry.IsSExt = false;
9861 Entry.IsZExt = false;
9862 Entry.IsSRet = true;
9863 Args.push_back(Entry);
9864 RetTy = Type::getVoidTy(*DAG.getContext());
9865 }
9866
9867 Args.emplace_back(Arg, ArgTy);
9868
9869 RTLIB::Libcall LC =
9870 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9871 const char *LibcallName = getLibcallName(LC);
9872 CallingConv::ID CC = getLibcallCallingConv(LC);
9873 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9874
9875 TargetLowering::CallLoweringInfo CLI(DAG);
9876 CLI.setDebugLoc(dl)
9877 .setChain(DAG.getEntryNode())
9878 .setCallee(CC, RetTy, Callee, std::move(Args))
9879 .setDiscardResult(ShouldUseSRet);
9880 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9881
9882 if (!ShouldUseSRet)
9883 return CallResult.first;
9884
9885 SDValue LoadSin =
9886 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9887
9888 // Address of cos field.
9889 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9890 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9891 SDValue LoadCos =
9892 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9893
9894 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9895 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9896 LoadSin.getValue(0), LoadCos.getValue(0));
9897}
9898
9899SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9900 bool Signed,
9901 SDValue &Chain) const {
9902 EVT VT = Op.getValueType();
9903 assert((VT == MVT::i32 || VT == MVT::i64) &&
9904 "unexpected type for custom lowering DIV");
9905 SDLoc dl(Op);
9906
9907 const auto &DL = DAG.getDataLayout();
9908 RTLIB::Libcall LC;
9909 if (Signed)
9910 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9911 else
9912 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9913
9914 const char *Name = getLibcallName(LC);
9915 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9916
9917 ArgListTy Args;
9918
9919 for (auto AI : {1, 0}) {
9920 SDValue Operand = Op.getOperand(AI);
9921 Args.emplace_back(Operand,
9922 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9923 }
9924
9925 CallLoweringInfo CLI(DAG);
9926 CLI.setDebugLoc(dl)
9927 .setChain(Chain)
9928 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
9929 ES, std::move(Args));
9930
9931 return LowerCallTo(CLI).first;
9932}
9933
9934// This is a code size optimisation: return the original SDIV node to
9935// DAGCombiner when we don't want to expand SDIV into a sequence of
9936// instructions, and an empty node otherwise which will cause the
9937// SDIV to be expanded in DAGCombine.
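// That is: returning SDValue(N, 0) keeps the hardware SDIV, while returning an
// empty SDValue() lets the generic power-of-two expansion proceed.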
9938SDValue
9939ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9940 SelectionDAG &DAG,
9941 SmallVectorImpl<SDNode *> &Created) const {
9942 // TODO: Support SREM
9943 if (N->getOpcode() != ISD::SDIV)
9944 return SDValue();
9945
9946 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9947 const bool MinSize = ST.hasMinSize();
9948 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9949 : ST.hasDivideInARMMode();
9950
9951 // Don't touch vector types; rewriting this may lead to scalarizing
9952 // the int divs.
9953 if (N->getOperand(0).getValueType().isVector())
9954 return SDValue();
9955
9956 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9957 // hwdiv support for this to be really profitable.
9958 if (!(MinSize && HasDivide))
9959 return SDValue();
9960
9961 // ARM mode is a bit simpler than Thumb: we can handle large power
9962 // of 2 immediates with 1 mov instruction; no further checks required,
9963 // just return the sdiv node.
9964 if (!ST.isThumb())
9965 return SDValue(N, 0);
9966
9967 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9968 // and thus lose the code size benefits of a MOVS that requires only 2 bytes.
9969 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9970 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9971 if (Divisor.sgt(128))
9972 return SDValue();
9973
9974 return SDValue(N, 0);
9975}
9976
9977SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9978 bool Signed) const {
9979 assert(Op.getValueType() == MVT::i32 &&
9980 "unexpected type for custom lowering DIV");
9981 SDLoc dl(Op);
9982
9983 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9984 DAG.getEntryNode(), Op.getOperand(1));
9985
9986 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9987}
9988
9989static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9990 SDLoc DL(N);
9991 SDValue Op = N->getOperand(1);
9992 if (N->getValueType(0) == MVT::i32)
9993 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9994 SDValue Lo, Hi;
9995 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9996 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9997 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9998}
9999
10000void ARMTargetLowering::ExpandDIV_Windows(
10001 SDValue Op, SelectionDAG &DAG, bool Signed,
10002 SmallVectorImpl<SDValue> &Results) const {
10003 const auto &DL = DAG.getDataLayout();
10004
10005 assert(Op.getValueType() == MVT::i64 &&
10006 "unexpected type for custom lowering DIV");
10007 SDLoc dl(Op);
10008
10009 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
10010
10011 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
10012
10013 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
10014 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
10015 DAG.getConstant(32, dl, getPointerTy(DL)));
10016 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
10017
10018 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10019}
10020
10021static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
10022 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10023 EVT MemVT = LD->getMemoryVT();
10024 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10025 MemVT == MVT::v16i1) &&
10026 "Expected a predicate type!");
10027 assert(MemVT == Op.getValueType());
10028 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10029 "Expected a non-extending load");
10030 assert(LD->isUnindexed() && "Expected an unindexed load");
10031
10032 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
10033 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10034 // need to make sure that the 8/4/2 bits are actually loaded into the
10035 // correct place, which means loading the value and then shuffling it into
10036 // the bottom bits of the predicate.
10037 // Equally, a VLDR for a v16i1 will actually load 32 bits (so it would be
10038 // incorrect for BE).
10039 // As for BE, the rest of LLVM apparently assumes the reverse order to a
10040 // natural VMSR(load), so the value needs to be reversed.
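  // (For reference: VPR.P0 holds one predicate bit per byte lane, 16 bits in
  // total, so a v8i1 element covers 2 bits and a v4i1/v2i1 element 4/8 bits;
  // the PREDICATE_CAST and EXTRACT_SUBVECTOR below model that layout.)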
10041
10042 SDLoc dl(Op);
10043 SDValue Load = DAG.getExtLoad(
10044 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10045 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10046 LD->getMemOperand());
10047 SDValue Val = Load;
10048 if (DAG.getDataLayout().isBigEndian())
10049 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10050 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10051 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10052 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10053 if (MemVT != MVT::v16i1)
10054 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10055 DAG.getConstant(0, dl, MVT::i32));
10056 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10057}
10058
10059void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10060 SelectionDAG &DAG) const {
10061 LoadSDNode *LD = cast<LoadSDNode>(N);
10062 EVT MemVT = LD->getMemoryVT();
10063 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10064
10065 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10066 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10067 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10068 SDLoc dl(N);
10069 SDValue Result = DAG.getMemIntrinsicNode(
10070 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10071 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10072 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10073 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10074 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10075 Results.append({Pair, Result.getValue(2)});
10076 }
10077}
10078
10079static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10080 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10081 EVT MemVT = ST->getMemoryVT();
10082 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10083 MemVT == MVT::v16i1) &&
10084 "Expected a predicate type!");
10085 assert(MemVT == ST->getValue().getValueType());
10086 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10087 assert(ST->isUnindexed() && "Expected an unindexed store");
10088
10089 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10090 // top bits unset and a scalar store.
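  // That is: widen to a v16i1 BUILD_VECTOR with the unused top lanes undef,
  // PREDICATE_CAST it to an i32 of raw predicate bits, and let the truncating
  // store below write back only MemVT.getSizeInBits() of them.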
10091 SDLoc dl(Op);
10092 SDValue Build = ST->getValue();
10093 if (MemVT != MVT::v16i1) {
10094 SmallVector<SDValue, 16> Ops;
10095 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10096 unsigned Elt = DAG.getDataLayout().isBigEndian()
10097 ? MemVT.getVectorNumElements() - I - 1
10098 : I;
10099 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10100 DAG.getConstant(Elt, dl, MVT::i32)));
10101 }
10102 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10103 Ops.push_back(DAG.getUNDEF(MVT::i32));
10104 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10105 }
10106 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10107 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10108 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10109 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10110 DAG.getConstant(16, dl, MVT::i32));
10111 return DAG.getTruncStore(
10112 ST->getChain(), dl, GRP, ST->getBasePtr(),
10113 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10114 ST->getMemOperand());
10115}
10116
10117static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10118 const ARMSubtarget *Subtarget) {
10119 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10120 EVT MemVT = ST->getMemoryVT();
10121 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10122
10123 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10124 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10125 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10126 SDNode *N = Op.getNode();
10127 SDLoc dl(N);
10128
10129 SDValue Lo = DAG.getNode(
10130 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10131 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10132 MVT::i32));
10133 SDValue Hi = DAG.getNode(
10134 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10135 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10136 MVT::i32));
10137
10138 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10139 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10140 MemVT, ST->getMemOperand());
10141 } else if (Subtarget->hasMVEIntegerOps() &&
10142 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10143 MemVT == MVT::v16i1))) {
10144 return LowerPredicateStore(Op, DAG);
10145 }
10146
10147 return SDValue();
10148}
10149
10150static bool isZeroVector(SDValue N) {
10151 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10152 (N->getOpcode() == ARMISD::VMOVIMM &&
10153 isNullConstant(N->getOperand(0))));
10154}
10155
10156static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10157 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10158 MVT VT = Op.getSimpleValueType();
10159 SDValue Mask = N->getMask();
10160 SDValue PassThru = N->getPassThru();
10161 SDLoc dl(Op);
10162
10163 if (isZeroVector(PassThru))
10164 return Op;
10165
10166 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10167 // zero too, and other values are lowered to a select.
10168 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10169 DAG.getTargetConstant(0, dl, MVT::i32));
10170 SDValue NewLoad = DAG.getMaskedLoad(
10171 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10172 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10173 N->getExtensionType(), N->isExpandingLoad());
10174 SDValue Combo = NewLoad;
10175 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10176 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10177 isZeroVector(PassThru->getOperand(0));
10178 if (!PassThru.isUndef() && !PassThruIsCastZero)
10179 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10180 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10181}
10182
10183static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10184 const ARMSubtarget *ST) {
10185 if (!ST->hasMVEIntegerOps())
10186 return SDValue();
10187
10188 SDLoc dl(Op);
10189 unsigned BaseOpcode = 0;
10190 switch (Op->getOpcode()) {
10191 default: llvm_unreachable("Expected VECREDUCE opcode");
10192 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10193 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10194 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10195 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10196 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10197 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10198 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10199 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10200 }
10201
10202 SDValue Op0 = Op->getOperand(0);
10203 EVT VT = Op0.getValueType();
10204 EVT EltVT = VT.getVectorElementType();
10205 unsigned NumElts = VT.getVectorNumElements();
10206 unsigned NumActiveLanes = NumElts;
10207
10208 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10209 NumActiveLanes == 2) &&
10210 "Only expected a power 2 vector size");
10211
10212 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10213 // allows us to easily extract vector elements from the lanes.
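  // For example, for a v8i16 reduction the first step uses VREV32, which swaps
  // the two 16-bit lanes within each 32-bit word, so Op0 op Rev(Op0) folds
  // lanes pairwise; the base opcode is applied generically, not just MUL.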
10214 while (NumActiveLanes > 4) {
10215 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10216 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10217 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10218 NumActiveLanes /= 2;
10219 }
10220
10221 SDValue Res;
10222 if (NumActiveLanes == 4) {
10223 // The remaining 4 elements are summed sequentially
10224 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10225 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10226 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10227 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10228 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10229 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10230 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10231 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10232 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10233 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10234 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10235 } else {
10236 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10237 DAG.getConstant(0, dl, MVT::i32));
10238 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10239 DAG.getConstant(1, dl, MVT::i32));
10240 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10241 }
10242
10243 // Result type may be wider than element type.
10244 if (EltVT != Op->getValueType(0))
10245 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10246 return Res;
10247}
10248
10249static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10250 const ARMSubtarget *ST) {
10251 if (!ST->hasMVEFloatOps())
10252 return SDValue();
10253 return LowerVecReduce(Op, DAG, ST);
10254}
10255
10256static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10257 const ARMSubtarget *ST) {
10258 if (!ST->hasNEON())
10259 return SDValue();
10260
10261 SDLoc dl(Op);
10262 SDValue Op0 = Op->getOperand(0);
10263 EVT VT = Op0.getValueType();
10264 EVT EltVT = VT.getVectorElementType();
10265
10266 unsigned PairwiseIntrinsic = 0;
10267 switch (Op->getOpcode()) {
10268 default:
10269 llvm_unreachable("Expected VECREDUCE opcode");
10270 case ISD::VECREDUCE_UMIN:
10271 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10272 break;
10273 case ISD::VECREDUCE_UMAX:
10274 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10275 break;
10276 case ISD::VECREDUCE_SMIN:
10277 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10278 break;
10279 case ISD::VECREDUCE_SMAX:
10280 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10281 break;
10282 }
10283 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10284
10285 unsigned NumElts = VT.getVectorNumElements();
10286 unsigned NumActiveLanes = NumElts;
10287
10288 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10289 NumActiveLanes == 2) &&
10290 "Only expected a power 2 vector size");
10291
10292 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10293 if (VT.is128BitVector()) {
10294 SDValue Lo, Hi;
10295 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10296 VT = Lo.getValueType();
10297 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10298 NumActiveLanes /= 2;
10299 }
10300
10301 // Use pairwise reductions until one lane remains
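  // NEON vpmin/vpmax reduce adjacent element pairs drawn from both source
  // operands, so passing Op0 twice folds neighbouring lanes and each
  // application halves the number of candidate lanes.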
10302 while (NumActiveLanes > 1) {
10303 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10304 NumActiveLanes /= 2;
10305 }
10306
10307 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10308 DAG.getConstant(0, dl, MVT::i32));
10309
10310 // Result type may be wider than element type.
10311 if (EltVT != Op.getValueType()) {
10312 unsigned Extend = 0;
10313 switch (Op->getOpcode()) {
10314 default:
10315 llvm_unreachable("Expected VECREDUCE opcode");
10316 case ISD::VECREDUCE_UMIN:
10317 case ISD::VECREDUCE_UMAX:
10318 Extend = ISD::ZERO_EXTEND;
10319 break;
10320 case ISD::VECREDUCE_SMIN:
10321 case ISD::VECREDUCE_SMAX:
10322 Extend = ISD::SIGN_EXTEND;
10323 break;
10324 }
10325 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10326 }
10327 return Res;
10328}
10329
10330static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10331 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10332 // Acquire/Release load/store is not legal for targets without a dmb or
10333 // equivalent available.
10334 return SDValue();
10335
10336 // Monotonic load/store is legal for all targets.
10337 return Op;
10338}
10339
10340static void ReplaceREADCYCLECOUNTER(SDNode *N,
10341 SmallVectorImpl<SDValue> &Results,
10342 SelectionDAG &DAG,
10343 const ARMSubtarget *Subtarget) {
10344 SDLoc DL(N);
10345 // Under Power Management extensions, the cycle-count is:
10346 // mrc p15, #0, <Rt>, c9, c13, #0
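  // This reads the 32-bit PMCCNTR cycle counter; the i64 result is formed
  // below by pairing it with a zero high word.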
10347 SDValue Ops[] = { N->getOperand(0), // Chain
10348 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10349 DAG.getTargetConstant(15, DL, MVT::i32),
10350 DAG.getTargetConstant(0, DL, MVT::i32),
10351 DAG.getTargetConstant(9, DL, MVT::i32),
10352 DAG.getTargetConstant(13, DL, MVT::i32),
10353 DAG.getTargetConstant(0, DL, MVT::i32)
10354 };
10355
10356 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10357 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10358 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10359 DAG.getConstant(0, DL, MVT::i32)));
10360 Results.push_back(Cycles32.getValue(1));
10361}
10362
10363static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10364 SDValue V1) {
10365 SDLoc dl(V0.getNode());
10366 SDValue RegClass =
10367 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10368 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10369 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10370 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10371 return SDValue(
10372 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10373}
10374
10375static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10376 SDLoc dl(V.getNode());
10377 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10378 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10379 if (isBigEndian)
10380 std::swap(VLo, VHi);
10381 return createGPRPairNode2xi32(DAG, VLo, VHi);
10382}
10383
10384static void ReplaceCMP_SWAP_64Results(SDNode *N,
10385 SmallVectorImpl<SDValue> &Results,
10386 SelectionDAG &DAG) {
10387 assert(N->getValueType(0) == MVT::i64 &&
10388 "AtomicCmpSwap on types less than 64 should be legal");
10389 SDValue Ops[] = {
10390 createGPRPairNode2xi32(DAG, N->getOperand(1),
10391 DAG.getUNDEF(MVT::i32)), // pointer, temp
10392 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10393 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10394 N->getOperand(0), // chain in
10395 };
10396 SDNode *CmpSwap = DAG.getMachineNode(
10397 ARM::CMP_SWAP_64, SDLoc(N),
10398 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10399
10400 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10401 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10402
10403 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10404
10405 SDValue Lo =
10406 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10407 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10408 SDValue Hi =
10409 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10410 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10411 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10412 Results.push_back(SDValue(CmpSwap, 2));
10413}
10414
10415SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10416 SDLoc dl(Op);
10417 EVT VT = Op.getValueType();
10418 SDValue Chain = Op.getOperand(0);
10419 SDValue LHS = Op.getOperand(1);
10420 SDValue RHS = Op.getOperand(2);
10421 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10422 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10423
10424 // If we don't have instructions of this float type then soften to a libcall
10425 // and use SETCC instead.
10426 if (isUnsupportedFloatingType(LHS.getValueType())) {
10427 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10428 Chain, IsSignaling);
10429 if (!RHS.getNode()) {
10430 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10431 CC = ISD::SETNE;
10432 }
10433 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10434 DAG.getCondCode(CC));
10435 return DAG.getMergeValues({Result, Chain}, dl);
10436 }
10437
10438 ARMCC::CondCodes CondCode, CondCode2;
10439 FPCCToARMCC(CC, CondCode, CondCode2);
10440
10441 SDValue True = DAG.getConstant(1, dl, VT);
10442 SDValue False = DAG.getConstant(0, dl, VT);
10443 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10444 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10445 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10446 if (CondCode2 != ARMCC::AL) {
10447 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10448 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10449 }
10450 return DAG.getMergeValues({Result, Chain}, dl);
10451}
10452
10453SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10454 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10455
10456 EVT VT = getPointerTy(DAG.getDataLayout());
10457 int FI = MFI.CreateFixedObject(4, 0, false);
10458 return DAG.getFrameIndex(FI, VT);
10459}
10460
10461SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10462 SelectionDAG &DAG) const {
10463 SDLoc DL(Op);
10464 MakeLibCallOptions CallOptions;
10465 MVT SVT = Op.getOperand(0).getSimpleValueType();
10466 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10467 SDValue Res =
10468 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10469 return DAG.getBitcast(MVT::i32, Res);
10470}
10471
10472SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10473 SDLoc dl(Op);
10474 SDValue LHS = Op.getOperand(0);
10475 SDValue RHS = Op.getOperand(1);
10476
10477 // Determine if this is signed or unsigned comparison
10478 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10479
10480 // Special case for Thumb1 UCMP only
10481 if (!IsSigned && Subtarget->isThumb1Only()) {
10482 // For Thumb unsigned comparison, use this sequence:
10483 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10484 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10485 // cmp r1, r0 ; compare RHS with LHS
10486 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10487 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10488
10489 // First subtraction: LHS - RHS
10490 SDValue Sub1WithFlags = DAG.getNode(
10491 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10492 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10493 SDValue Flags1 = Sub1WithFlags.getValue(1);
10494
10495 // SUBE: Sub1Result - Sub1Result - !carry
10496 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10497 SDValue Sbc1 =
10498 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10499 Sub1Result, Sub1Result, Flags1);
10500 SDValue Sbc1Result = Sbc1.getValue(0);
10501
10502 // Second comparison: RHS vs LHS (reverse comparison)
10503 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10504
10505 // SUBE: RHS - RHS - !carry
10506 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10507 SDValue Sbc2 = DAG.getNode(
10508 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10509 SDValue Sbc2Result = Sbc2.getValue(0);
10510
10511 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10512 SDValue Result =
10513 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10514 if (Op.getValueType() != MVT::i32)
10515 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10516
10517 return Result;
10518 }
10519
10520 // For the ARM assembly pattern:
10521 //   subs  r0, r0, r1   ; subtract RHS from LHS and set flags
10522 //   movgt r0, #1       ; if LHS > RHS, set result to 1  (GT for signed, HI for unsigned)
10523 //   mvnlt r0, #0       ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10524 //                      ; if LHS == RHS, result remains 0 from the subs
10526
10527 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10528 unsigned Opcode = ARMISD::SUBC;
10529
10530 // Check if RHS is a subtraction against 0: (0 - X)
10531 if (RHS.getOpcode() == ISD::SUB) {
10532 SDValue SubLHS = RHS.getOperand(0);
10533 SDValue SubRHS = RHS.getOperand(1);
10534
10535 // Check if it's 0 - X
10536 if (isNullConstant(SubLHS)) {
10537 bool CanUseAdd = false;
10538 if (IsSigned) {
10539 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10540 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10541 .getSignedMinValue()
10542 .isMinSignedValue()) {
10543 CanUseAdd = true;
10544 }
10545 } else {
10546 // For UCMP: only if X is known to never be zero
10547 if (DAG.isKnownNeverZero(SubRHS)) {
10548 CanUseAdd = true;
10549 }
10550 }
10551
10552 if (CanUseAdd) {
10553 Opcode = ARMISD::ADDC;
10554 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10555 // LHS - (0 - X)
10556 }
10557 }
10558 }
10559
10560 // Generate the operation with flags
10561 SDValue OpWithFlags =
10562 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10563
10564 SDValue OpResult = OpWithFlags.getValue(0);
10565 SDValue Flags = OpWithFlags.getValue(1);
10566
10567 // Constants for conditional moves
10568 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10569 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10570
10571 // Select condition codes based on signed vs unsigned
10572 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10573 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10574
10575 // First conditional move: if greater than, set to 1
10576 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10577 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10578 GTCondValue, Flags);
10579
10580 // Second conditional move: if less than, set to -1
10581 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10582 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10583 LTCondValue, Flags);
10584
10585 if (Op.getValueType() != MVT::i32)
10586 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10587
10588 return Result2;
10589}
10590
10591SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10592 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10593 switch (Op.getOpcode()) {
10594 default: llvm_unreachable("Don't know how to custom lower this!");
10595 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10596 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10597 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10598 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10599 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10600 case ISD::SELECT: return LowerSELECT(Op, DAG);
10601 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10602 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10603 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10604 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10605 case ISD::VASTART: return LowerVASTART(Op, DAG);
10606 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10607 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10608 case ISD::SINT_TO_FP:
10609 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10610 case ISD::STRICT_FP_TO_SINT:
10611 case ISD::STRICT_FP_TO_UINT:
10612 case ISD::FP_TO_SINT:
10613 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10614 case ISD::FP_TO_SINT_SAT:
10615 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10616 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10617 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10618 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10619 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10620 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10621 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10622 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10623 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10624 Subtarget);
10625 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10626 case ISD::SHL:
10627 case ISD::SRL:
10628 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10629 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10630 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10631 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10632 case ISD::SRL_PARTS:
10633 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10634 case ISD::CTTZ:
10635 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10636 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10637 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10638 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10639 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10640 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10641 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10642 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10643 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10644 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10645 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10646 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10647 case ISD::SIGN_EXTEND:
10648 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10649 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10650 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10651 case ISD::SET_FPMODE:
10652 return LowerSET_FPMODE(Op, DAG);
10653 case ISD::RESET_FPMODE:
10654 return LowerRESET_FPMODE(Op, DAG);
10655 case ISD::MUL: return LowerMUL(Op, DAG);
10656 case ISD::SDIV:
10657 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10658 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10659 return LowerSDIV(Op, DAG, Subtarget);
10660 case ISD::UDIV:
10661 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10662 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10663 return LowerUDIV(Op, DAG, Subtarget);
10664 case ISD::UADDO_CARRY:
10665 case ISD::USUBO_CARRY:
10666 return LowerUADDSUBO_CARRY(Op, DAG);
10667 case ISD::SADDO:
10668 case ISD::SSUBO:
10669 return LowerSignedALUO(Op, DAG);
10670 case ISD::UADDO:
10671 case ISD::USUBO:
10672 return LowerUnsignedALUO(Op, DAG);
10673 case ISD::SADDSAT:
10674 case ISD::SSUBSAT:
10675 case ISD::UADDSAT:
10676 case ISD::USUBSAT:
10677 return LowerADDSUBSAT(Op, DAG, Subtarget);
10678 case ISD::LOAD:
10679 return LowerPredicateLoad(Op, DAG);
10680 case ISD::STORE:
10681 return LowerSTORE(Op, DAG, Subtarget);
10682 case ISD::MLOAD:
10683 return LowerMLOAD(Op, DAG);
10684 case ISD::VECREDUCE_MUL:
10685 case ISD::VECREDUCE_AND:
10686 case ISD::VECREDUCE_OR:
10687 case ISD::VECREDUCE_XOR:
10688 return LowerVecReduce(Op, DAG, Subtarget);
10689 case ISD::VECREDUCE_FADD:
10690 case ISD::VECREDUCE_FMUL:
10691 case ISD::VECREDUCE_FMIN:
10692 case ISD::VECREDUCE_FMAX:
10693 return LowerVecReduceF(Op, DAG, Subtarget);
10694 case ISD::VECREDUCE_UMIN:
10695 case ISD::VECREDUCE_UMAX:
10696 case ISD::VECREDUCE_SMIN:
10697 case ISD::VECREDUCE_SMAX:
10698 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10699 case ISD::ATOMIC_LOAD:
10700 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10701 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10702 case ISD::SDIVREM:
10703 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10704 case ISD::DYNAMIC_STACKALLOC:
10705 if (Subtarget->isTargetWindows())
10706 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10707 llvm_unreachable("Don't know how to custom lower this!");
10708 case ISD::STRICT_FP_ROUND:
10709 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10710 case ISD::STRICT_FP_EXTEND:
10711 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10712 case ISD::STRICT_FSETCC:
10713 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10714 case ISD::SPONENTRY:
10715 return LowerSPONENTRY(Op, DAG);
10716 case ISD::FP_TO_BF16:
10717 return LowerFP_TO_BF16(Op, DAG);
10718 case ARMISD::WIN__DBZCHK: return SDValue();
10719 case ISD::UCMP:
10720 case ISD::SCMP:
10721 return LowerCMP(Op, DAG);
10722 case ISD::ABS:
10723 return LowerABS(Op, DAG);
10724 }
10725}
10726
10727static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10728 SelectionDAG &DAG) {
10729 unsigned IntNo = N->getConstantOperandVal(0);
10730 unsigned Opc = 0;
10731 if (IntNo == Intrinsic::arm_smlald)
10732 Opc = ARMISD::SMLALD;
10733 else if (IntNo == Intrinsic::arm_smlaldx)
10734 Opc = ARMISD::SMLALDX;
10735 else if (IntNo == Intrinsic::arm_smlsld)
10736 Opc = ARMISD::SMLSLD;
10737 else if (IntNo == Intrinsic::arm_smlsldx)
10738 Opc = ARMISD::SMLSLDX;
10739 else
10740 return;
10741
10742 SDLoc dl(N);
10743 SDValue Lo, Hi;
10744 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10745
10746 SDValue LongMul = DAG.getNode(Opc, dl,
10747 DAG.getVTList(MVT::i32, MVT::i32),
10748 N->getOperand(1), N->getOperand(2),
10749 Lo, Hi);
10750 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10751 LongMul.getValue(0), LongMul.getValue(1)));
10752}
10753
10754/// ReplaceNodeResults - Replace the results of node with an illegal result
10755/// type with new values built out of custom code.
10756void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10757 SmallVectorImpl<SDValue> &Results,
10758 SelectionDAG &DAG) const {
10759 SDValue Res;
10760 switch (N->getOpcode()) {
10761 default:
10762 llvm_unreachable("Don't know how to custom expand this!");
10763 case ISD::READ_REGISTER:
10764 ExpandREAD_REGISTER(N, Results, DAG);
10765 break;
10766 case ISD::BITCAST:
10767 Res = ExpandBITCAST(N, DAG, Subtarget);
10768 break;
10769 case ISD::SRL:
10770 case ISD::SRA:
10771 case ISD::SHL:
10772 Res = Expand64BitShift(N, DAG, Subtarget);
10773 break;
10774 case ISD::SREM:
10775 case ISD::UREM:
10776 Res = LowerREM(N, DAG);
10777 break;
10778 case ISD::SDIVREM:
10779 case ISD::UDIVREM:
10780 Res = LowerDivRem(SDValue(N, 0), DAG);
10781 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10782 Results.push_back(Res.getValue(0));
10783 Results.push_back(Res.getValue(1));
10784 return;
10785 case ISD::SADDSAT:
10786 case ISD::SSUBSAT:
10787 case ISD::UADDSAT:
10788 case ISD::USUBSAT:
10789 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10790 break;
10791 case ISD::READCYCLECOUNTER:
10792 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10793 return;
10794 case ISD::UDIV:
10795 case ISD::SDIV:
10796 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10797 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10798 Results);
10799 case ISD::ATOMIC_CMP_SWAP:
10800 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10801 return;
10802 case ISD::INTRINSIC_WO_CHAIN:
10803 return ReplaceLongIntrinsic(N, Results, DAG);
10804 case ISD::LOAD:
10805 LowerLOAD(N, Results, DAG);
10806 break;
10807 case ISD::TRUNCATE:
10808 Res = LowerTruncate(N, DAG, Subtarget);
10809 break;
10810 case ISD::SIGN_EXTEND:
10811 case ISD::ZERO_EXTEND:
10812 Res = LowerVectorExtend(N, DAG, Subtarget);
10813 break;
10814 case ISD::FP_TO_SINT_SAT:
10815 case ISD::FP_TO_UINT_SAT:
10816 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10817 break;
10818 }
10819 if (Res.getNode())
10820 Results.push_back(Res);
10821}
10822
10823//===----------------------------------------------------------------------===//
10824// ARM Scheduler Hooks
10825//===----------------------------------------------------------------------===//
10826
10827/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10828/// registers the function context.
10829void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10830 MachineBasicBlock *MBB,
10831 MachineBasicBlock *DispatchBB,
10832 int FI) const {
10833 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10834 "ROPI/RWPI not currently supported with SjLj");
10835 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10836 DebugLoc dl = MI.getDebugLoc();
10837 MachineFunction *MF = MBB->getParent();
10838 MachineRegisterInfo *MRI = &MF->getRegInfo();
10839 MachineConstantPool *MCP = MF->getConstantPool();
10840 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10841 const Function &F = MF->getFunction();
10842
10843 bool isThumb = Subtarget->isThumb();
10844 bool isThumb2 = Subtarget->isThumb2();
10845
10846 unsigned PCLabelId = AFI->createPICLabelUId();
10847 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10848 ARMConstantPoolValue *CPV =
10849 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10850 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10851
10852 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10853 : &ARM::GPRRegClass;
10854
10855 // Grab constant pool and fixed stack memory operands.
10856 MachineMemOperand *CPMMO =
10857 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10858 MachineMemOperand::MOLoad, 4, Align(4));
10859
10860 MachineMemOperand *FIMMOSt =
10861 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10862 MachineMemOperand::MOStore, 4, Align(4));
10863
10864 // Load the address of the dispatch MBB into the jump buffer.
10865 if (isThumb2) {
10866 // Incoming value: jbuf
10867 // ldr.n r5, LCPI1_1
10868 // orr r5, r5, #1
10869 // add r5, pc
10870 // str r5, [$jbuf, #+4] ; &jbuf[1]
10871 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10872 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10873 .addConstantPoolIndex(CPI)
10874 .addMemOperand(CPMMO)
10875 .add(predOps(ARMCC::AL));
10876 // Set the low bit because of thumb mode.
10877 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10878 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10879 .addReg(NewVReg1, RegState::Kill)
10880 .addImm(0x01)
10881 .add(predOps(ARMCC::AL))
10882 .add(condCodeOp());
10883 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10884 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10885 .addReg(NewVReg2, RegState::Kill)
10886 .addImm(PCLabelId);
10887 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10888 .addReg(NewVReg3, RegState::Kill)
10889 .addFrameIndex(FI)
10890 .addImm(36) // &jbuf[1] :: pc
10891 .addMemOperand(FIMMOSt)
10892 .add(predOps(ARMCC::AL));
10893 } else if (isThumb) {
10894 // Incoming value: jbuf
10895 // ldr.n r1, LCPI1_4
10896 // add r1, pc
10897 // mov r2, #1
10898 // orrs r1, r2
10899 // add r2, $jbuf, #+4 ; &jbuf[1]
10900 // str r1, [r2]
10901 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10902 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10903 .addConstantPoolIndex(CPI)
10904 .addMemOperand(CPMMO)
10905 .add(predOps(ARMCC::AL));
10906 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10907 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10908 .addReg(NewVReg1, RegState::Kill)
10909 .addImm(PCLabelId);
10910 // Set the low bit because of thumb mode.
10911 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10912 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10913 .addReg(ARM::CPSR, RegState::Define)
10914 .addImm(1)
10915 .add(predOps(ARMCC::AL));
10916 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10917 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10918 .addReg(ARM::CPSR, RegState::Define)
10919 .addReg(NewVReg2, RegState::Kill)
10920 .addReg(NewVReg3, RegState::Kill)
10921 .add(predOps(ARMCC::AL));
10922 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10923 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10924 .addFrameIndex(FI)
10925 .addImm(36); // &jbuf[1] :: pc
10926 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10927 .addReg(NewVReg4, RegState::Kill)
10928 .addReg(NewVReg5, RegState::Kill)
10929 .addImm(0)
10930 .addMemOperand(FIMMOSt)
10931 .add(predOps(ARMCC::AL));
10932 } else {
10933 // Incoming value: jbuf
10934 // ldr r1, LCPI1_1
10935 // add r1, pc, r1
10936 // str r1, [$jbuf, #+4] ; &jbuf[1]
10937 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10938 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10939 .addConstantPoolIndex(CPI)
10940 .addImm(0)
10941 .addMemOperand(CPMMO)
10942 .add(predOps(ARMCC::AL));
10943 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10944 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10945 .addReg(NewVReg1, RegState::Kill)
10946 .addImm(PCLabelId)
10947 .add(predOps(ARMCC::AL));
10948 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10949 .addReg(NewVReg2, RegState::Kill)
10950 .addFrameIndex(FI)
10951 .addImm(36) // &jbuf[1] :: pc
10952 .addMemOperand(FIMMOSt)
10953 .add(predOps(ARMCC::AL));
10954 }
10955}
10956
10957void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10958 MachineBasicBlock *MBB) const {
10959 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10960 DebugLoc dl = MI.getDebugLoc();
10961 MachineFunction *MF = MBB->getParent();
10962 MachineRegisterInfo *MRI = &MF->getRegInfo();
10963 MachineFrameInfo &MFI = MF->getFrameInfo();
10964 int FI = MFI.getFunctionContextIndex();
10965
10966 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10967 : &ARM::GPRnopcRegClass;
10968
10969 // Get a mapping of the call site numbers to all of the landing pads they're
10970 // associated with.
10971 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10972 unsigned MaxCSNum = 0;
10973 for (MachineBasicBlock &BB : *MF) {
10974 if (!BB.isEHPad())
10975 continue;
10976
10977 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10978 // pad.
10979 for (MachineInstr &II : BB) {
10980 if (!II.isEHLabel())
10981 continue;
10982
10983 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10984 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10985
10986 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10987 for (unsigned Idx : CallSiteIdxs) {
10988 CallSiteNumToLPad[Idx].push_back(&BB);
10989 MaxCSNum = std::max(MaxCSNum, Idx);
10990 }
10991 break;
10992 }
10993 }
10994
10995 // Get an ordered list of the machine basic blocks for the jump table.
10996 std::vector<MachineBasicBlock*> LPadList;
10997 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10998 LPadList.reserve(CallSiteNumToLPad.size());
10999 for (unsigned I = 1; I <= MaxCSNum; ++I) {
11000 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
11001 for (MachineBasicBlock *MBB : MBBList) {
11002 LPadList.push_back(MBB);
11003 InvokeBBs.insert_range(MBB->predecessors());
11004 }
11005 }
11006
11007 assert(!LPadList.empty() &&
11008 "No landing pad destinations for the dispatch jump table!");
11009
11010 // Create the jump table and associated information.
11011 MachineJumpTableInfo *JTI =
11012 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
11013 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
11014
11015 // Create the MBBs for the dispatch code.
11016
11017 // Shove the dispatch's address into the return slot in the function context.
11018 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
11019 DispatchBB->setIsEHPad();
11020
11021 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11022
11023 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
11024 DispatchBB->addSuccessor(TrapBB);
11025
11026 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11027 DispatchBB->addSuccessor(DispContBB);
11028
11029 // Insert the MBBs.
11030 MF->insert(MF->end(), DispatchBB);
11031 MF->insert(MF->end(), DispContBB);
11032 MF->insert(MF->end(), TrapBB);
11033
11034 // Insert code into the entry block that creates and registers the function
11035 // context.
11036 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11037
11038 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11041
11042 MachineInstrBuilder MIB;
11043 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11044
11045 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11046 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11047
11048 // Add a register mask with no preserved registers. This results in all
11049 // registers being marked as clobbered. This can't work if the dispatch block
11050 // is in a Thumb1 function and is linked with ARM code which uses the FP
11051 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11053
11054 bool IsPositionIndependent = isPositionIndependent();
11055 unsigned NumLPads = LPadList.size();
11056 if (Subtarget->isThumb2()) {
11057 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11058 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11059 .addFrameIndex(FI)
11060 .addImm(4)
11061 .addMemOperand(FIMMOLd)
11063
11064 if (NumLPads < 256) {
11065 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11066 .addReg(NewVReg1)
11067 .addImm(LPadList.size())
11069 } else {
11070 Register VReg1 = MRI->createVirtualRegister(TRC);
11071 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11072 .addImm(NumLPads & 0xFFFF)
11074
11075 unsigned VReg2 = VReg1;
11076 if ((NumLPads & 0xFFFF0000) != 0) {
11077 VReg2 = MRI->createVirtualRegister(TRC);
11078 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11079 .addReg(VReg1)
11080 .addImm(NumLPads >> 16)
11082 }
11083
11084 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11085 .addReg(NewVReg1)
11086 .addReg(VReg2)
11088 }
11089
11090 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11091 .addMBB(TrapBB)
11093 .addReg(ARM::CPSR);
11094
11095 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11096 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11097 .addJumpTableIndex(MJTI)
11099
11100 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11101 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11102 .addReg(NewVReg3, RegState::Kill)
11103 .addReg(NewVReg1)
11106 .add(condCodeOp());
11107
11108 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11109 .addReg(NewVReg4, RegState::Kill)
11110 .addReg(NewVReg1)
11111 .addJumpTableIndex(MJTI);
11112 } else if (Subtarget->isThumb()) {
11113 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11114 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11115 .addFrameIndex(FI)
11116 .addImm(1)
11117 .addMemOperand(FIMMOLd)
11119
11120 if (NumLPads < 256) {
11121 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11122 .addReg(NewVReg1)
11123 .addImm(NumLPads)
11125 } else {
11126 MachineConstantPool *ConstantPool = MF->getConstantPool();
11127 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11128 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11129
11130 // MachineConstantPool wants an explicit alignment.
11131 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11132 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11133
11134 Register VReg1 = MRI->createVirtualRegister(TRC);
11135 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11136 .addReg(VReg1, RegState::Define)
11139 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11140 .addReg(NewVReg1)
11141 .addReg(VReg1)
11143 }
11144
11145 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11146 .addMBB(TrapBB)
11148 .addReg(ARM::CPSR);
11149
11150 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11151 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11152 .addReg(ARM::CPSR, RegState::Define)
11153 .addReg(NewVReg1)
11154 .addImm(2)
11156
11157 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11158 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11159 .addJumpTableIndex(MJTI)
11161
11162 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11163 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11164 .addReg(ARM::CPSR, RegState::Define)
11165 .addReg(NewVReg2, RegState::Kill)
11166 .addReg(NewVReg3)
11168
11169 MachineMemOperand *JTMMOLd =
11170 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11172
11173 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11174 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11175 .addReg(NewVReg4, RegState::Kill)
11176 .addImm(0)
11177 .addMemOperand(JTMMOLd)
11179
11180 unsigned NewVReg6 = NewVReg5;
11181 if (IsPositionIndependent) {
11182 NewVReg6 = MRI->createVirtualRegister(TRC);
11183 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11184 .addReg(ARM::CPSR, RegState::Define)
11185 .addReg(NewVReg5, RegState::Kill)
11186 .addReg(NewVReg3)
11188 }
11189
11190 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11191 .addReg(NewVReg6, RegState::Kill)
11192 .addJumpTableIndex(MJTI);
11193 } else {
11194 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11195 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11196 .addFrameIndex(FI)
11197 .addImm(4)
11198 .addMemOperand(FIMMOLd)
11200
11201 if (NumLPads < 256) {
11202 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11203 .addReg(NewVReg1)
11204 .addImm(NumLPads)
11206 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11207 Register VReg1 = MRI->createVirtualRegister(TRC);
11208 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11209 .addImm(NumLPads & 0xFFFF)
11211
11212 unsigned VReg2 = VReg1;
11213 if ((NumLPads & 0xFFFF0000) != 0) {
11214 VReg2 = MRI->createVirtualRegister(TRC);
11215 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11216 .addReg(VReg1)
11217 .addImm(NumLPads >> 16)
11219 }
11220
11221 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11222 .addReg(NewVReg1)
11223 .addReg(VReg2)
11225 } else {
11226 MachineConstantPool *ConstantPool = MF->getConstantPool();
11227 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11228 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11229
11230 // MachineConstantPool wants an explicit alignment.
11231 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11232 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11233
11234 Register VReg1 = MRI->createVirtualRegister(TRC);
11235 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11236 .addReg(VReg1, RegState::Define)
11238 .addImm(0)
11240 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11241 .addReg(NewVReg1)
11242 .addReg(VReg1, RegState::Kill)
11244 }
11245
11246 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11247 .addMBB(TrapBB)
11249 .addReg(ARM::CPSR);
11250
11251 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11252 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11253 .addReg(NewVReg1)
11256 .add(condCodeOp());
11257 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11258 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11259 .addJumpTableIndex(MJTI)
11261
11262 MachineMemOperand *JTMMOLd =
11263 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11265 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11266 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11267 .addReg(NewVReg3, RegState::Kill)
11268 .addReg(NewVReg4)
11269 .addImm(0)
11270 .addMemOperand(JTMMOLd)
11272
11273 if (IsPositionIndependent) {
11274 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11275 .addReg(NewVReg5, RegState::Kill)
11276 .addReg(NewVReg4)
11277 .addJumpTableIndex(MJTI);
11278 } else {
11279 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11280 .addReg(NewVReg5, RegState::Kill)
11281 .addJumpTableIndex(MJTI);
11282 }
11283 }
11284
11285 // Add the jump table entries as successors to the MBB.
11286 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11287 for (MachineBasicBlock *CurMBB : LPadList) {
11288 if (SeenMBBs.insert(CurMBB).second)
11289 DispContBB->addSuccessor(CurMBB);
11290 }
11291
11292 // N.B. the order the invoke BBs are processed in doesn't matter here.
11293 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11295 for (MachineBasicBlock *BB : InvokeBBs) {
11296
11297 // Remove the landing pad successor from the invoke block and replace it
11298 // with the new dispatch block.
11299 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11300 while (!Successors.empty()) {
11301 MachineBasicBlock *SMBB = Successors.pop_back_val();
11302 if (SMBB->isEHPad()) {
11303 BB->removeSuccessor(SMBB);
11304 MBBLPads.push_back(SMBB);
11305 }
11306 }
11307
11308 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11309 BB->normalizeSuccProbs();
11310
11311 // Find the invoke call and mark all of the callee-saved registers as
11312 // 'implicit defined' so that they're spilled. This prevents instructions
11313 // from being moved to before the EH block, where they would never be
11314 // executed.
11316 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11317 if (!II->isCall()) continue;
11318
11319 DenseSet<unsigned> DefRegs;
11321 OI = II->operands_begin(), OE = II->operands_end();
11322 OI != OE; ++OI) {
11323 if (!OI->isReg()) continue;
11324 DefRegs.insert(OI->getReg());
11325 }
11326
11327 MachineInstrBuilder MIB(*MF, &*II);
11328
11329 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11330 unsigned Reg = SavedRegs[i];
11331 if (Subtarget->isThumb2() &&
11332 !ARM::tGPRRegClass.contains(Reg) &&
11333 !ARM::hGPRRegClass.contains(Reg))
11334 continue;
11335 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11336 continue;
11337 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11338 continue;
11339 if (!DefRegs.contains(Reg))
11341 }
11342
11343 break;
11344 }
11345 }
11346
11347 // Mark all former landing pads as non-landing pads. The dispatch is the only
11348 // landing pad now.
11349 for (MachineBasicBlock *MBBLPad : MBBLPads)
11350 MBBLPad->setIsEHPad(false);
11351
11352 // The instruction is gone now.
11353 MI.eraseFromParent();
11354}
11355
11356 static
11357 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11358 for (MachineBasicBlock *S : MBB->successors())
11359 if (S != Succ)
11360 return S;
11361 llvm_unreachable("Expecting a BB with two successors!");
11362}
11363
11364 /// Return the load opcode for a given load size. If the load size is >= 8, a
11365 /// NEON opcode will be returned.
11366static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11367 if (LdSize >= 8)
11368 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11369 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11370 if (IsThumb1)
11371 return LdSize == 4 ? ARM::tLDRi
11372 : LdSize == 2 ? ARM::tLDRHi
11373 : LdSize == 1 ? ARM::tLDRBi : 0;
11374 if (IsThumb2)
11375 return LdSize == 4 ? ARM::t2LDR_POST
11376 : LdSize == 2 ? ARM::t2LDRH_POST
11377 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11378 return LdSize == 4 ? ARM::LDR_POST_IMM
11379 : LdSize == 2 ? ARM::LDRH_POST
11380 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11381}
11382
11383 /// Return the store opcode for a given store size. If the store size is >= 8, a
11384 /// NEON opcode will be returned.
11385static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11386 if (StSize >= 8)
11387 return StSize == 16 ? ARM::VST1q32wb_fixed
11388 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11389 if (IsThumb1)
11390 return StSize == 4 ? ARM::tSTRi
11391 : StSize == 2 ? ARM::tSTRHi
11392 : StSize == 1 ? ARM::tSTRBi : 0;
11393 if (IsThumb2)
11394 return StSize == 4 ? ARM::t2STR_POST
11395 : StSize == 2 ? ARM::t2STRH_POST
11396 : StSize == 1 ? ARM::t2STRB_POST : 0;
11397 return StSize == 4 ? ARM::STR_POST_IMM
11398 : StSize == 2 ? ARM::STRH_POST
11399 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11400}
11401
11402 /// Emit a post-increment load operation with the given size. The instructions
11403 /// will be added to BB at Pos.
11404 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11405 const TargetInstrInfo *TII, const DebugLoc &dl,
11406 unsigned LdSize, unsigned Data, unsigned AddrIn,
11407 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11408 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11409 assert(LdOpc != 0 && "Should have a load opcode");
11410 if (LdSize >= 8) {
11411 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11412 .addReg(AddrOut, RegState::Define)
11413 .addReg(AddrIn)
11414 .addImm(0)
11416 } else if (IsThumb1) {
11417 // load + update AddrIn
11418 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11419 .addReg(AddrIn)
11420 .addImm(0)
11422 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11423 .add(t1CondCodeOp())
11424 .addReg(AddrIn)
11425 .addImm(LdSize)
11427 } else if (IsThumb2) {
11428 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11429 .addReg(AddrOut, RegState::Define)
11430 .addReg(AddrIn)
11431 .addImm(LdSize)
11433 } else { // arm
11434 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11435 .addReg(AddrOut, RegState::Define)
11436 .addReg(AddrIn)
11437 .addReg(0)
11438 .addImm(LdSize)
11440 }
11441}
11442
11443 /// Emit a post-increment store operation with the given size. The instructions
11444 /// will be added to BB at Pos.
11445 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11446 const TargetInstrInfo *TII, const DebugLoc &dl,
11447 unsigned StSize, unsigned Data, unsigned AddrIn,
11448 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11449 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11450 assert(StOpc != 0 && "Should have a store opcode");
11451 if (StSize >= 8) {
11452 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11453 .addReg(AddrIn)
11454 .addImm(0)
11455 .addReg(Data)
11457 } else if (IsThumb1) {
11458 // store + update AddrIn
11459 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11460 .addReg(Data)
11461 .addReg(AddrIn)
11462 .addImm(0)
11464 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11465 .add(t1CondCodeOp())
11466 .addReg(AddrIn)
11467 .addImm(StSize)
11469 } else if (IsThumb2) {
11470 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11471 .addReg(Data)
11472 .addReg(AddrIn)
11473 .addImm(StSize)
11475 } else { // arm
11476 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11477 .addReg(Data)
11478 .addReg(AddrIn)
11479 .addReg(0)
11480 .addImm(StSize)
11482 }
11483}
11484
11485 MachineBasicBlock *
11486 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11487 MachineBasicBlock *BB) const {
11488 // This pseudo instruction has 3 operands: dst, src, size
11489 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11490 // Otherwise, we will generate unrolled scalar copies.
11491 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11492 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11494
11495 Register dest = MI.getOperand(0).getReg();
11496 Register src = MI.getOperand(1).getReg();
11497 unsigned SizeVal = MI.getOperand(2).getImm();
11498 unsigned Alignment = MI.getOperand(3).getImm();
11499 DebugLoc dl = MI.getDebugLoc();
11500
11501 MachineFunction *MF = BB->getParent();
11502 MachineRegisterInfo &MRI = MF->getRegInfo();
11503 unsigned UnitSize = 0;
11504 const TargetRegisterClass *TRC = nullptr;
11505 const TargetRegisterClass *VecTRC = nullptr;
11506
11507 bool IsThumb1 = Subtarget->isThumb1Only();
11508 bool IsThumb2 = Subtarget->isThumb2();
11509 bool IsThumb = Subtarget->isThumb();
11510
11511 if (Alignment & 1) {
11512 UnitSize = 1;
11513 } else if (Alignment & 2) {
11514 UnitSize = 2;
11515 } else {
11516 // Check whether we can use NEON instructions.
11517 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11518 Subtarget->hasNEON()) {
11519 if ((Alignment % 16 == 0) && SizeVal >= 16)
11520 UnitSize = 16;
11521 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11522 UnitSize = 8;
11523 }
11524 // Can't use NEON instructions.
11525 if (UnitSize == 0)
11526 UnitSize = 4;
11527 }
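// For example, a copy with only 4-byte alignment always ends up with
// UnitSize = 4, while a 16-byte-aligned copy of at least 16 bytes can use the
// 128-bit VLD1/VST1 path (UnitSize = 16), assuming NEON is usable here.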
11528
11529 // Select the correct opcode and register class for unit size load/store
11530 bool IsNeon = UnitSize >= 8;
11531 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11532 if (IsNeon)
11533 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11534 : UnitSize == 8 ? &ARM::DPRRegClass
11535 : nullptr;
11536
11537 unsigned BytesLeft = SizeVal % UnitSize;
11538 unsigned LoopSize = SizeVal - BytesLeft;
11539
11540 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11541 // Use LDR and STR to copy.
11542 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11543 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11544 unsigned srcIn = src;
11545 unsigned destIn = dest;
11546 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11547 Register srcOut = MRI.createVirtualRegister(TRC);
11548 Register destOut = MRI.createVirtualRegister(TRC);
11549 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11550 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11551 IsThumb1, IsThumb2);
11552 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11553 IsThumb1, IsThumb2);
11554 srcIn = srcOut;
11555 destIn = destOut;
11556 }
11557
11558 // Handle the leftover bytes with LDRB and STRB.
11559 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11560 // [destOut] = STRB_POST(scratch, destIn, 1)
11561 for (unsigned i = 0; i < BytesLeft; i++) {
11562 Register srcOut = MRI.createVirtualRegister(TRC);
11563 Register destOut = MRI.createVirtualRegister(TRC);
11564 Register scratch = MRI.createVirtualRegister(TRC);
11565 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11566 IsThumb1, IsThumb2);
11567 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11568 IsThumb1, IsThumb2);
11569 srcIn = srcOut;
11570 destIn = destOut;
11571 }
11572 MI.eraseFromParent(); // The instruction is gone now.
11573 return BB;
11574 }
11575
11576 // Expand the pseudo op to a loop.
11577 // thisMBB:
11578 // ...
11579 // movw varEnd, # --> with thumb2
11580 // movt varEnd, #
11581 // ldrcp varEnd, idx --> without thumb2
11582 // fallthrough --> loopMBB
11583 // loopMBB:
11584 // PHI varPhi, varEnd, varLoop
11585 // PHI srcPhi, src, srcLoop
11586 // PHI destPhi, dst, destLoop
11587 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11588 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11589 // subs varLoop, varPhi, #UnitSize
11590 // bne loopMBB
11591 // fallthrough --> exitMBB
11592 // exitMBB:
11593 // epilogue to handle left-over bytes
11594 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11595 // [destOut] = STRB_POST(scratch, destLoop, 1)
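// For instance, copying 70 bytes with UnitSize = 16 gives LoopSize = 64 and
// BytesLeft = 6: the loop runs four times and the epilogue then copies the
// remaining six bytes with byte-sized LDRB/STRB pairs.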
11596 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11597 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11598 MF->insert(It, loopMBB);
11599 MF->insert(It, exitMBB);
11600
11601 // Set the call frame size on entry to the new basic blocks.
11602 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11603 loopMBB->setCallFrameSize(CallFrameSize);
11604 exitMBB->setCallFrameSize(CallFrameSize);
11605
11606 // Transfer the remainder of BB and its successor edges to exitMBB.
11607 exitMBB->splice(exitMBB->begin(), BB,
11608 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11610
11611 // Load an immediate to varEnd.
11612 Register varEnd = MRI.createVirtualRegister(TRC);
11613 if (Subtarget->useMovt()) {
11614 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11615 varEnd)
11616 .addImm(LoopSize);
11617 } else if (Subtarget->genExecuteOnly()) {
11618 assert(IsThumb && "Non-thumb expected to have used movt");
11619 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11620 } else {
11621 MachineConstantPool *ConstantPool = MF->getConstantPool();
11623 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11624
11625 // MachineConstantPool wants an explicit alignment.
11626 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11627 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11628 MachineMemOperand *CPMMO =
11631
11632 if (IsThumb)
11633 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11634 .addReg(varEnd, RegState::Define)
11637 .addMemOperand(CPMMO);
11638 else
11639 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11640 .addReg(varEnd, RegState::Define)
11642 .addImm(0)
11644 .addMemOperand(CPMMO);
11645 }
11646 BB->addSuccessor(loopMBB);
11647
11648 // Generate the loop body:
11649 // varPhi = PHI(varLoop, varEnd)
11650 // srcPhi = PHI(srcLoop, src)
11651 // destPhi = PHI(destLoop, dst)
11652 MachineBasicBlock *entryBB = BB;
11653 BB = loopMBB;
11654 Register varLoop = MRI.createVirtualRegister(TRC);
11655 Register varPhi = MRI.createVirtualRegister(TRC);
11656 Register srcLoop = MRI.createVirtualRegister(TRC);
11657 Register srcPhi = MRI.createVirtualRegister(TRC);
11658 Register destLoop = MRI.createVirtualRegister(TRC);
11659 Register destPhi = MRI.createVirtualRegister(TRC);
11660
11661 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11662 .addReg(varLoop).addMBB(loopMBB)
11663 .addReg(varEnd).addMBB(entryBB);
11664 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11665 .addReg(srcLoop).addMBB(loopMBB)
11666 .addReg(src).addMBB(entryBB);
11667 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11668 .addReg(destLoop).addMBB(loopMBB)
11669 .addReg(dest).addMBB(entryBB);
11670
11671 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11672 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11673 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11674 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11675 IsThumb1, IsThumb2);
11676 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11677 IsThumb1, IsThumb2);
11678
11679 // Decrement loop variable by UnitSize.
11680 if (IsThumb1) {
11681 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11682 .add(t1CondCodeOp())
11683 .addReg(varPhi)
11684 .addImm(UnitSize)
11686 } else {
11687 MachineInstrBuilder MIB =
11688 BuildMI(*BB, BB->end(), dl,
11689 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11690 MIB.addReg(varPhi)
11691 .addImm(UnitSize)
11693 .add(condCodeOp());
11694 MIB->getOperand(5).setReg(ARM::CPSR);
11695 MIB->getOperand(5).setIsDef(true);
11696 }
11697 BuildMI(*BB, BB->end(), dl,
11698 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11699 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11700
11701 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11702 BB->addSuccessor(loopMBB);
11703 BB->addSuccessor(exitMBB);
11704
11705 // Add epilogue to handle BytesLeft.
11706 BB = exitMBB;
11707 auto StartOfExit = exitMBB->begin();
11708
11709 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11710 // [destOut] = STRB_POST(scratch, destLoop, 1)
11711 unsigned srcIn = srcLoop;
11712 unsigned destIn = destLoop;
11713 for (unsigned i = 0; i < BytesLeft; i++) {
11714 Register srcOut = MRI.createVirtualRegister(TRC);
11715 Register destOut = MRI.createVirtualRegister(TRC);
11716 Register scratch = MRI.createVirtualRegister(TRC);
11717 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11718 IsThumb1, IsThumb2);
11719 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11720 IsThumb1, IsThumb2);
11721 srcIn = srcOut;
11722 destIn = destOut;
11723 }
11724
11725 MI.eraseFromParent(); // The instruction is gone now.
11726 return BB;
11727}
11728
11729 MachineBasicBlock *
11730 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11731 MachineBasicBlock *MBB) const {
11732 const TargetMachine &TM = getTargetMachine();
11733 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11734 DebugLoc DL = MI.getDebugLoc();
11735
11736 assert(Subtarget->isTargetWindows() &&
11737 "__chkstk is only supported on Windows");
11738 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11739
11740 // __chkstk takes the number of words to allocate on the stack in R4, and
11741 // returns the stack adjustment in number of bytes in R4. This will not
11742 // clobber any other registers (other than the obvious lr).
11743 //
11744 // Although, technically, IP should be considered a register which may be
11745 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11746 // thumb-2 environment, so there is no interworking required. As a result, we
11747 // do not expect a veneer to be emitted by the linker, clobbering IP.
11748 //
11749 // Each module receives its own copy of __chkstk, so no import thunk is
11750 // required, again, ensuring that IP is not clobbered.
11751 //
11752 // Finally, although some linkers may theoretically provide a trampoline for
11753 // out of range calls (which is quite common due to a 32M range limitation of
11754 // branches for Thumb), we can generate the long-call version via
11755 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11756 // IP.
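// For example, allocating 4096 bytes passes 1024 (the word count) in R4;
// __chkstk returns 4096 in R4, which the t2SUBrr emitted below subtracts
// from SP.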
11757
11758 switch (TM.getCodeModel()) {
11759 case CodeModel::Tiny:
11760 llvm_unreachable("Tiny code model not available on ARM.");
11761 case CodeModel::Small:
11762 case CodeModel::Medium:
11763 case CodeModel::Kernel:
11764 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11766 .addExternalSymbol("__chkstk")
11769 .addReg(ARM::R12,
11771 .addReg(ARM::CPSR,
11773 break;
11774 case CodeModel::Large: {
11775 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11776 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11777
11778 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11779 .addExternalSymbol("__chkstk");
11785 .addReg(ARM::R12,
11787 .addReg(ARM::CPSR,
11789 break;
11790 }
11791 }
11792
11793 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11794 .addReg(ARM::SP, RegState::Kill)
11795 .addReg(ARM::R4, RegState::Kill)
11798 .add(condCodeOp());
11799
11800 MI.eraseFromParent();
11801 return MBB;
11802}
11803
11804 MachineBasicBlock *
11805 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11806 MachineBasicBlock *MBB) const {
11807 DebugLoc DL = MI.getDebugLoc();
11808 MachineFunction *MF = MBB->getParent();
11809 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11810
11811 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11812 MF->insert(++MBB->getIterator(), ContBB);
11813 ContBB->splice(ContBB->begin(), MBB,
11814 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11816 MBB->addSuccessor(ContBB);
11817
11818 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11819 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11820 MF->push_back(TrapBB);
11821 MBB->addSuccessor(TrapBB);
11822
11823 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11824 .addReg(MI.getOperand(0).getReg())
11825 .addImm(0)
11827 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11828 .addMBB(TrapBB)
11830 .addReg(ARM::CPSR);
11831
11832 MI.eraseFromParent();
11833 return ContBB;
11834}
11835
11836// The CPSR operand of SelectItr might be missing a kill marker
11837// because there were multiple uses of CPSR, and ISel didn't know
11838// which to mark. Figure out whether SelectItr should have had a
11839// kill marker, and set it if it should. Returns the correct kill
11840// marker value.
11841 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11842 MachineBasicBlock *BB,
11843 const TargetRegisterInfo* TRI) {
11844 // Scan forward through BB for a use/def of CPSR.
11845 MachineBasicBlock::iterator miI(std::next(SelectItr));
11846 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11847 const MachineInstr& mi = *miI;
11848 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11849 return false;
11850 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11851 break; // Should have kill-flag - update below.
11852 }
11853
11854 // If we hit the end of the block, check whether CPSR is live into a
11855 // successor.
11856 if (miI == BB->end()) {
11857 for (MachineBasicBlock *Succ : BB->successors())
11858 if (Succ->isLiveIn(ARM::CPSR))
11859 return false;
11860 }
11861
11862 // We found a def, or hit the end of the basic block and CPSR wasn't live
11863 // out. SelectMI should have a kill flag on CPSR.
11864 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11865 return true;
11866}
11867
11868 /// Adds logic in the loop entry MBB to calculate the loop iteration count and
11869 /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11870 static Register genTPEntry(MachineBasicBlock *TpEntry,
11871 MachineBasicBlock *TpLoopBody,
11872 MachineBasicBlock *TpExit, Register OpSizeReg,
11873 const TargetInstrInfo *TII, DebugLoc Dl,
11874 MachineRegisterInfo &MRI) {
11875 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
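// e.g. an element count of 100 gives (100 + 15) >> 4 = 7 iterations; the
// final iteration is predicated down to the remaining 4 elements by the VCTP
// generated in the loop body.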
11876 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11877 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11878 .addUse(OpSizeReg)
11879 .addImm(15)
11881 .addReg(0);
11882
11883 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11884 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11885 .addUse(AddDestReg, RegState::Kill)
11886 .addImm(4)
11888 .addReg(0);
11889
11890 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11891 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11892 .addUse(LsrDestReg, RegState::Kill);
11893
11894 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11895 .addUse(TotalIterationsReg)
11896 .addMBB(TpExit);
11897
11898 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11899 .addMBB(TpLoopBody)
11901
11902 return TotalIterationsReg;
11903}
11904
11905 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
11906 /// t2LoopEnd. These are used by later passes to generate tail-predicated
11907 /// loops.
11908static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11909 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11910 const TargetInstrInfo *TII, DebugLoc Dl,
11911 MachineRegisterInfo &MRI, Register OpSrcReg,
11912 Register OpDestReg, Register ElementCountReg,
11913 Register TotalIterationsReg, bool IsMemcpy) {
11914 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11915 // array, loop iteration counter, predication counter.
11916
11917 Register SrcPhiReg, CurrSrcReg;
11918 if (IsMemcpy) {
11919 // Current position in the src array
11920 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11921 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11922 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11923 .addUse(OpSrcReg)
11924 .addMBB(TpEntry)
11925 .addUse(CurrSrcReg)
11926 .addMBB(TpLoopBody);
11927 }
11928
11929 // Current position in the dest array
11930 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11931 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11932 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11933 .addUse(OpDestReg)
11934 .addMBB(TpEntry)
11935 .addUse(CurrDestReg)
11936 .addMBB(TpLoopBody);
11937
11938 // Current loop counter
11939 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11940 Register RemainingLoopIterationsReg =
11941 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11942 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11943 .addUse(TotalIterationsReg)
11944 .addMBB(TpEntry)
11945 .addUse(RemainingLoopIterationsReg)
11946 .addMBB(TpLoopBody);
11947
11948 // Predication counter
11949 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11950 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11951 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11952 .addUse(ElementCountReg)
11953 .addMBB(TpEntry)
11954 .addUse(RemainingElementsReg)
11955 .addMBB(TpLoopBody);
11956
11957 // Pass predication counter to VCTP
11958 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11959 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11960 .addUse(PredCounterPhiReg)
11962 .addReg(0)
11963 .addReg(0);
11964
11965 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11966 .addUse(PredCounterPhiReg)
11967 .addImm(16)
11969 .addReg(0);
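// e.g. starting from 35 remaining elements the counter steps 35 -> 19 -> 3,
// so VCTP8 enables all 16 lanes on the first two iterations and only 3 lanes
// on the last one.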
11970
11971 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11972 Register SrcValueReg;
11973 if (IsMemcpy) {
11974 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11975 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11976 .addDef(CurrSrcReg)
11977 .addDef(SrcValueReg)
11978 .addReg(SrcPhiReg)
11979 .addImm(16)
11981 .addUse(VccrReg)
11982 .addReg(0);
11983 } else
11984 SrcValueReg = OpSrcReg;
11985
11986 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11987 .addDef(CurrDestReg)
11988 .addUse(SrcValueReg)
11989 .addReg(DestPhiReg)
11990 .addImm(16)
11992 .addUse(VccrReg)
11993 .addReg(0);
11994
11995 // Add the pseudo instructions for decrementing the loop counter and marking
11996 // the end: t2LoopDec and t2LoopEnd.
11997 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11998 .addUse(LoopCounterPhiReg)
11999 .addImm(1);
12000
12001 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
12002 .addUse(RemainingLoopIterationsReg)
12003 .addMBB(TpLoopBody);
12004
12005 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
12006 .addMBB(TpExit)
12008}
12009
12010 MachineBasicBlock *
12011 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
12012 MachineBasicBlock *BB) const {
12013 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
12014 DebugLoc dl = MI.getDebugLoc();
12015 bool isThumb2 = Subtarget->isThumb2();
12016 switch (MI.getOpcode()) {
12017 default: {
12018 MI.print(errs());
12019 llvm_unreachable("Unexpected instr type to insert");
12020 }
12021
12022 // Thumb1 post-indexed loads are really just single-register LDMs.
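// i.e. a post-indexed "ldr rT, [rN], #4" is emitted as "ldm rN!, {rT}", which
// performs the same single-word load with base writeback.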
12023 case ARM::tLDR_postidx: {
12024 MachineOperand Def(MI.getOperand(1));
12025 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12026 .add(Def) // Rn_wb
12027 .add(MI.getOperand(2)) // Rn
12028 .add(MI.getOperand(3)) // PredImm
12029 .add(MI.getOperand(4)) // PredReg
12030 .add(MI.getOperand(0)) // Rt
12031 .cloneMemRefs(MI);
12032 MI.eraseFromParent();
12033 return BB;
12034 }
12035
12036 case ARM::MVE_MEMCPYLOOPINST:
12037 case ARM::MVE_MEMSETLOOPINST: {
12038
12039 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12040 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12041 // the iteration count = ceil(size_in_bytes / 16) in the TP entry block and
12042 // adds the relevant instructions in the TP loop Body for generation of a
12043 // WLSTP loop.
12044
12045 // Below is relevant portion of the CFG after the transformation.
12046 // The Machine Basic Blocks are shown along with branch conditions (in
12047 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12048 // portion of the CFG and may not necessarily be the entry/exit of the
12049 // function.
12050
12051 // (Relevant) CFG after transformation:
12052 // TP entry MBB
12053 // |
12054 // |-----------------|
12055 // (n <= 0) (n > 0)
12056 // | |
12057 // | TP loop Body MBB<--|
12058 // | | |
12059 // \ |___________|
12060 // \ /
12061 // TP exit MBB
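// For example, a 100-byte memcpy becomes a WLSTP-style loop of
// ceil(100/16) = 7 iterations, with the final partial iteration predicated by
// MVE_VCTP8 rather than by a scalar tail.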
12062
12063 MachineFunction *MF = BB->getParent();
12064 MachineFunctionProperties &Properties = MF->getProperties();
12066
12067 Register OpDestReg = MI.getOperand(0).getReg();
12068 Register OpSrcReg = MI.getOperand(1).getReg();
12069 Register OpSizeReg = MI.getOperand(2).getReg();
12070
12071 // Allocate the required MBBs and add to parent function.
12072 MachineBasicBlock *TpEntry = BB;
12073 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12074 MachineBasicBlock *TpExit;
12075
12076 MF->push_back(TpLoopBody);
12077
12078 // If any instructions are present in the current block after
12079 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12080 // move the instructions into the newly created exit block. If there are no
12081 // instructions, add an explicit branch to the FallThrough block and then
12082 // split.
12083 //
12084 // The split is required for two reasons:
12085 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
12086 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12087 // need to be updated. splitAt() already handles this.
12088 TpExit = BB->splitAt(MI, false);
12089 if (TpExit == BB) {
12090 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12091 "block containing memcpy/memset Pseudo");
12092 TpExit = BB->getFallThrough();
12093 BuildMI(BB, dl, TII->get(ARM::t2B))
12094 .addMBB(TpExit)
12096 TpExit = BB->splitAt(MI, false);
12097 }
12098
12099 // Add logic for iteration count
12100 Register TotalIterationsReg =
12101 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12102
12103 // Add the vectorized (and predicated) loads/store instructions
12104 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12105 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12106 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12107
12108 // Required to avoid conflict with the MachineVerifier during testing.
12109 Properties.resetNoPHIs();
12110
12111 // Connect the blocks
12112 TpEntry->addSuccessor(TpLoopBody);
12113 TpLoopBody->addSuccessor(TpLoopBody);
12114 TpLoopBody->addSuccessor(TpExit);
12115
12116 // Reorder for a more natural layout
12117 TpLoopBody->moveAfter(TpEntry);
12118 TpExit->moveAfter(TpLoopBody);
12119
12120 // Finally, remove the memcpy Pseudo Instruction
12121 MI.eraseFromParent();
12122
12123 // Return the exit block as it may contain other instructions requiring a
12124 // custom inserter
12125 return TpExit;
12126 }
12127
12128 // The Thumb2 pre-indexed stores have the same MI operands; the .td files
12129 // just define them differently from the isel patterns, so they need
12130 // pseudos.
12131 case ARM::t2STR_preidx:
12132 MI.setDesc(TII->get(ARM::t2STR_PRE));
12133 return BB;
12134 case ARM::t2STRB_preidx:
12135 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12136 return BB;
12137 case ARM::t2STRH_preidx:
12138 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12139 return BB;
12140
12141 case ARM::STRi_preidx:
12142 case ARM::STRBi_preidx: {
12143 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12144 : ARM::STRB_PRE_IMM;
12145 // Decode the offset.
12146 unsigned Offset = MI.getOperand(4).getImm();
12147 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12149 if (isSub)
12150 Offset = -Offset;
12151
12152 MachineMemOperand *MMO = *MI.memoperands_begin();
12153 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12154 .add(MI.getOperand(0)) // Rn_wb
12155 .add(MI.getOperand(1)) // Rt
12156 .add(MI.getOperand(2)) // Rn
12157 .addImm(Offset) // offset (skip GPR==zero_reg)
12158 .add(MI.getOperand(5)) // pred
12159 .add(MI.getOperand(6))
12160 .addMemOperand(MMO);
12161 MI.eraseFromParent();
12162 return BB;
12163 }
12164 case ARM::STRr_preidx:
12165 case ARM::STRBr_preidx:
12166 case ARM::STRH_preidx: {
12167 unsigned NewOpc;
12168 switch (MI.getOpcode()) {
12169 default: llvm_unreachable("unexpected opcode!");
12170 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12171 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12172 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12173 }
12174 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12175 for (const MachineOperand &MO : MI.operands())
12176 MIB.add(MO);
12177 MI.eraseFromParent();
12178 return BB;
12179 }
12180
12181 case ARM::tMOVCCr_pseudo: {
12182 // To "insert" a SELECT_CC instruction, we actually have to insert the
12183 // diamond control-flow pattern. The incoming instruction knows the
12184 // destination vreg to set, the condition code register to branch on, the
12185 // true/false values to select between, and a branch opcode to use.
12186 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12188
12189 // thisMBB:
12190 // ...
12191 // TrueVal = ...
12192 // cmpTY ccX, r1, r2
12193 // bCC copy1MBB
12194 // fallthrough --> copy0MBB
12195 MachineBasicBlock *thisMBB = BB;
12196 MachineFunction *F = BB->getParent();
12197 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12198 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12199 F->insert(It, copy0MBB);
12200 F->insert(It, sinkMBB);
12201
12202 // Set the call frame size on entry to the new basic blocks.
12203 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12204 copy0MBB->setCallFrameSize(CallFrameSize);
12205 sinkMBB->setCallFrameSize(CallFrameSize);
12206
12207 // Check whether CPSR is live past the tMOVCCr_pseudo.
12208 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12209 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12210 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12211 copy0MBB->addLiveIn(ARM::CPSR);
12212 sinkMBB->addLiveIn(ARM::CPSR);
12213 }
12214
12215 // Transfer the remainder of BB and its successor edges to sinkMBB.
12216 sinkMBB->splice(sinkMBB->begin(), BB,
12217 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12219
12220 BB->addSuccessor(copy0MBB);
12221 BB->addSuccessor(sinkMBB);
12222
12223 BuildMI(BB, dl, TII->get(ARM::tBcc))
12224 .addMBB(sinkMBB)
12225 .addImm(MI.getOperand(3).getImm())
12226 .addReg(MI.getOperand(4).getReg());
12227
12228 // copy0MBB:
12229 // %FalseValue = ...
12230 // # fallthrough to sinkMBB
12231 BB = copy0MBB;
12232
12233 // Update machine-CFG edges
12234 BB->addSuccessor(sinkMBB);
12235
12236 // sinkMBB:
12237 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12238 // ...
12239 BB = sinkMBB;
12240 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12241 .addReg(MI.getOperand(1).getReg())
12242 .addMBB(copy0MBB)
12243 .addReg(MI.getOperand(2).getReg())
12244 .addMBB(thisMBB);
12245
12246 MI.eraseFromParent(); // The pseudo instruction is gone now.
12247 return BB;
12248 }
12249
12250 case ARM::BCCi64:
12251 case ARM::BCCZi64: {
12252 // If there is an unconditional branch to the other successor, remove it.
12253 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12254
12255 // Compare both parts that make up the double comparison separately for
12256 // equality.
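// i.e. the low halves are compared first, and the comparison of the high
// halves is itself predicated on EQ, so the final flags read EQ only if both
// halves match.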
12257 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12258
12259 Register LHS1 = MI.getOperand(1).getReg();
12260 Register LHS2 = MI.getOperand(2).getReg();
12261 if (RHSisZero) {
12262 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12263 .addReg(LHS1)
12264 .addImm(0)
12266 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12267 .addReg(LHS2).addImm(0)
12268 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12269 } else {
12270 Register RHS1 = MI.getOperand(3).getReg();
12271 Register RHS2 = MI.getOperand(4).getReg();
12272 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12273 .addReg(LHS1)
12274 .addReg(RHS1)
12276 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12277 .addReg(LHS2).addReg(RHS2)
12278 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12279 }
12280
12281 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12282 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12283 if (MI.getOperand(0).getImm() == ARMCC::NE)
12284 std::swap(destMBB, exitMBB);
12285
12286 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12287 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12288 if (isThumb2)
12289 BuildMI(BB, dl, TII->get(ARM::t2B))
12290 .addMBB(exitMBB)
12292 else
12293 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12294
12295 MI.eraseFromParent(); // The pseudo instruction is gone now.
12296 return BB;
12297 }
12298
12299 case ARM::Int_eh_sjlj_setjmp:
12300 case ARM::Int_eh_sjlj_setjmp_nofp:
12301 case ARM::tInt_eh_sjlj_setjmp:
12302 case ARM::t2Int_eh_sjlj_setjmp:
12303 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12304 return BB;
12305
12306 case ARM::Int_eh_sjlj_setup_dispatch:
12307 EmitSjLjDispatchBlock(MI, BB);
12308 return BB;
12309 case ARM::COPY_STRUCT_BYVAL_I32:
12310 ++NumLoopByVals;
12311 return EmitStructByval(MI, BB);
12312 case ARM::WIN__CHKSTK:
12313 return EmitLowered__chkstk(MI, BB);
12314 case ARM::WIN__DBZCHK:
12315 return EmitLowered__dbzchk(MI, BB);
12316 }
12317}
12318
12319/// Attaches vregs to MEMCPY that it will use as scratch registers
12320/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12321/// instead of as a custom inserter because we need the use list from the SDNode.
12322static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12323 MachineInstr &MI, const SDNode *Node) {
12324 bool isThumb1 = Subtarget->isThumb1Only();
12325
12326 MachineFunction *MF = MI.getParent()->getParent();
12328 MachineInstrBuilder MIB(*MF, MI);
12329
12330 // If the new dst/src is unused mark it as dead.
12331 if (!Node->hasAnyUseOfValue(0)) {
12332 MI.getOperand(0).setIsDead(true);
12333 }
12334 if (!Node->hasAnyUseOfValue(1)) {
12335 MI.getOperand(1).setIsDead(true);
12336 }
12337
12338 // The MEMCPY both defines and kills the scratch registers.
12339 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12340 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12341 : &ARM::GPRRegClass);
12343 }
12344}
12345
12346 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12347 SDNode *Node) const {
12348 if (MI.getOpcode() == ARM::MEMCPY) {
12349 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12350 return;
12351 }
12352
12353 const MCInstrDesc *MCID = &MI.getDesc();
12354 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12355 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12356 // operand is still set to noreg. If needed, set the optional operand's
12357 // register to CPSR, and remove the redundant implicit def.
12358 //
12359 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12360
12361 // Rename pseudo opcodes.
12362 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12363 unsigned ccOutIdx;
12364 if (NewOpc) {
12365 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12366 MCID = &TII->get(NewOpc);
12367
12368 assert(MCID->getNumOperands() ==
12369 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12370 && "converted opcode should be the same except for cc_out"
12371 " (and, on Thumb1, pred)");
12372
12373 MI.setDesc(*MCID);
12374
12375 // Add the optional cc_out operand
12376 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12377
12378 // On Thumb1, move all input operands to the end, then add the predicate
12379 if (Subtarget->isThumb1Only()) {
12380 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12381 MI.addOperand(MI.getOperand(1));
12382 MI.removeOperand(1);
12383 }
12384
12385 // Restore the ties
12386 for (unsigned i = MI.getNumOperands(); i--;) {
12387 const MachineOperand& op = MI.getOperand(i);
12388 if (op.isReg() && op.isUse()) {
12389 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12390 if (DefIdx != -1)
12391 MI.tieOperands(DefIdx, i);
12392 }
12393 }
12394
12396 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12397 ccOutIdx = 1;
12398 } else
12399 ccOutIdx = MCID->getNumOperands() - 1;
12400 } else
12401 ccOutIdx = MCID->getNumOperands() - 1;
12402
12403 // Any ARM instruction that sets the 's' bit should specify an optional
12404 // "cc_out" operand in the last operand position.
12405 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12406 assert(!NewOpc && "Optional cc_out operand required");
12407 return;
12408 }
12409 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12410 // since we already have an optional CPSR def.
12411 bool definesCPSR = false;
12412 bool deadCPSR = false;
12413 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12414 ++i) {
12415 const MachineOperand &MO = MI.getOperand(i);
12416 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12417 definesCPSR = true;
12418 if (MO.isDead())
12419 deadCPSR = true;
12420 MI.removeOperand(i);
12421 break;
12422 }
12423 }
12424 if (!definesCPSR) {
12425 assert(!NewOpc && "Optional cc_out operand required");
12426 return;
12427 }
12428 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12429 if (deadCPSR) {
12430 assert(!MI.getOperand(ccOutIdx).getReg() &&
12431 "expect uninitialized optional cc_out operand");
12432 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12433 if (!Subtarget->isThumb1Only())
12434 return;
12435 }
12436
12437 // If this instruction was defined with an optional CPSR def and its dag node
12438 // had a live implicit CPSR def, then activate the optional CPSR def.
12439 MachineOperand &MO = MI.getOperand(ccOutIdx);
12440 MO.setReg(ARM::CPSR);
12441 MO.setIsDef(true);
12442}
12443
12444//===----------------------------------------------------------------------===//
12445// ARM Optimization Hooks
12446//===----------------------------------------------------------------------===//
12447
12448// Helper function that checks if N is a null or all ones constant.
12449static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12451}
12452
12453// Return true if N is conditionally 0 or all ones.
12454// Detects these expressions where cc is an i1 value:
12455//
12456// (select cc 0, y) [AllOnes=0]
12457// (select cc y, 0) [AllOnes=0]
12458// (zext cc) [AllOnes=0]
12459// (sext cc) [AllOnes=0/1]
12460// (select cc -1, y) [AllOnes=1]
12461// (select cc y, -1) [AllOnes=1]
12462//
12463// Invert is set when N is the null/all ones constant when CC is false.
12464// OtherOp is set to the alternative value of N.
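// For example, with AllOnes = false, N = (select cc, 0, c) yields Invert =
// false and OtherOp = c, while N = (select cc, c, 0) yields Invert = true and
// OtherOp = c.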
12465 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12466 SDValue &CC, bool &Invert,
12467 SDValue &OtherOp,
12468 SelectionDAG &DAG) {
12469 switch (N->getOpcode()) {
12470 default: return false;
12471 case ISD::SELECT: {
12472 CC = N->getOperand(0);
12473 SDValue N1 = N->getOperand(1);
12474 SDValue N2 = N->getOperand(2);
12475 if (isZeroOrAllOnes(N1, AllOnes)) {
12476 Invert = false;
12477 OtherOp = N2;
12478 return true;
12479 }
12480 if (isZeroOrAllOnes(N2, AllOnes)) {
12481 Invert = true;
12482 OtherOp = N1;
12483 return true;
12484 }
12485 return false;
12486 }
12487 case ISD::ZERO_EXTEND:
12488 // (zext cc) can never be the all ones value.
12489 if (AllOnes)
12490 return false;
12491 [[fallthrough]];
12492 case ISD::SIGN_EXTEND: {
12493 SDLoc dl(N);
12494 EVT VT = N->getValueType(0);
12495 CC = N->getOperand(0);
12496 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12497 return false;
12498 Invert = !AllOnes;
12499 if (AllOnes)
12500 // When looking for an AllOnes constant, N is an sext, and the 'other'
12501 // value is 0.
12502 OtherOp = DAG.getConstant(0, dl, VT);
12503 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12504 // When looking for a 0 constant, N can be zext or sext.
12505 OtherOp = DAG.getConstant(1, dl, VT);
12506 else
12507 OtherOp = DAG.getAllOnesConstant(dl, VT);
12508 return true;
12509 }
12510 }
12511}
12512
12513// Combine a constant select operand into its use:
12514//
12515 // (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
12516 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
12517 // (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
12518 // (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
12519 // (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
12520//
12521// The transform is rejected if the select doesn't have a constant operand that
12522// is null, or all ones when AllOnes is set.
12523//
12524// Also recognize sext/zext from i1:
12525//
12526// (add (zext cc), x) -> (select cc (add x, 1), x)
12527// (add (sext cc), x) -> (select cc (add x, -1), x)
12528//
12529// These transformations eventually create predicated instructions.
12530//
12531// @param N The node to transform.
12532// @param Slct The N operand that is a select.
12533// @param OtherOp The other N operand (x above).
12534// @param DCI Context.
12535// @param AllOnes Require the select constant to be all ones instead of null.
12536// @returns The new node, or SDValue() on failure.
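// For instance, (add (zext cc), x) becomes (select cc, (add x, 1), x), so the
// add only takes effect when cc is true and can later be folded into a
// predicated instruction.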
12537 static
12538 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12539 TargetLowering::DAGCombinerInfo &DCI,
12540 bool AllOnes = false) {
12541 SelectionDAG &DAG = DCI.DAG;
12542 EVT VT = N->getValueType(0);
12543 SDValue NonConstantVal;
12544 SDValue CCOp;
12545 bool SwapSelectOps;
12546 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12547 NonConstantVal, DAG))
12548 return SDValue();
12549
12550 // Slct is now known to be the desired identity constant when CC is true.
12551 SDValue TrueVal = OtherOp;
12552 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12553 OtherOp, NonConstantVal);
12554 // Unless SwapSelectOps says CC should be false.
12555 if (SwapSelectOps)
12556 std::swap(TrueVal, FalseVal);
12557
12558 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12559 CCOp, TrueVal, FalseVal);
12560}
12561
12562// Attempt combineSelectAndUse on each operand of a commutative operator N.
12563 static
12564 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12565 TargetLowering::DAGCombinerInfo &DCI) {
12566 SDValue N0 = N->getOperand(0);
12567 SDValue N1 = N->getOperand(1);
12568 if (N0.getNode()->hasOneUse())
12569 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12570 return Result;
12571 if (N1.getNode()->hasOneUse())
12572 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12573 return Result;
12574 return SDValue();
12575}
12576
12577 static bool IsVUZPShuffleNode(SDNode *N) {
12578 // VUZP shuffle node.
12579 if (N->getOpcode() == ARMISD::VUZP)
12580 return true;
12581
12582 // "VUZP" on i32 is an alias for VTRN.
12583 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12584 return true;
12585
12586 return false;
12587}
12588
12591 const ARMSubtarget *Subtarget) {
12592 // Look for ADD(VUZP.0, VUZP.1).
12593 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12594 N0 == N1)
12595 return SDValue();
12596
12597 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12598 if (!N->getValueType(0).is64BitVector())
12599 return SDValue();
12600
12601 // Generate vpadd.
12602 SelectionDAG &DAG = DCI.DAG;
12603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12604 SDLoc dl(N);
12605 SDNode *Unzip = N0.getNode();
12606 EVT VT = N->getValueType(0);
12607
12609 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12610 TLI.getPointerTy(DAG.getDataLayout())));
12611 Ops.push_back(Unzip->getOperand(0));
12612 Ops.push_back(Unzip->getOperand(1));
12613
12614 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12615}
12616
12619 const ARMSubtarget *Subtarget) {
12620 // Check for two extended operands.
12621 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12622 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12623 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12624 N1.getOpcode() == ISD::ZERO_EXTEND))
12625 return SDValue();
12626
12627 SDValue N00 = N0.getOperand(0);
12628 SDValue N10 = N1.getOperand(0);
12629
12630 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12631 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12632 N00 == N10)
12633 return SDValue();
12634
12635 // We only recognize Q register paddl here; this can't be reached until
12636 // after type legalization.
12637 if (!N00.getValueType().is64BitVector() ||
12639 return SDValue();
12640
12641 // Generate vpaddl.
12642 SelectionDAG &DAG = DCI.DAG;
12643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12644 SDLoc dl(N);
12645 EVT VT = N->getValueType(0);
12646
12648 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12649 unsigned Opcode;
12650 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12651 Opcode = Intrinsic::arm_neon_vpaddls;
12652 else
12653 Opcode = Intrinsic::arm_neon_vpaddlu;
12654 Ops.push_back(DAG.getConstant(Opcode, dl,
12655 TLI.getPointerTy(DAG.getDataLayout())));
12656 EVT ElemTy = N00.getValueType().getVectorElementType();
12657 unsigned NumElts = VT.getVectorNumElements();
12658 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12659 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12660 N00.getOperand(0), N00.getOperand(1));
12661 Ops.push_back(Concat);
12662
12663 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12664}
12665
12666// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12667// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12668// much easier to match.
12669static SDValue
12672 const ARMSubtarget *Subtarget) {
12673 // Only perform the optimization after legalization and if NEON is available.
12674 // We also expect both operands to be BUILD_VECTORs.
12675 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12676 || N0.getOpcode() != ISD::BUILD_VECTOR
12677 || N1.getOpcode() != ISD::BUILD_VECTOR)
12678 return SDValue();
12679
12680 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12681 EVT VT = N->getValueType(0);
12682 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12683 return SDValue();
12684
12685 // Check that the vector operands are of the right form.
12686 // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR
12687 // operands, where N is the size of the formed vector.
12688 // Each EXTRACT_VECTOR should have the same input vector and an odd or even
12689 // index such that we have a pairwise add pattern.
12690
12691 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12692 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12693 return SDValue();
12694 SDValue Vec = N0->getOperand(0)->getOperand(0);
12695 SDNode *V = Vec.getNode();
12696 unsigned nextIndex = 0;
12697
12698 // For each operand of the ADD (both are BUILD_VECTORs),
12699 // check that each of its operands is an EXTRACT_VECTOR with
12700 // the same input vector and the appropriate index.
12701 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12702 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12703 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12704
12705 SDValue ExtVec0 = N0->getOperand(i);
12706 SDValue ExtVec1 = N1->getOperand(i);
12707
12708 // First operand is the vector; verify it's the same.
12709 if (V != ExtVec0->getOperand(0).getNode() ||
12710 V != ExtVec1->getOperand(0).getNode())
12711 return SDValue();
12712
12713 // Second is the constant; verify it's correct.
12714 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12715 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12716
12717 // For the constant, we want to see all the even or all the odd.
12718 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12719 || C1->getZExtValue() != nextIndex+1)
12720 return SDValue();
12721
12722 // Increment index.
12723 nextIndex+=2;
12724 } else
12725 return SDValue();
12726 }
12727
12728 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12729 // we're using the entire input vector, otherwise there's a size/legality
12730 // mismatch somewhere.
12731 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12733 return SDValue();
12734
12735 // Create VPADDL node.
12736 SelectionDAG &DAG = DCI.DAG;
12737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12738
12739 SDLoc dl(N);
12740
12741 // Build operand list.
12742 SmallVector<SDValue, 8> Ops;
12743 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12744 TLI.getPointerTy(DAG.getDataLayout())));
12745
12746 // Input is the vector.
12747 Ops.push_back(Vec);
12748
12749 // Get widened type and narrowed type.
12750 MVT widenType;
12751 unsigned numElem = VT.getVectorNumElements();
12752
12753 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12754 switch (inputLaneType.getSimpleVT().SimpleTy) {
12755 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12756 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12757 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12758 default:
12759 llvm_unreachable("Invalid vector element type for padd optimization.");
12760 }
12761
12762 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12763 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12764 return DAG.getNode(ExtOp, dl, VT, tmp);
12765}
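// Illustrative sketch (lane numbering chosen for the example only): for an
// input vector %v the matched shape is
//   N0 = build_vector (extract %v, 0), (extract %v, 2), ...
//   N1 = build_vector (extract %v, 1), (extract %v, 3), ...
//   add N0, N1
// which adds adjacent lane pairs, so the combine above emits a VPADDL on %v
// and then truncates (or extends) the widened result back to the add's type.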
12766
12767 static SDValue findMUL_LOHI(SDValue V) {
12768 if (V->getOpcode() == ISD::UMUL_LOHI ||
12769 V->getOpcode() == ISD::SMUL_LOHI)
12770 return V;
12771 return SDValue();
12772}
12773
12774static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12775 TargetLowering::DAGCombinerInfo &DCI,
12776 const ARMSubtarget *Subtarget) {
12777 if (!Subtarget->hasBaseDSP())
12778 return SDValue();
12779
12780 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12781 // accumulates the product into a 64-bit value. The 16-bit values will
12782 // be sign extended somehow or SRA'd into 32-bit values
12783 // (addc (adde (mul 16bit, 16bit), lo), hi)
12784 SDValue Mul = AddcNode->getOperand(0);
12785 SDValue Lo = AddcNode->getOperand(1);
12786 if (Mul.getOpcode() != ISD::MUL) {
12787 Lo = AddcNode->getOperand(0);
12788 Mul = AddcNode->getOperand(1);
12789 if (Mul.getOpcode() != ISD::MUL)
12790 return SDValue();
12791 }
12792
12793 SDValue SRA = AddeNode->getOperand(0);
12794 SDValue Hi = AddeNode->getOperand(1);
12795 if (SRA.getOpcode() != ISD::SRA) {
12796 SRA = AddeNode->getOperand(1);
12797 Hi = AddeNode->getOperand(0);
12798 if (SRA.getOpcode() != ISD::SRA)
12799 return SDValue();
12800 }
12801 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12802 if (Const->getZExtValue() != 31)
12803 return SDValue();
12804 } else
12805 return SDValue();
12806
12807 if (SRA.getOperand(0) != Mul)
12808 return SDValue();
12809
12810 SelectionDAG &DAG = DCI.DAG;
12811 SDLoc dl(AddcNode);
12812 unsigned Opcode = 0;
12813 SDValue Op0;
12814 SDValue Op1;
12815
12816 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12817 Opcode = ARMISD::SMLALBB;
12818 Op0 = Mul.getOperand(0);
12819 Op1 = Mul.getOperand(1);
12820 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12821 Opcode = ARMISD::SMLALBT;
12822 Op0 = Mul.getOperand(0);
12823 Op1 = Mul.getOperand(1).getOperand(0);
12824 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12825 Opcode = ARMISD::SMLALTB;
12826 Op0 = Mul.getOperand(0).getOperand(0);
12827 Op1 = Mul.getOperand(1);
12828 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12829 Opcode = ARMISD::SMLALTT;
12830 Op0 = Mul->getOperand(0).getOperand(0);
12831 Op1 = Mul->getOperand(1).getOperand(0);
12832 }
12833
12834 if (!Op0 || !Op1)
12835 return SDValue();
12836
12837 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12838 Op0, Op1, Lo, Hi);
12839 // Replace the ADD nodes' uses with the MLA node's values.
12840 SDValue HiMLALResult(SMLAL.getNode(), 1);
12841 SDValue LoMLALResult(SMLAL.getNode(), 0);
12842
12843 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12844 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12845
12846 // Return original node to notify the driver to stop replacing.
12847 SDValue resNode(AddcNode, 0);
12848 return resNode;
12849}
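// Illustrative sketch (register names are placeholders): for a chain such as
//   addc/adde (mul (sext16 a), (sext16 b)), (lo, hi)
// the code above selects one of SMLALBB/SMLALBT/SMLALTB/SMLALTT, e.g.
//   smlalbb lo, hi, a, b   ; (hi:lo) += a[15:0] * b[15:0]
// accumulating the 16x16->32 product directly into the 64-bit hi:lo pair
// instead of materialising the full multiply and separate adds.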
12850
12851 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12852 TargetLowering::DAGCombinerInfo &DCI,
12853 const ARMSubtarget *Subtarget) {
12854 // Look for multiply add opportunities.
12855 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12856 // each add node consumes a value from ISD::UMUL_LOHI and there is
12857 // a glue link from the first add to the second add.
12858 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12859 // a S/UMLAL instruction.
12860 // UMUL_LOHI
12861 // / :lo \ :hi
12862 // V \ [no multiline comment]
12863 // loAdd -> ADDC |
12864 // \ :carry /
12865 // V V
12866 // ADDE <- hiAdd
12867 //
12868 // In the special case where only the higher part of a signed result is used
12869 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12870 // a constant with the exact value of 0x80000000, we recognize we are dealing
12871 // with a "rounded multiply and add" (or subtract) and transform it into
12872 // either an ARMISD::SMMLAR or an ARMISD::SMMLSR, respectively.
12873
12874 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12875 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12876 "Expect an ADDE or SUBE");
12877
12878 assert(AddeSubeNode->getNumOperands() == 3 &&
12879 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12880 "ADDE node has the wrong inputs");
12881
12882 // Check that we are chained to the right ADDC or SUBC node.
12883 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12884 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12885 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12886 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12887 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12888 return SDValue();
12889
12890 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12891 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12892
12893 // Check if the two operands are from the same mul_lohi node.
12894 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12895 return SDValue();
12896
12897 assert(AddcSubcNode->getNumValues() == 2 &&
12898 AddcSubcNode->getValueType(0) == MVT::i32 &&
12899 "Expect ADDC with two result values. First: i32");
12900
12901 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12902 // may be an SMLAL which multiplies two 16-bit values.
12903 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12904 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12905 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12906 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12907 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12908 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12909
12910 // Check for the triangle shape.
12911 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12912 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12913
12914 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12915 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12916 return SDValue();
12917
12918 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12919 bool IsLeftOperandMUL = false;
12920 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12921 if (MULOp == SDValue())
12922 MULOp = findMUL_LOHI(AddeSubeOp1);
12923 else
12924 IsLeftOperandMUL = true;
12925 if (MULOp == SDValue())
12926 return SDValue();
12927
12928 // Figure out the right opcode.
12929 unsigned Opc = MULOp->getOpcode();
12930 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12931
12932 // Figure out the high and low input values to the MLAL node.
12933 SDValue *HiAddSub = nullptr;
12934 SDValue *LoMul = nullptr;
12935 SDValue *LowAddSub = nullptr;
12936
12937 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12938 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12939 return SDValue();
12940
12941 if (IsLeftOperandMUL)
12942 HiAddSub = &AddeSubeOp1;
12943 else
12944 HiAddSub = &AddeSubeOp0;
12945
12946 // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
12947 // node whose low result is fed to the ADDC/SUBC we are checking.
12948
12949 if (AddcSubcOp0 == MULOp.getValue(0)) {
12950 LoMul = &AddcSubcOp0;
12951 LowAddSub = &AddcSubcOp1;
12952 }
12953 if (AddcSubcOp1 == MULOp.getValue(0)) {
12954 LoMul = &AddcSubcOp1;
12955 LowAddSub = &AddcSubcOp0;
12956 }
12957
12958 if (!LoMul)
12959 return SDValue();
12960
12961 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12962 // the replacement below will create a cycle.
12963 if (AddcSubcNode == HiAddSub->getNode() ||
12964 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12965 return SDValue();
12966
12967 // Create the merged node.
12968 SelectionDAG &DAG = DCI.DAG;
12969
12970 // Start building operand list.
12971 SmallVector<SDValue, 8> Ops;
12972 Ops.push_back(LoMul->getOperand(0));
12973 Ops.push_back(LoMul->getOperand(1));
12974
12975 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12976 // the case, we must be doing signed multiplication and only use the higher
12977 // part of the result of the MLAL; furthermore, the LowAddSub must be a
12978 // constant addition or subtraction with the value 0x80000000.
12979 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12980 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12981 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12982 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12983 0x80000000) {
12984 Ops.push_back(*HiAddSub);
12985 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12986 FinalOpc = ARMISD::SMMLSR;
12987 } else {
12988 FinalOpc = ARMISD::SMMLAR;
12989 }
12990 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12991 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12992
12993 return SDValue(AddeSubeNode, 0);
12994 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12995 // SMMLS is generated during instruction selection and the rest of this
12996 // function cannot handle the case where AddcSubcNode is a SUBC.
12997 return SDValue();
12998
12999 // Finish building the operand list for {U/S}MLAL
13000 Ops.push_back(*LowAddSub);
13001 Ops.push_back(*HiAddSub);
13002
13003 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13004 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13005
13006 // Replace the ADD nodes' uses with the MLA node's values.
13007 SDValue HiMLALResult(MLALNode.getNode(), 1);
13008 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13009
13010 SDValue LoMLALResult(MLALNode.getNode(), 0);
13011 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13012
13013 // Return original node to notify the driver to stop replacing.
13014 return SDValue(AddeSubeNode, 0);
13015}
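// Illustrative sketch (placeholder operands): the triangle described above,
//   t0:lo, t0:hi = smul_lohi a, b
//   t1 = ARMISD::ADDC t0:lo, addend_lo
//   t2 = ARMISD::ADDE t0:hi, addend_hi, t1:carry
// collapses into a single ARMISD::SMLAL (or UMLAL) node that produces both
// halves; when only the high half is used and the low addend is the rounding
// constant 0x80000000, SMMLAR/SMMLSR is produced instead.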
13016
13017 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13018 TargetLowering::DAGCombinerInfo &DCI,
13019 const ARMSubtarget *Subtarget) {
13020 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13021 // While trying to combine for the other MLAL nodes, first search for the
13022 // chance to use UMAAL. Check if Addc uses a node which has already
13023 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13024 // as the addend, and it's handled in PerformUMLALCombine.
13025
13026 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13027 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13028
13029 // Check that we have a glued ADDC node.
13030 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13031 if (AddcNode->getOpcode() != ARMISD::ADDC)
13032 return SDValue();
13033
13034 // Find the converted UMAAL or quit if it doesn't exist.
13035 SDNode *UmlalNode = nullptr;
13036 SDValue AddHi;
13037 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13038 UmlalNode = AddcNode->getOperand(0).getNode();
13039 AddHi = AddcNode->getOperand(1);
13040 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13041 UmlalNode = AddcNode->getOperand(1).getNode();
13042 AddHi = AddcNode->getOperand(0);
13043 } else {
13044 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13045 }
13046
13047 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13048 // the ADDC as well as Zero.
13049 if (!isNullConstant(UmlalNode->getOperand(3)))
13050 return SDValue();
13051
13052 if ((isNullConstant(AddeNode->getOperand(0)) &&
13053 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13054 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13055 isNullConstant(AddeNode->getOperand(1)))) {
13056 SelectionDAG &DAG = DCI.DAG;
13057 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13058 UmlalNode->getOperand(2), AddHi };
13059 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13060 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13061
13062 // Replace the ADD nodes' uses with the UMAAL node's values.
13063 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13064 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13065
13066 // Return original node to notify the driver to stop replacing.
13067 return SDValue(AddeNode, 0);
13068 }
13069 return SDValue();
13070}
13071
13072 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13073 const ARMSubtarget *Subtarget) {
13074 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13075 return SDValue();
13076
13077 // Check that we have a pair of ADDC and ADDE as operands.
13078 // Both addends of the ADDE must be zero.
13079 SDNode* AddcNode = N->getOperand(2).getNode();
13080 SDNode* AddeNode = N->getOperand(3).getNode();
13081 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13082 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13083 isNullConstant(AddeNode->getOperand(0)) &&
13084 isNullConstant(AddeNode->getOperand(1)) &&
13085 (AddeNode->getOperand(2).getNode() == AddcNode))
13086 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13087 DAG.getVTList(MVT::i32, MVT::i32),
13088 {N->getOperand(0), N->getOperand(1),
13089 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13090 else
13091 return SDValue();
13092}
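// Illustrative sketch (placeholder operands): when a UMLAL's two accumulator
// inputs come from an ADDC/ADDE pair whose ADDE addends are both zero,
//   lo, hi = ARMISD::UMLAL a, b, (ADDC x, y), (ADDE 0, 0, carry)
// the node is re-formed above as ARMISD::UMAAL a, b, x, y, folding both 32-bit
// addends into the 64-bit product with a single instruction.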
13093
13096 const ARMSubtarget *Subtarget) {
13097 SelectionDAG &DAG(DCI.DAG);
13098
13099 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13100 // (SUBC (ADDE 0, 0, C), 1) -> C
13101 SDValue LHS = N->getOperand(0);
13102 SDValue RHS = N->getOperand(1);
13103 if (LHS->getOpcode() == ARMISD::ADDE &&
13104 isNullConstant(LHS->getOperand(0)) &&
13105 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13106 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13107 }
13108 }
13109
13110 if (Subtarget->isThumb1Only()) {
13111 SDValue RHS = N->getOperand(1);
13112 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13113 int32_t imm = C->getSExtValue();
13114 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13115 SDLoc DL(N);
13116 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13117 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13118 : ARMISD::ADDC;
13119 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13120 }
13121 }
13122 }
13123
13124 return SDValue();
13125}
13126
13127 static SDValue PerformAddeSubeCombine(SDNode *N,
13128 TargetLowering::DAGCombinerInfo &DCI,
13129 const ARMSubtarget *Subtarget) {
13130 if (Subtarget->isThumb1Only()) {
13131 SelectionDAG &DAG = DCI.DAG;
13132 SDValue RHS = N->getOperand(1);
13133 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13134 int64_t imm = C->getSExtValue();
13135 if (imm < 0) {
13136 SDLoc DL(N);
13137
13138 // The with-carry-in form matches bitwise not instead of the negation.
13139 // Effectively, the inverse interpretation of the carry flag already
13140 // accounts for part of the negation.
13141 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13142
13143 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13144 : ARMISD::ADDE;
13145 return DAG.getNode(Opcode, DL, N->getVTList(),
13146 N->getOperand(0), RHS, N->getOperand(2));
13147 }
13148 }
13149 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13150 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13151 }
13152 return SDValue();
13153}
13154
13157 const ARMSubtarget *Subtarget) {
13158 if (!Subtarget->hasMVEIntegerOps())
13159 return SDValue();
13160
13161 SDLoc dl(N);
13162 SDValue SetCC;
13163 SDValue LHS;
13164 SDValue RHS;
13165 ISD::CondCode CC;
13166 SDValue TrueVal;
13167 SDValue FalseVal;
13168
13169 if (N->getOpcode() == ISD::SELECT &&
13170 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13171 SetCC = N->getOperand(0);
13172 LHS = SetCC->getOperand(0);
13173 RHS = SetCC->getOperand(1);
13174 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13175 TrueVal = N->getOperand(1);
13176 FalseVal = N->getOperand(2);
13177 } else if (N->getOpcode() == ISD::SELECT_CC) {
13178 LHS = N->getOperand(0);
13179 RHS = N->getOperand(1);
13180 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13181 TrueVal = N->getOperand(2);
13182 FalseVal = N->getOperand(3);
13183 } else {
13184 return SDValue();
13185 }
13186
13187 unsigned int Opcode = 0;
13188 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13189 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13190 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13191 Opcode = ARMISD::VMINVu;
13192 if (CC == ISD::SETUGT)
13193 std::swap(TrueVal, FalseVal);
13194 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13195 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13196 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13197 Opcode = ARMISD::VMINVs;
13198 if (CC == ISD::SETGT)
13199 std::swap(TrueVal, FalseVal);
13200 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13201 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13202 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13203 Opcode = ARMISD::VMAXVu;
13204 if (CC == ISD::SETULT)
13205 std::swap(TrueVal, FalseVal);
13206 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13207 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13208 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13209 Opcode = ARMISD::VMAXVs;
13210 if (CC == ISD::SETLT)
13211 std::swap(TrueVal, FalseVal);
13212 } else
13213 return SDValue();
13214
13215 // Normalise to the right hand side being the vector reduction
13216 switch (TrueVal->getOpcode()) {
13217 case ISD::VECREDUCE_UMIN:
13218 case ISD::VECREDUCE_SMIN:
13219 case ISD::VECREDUCE_UMAX:
13220 case ISD::VECREDUCE_SMAX:
13221 std::swap(LHS, RHS);
13222 std::swap(TrueVal, FalseVal);
13223 break;
13224 }
13225
13226 EVT VectorType = FalseVal->getOperand(0).getValueType();
13227
13228 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13229 VectorType != MVT::v4i32)
13230 return SDValue();
13231
13232 EVT VectorScalarType = VectorType.getVectorElementType();
13233
13234 // The values being selected must also be the ones being compared
13235 if (TrueVal != LHS || FalseVal != RHS)
13236 return SDValue();
13237
13238 EVT LeftType = LHS->getValueType(0);
13239 EVT RightType = RHS->getValueType(0);
13240
13241 // The types must match the reduced type too
13242 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13243 return SDValue();
13244
13245 // Legalise the scalar to an i32
13246 if (VectorScalarType != MVT::i32)
13247 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13248
13249 // Generate the reduction as an i32 for legalisation purposes
13250 auto Reduction =
13251 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13252
13253 // The result isn't actually an i32 so truncate it back to its original type
13254 if (VectorScalarType != MVT::i32)
13255 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13256
13257 return Reduction;
13258}
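// Illustrative sketch (hypothetical IR): the combine above turns
//   %r = vecreduce_umin %vec
//   %m = select (setult %x, %r), %x, %r        ; i.e. min(%x, umin(%vec))
// into ARMISD::VMINVu %x, %vec, so MVE's VMINV can fold the scalar operand
// into the across-vector reduction instead of a separate compare and select.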
13259
13260 // A special combine for the vqdmulh family of instructions. This is one of the
13261 // potential set of patterns that could match this instruction. The base pattern
13262 // you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13263 // This matches the slightly different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13264 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13265 // the max is unnecessary.
13266 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13267 EVT VT = N->getValueType(0);
13268 SDValue Shft;
13269 ConstantSDNode *Clamp;
13270
13271 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13272 return SDValue();
13273
13274 if (N->getOpcode() == ISD::SMIN) {
13275 Shft = N->getOperand(0);
13276 Clamp = isConstOrConstSplat(N->getOperand(1));
13277 } else if (N->getOpcode() == ISD::VSELECT) {
13278 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13279 SDValue Cmp = N->getOperand(0);
13280 if (Cmp.getOpcode() != ISD::SETCC ||
13281 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13282 Cmp.getOperand(0) != N->getOperand(1) ||
13283 Cmp.getOperand(1) != N->getOperand(2))
13284 return SDValue();
13285 Shft = N->getOperand(1);
13286 Clamp = isConstOrConstSplat(N->getOperand(2));
13287 } else
13288 return SDValue();
13289
13290 if (!Clamp)
13291 return SDValue();
13292
13293 MVT ScalarType;
13294 int ShftAmt = 0;
13295 switch (Clamp->getSExtValue()) {
13296 case (1 << 7) - 1:
13297 ScalarType = MVT::i8;
13298 ShftAmt = 7;
13299 break;
13300 case (1 << 15) - 1:
13301 ScalarType = MVT::i16;
13302 ShftAmt = 15;
13303 break;
13304 case (1ULL << 31) - 1:
13305 ScalarType = MVT::i32;
13306 ShftAmt = 31;
13307 break;
13308 default:
13309 return SDValue();
13310 }
13311
13312 if (Shft.getOpcode() != ISD::SRA)
13313 return SDValue();
13314 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13315 if (!N1 || N1->getSExtValue() != ShftAmt)
13316 return SDValue();
13317
13318 SDValue Mul = Shft.getOperand(0);
13319 if (Mul.getOpcode() != ISD::MUL)
13320 return SDValue();
13321
13322 SDValue Ext0 = Mul.getOperand(0);
13323 SDValue Ext1 = Mul.getOperand(1);
13324 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13325 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13326 return SDValue();
13327 EVT VecVT = Ext0.getOperand(0).getValueType();
13328 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13329 return SDValue();
13330 if (Ext1.getOperand(0).getValueType() != VecVT ||
13331 VecVT.getScalarType() != ScalarType ||
13332 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13333 return SDValue();
13334
13335 SDLoc DL(Mul);
13336 unsigned LegalLanes = 128 / (ShftAmt + 1);
13337 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13338 // For types smaller than legal vectors, extend to be legal and only use the
13339 // needed lanes.
13340 if (VecVT.getSizeInBits() < 128) {
13341 EVT ExtVecVT =
13343 VecVT.getVectorNumElements());
13344 SDValue Inp0 =
13345 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13346 SDValue Inp1 =
13347 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13348 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13349 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13350 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13351 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13352 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13353 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13354 }
13355
13356 // For larger types, split into legal sized chunks.
13357 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13358 unsigned NumParts = VecVT.getSizeInBits() / 128;
13359 SmallVector<SDValue> Parts;
13360 for (unsigned I = 0; I < NumParts; ++I) {
13361 SDValue Inp0 =
13362 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13363 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13364 SDValue Inp1 =
13365 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13366 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13367 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13368 Parts.push_back(VQDMULH);
13369 }
13370 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13371 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13372}
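// Illustrative sketch (types chosen for the example only): for i16 lanes the
// matched shape is
//   %p = mul (sext %a), (sext %b)      ; in a wider element type
//   %s = ashr %p, 15
//   %r = smin %s, 32767
// which the code above emits as one ARMISD::VQDMULH (split into 128-bit
// chunks for larger vectors), i.e. the saturating doubling high-half multiply
// that this pattern encodes.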
13373
13376 const ARMSubtarget *Subtarget) {
13377 if (!Subtarget->hasMVEIntegerOps())
13378 return SDValue();
13379
13380 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13381 return V;
13382
13383 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13384 //
13385 // We need to re-implement this optimization here as the implementation in the
13386 // Target-Independent DAGCombiner does not handle the kind of constant we make
13387 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13388 // good reason, allowing truncation there would break other targets).
13389 //
13390 // Currently, this is only done for MVE, as it's the only target that benefits
13391 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13392 if (N->getOperand(0).getOpcode() != ISD::XOR)
13393 return SDValue();
13394 SDValue XOR = N->getOperand(0);
13395
13396 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13397 // It is important to check with truncation allowed as the BUILD_VECTORs we
13398 // generate in those situations will truncate their operands.
13399 ConstantSDNode *Const =
13400 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13401 /*AllowTruncation*/ true);
13402 if (!Const || !Const->isOne())
13403 return SDValue();
13404
13405 // Rewrite into vselect(cond, rhs, lhs).
13406 SDValue Cond = XOR->getOperand(0);
13407 SDValue LHS = N->getOperand(1);
13408 SDValue RHS = N->getOperand(2);
13409 EVT Type = N->getValueType(0);
13410 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13411}
13412
13413// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13416 const ARMSubtarget *Subtarget) {
13417 SDValue Op0 = N->getOperand(0);
13418 SDValue Op1 = N->getOperand(1);
13419 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13420 EVT VT = N->getValueType(0);
13421
13422 if (!Subtarget->hasMVEIntegerOps() ||
13424 return SDValue();
13425
13426 if (CC == ISD::SETUGE) {
13427 std::swap(Op0, Op1);
13428 CC = ISD::SETULT;
13429 }
13430
13431 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13433 return SDValue();
13434
13435 // Check first operand is BuildVector of 0,1,2,...
13436 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13437 if (!Op0.getOperand(I).isUndef() &&
13438 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13439 Op0.getConstantOperandVal(I) == I))
13440 return SDValue();
13441 }
13442
13443 // The second is a Splat of Op1S
13444 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13445 if (!Op1S)
13446 return SDValue();
13447
13448 unsigned Opc;
13449 switch (VT.getVectorNumElements()) {
13450 case 2:
13451 Opc = Intrinsic::arm_mve_vctp64;
13452 break;
13453 case 4:
13454 Opc = Intrinsic::arm_mve_vctp32;
13455 break;
13456 case 8:
13457 Opc = Intrinsic::arm_mve_vctp16;
13458 break;
13459 case 16:
13460 Opc = Intrinsic::arm_mve_vctp8;
13461 break;
13462 default:
13463 return SDValue();
13464 }
13465
13466 SDLoc DL(N);
13467 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13468 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13469 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13470}
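// Illustrative sketch (hypothetical operands): a lane-index comparison such as
//   setcc (build_vector 0, 1, 2, 3), (splat %n), ult     ; v4i1 result
// computes the "first %n lanes active" predicate, so it is replaced above by
// the equivalent @llvm.arm.mve.vctp32(%n) call (vctp8/16/64 for other widths).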
13471
13472/// PerformADDECombine - Target-specific dag combine transform from
13473/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13474/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13475 static SDValue PerformADDECombine(SDNode *N,
13476 TargetLowering::DAGCombinerInfo &DCI,
13477 const ARMSubtarget *Subtarget) {
13478 // Only ARM and Thumb2 support UMLAL/SMLAL.
13479 if (Subtarget->isThumb1Only())
13480 return PerformAddeSubeCombine(N, DCI, Subtarget);
13481
13482 // Only perform the checks after legalize when the pattern is available.
13483 if (DCI.isBeforeLegalize()) return SDValue();
13484
13485 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13486}
13487
13488/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13489/// operands N0 and N1. This is a helper for PerformADDCombine that is
13490/// called with the default operands, and if that fails, with commuted
13491/// operands.
13492 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0,
13493 SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
13494 const ARMSubtarget *Subtarget){
13495 // Attempt to create vpadd for this add.
13496 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13497 return Result;
13498
13499 // Attempt to create vpaddl for this add.
13500 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13501 return Result;
13502 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13503 Subtarget))
13504 return Result;
13505
13506 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13507 if (N0.getNode()->hasOneUse())
13508 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13509 return Result;
13510 return SDValue();
13511}
13512
13514 EVT VT = N->getValueType(0);
13515 SDValue N0 = N->getOperand(0);
13516 SDValue N1 = N->getOperand(1);
13517 SDLoc dl(N);
13518
13519 auto IsVecReduce = [](SDValue Op) {
13520 switch (Op.getOpcode()) {
13521 case ISD::VECREDUCE_ADD:
13522 case ARMISD::VADDVs:
13523 case ARMISD::VADDVu:
13524 case ARMISD::VMLAVs:
13525 case ARMISD::VMLAVu:
13526 return true;
13527 }
13528 return false;
13529 };
13530
13531 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13532 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13533 // add(add(X, vecreduce(Y)), vecreduce(Z))
13534 // to make better use of vaddva style instructions.
13535 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13536 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13537 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13538 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13539 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13540 }
13541 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13542 // add(add(add(A, C), reduce(B)), reduce(D))
13543 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13544 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13545 unsigned N0RedOp = 0;
13546 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13547 N0RedOp = 1;
13548 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13549 return SDValue();
13550 }
13551
13552 unsigned N1RedOp = 0;
13553 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13554 N1RedOp = 1;
13555 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13556 return SDValue();
13557
13558 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13559 N1.getOperand(1 - N1RedOp));
13560 SDValue Add1 =
13561 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13562 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13563 }
13564 return SDValue();
13565 };
13566 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13567 return R;
13568 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13569 return R;
13570
13571 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13572 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13573 // by ascending load offsets. This can help cores prefetch if the order of
13574 // loads is more predictable.
13575 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13576 // Check if two reductions are known to load data where one is before/after
13577 // another. Return negative if N0 loads data before N1, positive if N1 is
13578 // before N0, and 0 if nothing is known.
13579 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13580 // Look through to the first operand of a MUL, for the VMLA case.
13581 // Currently only looks at the first operand, in the hope they are equal.
13582 if (N0.getOpcode() == ISD::MUL)
13583 N0 = N0.getOperand(0);
13584 if (N1.getOpcode() == ISD::MUL)
13585 N1 = N1.getOperand(0);
13586
13587 // Return true if the two operands are loads to the same object and the
13588 // offset of the first is known to be less than the offset of the second.
13589 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13590 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13591 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13592 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13593 Load1->isIndexed())
13594 return 0;
13595
13596 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13597 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13598
13599 if (!BaseLocDecomp0.getBase() ||
13600 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13601 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13602 return 0;
13603 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13604 return -1;
13605 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13606 return 1;
13607 return 0;
13608 };
13609
13610 SDValue X;
13611 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13612 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13613 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13614 N0.getOperand(1).getOperand(0));
13615 if (IsBefore < 0) {
13616 X = N0.getOperand(0);
13617 N0 = N0.getOperand(1);
13618 } else if (IsBefore > 0) {
13619 X = N0.getOperand(1);
13620 N0 = N0.getOperand(0);
13621 } else
13622 return SDValue();
13623 } else if (IsVecReduce(N0.getOperand(0))) {
13624 X = N0.getOperand(1);
13625 N0 = N0.getOperand(0);
13626 } else if (IsVecReduce(N0.getOperand(1))) {
13627 X = N0.getOperand(0);
13628 N0 = N0.getOperand(1);
13629 } else
13630 return SDValue();
13631 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13632 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13633 // Note this is backward to how you would expect. We create
13634 // add(reduce(load + 16), reduce(load + 0)) so that the
13635 // add(reduce(load + 16), X) is combined into VADDVA(X, load + 16), leaving
13636 // the X as VADDV(load + 0).
13637 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13638 } else
13639 return SDValue();
13640
13641 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13642 return SDValue();
13643
13644 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13645 return SDValue();
13646
13647 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13648 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13649 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13650 };
13651 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13652 return R;
13653 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13654 return R;
13655 return SDValue();
13656}
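// Illustrative sketch (placeholder operands): the lambdas above reassociate
//   add(X, add(vecreduce(Y), vecreduce(Z))) -> add(add(X, vecreduce(Y)), vecreduce(Z))
// and order reductions of adjacent loads by ascending offset, so that each
// add-of-a-reduction can later be selected as an accumulating VADDVA/VMLAVA
// and the loads stream through memory in a prefetch-friendly order.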
13657
13658 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13659 const ARMSubtarget *Subtarget) {
13660 if (!Subtarget->hasMVEIntegerOps())
13661 return SDValue();
13662
13664 return R;
13665
13666 EVT VT = N->getValueType(0);
13667 SDValue N0 = N->getOperand(0);
13668 SDValue N1 = N->getOperand(1);
13669 SDLoc dl(N);
13670
13671 if (VT != MVT::i64)
13672 return SDValue();
13673
13674 // We are looking for an i64 add of a VADDLVx. Due to these being i64s, this
13675 // will look like:
13676 // t1: i32,i32 = ARMISD::VADDLVs x
13677 // t2: i64 = build_pair t1, t1:1
13678 // t3: i64 = add t2, y
13679 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13680 // the add to be simplified separately.
13681 // We also need to check for sext / zext and commutative adds.
13682 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13683 SDValue NB) {
13684 if (NB->getOpcode() != ISD::BUILD_PAIR)
13685 return SDValue();
13686 SDValue VecRed = NB->getOperand(0);
13687 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13688 VecRed.getResNo() != 0 ||
13689 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13690 return SDValue();
13691
13692 if (VecRed->getOpcode() == OpcodeA) {
13693 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13694 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13695 VecRed.getOperand(0), VecRed.getOperand(1));
13696 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13697 }
13698
13700 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13701
13702 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13703 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13704 Ops.push_back(VecRed->getOperand(I));
13705 SDValue Red =
13706 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13707 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13708 SDValue(Red.getNode(), 1));
13709 };
13710
13711 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13712 return M;
13713 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13714 return M;
13715 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13716 return M;
13717 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13718 return M;
13719 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13720 return M;
13721 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13722 return M;
13723 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13724 return M;
13725 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13726 return M;
13727 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13728 return M;
13729 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13730 return M;
13731 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13732 return M;
13733 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13734 return M;
13735 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13736 return M;
13737 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13738 return M;
13739 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13740 return M;
13741 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13742 return M;
13743 return SDValue();
13744}
13745
13746bool
13747 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13748 CombineLevel Level) const {
13749 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13750 N->getOpcode() == ISD::SRL) &&
13751 "Expected shift op");
13752
13753 SDValue ShiftLHS = N->getOperand(0);
13754 if (!ShiftLHS->hasOneUse())
13755 return false;
13756
13757 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13758 !ShiftLHS.getOperand(0)->hasOneUse())
13759 return false;
13760
13761 if (Level == BeforeLegalizeTypes)
13762 return true;
13763
13764 if (N->getOpcode() != ISD::SHL)
13765 return true;
13766
13767 if (Subtarget->isThumb1Only()) {
13768 // Avoid making expensive immediates by commuting shifts. (This logic
13769 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13770 // for free.)
13771 if (N->getOpcode() != ISD::SHL)
13772 return true;
13773 SDValue N1 = N->getOperand(0);
13774 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13775 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13776 return true;
13777 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13778 if (Const->getAPIntValue().ult(256))
13779 return false;
13780 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13781 Const->getAPIntValue().sgt(-256))
13782 return false;
13783 }
13784 return true;
13785 }
13786
13787 // Turn off commute-with-shift transform after legalization, so it doesn't
13788 // conflict with PerformSHLSimplify. (We could try to detect when
13789 // PerformSHLSimplify would trigger more precisely, but it isn't
13790 // really necessary.)
13791 return false;
13792}
13793
13794 bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13795 const SDNode *N) const {
13796 assert(N->getOpcode() == ISD::XOR &&
13797 (N->getOperand(0).getOpcode() == ISD::SHL ||
13798 N->getOperand(0).getOpcode() == ISD::SRL) &&
13799 "Expected XOR(SHIFT) pattern");
13800
13801 // Only commute if the entire NOT mask is a hidden shifted mask.
13802 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13803 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13804 if (XorC && ShiftC) {
13805 unsigned MaskIdx, MaskLen;
13806 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13807 unsigned ShiftAmt = ShiftC->getZExtValue();
13808 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13809 if (N->getOperand(0).getOpcode() == ISD::SHL)
13810 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13811 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13812 }
13813 }
13814
13815 return false;
13816}
13817
13818 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13819 const SDNode *N, CombineLevel Level) const {
13820 assert(((N->getOpcode() == ISD::SHL &&
13821 N->getOperand(0).getOpcode() == ISD::SRL) ||
13822 (N->getOpcode() == ISD::SRL &&
13823 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13824 "Expected shift-shift mask");
13825
13826 if (!Subtarget->isThumb1Only())
13827 return true;
13828
13829 if (Level == BeforeLegalizeTypes)
13830 return true;
13831
13832 return false;
13833}
13834
13835 bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13836 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13837 SDValue Y) const {
13838 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13839 SelectOpcode == ISD::VSELECT;
13840}
13841
13843 if (!Subtarget->hasNEON()) {
13844 if (Subtarget->isThumb1Only())
13845 return VT.getScalarSizeInBits() <= 32;
13846 return true;
13847 }
13848 return VT.isScalarInteger();
13849}
13850
13851 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13852 EVT VT) const {
13853 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13854 return false;
13855
13856 switch (FPVT.getSimpleVT().SimpleTy) {
13857 case MVT::f16:
13858 return Subtarget->hasVFP2Base();
13859 case MVT::f32:
13860 return Subtarget->hasVFP2Base();
13861 case MVT::f64:
13862 return Subtarget->hasFP64();
13863 case MVT::v4f32:
13864 case MVT::v8f16:
13865 return Subtarget->hasMVEFloatOps();
13866 default:
13867 return false;
13868 }
13869}
13870
13871 static SDValue PerformSHLSimplify(SDNode *N,
13872 TargetLowering::DAGCombinerInfo &DCI,
13873 const ARMSubtarget *ST) {
13874 // Allow the generic combiner to identify potential bswaps.
13875 if (DCI.isBeforeLegalize())
13876 return SDValue();
13877
13878 // DAG combiner will fold:
13879 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13880 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13881 // Other code patterns that can also be modified have the following form:
13882 // b + ((a << 1) | 510)
13883 // b + ((a << 1) & 510)
13884 // b + ((a << 1) ^ 510)
13885 // b + ((a << 1) + 510)
13886
13887 // Many instructions can perform the shift for free, but it requires both
13888 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13889 // instruction will be needed. So, unfold back to the original pattern if:
13890 // - c1 and c2 are small enough that they don't require mov imms.
13891 // - the user(s) of the node can perform a shl.
13892
13893 // No shifted operands for 16-bit instructions.
13894 if (ST->isThumb() && ST->isThumb1Only())
13895 return SDValue();
13896
13897 // Check that all the users could perform the shl themselves.
13898 for (auto *U : N->users()) {
13899 switch(U->getOpcode()) {
13900 default:
13901 return SDValue();
13902 case ISD::SUB:
13903 case ISD::ADD:
13904 case ISD::AND:
13905 case ISD::OR:
13906 case ISD::XOR:
13907 case ISD::SETCC:
13908 case ARMISD::CMP:
13909 // Check that the user isn't already using a constant because there
13910 // aren't any instructions that support an immediate operand and a
13911 // shifted operand.
13912 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13913 isa<ConstantSDNode>(U->getOperand(1)))
13914 return SDValue();
13915
13916 // Check that it's not already using a shift.
13917 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13918 U->getOperand(1).getOpcode() == ISD::SHL)
13919 return SDValue();
13920 break;
13921 }
13922 }
13923
13924 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13925 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13926 return SDValue();
13927
13928 if (N->getOperand(0).getOpcode() != ISD::SHL)
13929 return SDValue();
13930
13931 SDValue SHL = N->getOperand(0);
13932
13933 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13934 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13935 if (!C1ShlC2 || !C2)
13936 return SDValue();
13937
13938 APInt C2Int = C2->getAPIntValue();
13939 APInt C1Int = C1ShlC2->getAPIntValue();
13940 unsigned C2Width = C2Int.getBitWidth();
13941 if (C2Int.uge(C2Width))
13942 return SDValue();
13943 uint64_t C2Value = C2Int.getZExtValue();
13944
13945 // Check that performing a lshr will not lose any information.
13946 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13947 if ((C1Int & Mask) != C1Int)
13948 return SDValue();
13949
13950 // Shift the first constant.
13951 C1Int.lshrInPlace(C2Int);
13952
13953 // The immediates are encoded as an 8-bit value that can be rotated.
13954 auto LargeImm = [](const APInt &Imm) {
13955 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13956 return Imm.getBitWidth() - Zeros > 8;
13957 };
13958
13959 if (LargeImm(C1Int) || LargeImm(C2Int))
13960 return SDValue();
13961
13962 SelectionDAG &DAG = DCI.DAG;
13963 SDLoc dl(N);
13964 SDValue X = SHL.getOperand(0);
13965 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13966 DAG.getConstant(C1Int, dl, MVT::i32));
13967 // Shift left to compensate for the lshr of C1Int.
13968 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13969
13970 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13971 SHL.dump(); N->dump());
13972 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13973 return Res;
13974}
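// Illustrative sketch (constants chosen for the example only): the generic DAG
// fold canonicalises (shl (or x, 255), 1) into (or (shl x, 1), 510), but 510
// is not a rotatable 8-bit ARM immediate. When every user can take a shifted
// operand, the code above undoes that fold, re-forming (shl (or x, 255), 1) so
// the shift folds into the user and no extra mov-immediate is needed.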
13975
13976
13977/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13978///
13979 static SDValue PerformADDCombine(SDNode *N,
13980 TargetLowering::DAGCombinerInfo &DCI,
13981 const ARMSubtarget *Subtarget) {
13982 SDValue N0 = N->getOperand(0);
13983 SDValue N1 = N->getOperand(1);
13984
13985 // Only works one way, because it needs an immediate operand.
13986 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13987 return Result;
13988
13989 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13990 return Result;
13991
13992 // First try with the default operand order.
13993 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13994 return Result;
13995
13996 // If that didn't work, try again with the operands commuted.
13997 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13998}
13999
14000// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14001// providing -X is as cheap as X (currently, just a constant).
14002 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14003 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14004 return SDValue();
14005 SDValue CSINC = N->getOperand(1);
14006 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14007 return SDValue();
14008
14009 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14010 if (!X)
14011 return SDValue();
14012
14013 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14014 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14015 CSINC.getOperand(0)),
14016 CSINC.getOperand(1), CSINC.getOperand(2),
14017 CSINC.getOperand(3));
14018}
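// Illustrative sketch (condition operand elided): with a constant X,
//   sub 0, (ARMISD::CSINC X, Y, cc)
// selects between -X and -(Y + 1) == ~Y, which is exactly ARMISD::CSINV with
// the negated constant, so the combine above emits CSINV (0 - X), Y, cc and
// the explicit negation disappears into the constant operand.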
14019
14020 static bool isNegatedInteger(SDValue Op) {
14021 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
14022}
14023
14024// Try to fold
14025//
14026// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
14027//
14028// The folding helps cmov to be matched with csneg without generating
14029 // a redundant neg instruction.
14030 static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
14031 if (!isNegatedInteger(SDValue(N, 0)))
14032 return SDValue();
14033
14034 SDValue CMov = N->getOperand(1);
14035 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14036 return SDValue();
14037
14038 SDValue N0 = CMov.getOperand(0);
14039 SDValue N1 = CMov.getOperand(1);
14040
14041 // If neither of them is a negation, the fold is not worthwhile, as it
14042 // introduces two additional negations while removing only one.
14043 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
14044 return SDValue();
14045
14046 SDLoc DL(N);
14047 EVT VT = CMov.getValueType();
14048
14049 SDValue N0N = DAG.getNegative(N0, DL, VT);
14050 SDValue N1N = DAG.getNegative(N1, DL, VT);
14051 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14052 CMov.getOperand(3));
14053}
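// Illustrative sketch (placeholder values): when at least one CMOV input is
// already a negation, e.g.
//   sub 0, (ARMISD::CMOV (sub 0, a), b, cc, flags)
// the combine above pushes the negation into both arms, giving
//   ARMISD::CMOV a, (sub 0, b), cc, flags
// (after the double negation folds), which can then match CSNEG-style
// patterns without a trailing neg instruction.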
14054
14055/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14056///
14057 static SDValue PerformSUBCombine(SDNode *N,
14058 TargetLowering::DAGCombinerInfo &DCI,
14059 const ARMSubtarget *Subtarget) {
14060 SDValue N0 = N->getOperand(0);
14061 SDValue N1 = N->getOperand(1);
14062
14063 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14064 if (N1.getNode()->hasOneUse())
14065 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14066 return Result;
14067
14068 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14069 return R;
14070
14071 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14072 return Val;
14073
14074 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14075 return SDValue();
14076
14077 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14078 // so that we can readily pattern match more MVE instructions which can use
14079 // a scalar operand.
14080 SDValue VDup = N->getOperand(1);
14081 if (VDup->getOpcode() != ARMISD::VDUP)
14082 return SDValue();
14083
14084 SDValue VMov = N->getOperand(0);
14085 if (VMov->getOpcode() == ISD::BITCAST)
14086 VMov = VMov->getOperand(0);
14087
14088 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14089 return SDValue();
14090
14091 SDLoc dl(N);
14092 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14093 DCI.DAG.getConstant(0, dl, MVT::i32),
14094 VDup->getOperand(0));
14095 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14096}
14097
14098/// PerformVMULCombine
14099/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14100/// special multiplier accumulator forwarding.
14101/// vmul d3, d0, d2
14102/// vmla d3, d1, d2
14103/// is faster than
14104/// vadd d3, d0, d1
14105/// vmul d3, d3, d2
14106// However, for (A + B) * (A + B),
14107// vadd d2, d0, d1
14108// vmul d3, d0, d2
14109// vmla d3, d1, d2
14110// is slower than
14111// vadd d2, d0, d1
14112// vmul d3, d2, d2
14113 static SDValue PerformVMULCombine(SDNode *N,
14114 TargetLowering::DAGCombinerInfo &DCI,
14115 const ARMSubtarget *Subtarget) {
14116 if (!Subtarget->hasVMLxForwarding())
14117 return SDValue();
14118
14119 SelectionDAG &DAG = DCI.DAG;
14120 SDValue N0 = N->getOperand(0);
14121 SDValue N1 = N->getOperand(1);
14122 unsigned Opcode = N0.getOpcode();
14123 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14124 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14125 Opcode = N1.getOpcode();
14126 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14127 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14128 return SDValue();
14129 std::swap(N0, N1);
14130 }
14131
14132 if (N0 == N1)
14133 return SDValue();
14134
14135 EVT VT = N->getValueType(0);
14136 SDLoc DL(N);
14137 SDValue N00 = N0->getOperand(0);
14138 SDValue N01 = N0->getOperand(1);
14139 return DAG.getNode(Opcode, DL, VT,
14140 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14141 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14142}
14143
14144 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14145 const ARMSubtarget *Subtarget) {
14146 EVT VT = N->getValueType(0);
14147 if (VT != MVT::v2i64)
14148 return SDValue();
14149
14150 SDValue N0 = N->getOperand(0);
14151 SDValue N1 = N->getOperand(1);
14152
14153 auto IsSignExt = [&](SDValue Op) {
14154 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14155 return SDValue();
14156 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14157 if (VT.getScalarSizeInBits() == 32)
14158 return Op->getOperand(0);
14159 return SDValue();
14160 };
14161 auto IsZeroExt = [&](SDValue Op) {
14162 // Zero extends are a little more awkward. At the point we are matching
14163 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14164 // That might be before or after a bitcast depending on how the and is
14165 // placed. Because this has to look through bitcasts, it is currently only
14166 // supported on LE.
14167 if (!Subtarget->isLittle())
14168 return SDValue();
14169
14170 SDValue And = Op;
14171 if (And->getOpcode() == ISD::BITCAST)
14172 And = And->getOperand(0);
14173 if (And->getOpcode() != ISD::AND)
14174 return SDValue();
14175 SDValue Mask = And->getOperand(1);
14176 if (Mask->getOpcode() == ISD::BITCAST)
14177 Mask = Mask->getOperand(0);
14178
14179 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14180 Mask.getValueType() != MVT::v4i32)
14181 return SDValue();
14182 if (isAllOnesConstant(Mask->getOperand(0)) &&
14183 isNullConstant(Mask->getOperand(1)) &&
14184 isAllOnesConstant(Mask->getOperand(2)) &&
14185 isNullConstant(Mask->getOperand(3)))
14186 return And->getOperand(0);
14187 return SDValue();
14188 };
14189
14190 SDLoc dl(N);
14191 if (SDValue Op0 = IsSignExt(N0)) {
14192 if (SDValue Op1 = IsSignExt(N1)) {
14193 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14194 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14195 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14196 }
14197 }
14198 if (SDValue Op0 = IsZeroExt(N0)) {
14199 if (SDValue Op1 = IsZeroExt(N1)) {
14200 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14201 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14202 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14203 }
14204 }
14205
14206 return SDValue();
14207}
14208
14209 static SDValue PerformMULCombine(SDNode *N,
14210 TargetLowering::DAGCombinerInfo &DCI,
14211 const ARMSubtarget *Subtarget) {
14212 SelectionDAG &DAG = DCI.DAG;
14213
14214 EVT VT = N->getValueType(0);
14215 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14216 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14217
14218 if (Subtarget->isThumb1Only())
14219 return SDValue();
14220
14221 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14222 return SDValue();
14223
14224 if (VT.is64BitVector() || VT.is128BitVector())
14225 return PerformVMULCombine(N, DCI, Subtarget);
14226 if (VT != MVT::i32)
14227 return SDValue();
14228
14229 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14230 if (!C)
14231 return SDValue();
14232
14233 int64_t MulAmt = C->getSExtValue();
14234 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14235
14236 ShiftAmt = ShiftAmt & (32 - 1);
14237 SDValue V = N->getOperand(0);
14238 SDLoc DL(N);
14239
14240 SDValue Res;
14241 MulAmt >>= ShiftAmt;
14242
14243 if (MulAmt >= 0) {
14244 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14245 // (mul x, 2^N + 1) => (add (shl x, N), x)
14246 Res = DAG.getNode(ISD::ADD, DL, VT,
14247 V,
14248 DAG.getNode(ISD::SHL, DL, VT,
14249 V,
14250 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14251 MVT::i32)));
14252 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14253 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14254 Res = DAG.getNode(ISD::SUB, DL, VT,
14255 DAG.getNode(ISD::SHL, DL, VT,
14256 V,
14257 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14258 MVT::i32)),
14259 V);
14260 } else
14261 return SDValue();
14262 } else {
14263 uint64_t MulAmtAbs = -MulAmt;
14264 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14265 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14266 Res = DAG.getNode(ISD::SUB, DL, VT,
14267 V,
14268 DAG.getNode(ISD::SHL, DL, VT,
14269 V,
14270 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14271 MVT::i32)));
14272 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14273 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14274 Res = DAG.getNode(ISD::ADD, DL, VT,
14275 V,
14276 DAG.getNode(ISD::SHL, DL, VT,
14277 V,
14278 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14279 MVT::i32)));
14280 Res = DAG.getNode(ISD::SUB, DL, VT,
14281 DAG.getConstant(0, DL, MVT::i32), Res);
14282 } else
14283 return SDValue();
14284 }
14285
14286 if (ShiftAmt != 0)
14287 Res = DAG.getNode(ISD::SHL, DL, VT,
14288 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14289
14290 // Do not add new nodes to DAG combiner worklist.
14291 DCI.CombineTo(N, Res, false);
14292 return SDValue();
14293}
14294
14295static SDValue CombineANDShift(SDNode *N,
14296 TargetLowering::DAGCombinerInfo &DCI,
14297 const ARMSubtarget *Subtarget) {
14298 // Allow DAGCombine to pattern-match before we touch the canonical form.
14299 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14300 return SDValue();
14301
14302 if (N->getValueType(0) != MVT::i32)
14303 return SDValue();
14304
14305 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14306 if (!N1C)
14307 return SDValue();
14308
14309 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14310 // Don't transform uxtb/uxth.
14311 if (C1 == 255 || C1 == 65535)
14312 return SDValue();
14313
14314 SDNode *N0 = N->getOperand(0).getNode();
14315 if (!N0->hasOneUse())
14316 return SDValue();
14317
14318 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14319 return SDValue();
14320
14321 bool LeftShift = N0->getOpcode() == ISD::SHL;
14322
14323 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14324 if (!N01C)
14325 return SDValue();
14326
14327 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14328 if (!C2 || C2 >= 32)
14329 return SDValue();
14330
14331 // Clear irrelevant bits in the mask.
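 // Bits of the mask that the shift already forces to zero cannot affect the
 // result, so drop them before classifying the mask below.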
14332 if (LeftShift)
14333 C1 &= (-1U << C2);
14334 else
14335 C1 &= (-1U >> C2);
14336
14337 SelectionDAG &DAG = DCI.DAG;
14338 SDLoc DL(N);
14339
14340 // We have a pattern of the form "(and (shl x, c2) c1)" or
14341 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14342 // transform to a pair of shifts, to save materializing c1.
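 // For example, (and (srl x, 1), 0x7fffff) can become (srl (shl x, 8), 9),
 // avoiding the need to materialize the 0x7fffff mask.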
14343
14344 // First pattern: right shift, then mask off leading bits.
14345 // FIXME: Use demanded bits?
14346 if (!LeftShift && isMask_32(C1)) {
14347 uint32_t C3 = llvm::countl_zero(C1);
14348 if (C2 < C3) {
14349 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14350 DAG.getConstant(C3 - C2, DL, MVT::i32));
14351 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14352 DAG.getConstant(C3, DL, MVT::i32));
14353 }
14354 }
14355
14356 // First pattern, reversed: left shift, then mask off trailing bits.
14357 if (LeftShift && isMask_32(~C1)) {
14358 uint32_t C3 = llvm::countr_zero(C1);
14359 if (C2 < C3) {
14360 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14361 DAG.getConstant(C3 - C2, DL, MVT::i32));
14362 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14363 DAG.getConstant(C3, DL, MVT::i32));
14364 }
14365 }
14366
14367 // Second pattern: left shift, then mask off leading bits.
14368 // FIXME: Use demanded bits?
14369 if (LeftShift && isShiftedMask_32(C1)) {
14370 uint32_t Trailing = llvm::countr_zero(C1);
14371 uint32_t C3 = llvm::countl_zero(C1);
14372 if (Trailing == C2 && C2 + C3 < 32) {
14373 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14374 DAG.getConstant(C2 + C3, DL, MVT::i32));
14375 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14376 DAG.getConstant(C3, DL, MVT::i32));
14377 }
14378 }
14379
14380 // Second pattern, reversed: right shift, then mask off trailing bits.
14381 // FIXME: Handle other patterns of known/demanded bits.
14382 if (!LeftShift && isShiftedMask_32(C1)) {
14383 uint32_t Leading = llvm::countl_zero(C1);
14384 uint32_t C3 = llvm::countr_zero(C1);
14385 if (Leading == C2 && C2 + C3 < 32) {
14386 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14387 DAG.getConstant(C2 + C3, DL, MVT::i32));
14388 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14389 DAG.getConstant(C3, DL, MVT::i32));
14390 }
14391 }
14392
14393 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14394 // if "c1 >> c2" is a cheaper immediate than "c1"
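 // For example, (and (shl x, 3), 0x528) becomes (shl (and x, 0xa5), 3);
 // 0xa5 fits a single Thumb1 move-immediate while 0x528 does not.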
14395 if (LeftShift &&
14396 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14397
14398 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14399 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14400 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14401 DAG.getConstant(C2, DL, MVT::i32));
14402 }
14403
14404 return SDValue();
14405}
14406
14407static SDValue PerformANDCombine(SDNode *N,
14408 TargetLowering::DAGCombinerInfo &DCI,
14409 const ARMSubtarget *Subtarget) {
14410 // Attempt to use immediate-form VBIC
14411 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14412 SDLoc dl(N);
14413 EVT VT = N->getValueType(0);
14414 SelectionDAG &DAG = DCI.DAG;
14415
14416 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14417 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14418 return SDValue();
14419
14420 APInt SplatBits, SplatUndef;
14421 unsigned SplatBitSize;
14422 bool HasAnyUndefs;
14423 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14424 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14425 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14426 SplatBitSize == 64) {
14427 EVT VbicVT;
14428 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14429 SplatUndef.getZExtValue(), SplatBitSize,
14430 DAG, dl, VbicVT, VT, OtherModImm);
14431 if (Val.getNode()) {
14432 SDValue Input =
14433 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14434 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14435 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14436 }
14437 }
14438 }
14439
14440 if (!Subtarget->isThumb1Only()) {
14441 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14442 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14443 return Result;
14444
14445 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14446 return Result;
14447 }
14448
14449 if (Subtarget->isThumb1Only())
14450 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14451 return Result;
14452
14453 return SDValue();
14454}
14455
14456// Try combining OR nodes to SMULWB, SMULWT.
14457static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14458 TargetLowering::DAGCombinerInfo &DCI,
14459 const ARMSubtarget *Subtarget) {
14460 if (!Subtarget->hasV6Ops() ||
14461 (Subtarget->isThumb() &&
14462 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14463 return SDValue();
14464
14465 SDValue SRL = OR->getOperand(0);
14466 SDValue SHL = OR->getOperand(1);
14467
14468 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14469 SRL = OR->getOperand(1);
14470 SHL = OR->getOperand(0);
14471 }
14472 if (!isSRL16(SRL) || !isSHL16(SHL))
14473 return SDValue();
14474
14475 // The first operands to the shifts need to be the two results from the
14476 // same smul_lohi node.
14477 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14478 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14479 return SDValue();
14480
14481 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14482 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14483 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14484 return SDValue();
14485
14486 // Now we have:
14487 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14488 // For SMULW[B|T], smul_lohi will take a 32-bit and a 16-bit argument.
14489 // For SMULWB the 16-bit value will be sign extended somehow.
14490 // For SMULWT only the SRA is required.
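 // SMULWB/SMULWT multiply the 32-bit operand by the bottom/top halfword of
 // the other operand and keep bits [47:16] of the 48-bit product, which is
 // exactly what the srl/shl-by-16 pair above reconstructs.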
14491 // Check both sides of SMUL_LOHI
14492 SDValue OpS16 = SMULLOHI->getOperand(0);
14493 SDValue OpS32 = SMULLOHI->getOperand(1);
14494
14495 SelectionDAG &DAG = DCI.DAG;
14496 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14497 OpS16 = OpS32;
14498 OpS32 = SMULLOHI->getOperand(0);
14499 }
14500
14501 SDLoc dl(OR);
14502 unsigned Opcode = 0;
14503 if (isS16(OpS16, DAG))
14504 Opcode = ARMISD::SMULWB;
14505 else if (isSRA16(OpS16)) {
14506 Opcode = ARMISD::SMULWT;
14507 OpS16 = OpS16->getOperand(0);
14508 }
14509 else
14510 return SDValue();
14511
14512 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14513 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14514 return SDValue(OR, 0);
14515}
14516
14517static SDValue PerformORCombineToBFI(SDNode *N,
14518 TargetLowering::DAGCombinerInfo &DCI,
14519 const ARMSubtarget *Subtarget) {
14520 // BFI is only available on V6T2+
14521 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14522 return SDValue();
14523
14524 EVT VT = N->getValueType(0);
14525 SDValue N0 = N->getOperand(0);
14526 SDValue N1 = N->getOperand(1);
14527 SelectionDAG &DAG = DCI.DAG;
14528 SDLoc DL(N);
14529 // 1) or (and A, mask), val => ARMbfi A, val, mask
14530 // iff (val & mask) == val
14531 //
14532 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14533 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14534 // && mask == ~mask2
14535 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14536 // && ~mask == mask2
14537 // (i.e., copy a bitfield value into another bitfield of the same width)
14538
14539 if (VT != MVT::i32)
14540 return SDValue();
14541
14542 SDValue N00 = N0.getOperand(0);
14543
14544 // The value and the mask need to be constants so we can verify this is
14545 // actually a bitfield set. If the mask is 0xffff, we can do better
14546 // via a movt instruction, so don't use BFI in that case.
14547 SDValue MaskOp = N0.getOperand(1);
14548 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14549 if (!MaskC)
14550 return SDValue();
14551 unsigned Mask = MaskC->getZExtValue();
14552 if (Mask == 0xffff)
14553 return SDValue();
14554 SDValue Res;
14555 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14556 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14557 if (N1C) {
14558 unsigned Val = N1C->getZExtValue();
14559 if ((Val & ~Mask) != Val)
14560 return SDValue();
14561
14562 if (ARM::isBitFieldInvertedMask(Mask)) {
14563 Val >>= llvm::countr_zero(~Mask);
14564
14565 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14566 DAG.getConstant(Val, DL, MVT::i32),
14567 DAG.getConstant(Mask, DL, MVT::i32));
14568
14569 DCI.CombineTo(N, Res, false);
14570 // Return value from the original node to inform the combiner that N is
14571 // now dead.
14572 return SDValue(N, 0);
14573 }
14574 } else if (N1.getOpcode() == ISD::AND) {
14575 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14576 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14577 if (!N11C)
14578 return SDValue();
14579 unsigned Mask2 = N11C->getZExtValue();
14580
14581 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14582 // as is to match.
14583 if (ARM::isBitFieldInvertedMask(Mask) &&
14584 (Mask == ~Mask2)) {
14585 // The pack halfword instruction works better for masks that fit it,
14586 // so use that when it's available.
14587 if (Subtarget->hasDSP() &&
14588 (Mask == 0xffff || Mask == 0xffff0000))
14589 return SDValue();
14590 // 2a
14591 unsigned amt = llvm::countr_zero(Mask2);
14592 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14593 DAG.getConstant(amt, DL, MVT::i32));
14594 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14595 DAG.getConstant(Mask, DL, MVT::i32));
14596 DCI.CombineTo(N, Res, false);
14597 // Return value from the original node to inform the combiner that N is
14598 // now dead.
14599 return SDValue(N, 0);
14600 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14601 (~Mask == Mask2)) {
14602 // The pack halfword instruction works better for masks that fit it,
14603 // so use that when it's available.
14604 if (Subtarget->hasDSP() &&
14605 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14606 return SDValue();
14607 // 2b
14608 unsigned lsb = llvm::countr_zero(Mask);
14609 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14610 DAG.getConstant(lsb, DL, MVT::i32));
14611 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14612 DAG.getConstant(Mask2, DL, MVT::i32));
14613 DCI.CombineTo(N, Res, false);
14614 // Return value from the original node to inform the combiner that N is
14615 // now dead.
14616 return SDValue(N, 0);
14617 }
14618 }
14619
14620 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14621 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14622 ARM::isBitFieldInvertedMask(~Mask)) {
14623 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14624 // where lsb(mask) == #shamt and masked bits of B are known zero.
14625 SDValue ShAmt = N00.getOperand(1);
14626 unsigned ShAmtC = ShAmt->getAsZExtVal();
14627 unsigned LSB = llvm::countr_zero(Mask);
14628 if (ShAmtC != LSB)
14629 return SDValue();
14630
14631 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14632 DAG.getConstant(~Mask, DL, MVT::i32));
14633
14634 DCI.CombineTo(N, Res, false);
14635 // Return value from the original node to inform the combiner that N is
14636 // now dead.
14637 return SDValue(N, 0);
14638 }
14639
14640 return SDValue();
14641}
14642
14643static bool isValidMVECond(unsigned CC, bool IsFloat) {
14644 switch (CC) {
14645 case ARMCC::EQ:
14646 case ARMCC::NE:
14647 case ARMCC::LE:
14648 case ARMCC::GT:
14649 case ARMCC::GE:
14650 case ARMCC::LT:
14651 return true;
14652 case ARMCC::HS:
14653 case ARMCC::HI:
14654 return !IsFloat;
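 // HS/HI are unsigned comparisons, so they are only valid for integer compares.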
14655 default:
14656 return false;
14657 };
14658}
14659
14660static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14661 if (N->getOpcode() == ARMISD::VCMP)
14662 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14663 else if (N->getOpcode() == ARMISD::VCMPZ)
14664 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14665 else
14666 llvm_unreachable("Not a VCMP/VCMPZ!");
14667}
14668
14669static bool CanInvertMVEVCMP(SDValue N) {
14670 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14671 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14672}
14673
14674static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14675 const ARMSubtarget *Subtarget) {
14676 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14677 // together with predicates
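 // By De Morgan's law, a | b == ~(~a & ~b); the extra inversions are expected
 // to fold into the freely invertible VCMP/VCMPZ operands.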
14678 EVT VT = N->getValueType(0);
14679 SDLoc DL(N);
14680 SDValue N0 = N->getOperand(0);
14681 SDValue N1 = N->getOperand(1);
14682
14683 auto IsFreelyInvertable = [&](SDValue V) {
14684 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14685 return CanInvertMVEVCMP(V);
14686 return false;
14687 };
14688
14689 // At least one operand must be freely invertible.
14690 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14691 return SDValue();
14692
14693 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14694 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14695 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14696 return DAG.getLogicalNOT(DL, And, VT);
14697}
14698
14699/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14700static SDValue PerformORCombine(SDNode *N,
14701 TargetLowering::DAGCombinerInfo &DCI,
14702 const ARMSubtarget *Subtarget) {
14703 // Attempt to use immediate-form VORR
14704 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14705 SDLoc dl(N);
14706 EVT VT = N->getValueType(0);
14707 SelectionDAG &DAG = DCI.DAG;
14708
14709 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14710 return SDValue();
14711
14712 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14713 VT == MVT::v8i1 || VT == MVT::v16i1))
14714 return PerformORCombine_i1(N, DAG, Subtarget);
14715
14716 APInt SplatBits, SplatUndef;
14717 unsigned SplatBitSize;
14718 bool HasAnyUndefs;
14719 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14720 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14721 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14722 SplatBitSize == 64) {
14723 EVT VorrVT;
14724 SDValue Val =
14725 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14726 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14727 if (Val.getNode()) {
14728 SDValue Input =
14729 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14730 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14731 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14732 }
14733 }
14734 }
14735
14736 if (!Subtarget->isThumb1Only()) {
14737 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14738 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14739 return Result;
14740 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14741 return Result;
14742 }
14743
14744 SDValue N0 = N->getOperand(0);
14745 SDValue N1 = N->getOperand(1);
14746
14747 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14748 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14749 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
14750
14751 // The code below optimizes (or (and X, Y), Z).
14752 // The AND operand needs to have a single user to make these optimizations
14753 // profitable.
14754 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14755 return SDValue();
14756
14757 APInt SplatUndef;
14758 unsigned SplatBitSize;
14759 bool HasAnyUndefs;
14760
14761 APInt SplatBits0, SplatBits1;
14762 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14763 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14764 // Ensure that the second operands of both ands are constants
14765 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14766 HasAnyUndefs) && !HasAnyUndefs) {
14767 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14768 HasAnyUndefs) && !HasAnyUndefs) {
14769 // Ensure that the bit width of the constants are the same and that
14770 // the splat arguments are logical inverses as per the pattern we
14771 // are trying to simplify.
14772 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14773 SplatBits0 == ~SplatBits1) {
14774 // Canonicalize the vector type to make instruction selection
14775 // simpler.
14776 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14777 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14778 N0->getOperand(1),
14779 N0->getOperand(0),
14780 N1->getOperand(0));
14781 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14782 }
14783 }
14784 }
14785 }
14786
14787 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14788 // reasonable.
14789 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14790 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14791 return Res;
14792 }
14793
14794 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14795 return Result;
14796
14797 return SDValue();
14798}
14799
14800static SDValue PerformXORCombine(SDNode *N,
14801 TargetLowering::DAGCombinerInfo &DCI,
14802 const ARMSubtarget *Subtarget) {
14803 EVT VT = N->getValueType(0);
14804 SelectionDAG &DAG = DCI.DAG;
14805
14806 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14807 return SDValue();
14808
14809 if (!Subtarget->isThumb1Only()) {
14810 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14811 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14812 return Result;
14813
14814 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14815 return Result;
14816 }
14817
14818 if (Subtarget->hasMVEIntegerOps()) {
14819 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14820 SDValue N0 = N->getOperand(0);
14821 SDValue N1 = N->getOperand(1);
14822 const TargetLowering *TLI = Subtarget->getTargetLowering();
14823 if (TLI->isConstTrueVal(N1) &&
14824 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14825 if (CanInvertMVEVCMP(N0)) {
14826 SDLoc DL(N0);
14827 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14828
14829 SmallVector<SDValue, 4> Ops;
14830 Ops.push_back(N0->getOperand(0));
14831 if (N0->getOpcode() == ARMISD::VCMP)
14832 Ops.push_back(N0->getOperand(1));
14833 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14834 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14835 }
14836 }
14837 }
14838
14839 return SDValue();
14840}
14841
14842// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14843// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14844// their position in "to" (Rd).
14845static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14846 assert(N->getOpcode() == ARMISD::BFI);
14847
14848 SDValue From = N->getOperand(1);
14849 ToMask = ~N->getConstantOperandAPInt(2);
14850 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14851
14852 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14853 // #C in the base of the SHR.
14854 if (From->getOpcode() == ISD::SRL &&
14855 isa<ConstantSDNode>(From->getOperand(1))) {
14856 APInt Shift = From->getConstantOperandAPInt(1);
14857 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14858 FromMask <<= Shift.getLimitedValue(31);
14859 From = From->getOperand(0);
14860 }
14861
14862 return From;
14863}
14864
14865// If A and B contain one contiguous set of bits, does A | B == A . B?
14866//
14867 // Neither A nor B may be zero.
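 // For example, A = 0b1100 and B = 0b0011 concatenate properly: B's bits sit
 // directly below A's.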
14868static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14869 unsigned LastActiveBitInA = A.countr_zero();
14870 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14871 return LastActiveBitInA - 1 == FirstActiveBitInB;
14872}
14873
14874static SDValue FindBFIToCombineWith(SDNode *N) {
14875 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14876 APInt ToMask, FromMask;
14877 SDValue From = ParseBFI(N, ToMask, FromMask);
14878 SDValue To = N->getOperand(0);
14879
14880 SDValue V = To;
14881 if (V.getOpcode() != ARMISD::BFI)
14882 return SDValue();
14883
14884 APInt NewToMask, NewFromMask;
14885 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14886 if (NewFrom != From)
14887 return SDValue();
14888
14889 // Do the written bits conflict with any we've seen so far?
14890 if ((NewToMask & ToMask).getBoolValue())
14891 // Conflicting bits.
14892 return SDValue();
14893
14894 // Are the new bits contiguous when combined with the old bits?
14895 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14896 BitsProperlyConcatenate(FromMask, NewFromMask))
14897 return V;
14898 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14899 BitsProperlyConcatenate(NewFromMask, FromMask))
14900 return V;
14901
14902 return SDValue();
14903}
14904
14905static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14906 SDValue N0 = N->getOperand(0);
14907 SDValue N1 = N->getOperand(1);
14908
14909 if (N1.getOpcode() == ISD::AND) {
14910 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14911 // the bits being cleared by the AND are not demanded by the BFI.
14912 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14913 if (!N11C)
14914 return SDValue();
14915 unsigned InvMask = N->getConstantOperandVal(2);
14916 unsigned LSB = llvm::countr_zero(~InvMask);
14917 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14918 assert(Width <
14919 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14920 "undefined behavior");
14921 unsigned Mask = (1u << Width) - 1;
14922 unsigned Mask2 = N11C->getZExtValue();
14923 if ((Mask & (~Mask2)) == 0)
14924 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14925 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14926 return SDValue();
14927 }
14928
14929 // Look for another BFI to combine with.
14930 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14931 // We've found a BFI.
14932 APInt ToMask1, FromMask1;
14933 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14934
14935 APInt ToMask2, FromMask2;
14936 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14937 assert(From1 == From2);
14938 (void)From2;
14939
14940 // Create a new BFI, combining the two together.
14941 APInt NewFromMask = FromMask1 | FromMask2;
14942 APInt NewToMask = ToMask1 | ToMask2;
14943
14944 EVT VT = N->getValueType(0);
14945 SDLoc dl(N);
14946
14947 if (NewFromMask[0] == 0)
14948 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14949 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14950 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14951 DAG.getConstant(~NewToMask, dl, VT));
14952 }
14953
14954 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14955 // that lower bit insertions are performed first, providing that M1 and M2
14956 // do not overlap. This can allow multiple BFI instructions to be combined
14957 // together by the other folds above.
14958 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14959 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14960 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14961
14962 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14963 ToMask1.countl_zero() < ToMask2.countl_zero())
14964 return SDValue();
14965
14966 EVT VT = N->getValueType(0);
14967 SDLoc dl(N);
14968 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14969 N->getOperand(1), N->getOperand(2));
14970 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14971 N0.getOperand(2));
14972 }
14973
14974 return SDValue();
14975}
14976
14977// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14978// or CMPZ(CMOV(1, 0, CC, X))
14979// return X if valid.
14980static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14981 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14982 return SDValue();
14983 SDValue CSInc = Cmp->getOperand(0);
14984
14985 // Ignore any `And 1` nodes that may not yet have been removed. We are
14986 // looking for a value that produces 1/0, so these have no effect on the
14987 // code.
14988 while (CSInc.getOpcode() == ISD::AND &&
14989 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14990 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14991 CSInc = CSInc.getOperand(0);
14992
14993 if (CSInc.getOpcode() == ARMISD::CSINC &&
14994 isNullConstant(CSInc.getOperand(0)) &&
14995 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14996 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14997 return CSInc.getOperand(3);
14998 }
14999 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15000 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15001 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15002 return CSInc.getOperand(3);
15003 }
15004 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15005 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15006 CC = ARMCC::getOppositeCondition(
15007 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
15008 return CSInc.getOperand(3);
15009 }
15010 return SDValue();
15011}
15012
15013static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
15014 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15015 // t92: flags = ARMISD::CMPZ t74, 0
15016 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15017 // t96: flags = ARMISD::CMPZ t93, 0
15018 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15019 ARMCC::CondCodes Cond;
15020 if (SDValue C = IsCMPZCSINC(N, Cond))
15021 if (Cond == ARMCC::EQ)
15022 return C;
15023 return SDValue();
15024}
15025
15027 // Fold away an unnecessary CMPZ/CSINC
15028 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15029 // if C1==EQ -> CSXYZ A, B, C2, D
15030 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15031 ARMCC::CondCodes Cond;
15032 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15033 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15034 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15035 N->getOperand(1),
15036 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15037 if (N->getConstantOperandVal(2) == ARMCC::NE)
15038 return DAG.getNode(
15039 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15040 N->getOperand(1),
15041 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
15042 }
15043 return SDValue();
15044}
15045
15046/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15047/// ARMISD::VMOVRRD.
15048static SDValue PerformVMOVRRDCombine(SDNode *N,
15049 TargetLowering::DAGCombinerInfo &DCI,
15050 const ARMSubtarget *Subtarget) {
15051 // vmovrrd(vmovdrr x, y) -> x,y
15052 SDValue InDouble = N->getOperand(0);
15053 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15054 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15055
15056 // vmovrrd(load f64) -> (load i32), (load i32)
15057 SDNode *InNode = InDouble.getNode();
15058 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15059 InNode->getValueType(0) == MVT::f64 &&
15060 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15061 !cast<LoadSDNode>(InNode)->isVolatile()) {
15062 // TODO: Should this be done for non-FrameIndex operands?
15063 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15064
15065 SelectionDAG &DAG = DCI.DAG;
15066 SDLoc DL(LD);
15067 SDValue BasePtr = LD->getBasePtr();
15068 SDValue NewLD1 =
15069 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15070 LD->getAlign(), LD->getMemOperand()->getFlags());
15071
15072 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15073 DAG.getConstant(4, DL, MVT::i32));
15074
15075 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15076 LD->getPointerInfo().getWithOffset(4),
15077 commonAlignment(LD->getAlign(), 4),
15078 LD->getMemOperand()->getFlags());
15079
15080 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15081 if (DCI.DAG.getDataLayout().isBigEndian())
15082 std::swap (NewLD1, NewLD2);
15083 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15084 return Result;
15085 }
15086
15087 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15088 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15089 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15090 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15091 SDValue BV = InDouble.getOperand(0);
15092 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15093 // change lane order under big endian.
15094 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15095 while (
15096 (BV.getOpcode() == ISD::BITCAST ||
15097 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15098 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15099 BVSwap = BV.getOpcode() == ISD::BITCAST;
15100 BV = BV.getOperand(0);
15101 }
15102 if (BV.getValueType() != MVT::v4i32)
15103 return SDValue();
15104
15105 // Handle buildvectors, pulling out the correct lane depending on
15106 // endianness.
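 // Each 64-bit lane of the extract corresponds to two v4i32 build_vector
 // operands, so lane 1 maps to operands 2 and 3.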
15107 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15108 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15109 SDValue Op0 = BV.getOperand(Offset);
15110 SDValue Op1 = BV.getOperand(Offset + 1);
15111 if (!Subtarget->isLittle() && BVSwap)
15112 std::swap(Op0, Op1);
15113
15114 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15115 }
15116
15117 // A chain of insert_vectors, grabbing the correct value of the chain of
15118 // inserts.
15119 SDValue Op0, Op1;
15120 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15121 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15122 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15123 Op0 = BV.getOperand(1);
15124 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15125 Op1 = BV.getOperand(1);
15126 }
15127 BV = BV.getOperand(0);
15128 }
15129 if (!Subtarget->isLittle() && BVSwap)
15130 std::swap(Op0, Op1);
15131 if (Op0 && Op1)
15132 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15133 }
15134
15135 return SDValue();
15136}
15137
15138/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15139/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15140static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
15141 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15142 SDValue Op0 = N->getOperand(0);
15143 SDValue Op1 = N->getOperand(1);
15144 if (Op0.getOpcode() == ISD::BITCAST)
15145 Op0 = Op0.getOperand(0);
15146 if (Op1.getOpcode() == ISD::BITCAST)
15147 Op1 = Op1.getOperand(0);
15148 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15149 Op0.getNode() == Op1.getNode() &&
15150 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15151 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15152 N->getValueType(0), Op0.getOperand(0));
15153 return SDValue();
15154}
15155
15156static SDValue PerformVMOVhrCombine(SDNode *N,
15157 TargetLowering::DAGCombinerInfo &DCI) {
15158 SDValue Op0 = N->getOperand(0);
15159
15160 // VMOVhr (VMOVrh (X)) -> X
15161 if (Op0->getOpcode() == ARMISD::VMOVrh)
15162 return Op0->getOperand(0);
15163
15164 // FullFP16: half values are passed in S-registers, and we don't
15165 // need any of the bitcast and moves:
15166 //
15167 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15168 // t5: i32 = bitcast t2
15169 // t18: f16 = ARMISD::VMOVhr t5
15170 // =>
15171 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15172 if (Op0->getOpcode() == ISD::BITCAST) {
15173 SDValue Copy = Op0->getOperand(0);
15174 if (Copy.getValueType() == MVT::f32 &&
15175 Copy->getOpcode() == ISD::CopyFromReg) {
15176 bool HasGlue = Copy->getNumOperands() == 3;
15177 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15178 HasGlue ? Copy->getOperand(2) : SDValue()};
15179 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15180 SDValue NewCopy =
15181 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(Copy),
15182 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15183 ArrayRef(Ops, HasGlue ? 3 : 2));
15184
15185 // Update Users, Chains, and Potential Glue.
15186 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15187 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15188 if (HasGlue)
15189 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15190 NewCopy.getValue(2));
15191
15192 return NewCopy;
15193 }
15194 }
15195
15196 // fold (VMOVhr (load x)) -> (load (f16*)x)
15197 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15198 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15199 LN0->getMemoryVT() == MVT::i16) {
15200 SDValue Load =
15201 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15202 LN0->getBasePtr(), LN0->getMemOperand());
15203 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15204 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15205 return Load;
15206 }
15207 }
15208
15209 // Only the bottom 16 bits of the source register are used.
15210 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15211 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15212 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15213 return SDValue(N, 0);
15214
15215 return SDValue();
15216}
15217
15218static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15219 SDValue N0 = N->getOperand(0);
15220 EVT VT = N->getValueType(0);
15221
15222 // fold (VMOVrh (fpconst x)) -> const x
15223 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15224 APFloat V = C->getValueAPF();
15225 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15226 }
15227
15228 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15229 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15230 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15231
15232 SDValue Load =
15233 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15234 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15235 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15236 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15237 return Load;
15238 }
15239
15240 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15241 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15242 isa<ConstantSDNode>(N0->getOperand(1)))
15243 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15244 N0->getOperand(1));
15245
15246 return SDValue();
15247}
15248
15249/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15250/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15251/// i64 vector to have f64 elements, since the value can then be loaded
15252/// directly into a VFP register.
15254 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15255 for (unsigned i = 0; i < NumElts; ++i) {
15256 SDNode *Elt = N->getOperand(i).getNode();
15257 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15258 return true;
15259 }
15260 return false;
15261}
15262
15263/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15264/// ISD::BUILD_VECTOR.
15265static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15266 TargetLowering::DAGCombinerInfo &DCI,
15267 const ARMSubtarget *Subtarget) {
15268 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15269 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15270 // into a pair of GPRs, which is fine when the value is used as a scalar,
15271 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15272 SelectionDAG &DAG = DCI.DAG;
15273 if (N->getNumOperands() == 2)
15274 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15275 return RV;
15276
15277 // Load i64 elements as f64 values so that type legalization does not split
15278 // them up into i32 values.
15279 EVT VT = N->getValueType(0);
15280 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15281 return SDValue();
15282 SDLoc dl(N);
15283 SmallVector<SDValue, 8> Ops;
15284 unsigned NumElts = VT.getVectorNumElements();
15285 for (unsigned i = 0; i < NumElts; ++i) {
15286 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15287 Ops.push_back(V);
15288 // Make the DAGCombiner fold the bitcast.
15289 DCI.AddToWorklist(V.getNode());
15290 }
15291 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15292 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15293 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15294}
15295
15296/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15297static SDValue
15298PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15299 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15300 // At that time, we may have inserted bitcasts from integer to float.
15301 // If these bitcasts have survived DAGCombine, change the lowering of this
15302 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15303 // force to use floating point types.
15304
15305 // Make sure we can change the type of the vector.
15306 // This is possible iff:
15307 // 1. The vector is only used in a bitcast to a integer type. I.e.,
15308 // 1.1. Vector is used only once.
15309 // 1.2. Use is a bit convert to an integer type.
15310 // 2. The size of its operands are 32-bits (64-bits are not legal).
15311 EVT VT = N->getValueType(0);
15312 EVT EltVT = VT.getVectorElementType();
15313
15314 // Check 1.1. and 2.
15315 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15316 return SDValue();
15317
15318 // By construction, the input type must be float.
15319 assert(EltVT == MVT::f32 && "Unexpected type!");
15320
15321 // Check 1.2.
15322 SDNode *Use = *N->user_begin();
15323 if (Use->getOpcode() != ISD::BITCAST ||
15324 Use->getValueType(0).isFloatingPoint())
15325 return SDValue();
15326
15327 // Check profitability.
15328 // Model is, if more than half of the relevant operands are bitcast from
15329 // i32, turn the build_vector into a sequence of insert_vector_elt.
15330 // Relevant operands are everything that is not statically
15331 // (i.e., at compile time) bitcasted.
15332 unsigned NumOfBitCastedElts = 0;
15333 unsigned NumElts = VT.getVectorNumElements();
15334 unsigned NumOfRelevantElts = NumElts;
15335 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15336 SDValue Elt = N->getOperand(Idx);
15337 if (Elt->getOpcode() == ISD::BITCAST) {
15338 // Assume only bit cast to i32 will go away.
15339 if (Elt->getOperand(0).getValueType() == MVT::i32)
15340 ++NumOfBitCastedElts;
15341 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15342 // Constants are statically casted, thus do not count them as
15343 // relevant operands.
15344 --NumOfRelevantElts;
15345 }
15346
15347 // Check if more than half of the elements require a non-free bitcast.
15348 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15349 return SDValue();
15350
15351 SelectionDAG &DAG = DCI.DAG;
15352 // Create the new vector type.
15353 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15354 // Check if the type is legal.
15355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15356 if (!TLI.isTypeLegal(VecVT))
15357 return SDValue();
15358
15359 // Combine:
15360 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15361 // => BITCAST INSERT_VECTOR_ELT
15362 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15363 // (BITCAST EN), N.
15364 SDValue Vec = DAG.getUNDEF(VecVT);
15365 SDLoc dl(N);
15366 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15367 SDValue V = N->getOperand(Idx);
15368 if (V.isUndef())
15369 continue;
15370 if (V.getOpcode() == ISD::BITCAST &&
15371 V->getOperand(0).getValueType() == MVT::i32)
15372 // Fold obvious case.
15373 V = V.getOperand(0);
15374 else {
15375 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15376 // Make the DAGCombiner fold the bitcasts.
15377 DCI.AddToWorklist(V.getNode());
15378 }
15379 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15380 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15381 }
15382 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15383 // Make the DAGCombiner fold the bitcasts.
15384 DCI.AddToWorklist(Vec.getNode());
15385 return Vec;
15386}
15387
15388static SDValue
15389PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15390 EVT VT = N->getValueType(0);
15391 SDValue Op = N->getOperand(0);
15392 SDLoc dl(N);
15393
15394 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15395 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15396 // If the valuetypes are the same, we can remove the cast entirely.
15397 if (Op->getOperand(0).getValueType() == VT)
15398 return Op->getOperand(0);
15399 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15400 }
15401
15402 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15403 // more VPNOT which might get folded as else predicates.
15404 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15405 SDValue X =
15406 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15407 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15408 DCI.DAG.getConstant(65535, dl, MVT::i32));
15409 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15410 }
15411
15412 // Only the bottom 16 bits of the source register are used.
15413 if (Op.getValueType() == MVT::i32) {
15414 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15415 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15416 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15417 return SDValue(N, 0);
15418 }
15419 return SDValue();
15420}
15421
15422static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15423 const ARMSubtarget *ST) {
15424 EVT VT = N->getValueType(0);
15425 SDValue Op = N->getOperand(0);
15426 SDLoc dl(N);
15427
15428 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15429 if (ST->isLittle())
15430 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15431
15432 // VT VECTOR_REG_CAST (VT Op) -> Op
15433 if (Op.getValueType() == VT)
15434 return Op;
15435 // VECTOR_REG_CAST undef -> undef
15436 if (Op.isUndef())
15437 return DAG.getUNDEF(VT);
15438
15439 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15440 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15441 // If the valuetypes are the same, we can remove the cast entirely.
15442 if (Op->getOperand(0).getValueType() == VT)
15443 return Op->getOperand(0);
15444 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15445 }
15446
15447 return SDValue();
15448}
15449
15450static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15451 const ARMSubtarget *Subtarget) {
15452 if (!Subtarget->hasMVEIntegerOps())
15453 return SDValue();
15454
15455 EVT VT = N->getValueType(0);
15456 SDValue Op0 = N->getOperand(0);
15457 SDValue Op1 = N->getOperand(1);
15458 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15459 SDLoc dl(N);
15460
15461 // vcmp X, 0, cc -> vcmpz X, cc
15462 if (isZeroVector(Op1))
15463 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15464
15465 unsigned SwappedCond = getSwappedCondition(Cond);
15466 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15467 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15468 if (isZeroVector(Op0))
15469 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15470 DAG.getConstant(SwappedCond, dl, MVT::i32));
15471 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15472 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15473 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15474 DAG.getConstant(SwappedCond, dl, MVT::i32));
15475 }
15476
15477 return SDValue();
15478}
15479
15480/// PerformInsertEltCombine - Target-specific dag combine xforms for
15481/// ISD::INSERT_VECTOR_ELT.
15482static SDValue PerformInsertEltCombine(SDNode *N,
15483 TargetLowering::DAGCombinerInfo &DCI) {
15484 // Bitcast an i64 load inserted into a vector to f64.
15485 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15486 EVT VT = N->getValueType(0);
15487 SDNode *Elt = N->getOperand(1).getNode();
15488 if (VT.getVectorElementType() != MVT::i64 ||
15489 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15490 return SDValue();
15491
15492 SelectionDAG &DAG = DCI.DAG;
15493 SDLoc dl(N);
15494 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15495 VT.getVectorNumElements());
15496 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15497 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15498 // Make the DAGCombiner fold the bitcasts.
15499 DCI.AddToWorklist(Vec.getNode());
15500 DCI.AddToWorklist(V.getNode());
15501 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15502 Vec, V, N->getOperand(2));
15503 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15504}
15505
15506// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15507// directly or bitcast to an integer if the original is a float vector.
15508// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15509// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15510static SDValue
15511PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15512 EVT VT = N->getValueType(0);
15513 SDLoc dl(N);
15514
15515 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15516 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15517 return SDValue();
15518
15519 SDValue Ext = SDValue(N, 0);
15520 if (Ext.getOpcode() == ISD::BITCAST &&
15521 Ext.getOperand(0).getValueType() == MVT::f32)
15522 Ext = Ext.getOperand(0);
15523 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15524 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15525 Ext.getConstantOperandVal(1) % 2 != 0)
15526 return SDValue();
15527 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15528 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15529 return SDValue();
15530
15531 SDValue Op0 = Ext.getOperand(0);
15532 EVT VecVT = Op0.getValueType();
15533 unsigned ResNo = Op0.getResNo();
15534 unsigned Lane = Ext.getConstantOperandVal(1);
15535 if (VecVT.getVectorNumElements() != 4)
15536 return SDValue();
15537
15538 // Find another extract, of Lane + 1
15539 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15540 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15541 isa<ConstantSDNode>(V->getOperand(1)) &&
15542 V->getConstantOperandVal(1) == Lane + 1 &&
15543 V->getOperand(0).getResNo() == ResNo;
15544 });
15545 if (OtherIt == Op0->users().end())
15546 return SDValue();
15547
15548 // For float extracts, we need to be converting to a i32 for both vector
15549 // lanes.
15550 SDValue OtherExt(*OtherIt, 0);
15551 if (OtherExt.getValueType() != MVT::i32) {
15552 if (!OtherExt->hasOneUse() ||
15553 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15554 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15555 return SDValue();
15556 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15557 }
15558
15559 // Convert the type to a f64 and extract with a VMOVRRD.
15560 SDValue F64 = DCI.DAG.getNode(
15561 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15562 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15563 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15564 SDValue VMOVRRD =
15565 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15566
15567 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15568 return VMOVRRD;
15569}
15570
15571static SDValue PerformExtractEltCombine(SDNode *N,
15572 TargetLowering::DAGCombinerInfo &DCI,
15573 const ARMSubtarget *ST) {
15574 SDValue Op0 = N->getOperand(0);
15575 EVT VT = N->getValueType(0);
15576 SDLoc dl(N);
15577
15578 // extract (vdup x) -> x
15579 if (Op0->getOpcode() == ARMISD::VDUP) {
15580 SDValue X = Op0->getOperand(0);
15581 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15582 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15583 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15584 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15585 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15586 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15587
15588 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15589 X = X->getOperand(0);
15590 if (X.getValueType() == VT)
15591 return X;
15592 }
15593
15594 // extract ARM_BUILD_VECTOR -> x
15595 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15596 isa<ConstantSDNode>(N->getOperand(1)) &&
15597 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15598 return Op0.getOperand(N->getConstantOperandVal(1));
15599 }
15600
15601 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15602 if (Op0.getValueType() == MVT::v4i32 &&
15603 isa<ConstantSDNode>(N->getOperand(1)) &&
15604 Op0.getOpcode() == ISD::BITCAST &&
15605 Op0.getOperand(0).getOpcode() == ARMISD::BUILD_VECTOR &&
15606 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15607 SDValue BV = Op0.getOperand(0);
15608 unsigned Offset = N->getConstantOperandVal(1);
15609 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15610 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15611 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15612 }
15613
15614 // extract x, n; extract x, n+1 -> VMOVRRD x
15615 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15616 return R;
15617
15618 // extract (MVETrunc(x)) -> extract x
15619 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15620 unsigned Idx = N->getConstantOperandVal(1);
15621 unsigned Vec =
15622 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15623 unsigned SubIdx =
15624 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15625 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15626 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15627 }
15628
15629 return SDValue();
15630}
15631
15632static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15633 SDValue Op = N->getOperand(0);
15634 EVT VT = N->getValueType(0);
15635
15636 // sext_inreg(VGETLANEu) -> VGETLANEs
15637 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15638 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15639 Op.getOperand(0).getValueType().getScalarType())
15640 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15641 Op.getOperand(1));
15642
15643 return SDValue();
15644}
15645
15646static SDValue
15647PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15648 SDValue Vec = N->getOperand(0);
15649 SDValue SubVec = N->getOperand(1);
15650 uint64_t IdxVal = N->getConstantOperandVal(2);
15651 EVT VecVT = Vec.getValueType();
15652 EVT SubVT = SubVec.getValueType();
15653
15654 // Only do this for legal fixed vector types.
15655 if (!VecVT.isFixedLengthVector() ||
15656 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15657 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15658 return SDValue();
15659
15660 // Ignore widening patterns.
15661 if (IdxVal == 0 && Vec.isUndef())
15662 return SDValue();
15663
15664 // Subvector must be half the width and an "aligned" insertion.
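 // For example, inserting a v4i16 subvector into a v8i16 vector at index 0 or 4.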
15665 unsigned NumSubElts = SubVT.getVectorNumElements();
15666 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15667 (IdxVal != 0 && IdxVal != NumSubElts))
15668 return SDValue();
15669
15670 // Fold insert_subvector -> concat_vectors
15671 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15672 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15673 SDLoc DL(N);
15674 SDValue Lo, Hi;
15675 if (IdxVal == 0) {
15676 Lo = SubVec;
15677 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15678 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15679 } else {
15680 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15681 DCI.DAG.getVectorIdxConstant(0, DL));
15682 Hi = SubVec;
15683 }
15684 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15685}
15686
15687// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15688static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15689 SelectionDAG &DAG) {
15690 SDValue Trunc = N->getOperand(0);
15691 EVT VT = Trunc.getValueType();
15692 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15693 return SDValue();
15694
15695 SDLoc DL(Trunc);
15696 if (isVMOVNTruncMask(N->getMask(), VT, false))
15697 return DAG.getNode(
15698 ARMISD::VMOVN, DL, VT,
15699 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15700 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15701 DAG.getConstant(1, DL, MVT::i32));
15702 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15703 return DAG.getNode(
15704 ARMISD::VMOVN, DL, VT,
15705 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15706 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15707 DAG.getConstant(1, DL, MVT::i32));
15708 return SDValue();
15709}
15710
15711/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15712/// ISD::VECTOR_SHUFFLE.
15713static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15714 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15715 return R;
15716
15717 // The LLVM shufflevector instruction does not require the shuffle mask
15718 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15719 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15720 // operands do not match the mask length, they are extended by concatenating
15721 // them with undef vectors. That is probably the right thing for other
15722 // targets, but for NEON it is better to concatenate two double-register
15723 // size vector operands into a single quad-register size vector. Do that
15724 // transformation here:
15725 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15726 // shuffle(concat(v1, v2), undef)
15727 SDValue Op0 = N->getOperand(0);
15728 SDValue Op1 = N->getOperand(1);
15729 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15730 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15731 Op0.getNumOperands() != 2 ||
15732 Op1.getNumOperands() != 2)
15733 return SDValue();
15734 SDValue Concat0Op1 = Op0.getOperand(1);
15735 SDValue Concat1Op1 = Op1.getOperand(1);
15736 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15737 return SDValue();
15738 // Skip the transformation if any of the types are illegal.
15739 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15740 EVT VT = N->getValueType(0);
15741 if (!TLI.isTypeLegal(VT) ||
15742 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15743 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15744 return SDValue();
15745
15746 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15747 Op0.getOperand(0), Op1.getOperand(0));
15748 // Translate the shuffle mask.
15749 SmallVector<int, 16> NewMask;
15750 unsigned NumElts = VT.getVectorNumElements();
15751 unsigned HalfElts = NumElts/2;
15752 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15753 for (unsigned n = 0; n < NumElts; ++n) {
15754 int MaskElt = SVN->getMaskElt(n);
15755 int NewElt = -1;
15756 if (MaskElt < (int)HalfElts)
15757 NewElt = MaskElt;
15758 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15759 NewElt = HalfElts + MaskElt - NumElts;
15760 NewMask.push_back(NewElt);
15761 }
15762 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15763 DAG.getUNDEF(VT), NewMask);
15764}
15765
15766/// Load/store instruction that can be merged with a base address
15767/// update
15768struct BaseUpdateTarget {
15769 SDNode *N;
15770 bool isIntrinsic;
15771 bool isStore;
15772 unsigned AddrOpIdx;
15773};
15774
15775struct BaseUpdateUser {
15776 /// Instruction that updates a pointer
15777 SDNode *N;
15778 /// Pointer increment operand
15779 SDValue Inc;
15780 /// Pointer increment value if it is a constant, or 0 otherwise
15781 unsigned ConstInc;
15782};
15783
15784static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15785 // Check that the add is independent of the load/store.
15786 // Otherwise, folding it would create a cycle. Search through Addr
15787 // as well, since the User may not be a direct user of Addr and
15788 // only share a base pointer.
15789 SmallPtrSet<const SDNode *, 32> Visited;
15790 SmallVector<const SDNode *, 16> Worklist;
15791 Worklist.push_back(N);
15792 Worklist.push_back(User);
15793 const unsigned MaxSteps = 1024;
15794 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15795 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15796 return false;
15797 return true;
15798}
15799
15800static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15801 struct BaseUpdateUser &User,
15802 bool SimpleConstIncOnly,
15803 TargetLowering::DAGCombinerInfo &DCI) {
15804 SelectionDAG &DAG = DCI.DAG;
15805 SDNode *N = Target.N;
15806 MemSDNode *MemN = cast<MemSDNode>(N);
15807 SDLoc dl(N);
15808
15809 // Find the new opcode for the updating load/store.
15810 bool isLoadOp = true;
15811 bool isLaneOp = false;
15812 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15813 // as an operand.
15814 bool hasAlignment = true;
15815 unsigned NewOpc = 0;
15816 unsigned NumVecs = 0;
15817 if (Target.isIntrinsic) {
15818 unsigned IntNo = N->getConstantOperandVal(1);
15819 switch (IntNo) {
15820 default:
15821 llvm_unreachable("unexpected intrinsic for Neon base update");
15822 case Intrinsic::arm_neon_vld1:
15823 NewOpc = ARMISD::VLD1_UPD;
15824 NumVecs = 1;
15825 break;
15826 case Intrinsic::arm_neon_vld2:
15827 NewOpc = ARMISD::VLD2_UPD;
15828 NumVecs = 2;
15829 break;
15830 case Intrinsic::arm_neon_vld3:
15831 NewOpc = ARMISD::VLD3_UPD;
15832 NumVecs = 3;
15833 break;
15834 case Intrinsic::arm_neon_vld4:
15835 NewOpc = ARMISD::VLD4_UPD;
15836 NumVecs = 4;
15837 break;
15838 case Intrinsic::arm_neon_vld1x2:
15839 NewOpc = ARMISD::VLD1x2_UPD;
15840 NumVecs = 2;
15841 hasAlignment = false;
15842 break;
15843 case Intrinsic::arm_neon_vld1x3:
15844 NewOpc = ARMISD::VLD1x3_UPD;
15845 NumVecs = 3;
15846 hasAlignment = false;
15847 break;
15848 case Intrinsic::arm_neon_vld1x4:
15849 NewOpc = ARMISD::VLD1x4_UPD;
15850 NumVecs = 4;
15851 hasAlignment = false;
15852 break;
15853 case Intrinsic::arm_neon_vld2dup:
15854 NewOpc = ARMISD::VLD2DUP_UPD;
15855 NumVecs = 2;
15856 break;
15857 case Intrinsic::arm_neon_vld3dup:
15858 NewOpc = ARMISD::VLD3DUP_UPD;
15859 NumVecs = 3;
15860 break;
15861 case Intrinsic::arm_neon_vld4dup:
15862 NewOpc = ARMISD::VLD4DUP_UPD;
15863 NumVecs = 4;
15864 break;
15865 case Intrinsic::arm_neon_vld2lane:
15866 NewOpc = ARMISD::VLD2LN_UPD;
15867 NumVecs = 2;
15868 isLaneOp = true;
15869 break;
15870 case Intrinsic::arm_neon_vld3lane:
15871 NewOpc = ARMISD::VLD3LN_UPD;
15872 NumVecs = 3;
15873 isLaneOp = true;
15874 break;
15875 case Intrinsic::arm_neon_vld4lane:
15876 NewOpc = ARMISD::VLD4LN_UPD;
15877 NumVecs = 4;
15878 isLaneOp = true;
15879 break;
15880 case Intrinsic::arm_neon_vst1:
15881 NewOpc = ARMISD::VST1_UPD;
15882 NumVecs = 1;
15883 isLoadOp = false;
15884 break;
15885 case Intrinsic::arm_neon_vst2:
15886 NewOpc = ARMISD::VST2_UPD;
15887 NumVecs = 2;
15888 isLoadOp = false;
15889 break;
15890 case Intrinsic::arm_neon_vst3:
15891 NewOpc = ARMISD::VST3_UPD;
15892 NumVecs = 3;
15893 isLoadOp = false;
15894 break;
15895 case Intrinsic::arm_neon_vst4:
15896 NewOpc = ARMISD::VST4_UPD;
15897 NumVecs = 4;
15898 isLoadOp = false;
15899 break;
15900 case Intrinsic::arm_neon_vst2lane:
15901 NewOpc = ARMISD::VST2LN_UPD;
15902 NumVecs = 2;
15903 isLoadOp = false;
15904 isLaneOp = true;
15905 break;
15906 case Intrinsic::arm_neon_vst3lane:
15907 NewOpc = ARMISD::VST3LN_UPD;
15908 NumVecs = 3;
15909 isLoadOp = false;
15910 isLaneOp = true;
15911 break;
15912 case Intrinsic::arm_neon_vst4lane:
15913 NewOpc = ARMISD::VST4LN_UPD;
15914 NumVecs = 4;
15915 isLoadOp = false;
15916 isLaneOp = true;
15917 break;
15918 case Intrinsic::arm_neon_vst1x2:
15919 NewOpc = ARMISD::VST1x2_UPD;
15920 NumVecs = 2;
15921 isLoadOp = false;
15922 hasAlignment = false;
15923 break;
15924 case Intrinsic::arm_neon_vst1x3:
15925 NewOpc = ARMISD::VST1x3_UPD;
15926 NumVecs = 3;
15927 isLoadOp = false;
15928 hasAlignment = false;
15929 break;
15930 case Intrinsic::arm_neon_vst1x4:
15931 NewOpc = ARMISD::VST1x4_UPD;
15932 NumVecs = 4;
15933 isLoadOp = false;
15934 hasAlignment = false;
15935 break;
15936 }
15937 } else {
15938 isLaneOp = true;
15939 switch (N->getOpcode()) {
15940 default:
15941 llvm_unreachable("unexpected opcode for Neon base update");
15942 case ARMISD::VLD1DUP:
15943 NewOpc = ARMISD::VLD1DUP_UPD;
15944 NumVecs = 1;
15945 break;
15946 case ARMISD::VLD2DUP:
15947 NewOpc = ARMISD::VLD2DUP_UPD;
15948 NumVecs = 2;
15949 break;
15950 case ARMISD::VLD3DUP:
15951 NewOpc = ARMISD::VLD3DUP_UPD;
15952 NumVecs = 3;
15953 break;
15954 case ARMISD::VLD4DUP:
15955 NewOpc = ARMISD::VLD4DUP_UPD;
15956 NumVecs = 4;
15957 break;
15958 case ISD::LOAD:
15959 NewOpc = ARMISD::VLD1_UPD;
15960 NumVecs = 1;
15961 isLaneOp = false;
15962 break;
15963 case ISD::STORE:
15964 NewOpc = ARMISD::VST1_UPD;
15965 NumVecs = 1;
15966 isLaneOp = false;
15967 isLoadOp = false;
15968 break;
15969 }
15970 }
15971
15972 // Find the size of memory referenced by the load/store.
15973 EVT VecTy;
15974 if (isLoadOp) {
15975 VecTy = N->getValueType(0);
15976 } else if (Target.isIntrinsic) {
15977 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15978 } else {
15979 assert(Target.isStore &&
15980 "Node has to be a load, a store, or an intrinsic!");
15981 VecTy = N->getOperand(1).getValueType();
15982 }
15983
15984 bool isVLDDUPOp =
15985 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15986 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15987
15988 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15989 if (isLaneOp || isVLDDUPOp)
15990 NumBytes /= VecTy.getVectorNumElements();
15991
15992 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15993 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15994 // separate instructions that make it harder to use a non-constant update.
15995 return false;
15996 }
15997
15998 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15999 return false;
16000
16001 if (!isValidBaseUpdate(N, User.N))
16002 return false;
16003
16004 // OK, we found an ADD we can fold into the base update.
16005 // Now, create a _UPD node, taking care of not breaking alignment.
16006
16007 EVT AlignedVecTy = VecTy;
16008 Align Alignment = MemN->getAlign();
16009
16010 // If this is a less-than-standard-aligned load/store, change the type to
16011 // match the standard alignment.
16012 // The alignment is overlooked when selecting _UPD variants; and it's
16013 // easier to introduce bitcasts here than fix that.
16014 // There are 3 ways to get to this base-update combine:
16015 // - intrinsics: they are assumed to be properly aligned (to the standard
16016 // alignment of the memory type), so we don't need to do anything.
16017 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16018 // intrinsics, so, likewise, there's nothing to do.
16019 // - generic load/store instructions: the alignment is specified as an
16020 // explicit operand, rather than implicitly as the standard alignment
16021 // of the memory type (like the intrisics). We need to change the
16022 // memory type to match the explicit alignment. That way, we don't
16023 // generate non-standard-aligned ARMISD::VLDx nodes.
16024 if (isa<LSBaseSDNode>(N)) {
16025 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16026 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16027 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16028 assert(!isLaneOp && "Unexpected generic load/store lane.");
16029 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16030 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16031 }
16032 // Don't set an explicit alignment on regular load/stores that we want
16033 // to transform to VLD/VST 1_UPD nodes.
16034 // This matches the behavior of regular load/stores, which only get an
16035 // explicit alignment if the MMO alignment is larger than the standard
16036 // alignment of the memory type.
16037 // Intrinsics, however, always get an explicit alignment, set to the
16038 // alignment of the MMO.
16039 Alignment = Align(1);
16040 }
16041
16042 // Create the new updating load/store node.
16043 // First, create an SDVTList for the new updating node's results.
16044 EVT Tys[6];
16045 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16046 unsigned n;
16047 for (n = 0; n < NumResultVecs; ++n)
16048 Tys[n] = AlignedVecTy;
16049 Tys[n++] = MVT::i32;
16050 Tys[n] = MVT::Other;
16051 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16052
16053 // Then, gather the new node's operands.
16054 SmallVector<SDValue, 8> Ops;
16055 Ops.push_back(N->getOperand(0)); // incoming chain
16056 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16057 Ops.push_back(User.Inc);
16058
16059 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16060 // Try to match the intrinsic's signature
16061 Ops.push_back(StN->getValue());
16062 } else {
16063 // Loads (and of course intrinsics) match the intrinsics' signature,
16064 // so just add all but the alignment operand.
16065 unsigned LastOperand =
16066 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16067 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16068 Ops.push_back(N->getOperand(i));
16069 }
16070
16071 // For all node types, the alignment operand is always the last one.
16072 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16073
16074 // If this is a non-standard-aligned STORE, the penultimate operand is the
16075 // stored value. Bitcast it to the aligned type.
16076 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16077 SDValue &StVal = Ops[Ops.size() - 2];
16078 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16079 }
16080
16081 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16082 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16083 MemN->getMemOperand());
16084
16085 // Update the uses.
16086 SmallVector<SDValue, 5> NewResults;
16087 for (unsigned i = 0; i < NumResultVecs; ++i)
16088 NewResults.push_back(SDValue(UpdN.getNode(), i));
16089
16090 // If this is a non-standard-aligned LOAD, the first result is the loaded
16091 // value. Bitcast it to the expected result type.
16092 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16093 SDValue &LdVal = NewResults[0];
16094 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16095 }
16096
16097 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16098 DCI.CombineTo(N, NewResults);
16099 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16100
16101 return true;
16102}
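The `NumBytes` computed in `TryCombineBaseUpdate` is the post-increment a matching ADD has to supply: the whole memory footprint for full-vector operations, and one element's worth for lane and dup forms. A small standalone sketch of that arithmetic (the helper name and the sample values are illustrative, not taken from this file):

```cpp
#include <cassert>

// Mirror of the NumBytes computation above: NumVecs registers of VecBits
// each, reduced to a single element for lane/dup accesses.
static unsigned expectedPostInc(unsigned NumVecs, unsigned VecBits,
                                unsigned NumElts, bool LaneOrDup) {
  unsigned NumBytes = NumVecs * VecBits / 8;
  if (LaneOrDup)
    NumBytes /= NumElts;
  return NumBytes;
}

int main() {
  // vld4.32 {d16-d19}: four 64-bit vectors, so a 32-byte post-increment.
  assert(expectedPostInc(4, 64, 2, false) == 32);
  // vld2.16, single lane: two 64-bit vectors but only one 16-bit lane each,
  // so the increment is 2 * 2 = 4 bytes.
  assert(expectedPostInc(2, 64, 4, true) == 4);
  return 0;
}
```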
16103
16104 // If (opcode ptr inc) is an ADD-like instruction, return the
16105// increment value. Otherwise return 0.
16106static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16107 SDValue Inc, const SelectionDAG &DAG) {
16108 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16109 if (!CInc)
16110 return 0;
16111
16112 switch (Opcode) {
16113 case ARMISD::VLD1_UPD:
16114 case ISD::ADD:
16115 return CInc->getZExtValue();
16116 case ISD::OR: {
16117 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16118 // (OR ptr inc) is the same as (ADD ptr inc)
16119 return CInc->getZExtValue();
16120 }
16121 return 0;
16122 }
16123 default:
16124 return 0;
16125 }
16126}
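The `ISD::OR` case relies on a standard identity: when the pointer and the increment share no set bits, OR behaves exactly like ADD (there are no carries), so an OR used for address arithmetic can be folded as an increment. A minimal self-contained illustration with plain integers:

```cpp
#include <cassert>
#include <cstdint>

// If (Ptr & Inc) == 0, then (Ptr | Inc) == (Ptr + Inc): with no common bits
// there is nothing to carry, so the two operations give the same address.
static uint32_t orAsAdd(uint32_t Ptr, uint32_t Inc) {
  assert((Ptr & Inc) == 0 && "caller must prove no common bits are set");
  return Ptr | Inc;
}

int main() {
  uint32_t Base = 0x1000; // aligned base, low bits clear
  uint32_t Off = 0x8;     // fits entirely within the clear low bits
  assert(orAsAdd(Base, Off) == Base + Off);
  return 0;
}
```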
16127
16128 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
16129 switch (N->getOpcode()) {
16130 case ISD::ADD:
16131 case ISD::OR: {
16132 if (isa<ConstantSDNode>(N->getOperand(1))) {
16133 *Ptr = N->getOperand(0);
16134 *CInc = N->getOperand(1);
16135 return true;
16136 }
16137 return false;
16138 }
16139 case ARMISD::VLD1_UPD: {
16140 if (isa<ConstantSDNode>(N->getOperand(2))) {
16141 *Ptr = N->getOperand(1);
16142 *CInc = N->getOperand(2);
16143 return true;
16144 }
16145 return false;
16146 }
16147 default:
16148 return false;
16149 }
16150}
16151
16152/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16153/// NEON load/store intrinsics, and generic vector load/stores, to merge
16154/// base address updates.
16155/// For generic load/stores, the memory type is assumed to be a vector.
16156/// The caller is assumed to have checked legality.
16157 static SDValue CombineBaseUpdate(SDNode *N,
16158 TargetLowering::DAGCombinerInfo &DCI) {
16159 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16160 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16161 const bool isStore = N->getOpcode() == ISD::STORE;
16162 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16163 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16164
16165 // Limit the number of possible base-updates we look at to prevent degenerate
16166 // cases.
16167 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16168
16169 SDValue Addr = N->getOperand(AddrOpIdx);
16170
16171 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16172
16173 // Search for a use of the address operand that is an increment.
16174 for (SDUse &Use : Addr->uses()) {
16175 SDNode *User = Use.getUser();
16176 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16177 continue;
16178
16179 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16180 unsigned ConstInc =
16181 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16182
16183 if (ConstInc || User->getOpcode() == ISD::ADD) {
16184 BaseUpdates.push_back({User, Inc, ConstInc});
16185 if (BaseUpdates.size() >= MaxBaseUpdates)
16186 break;
16187 }
16188 }
16189
16190 // If the address is a constant pointer increment itself, find
16191 // another constant increment that has the same base operand
16192 SDValue Base;
16193 SDValue CInc;
16194 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16195 unsigned Offset =
16196 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16197 for (SDUse &Use : Base->uses()) {
16198
16199 SDNode *User = Use.getUser();
16200 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16201 User->getNumOperands() != 2)
16202 continue;
16203
16204 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16205 unsigned UserOffset =
16206 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16207
16208 if (!UserOffset || UserOffset <= Offset)
16209 continue;
16210
16211 unsigned NewConstInc = UserOffset - Offset;
16212 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16213 BaseUpdates.push_back({User, NewInc, NewConstInc});
16214 if (BaseUpdates.size() >= MaxBaseUpdates)
16215 break;
16216 }
16217 }
16218
16219 // Try to fold the load/store with an update that matches memory
16220 // access size. This should work well for sequential loads.
16221 unsigned NumValidUpd = BaseUpdates.size();
16222 for (unsigned I = 0; I < NumValidUpd; I++) {
16223 BaseUpdateUser &User = BaseUpdates[I];
16224 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16225 return SDValue();
16226 }
16227
16228 // Try to fold with other users. Non-constant updates are considered
16229 // first, and constant updates are sorted to not break a sequence of
16230 // strided accesses (if there is any).
16231 llvm::stable_sort(BaseUpdates,
16232 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16233 return LHS.ConstInc < RHS.ConstInc;
16234 });
16235 for (BaseUpdateUser &User : BaseUpdates) {
16236 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16237 return SDValue();
16238 }
16239 return SDValue();
16240}
16241
16242 static SDValue PerformVLDCombine(SDNode *N,
16243 TargetLowering::DAGCombinerInfo &DCI) {
16244 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16245 return SDValue();
16246
16247 return CombineBaseUpdate(N, DCI);
16248}
16249
16250 static SDValue PerformMVEVLDCombine(SDNode *N,
16251 TargetLowering::DAGCombinerInfo &DCI) {
16252 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16253 return SDValue();
16254
16255 SelectionDAG &DAG = DCI.DAG;
16256 SDValue Addr = N->getOperand(2);
16257 MemSDNode *MemN = cast<MemSDNode>(N);
16258 SDLoc dl(N);
16259
16260 // For the stores, where there are multiple intrinsics we only actually want
16261 // to post-inc the last of them.
16262 unsigned IntNo = N->getConstantOperandVal(1);
16263 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16264 return SDValue();
16265 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16266 return SDValue();
16267
16268 // Search for a use of the address operand that is an increment.
16269 for (SDUse &Use : Addr->uses()) {
16270 SDNode *User = Use.getUser();
16271 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16272 continue;
16273
16274 // Check that the add is independent of the load/store. Otherwise, folding
16275 // it would create a cycle. We can avoid searching through Addr as it's a
16276 // predecessor to both.
16277 SmallPtrSet<const SDNode *, 32> Visited;
16278 SmallVector<const SDNode *, 16> Worklist;
16279 Visited.insert(Addr.getNode());
16280 Worklist.push_back(N);
16281 Worklist.push_back(User);
16282 const unsigned MaxSteps = 1024;
16283 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16284 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16285 continue;
16286
16287 // Find the new opcode for the updating load/store.
16288 bool isLoadOp = true;
16289 unsigned NewOpc = 0;
16290 unsigned NumVecs = 0;
16291 switch (IntNo) {
16292 default:
16293 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16294 case Intrinsic::arm_mve_vld2q:
16295 NewOpc = ARMISD::VLD2_UPD;
16296 NumVecs = 2;
16297 break;
16298 case Intrinsic::arm_mve_vld4q:
16299 NewOpc = ARMISD::VLD4_UPD;
16300 NumVecs = 4;
16301 break;
16302 case Intrinsic::arm_mve_vst2q:
16303 NewOpc = ARMISD::VST2_UPD;
16304 NumVecs = 2;
16305 isLoadOp = false;
16306 break;
16307 case Intrinsic::arm_mve_vst4q:
16308 NewOpc = ARMISD::VST4_UPD;
16309 NumVecs = 4;
16310 isLoadOp = false;
16311 break;
16312 }
16313
16314 // Find the size of memory referenced by the load/store.
16315 EVT VecTy;
16316 if (isLoadOp) {
16317 VecTy = N->getValueType(0);
16318 } else {
16319 VecTy = N->getOperand(3).getValueType();
16320 }
16321
16322 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16323
16324 // If the increment is a constant, it must match the memory ref size.
16325 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16326 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16327 if (!CInc || CInc->getZExtValue() != NumBytes)
16328 continue;
16329
16330 // Create the new updating load/store node.
16331 // First, create an SDVTList for the new updating node's results.
16332 EVT Tys[6];
16333 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16334 unsigned n;
16335 for (n = 0; n < NumResultVecs; ++n)
16336 Tys[n] = VecTy;
16337 Tys[n++] = MVT::i32;
16338 Tys[n] = MVT::Other;
16339 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16340
16341 // Then, gather the new node's operands.
16342 SmallVector<SDValue, 8> Ops;
16343 Ops.push_back(N->getOperand(0)); // incoming chain
16344 Ops.push_back(N->getOperand(2)); // ptr
16345 Ops.push_back(Inc);
16346
16347 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16348 Ops.push_back(N->getOperand(i));
16349
16350 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16351 MemN->getMemOperand());
16352
16353 // Update the uses.
16354 SmallVector<SDValue, 5> NewResults;
16355 for (unsigned i = 0; i < NumResultVecs; ++i)
16356 NewResults.push_back(SDValue(UpdN.getNode(), i));
16357
16358 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16359 DCI.CombineTo(N, NewResults);
16360 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16361
16362 break;
16363 }
16364
16365 return SDValue();
16366}
16367
16368/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16369/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16370/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16371/// return true.
16372 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16373 SelectionDAG &DAG = DCI.DAG;
16374 EVT VT = N->getValueType(0);
16375 // vldN-dup instructions only support 64-bit vectors for N > 1.
16376 if (!VT.is64BitVector())
16377 return false;
16378
16379 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16380 SDNode *VLD = N->getOperand(0).getNode();
16381 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16382 return false;
16383 unsigned NumVecs = 0;
16384 unsigned NewOpc = 0;
16385 unsigned IntNo = VLD->getConstantOperandVal(1);
16386 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16387 NumVecs = 2;
16388 NewOpc = ARMISD::VLD2DUP;
16389 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16390 NumVecs = 3;
16391 NewOpc = ARMISD::VLD3DUP;
16392 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16393 NumVecs = 4;
16394 NewOpc = ARMISD::VLD4DUP;
16395 } else {
16396 return false;
16397 }
16398
16399 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16400 // numbers match the load.
16401 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16402 for (SDUse &Use : VLD->uses()) {
16403 // Ignore uses of the chain result.
16404 if (Use.getResNo() == NumVecs)
16405 continue;
16406 SDNode *User = Use.getUser();
16407 if (User->getOpcode() != ARMISD::VDUPLANE ||
16408 VLDLaneNo != User->getConstantOperandVal(1))
16409 return false;
16410 }
16411
16412 // Create the vldN-dup node.
16413 EVT Tys[5];
16414 unsigned n;
16415 for (n = 0; n < NumVecs; ++n)
16416 Tys[n] = VT;
16417 Tys[n] = MVT::Other;
16418 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16419 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16420 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16421 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16422 Ops, VLDMemInt->getMemoryVT(),
16423 VLDMemInt->getMemOperand());
16424
16425 // Update the uses.
16426 for (SDUse &Use : VLD->uses()) {
16427 unsigned ResNo = Use.getResNo();
16428 // Ignore uses of the chain result.
16429 if (ResNo == NumVecs)
16430 continue;
16431 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16432 }
16433
16434 // Now the vldN-lane intrinsic is dead except for its chain result.
16435 // Update uses of the chain.
16436 std::vector<SDValue> VLDDupResults;
16437 for (unsigned n = 0; n < NumVecs; ++n)
16438 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16439 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16440 DCI.CombineTo(VLD, VLDDupResults);
16441
16442 return true;
16443}
16444
16445/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16446/// ARMISD::VDUPLANE.
16447 static SDValue PerformVDUPLANECombine(SDNode *N,
16448 TargetLowering::DAGCombinerInfo &DCI,
16449 const ARMSubtarget *Subtarget) {
16450 SDValue Op = N->getOperand(0);
16451 EVT VT = N->getValueType(0);
16452
16453 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16454 if (Subtarget->hasMVEIntegerOps()) {
16455 EVT ExtractVT = VT.getVectorElementType();
16456 // We need to ensure we are creating a legal type.
16457 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16458 ExtractVT = MVT::i32;
16459 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16460 N->getOperand(0), N->getOperand(1));
16461 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16462 }
16463
16464 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16465 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16466 if (CombineVLDDUP(N, DCI))
16467 return SDValue(N, 0);
16468
16469 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16470 // redundant. Ignore bit_converts for now; element sizes are checked below.
16471 while (Op.getOpcode() == ISD::BITCAST)
16472 Op = Op.getOperand(0);
16473 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16474 return SDValue();
16475
16476 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16477 unsigned EltSize = Op.getScalarValueSizeInBits();
16478 // The canonical VMOV for a zero vector uses a 32-bit element size.
16479 unsigned Imm = Op.getConstantOperandVal(0);
16480 unsigned EltBits;
16481 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16482 EltSize = 8;
16483 if (EltSize > VT.getScalarSizeInBits())
16484 return SDValue();
16485
16486 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16487}
16488
16489/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16490 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16491 const ARMSubtarget *Subtarget) {
16492 SDValue Op = N->getOperand(0);
16493 SDLoc dl(N);
16494
16495 if (Subtarget->hasMVEIntegerOps()) {
16496 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16497 // need to come from a GPR.
16498 if (Op.getValueType() == MVT::f32)
16499 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16500 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16501 else if (Op.getValueType() == MVT::f16)
16502 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16503 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16504 }
16505
16506 if (!Subtarget->hasNEON())
16507 return SDValue();
16508
16509 // Match VDUP(LOAD) -> VLD1DUP.
16510 // We match this pattern here rather than waiting for isel because the
16511 // transform is only legal for unindexed loads.
16512 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16513 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16514 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16515 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16516 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16517 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16518 SDValue VLDDup =
16519 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16520 LD->getMemoryVT(), LD->getMemOperand());
16521 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16522 return VLDDup;
16523 }
16524
16525 return SDValue();
16526}
16527
16528 static SDValue PerformLOADCombine(SDNode *N,
16529 TargetLowering::DAGCombinerInfo &DCI,
16530 const ARMSubtarget *Subtarget) {
16531 EVT VT = N->getValueType(0);
16532
16533 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16534 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16535 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16536 return CombineBaseUpdate(N, DCI);
16537
16538 return SDValue();
16539}
16540
16541// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16542// pack all of the elements in one place. Next, store to memory in fewer
16543// chunks.
16544 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16545 SelectionDAG &DAG) {
16546 SDValue StVal = St->getValue();
16547 EVT VT = StVal.getValueType();
16548 if (!St->isTruncatingStore() || !VT.isVector())
16549 return SDValue();
16550 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16551 EVT StVT = St->getMemoryVT();
16552 unsigned NumElems = VT.getVectorNumElements();
16553 assert(StVT != VT && "Cannot truncate to the same type");
16554 unsigned FromEltSz = VT.getScalarSizeInBits();
16555 unsigned ToEltSz = StVT.getScalarSizeInBits();
16556
16557 // From, To sizes and ElemCount must be pow of two
16558 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16559 return SDValue();
16560
16561 // We are going to use the original vector elt for storing.
16562 // Accumulated smaller vector elements must be a multiple of the store size.
16563 if (0 != (NumElems * FromEltSz) % ToEltSz)
16564 return SDValue();
16565
16566 unsigned SizeRatio = FromEltSz / ToEltSz;
16567 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16568
16569 // Create a type on which we perform the shuffle.
16570 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16571 NumElems * SizeRatio);
16572 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16573
16574 SDLoc DL(St);
16575 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16576 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16577 for (unsigned i = 0; i < NumElems; ++i)
16578 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16579 : i * SizeRatio;
16580
16581 // Can't shuffle using an illegal type.
16582 if (!TLI.isTypeLegal(WideVecVT))
16583 return SDValue();
16584
16585 SDValue Shuff = DAG.getVectorShuffle(
16586 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16587 // At this point all of the data is stored at the bottom of the
16588 // register. We now need to save it to mem.
16589
16590 // Find the largest store unit
16591 MVT StoreType = MVT::i8;
16592 for (MVT Tp : MVT::integer_valuetypes()) {
16593 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16594 StoreType = Tp;
16595 }
16596 // Didn't find a legal store type.
16597 if (!TLI.isTypeLegal(StoreType))
16598 return SDValue();
16599
16600 // Bitcast the original vector into a vector of store-size units
16601 EVT StoreVecVT =
16602 EVT::getVectorVT(*DAG.getContext(), StoreType,
16603 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16604 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16605 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16606 SmallVector<SDValue, 8> Chains;
16607 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16608 TLI.getPointerTy(DAG.getDataLayout()));
16609 SDValue BasePtr = St->getBasePtr();
16610
16611 // Perform one or more big stores into memory.
16612 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16613 for (unsigned I = 0; I < E; I++) {
16614 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16615 ShuffWide, DAG.getIntPtrConstant(I, DL));
16616 SDValue Ch =
16617 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16618 St->getAlign(), St->getMemOperand()->getFlags());
16619 BasePtr =
16620 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16621 Chains.push_back(Ch);
16622 }
16623 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16624}
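The shuffle built by `PerformTruncatingStoreCombine` packs the truncated elements into the low lanes of the wide vector: source element i lands in narrow lane i*SizeRatio, or in the top sub-lane of its group on big-endian targets. A standalone sketch of that mask computation (helper name is illustrative):

```cpp
#include <cassert>
#include <vector>

// Build the pack-to-bottom shuffle mask used before the narrower stores.
// -1 marks lanes whose contents do not matter.
static std::vector<int> packMask(unsigned NumElems, unsigned SizeRatio,
                                 bool BigEndian) {
  std::vector<int> Mask(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i < NumElems; ++i)
    Mask[i] = BigEndian ? (i + 1) * SizeRatio - 1 : i * SizeRatio;
  return Mask;
}

int main() {
  // v4i32 truncated to v4i8: SizeRatio = 4. Little-endian picks sub-lanes
  // 0, 4, 8, 12 of the v16i8 view; big-endian picks 3, 7, 11, 15.
  std::vector<int> LE = packMask(4, 4, /*BigEndian=*/false);
  assert(LE[0] == 0 && LE[1] == 4 && LE[2] == 8 && LE[3] == 12);
  std::vector<int> BE = packMask(4, 4, /*BigEndian=*/true);
  assert(BE[0] == 3 && BE[1] == 7 && BE[2] == 11 && BE[3] == 15);
  return 0;
}
```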
16625
16626// Try taking a single vector store from an fpround (which would otherwise turn
16627// into an expensive buildvector) and splitting it into a series of narrowing
16628// stores.
16629 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16630 SelectionDAG &DAG) {
16631 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16632 return SDValue();
16633 SDValue Trunc = St->getValue();
16634 if (Trunc->getOpcode() != ISD::FP_ROUND)
16635 return SDValue();
16636 EVT FromVT = Trunc->getOperand(0).getValueType();
16637 EVT ToVT = Trunc.getValueType();
16638 if (!ToVT.isVector())
16639 return SDValue();
16641 EVT ToEltVT = ToVT.getVectorElementType();
16642 EVT FromEltVT = FromVT.getVectorElementType();
16643
16644 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16645 return SDValue();
16646
16647 unsigned NumElements = 4;
16648 if (FromVT.getVectorNumElements() % NumElements != 0)
16649 return SDValue();
16650
16651 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16652 // use the VMOVN over splitting the store. We are looking for patterns of:
16653 // !rev: 0 N 1 N+1 2 N+2 ...
16654 // rev: N 0 N+1 1 N+2 2 ...
16655 // The shuffle may either be a single source (in which case N = NumElts/2) or
16656 // two inputs extended with concat to the same size (in which case N =
16657 // NumElts).
16658 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16659 ArrayRef<int> M = SVN->getMask();
16660 unsigned NumElts = ToVT.getVectorNumElements();
16661 if (SVN->getOperand(1).isUndef())
16662 NumElts /= 2;
16663
16664 unsigned Off0 = Rev ? NumElts : 0;
16665 unsigned Off1 = Rev ? 0 : NumElts;
16666
16667 for (unsigned I = 0; I < NumElts; I += 2) {
16668 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16669 return false;
16670 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16671 return false;
16672 }
16673
16674 return true;
16675 };
16676
16677 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16678 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16679 return SDValue();
16680
16681 LLVMContext &C = *DAG.getContext();
16682 SDLoc DL(St);
16683 // Details about the old store
16684 SDValue Ch = St->getChain();
16685 SDValue BasePtr = St->getBasePtr();
16686 Align Alignment = St->getBaseAlign();
16687 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16688 AAMDNodes AAInfo = St->getAAInfo();
16689
16690 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16691 // and then stored as truncating integer stores.
16692 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16693 EVT NewToVT = EVT::getVectorVT(
16694 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16695
16696 SmallVector<SDValue, 4> Stores;
16697 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16698 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16699 SDValue NewPtr =
16700 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16701
16702 SDValue Extract =
16703 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16704 DAG.getConstant(i * NumElements, DL, MVT::i32));
16705
16706 SDValue FPTrunc =
16707 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16708 Extract, DAG.getConstant(0, DL, MVT::i32));
16709 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16710
16711 SDValue Store = DAG.getTruncStore(
16712 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16713 NewToVT, Alignment, MMOFlags, AAInfo);
16714 Stores.push_back(Store);
16715 }
16716 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16717}
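The `isVMOVNShuffle` lambda above accepts masks that interleave two halves, `0 N 1 N+1 ...` or the reversed `N 0 N+1 1 ...`, with undef entries matching anything. A simplified standalone version of that predicate, handy for checking candidate masks by hand (function and variable names are illustrative):

```cpp
#include <cassert>
#include <vector>

// Accept masks interleaving lanes 0..N-1 with lanes N..2N-1, in either
// order. Negative entries are undef and match anything.
static bool isInterleaveMask(const std::vector<int> &M, unsigned NumElts,
                             bool Rev) {
  unsigned Off0 = Rev ? NumElts : 0;
  unsigned Off1 = Rev ? 0 : NumElts;
  for (unsigned I = 0; I + 1 < M.size() && I < NumElts; I += 2) {
    if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
      return false;
    if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
      return false;
  }
  return true;
}

int main() {
  // Interleave of two v4 halves into a v8: 0 4 1 5 2 6 3 7.
  std::vector<int> M = {0, 4, 1, 5, 2, 6, 3, 7};
  assert(isInterleaveMask(M, 4, /*Rev=*/false));
  assert(!isInterleaveMask(M, 4, /*Rev=*/true));
  return 0;
}
```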
16718
16719// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16720// into an expensive buildvector) and splitting it into a series of narrowing
16721// stores.
16722 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16723 SelectionDAG &DAG) {
16724 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16725 return SDValue();
16726 SDValue Trunc = St->getValue();
16727 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16728 return SDValue();
16729 EVT FromVT = Trunc->getOperand(0).getValueType();
16730 EVT ToVT = Trunc.getValueType();
16731
16732 LLVMContext &C = *DAG.getContext();
16733 SDLoc DL(St);
16734 // Details about the old store
16735 SDValue Ch = St->getChain();
16736 SDValue BasePtr = St->getBasePtr();
16737 Align Alignment = St->getBaseAlign();
16738 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16739 AAMDNodes AAInfo = St->getAAInfo();
16740
16741 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16742 FromVT.getVectorNumElements());
16743
16744 SmallVector<SDValue, 4> Stores;
16745 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16746 unsigned NewOffset =
16747 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16748 SDValue NewPtr =
16749 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16750
16751 SDValue Extract = Trunc.getOperand(i);
16752 SDValue Store = DAG.getTruncStore(
16753 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16754 NewToVT, Alignment, MMOFlags, AAInfo);
16755 Stores.push_back(Store);
16756 }
16757 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16758}
16759
16760// Given a floating point store from an extracted vector, with an integer
16761// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16762// help reduce fp register pressure, doesn't require the fp extract and allows
16763// use of more integer post-inc stores not available with vstr.
16764 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16765 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16766 return SDValue();
16767 SDValue Extract = St->getValue();
16768 EVT VT = Extract.getValueType();
16769 // For now only uses f16. This may be useful for f32 too, but that will
16770 // be bitcast(extract), not the VGETLANEu we currently check here.
16771 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16772 return SDValue();
16773
16774 SDNode *GetLane =
16775 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16776 {Extract.getOperand(0), Extract.getOperand(1)});
16777 if (!GetLane)
16778 return SDValue();
16779
16780 LLVMContext &C = *DAG.getContext();
16781 SDLoc DL(St);
16782 // Create a new integer store to replace the existing floating point version.
16783 SDValue Ch = St->getChain();
16784 SDValue BasePtr = St->getBasePtr();
16785 Align Alignment = St->getBaseAlign();
16786 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16787 AAMDNodes AAInfo = St->getAAInfo();
16788 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16789 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16790 St->getPointerInfo(), NewToVT, Alignment,
16791 MMOFlags, AAInfo);
16792
16793 return Store;
16794}
16795
16796/// PerformSTORECombine - Target-specific dag combine xforms for
16797/// ISD::STORE.
16798 static SDValue PerformSTORECombine(SDNode *N,
16799 TargetLowering::DAGCombinerInfo &DCI,
16800 const ARMSubtarget *Subtarget) {
16801 StoreSDNode *St = cast<StoreSDNode>(N);
16802 if (St->isVolatile())
16803 return SDValue();
16804 SDValue StVal = St->getValue();
16805 EVT VT = StVal.getValueType();
16806
16807 if (Subtarget->hasNEON())
16808 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16809 return Store;
16810
16811 if (Subtarget->hasMVEFloatOps())
16812 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16813 return NewToken;
16814
16815 if (Subtarget->hasMVEIntegerOps()) {
16816 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16817 return NewChain;
16818 if (SDValue NewToken =
16819 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16820 return NewToken;
16821 }
16822
16823 if (!ISD::isNormalStore(St))
16824 return SDValue();
16825
16826 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16827 // ARM stores of arguments in the same cache line.
16828 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16829 StVal.getNode()->hasOneUse()) {
16830 SelectionDAG &DAG = DCI.DAG;
16831 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16832 SDLoc DL(St);
16833 SDValue BasePtr = St->getBasePtr();
16834 SDValue NewST1 = DAG.getStore(
16835 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16836 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16837 St->getMemOperand()->getFlags());
16838
16839 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16840 DAG.getConstant(4, DL, MVT::i32));
16841 return DAG.getStore(NewST1.getValue(0), DL,
16842 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16843 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16844 St->getBaseAlign(), St->getMemOperand()->getFlags());
16845 }
16846
16847 if (StVal.getValueType() == MVT::i64 &&
16848 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16849
16850 // Bitcast an i64 store extracted from a vector to f64.
16851 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16852 SelectionDAG &DAG = DCI.DAG;
16853 SDLoc dl(StVal);
16854 SDValue IntVec = StVal.getOperand(0);
16855 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16856 IntVec.getValueType().getVectorNumElements()/2);
16857 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16858 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16859 Vec, StVal.getOperand(1));
16860 dl = SDLoc(N);
16861 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16862 // Make the DAGCombiner fold the bitcasts.
16863 DCI.AddToWorklist(Vec.getNode());
16864 DCI.AddToWorklist(ExtElt.getNode());
16865 DCI.AddToWorklist(V.getNode());
16866 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16867 St->getPointerInfo(), St->getAlign(),
16868 St->getMemOperand()->getFlags(), St->getAAInfo());
16869 }
16870
16871 // If this is a legal vector store, try to combine it into a VST1_UPD.
16872 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16873 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16874 return CombineBaseUpdate(N, DCI);
16875
16876 return SDValue();
16877}
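The VMOVDRR split in `PerformSTORECombine` stores the two 32-bit halves of the 64-bit value separately, flipping their order on big-endian targets so the resulting memory image matches a single 64-bit store. A host-side sketch of that layout rule (illustrative only, not the DAG code):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Store a 64-bit value as two 32-bit words at Ptr and Ptr+4. Little-endian
// writes the low half first; big-endian flips the order.
static void storeAsTwoWords(uint8_t *Ptr, uint32_t Lo, uint32_t Hi,
                            bool BigEndian) {
  uint32_t First = BigEndian ? Hi : Lo;
  uint32_t Second = BigEndian ? Lo : Hi;
  std::memcpy(Ptr, &First, 4);
  std::memcpy(Ptr + 4, &Second, 4);
}

int main() {
  uint8_t Buf[8];
  storeAsTwoWords(Buf, 0x11111111u, 0x22222222u, /*BigEndian=*/false);
  uint32_t W0, W1;
  std::memcpy(&W0, Buf, 4);
  std::memcpy(&W1, Buf + 4, 4);
  assert(W0 == 0x11111111u && W1 == 0x22222222u);
  return 0;
}
```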
16878
16879/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16880/// can replace combinations of VMUL and VCVT (floating-point to integer)
16881/// when the VMUL has a constant operand that is a power of 2.
16882///
16883/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16884/// vmul.f32 d16, d17, d16
16885/// vcvt.s32.f32 d16, d16
16886/// becomes:
16887/// vcvt.s32.f32 d16, d16, #3
16888 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16889 const ARMSubtarget *Subtarget) {
16890 if (!Subtarget->hasNEON())
16891 return SDValue();
16892
16893 SDValue Op = N->getOperand(0);
16894 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16895 Op.getOpcode() != ISD::FMUL)
16896 return SDValue();
16897
16898 SDValue ConstVec = Op->getOperand(1);
16899 if (!isa<BuildVectorSDNode>(ConstVec))
16900 return SDValue();
16901
16902 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16903 uint32_t FloatBits = FloatTy.getSizeInBits();
16904 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16905 uint32_t IntBits = IntTy.getSizeInBits();
16906 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16907 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16908 // These instructions only exist converting from f32 to i32. We can handle
16909 // smaller integers by generating an extra truncate, but larger ones would
16910 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16911 // these instructions only support v2i32/v4i32 types.
16912 return SDValue();
16913 }
16914
16915 BitVector UndefElements;
16916 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16917 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16918 if (C == -1 || C == 0 || C > 32)
16919 return SDValue();
16920
16921 SDLoc dl(N);
16922 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16923 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16924 Intrinsic::arm_neon_vcvtfp2fxu;
16925 SDValue FixConv = DAG.getNode(
16926 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16927 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16928 DAG.getConstant(C, dl, MVT::i32));
16929
16930 if (IntBits < FloatBits)
16931 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16932
16933 return FixConv;
16934}
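The combine is justified by a simple identity: converting `x * 2^C` to an integer gives the same result as a fixed-point conversion of `x` with `C` fractional bits, with both truncating toward zero. A scalar illustration of the arithmetic (the NEON instruction itself is not modeled):

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

// vcvt.s32.f32 ..., #C computes (int)(x * 2^C), which is exactly what the
// separate vmul-by-2^C followed by vcvt produced.
static int32_t cvtToFixed(float X, int FracBits) {
  return (int32_t)(X * std::ldexp(1.0f, FracBits));
}

int main() {
  float X = 2.75f;
  // vmul.f32 d16, d17, d16 ; vcvt.s32.f32 d16, d16   (d17 = splat 8.0)
  int32_t MulThenCvt = (int32_t)(X * 8.0f);
  // vcvt.s32.f32 d16, d16, #3
  assert(cvtToFixed(X, 3) == MulThenCvt); // both give 22
  return 0;
}
```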
16935
16936 static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16937 const ARMSubtarget *Subtarget) {
16938 if (!Subtarget->hasMVEFloatOps())
16939 return SDValue();
16940
16941 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16942 // The second form can be more easily turned into a predicated vadd, and
16943 // possibly combined into a fma to become a predicated vfma.
16944 SDValue Op0 = N->getOperand(0);
16945 SDValue Op1 = N->getOperand(1);
16946 EVT VT = N->getValueType(0);
16947 SDLoc DL(N);
16948
16949 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16950 // which these VMOV's represent.
16951 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16952 if (Op.getOpcode() != ISD::BITCAST ||
16953 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16954 return false;
16955 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16956 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16957 return true;
16958 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16959 return true;
16960 return false;
16961 };
16962
16963 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16964 std::swap(Op0, Op1);
16965
16966 if (Op1.getOpcode() != ISD::VSELECT)
16967 return SDValue();
16968
16969 SDNodeFlags FaddFlags = N->getFlags();
16970 bool NSZ = FaddFlags.hasNoSignedZeros();
16971 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16972 return SDValue();
16973
16974 SDValue FAdd =
16975 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16976 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16977}
16978
16979 static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
16980 SDValue LHS = N->getOperand(0);
16981 SDValue RHS = N->getOperand(1);
16982 EVT VT = N->getValueType(0);
16983 SDLoc DL(N);
16984
16985 if (!N->getFlags().hasAllowReassociation())
16986 return SDValue();
16987
16988 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16989 auto ReassocComplex = [&](SDValue A, SDValue B) {
16990 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16991 return SDValue();
16992 unsigned Opc = A.getConstantOperandVal(0);
16993 if (Opc != Intrinsic::arm_mve_vcmlaq)
16994 return SDValue();
16995 SDValue VCMLA = DAG.getNode(
16996 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16997 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16998 A.getOperand(3), A.getOperand(4));
16999 VCMLA->setFlags(A->getFlags());
17000 return VCMLA;
17001 };
17002 if (SDValue R = ReassocComplex(LHS, RHS))
17003 return R;
17004 if (SDValue R = ReassocComplex(RHS, LHS))
17005 return R;
17006
17007 return SDValue();
17008}
17009
17010 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17011 const ARMSubtarget *Subtarget) {
17012 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17013 return S;
17014 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17015 return S;
17016 return SDValue();
17017}
17018
17019/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17020/// can replace combinations of VCVT (integer to floating-point) and VMUL
17021/// when the VMUL has a constant operand that is a power of 2.
17022///
17023/// Example (assume d17 = <float 0.125, float 0.125>):
17024/// vcvt.f32.s32 d16, d16
17025/// vmul.f32 d16, d16, d17
17026/// becomes:
17027/// vcvt.f32.s32 d16, d16, #3
17028 static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17029 const ARMSubtarget *Subtarget) {
17030 if (!Subtarget->hasNEON())
17031 return SDValue();
17032
17033 SDValue Op = N->getOperand(0);
17034 unsigned OpOpcode = Op.getNode()->getOpcode();
17035 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17036 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17037 return SDValue();
17038
17039 SDValue ConstVec = N->getOperand(1);
17040 if (!isa<BuildVectorSDNode>(ConstVec))
17041 return SDValue();
17042
17043 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17044 uint32_t FloatBits = FloatTy.getSizeInBits();
17045 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17046 uint32_t IntBits = IntTy.getSizeInBits();
17047 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17048 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17049 // These instructions only exist converting from i32 to f32. We can handle
17050 // smaller integers by generating an extra extend, but larger ones would
17051 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17052 // these instructions only support v2i32/v4i32 types.
17053 return SDValue();
17054 }
17055
17056 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17057 APFloat Recip(0.0f);
17058 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17059 return SDValue();
17060
17061 bool IsExact;
17062 APSInt IntVal(33);
17063 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17064 APFloat::opOK ||
17065 !IsExact)
17066 return SDValue();
17067
17068 int32_t C = IntVal.exactLogBase2();
17069 if (C == -1 || C == 0 || C > 32)
17070 return SDValue();
17071
17072 SDLoc DL(N);
17073 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17074 SDValue ConvInput = Op.getOperand(0);
17075 if (IntBits < FloatBits)
17076 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17077 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17078
17079 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17080 : Intrinsic::arm_neon_vcvtfxu2fp;
17081 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17082 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17083 DAG.getConstant(C, DL, MVT::i32));
17084}
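This is the mirror image of PerformVCVTCombine: converting the integer and then multiplying by `1/2^C` is the same as a fixed-point int-to-float conversion with `C` fractional bits. A scalar sketch of the identity (helper name is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// vcvt.f32.s32 ..., #C computes (float)N / 2^C, matching the separate
// convert followed by multiply-by-reciprocal that gets replaced.
static float cvtFixedToFloat(int32_t N, unsigned FracBits) {
  return (float)N / (float)(1u << FracBits);
}

int main() {
  int32_t N = 44;
  // vcvt.f32.s32 d16, d16 ; vmul.f32 d16, d16, d17   (d17 = splat 0.125)
  float CvtThenMul = (float)N * 0.125f;
  // vcvt.f32.s32 d16, d16, #3
  assert(cvtFixedToFloat(N, 3) == CvtThenMul); // both give 5.5
  return 0;
}
```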
17085
17086 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17087 const ARMSubtarget *ST) {
17088 if (!ST->hasMVEIntegerOps())
17089 return SDValue();
17090
17091 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17092 EVT ResVT = N->getValueType(0);
17093 SDValue N0 = N->getOperand(0);
17094 SDLoc dl(N);
17095
17096 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17097 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17098 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17099 N0.getValueType() == MVT::v16i8)) {
17100 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17101 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17102 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17103 }
17104
17105 // We are looking for something that will have illegal types if left alone,
17106 // but that we can convert to a single instruction under MVE. For example
17107 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17108 // or
17109 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17110
17111 // The legal cases are:
17112 // VADDV u/s 8/16/32
17113 // VMLAV u/s 8/16/32
17114 // VADDLV u/s 32
17115 // VMLALV u/s 16/32
17116
17117 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17118 // extend it and use v4i32 instead.
17119 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17120 EVT AVT = A.getValueType();
17121 return any_of(ExtTypes, [&](MVT Ty) {
17122 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17123 AVT.bitsLE(Ty);
17124 });
17125 };
17126 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17127 EVT AVT = A.getValueType();
17128 if (!AVT.is128BitVector())
17129 A = DAG.getNode(ExtendCode, dl,
17130 AVT.changeVectorElementType(MVT::getIntegerVT(
17131 128 / AVT.getVectorMinNumElements())),
17132 A);
17133 return A;
17134 };
17135 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17136 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17137 return SDValue();
17138 SDValue A = N0->getOperand(0);
17139 if (ExtTypeMatches(A, ExtTypes))
17140 return ExtendIfNeeded(A, ExtendCode);
17141 return SDValue();
17142 };
17143 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17144 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17145 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17146 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17147 return SDValue();
17148 Mask = N0->getOperand(0);
17149 SDValue Ext = N0->getOperand(1);
17150 if (Ext->getOpcode() != ExtendCode)
17151 return SDValue();
17152 SDValue A = Ext->getOperand(0);
17153 if (ExtTypeMatches(A, ExtTypes))
17154 return ExtendIfNeeded(A, ExtendCode);
17155 return SDValue();
17156 };
17157 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17158 SDValue &A, SDValue &B) {
17159 // For a vmla we are trying to match a larger pattern:
17160 // ExtA = sext/zext A
17161 // ExtB = sext/zext B
17162 // Mul = mul ExtA, ExtB
17163 // vecreduce.add Mul
17164 // There might also be an extra extend between the mul and the addreduce, so
17165 // long as the bitwidth is high enough to make them equivalent (for example
17166 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17167 if (ResVT != RetTy)
17168 return false;
17169 SDValue Mul = N0;
17170 if (Mul->getOpcode() == ExtendCode &&
17171 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17172 ResVT.getScalarSizeInBits())
17173 Mul = Mul->getOperand(0);
17174 if (Mul->getOpcode() != ISD::MUL)
17175 return false;
17176 SDValue ExtA = Mul->getOperand(0);
17177 SDValue ExtB = Mul->getOperand(1);
17178 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17179 return false;
17180 A = ExtA->getOperand(0);
17181 B = ExtB->getOperand(0);
17182 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17183 A = ExtendIfNeeded(A, ExtendCode);
17184 B = ExtendIfNeeded(B, ExtendCode);
17185 return true;
17186 }
17187 return false;
17188 };
17189 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17190 SDValue &A, SDValue &B, SDValue &Mask) {
17191 // Same as the pattern above with a select for the zero predicated lanes
17192 // ExtA = sext/zext A
17193 // ExtB = sext/zext B
17194 // Mul = mul ExtA, ExtB
17195 // N0 = select Mask, Mul, 0
17196 // vecreduce.add N0
17197 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17198 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17199 return false;
17200 Mask = N0->getOperand(0);
17201 SDValue Mul = N0->getOperand(1);
17202 if (Mul->getOpcode() == ExtendCode &&
17203 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17204 ResVT.getScalarSizeInBits())
17205 Mul = Mul->getOperand(0);
17206 if (Mul->getOpcode() != ISD::MUL)
17207 return false;
17208 SDValue ExtA = Mul->getOperand(0);
17209 SDValue ExtB = Mul->getOperand(1);
17210 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17211 return false;
17212 A = ExtA->getOperand(0);
17213 B = ExtB->getOperand(0);
17214 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17215 A = ExtendIfNeeded(A, ExtendCode);
17216 B = ExtendIfNeeded(B, ExtendCode);
17217 return true;
17218 }
17219 return false;
17220 };
17221 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17222 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17223 // reductions. The operands are extended with MVEEXT, but as they are
17224 // reductions the lane orders do not matter. MVEEXT may be combined with
17225 // loads to produce two extending loads, or else they will be expanded to
17226 // VREV/VMOVL.
17227 EVT VT = Ops[0].getValueType();
17228 if (VT == MVT::v16i8) {
17229 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17230 "Unexpected illegal long reduction opcode");
17231 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17232
17233 SDValue Ext0 =
17234 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17235 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17236 SDValue Ext1 =
17237 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17238 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17239
17240 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17241 Ext0, Ext1);
17242 SDValue MLA1 =
17243 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17244 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17245 Ext0.getValue(1), Ext1.getValue(1));
17246 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17247 }
17248 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17249 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17250 SDValue(Node.getNode(), 1));
17251 };
17252
17253 SDValue A, B;
17254 SDValue Mask;
17255 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17256 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17257 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17258 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17259 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17260 A, B))
17261 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17262 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17263 A, B))
17264 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17265 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17266 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17267 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17268 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17269 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17270 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17271
17272 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17273 Mask))
17274 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17275 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17276 Mask))
17277 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17278 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17279 Mask))
17280 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17281 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17282 Mask))
17283 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17284 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17285 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17286 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17287 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17288 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17289 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17290
17291 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17292 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17293 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17294 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17295 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17296 return Create64bitNode(ARMISD::VADDLVs, {A});
17297 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17298 return Create64bitNode(ARMISD::VADDLVu, {A});
17299 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17300 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17301 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17302 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17303 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17304 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17305
17306 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17307 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17308 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17309 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17310 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17311 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17312 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17313 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17314 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17315 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17316 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17317 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17318 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17319 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17320
17321 // One complication: if the two inputs of the mul are the same, the output
17322 // sext will have been helpfully converted to a zext. Turn it back so the
17323 // signed patterns above can match again.
17324 SDValue Op = N0;
17325 if (Op->getOpcode() == ISD::VSELECT)
17326 Op = Op->getOperand(1);
17327 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17328 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17329 SDValue Mul = Op->getOperand(0);
17330 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17331 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17332 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17333 if (Op != N0)
17334 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17335 N0->getOperand(0), Ext, N0->getOperand(2));
17336 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17337 }
17338 }
17339
17340 return SDValue();
17341}
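The VMLAV/VMLALV patterns matched above are widening dot products: both operands are extended, multiplied lane by lane, and the products summed into a scalar accumulator. A plain scalar model of what, for example, a 16-lane unsigned 8-bit reduction computes (reference semantics only, not the lowering):

```cpp
#include <cassert>
#include <cstdint>

// Reference for vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))):
// a u8 x u8 dot product accumulated in 32 bits.
static uint32_t dotU8x16(const uint8_t A[16], const uint8_t B[16]) {
  uint32_t Acc = 0;
  for (int I = 0; I < 16; ++I)
    Acc += (uint32_t)A[I] * (uint32_t)B[I];
  return Acc;
}

int main() {
  uint8_t A[16], B[16];
  for (int I = 0; I < 16; ++I) {
    A[I] = (uint8_t)(I + 1); // 1..16
    B[I] = 2;
  }
  assert(dotU8x16(A, B) == 272); // 2 * (1 + 2 + ... + 16)
  return 0;
}
```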
17342
17343// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17344// the lanes are used. Due to the reduction being commutative the shuffle can be
17345// removed.
17346 static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17347 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17348 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17349 if (!Shuf || !Shuf->getOperand(1).isUndef())
17350 return SDValue();
17351
17352 // Check all elements are used once in the mask.
17353 ArrayRef<int> Mask = Shuf->getMask();
17354 APInt SetElts(Mask.size(), 0);
17355 for (int E : Mask) {
17356 if (E < 0 || E >= (int)Mask.size())
17357 return SDValue();
17358 SetElts.setBit(E);
17359 }
17360 if (!SetElts.isAllOnes())
17361 return SDValue();
17362
17363 if (N->getNumOperands() != VecOp + 1) {
17364 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17365 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17366 return SDValue();
17367 }
17368
17369 SmallVector<SDValue> Ops;
17370 for (SDValue Op : N->ops()) {
17371 if (Op.getValueType().isVector())
17372 Ops.push_back(Op.getOperand(0));
17373 else
17374 Ops.push_back(Op);
17375 }
17376 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17377}
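The shuffle can be dropped because an add reduction does not care about lane order: if the mask is a permutation that uses every lane exactly once, the sum of the shuffled vector equals the sum of the original. A tiny demonstration:

```cpp
#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> V = {3, 1, 4, 1, 5, 9, 2, 6};
  // A permutation mask that uses every lane exactly once.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  std::vector<int> Shuffled(V.size());
  for (std::size_t I = 0; I < V.size(); ++I)
    Shuffled[I] = V[Mask[I]];
  // vecreduce_add(shuffle(V)) == vecreduce_add(V) for any permutation.
  assert(std::accumulate(Shuffled.begin(), Shuffled.end(), 0) ==
         std::accumulate(V.begin(), V.end(), 0));
  return 0;
}
```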
17378
17379 static SDValue PerformVMOVNCombine(SDNode *N,
17380 TargetLowering::DAGCombinerInfo &DCI) {
17381 SDValue Op0 = N->getOperand(0);
17382 SDValue Op1 = N->getOperand(1);
17383 unsigned IsTop = N->getConstantOperandVal(2);
17384
17385 // VMOVNT a undef -> a
17386 // VMOVNB a undef -> a
17387 // VMOVNB undef a -> a
17388 if (Op1->isUndef())
17389 return Op0;
17390 if (Op0->isUndef() && !IsTop)
17391 return Op1;
17392
17393 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17394 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17395 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17396 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17397 Op1->getConstantOperandVal(2) == 0)
17398 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17399 Op0, Op1->getOperand(1), N->getOperand(2));
17400
17401 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17402 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17403 // into the top or bottom lanes.
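// Illustrative note (not from the original source): for a v8i16 VMOVN,
// APInt::getSplat(8, getLowBitsSet(2, 1)) yields 0b01010101, i.e. only the
// even (bottom) lanes of Qm are demanded, while Qd keeps its even or odd
// lanes depending on IsTop, so either input can be simplified lane-wise.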
17404 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17405 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17406 APInt Op0DemandedElts =
17407 IsTop ? Op1DemandedElts
17408 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17409
17410 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17411 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17412 return SDValue(N, 0);
17413 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17414 return SDValue(N, 0);
17415
17416 return SDValue();
17417}
17418
17419 static SDValue PerformVQMOVNCombine(SDNode *N,
17420 TargetLowering::DAGCombinerInfo &DCI) {
17421 SDValue Op0 = N->getOperand(0);
17422 unsigned IsTop = N->getConstantOperandVal(2);
17423
17424 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17425 APInt Op0DemandedElts =
17426 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17427 : APInt::getHighBitsSet(2, 1));
17428
17429 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17430 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17431 return SDValue(N, 0);
17432 return SDValue();
17433}
17434
17435 static SDValue PerformVQDMULHCombine(SDNode *N,
17436 TargetLowering::DAGCombinerInfo &DCI) {
17437 EVT VT = N->getValueType(0);
17438 SDValue LHS = N->getOperand(0);
17439 SDValue RHS = N->getOperand(1);
17440
17441 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17442 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17443 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17444 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17445 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17446 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17447 SDLoc DL(N);
17448 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17449 LHS.getOperand(0), RHS.getOperand(0));
17450 SDValue UndefV = LHS.getOperand(1);
17451 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17452 }
17453 return SDValue();
17454}
17455
17456 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17457 SDLoc DL(N);
17458 SDValue Op0 = N->getOperand(0);
17459 SDValue Op1 = N->getOperand(1);
17460
17461 // Turn X << -C -> X >> C and vice versa. The negative shift amounts can come
17462 // up from uses of the intrinsics.
17463 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17464 int ShiftAmt = C->getSExtValue();
17465 if (ShiftAmt == 0) {
17466 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17467 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17468 return SDValue();
17469 }
17470
17471 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17472 unsigned NewOpcode =
17473 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17474 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17475 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17476 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17477 return NewShift;
17478 }
17479 }
17480
17481 return SDValue();
17482}
17483
17484/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17486 DAGCombinerInfo &DCI) const {
17487 SelectionDAG &DAG = DCI.DAG;
17488 unsigned IntNo = N->getConstantOperandVal(0);
17489 switch (IntNo) {
17490 default:
17491 // Don't do anything for most intrinsics.
17492 break;
17493
17494 // Vector shifts: check for immediate versions and lower them.
17495 // Note: This is done during DAG combining instead of DAG legalizing because
17496 // the build_vectors for 64-bit vector element shift counts are generally
17497 // not legal, and it is hard to see their values after they get legalized to
17498 // loads from a constant pool.
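// Rough example of what this matches (an assumed IR input, not taken from the
// source):
//   %r = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %x,
//                                                    <4 x i32> splat (i32 -3))
// The all-constant negative shift vector is recognised by isVShiftRImm and the
// call is emitted directly as an immediate right shift (VSHRsIMM by 3).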
17499 case Intrinsic::arm_neon_vshifts:
17500 case Intrinsic::arm_neon_vshiftu:
17501 case Intrinsic::arm_neon_vrshifts:
17502 case Intrinsic::arm_neon_vrshiftu:
17503 case Intrinsic::arm_neon_vrshiftn:
17504 case Intrinsic::arm_neon_vqshifts:
17505 case Intrinsic::arm_neon_vqshiftu:
17506 case Intrinsic::arm_neon_vqshiftsu:
17507 case Intrinsic::arm_neon_vqshiftns:
17508 case Intrinsic::arm_neon_vqshiftnu:
17509 case Intrinsic::arm_neon_vqshiftnsu:
17510 case Intrinsic::arm_neon_vqrshiftns:
17511 case Intrinsic::arm_neon_vqrshiftnu:
17512 case Intrinsic::arm_neon_vqrshiftnsu: {
17513 EVT VT = N->getOperand(1).getValueType();
17514 int64_t Cnt;
17515 unsigned VShiftOpc = 0;
17516
17517 switch (IntNo) {
17518 case Intrinsic::arm_neon_vshifts:
17519 case Intrinsic::arm_neon_vshiftu:
17520 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17521 VShiftOpc = ARMISD::VSHLIMM;
17522 break;
17523 }
17524 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17525 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17526 : ARMISD::VSHRuIMM);
17527 break;
17528 }
17529 return SDValue();
17530
17531 case Intrinsic::arm_neon_vrshifts:
17532 case Intrinsic::arm_neon_vrshiftu:
17533 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17534 break;
17535 return SDValue();
17536
17537 case Intrinsic::arm_neon_vqshifts:
17538 case Intrinsic::arm_neon_vqshiftu:
17539 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17540 break;
17541 return SDValue();
17542
17543 case Intrinsic::arm_neon_vqshiftsu:
17544 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17545 break;
17546 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17547
17548 case Intrinsic::arm_neon_vrshiftn:
17549 case Intrinsic::arm_neon_vqshiftns:
17550 case Intrinsic::arm_neon_vqshiftnu:
17551 case Intrinsic::arm_neon_vqshiftnsu:
17552 case Intrinsic::arm_neon_vqrshiftns:
17553 case Intrinsic::arm_neon_vqrshiftnu:
17554 case Intrinsic::arm_neon_vqrshiftnsu:
17555 // Narrowing shifts require an immediate right shift.
17556 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17557 break;
17558 llvm_unreachable("invalid shift count for narrowing vector shift "
17559 "intrinsic");
17560
17561 default:
17562 llvm_unreachable("unhandled vector shift");
17563 }
17564
17565 switch (IntNo) {
17566 case Intrinsic::arm_neon_vshifts:
17567 case Intrinsic::arm_neon_vshiftu:
17568 // Opcode already set above.
17569 break;
17570 case Intrinsic::arm_neon_vrshifts:
17571 VShiftOpc = ARMISD::VRSHRsIMM;
17572 break;
17573 case Intrinsic::arm_neon_vrshiftu:
17574 VShiftOpc = ARMISD::VRSHRuIMM;
17575 break;
17576 case Intrinsic::arm_neon_vrshiftn:
17577 VShiftOpc = ARMISD::VRSHRNIMM;
17578 break;
17579 case Intrinsic::arm_neon_vqshifts:
17580 VShiftOpc = ARMISD::VQSHLsIMM;
17581 break;
17582 case Intrinsic::arm_neon_vqshiftu:
17583 VShiftOpc = ARMISD::VQSHLuIMM;
17584 break;
17585 case Intrinsic::arm_neon_vqshiftsu:
17586 VShiftOpc = ARMISD::VQSHLsuIMM;
17587 break;
17588 case Intrinsic::arm_neon_vqshiftns:
17589 VShiftOpc = ARMISD::VQSHRNsIMM;
17590 break;
17591 case Intrinsic::arm_neon_vqshiftnu:
17592 VShiftOpc = ARMISD::VQSHRNuIMM;
17593 break;
17594 case Intrinsic::arm_neon_vqshiftnsu:
17595 VShiftOpc = ARMISD::VQSHRNsuIMM;
17596 break;
17597 case Intrinsic::arm_neon_vqrshiftns:
17598 VShiftOpc = ARMISD::VQRSHRNsIMM;
17599 break;
17600 case Intrinsic::arm_neon_vqrshiftnu:
17601 VShiftOpc = ARMISD::VQRSHRNuIMM;
17602 break;
17603 case Intrinsic::arm_neon_vqrshiftnsu:
17604 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17605 break;
17606 }
17607
17608 SDLoc dl(N);
17609 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17610 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17611 }
17612
17613 case Intrinsic::arm_neon_vshiftins: {
17614 EVT VT = N->getOperand(1).getValueType();
17615 int64_t Cnt;
17616 unsigned VShiftOpc = 0;
17617
17618 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17619 VShiftOpc = ARMISD::VSLIIMM;
17620 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17621 VShiftOpc = ARMISD::VSRIIMM;
17622 else {
17623 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17624 }
17625
17626 SDLoc dl(N);
17627 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17628 N->getOperand(1), N->getOperand(2),
17629 DAG.getConstant(Cnt, dl, MVT::i32));
17630 }
17631
17632 case Intrinsic::arm_neon_vqrshifts:
17633 case Intrinsic::arm_neon_vqrshiftu:
17634 // No immediate versions of these to check for.
17635 break;
17636
17637 case Intrinsic::arm_neon_vbsl: {
17638 SDLoc dl(N);
17639 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17640 N->getOperand(2), N->getOperand(3));
17641 }
17642 case Intrinsic::arm_mve_vqdmlah:
17643 case Intrinsic::arm_mve_vqdmlash:
17644 case Intrinsic::arm_mve_vqrdmlah:
17645 case Intrinsic::arm_mve_vqrdmlash:
17646 case Intrinsic::arm_mve_vmla_n_predicated:
17647 case Intrinsic::arm_mve_vmlas_n_predicated:
17648 case Intrinsic::arm_mve_vqdmlah_predicated:
17649 case Intrinsic::arm_mve_vqdmlash_predicated:
17650 case Intrinsic::arm_mve_vqrdmlah_predicated:
17651 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17652 // These intrinsics all take an i32 scalar operand which is narrowed to the
17653 // size of a single lane of the vector type they return. So we don't need
17654 // any bits of that operand above that point, which allows us to eliminate
17655 // uxth/sxth.
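// For example (illustrative): with a v8i16 result only the low 16 bits of the
// i32 scalar operand matter, so a preceding uxth/sxth (zero/sign extend from
// i16) feeding operand 3 can be stripped by SimplifyDemandedBits.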
17656 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17657 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17658 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17659 return SDValue();
17660 break;
17661 }
17662
17663 case Intrinsic::arm_mve_minv:
17664 case Intrinsic::arm_mve_maxv:
17665 case Intrinsic::arm_mve_minav:
17666 case Intrinsic::arm_mve_maxav:
17667 case Intrinsic::arm_mve_minv_predicated:
17668 case Intrinsic::arm_mve_maxv_predicated:
17669 case Intrinsic::arm_mve_minav_predicated:
17670 case Intrinsic::arm_mve_maxav_predicated: {
17671 // These intrinsics all take an i32 scalar operand which is narrowed to the
17672 // size of a single lane of the vector type they take as the other input.
17673 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17674 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17675 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17676 return SDValue();
17677 break;
17678 }
17679
17680 case Intrinsic::arm_mve_addv: {
17681 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17682 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17683 bool Unsigned = N->getConstantOperandVal(2);
17684 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17685 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17686 }
17687
17688 case Intrinsic::arm_mve_addlv:
17689 case Intrinsic::arm_mve_addlv_predicated: {
17690 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17691 // which recombines the two outputs into an i64
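// E.g. (illustrative) arm.mve.addlv(<4 x i32> %x) with the unsigned flag set
// becomes (VADDLVu %x), and its two i32 results (low and high halves) are
// glued back into the expected i64 with a BUILD_PAIR.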
17692 bool Unsigned = N->getConstantOperandVal(2);
17693 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17694 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17695 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17696
17697 SmallVector<SDValue, 4> Ops;
17698 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17699 if (i != 2) // skip the unsigned flag
17700 Ops.push_back(N->getOperand(i));
17701
17702 SDLoc dl(N);
17703 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17704 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17705 val.getValue(1));
17706 }
17707 }
17708
17709 return SDValue();
17710}
17711
17712/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17713/// lowers them. As with the vector shift intrinsics, this is done during DAG
17714/// combining instead of DAG legalizing because the build_vectors for 64-bit
17715/// vector element shift counts are generally not legal, and it is hard to see
17716/// their values after they get legalized to loads from a constant pool.
17719 const ARMSubtarget *ST) {
17720 SelectionDAG &DAG = DCI.DAG;
17721 EVT VT = N->getValueType(0);
17722
17723 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17724 N->getOperand(0)->getOpcode() == ISD::AND &&
17725 N->getOperand(0)->hasOneUse()) {
17726 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17727 return SDValue();
17728 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17729 // usually show up because instcombine prefers to canonicalize it to
17730 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17731 // out of GEP lowering in some cases.
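// A sketch of the rewrite performed below (assuming isMask_32(AndMask)):
//   (shl (and x, 0x3ffff), 2)  ==>  (srl (shl x, 14), 12)
// The masked bits are pushed to the top with SHL and brought back down with
// SRL, so Thumb1 never has to materialise the 0x3ffff mask constant.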
17732 SDValue N0 = N->getOperand(0);
17733 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17734 if (!ShiftAmtNode)
17735 return SDValue();
17736 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17737 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17738 if (!AndMaskNode)
17739 return SDValue();
17740 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17741 // Don't transform uxtb/uxth.
17742 if (AndMask == 255 || AndMask == 65535)
17743 return SDValue();
17744 if (isMask_32(AndMask)) {
17745 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17746 if (MaskedBits > ShiftAmt) {
17747 SDLoc DL(N);
17748 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17749 DAG.getConstant(MaskedBits, DL, MVT::i32));
17750 return DAG.getNode(
17751 ISD::SRL, DL, MVT::i32, SHL,
17752 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17753 }
17754 }
17755 }
17756
17757 // Nothing to be done for scalar shifts.
17758 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17759 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17760 return SDValue();
17761 if (ST->hasMVEIntegerOps())
17762 return SDValue();
17763
17764 int64_t Cnt;
17765
17766 switch (N->getOpcode()) {
17767 default: llvm_unreachable("unexpected shift opcode");
17768
17769 case ISD::SHL:
17770 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17771 SDLoc dl(N);
17772 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17773 DAG.getConstant(Cnt, dl, MVT::i32));
17774 }
17775 break;
17776
17777 case ISD::SRA:
17778 case ISD::SRL:
17779 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17780 unsigned VShiftOpc =
17781 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17782 SDLoc dl(N);
17783 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17784 DAG.getConstant(Cnt, dl, MVT::i32));
17785 }
17786 }
17787 return SDValue();
17788}
17789
17790 // Look for a sign, zero or fp extend of a larger-than-legal load. This can be
17791// split into multiple extending loads, which are simpler to deal with than an
17792// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17793// to convert the type to an f32.
17794 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17795 SDValue N0 = N->getOperand(0);
17796 if (N0.getOpcode() != ISD::LOAD)
17797 return SDValue();
17798 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17799 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17800 LD->getExtensionType() != ISD::NON_EXTLOAD)
17801 return SDValue();
17802 EVT FromVT = LD->getValueType(0);
17803 EVT ToVT = N->getValueType(0);
17804 if (!ToVT.isVector())
17805 return SDValue();
17806 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
17807 EVT ToEltVT = ToVT.getVectorElementType();
17808 EVT FromEltVT = FromVT.getVectorElementType();
17809
17810 unsigned NumElements = 0;
17811 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17812 NumElements = 4;
17813 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17814 NumElements = 4;
17815 if (NumElements == 0 ||
17816 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17817 FromVT.getVectorNumElements() % NumElements != 0 ||
17818 !isPowerOf2_32(NumElements))
17819 return SDValue();
17820
17821 LLVMContext &C = *DAG.getContext();
17822 SDLoc DL(LD);
17823 // Details about the old load
17824 SDValue Ch = LD->getChain();
17825 SDValue BasePtr = LD->getBasePtr();
17826 Align Alignment = LD->getBaseAlign();
17827 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17828 AAMDNodes AAInfo = LD->getAAInfo();
17829
17830 ISD::LoadExtType NewExtType =
17831 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17832 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17833 EVT NewFromVT = EVT::getVectorVT(
17834 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17835 EVT NewToVT = EVT::getVectorVT(
17836 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17837
17838 SmallVector<SDValue, 4> Loads;
17839 SmallVector<SDValue, 4> Chains;
17840 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17841 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17842 SDValue NewPtr =
17843 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17844
17845 SDValue NewLoad =
17846 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17847 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17848 Alignment, MMOFlags, AAInfo);
17849 Loads.push_back(NewLoad);
17850 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17851 }
17852
17853 // Float truncs need to be extended with VCVTB's into their floating point types.
17854 if (FromEltVT == MVT::f16) {
17855 SmallVector<SDValue, 4> Extends;
17856
17857 for (unsigned i = 0; i < Loads.size(); i++) {
17858 SDValue LoadBC =
17859 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17860 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17861 DAG.getConstant(0, DL, MVT::i32));
17862 Extends.push_back(FPExt);
17863 }
17864
17865 Loads = Extends;
17866 }
17867
17868 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17869 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17870 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17871}
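// Illustrative shapes for the split above (assumed, not from the source): a
// (zext v8i8 load to v8i32) becomes two v4i8->v4i32 zextloads at byte offsets
// 0 and 4, tied together with a TokenFactor chain and recombined with
// CONCAT_VECTORS into the original v8i32 value.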
17872
17873/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17874/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17876 const ARMSubtarget *ST) {
17877 SDValue N0 = N->getOperand(0);
17878
17879 // Check for sign- and zero-extensions of vector extract operations of 8- and
17880 // 16-bit vector elements. NEON and MVE support these directly. They are
17881 // handled during DAG combining because type legalization will promote them
17882 // to 32-bit types and it is messy to recognize the operations after that.
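// E.g. (illustrative): (i32 (sext (extract_vector_elt v8i16:%v, 3))) is
// matched here and becomes (VGETLANEs %v, 3), performing the sign extension as
// part of the lane move instead of a separate sxth after legalization.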
17883 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17884 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17885 SDValue Vec = N0.getOperand(0);
17886 SDValue Lane = N0.getOperand(1);
17887 EVT VT = N->getValueType(0);
17888 EVT EltVT = N0.getValueType();
17889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17890
17891 if (VT == MVT::i32 &&
17892 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17893 TLI.isTypeLegal(Vec.getValueType()) &&
17894 isa<ConstantSDNode>(Lane)) {
17895
17896 unsigned Opc = 0;
17897 switch (N->getOpcode()) {
17898 default: llvm_unreachable("unexpected opcode");
17899 case ISD::SIGN_EXTEND:
17900 Opc = ARMISD::VGETLANEs;
17901 break;
17902 case ISD::ZERO_EXTEND:
17903 case ISD::ANY_EXTEND:
17904 Opc = ARMISD::VGETLANEu;
17905 break;
17906 }
17907 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17908 }
17909 }
17910
17911 if (ST->hasMVEIntegerOps())
17912 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17913 return NewLoad;
17914
17915 return SDValue();
17916}
17917
17918 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17919 const ARMSubtarget *ST) {
17920 if (ST->hasMVEFloatOps())
17921 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17922 return NewLoad;
17923
17924 return SDValue();
17925}
17926
17927// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17928// constant bounds.
17929 static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17930 const ARMSubtarget *Subtarget) {
17931 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17932 !Subtarget->isThumb2())
17933 return SDValue();
17934
17935 EVT VT = Op.getValueType();
17936 SDValue Op0 = Op.getOperand(0);
17937
17938 if (VT != MVT::i32 ||
17939 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17940 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17941 !isa<ConstantSDNode>(Op0.getOperand(1)))
17942 return SDValue();
17943
17944 SDValue Min = Op;
17945 SDValue Max = Op0;
17946 SDValue Input = Op0.getOperand(0);
17947 if (Min.getOpcode() == ISD::SMAX)
17948 std::swap(Min, Max);
17949
17950 APInt MinC = Min.getConstantOperandAPInt(1);
17951 APInt MaxC = Max.getConstantOperandAPInt(1);
17952
17953 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17954 !(MinC + 1).isPowerOf2())
17955 return SDValue();
17956
17957 SDLoc DL(Op);
17958 if (MinC == ~MaxC)
17959 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17960 DAG.getConstant(MinC.countr_one(), DL, VT));
17961 if (MaxC == 0)
17962 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17963 DAG.getConstant(MinC.countr_one(), DL, VT));
17964
17965 return SDValue();
17966}
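// Worked example (illustrative): smin(smax(x, -128), 127) has MinC = 127 (so
// MinC + 1 is a power of two) and MaxC = -128 = ~MinC, giving a signed
// saturate to 8 bits (SSAT); smin(smax(x, 0), 255) has MaxC == 0 and gives an
// unsigned saturate to 8 bits (USAT).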
17967
17968/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17969/// saturates.
17971 const ARMSubtarget *ST) {
17972 EVT VT = N->getValueType(0);
17973 SDValue N0 = N->getOperand(0);
17974
17975 if (VT == MVT::i32)
17976 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17977
17978 if (!ST->hasMVEIntegerOps())
17979 return SDValue();
17980
17981 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17982 return V;
17983
17984 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17985 return SDValue();
17986
17987 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17988 // Check one is a smin and the other is a smax
17989 if (Min->getOpcode() != ISD::SMIN)
17990 std::swap(Min, Max);
17991 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17992 return false;
17993
17994 APInt SaturateC;
17995 if (VT == MVT::v4i32)
17996 SaturateC = APInt(32, (1 << 15) - 1, true);
17997 else //if (VT == MVT::v8i16)
17998 SaturateC = APInt(16, (1 << 7) - 1, true);
17999
18000 APInt MinC, MaxC;
18001 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18002 MinC != SaturateC)
18003 return false;
18004 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18005 MaxC != ~SaturateC)
18006 return false;
18007 return true;
18008 };
18009
18010 if (IsSignedSaturate(N, N0.getNode())) {
18011 SDLoc DL(N);
18012 MVT ExtVT, HalfVT;
18013 if (VT == MVT::v4i32) {
18014 HalfVT = MVT::v8i16;
18015 ExtVT = MVT::v4i16;
18016 } else { // if (VT == MVT::v8i16)
18017 HalfVT = MVT::v16i8;
18018 ExtVT = MVT::v8i8;
18019 }
18020
18021 // Create a VQMOVNB with undef top lanes, then sign-extend it into the top
18022 // half. That extend will hopefully be removed if only the bottom bits are
18023 // demanded (through a truncating store, for example).
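// E.g. (illustrative) for VT == v4i32: the smin/smax against 32767/-32768
// becomes (VQMOVNs undef:v8i16, x, 0), which is reinterpreted back to v4i32
// and sign-extended in-register from v4i16.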
18024 SDValue VQMOVN =
18025 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18026 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18027 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18028 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18029 DAG.getValueType(ExtVT));
18030 }
18031
18032 auto IsUnsignedSaturate = [&](SDNode *Min) {
18033 // For unsigned, we just need to check for <= 0xffff
18034 if (Min->getOpcode() != ISD::UMIN)
18035 return false;
18036
18037 APInt SaturateC;
18038 if (VT == MVT::v4i32)
18039 SaturateC = APInt(32, (1 << 16) - 1, true);
18040 else //if (VT == MVT::v8i16)
18041 SaturateC = APInt(16, (1 << 8) - 1, true);
18042
18043 APInt MinC;
18044 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18045 MinC != SaturateC)
18046 return false;
18047 return true;
18048 };
18049
18050 if (IsUnsignedSaturate(N)) {
18051 SDLoc DL(N);
18052 MVT HalfVT;
18053 unsigned ExtConst;
18054 if (VT == MVT::v4i32) {
18055 HalfVT = MVT::v8i16;
18056 ExtConst = 0x0000FFFF;
18057 } else { //if (VT == MVT::v8i16)
18058 HalfVT = MVT::v16i8;
18059 ExtConst = 0x00FF;
18060 }
18061
18062 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18063 // an AND. That extend will hopefully be removed if only the bottom bits are
18064 // demanded (through a truncating store, for example).
18065 SDValue VQMOVN =
18066 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18067 DAG.getConstant(0, DL, MVT::i32));
18068 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18069 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18070 DAG.getConstant(ExtConst, DL, VT));
18071 }
18072
18073 return SDValue();
18074}
18075
18076 static const APInt *isPowerOf2Constant(SDValue V) {
18077 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
18078 if (!C)
18079 return nullptr;
18080 const APInt *CV = &C->getAPIntValue();
18081 return CV->isPowerOf2() ? CV : nullptr;
18082}
18083 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV,
18084 SelectionDAG &DAG) const {
18085 // If we have a CMOV, OR and AND combination such as:
18086 // if (x & CN)
18087 // y |= CM;
18088 //
18089 // And:
18090 // * CN is a single bit;
18091 // * All bits covered by CM are known zero in y
18092 //
18093 // Then we can convert this into a sequence of BFI instructions. This will
18094 // always be a win if CM is a single bit, will always be no worse than the
18095 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18096 // three bits (due to the extra IT instruction).
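// Worked example (illustrative, assuming bits 4-5 of y are known zero):
//   if (x & 4) y |= 0x30;
// Here CN = 4 (a single bit) and CM = 0x30, so x is shifted right by 2 and the
// tested bit is copied into bits 4 and 5 of y with two BFIs, avoiding the
// TST/ORR/IT sequence.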
18097
18098 SDValue Op0 = CMOV->getOperand(0);
18099 SDValue Op1 = CMOV->getOperand(1);
18100 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18101 SDValue CmpZ = CMOV->getOperand(3);
18102
18103 // The compare must be against zero.
18104 if (!isNullConstant(CmpZ->getOperand(1)))
18105 return SDValue();
18106
18107 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18108 SDValue And = CmpZ->getOperand(0);
18109 if (And->getOpcode() != ISD::AND)
18110 return SDValue();
18111 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18112 if (!AndC)
18113 return SDValue();
18114 SDValue X = And->getOperand(0);
18115
18116 if (CC == ARMCC::EQ) {
18117 // We're performing an "equal to zero" compare. Swap the operands so we
18118 // canonicalize on a "not equal to zero" compare.
18119 std::swap(Op0, Op1);
18120 } else {
18121 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18122 }
18123
18124 if (Op1->getOpcode() != ISD::OR)
18125 return SDValue();
18126
18127 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
18128 if (!OrC)
18129 return SDValue();
18130 SDValue Y = Op1->getOperand(0);
18131
18132 if (Op0 != Y)
18133 return SDValue();
18134
18135 // Now, is it profitable to continue?
18136 APInt OrCI = OrC->getAPIntValue();
18137 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18138 if (OrCI.popcount() > Heuristic)
18139 return SDValue();
18140
18141 // Lastly, can we determine that the bits defined by OrCI
18142 // are zero in Y?
18143 KnownBits Known = DAG.computeKnownBits(Y);
18144 if ((OrCI & Known.Zero) != OrCI)
18145 return SDValue();
18146
18147 // OK, we can do the combine.
18148 SDValue V = Y;
18149 SDLoc dl(X);
18150 EVT VT = X.getValueType();
18151 unsigned BitInX = AndC->logBase2();
18152
18153 if (BitInX != 0) {
18154 // We must shift X first.
18155 X = DAG.getNode(ISD::SRL, dl, VT, X,
18156 DAG.getConstant(BitInX, dl, VT));
18157 }
18158
18159 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18160 BitInY < NumActiveBits; ++BitInY) {
18161 if (OrCI[BitInY] == 0)
18162 continue;
18163 APInt Mask(VT.getSizeInBits(), 0);
18164 Mask.setBit(BitInY);
18165 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18166 // Confusingly, the operand is an *inverted* mask.
18167 DAG.getConstant(~Mask, dl, VT));
18168 }
18169
18170 return V;
18171}
18172
18173// Given N, the value controlling the conditional branch, search for the loop
18174// intrinsic, returning it, along with how the value is used. We need to handle
18175// patterns such as the following:
18176// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18177// (brcond (setcc (loop.decrement), 0, eq), exit)
18178// (brcond (setcc (loop.decrement), 0, ne), header)
18180 bool &Negate) {
18181 switch (N->getOpcode()) {
18182 default:
18183 break;
18184 case ISD::XOR: {
18185 if (!isa<ConstantSDNode>(N.getOperand(1)))
18186 return SDValue();
18187 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18188 return SDValue();
18189 Negate = !Negate;
18190 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18191 }
18192 case ISD::SETCC: {
18193 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18194 if (!Const)
18195 return SDValue();
18196 if (Const->isZero())
18197 Imm = 0;
18198 else if (Const->isOne())
18199 Imm = 1;
18200 else
18201 return SDValue();
18202 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18203 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18204 }
18205 case ISD::INTRINSIC_W_CHAIN: {
18206 unsigned IntOp = N.getConstantOperandVal(1);
18207 if (IntOp != Intrinsic::test_start_loop_iterations &&
18208 IntOp != Intrinsic::loop_decrement_reg)
18209 return SDValue();
18210 return N;
18211 }
18212 }
18213 return SDValue();
18214}
18215
18216 static SDValue PerformHWLoopCombine(SDNode *N,
18217 TargetLowering::DAGCombinerInfo &DCI,
18218 const ARMSubtarget *ST) {
18219
18220 // The hwloop intrinsics that we're interested are used for control-flow,
18221 // either for entering or exiting the loop:
18222 // - test.start.loop.iterations will test whether its operand is zero. If it
18223 // is zero, the following branch should not enter the loop.
18224 // - loop.decrement.reg also tests whether its operand is zero. If it is
18225 // zero, the following branch should not branch back to the beginning of
18226 // the loop.
18227 // So here, we need to check how the brcond is using the result of each
18228 // of the intrinsics to ensure that we're branching to the right place at the
18229 // right time.
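// For example (an illustrative sketch, not from the source): a guard of the
// form (brcond (setcc (test.start.loop.iterations %n), 0, eq), %exit) becomes
// (WLS chain, (WLSSETUP %n), %exit), and a latch of the form
// (br_cc (loop.decrement.reg %lr, %step), 0, ne, %header) becomes
// (LE chain, (LOOP_DEC %lr, %step), %header).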
18230
18231 ISD::CondCode CC;
18232 SDValue Cond;
18233 int Imm = 1;
18234 bool Negate = false;
18235 SDValue Chain = N->getOperand(0);
18236 SDValue Dest;
18237
18238 if (N->getOpcode() == ISD::BRCOND) {
18239 CC = ISD::SETEQ;
18240 Cond = N->getOperand(1);
18241 Dest = N->getOperand(2);
18242 } else {
18243 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18244 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18245 Cond = N->getOperand(2);
18246 Dest = N->getOperand(4);
18247 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18248 if (!Const->isOne() && !Const->isZero())
18249 return SDValue();
18250 Imm = Const->getZExtValue();
18251 } else
18252 return SDValue();
18253 }
18254
18255 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18256 if (!Int)
18257 return SDValue();
18258
18259 if (Negate)
18260 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18261
18262 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18263 return (CC == ISD::SETEQ && Imm == 0) ||
18264 (CC == ISD::SETNE && Imm == 1) ||
18265 (CC == ISD::SETLT && Imm == 1) ||
18266 (CC == ISD::SETULT && Imm == 1);
18267 };
18268
18269 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18270 return (CC == ISD::SETEQ && Imm == 1) ||
18271 (CC == ISD::SETNE && Imm == 0) ||
18272 (CC == ISD::SETGT && Imm == 0) ||
18273 (CC == ISD::SETUGT && Imm == 0) ||
18274 (CC == ISD::SETGE && Imm == 1) ||
18275 (CC == ISD::SETUGE && Imm == 1);
18276 };
18277
18278 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18279 "unsupported condition");
18280
18281 SDLoc dl(Int);
18282 SelectionDAG &DAG = DCI.DAG;
18283 SDValue Elements = Int.getOperand(2);
18284 unsigned IntOp = Int->getConstantOperandVal(1);
18285 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18286 "expected single br user");
18287 SDNode *Br = *N->user_begin();
18288 SDValue OtherTarget = Br->getOperand(1);
18289
18290 // Update the unconditional branch to branch to the given Dest.
18291 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18292 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18293 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18294 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18295 };
18296
18297 if (IntOp == Intrinsic::test_start_loop_iterations) {
18298 SDValue Res;
18299 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18300 // We expect this 'instruction' to branch when the counter is zero.
18301 if (IsTrueIfZero(CC, Imm)) {
18302 SDValue Ops[] = {Chain, Setup, Dest};
18303 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18304 } else {
18305 // The logic is the reverse of what we need for WLS, so find the other
18306 // basic block target: the target of the following br.
18307 UpdateUncondBr(Br, Dest, DAG);
18308
18309 SDValue Ops[] = {Chain, Setup, OtherTarget};
18310 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18311 }
18312 // Update LR count to the new value
18313 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18314 // Update chain
18315 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18316 return Res;
18317 } else {
18318 SDValue Size =
18319 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18320 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18321 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18322 DAG.getVTList(MVT::i32, MVT::Other), Args);
18323 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18324
18325 // We expect this instruction to branch when the count is not zero.
18326 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18327
18328 // Update the unconditional branch to target the loop preheader if we've
18329 // found the condition has been reversed.
18330 if (Target == OtherTarget)
18331 UpdateUncondBr(Br, Dest, DAG);
18332
18333 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18334 SDValue(LoopDec.getNode(), 1), Chain);
18335
18336 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18337 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18338 }
18339 return SDValue();
18340}
18341
18342/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18343SDValue
18344 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18345 SDValue Cmp = N->getOperand(3);
18346 if (Cmp.getOpcode() != ARMISD::CMPZ)
18347 // Only looking at NE cases.
18348 return SDValue();
18349
18350 SDLoc dl(N);
18351 SDValue LHS = Cmp.getOperand(0);
18352 SDValue RHS = Cmp.getOperand(1);
18353 SDValue Chain = N->getOperand(0);
18354 SDValue BB = N->getOperand(1);
18355 SDValue ARMcc = N->getOperand(2);
18356 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18357
18358 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18359 // -> (brcond Chain BB CC Flags)
18360 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18361 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18362 LHS->getOperand(0)->hasOneUse() &&
18363 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18364 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18365 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18366 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18367 LHS->getOperand(0)->getOperand(2),
18368 LHS->getOperand(0)->getOperand(3));
18369 }
18370
18371 return SDValue();
18372}
18373
18374/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18375SDValue
18376 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18377 SDValue Cmp = N->getOperand(3);
18378 if (Cmp.getOpcode() != ARMISD::CMPZ)
18379 // Only looking at EQ and NE cases.
18380 return SDValue();
18381
18382 EVT VT = N->getValueType(0);
18383 SDLoc dl(N);
18384 SDValue LHS = Cmp.getOperand(0);
18385 SDValue RHS = Cmp.getOperand(1);
18386 SDValue FalseVal = N->getOperand(0);
18387 SDValue TrueVal = N->getOperand(1);
18388 SDValue ARMcc = N->getOperand(2);
18389 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18390
18391 // BFI is only available on V6T2+.
18392 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18393 SDValue R = PerformCMOVToBFICombine(N, DAG);
18394 if (R)
18395 return R;
18396 }
18397
18398 // Simplify
18399 // mov r1, r0
18400 // cmp r1, x
18401 // mov r0, y
18402 // moveq r0, x
18403 // to
18404 // cmp r0, x
18405 // movne r0, y
18406 //
18407 // mov r1, r0
18408 // cmp r1, x
18409 // mov r0, x
18410 // movne r0, y
18411 // to
18412 // cmp r0, x
18413 // movne r0, y
18414 /// FIXME: Turn this into a target neutral optimization?
18415 SDValue Res;
18416 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18417 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18418 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18419 SDValue ARMcc;
18420 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18421 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18422 }
18423
18424 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18425 // -> (cmov F T CC Flags)
18426 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18427 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18428 isNullConstant(RHS)) {
18429 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18430 LHS->getOperand(2), LHS->getOperand(3));
18431 }
18432
18433 if (!VT.isInteger())
18434 return SDValue();
18435
18436 // Fold away an unnecessary CMPZ/CMOV
18437 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18438 // if C1==EQ -> CMOV A, B, C2, D
18439 // if C1==NE -> CMOV A, B, NOT(C2), D
18440 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18441 N->getConstantOperandVal(2) == ARMCC::NE) {
18442 ARMCC::CondCodes Cond;
18443 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18444 if (N->getConstantOperandVal(2) == ARMCC::NE)
18445 Cond = ARMCC::getOppositeCondition(Cond);
18446 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18447 N->getOperand(1),
18448 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18449 }
18450 }
18451
18452 // Materialize a boolean comparison for integers so we can avoid branching.
18453 if (isNullConstant(FalseVal)) {
18454 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18455 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18456 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18457 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18458 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
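// Reasoning (illustrative): x - y is 0 exactly when x == y; CLZ(0) is 32
// (0b100000), so shifting right by 5 gives 1, while any non-zero difference
// has CLZ <= 31 and bit 5 clear, giving 0.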
18459 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18460 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18461 DAG.getConstant(5, dl, MVT::i32));
18462 } else {
18463 // CMOV 0, 1, ==, (CMPZ x, y) ->
18464 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18465 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18466 //
18467 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18468 // x != y. In other words, a carry C == 1 when x == y, C == 0
18469 // otherwise.
18470 // The final UADDO_CARRY computes
18471 // x - y + (0 - (x - y)) + C == C
18472 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18473 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18474 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18475 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18476 // actually.
18477 SDValue Carry =
18478 DAG.getNode(ISD::SUB, dl, MVT::i32,
18479 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18480 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18481 }
18482 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18483 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18484 // This seems pointless but will allow us to combine it further below.
18485 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18486 SDValue Sub =
18487 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18488 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18489 Sub.getValue(1));
18490 FalseVal = Sub;
18491 }
18492 } else if (isNullConstant(TrueVal)) {
18493 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18494 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18495 // This seems pointless but will allow us to combine it further below
18496 // Note that we change == for != as this is the dual for the case above.
18497 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18498 SDValue Sub =
18499 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18500 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18501 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18502 Sub.getValue(1));
18503 FalseVal = Sub;
18504 }
18505 }
18506
18507 // On Thumb1, the DAG above may be further combined if z is a power of 2
18508 // (z == 2 ^ K).
18509 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18510 // t1 = (USUBO (SUB x, y), 1)
18511 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18512 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18513 //
18514 // This also handles the special case of comparing against zero; it's
18515 // essentially the same pattern, except there's no SUBC:
18516 // CMOV x, z, !=, (CMPZ x, 0) ->
18517 // t1 = (USUBO x, 1)
18518 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18519 // Result = if K != 0 then (SHL t2:0, K) else t2:0
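// Concretely (illustrative), for r = (x != y) ? 4 : 0 on Thumb1: with
// sub = x - y, the USUBO (sub - 1) borrows only when sub == 0, so the
// USUBO_CARRY result sub - (sub - 1) - borrow is 1 when x != y and 0
// otherwise, and the final SHL by K = 2 produces 4 or 0 without a branch.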
18520 const APInt *TrueConst;
18521 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18522 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18523 FalseVal.getOperand(1) == RHS) ||
18524 (FalseVal == LHS && isNullConstant(RHS))) &&
18525 (TrueConst = isPowerOf2Constant(TrueVal))) {
18526 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18527 unsigned ShiftAmount = TrueConst->logBase2();
18528 if (ShiftAmount)
18529 TrueVal = DAG.getConstant(1, dl, VT);
18530 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18531 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18532 Subc.getValue(1));
18533
18534 if (ShiftAmount)
18535 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18536 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18537 }
18538
18539 if (Res.getNode()) {
18540 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18541 // Capture demanded bits information that would be otherwise lost.
18542 if (Known.Zero == 0xfffffffe)
18543 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18544 DAG.getValueType(MVT::i1));
18545 else if (Known.Zero == 0xffffff00)
18546 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18547 DAG.getValueType(MVT::i8));
18548 else if (Known.Zero == 0xffff0000)
18549 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18550 DAG.getValueType(MVT::i16));
18551 }
18552
18553 return Res;
18554}
18555
18556 static SDValue PerformBITCASTCombine(SDNode *N,
18557 TargetLowering::DAGCombinerInfo &DCI,
18558 const ARMSubtarget *ST) {
18559 SelectionDAG &DAG = DCI.DAG;
18560 SDValue Src = N->getOperand(0);
18561 EVT DstVT = N->getValueType(0);
18562
18563 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18564 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18565 EVT SrcVT = Src.getValueType();
18566 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18567 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18568 }
18569
18570 // We may have a bitcast of something that has already had this bitcast
18571 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18572 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18573 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18574 Src.getValueType().getScalarSizeInBits())
18575 Src = Src.getOperand(0);
18576
18577 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18578 // would be generated is at least the width of the element type.
18579 EVT SrcVT = Src.getValueType();
18580 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18581 Src.getOpcode() == ARMISD::VMVNIMM ||
18582 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18583 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18584 DAG.getDataLayout().isBigEndian())
18585 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18586
18587 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18588 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18589 return R;
18590
18591 return SDValue();
18592}
18593
18594// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18595// node into stack operations after legalizeOps.
18596 static SDValue PerformMVETruncCombine(SDNode *N,
18597 TargetLowering::DAGCombinerInfo &DCI) {
18598 SelectionDAG &DAG = DCI.DAG;
18599 EVT VT = N->getValueType(0);
18600 SDLoc DL(N);
18601
18602 // MVETrunc(Undef, Undef) -> Undef
18603 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18604 return DAG.getUNDEF(VT);
18605
18606 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18607 if (N->getNumOperands() == 2 &&
18608 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18609 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18610 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18611 N->getOperand(0).getOperand(1),
18612 N->getOperand(1).getOperand(0),
18613 N->getOperand(1).getOperand(1));
18614
18615 // MVETrunc(shuffle, shuffle) -> VMOVN
18616 if (N->getNumOperands() == 2 &&
18617 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18618 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18619 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18620 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18621
18622 if (S0->getOperand(0) == S1->getOperand(0) &&
18623 S0->getOperand(1) == S1->getOperand(1)) {
18624 // Construct complete shuffle mask
18625 SmallVector<int, 8> Mask(S0->getMask());
18626 Mask.append(S1->getMask().begin(), S1->getMask().end());
18627
18628 if (isVMOVNTruncMask(Mask, VT, false))
18629 return DAG.getNode(
18630 ARMISD::VMOVN, DL, VT,
18631 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18632 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18633 DAG.getConstant(1, DL, MVT::i32));
18634 if (isVMOVNTruncMask(Mask, VT, true))
18635 return DAG.getNode(
18636 ARMISD::VMOVN, DL, VT,
18637 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18638 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18639 DAG.getConstant(1, DL, MVT::i32));
18640 }
18641 }
18642
18643 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18644 // truncate to a buildvector to allow the generic optimisations to kick in.
18645 if (all_of(N->ops(), [](SDValue Op) {
18646 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18647 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18648 (Op.getOpcode() == ISD::BITCAST &&
18649 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18650 })) {
18651 SmallVector<SDValue, 8> Extracts;
18652 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18653 SDValue O = N->getOperand(Op);
18654 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18655 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18656 DAG.getConstant(i, DL, MVT::i32));
18657 Extracts.push_back(Ext);
18658 }
18659 }
18660 return DAG.getBuildVector(VT, DL, Extracts);
18661 }
18662
18663 // If we are late in the legalization process and nothing has optimised
18664 // the trunc to anything better, lower it to a stack store and reload,
18665 // performing the truncation whilst keeping the lanes in the correct order:
18666 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
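// Layout sketch (illustrative) for MVETRUNC(v4i32 a, v4i32 b) -> v8i16: each
// input is truncating-stored as v4i16 into one 8-byte half of a 16-byte stack
// slot, and a single full-width load reinterprets the slot as v8i16, keeping
// the lane order without any interleaving shuffles.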
18667 if (!DCI.isAfterLegalizeDAG())
18668 return SDValue();
18669
18670 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18671 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18672 int NumIns = N->getNumOperands();
18673 assert((NumIns == 2 || NumIns == 4) &&
18674 "Expected 2 or 4 inputs to an MVETrunc");
18675 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18676 if (N->getNumOperands() == 4)
18677 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18678
18679 SmallVector<SDValue> Chains;
18680 for (int I = 0; I < NumIns; I++) {
18681 SDValue Ptr = DAG.getNode(
18682 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18683 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18684 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18685 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18686 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18687 Ptr, MPI, StoreVT, Align(4));
18688 Chains.push_back(Ch);
18689 }
18690
18691 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18692 MachinePointerInfo MPI =
18693 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18694 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18695}
18696
18697// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
18698 static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18699 SelectionDAG &DAG) {
18700 SDValue N0 = N->getOperand(0);
18701 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18702 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18703 return SDValue();
18704
18705 EVT FromVT = LD->getMemoryVT();
18706 EVT ToVT = N->getValueType(0);
18707 if (!ToVT.isVector())
18708 return SDValue();
18709 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18710 EVT ToEltVT = ToVT.getVectorElementType();
18711 EVT FromEltVT = FromVT.getVectorElementType();
18712
18713 unsigned NumElements = 0;
18714 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18715 NumElements = 4;
18716 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18717 NumElements = 8;
18718 assert(NumElements != 0);
18719
18720 ISD::LoadExtType NewExtType =
18721 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18722 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18723 LD->getExtensionType() != ISD::EXTLOAD &&
18724 LD->getExtensionType() != NewExtType)
18725 return SDValue();
18726
18727 LLVMContext &C = *DAG.getContext();
18728 SDLoc DL(LD);
18729 // Details about the old load
18730 SDValue Ch = LD->getChain();
18731 SDValue BasePtr = LD->getBasePtr();
18732 Align Alignment = LD->getBaseAlign();
18733 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18734 AAMDNodes AAInfo = LD->getAAInfo();
18735
18736 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18737 EVT NewFromVT = EVT::getVectorVT(
18738 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18739 EVT NewToVT = EVT::getVectorVT(
18740 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18741
18742 SmallVector<SDValue, 4> Loads;
18743 SmallVector<SDValue, 4> Chains;
18744 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18745 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18746 SDValue NewPtr =
18747 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18748
18749 SDValue NewLoad =
18750 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18751 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18752 Alignment, MMOFlags, AAInfo);
18753 Loads.push_back(NewLoad);
18754 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18755 }
18756
18757 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18758 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18759 return DAG.getMergeValues(Loads, DL);
18760}
18761
18762 // Perform combines for MVEEXT. If it has not been optimized to anything better
18763// before lowering, it gets converted to stack store and extloads performing the
18764// extend whilst still keeping the same lane ordering.
18765 static SDValue PerformMVEExtCombine(SDNode *N,
18766 TargetLowering::DAGCombinerInfo &DCI) {
18767 SelectionDAG &DAG = DCI.DAG;
18768 EVT VT = N->getValueType(0);
18769 SDLoc DL(N);
18770 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18771 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18772
18773 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18774 *DAG.getContext());
18775 auto Extend = [&](SDValue V) {
18776 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18777 return N->getOpcode() == ARMISD::MVESEXT
18778 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18779 DAG.getValueType(ExtVT))
18780 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18781 };
18782
18783 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18784 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18785 SDValue Ext = Extend(N->getOperand(0));
18786 return DAG.getMergeValues({Ext, Ext}, DL);
18787 }
18788
18789 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18790 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18791 ArrayRef<int> Mask = SVN->getMask();
18792 assert(Mask.size() == 2 * VT.getVectorNumElements());
18793 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18794 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18795 SDValue Op0 = SVN->getOperand(0);
18796 SDValue Op1 = SVN->getOperand(1);
18797
18798 auto CheckInregMask = [&](int Start, int Offset) {
18799 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18800 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18801 return false;
18802 return true;
18803 };
18804 SDValue V0 = SDValue(N, 0);
18805 SDValue V1 = SDValue(N, 1);
18806 if (CheckInregMask(0, 0))
18807 V0 = Extend(Op0);
18808 else if (CheckInregMask(0, 1))
18809 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18810 else if (CheckInregMask(0, Mask.size()))
18811 V0 = Extend(Op1);
18812 else if (CheckInregMask(0, Mask.size() + 1))
18813 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18814
18815 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18816 V1 = Extend(Op1);
18817 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18818 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18819 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18820 V1 = Extend(Op0);
18821 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18822 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18823
18824 if (V0.getNode() != N || V1.getNode() != N)
18825 return DAG.getMergeValues({V0, V1}, DL);
18826 }
18827
18828 // MVEEXT(load) -> extload, extload
18829 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18830 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18831 return L;
18832
18833 if (!DCI.isAfterLegalizeDAG())
18834 return SDValue();
18835
18836 // Lower to a stack store and reload:
18837 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
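// Mirror of the MVETRUNC lowering above (illustrative): the narrow vector is
// stored once to a 16-byte slot and each wide result is produced by an
// extending load of half of the slot, so the extend is folded into the
// VLDRH/VLDRB itself.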
18838 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18839 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18840 int NumOuts = N->getNumValues();
18841 assert((NumOuts == 2 || NumOuts == 4) &&
18842 "Expected 2 or 4 outputs to an MVEEXT");
18843 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18844 *DAG.getContext());
18845 if (N->getNumOperands() == 4)
18846 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18847
18848 MachinePointerInfo MPI =
18849 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18850 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18851 StackPtr, MPI, Align(4));
18852
18853 SmallVector<SDValue> Loads;
18854 for (int I = 0; I < NumOuts; I++) {
18855 SDValue Ptr = DAG.getNode(
18856 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18857 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18858 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18859 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18860 SDValue Load = DAG.getExtLoad(
18861 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18862 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18863 Loads.push_back(Load);
18864 }
18865
18866 return DAG.getMergeValues(Loads, DL);
18867}
18868
18869 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18870 DAGCombinerInfo &DCI) const {
18871 switch (N->getOpcode()) {
18872 default: break;
18873 case ISD::SELECT_CC:
18874 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18875 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18876 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18877 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18878 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18879 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18880 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18881 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18882 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18883 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18884 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18885 case ISD::BRCOND:
18886 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18887 case ARMISD::ADDC:
18888 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18889 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18890 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18891 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18892 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18893 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18894 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18895 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18896 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18898 case ISD::EXTRACT_VECTOR_ELT:
18899 return PerformExtractEltCombine(N, DCI, Subtarget);
18903 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18904 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18905 case ISD::FP_TO_SINT:
18906 case ISD::FP_TO_UINT:
18907 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18908 case ISD::FADD:
18909 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18910 case ISD::FMUL:
18911 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18912 case ISD::INTRINSIC_WO_CHAIN:
18913 return PerformIntrinsicCombine(N, DCI);
18914 case ISD::SHL:
18915 case ISD::SRA:
18916 case ISD::SRL:
18917 return PerformShiftCombine(N, DCI, Subtarget);
18918 case ISD::SIGN_EXTEND:
18919 case ISD::ZERO_EXTEND:
18920 case ISD::ANY_EXTEND:
18921 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18922 case ISD::FP_EXTEND:
18923 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18924 case ISD::SMIN:
18925 case ISD::UMIN:
18926 case ISD::SMAX:
18927 case ISD::UMAX:
18928 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18929 case ARMISD::CMOV:
18930 return PerformCMOVCombine(N, DCI.DAG);
18931 case ARMISD::BRCOND:
18932 return PerformBRCONDCombine(N, DCI.DAG);
18933 case ARMISD::CMPZ:
18934 return PerformCMPZCombine(N, DCI.DAG);
18935 case ARMISD::CSINC:
18936 case ARMISD::CSINV:
18937 case ARMISD::CSNEG:
18938 return PerformCSETCombine(N, DCI.DAG);
18939 case ISD::LOAD:
18940 return PerformLOADCombine(N, DCI, Subtarget);
18941 case ARMISD::VLD1DUP:
18942 case ARMISD::VLD2DUP:
18943 case ARMISD::VLD3DUP:
18944 case ARMISD::VLD4DUP:
18945 return PerformVLDCombine(N, DCI);
18946 case ARMISD::BUILD_VECTOR:
18947 return PerformARMBUILD_VECTORCombine(N, DCI);
18948 case ISD::BITCAST:
18949 return PerformBITCASTCombine(N, DCI, Subtarget);
18950 case ARMISD::PREDICATE_CAST:
18951 return PerformPREDICATE_CASTCombine(N, DCI);
18952 case ARMISD::VECTOR_REG_CAST:
18953 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18954 case ARMISD::MVETRUNC:
18955 return PerformMVETruncCombine(N, DCI);
18956 case ARMISD::MVESEXT:
18957 case ARMISD::MVEZEXT:
18958 return PerformMVEExtCombine(N, DCI);
18959 case ARMISD::VCMP:
18960 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18961 case ISD::VECREDUCE_ADD:
18962 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18963 case ARMISD::VADDVs:
18964 case ARMISD::VADDVu:
18965 case ARMISD::VADDLVs:
18966 case ARMISD::VADDLVu:
18967 case ARMISD::VADDLVAs:
18968 case ARMISD::VADDLVAu:
18969 case ARMISD::VMLAVs:
18970 case ARMISD::VMLAVu:
18971 case ARMISD::VMLALVs:
18972 case ARMISD::VMLALVu:
18973 case ARMISD::VMLALVAs:
18974 case ARMISD::VMLALVAu:
18975 return PerformReduceShuffleCombine(N, DCI.DAG);
18976 case ARMISD::VMOVN:
18977 return PerformVMOVNCombine(N, DCI);
18978 case ARMISD::VQMOVNs:
18979 case ARMISD::VQMOVNu:
18980 return PerformVQMOVNCombine(N, DCI);
18981 case ARMISD::VQDMULH:
18982 return PerformVQDMULHCombine(N, DCI);
18983 case ARMISD::ASRL:
18984 case ARMISD::LSRL:
18985 case ARMISD::LSLL:
18986 return PerformLongShiftCombine(N, DCI.DAG);
18987 case ARMISD::SMULWB: {
18988 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18989 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18990 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18991 return SDValue();
18992 break;
18993 }
18994 case ARMISD::SMULWT: {
18995 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18996 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18997 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18998 return SDValue();
18999 break;
19000 }
19001 case ARMISD::SMLALBB:
19002 case ARMISD::QADD16b:
19003 case ARMISD::QSUB16b:
19004 case ARMISD::UQADD16b:
19005 case ARMISD::UQSUB16b: {
19006 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19007 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19008 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19009 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19010 return SDValue();
19011 break;
19012 }
19013 case ARMISD::SMLALBT: {
19014 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19015 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19016 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19017 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19018 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19019 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19020 return SDValue();
19021 break;
19022 }
19023 case ARMISD::SMLALTB: {
19024 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19025 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19026 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19027 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19028 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19029 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19030 return SDValue();
19031 break;
19032 }
19033 case ARMISD::SMLALTT: {
19034 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19035 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19036 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19037 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19038 return SDValue();
19039 break;
19040 }
19041 case ARMISD::QADD8b:
19042 case ARMISD::QSUB8b:
19043 case ARMISD::UQADD8b:
19044 case ARMISD::UQSUB8b: {
19045 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19046 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19047 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19048 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19049 return SDValue();
19050 break;
19051 }
19052 case ARMISD::VBSP:
19053 if (N->getOperand(1) == N->getOperand(2))
19054 return N->getOperand(1);
19055 return SDValue();
19056 case ISD::INTRINSIC_VOID:
19057 case ISD::INTRINSIC_W_CHAIN:
19058 switch (N->getConstantOperandVal(1)) {
19059 case Intrinsic::arm_neon_vld1:
19060 case Intrinsic::arm_neon_vld1x2:
19061 case Intrinsic::arm_neon_vld1x3:
19062 case Intrinsic::arm_neon_vld1x4:
19063 case Intrinsic::arm_neon_vld2:
19064 case Intrinsic::arm_neon_vld3:
19065 case Intrinsic::arm_neon_vld4:
19066 case Intrinsic::arm_neon_vld2lane:
19067 case Intrinsic::arm_neon_vld3lane:
19068 case Intrinsic::arm_neon_vld4lane:
19069 case Intrinsic::arm_neon_vld2dup:
19070 case Intrinsic::arm_neon_vld3dup:
19071 case Intrinsic::arm_neon_vld4dup:
19072 case Intrinsic::arm_neon_vst1:
19073 case Intrinsic::arm_neon_vst1x2:
19074 case Intrinsic::arm_neon_vst1x3:
19075 case Intrinsic::arm_neon_vst1x4:
19076 case Intrinsic::arm_neon_vst2:
19077 case Intrinsic::arm_neon_vst3:
19078 case Intrinsic::arm_neon_vst4:
19079 case Intrinsic::arm_neon_vst2lane:
19080 case Intrinsic::arm_neon_vst3lane:
19081 case Intrinsic::arm_neon_vst4lane:
19082 return PerformVLDCombine(N, DCI);
19083 case Intrinsic::arm_mve_vld2q:
19084 case Intrinsic::arm_mve_vld4q:
19085 case Intrinsic::arm_mve_vst2q:
19086 case Intrinsic::arm_mve_vst4q:
19087 return PerformMVEVLDCombine(N, DCI);
19088 default: break;
19089 }
19090 break;
19091 }
19092 return SDValue();
19093}
19094
19095 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19096 EVT VT) const {
19097 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19098}
19099
19100 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19101 Align Alignment,
19102 MachineMemOperand::Flags,
19103 unsigned *Fast) const {
19104 // Depends what it gets converted into if the type is weird.
19105 if (!VT.isSimple())
19106 return false;
19107
19108 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19109 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19110 auto Ty = VT.getSimpleVT().SimpleTy;
19111
19112 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19113 // Unaligned access can use (for example) LDRB, LDRH, LDR
19114 if (AllowsUnaligned) {
19115 if (Fast)
19116 *Fast = Subtarget->hasV7Ops();
19117 return true;
19118 }
19119 }
19120
19121 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19122 // For any little-endian targets with neon, we can support unaligned ld/st
19123 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19124 // A big-endian target may also explicitly support unaligned accesses
19125 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19126 if (Fast)
19127 *Fast = 1;
19128 return true;
19129 }
19130 }
19131
19132 if (!Subtarget->hasMVEIntegerOps())
19133 return false;
19134
19135 // These are for predicates
19136 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19137 Ty == MVT::v2i1)) {
19138 if (Fast)
19139 *Fast = 1;
19140 return true;
19141 }
19142
19143 // These are for truncated stores/narrowing loads. They are fine so long as
19144 // the alignment is at least the size of the item being loaded
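// For example, under this rule a v4i16 narrowing load or truncating store only
// needs 2-byte (element-size) alignment, not the full vector-width alignment.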
19145 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19146 Alignment >= VT.getScalarSizeInBits() / 8) {
19147 if (Fast)
19148 *Fast = true;
19149 return true;
19150 }
19151
19152 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19153 // VSTRW.U32 all store the vector register in exactly the same format, and
19154 // differ only in the range of their immediate offset field and the required
19155 // alignment. So there is always a store that can be used, regardless of
19156 // actual type.
19157 //
19158 // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
19159 // VREV64.8) pair and get the same effect. This will likely be better than
19160 // aligning the vector through the stack.
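// For example, an unaligned v4i32 store can be emitted as a VSTRB.U8 of the
// same register contents (plus a VREV on big-endian targets), rather than
// realigning the value through the stack.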
19161 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19162 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19163 Ty == MVT::v2f64) {
19164 if (Fast)
19165 *Fast = 1;
19166 return true;
19167 }
19168
19169 return false;
19170}
19171
19172 EVT ARMTargetLowering::getOptimalMemOpType(
19173 LLVMContext &Context, const MemOp &Op,
19174 const AttributeList &FuncAttributes) const {
19175 // See if we can use NEON instructions for this...
19176 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19177 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19178 unsigned Fast;
19179 if (Op.size() >= 16 &&
19180 (Op.isAligned(Align(16)) ||
19181 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19182 MachineMemOperand::MONone, &Fast) &&
19183 Fast))) {
19184 return MVT::v2f64;
19185 } else if (Op.size() >= 8 &&
19186 (Op.isAligned(Align(8)) ||
19187 (allowsMisalignedMemoryAccesses(
19188 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19189 Fast))) {
19190 return MVT::f64;
19191 }
19192 }
19193
19194 // Let the target-independent logic figure it out.
19195 return MVT::Other;
19196}
19197
19198// 64-bit integers are split into their high and low parts and held in two
19199// different registers, so the trunc is free since the low register can just
19200// be used.
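// For example, with an i64 value split across a register pair, truncating it
// to i32 simply reuses the register already holding the low 32 bits; no
// instruction needs to be emitted.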
19201bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19202 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19203 return false;
19204 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19205 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19206 return (SrcBits == 64 && DestBits == 32);
19207}
19208
19209 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19210 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19211 !DstVT.isInteger())
19212 return false;
19213 unsigned SrcBits = SrcVT.getSizeInBits();
19214 unsigned DestBits = DstVT.getSizeInBits();
19215 return (SrcBits == 64 && DestBits == 32);
19216}
19217
19218 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19219 if (Val.getOpcode() != ISD::LOAD)
19220 return false;
19221
19222 EVT VT1 = Val.getValueType();
19223 if (!VT1.isSimple() || !VT1.isInteger() ||
19224 !VT2.isSimple() || !VT2.isInteger())
19225 return false;
19226
19227 switch (VT1.getSimpleVT().SimpleTy) {
19228 default: break;
19229 case MVT::i1:
19230 case MVT::i8:
19231 case MVT::i16:
19232 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19233 return true;
19234 }
19235
19236 return false;
19237}
19238
19239 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19240 if (!VT.isSimple())
19241 return false;
19242
19243 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19244 // negate values directly (fneg is free). So, we don't want to let the DAG
19245 // combiner rewrite fneg into xors and some other instructions. For f16 and
19246 // FullFP16 argument passing, some bitcast nodes may be introduced,
19247 // triggering this DAG combine rewrite, so we are avoiding that with this.
19248 switch (VT.getSimpleVT().SimpleTy) {
19249 default: break;
19250 case MVT::f16:
19251 return Subtarget->hasFullFP16();
19252 }
19253
19254 return false;
19255}
19256
19257 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19258 if (!Subtarget->hasMVEIntegerOps())
19259 return nullptr;
19260 Type *SVIType = SVI->getType();
19261 Type *ScalarType = SVIType->getScalarType();
19262
19263 if (ScalarType->isFloatTy())
19264 return Type::getInt32Ty(SVIType->getContext());
19265 if (ScalarType->isHalfTy())
19266 return Type::getInt16Ty(SVIType->getContext());
19267 return nullptr;
19268}
19269
19270 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19271 EVT VT = ExtVal.getValueType();
19272
19273 if (!isTypeLegal(VT))
19274 return false;
19275
19276 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19277 if (Ld->isExpandingLoad())
19278 return false;
19279 }
19280
19281 if (Subtarget->hasMVEIntegerOps())
19282 return true;
19283
19284 // Don't create a loadext if we can fold the extension into a wide/long
19285 // instruction.
19286 // If there's more than one user instruction, the loadext is desirable no
19287 // matter what. There can be two uses by the same instruction.
19288 if (ExtVal->use_empty() ||
19289 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19290 return true;
19291
19292 SDNode *U = *ExtVal->user_begin();
19293 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19294 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19295 return false;
19296
19297 return true;
19298}
19299
19300 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19301 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19302 return false;
19303
19304 if (!isTypeLegal(EVT::getEVT(Ty1)))
19305 return false;
19306
19307 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19308
19309 // Assuming the caller doesn't have a zeroext or signext return parameter,
19310 // truncation all the way down to i1 is valid.
19311 return true;
19312}
19313
19314/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19315/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19316/// expanded to FMAs when this method returns true, otherwise fmuladd is
19317/// expanded to fmul + fadd.
19318///
19319/// ARM supports both fused and unfused multiply-add operations; we already
19320/// lower a pair of fmul and fadd to the latter so it's not clear that there
19321/// would be a gain or that the gain would be worthwhile enough to risk
19322/// correctness bugs.
19323///
19324/// For MVE, we set this to true as it helps simplify the need for some
19325/// patterns (and we don't have the non-fused floating point instruction).
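/// For example, with MVE float ops a v4f32 llvm.fmuladd is expanded to an FMA
/// node and selected as a single VFMA, rather than a VMUL followed by a VADD.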
19326bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19327 EVT VT) const {
19328 if (Subtarget->useSoftFloat())
19329 return false;
19330
19331 if (!VT.isSimple())
19332 return false;
19333
19334 switch (VT.getSimpleVT().SimpleTy) {
19335 case MVT::v4f32:
19336 case MVT::v8f16:
19337 return Subtarget->hasMVEFloatOps();
19338 case MVT::f16:
19339 return Subtarget->useFPVFMx16();
19340 case MVT::f32:
19341 return Subtarget->useFPVFMx();
19342 case MVT::f64:
19343 return Subtarget->useFPVFMx64();
19344 default:
19345 break;
19346 }
19347
19348 return false;
19349}
19350
19351static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19352 if (V < 0)
19353 return false;
19354
19355 unsigned Scale = 1;
19356 switch (VT.getSimpleVT().SimpleTy) {
19357 case MVT::i1:
19358 case MVT::i8:
19359 // Scale == 1;
19360 break;
19361 case MVT::i16:
19362 // Scale == 2;
19363 Scale = 2;
19364 break;
19365 default:
19366 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19367 // Scale == 4;
19368 Scale = 4;
19369 break;
19370 }
19371
19372 if ((V & (Scale - 1)) != 0)
19373 return false;
19374 return isUInt<5>(V / Scale);
19375}
19376
19377static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19378 const ARMSubtarget *Subtarget) {
19379 if (!VT.isInteger() && !VT.isFloatingPoint())
19380 return false;
19381 if (VT.isVector() && Subtarget->hasNEON())
19382 return false;
19383 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19384 !Subtarget->hasMVEFloatOps())
19385 return false;
19386
19387 bool IsNeg = false;
19388 if (V < 0) {
19389 IsNeg = true;
19390 V = -V;
19391 }
19392
19393 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19394
19395 // MVE: size * imm7
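// For example, for 32-bit elements the offset must be a multiple of 4 in the
// range [-508, 508]; for 16-bit elements a multiple of 2 in [-254, 254]; and
// for 8-bit elements any value in [-127, 127].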
19396 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19397 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19398 case MVT::i32:
19399 case MVT::f32:
19400 return isShiftedUInt<7,2>(V);
19401 case MVT::i16:
19402 case MVT::f16:
19403 return isShiftedUInt<7,1>(V);
19404 case MVT::i8:
19405 return isUInt<7>(V);
19406 default:
19407 return false;
19408 }
19409 }
19410
19411 // half VLDR: 2 * imm8
19412 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19413 return isShiftedUInt<8, 1>(V);
19414 // VLDR and LDRD: 4 * imm8
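// i.e. a multiple of 4 in the range [-1020, 1020].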
19415 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19416 return isShiftedUInt<8, 2>(V);
19417
19418 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19419 // + imm12 or - imm8
19420 if (IsNeg)
19421 return isUInt<8>(V);
19422 return isUInt<12>(V);
19423 }
19424
19425 return false;
19426}
19427
19428/// isLegalAddressImmediate - Return true if the integer value can be used
19429/// as the offset of the target addressing mode for load / store of the
19430/// given type.
19431static bool isLegalAddressImmediate(int64_t V, EVT VT,
19432 const ARMSubtarget *Subtarget) {
19433 if (V == 0)
19434 return true;
19435
19436 if (!VT.isSimple())
19437 return false;
19438
19439 if (Subtarget->isThumb1Only())
19440 return isLegalT1AddressImmediate(V, VT);
19441 else if (Subtarget->isThumb2())
19442 return isLegalT2AddressImmediate(V, VT, Subtarget);
19443
19444 // ARM mode.
19445 if (V < 0)
19446 V = - V;
19447 switch (VT.getSimpleVT().SimpleTy) {
19448 default: return false;
19449 case MVT::i1:
19450 case MVT::i8:
19451 case MVT::i32:
19452 // +- imm12
19453 return isUInt<12>(V);
19454 case MVT::i16:
19455 // +- imm8
19456 return isUInt<8>(V);
19457 case MVT::f32:
19458 case MVT::f64:
19459 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19460 return false;
19461 return isShiftedUInt<8, 2>(V);
19462 }
19463}
19464
19465 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19466 EVT VT) const {
19467 int Scale = AM.Scale;
19468 if (Scale < 0)
19469 return false;
19470
19471 switch (VT.getSimpleVT().SimpleTy) {
19472 default: return false;
19473 case MVT::i1:
19474 case MVT::i8:
19475 case MVT::i16:
19476 case MVT::i32:
19477 if (Scale == 1)
19478 return true;
19479 // r + r << imm
19480 Scale = Scale & ~1;
19481 return Scale == 2 || Scale == 4 || Scale == 8;
19482 case MVT::i64:
19483 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19484 // version in Thumb mode.
19485 // r + r
19486 if (Scale == 1)
19487 return true;
19488 // r * 2 (this can be lowered to r + r).
19489 if (!AM.HasBaseReg && Scale == 2)
19490 return true;
19491 return false;
19492 case MVT::isVoid:
19493 // Note, we allow "void" uses (basically, uses that aren't loads or
19494 // stores), because arm allows folding a scale into many arithmetic
19495 // operations. This should be made more precise and revisited later.
19496
19497 // Allow r << imm, but the imm has to be a multiple of two.
19498 if (Scale & 1) return false;
19499 return isPowerOf2_32(Scale);
19500 }
19501}
19502
19503 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19504 EVT VT) const {
19505 const int Scale = AM.Scale;
19506
19507 // Negative scales are not supported in Thumb1.
19508 if (Scale < 0)
19509 return false;
19510
19511 // Thumb1 addressing modes do not support register scaling excepting the
19512 // following cases:
19513 // 1. Scale == 1 means no scaling.
19514 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19515 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19516}
19517
19518/// isLegalAddressingMode - Return true if the addressing mode represented
19519/// by AM is legal for this target, for a load/store of the specified type.
19520 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19521 const AddrMode &AM, Type *Ty,
19522 unsigned AS, Instruction *I) const {
19523 EVT VT = getValueType(DL, Ty, true);
19524 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19525 return false;
19526
19527 // Can never fold addr of global into load/store.
19528 if (AM.BaseGV)
19529 return false;
19530
19531 switch (AM.Scale) {
19532 case 0: // no scale reg, must be "r+i" or "r", or "i".
19533 break;
19534 default:
19535 // ARM doesn't support any R+R*scale+imm addr modes.
19536 if (AM.BaseOffs)
19537 return false;
19538
19539 if (!VT.isSimple())
19540 return false;
19541
19542 if (Subtarget->isThumb1Only())
19543 return isLegalT1ScaledAddressingMode(AM, VT);
19544
19545 if (Subtarget->isThumb2())
19546 return isLegalT2ScaledAddressingMode(AM, VT);
19547
19548 int Scale = AM.Scale;
19549 switch (VT.getSimpleVT().SimpleTy) {
19550 default: return false;
19551 case MVT::i1:
19552 case MVT::i8:
19553 case MVT::i32:
19554 if (Scale < 0) Scale = -Scale;
19555 if (Scale == 1)
19556 return true;
19557 // r + r << imm
19558 return isPowerOf2_32(Scale & ~1);
19559 case MVT::i16:
19560 case MVT::i64:
19561 // r +/- r
19562 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19563 return true;
19564 // r * 2 (this can be lowered to r + r).
19565 if (!AM.HasBaseReg && Scale == 2)
19566 return true;
19567 return false;
19568
19569 case MVT::isVoid:
19570 // Note, we allow "void" uses (basically, uses that aren't loads or
19571 // stores), because arm allows folding a scale into many arithmetic
19572 // operations. This should be made more precise and revisited later.
19573
19574 // Allow r << imm, but the imm has to be a multiple of two.
19575 if (Scale & 1) return false;
19576 return isPowerOf2_32(Scale);
19577 }
19578 }
19579 return true;
19580}
19581
19582/// isLegalICmpImmediate - Return true if the specified immediate is legal
19583/// icmp immediate, that is the target has icmp instructions which can compare
19584/// a register against the immediate without having to materialize the
19585/// immediate into a register.
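/// For example, comparing a register against -1 is still legal in ARM and
/// Thumb2 mode because the comparison can be emitted as CMN with immediate 1.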
19586 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19587 // Thumb2 and ARM modes can use cmn for negative immediates.
19588 if (!Subtarget->isThumb())
19589 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19590 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19591 if (Subtarget->isThumb2())
19592 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19593 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19594 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19595 return Imm >= 0 && Imm <= 255;
19596}
19597
19598/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19599/// *or sub* immediate, that is the target has add or sub instructions which can
19600/// add a register with the immediate without having to materialize the
19601/// immediate into a register.
19602 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19603 // Same encoding for add/sub, just flip the sign.
19604 uint64_t AbsImm = AbsoluteValue(Imm);
19605 if (!Subtarget->isThumb())
19606 return ARM_AM::getSOImmVal(AbsImm) != -1;
19607 if (Subtarget->isThumb2())
19608 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19609 // Thumb1 only has 8-bit unsigned immediate.
19610 return AbsImm <= 255;
19611}
19612
19613// Return false to prevent folding
19614// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19615// if the folding leads to worse code.
19616 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19617 SDValue ConstNode) const {
19618 // Let the DAGCombiner decide for vector types and large types.
19619 const EVT VT = AddNode.getValueType();
19620 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19621 return true;
19622
19623 // It is worse if c0 is legal add immediate, while c1*c0 is not
19624 // and has to be composed by at least two instructions.
19625 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19626 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19627 const int64_t C0 = C0Node->getSExtValue();
19628 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19629 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19630 return true;
19631 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19632 return false;
19633
19634 // Default to true and let the DAGCombiner decide.
19635 return true;
19636}
19637
19638 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19639 bool isSEXTLoad, SDValue &Base,
19640 SDValue &Offset, bool &isInc,
19641 SelectionDAG &DAG) {
19642 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19643 return false;
19644
19645 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19646 // AddressingMode 3
19647 Base = Ptr->getOperand(0);
19648 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19649 int RHSC = (int)RHS->getZExtValue();
19650 if (RHSC < 0 && RHSC > -256) {
19651 assert(Ptr->getOpcode() == ISD::ADD);
19652 isInc = false;
19653 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19654 return true;
19655 }
19656 }
19657 isInc = (Ptr->getOpcode() == ISD::ADD);
19658 Offset = Ptr->getOperand(1);
19659 return true;
19660 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19661 // AddressingMode 2
19662 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19663 int RHSC = (int)RHS->getZExtValue();
19664 if (RHSC < 0 && RHSC > -0x1000) {
19665 assert(Ptr->getOpcode() == ISD::ADD);
19666 isInc = false;
19667 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19668 Base = Ptr->getOperand(0);
19669 return true;
19670 }
19671 }
19672
19673 if (Ptr->getOpcode() == ISD::ADD) {
19674 isInc = true;
19675 ARM_AM::ShiftOpc ShOpcVal=
19676 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19677 if (ShOpcVal != ARM_AM::no_shift) {
19678 Base = Ptr->getOperand(1);
19679 Offset = Ptr->getOperand(0);
19680 } else {
19681 Base = Ptr->getOperand(0);
19682 Offset = Ptr->getOperand(1);
19683 }
19684 return true;
19685 }
19686
19687 isInc = (Ptr->getOpcode() == ISD::ADD);
19688 Base = Ptr->getOperand(0);
19689 Offset = Ptr->getOperand(1);
19690 return true;
19691 }
19692
19693 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19694 return false;
19695}
19696
19697 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19698 bool isSEXTLoad, SDValue &Base,
19699 SDValue &Offset, bool &isInc,
19700 SelectionDAG &DAG) {
19701 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19702 return false;
19703
19704 Base = Ptr->getOperand(0);
19705 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19706 int RHSC = (int)RHS->getZExtValue();
19707 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19708 assert(Ptr->getOpcode() == ISD::ADD);
19709 isInc = false;
19710 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19711 return true;
19712 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19713 isInc = Ptr->getOpcode() == ISD::ADD;
19714 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19715 return true;
19716 }
19717 }
19718
19719 return false;
19720}
19721
19722static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19723 bool isSEXTLoad, bool IsMasked, bool isLE,
19724 SDValue &Base, SDValue &Offset,
19725 bool &isInc, SelectionDAG &DAG) {
19726 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19727 return false;
19728 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19729 return false;
19730
19731 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19732 // as opposed to a vldrw.32). This can allow extra addressing modes or
19733 // alignments for what is otherwise an equivalent instruction.
19734 bool CanChangeType = isLE && !IsMasked;
19735
19736 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19737 int RHSC = (int)RHS->getZExtValue();
19738
19739 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19740 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19741 assert(Ptr->getOpcode() == ISD::ADD);
19742 isInc = false;
19743 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19744 return true;
19745 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19746 isInc = Ptr->getOpcode() == ISD::ADD;
19747 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19748 return true;
19749 }
19750 return false;
19751 };
19752
19753 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19754 // (in BE/masked) type.
19755 Base = Ptr->getOperand(0);
19756 if (VT == MVT::v4i16) {
19757 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19758 return true;
19759 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19760 if (IsInRange(RHSC, 0x80, 1))
19761 return true;
19762 } else if (Alignment >= 4 &&
19763 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19764 IsInRange(RHSC, 0x80, 4))
19765 return true;
19766 else if (Alignment >= 2 &&
19767 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19768 IsInRange(RHSC, 0x80, 2))
19769 return true;
19770 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19771 return true;
19772 return false;
19773}
19774
19775/// getPreIndexedAddressParts - returns true by value, base pointer and
19776/// offset pointer and addressing mode by reference if the node's address
19777/// can be legally represented as pre-indexed load / store address.
19778bool
19779 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19780 SDValue &Offset,
19781 ISD::MemIndexedMode &AM,
19782 SelectionDAG &DAG) const {
19783 if (Subtarget->isThumb1Only())
19784 return false;
19785
19786 EVT VT;
19787 SDValue Ptr;
19788 Align Alignment;
19789 bool isSEXTLoad = false;
19790 bool IsMasked = false;
19791 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19792 Ptr = LD->getBasePtr();
19793 VT = LD->getMemoryVT();
19794 Alignment = LD->getAlign();
19795 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19796 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19797 Ptr = ST->getBasePtr();
19798 VT = ST->getMemoryVT();
19799 Alignment = ST->getAlign();
19800 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19801 Ptr = LD->getBasePtr();
19802 VT = LD->getMemoryVT();
19803 Alignment = LD->getAlign();
19804 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19805 IsMasked = true;
19806 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19807 Ptr = ST->getBasePtr();
19808 VT = ST->getMemoryVT();
19809 Alignment = ST->getAlign();
19810 IsMasked = true;
19811 } else
19812 return false;
19813
19814 bool isInc;
19815 bool isLegal = false;
19816 if (VT.isVector())
19817 isLegal = Subtarget->hasMVEIntegerOps() &&
19818 getMVEIndexedAddressParts(
19819 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19820 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19821 else {
19822 if (Subtarget->isThumb2())
19823 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19824 Offset, isInc, DAG);
19825 else
19826 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19827 Offset, isInc, DAG);
19828 }
19829 if (!isLegal)
19830 return false;
19831
19832 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19833 return true;
19834}
19835
19836/// getPostIndexedAddressParts - returns true by value, base pointer and
19837/// offset pointer and addressing mode by reference if this node can be
19838/// combined with a load / store to form a post-indexed load / store.
19839 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19840 SDValue &Base,
19841 SDValue &Offset,
19842 ISD::MemIndexedMode &AM,
19843 SelectionDAG &DAG) const {
19844 EVT VT;
19845 SDValue Ptr;
19846 Align Alignment;
19847 bool isSEXTLoad = false, isNonExt;
19848 bool IsMasked = false;
19849 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19850 VT = LD->getMemoryVT();
19851 Ptr = LD->getBasePtr();
19852 Alignment = LD->getAlign();
19853 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19854 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19855 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19856 VT = ST->getMemoryVT();
19857 Ptr = ST->getBasePtr();
19858 Alignment = ST->getAlign();
19859 isNonExt = !ST->isTruncatingStore();
19860 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19861 VT = LD->getMemoryVT();
19862 Ptr = LD->getBasePtr();
19863 Alignment = LD->getAlign();
19864 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19865 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19866 IsMasked = true;
19867 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19868 VT = ST->getMemoryVT();
19869 Ptr = ST->getBasePtr();
19870 Alignment = ST->getAlign();
19871 isNonExt = !ST->isTruncatingStore();
19872 IsMasked = true;
19873 } else
19874 return false;
19875
19876 if (Subtarget->isThumb1Only()) {
19877 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19878 // must be non-extending/truncating, i32, with an offset of 4.
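// For example, an i32 load whose address is then advanced by 4 can become
// 'ldm rN!, {rD}', which reloads the value and bumps the base register by 4
// in a single instruction.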
19879 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19880 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19881 return false;
19882 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19883 if (!RHS || RHS->getZExtValue() != 4)
19884 return false;
19885 if (Alignment < Align(4))
19886 return false;
19887
19888 Offset = Op->getOperand(1);
19889 Base = Op->getOperand(0);
19890 AM = ISD::POST_INC;
19891 return true;
19892 }
19893
19894 bool isInc;
19895 bool isLegal = false;
19896 if (VT.isVector())
19897 isLegal = Subtarget->hasMVEIntegerOps() &&
19898 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19899 Subtarget->isLittle(), Base, Offset,
19900 isInc, DAG);
19901 else {
19902 if (Subtarget->isThumb2())
19903 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19904 isInc, DAG);
19905 else
19906 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19907 isInc, DAG);
19908 }
19909 if (!isLegal)
19910 return false;
19911
19912 if (Ptr != Base) {
19913 // Swap base ptr and offset to catch more post-index load / store when
19914 // it's legal. In Thumb2 mode, offset must be an immediate.
19915 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19916 !Subtarget->isThumb2())
19917 std::swap(Base, Offset);
19918
19919 // Post-indexed load / store update the base pointer.
19920 if (Ptr != Base)
19921 return false;
19922 }
19923
19924 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19925 return true;
19926}
19927
19928 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19929 KnownBits &Known,
19930 const APInt &DemandedElts,
19931 const SelectionDAG &DAG,
19932 unsigned Depth) const {
19933 unsigned BitWidth = Known.getBitWidth();
19934 Known.resetAll();
19935 switch (Op.getOpcode()) {
19936 default: break;
19937 case ARMISD::ADDC:
19938 case ARMISD::ADDE:
19939 case ARMISD::SUBC:
19940 case ARMISD::SUBE:
19941 // Special cases when we convert a carry to a boolean.
19942 if (Op.getResNo() == 0) {
19943 SDValue LHS = Op.getOperand(0);
19944 SDValue RHS = Op.getOperand(1);
19945 // (ADDE 0, 0, C) will give us a single bit.
19946 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19947 isNullConstant(RHS)) {
19948 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19949 return;
19950 }
19951 }
19952 break;
19953 case ARMISD::CMOV: {
19954 // Bits are known zero/one if known on the LHS and RHS.
19955 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19956 if (Known.isUnknown())
19957 return;
19958
19959 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19960 Known = Known.intersectWith(KnownRHS);
19961 return;
19962 }
19963 case ISD::INTRINSIC_W_CHAIN: {
19964 Intrinsic::ID IntID =
19965 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19966 switch (IntID) {
19967 default: return;
19968 case Intrinsic::arm_ldaex:
19969 case Intrinsic::arm_ldrex: {
19970 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19971 unsigned MemBits = VT.getScalarSizeInBits();
19972 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19973 return;
19974 }
19975 }
19976 }
19977 case ARMISD::BFI: {
19978 // Conservatively, we can recurse down the first operand
19979 // and just mask out all affected bits.
19980 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19981
19982 // The operand to BFI is already a mask suitable for removing the bits it
19983 // sets.
19984 const APInt &Mask = Op.getConstantOperandAPInt(2);
19985 Known.Zero &= Mask;
19986 Known.One &= Mask;
19987 return;
19988 }
19989 case ARMISD::VGETLANEs:
19990 case ARMISD::VGETLANEu: {
19991 const SDValue &SrcSV = Op.getOperand(0);
19992 EVT VecVT = SrcSV.getValueType();
19993 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19994 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19995 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19996 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19997 "VGETLANE index out of bounds");
19998 unsigned Idx = Pos->getZExtValue();
19999 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20000 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20001
20002 EVT VT = Op.getValueType();
20003 const unsigned DstSz = VT.getScalarSizeInBits();
20004 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20005 (void)SrcSz;
20006 assert(SrcSz == Known.getBitWidth());
20007 assert(DstSz > SrcSz);
20008 if (Op.getOpcode() == ARMISD::VGETLANEs)
20009 Known = Known.sext(DstSz);
20010 else {
20011 Known = Known.zext(DstSz);
20012 }
20013 assert(DstSz == Known.getBitWidth());
20014 break;
20015 }
20016 case ARMISD::VMOVrh: {
20017 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20018 assert(KnownOp.getBitWidth() == 16);
20019 Known = KnownOp.zext(32);
20020 break;
20021 }
20022 case ARMISD::CSINC:
20023 case ARMISD::CSINV:
20024 case ARMISD::CSNEG: {
20025 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20026 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20027
20028 // The result is either:
20029 // CSINC: KnownOp0 or KnownOp1 + 1
20030 // CSINV: KnownOp0 or ~KnownOp1
20031 // CSNEG: KnownOp0 or KnownOp1 * -1
20032 if (Op.getOpcode() == ARMISD::CSINC)
20033 KnownOp1 =
20034 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20035 else if (Op.getOpcode() == ARMISD::CSINV)
20036 std::swap(KnownOp1.Zero, KnownOp1.One);
20037 else if (Op.getOpcode() == ARMISD::CSNEG)
20038 KnownOp1 = KnownBits::mul(KnownOp1,
20039 KnownBits::makeConstant(APInt::getAllOnes(32)));
20040
20041 Known = KnownOp0.intersectWith(KnownOp1);
20042 break;
20043 }
20044 case ARMISD::VORRIMM:
20045 case ARMISD::VBICIMM: {
20046 unsigned Encoded = Op.getConstantOperandVal(1);
20047 unsigned DecEltBits = 0;
20048 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20049
20050 unsigned EltBits = Op.getScalarValueSizeInBits();
20051 if (EltBits != DecEltBits) {
20052 // Be conservative: only update Known when EltBits == DecEltBits.
20053 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20054 // that changes in the future, doing nothing here is safer than risking
20055 // subtle bugs.
20056 break;
20057 }
20058
20059 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20060 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20061 APInt Imm(DecEltBits, DecodedVal);
20062
20063 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20064 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20065 break;
20066 }
20067 }
20068}
20069
20070 bool ARMTargetLowering::targetShrinkDemandedConstant(
20071 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20072 TargetLoweringOpt &TLO) const {
20073 // Delay optimization, so we don't have to deal with illegal types, or block
20074 // optimizations.
20075 if (!TLO.LegalOps)
20076 return false;
20077
20078 // Only optimize AND for now.
20079 if (Op.getOpcode() != ISD::AND)
20080 return false;
20081
20082 EVT VT = Op.getValueType();
20083
20084 // Ignore vectors.
20085 if (VT.isVector())
20086 return false;
20087
20088 assert(VT == MVT::i32 && "Unexpected integer type");
20089
20090 // Make sure the RHS really is a constant.
20091 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20092 if (!C)
20093 return false;
20094
20095 unsigned Mask = C->getZExtValue();
20096
20097 unsigned Demanded = DemandedBits.getZExtValue();
20098 unsigned ShrunkMask = Mask & Demanded;
20099 unsigned ExpandedMask = Mask | ~Demanded;
20100
20101 // If the mask is all zeros, let the target-independent code replace the
20102 // result with zero.
20103 if (ShrunkMask == 0)
20104 return false;
20105
20106 // If the mask is all ones, erase the AND. (Currently, the target-independent
20107 // code won't do this, so we have to do it explicitly to avoid an infinite
20108 // loop in obscure cases.)
20109 if (ExpandedMask == ~0U)
20110 return TLO.CombineTo(Op, Op.getOperand(0));
20111
20112 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20113 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20114 };
20115 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20116 if (NewMask == Mask)
20117 return true;
20118 SDLoc DL(Op);
20119 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20120 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20121 return TLO.CombineTo(Op, NewOp);
20122 };
20123
20124 // Prefer uxtb mask.
20125 if (IsLegalMask(0xFF))
20126 return UseMask(0xFF);
20127
20128 // Prefer uxth mask.
20129 if (IsLegalMask(0xFFFF))
20130 return UseMask(0xFFFF);
20131
20132 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20133 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
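// For example, masking with 0x2A stays a single AND with an immediate on
// ARM/Thumb2, while Thumb1 materializes the mask with MOVS and applies ANDS.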
20134 if (ShrunkMask < 256)
20135 return UseMask(ShrunkMask);
20136
20137 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20138 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20139 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20140 return UseMask(ExpandedMask);
20141
20142 // Potential improvements:
20143 //
20144 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20145 // We could try to prefer Thumb1 immediates which can be lowered to a
20146 // two-instruction sequence.
20147 // We could try to recognize more legal ARM/Thumb2 immediates here.
20148
20149 return false;
20150}
20151
20152 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20153 SDValue Op, const APInt &OriginalDemandedBits,
20154 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20155 unsigned Depth) const {
20156 unsigned Opc = Op.getOpcode();
20157
20158 switch (Opc) {
20159 case ARMISD::ASRL:
20160 case ARMISD::LSRL: {
20161 // If this is result 0 and the other result is unused, see if the demand
20162 // bits allow us to shrink this long shift into a standard small shift in
20163 // the opposite direction.
20164 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20165 isa<ConstantSDNode>(Op->getOperand(2))) {
20166 unsigned ShAmt = Op->getConstantOperandVal(2);
20167 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20168 << (32 - ShAmt)))
20169 return TLO.CombineTo(
20170 Op, TLO.DAG.getNode(
20171 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20172 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20173 }
20174 break;
20175 }
20176 case ARMISD::VBICIMM: {
20177 SDValue Op0 = Op.getOperand(0);
20178 unsigned ModImm = Op.getConstantOperandVal(1);
20179 unsigned EltBits = 0;
20180 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20181 if ((OriginalDemandedBits & Mask) == 0)
20182 return TLO.CombineTo(Op, Op0);
20183 }
20184 }
20185
20186 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20187 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20188}
20189
20190//===----------------------------------------------------------------------===//
20191// ARM Inline Assembly Support
20192//===----------------------------------------------------------------------===//
20193
20194const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20195 // At this point, we have to lower this constraint to something else, so we
20196 // lower it to an "r" or "w". However, by doing this we will force the result
20197 // to be in register, while the X constraint is much more permissive.
20198 //
20199 // Although we are correct (we are free to emit anything, without
20200 // constraints), we might break use cases that would expect us to be more
20201 // efficient and emit something else.
20202 if (!Subtarget->hasVFP2Base())
20203 return "r";
20204 if (ConstraintVT.isFloatingPoint())
20205 return "w";
20206 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20207 (ConstraintVT.getSizeInBits() == 64 ||
20208 ConstraintVT.getSizeInBits() == 128))
20209 return "w";
20210
20211 return "r";
20212}
20213
20214/// getConstraintType - Given a constraint letter, return the type of
20215/// constraint it is for this target.
20216 ARMTargetLowering::ConstraintType
20217 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20218 unsigned S = Constraint.size();
20219 if (S == 1) {
20220 switch (Constraint[0]) {
20221 default: break;
20222 case 'l': return C_RegisterClass;
20223 case 'w': return C_RegisterClass;
20224 case 'h': return C_RegisterClass;
20225 case 'x': return C_RegisterClass;
20226 case 't': return C_RegisterClass;
20227 case 'j': return C_Immediate; // Constant for movw.
20228 // An address with a single base register. Due to the way we
20229 // currently handle addresses it is the same as an 'r' memory constraint.
20230 case 'Q': return C_Memory;
20231 }
20232 } else if (S == 2) {
20233 switch (Constraint[0]) {
20234 default: break;
20235 case 'T': return C_RegisterClass;
20236 // All 'U+' constraints are addresses.
20237 case 'U': return C_Memory;
20238 }
20239 }
20240 return TargetLowering::getConstraintType(Constraint);
20241}
20242
20243/// Examine constraint type and operand type and determine a weight value.
20244/// This object must already have been set up with the operand type
20245/// and the current alternative constraint selected.
20246 TargetLowering::ConstraintWeight
20247 ARMTargetLowering::getSingleConstraintMatchWeight(
20248 AsmOperandInfo &info, const char *constraint) const {
20249 ConstraintWeight weight = CW_Invalid;
20250 Value *CallOperandVal = info.CallOperandVal;
20251 // If we don't have a value, we can't do a match,
20252 // but allow it at the lowest weight.
20253 if (!CallOperandVal)
20254 return CW_Default;
20255 Type *type = CallOperandVal->getType();
20256 // Look at the constraint type.
20257 switch (*constraint) {
20258 default:
20259 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20260 break;
20261 case 'l':
20262 if (type->isIntegerTy()) {
20263 if (Subtarget->isThumb())
20264 weight = CW_SpecificReg;
20265 else
20266 weight = CW_Register;
20267 }
20268 break;
20269 case 'w':
20270 if (type->isFloatingPointTy())
20271 weight = CW_Register;
20272 break;
20273 }
20274 return weight;
20275}
20276
20277static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20278 if (PR == 0 || VT == MVT::Other)
20279 return false;
20280 if (ARM::SPRRegClass.contains(PR))
20281 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20282 if (ARM::DPRRegClass.contains(PR))
20283 return VT != MVT::f64 && !VT.is64BitVector();
20284 return false;
20285}
20286
20287using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20288
20289 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20290 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20291 switch (Constraint.size()) {
20292 case 1:
20293 // GCC ARM Constraint Letters
20294 switch (Constraint[0]) {
20295 case 'l': // Low regs or general regs.
20296 if (Subtarget->isThumb())
20297 return RCPair(0U, &ARM::tGPRRegClass);
20298 return RCPair(0U, &ARM::GPRRegClass);
20299 case 'h': // High regs or no regs.
20300 if (Subtarget->isThumb())
20301 return RCPair(0U, &ARM::hGPRRegClass);
20302 break;
20303 case 'r':
20304 if (Subtarget->isThumb1Only())
20305 return RCPair(0U, &ARM::tGPRRegClass);
20306 return RCPair(0U, &ARM::GPRRegClass);
20307 case 'w':
20308 if (VT == MVT::Other)
20309 break;
20310 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20311 return RCPair(0U, &ARM::SPRRegClass);
20312 if (VT.getSizeInBits() == 64)
20313 return RCPair(0U, &ARM::DPRRegClass);
20314 if (VT.getSizeInBits() == 128)
20315 return RCPair(0U, &ARM::QPRRegClass);
20316 break;
20317 case 'x':
20318 if (VT == MVT::Other)
20319 break;
20320 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20321 return RCPair(0U, &ARM::SPR_8RegClass);
20322 if (VT.getSizeInBits() == 64)
20323 return RCPair(0U, &ARM::DPR_8RegClass);
20324 if (VT.getSizeInBits() == 128)
20325 return RCPair(0U, &ARM::QPR_8RegClass);
20326 break;
20327 case 't':
20328 if (VT == MVT::Other)
20329 break;
20330 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20331 return RCPair(0U, &ARM::SPRRegClass);
20332 if (VT.getSizeInBits() == 64)
20333 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20334 if (VT.getSizeInBits() == 128)
20335 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20336 break;
20337 }
20338 break;
20339
20340 case 2:
20341 if (Constraint[0] == 'T') {
20342 switch (Constraint[1]) {
20343 default:
20344 break;
20345 case 'e':
20346 return RCPair(0U, &ARM::tGPREvenRegClass);
20347 case 'o':
20348 return RCPair(0U, &ARM::tGPROddRegClass);
20349 }
20350 }
20351 break;
20352
20353 default:
20354 break;
20355 }
20356
20357 if (StringRef("{cc}").equals_insensitive(Constraint))
20358 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20359
20360 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20361 if (isIncompatibleReg(RCP.first, VT))
20362 return {0, nullptr};
20363 return RCP;
20364}
20365
20366/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20367/// vector. If it is invalid, don't add anything to Ops.
20368 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20369 StringRef Constraint,
20370 std::vector<SDValue> &Ops,
20371 SelectionDAG &DAG) const {
20372 SDValue Result;
20373
20374 // Currently only support length 1 constraints.
20375 if (Constraint.size() != 1)
20376 return;
20377
20378 char ConstraintLetter = Constraint[0];
20379 switch (ConstraintLetter) {
20380 default: break;
20381 case 'j':
20382 case 'I': case 'J': case 'K': case 'L':
20383 case 'M': case 'N': case 'O':
20384 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20385 if (!C)
20386 return;
20387
20388 int64_t CVal64 = C->getSExtValue();
20389 int CVal = (int) CVal64;
20390 // None of these constraints allow values larger than 32 bits. Check
20391 // that the value fits in an int.
20392 if (CVal != CVal64)
20393 return;
20394
20395 switch (ConstraintLetter) {
20396 case 'j':
20397 // Constant suitable for movw, must be between 0 and
20398 // 65535.
20399 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20400 if (CVal >= 0 && CVal <= 65535)
20401 break;
20402 return;
20403 case 'I':
20404 if (Subtarget->isThumb1Only()) {
20405 // This must be a constant between 0 and 255, for ADD
20406 // immediates.
20407 if (CVal >= 0 && CVal <= 255)
20408 break;
20409 } else if (Subtarget->isThumb2()) {
20410 // A constant that can be used as an immediate value in a
20411 // data-processing instruction.
20412 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20413 break;
20414 } else {
20415 // A constant that can be used as an immediate value in a
20416 // data-processing instruction.
20417 if (ARM_AM::getSOImmVal(CVal) != -1)
20418 break;
20419 }
20420 return;
20421
20422 case 'J':
20423 if (Subtarget->isThumb1Only()) {
20424 // This must be a constant between -255 and -1, for negated ADD
20425 // immediates. This can be used in GCC with an "n" modifier that
20426 // prints the negated value, for use with SUB instructions. It is
20427 // not useful otherwise but is implemented for compatibility.
20428 if (CVal >= -255 && CVal <= -1)
20429 break;
20430 } else {
20431 // This must be a constant between -4095 and 4095. It is not clear
20432 // what this constraint is intended for. Implemented for
20433 // compatibility with GCC.
20434 if (CVal >= -4095 && CVal <= 4095)
20435 break;
20436 }
20437 return;
20438
20439 case 'K':
20440 if (Subtarget->isThumb1Only()) {
20441 // A 32-bit value where only one byte has a nonzero value. Exclude
20442 // zero to match GCC. This constraint is used by GCC internally for
20443 // constants that can be loaded with a move/shift combination.
20444 // It is not useful otherwise but is implemented for compatibility.
20445 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20446 break;
20447 } else if (Subtarget->isThumb2()) {
20448 // A constant whose bitwise inverse can be used as an immediate
20449 // value in a data-processing instruction. This can be used in GCC
20450 // with a "B" modifier that prints the inverted value, for use with
20451 // BIC and MVN instructions. It is not useful otherwise but is
20452 // implemented for compatibility.
20453 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20454 break;
20455 } else {
20456 // A constant whose bitwise inverse can be used as an immediate
20457 // value in a data-processing instruction. This can be used in GCC
20458 // with a "B" modifier that prints the inverted value, for use with
20459 // BIC and MVN instructions. It is not useful otherwise but is
20460 // implemented for compatibility.
20461 if (ARM_AM::getSOImmVal(~CVal) != -1)
20462 break;
20463 }
20464 return;
20465
20466 case 'L':
20467 if (Subtarget->isThumb1Only()) {
20468 // This must be a constant between -7 and 7,
20469 // for 3-operand ADD/SUB immediate instructions.
20470 if (CVal >= -7 && CVal < 7)
20471 break;
20472 } else if (Subtarget->isThumb2()) {
20473 // A constant whose negation can be used as an immediate value in a
20474 // data-processing instruction. This can be used in GCC with an "n"
20475 // modifier that prints the negated value, for use with SUB
20476 // instructions. It is not useful otherwise but is implemented for
20477 // compatibility.
20478 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20479 break;
20480 } else {
20481 // A constant whose negation can be used as an immediate value in a
20482 // data-processing instruction. This can be used in GCC with an "n"
20483 // modifier that prints the negated value, for use with SUB
20484 // instructions. It is not useful otherwise but is implemented for
20485 // compatibility.
20486 if (ARM_AM::getSOImmVal(-CVal) != -1)
20487 break;
20488 }
20489 return;
20490
20491 case 'M':
20492 if (Subtarget->isThumb1Only()) {
20493 // This must be a multiple of 4 between 0 and 1020, for
20494 // ADD sp + immediate.
20495 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20496 break;
20497 } else {
20498 // A power of two or a constant between 0 and 32. This is used in
20499 // GCC for the shift amount on shifted register operands, but it is
20500 // useful in general for any shift amounts.
20501 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20502 break;
20503 }
20504 return;
20505
20506 case 'N':
20507 if (Subtarget->isThumb1Only()) {
20508 // This must be a constant between 0 and 31, for shift amounts.
20509 if (CVal >= 0 && CVal <= 31)
20510 break;
20511 }
20512 return;
20513
20514 case 'O':
20515 if (Subtarget->isThumb1Only()) {
20516 // This must be a multiple of 4 between -508 and 508, for
20517 // ADD/SUB sp = sp + immediate.
20518 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20519 break;
20520 }
20521 return;
20522 }
20523 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20524 break;
20525 }
20526
20527 if (Result.getNode()) {
20528 Ops.push_back(Result);
20529 return;
20530 }
20531 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20532}
20533
20534static RTLIB::Libcall getDivRemLibcall(
20535 const SDNode *N, MVT::SimpleValueType SVT) {
20536 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20537 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20538 "Unhandled Opcode in getDivRemLibcall");
20539 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20540 N->getOpcode() == ISD::SREM;
20541 RTLIB::Libcall LC;
20542 switch (SVT) {
20543 default: llvm_unreachable("Unexpected request for libcall!");
20544 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20545 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20546 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20547 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20548 }
20549 return LC;
20550}
20551
20552 static TargetLowering::ArgListTy getDivRemArgList(
20553 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20554 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20555 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20556 "Unhandled Opcode in getDivRemArgList");
20557 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20558 N->getOpcode() == ISD::SREM;
20559 TargetLowering::ArgListTy Args;
20560 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20561 EVT ArgVT = N->getOperand(i).getValueType();
20562 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20563 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20564 Entry.IsSExt = isSigned;
20565 Entry.IsZExt = !isSigned;
20566 Args.push_back(Entry);
20567 }
20568 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20569 std::swap(Args[0], Args[1]);
20570 return Args;
20571}
20572
20573SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20574 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20575 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20576 Subtarget->isTargetWindows()) &&
20577 "Register-based DivRem lowering only");
20578 unsigned Opcode = Op->getOpcode();
20579 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20580 "Invalid opcode for Div/Rem lowering");
20581 bool isSigned = (Opcode == ISD::SDIVREM);
20582 EVT VT = Op->getValueType(0);
20583 SDLoc dl(Op);
20584
20585 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20586 SmallVector<SDValue> Result;
20587 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20588 SDValue Res0 =
20589 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20590 SDValue Res1 =
20591 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20592 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20593 {Res0, Res1});
20594 }
20595 }
20596
20597 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20598
20599 // If the target has hardware divide, use divide + multiply + subtract:
20600 // div = a / b
20601 // rem = a - b * div
20602 // return {div, rem}
20603 // This should be lowered into UDIV/SDIV + MLS later on.
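// For example, an unsigned i32 divrem is expected to end up roughly as:
//   udiv r2, r0, r1
//   mls  r3, r2, r1, r0   ; r3 = r0 - r2 * r1
// (a sketch of the intended lowering, not the exact emitted sequence).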
20604 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20605 : Subtarget->hasDivideInARMMode();
20606 if (hasDivide && Op->getValueType(0).isSimple() &&
20607 Op->getSimpleValueType(0) == MVT::i32) {
20608 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20609 const SDValue Dividend = Op->getOperand(0);
20610 const SDValue Divisor = Op->getOperand(1);
20611 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20612 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20613 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20614
20615 SDValue Values[2] = {Div, Rem};
20616 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20617 }
20618
20619 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20620 VT.getSimpleVT().SimpleTy);
20621 SDValue InChain = DAG.getEntryNode();
20622
20623 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20624 DAG.getContext(),
20625 Subtarget);
20626
20627 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20628 getPointerTy(DAG.getDataLayout()));
20629
20630 Type *RetTy = StructType::get(Ty, Ty);
20631
20632 if (Subtarget->isTargetWindows())
20633 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20634
20635 TargetLowering::CallLoweringInfo CLI(DAG);
20636 CLI.setDebugLoc(dl).setChain(InChain)
20637 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20638 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20639
20640 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20641 return CallInfo.first;
20642}
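// Illustrative sketch of the two paths above (register numbers are arbitrary):
// with hardware divide, an i32 sdivrem of a (r0) by b (r1) is expected to
// select to roughly
//   sdiv r2, r0, r1        ; div = a / b
//   mls  r3, r2, r1, r0    ; rem = a - div * b
// whereas the libcall path becomes a single call to the divmod helper chosen
// by getDivRemLibcall.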
20643
20644// Lowers REM using divmod helpers
20645// see RTABI section 4.2/4.3
20646SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20647 EVT VT = N->getValueType(0);
20648
20649 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20650 SmallVector<SDValue> Result;
20651 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20652 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20653 Result[0], Result[1]);
20654 }
20655
20656 // Build return types (div and rem)
20657 std::vector<Type*> RetTyParams;
20658 Type *RetTyElement;
20659
20660 switch (VT.getSimpleVT().SimpleTy) {
20661 default: llvm_unreachable("Unexpected request for libcall!");
20662 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20663 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20664 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20665 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20666 }
20667
20668 RetTyParams.push_back(RetTyElement);
20669 RetTyParams.push_back(RetTyElement);
20670 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20671 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20672
20673 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20674 SimpleTy);
20675 SDValue InChain = DAG.getEntryNode();
20676 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20677 Subtarget);
20678 bool isSigned = N->getOpcode() == ISD::SREM;
20679 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20680 getPointerTy(DAG.getDataLayout()));
20681
20682 if (Subtarget->isTargetWindows())
20683 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20684
20685 // Lower call
20686 CallLoweringInfo CLI(DAG);
20687 CLI.setChain(InChain)
20688 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20689 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
20690 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20691
20692 // Return second (rem) result operand (first contains div)
20693 SDNode *ResNode = CallResult.first.getNode();
20694 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20695 return ResNode->getOperand(1);
20696}
20697
20698SDValue
20699ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20700 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20701 SDLoc DL(Op);
20702
20703 // Get the inputs.
20704 SDValue Chain = Op.getOperand(0);
20705 SDValue Size = Op.getOperand(1);
20706
20708 "no-stack-arg-probe")) {
20709 MaybeAlign Align =
20710 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20711 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20712 Chain = SP.getValue(1);
20713 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20714 if (Align)
20715 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20716 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20717 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20718 SDValue Ops[2] = { SP, Chain };
20719 return DAG.getMergeValues(Ops, DL);
20720 }
20721
20722 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20723 DAG.getConstant(2, DL, MVT::i32));
20724
20725 SDValue Glue;
20726 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20727 Glue = Chain.getValue(1);
20728
20729 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20730 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20731
20732 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20733 Chain = NewSP.getValue(1);
20734
20735 SDValue Ops[2] = { NewSP, Chain };
20736 return DAG.getMergeValues(Ops, DL);
20737}
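// For illustration of the Windows path above (a sketch, not the exact final
// code): the byte count is converted to 4-byte words in r4, the
// ARMISD::WIN__CHKSTK node expands to the __chkstk probe-and-adjust sequence,
// and the updated SP is read back as the result, roughly
//   lsrs r4, r<size>, #2
//   bl   __chkstk            ; plus whatever SP adjustment its expansion emits
//   mov  r<result>, sp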
20738
20739SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20740 bool IsStrict = Op->isStrictFPOpcode();
20741 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20742 const unsigned DstSz = Op.getValueType().getSizeInBits();
20743 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20744 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20745 "Unexpected type for custom-lowering FP_EXTEND");
20746
20747 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20748 "With both FP DP and 16, any FP conversion is legal!");
20749
20750 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20751 "With FP16, 16 to 32 conversion is legal!");
20752
20753 // Converting from 32 -> 64 is valid if we have FP64.
20754 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20755 // FIXME: Remove this when we have strict fp instruction selection patterns
20756 if (IsStrict) {
20757 SDLoc Loc(Op);
20758 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20759 Loc, Op.getValueType(), SrcVal);
20760 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20761 }
20762 return Op;
20763 }
20764
20765 // Otherwise, either we are converting from 16 -> 64 (without FP16 and/or
20766 // double precision, or without Armv8-FP), which must be done in two
20767 // steps,
20768 // or we are converting from 32 -> 64 without double precision, or 16 -> 32
20769 // without FP16, in which case we must make a libcall.
20770 SDLoc Loc(Op);
20771 RTLIB::Libcall LC;
20772 MakeLibCallOptions CallOptions;
20773 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20774 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20775 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20776 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20777 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20778 if (Supported) {
20779 if (IsStrict) {
20780 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20781 {DstVT, MVT::Other}, {Chain, SrcVal});
20782 Chain = SrcVal.getValue(1);
20783 } else {
20784 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20785 }
20786 } else {
20787 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20788 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20789 "Unexpected type for custom-lowering FP_EXTEND");
20790 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20791 Loc, Chain);
20792 }
20793 }
20794
20795 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20796}
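// Illustrative sketch (assuming FP16 but no FP64, one of the cases handled by
// the loop above): extending f16 -> f64 becomes one conversion instruction
// followed by one libcall, roughly
//   vcvtb.f32.f16 s0, s0     ; f16 -> f32 in hardware
//   bl __aeabi_f2d           ; f32 -> f64 (AEABI name; __extendsfdf2 otherwise)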
20797
20798SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20799 bool IsStrict = Op->isStrictFPOpcode();
20800
20801 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20802 EVT SrcVT = SrcVal.getValueType();
20803 EVT DstVT = Op.getValueType();
20804 const unsigned DstSz = Op.getValueType().getSizeInBits();
20805 const unsigned SrcSz = SrcVT.getSizeInBits();
20806 (void)DstSz;
20807 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20808 "Unexpected type for custom-lowering FP_ROUND");
20809
20810 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20811 "With both FP DP and 16, any FP conversion is legal!");
20812
20813 SDLoc Loc(Op);
20814
20815 // A single instruction handles 32 -> 16 if the target has FP16.
20816 if (SrcSz == 32 && Subtarget->hasFP16())
20817 return Op;
20818
20819 // Otherwise use a libcall: 32 -> 16, or 64 -> [32, 16].
20820 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20821 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20822 "Unexpected type for custom-lowering FP_ROUND");
20823 MakeLibCallOptions CallOptions;
20824 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20825 SDValue Result;
20826 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20827 Loc, Chain);
20828 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20829}
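// For illustration: a plain truncation such as
//   %r = fptrunc double %x to half
// takes the libcall path above, calling e.g. __aeabi_d2h on an AEABI target
// (__truncdfhf2 otherwise).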
20830
20831bool
20832ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20833 // The ARM target isn't yet aware of offsets.
20834 return false;
20835}
20836
20837bool ARM::isBitFieldInvertedMask(unsigned v) {
20838 if (v == 0xffffffff)
20839 return false;
20840
20841 // There can be 1s on either or both "outsides"; all the "inside"
20842 // bits must be 0s.
20843 return isShiftedMask_32(~v);
20844}
20845
20846/// isFPImmLegal - Returns true if the target can instruction select the
20847/// specified FP immediate natively. If false, the legalizer will
20848/// materialize the FP immediate as a load from a constant pool.
20849bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20850 bool ForCodeSize) const {
20851 if (!Subtarget->hasVFP3Base())
20852 return false;
20853 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20854 return ARM_AM::getFP16Imm(Imm) != -1;
20855 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20856 ARM_AM::getFP32FP16Imm(Imm) != -1)
20857 return true;
20858 if (VT == MVT::f32)
20859 return ARM_AM::getFP32Imm(Imm) != -1;
20860 if (VT == MVT::f64 && Subtarget->hasFP64())
20861 return ARM_AM::getFP64Imm(Imm) != -1;
20862 return false;
20863}
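// Illustrative examples (assuming VFPv3): 1.0, 0.5 and 10.0 all fit the VFP
// modified-immediate encoding and are accepted here, e.g.
//   float one() { return 1.0f; }   // -> vmov.f32 s0, #1.0
// whereas 0.1 and 0.0 cannot be encoded as a VMOV immediate and are rejected.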
20864
20865/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20866/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20867/// specified in the intrinsic calls.
20868bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20869 const CallInst &I,
20870 MachineFunction &MF,
20871 unsigned Intrinsic) const {
20872 switch (Intrinsic) {
20873 case Intrinsic::arm_neon_vld1:
20874 case Intrinsic::arm_neon_vld2:
20875 case Intrinsic::arm_neon_vld3:
20876 case Intrinsic::arm_neon_vld4:
20877 case Intrinsic::arm_neon_vld2lane:
20878 case Intrinsic::arm_neon_vld3lane:
20879 case Intrinsic::arm_neon_vld4lane:
20880 case Intrinsic::arm_neon_vld2dup:
20881 case Intrinsic::arm_neon_vld3dup:
20882 case Intrinsic::arm_neon_vld4dup: {
20883 Info.opc = ISD::INTRINSIC_W_CHAIN;
20884 // Conservatively set memVT to the entire set of vectors loaded.
20885 auto &DL = I.getDataLayout();
20886 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20887 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20888 Info.ptrVal = I.getArgOperand(0);
20889 Info.offset = 0;
20890 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20891 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20892 // volatile loads with NEON intrinsics not supported
20893 Info.flags = MachineMemOperand::MOLoad;
20894 return true;
20895 }
20896 case Intrinsic::arm_neon_vld1x2:
20897 case Intrinsic::arm_neon_vld1x3:
20898 case Intrinsic::arm_neon_vld1x4: {
20899 Info.opc = ISD::INTRINSIC_W_CHAIN;
20900 // Conservatively set memVT to the entire set of vectors loaded.
20901 auto &DL = I.getDataLayout();
20902 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20903 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20904 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20905 Info.offset = 0;
20906 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20907 // volatile loads with NEON intrinsics not supported
20908 Info.flags = MachineMemOperand::MOLoad;
20909 return true;
20910 }
20911 case Intrinsic::arm_neon_vst1:
20912 case Intrinsic::arm_neon_vst2:
20913 case Intrinsic::arm_neon_vst3:
20914 case Intrinsic::arm_neon_vst4:
20915 case Intrinsic::arm_neon_vst2lane:
20916 case Intrinsic::arm_neon_vst3lane:
20917 case Intrinsic::arm_neon_vst4lane: {
20918 Info.opc = ISD::INTRINSIC_VOID;
20919 // Conservatively set memVT to the entire set of vectors stored.
20920 auto &DL = I.getDataLayout();
20921 unsigned NumElts = 0;
20922 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20923 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20924 if (!ArgTy->isVectorTy())
20925 break;
20926 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20927 }
20928 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20929 Info.ptrVal = I.getArgOperand(0);
20930 Info.offset = 0;
20931 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20932 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20933 // volatile stores with NEON intrinsics not supported
20934 Info.flags = MachineMemOperand::MOStore;
20935 return true;
20936 }
20937 case Intrinsic::arm_neon_vst1x2:
20938 case Intrinsic::arm_neon_vst1x3:
20939 case Intrinsic::arm_neon_vst1x4: {
20940 Info.opc = ISD::INTRINSIC_VOID;
20941 // Conservatively set memVT to the entire set of vectors stored.
20942 auto &DL = I.getDataLayout();
20943 unsigned NumElts = 0;
20944 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20945 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20946 if (!ArgTy->isVectorTy())
20947 break;
20948 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20949 }
20950 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20951 Info.ptrVal = I.getArgOperand(0);
20952 Info.offset = 0;
20953 Info.align = I.getParamAlign(0).valueOrOne();
20954 // volatile stores with NEON intrinsics not supported
20955 Info.flags = MachineMemOperand::MOStore;
20956 return true;
20957 }
20958 case Intrinsic::arm_mve_vld2q:
20959 case Intrinsic::arm_mve_vld4q: {
20960 Info.opc = ISD::INTRINSIC_W_CHAIN;
20961 // Conservatively set memVT to the entire set of vectors loaded.
20962 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20963 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20964 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20965 Info.ptrVal = I.getArgOperand(0);
20966 Info.offset = 0;
20967 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20968 // volatile loads with MVE intrinsics not supported
20969 Info.flags = MachineMemOperand::MOLoad;
20970 return true;
20971 }
20972 case Intrinsic::arm_mve_vst2q:
20973 case Intrinsic::arm_mve_vst4q: {
20974 Info.opc = ISD::INTRINSIC_VOID;
20975 // Conservatively set memVT to the entire set of vectors stored.
20976 Type *VecTy = I.getArgOperand(1)->getType();
20977 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20978 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20979 Info.ptrVal = I.getArgOperand(0);
20980 Info.offset = 0;
20981 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20982 // volatile stores with MVE intrinsics not supported
20983 Info.flags = MachineMemOperand::MOStore;
20984 return true;
20985 }
20986 case Intrinsic::arm_mve_vldr_gather_base:
20987 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20988 Info.opc = ISD::INTRINSIC_W_CHAIN;
20989 Info.ptrVal = nullptr;
20990 Info.memVT = MVT::getVT(I.getType());
20991 Info.align = Align(1);
20992 Info.flags |= MachineMemOperand::MOLoad;
20993 return true;
20994 }
20995 case Intrinsic::arm_mve_vldr_gather_base_wb:
20996 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20997 Info.opc = ISD::INTRINSIC_W_CHAIN;
20998 Info.ptrVal = nullptr;
20999 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21000 Info.align = Align(1);
21001 Info.flags |= MachineMemOperand::MOLoad;
21002 return true;
21003 }
21004 case Intrinsic::arm_mve_vldr_gather_offset:
21005 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21006 Info.opc = ISD::INTRINSIC_W_CHAIN;
21007 Info.ptrVal = nullptr;
21008 MVT DataVT = MVT::getVT(I.getType());
21009 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21010 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21011 DataVT.getVectorNumElements());
21012 Info.align = Align(1);
21013 Info.flags |= MachineMemOperand::MOLoad;
21014 return true;
21015 }
21016 case Intrinsic::arm_mve_vstr_scatter_base:
21017 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21018 Info.opc = ISD::INTRINSIC_VOID;
21019 Info.ptrVal = nullptr;
21020 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21021 Info.align = Align(1);
21022 Info.flags |= MachineMemOperand::MOStore;
21023 return true;
21024 }
21025 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21026 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21027 Info.opc = ISD::INTRINSIC_W_CHAIN;
21028 Info.ptrVal = nullptr;
21029 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21030 Info.align = Align(1);
21031 Info.flags |= MachineMemOperand::MOStore;
21032 return true;
21033 }
21034 case Intrinsic::arm_mve_vstr_scatter_offset:
21035 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21036 Info.opc = ISD::INTRINSIC_VOID;
21037 Info.ptrVal = nullptr;
21038 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21039 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21040 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21041 DataVT.getVectorNumElements());
21042 Info.align = Align(1);
21043 Info.flags |= MachineMemOperand::MOStore;
21044 return true;
21045 }
21046 case Intrinsic::arm_ldaex:
21047 case Intrinsic::arm_ldrex: {
21048 auto &DL = I.getDataLayout();
21049 Type *ValTy = I.getParamElementType(0);
21050 Info.opc = ISD::INTRINSIC_W_CHAIN;
21051 Info.memVT = MVT::getVT(ValTy);
21052 Info.ptrVal = I.getArgOperand(0);
21053 Info.offset = 0;
21054 Info.align = DL.getABITypeAlign(ValTy);
21056 return true;
21057 }
21058 case Intrinsic::arm_stlex:
21059 case Intrinsic::arm_strex: {
21060 auto &DL = I.getDataLayout();
21061 Type *ValTy = I.getParamElementType(1);
21062 Info.opc = ISD::INTRINSIC_W_CHAIN;
21063 Info.memVT = MVT::getVT(ValTy);
21064 Info.ptrVal = I.getArgOperand(1);
21065 Info.offset = 0;
21066 Info.align = DL.getABITypeAlign(ValTy);
21068 return true;
21069 }
21070 case Intrinsic::arm_stlexd:
21071 case Intrinsic::arm_strexd:
21072 Info.opc = ISD::INTRINSIC_W_CHAIN;
21073 Info.memVT = MVT::i64;
21074 Info.ptrVal = I.getArgOperand(2);
21075 Info.offset = 0;
21076 Info.align = Align(8);
21078 return true;
21079
21080 case Intrinsic::arm_ldaexd:
21081 case Intrinsic::arm_ldrexd:
21082 Info.opc = ISD::INTRINSIC_W_CHAIN;
21083 Info.memVT = MVT::i64;
21084 Info.ptrVal = I.getArgOperand(0);
21085 Info.offset = 0;
21086 Info.align = Align(8);
21088 return true;
21089
21090 default:
21091 break;
21092 }
21093
21094 return false;
21095}
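// For illustration (intrinsic mangling shown with the usual conventions): for
// a call such as
//   %vld2 = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr %p, i32 8)
// the vldN case above records opc = ISD::INTRINSIC_W_CHAIN, ptrVal = %p,
// align = 8 and a conservative memVT of v4i64 covering all 256 bits loaded.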
21096
21097/// Returns true if it is beneficial to convert a load of a constant
21098/// to just the constant itself.
21100 Type *Ty) const {
21101 assert(Ty->isIntegerTy());
21102
21103 unsigned Bits = Ty->getPrimitiveSizeInBits();
21104 if (Bits == 0 || Bits > 32)
21105 return false;
21106 return true;
21107}
21108
21110 unsigned Index) const {
21112 return false;
21113
21114 return (Index == 0 || Index == ResVT.getVectorNumElements());
21115}
21116
21118 ARM_MB::MemBOpt Domain) const {
21119 // First, if the target has no DMB, see what fallback we can use.
21120 if (!Subtarget->hasDataBarrier()) {
21121 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21122 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21123 // here.
21124 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21125 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21126 Builder.getInt32(0), Builder.getInt32(7),
21127 Builder.getInt32(10), Builder.getInt32(5)};
21128 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21129 } else {
21130 // Instead of using barriers, atomic accesses on these subtargets use
21131 // libcalls.
21132 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21133 }
21134 } else {
21135 // Only a full system barrier exists in the M-class architectures.
21136 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21137 Constant *CDomain = Builder.getInt32(Domain);
21138 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21139 }
21140}
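// Illustrative output: with a data barrier available this becomes "dmb ish"
// ("dmb sy" on M-class, which only has the full-system barrier), while the
// ARMv6 fallback above is the equivalent CP15 write
//   mcr p15, #0, r<zero>, c7, c10, #5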
21141
21142// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21144 Instruction *Inst,
21145 AtomicOrdering Ord) const {
21146 switch (Ord) {
21149 llvm_unreachable("Invalid fence: unordered/non-atomic");
21152 return nullptr; // Nothing to do
21154 if (!Inst->hasAtomicStore())
21155 return nullptr; // Nothing to do
21156 [[fallthrough]];
21159 if (Subtarget->preferISHSTBarriers())
21160 return makeDMB(Builder, ARM_MB::ISHST);
21161 // FIXME: add a comment with a link to documentation justifying this.
21162 else
21163 return makeDMB(Builder, ARM_MB::ISH);
21164 }
21165 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21166}
21167
21169 Instruction *Inst,
21170 AtomicOrdering Ord) const {
21171 switch (Ord) {
21174 llvm_unreachable("Invalid fence: unordered/not-atomic");
21177 return nullptr; // Nothing to do
21181 return makeDMB(Builder, ARM_MB::ISH);
21182 }
21183 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21184}
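// For illustration (following the C++11 atomics mapping referenced above): on
// a fence-based target a seq_cst i32 store ends up bracketed roughly as
//   dmb ish          ; emitLeadingFence (dmb ishst if preferISHSTBarriers())
//   str r1, [r0]
//   dmb ish          ; emitTrailingFence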
21185
21186// Loads and stores less than 64 bits are already atomic; ones above that
21187// are doomed anyway, so defer to the default libcall and blame the OS when
21188// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21189// anything for those.
21192 bool has64BitAtomicStore;
21193 if (Subtarget->isMClass())
21194 has64BitAtomicStore = false;
21195 else if (Subtarget->isThumb())
21196 has64BitAtomicStore = Subtarget->hasV7Ops();
21197 else
21198 has64BitAtomicStore = Subtarget->hasV6Ops();
21199
21200 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21201 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21203}
21204
21205// Loads and stores less than 64 bits are already atomic; ones above that
21206// are doomed anyway, so defer to the default libcall and blame the OS when
21207// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21208// anything for those.
21209// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21210// guarantee, see DDI0406C ARM architecture reference manual,
21211// sections A8.8.72-74 LDRD)
21214 bool has64BitAtomicLoad;
21215 if (Subtarget->isMClass())
21216 has64BitAtomicLoad = false;
21217 else if (Subtarget->isThumb())
21218 has64BitAtomicLoad = Subtarget->hasV7Ops();
21219 else
21220 has64BitAtomicLoad = Subtarget->hasV6Ops();
21221
21222 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21223 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21225}
21226
21227// For the real atomic operations, we have ldrex/strex up to 32 bits,
21228// and up to 64 bits on the non-M profiles
21231 if (AI->isFloatingPointOperation())
21233
21234 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21235 bool hasAtomicRMW;
21236 if (Subtarget->isMClass())
21237 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21238 else if (Subtarget->isThumb())
21239 hasAtomicRMW = Subtarget->hasV7Ops();
21240 else
21241 hasAtomicRMW = Subtarget->hasV6Ops();
21242 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21243 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21244 // implement atomicrmw without spilling. If the target address is also on
21245 // the stack and close enough to the spill slot, this can lead to a
21246 // situation where the monitor always gets cleared and the atomic operation
21247 // can never succeed. So at -O0 lower this operation to a CAS loop.
21248 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21251 }
21253}
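// Illustrative sketch of the LLSC expansion selected above: an
//   atomicrmw add ptr %p, i32 1 monotonic
// becomes a retry loop of roughly the form
//   1: ldrex r1, [r0]
//      add   r1, r1, #1
//      strex r2, r1, [r0]
//      cmp   r2, #0
//      bne   1b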
21254
21255// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21256// bits, and up to 64 bits on the non-M profiles.
21259 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21260 // implement cmpxchg without spilling. If the address being exchanged is also
21261 // on the stack and close enough to the spill slot, this can lead to a
21262 // situation where the monitor always gets cleared and the atomic operation
21263 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21264 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21265 bool HasAtomicCmpXchg;
21266 if (Subtarget->isMClass())
21267 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21268 else if (Subtarget->isThumb())
21269 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21270 else
21271 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21272 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21273 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21276}
21277
21279 const Instruction *I) const {
21280 return InsertFencesForAtomic;
21281}
21282
21284 // ROPI/RWPI are not supported currently.
21285 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21286}
21287
21289 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21290 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21291 if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
21293
21294 // MSVC CRT has a global variable holding security cookie.
21295 M.getOrInsertGlobal("__security_cookie",
21296 PointerType::getUnqual(M.getContext()));
21297
21298 // MSVC CRT has a function to validate security cookie.
21299 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21300 getLibcallImplName(SecurityCheckCookieLibcall),
21301 Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
21302 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21303 F->addParamAttr(0, Attribute::AttrKind::InReg);
21304}
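// For illustration: when the SECURITY_CHECK_COOKIE libcall is the MSVC
// __security_check_cookie, the declarations inserted above correspond to IR of
// roughly the form
//   @__security_cookie = external global ptr
//   declare void @__security_check_cookie(ptr inreg)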
21305
21307 // MSVC CRT has a function to validate security cookie.
21308 RTLIB::LibcallImpl SecurityCheckCookie =
21309 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21310 if (SecurityCheckCookie != RTLIB::Unsupported)
21311 return M.getFunction(getLibcallImplName(SecurityCheckCookie));
21313}
21314
21316 unsigned &Cost) const {
21317 // If we do not have NEON, vector types are not natively supported.
21318 if (!Subtarget->hasNEON())
21319 return false;
21320
21321 // Floating point values and vector values map to the same register file.
21322 // Therefore, although we could do a store+extract on a vector type, it is
21323 // better to leave it as a float, as we have more freedom in the addressing
21324 // mode for those.
21325 if (VectorTy->isFPOrFPVectorTy())
21326 return false;
21327
21328 // If the index is unknown at compile time, this is very expensive to lower
21329 // and it is not possible to combine the store with the extract.
21330 if (!isa<ConstantInt>(Idx))
21331 return false;
21332
21333 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21334 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21335 // We can do a store + vector extract on any vector that fits perfectly in a D
21336 // or Q register.
21337 if (BitWidth == 64 || BitWidth == 128) {
21338 Cost = 0;
21339 return true;
21340 }
21341 return false;
21342}
21343
21345 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21346 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21347 unsigned Opcode = Op.getOpcode();
21348 switch (Opcode) {
21349 case ARMISD::VORRIMM:
21350 case ARMISD::VBICIMM:
21351 return false;
21352 }
21354 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21355}
21356
21358 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21359}
21360
21362 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21363}
21364
21366 const Instruction &AndI) const {
21367 if (!Subtarget->hasV7Ops())
21368 return false;
21369
21370 // Sink the `and` instruction only if the mask would fit into a modified
21371 // immediate operand.
21373 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21374 return false;
21375 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21376 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21377 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21378}
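// Illustrative examples: masks such as 0xFF or 0xFF00 encode as an ARM/Thumb-2
// modified immediate, so the `and` is considered worth sinking next to its
// compare, whereas a mask like 0x12345 does not encode and is left alone.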
21379
21382 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21383 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21386 ExpansionFactor);
21387}
21388
21390 Value *Addr,
21391 AtomicOrdering Ord) const {
21392 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21393 bool IsAcquire = isAcquireOrStronger(Ord);
21394
21395 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21396 // intrinsic must return {i32, i32} and we have to recombine them into a
21397 // single i64 here.
21398 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21400 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21401
21402 Value *LoHi =
21403 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21404
21405 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21406 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21407 if (!Subtarget->isLittle())
21408 std::swap (Lo, Hi);
21409 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21410 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21411 return Builder.CreateOr(
21412 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21413 }
21414
21415 Type *Tys[] = { Addr->getType() };
21416 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21417 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21418
21419 CI->addParamAttr(
21420 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21421 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21422}
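// For illustration: a 64-bit acquire load-linked built by the code above looks
// roughly like
//   %lohi = call { i32, i32 } @llvm.arm.ldaexd(ptr %addr)
//   %lo = extractvalue { i32, i32 } %lohi, 0
//   %hi = extractvalue { i32, i32 } %lohi, 1
// with the two halves then zero-extended, shifted and OR'd into a single i64.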
21423
21425 IRBuilderBase &Builder) const {
21426 if (!Subtarget->hasV7Ops())
21427 return;
21428 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21429}
21430
21432 Value *Val, Value *Addr,
21433 AtomicOrdering Ord) const {
21434 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21435 bool IsRelease = isReleaseOrStronger(Ord);
21436
21437 // Since the intrinsics must have legal type, the i64 intrinsics take two
21438 // parameters: "i32, i32". We must marshal Val into the appropriate form
21439 // before the call.
21440 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21442 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21443 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21444
21445 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21446 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21447 if (!Subtarget->isLittle())
21448 std::swap(Lo, Hi);
21449 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21450 }
21451
21452 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21453 Type *Tys[] = { Addr->getType() };
21455
21456 CallInst *CI = Builder.CreateCall(
21457 Strex, {Builder.CreateZExtOrBitCast(
21458 Val, Strex->getFunctionType()->getParamType(0)),
21459 Addr});
21460 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21461 Val->getType()));
21462 return CI;
21463}
21464
21465
21466bool ARMTargetLowering::alignLoopsWithOptSize() const {
21467 return Subtarget->isMClass();
21468}
21469
21470/// A helper function for determining the number of interleaved accesses we
21471/// will generate when lowering accesses of the given type.
21472unsigned
21474 const DataLayout &DL) const {
21475 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21476}
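// For example, a 512-bit vector such as <16 x i32> counts as
// (512 + 127) / 128 = 4 interleaved accesses, and a 64-bit <8 x i8> as 1.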
21477
21479 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21480 const DataLayout &DL) const {
21481
21482 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21483 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21484
21485 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21486 return false;
21487
21488 // Ensure the vector doesn't have f16 elements. Even though we could do an
21489 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21490 // f32.
21491 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21492 return false;
21493 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21494 return false;
21495
21496 // Ensure the number of vector elements is greater than 1.
21497 if (VecTy->getNumElements() < 2)
21498 return false;
21499
21500 // Ensure the element type is legal.
21501 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21502 return false;
21503 // And check that the alignment is high enough under MVE.
21504 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21505 return false;
21506
21507 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21508 // 128 will be split into multiple interleaved accesses.
21509 if (Subtarget->hasNEON() && VecSize == 64)
21510 return true;
21511 return VecSize % 128 == 0;
21512}
21513
21515 if (Subtarget->hasNEON())
21516 return 4;
21517 if (Subtarget->hasMVEIntegerOps())
21520}
21521
21522/// Lower an interleaved load into a vldN intrinsic.
21523///
21524/// E.g. Lower an interleaved load (Factor = 2):
21525/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21526/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21527/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21528///
21529/// Into:
21530/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21531/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21532/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21534 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21535 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21536 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21537 "Invalid interleave factor");
21538 assert(!Shuffles.empty() && "Empty shufflevector input");
21539 assert(Shuffles.size() == Indices.size() &&
21540 "Unmatched number of shufflevectors and indices");
21541
21542 auto *LI = dyn_cast<LoadInst>(Load);
21543 if (!LI)
21544 return false;
21545 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21546
21547 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21548 Type *EltTy = VecTy->getElementType();
21549
21550 const DataLayout &DL = LI->getDataLayout();
21551 Align Alignment = LI->getAlign();
21552
21553 // Skip if we do not have NEON and skip illegal vector types. We can
21554 // "legalize" wide vector types into multiple interleaved accesses as long as
21555 // the vector types are divisible by 128.
21556 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21557 return false;
21558
21559 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21560
21561 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
21562 // load integer vectors first and then convert to pointer vectors.
21563 if (EltTy->isPointerTy())
21564 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21565
21566 IRBuilder<> Builder(LI);
21567
21568 // The base address of the load.
21569 Value *BaseAddr = LI->getPointerOperand();
21570
21571 if (NumLoads > 1) {
21572 // If we're going to generate more than one load, reset the sub-vector type
21573 // to something legal.
21574 VecTy = FixedVectorType::get(VecTy->getElementType(),
21575 VecTy->getNumElements() / NumLoads);
21576 }
21577
21578 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21579
21580 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21581 if (Subtarget->hasNEON()) {
21582 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21583 Type *Tys[] = {VecTy, PtrTy};
21584 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21585 Intrinsic::arm_neon_vld3,
21586 Intrinsic::arm_neon_vld4};
21587
21589 Ops.push_back(BaseAddr);
21590 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21591
21592 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21593 /*FMFSource=*/nullptr, "vldN");
21594 } else {
21595 assert((Factor == 2 || Factor == 4) &&
21596 "expected interleave factor of 2 or 4 for MVE");
21597 Intrinsic::ID LoadInts =
21598 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21599 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21600 Type *Tys[] = {VecTy, PtrTy};
21601
21603 Ops.push_back(BaseAddr);
21604 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21605 "vldN");
21606 }
21607 };
21608
21609 // Holds sub-vectors extracted from the load intrinsic return values. The
21610 // sub-vectors are associated with the shufflevector instructions they will
21611 // replace.
21613
21614 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21615 // If we're generating more than one load, compute the base address of
21616 // subsequent loads as an offset from the previous.
21617 if (LoadCount > 0)
21618 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21619 VecTy->getNumElements() * Factor);
21620
21621 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21622
21623 // Replace uses of each shufflevector with the corresponding vector loaded
21624 // by ldN.
21625 for (unsigned i = 0; i < Shuffles.size(); i++) {
21626 ShuffleVectorInst *SV = Shuffles[i];
21627 unsigned Index = Indices[i];
21628
21629 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21630
21631 // Convert the integer vector to pointer vector if the element is pointer.
21632 if (EltTy->isPointerTy())
21633 SubVec = Builder.CreateIntToPtr(
21634 SubVec,
21636
21637 SubVecs[SV].push_back(SubVec);
21638 }
21639 }
21640
21641 // Replace uses of the shufflevector instructions with the sub-vectors
21642 // returned by the load intrinsic. If a shufflevector instruction is
21643 // associated with more than one sub-vector, those sub-vectors will be
21644 // concatenated into a single wide vector.
21645 for (ShuffleVectorInst *SVI : Shuffles) {
21646 auto &SubVec = SubVecs[SVI];
21647 auto *WideVec =
21648 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21649 SVI->replaceAllUsesWith(WideVec);
21650 }
21651
21652 return true;
21653}
21654
21655/// Lower an interleaved store into a vstN intrinsic.
21656///
21657/// E.g. Lower an interleaved store (Factor = 3):
21658/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21659/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21660/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21661///
21662/// Into:
21663/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21664/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21665/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21666/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21667///
21668/// Note that the new shufflevectors will be removed and we'll only generate one
21669/// vst3 instruction in CodeGen.
21670///
21671/// Example for a more general valid mask (Factor 3). Lower:
21672/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21673/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21674/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21675///
21676/// Into:
21677/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21678/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21679/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21680/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21682 Value *LaneMask,
21683 ShuffleVectorInst *SVI,
21684 unsigned Factor,
21685 const APInt &GapMask) const {
21686 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21687 "Invalid interleave factor");
21688 auto *SI = dyn_cast<StoreInst>(Store);
21689 if (!SI)
21690 return false;
21691 assert(!LaneMask && GapMask.popcount() == Factor &&
21692 "Unexpected mask on store");
21693
21694 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21695 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21696
21697 unsigned LaneLen = VecTy->getNumElements() / Factor;
21698 Type *EltTy = VecTy->getElementType();
21699 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21700
21701 const DataLayout &DL = SI->getDataLayout();
21702 Align Alignment = SI->getAlign();
21703
21704 // Skip if we do not have NEON and skip illegal vector types. We can
21705 // "legalize" wide vector types into multiple interleaved accesses as long as
21706 // the vector types are divisible by 128.
21707 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21708 return false;
21709
21710 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21711
21712 Value *Op0 = SVI->getOperand(0);
21713 Value *Op1 = SVI->getOperand(1);
21714 IRBuilder<> Builder(SI);
21715
21716 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21717 // vectors to integer vectors.
21718 if (EltTy->isPointerTy()) {
21719 Type *IntTy = DL.getIntPtrType(EltTy);
21720
21721 // Convert to the corresponding integer vector.
21722 auto *IntVecTy =
21724 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21725 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21726
21727 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21728 }
21729
21730 // The base address of the store.
21731 Value *BaseAddr = SI->getPointerOperand();
21732
21733 if (NumStores > 1) {
21734 // If we're going to generate more than one store, reset the lane length
21735 // and sub-vector type to something legal.
21736 LaneLen /= NumStores;
21737 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21738 }
21739
21740 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21741
21742 auto Mask = SVI->getShuffleMask();
21743
21744 auto createStoreIntrinsic = [&](Value *BaseAddr,
21745 SmallVectorImpl<Value *> &Shuffles) {
21746 if (Subtarget->hasNEON()) {
21747 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21748 Intrinsic::arm_neon_vst3,
21749 Intrinsic::arm_neon_vst4};
21750 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21751 Type *Tys[] = {PtrTy, SubVecTy};
21752
21754 Ops.push_back(BaseAddr);
21755 append_range(Ops, Shuffles);
21756 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21757 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21758 } else {
21759 assert((Factor == 2 || Factor == 4) &&
21760 "expected interleave factor of 2 or 4 for MVE");
21761 Intrinsic::ID StoreInts =
21762 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21763 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21764 Type *Tys[] = {PtrTy, SubVecTy};
21765
21767 Ops.push_back(BaseAddr);
21768 append_range(Ops, Shuffles);
21769 for (unsigned F = 0; F < Factor; F++) {
21770 Ops.push_back(Builder.getInt32(F));
21771 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21772 Ops.pop_back();
21773 }
21774 }
21775 };
21776
21777 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21778 // If we're generating more than one store, we compute the base address of
21779 // subsequent stores as an offset from the previous.
21780 if (StoreCount > 0)
21781 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21782 BaseAddr, LaneLen * Factor);
21783
21784 SmallVector<Value *, 4> Shuffles;
21785
21786 // Split the shufflevector operands into sub vectors for the new vstN call.
21787 for (unsigned i = 0; i < Factor; i++) {
21788 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21789 if (Mask[IdxI] >= 0) {
21790 Shuffles.push_back(Builder.CreateShuffleVector(
21791 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21792 } else {
21793 unsigned StartMask = 0;
21794 for (unsigned j = 1; j < LaneLen; j++) {
21795 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21796 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21797 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21798 break;
21799 }
21800 }
21801 // Note: If all elements in a chunk are undefs, StartMask=0!
21802 // Note: Filling undef gaps with random elements is ok, since
21803 // those elements were being written anyway (with undefs).
21804 // In the case of all undefs we're defaulting to using elems from 0
21805 // Note: StartMask cannot be negative, it's checked in
21806 // isReInterleaveMask
21807 Shuffles.push_back(Builder.CreateShuffleVector(
21808 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21809 }
21810 }
21811
21812 createStoreIntrinsic(BaseAddr, Shuffles);
21813 }
21814 return true;
21815}
21816
21817enum HABaseType {
21818 HA_UNKNOWN = 0,
21819 HA_FLOAT,
21820 HA_DOUBLE,
21821 HA_VECT64,
21822 HA_VECT128
21823};
21824
21825static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21826 uint64_t &Members) {
21827 if (auto *ST = dyn_cast<StructType>(Ty)) {
21828 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21829 uint64_t SubMembers = 0;
21830 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21831 return false;
21832 Members += SubMembers;
21833 }
21834 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21835 uint64_t SubMembers = 0;
21836 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21837 return false;
21838 Members += SubMembers * AT->getNumElements();
21839 } else if (Ty->isFloatTy()) {
21840 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21841 return false;
21842 Members = 1;
21843 Base = HA_FLOAT;
21844 } else if (Ty->isDoubleTy()) {
21845 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21846 return false;
21847 Members = 1;
21848 Base = HA_DOUBLE;
21849 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21850 Members = 1;
21851 switch (Base) {
21852 case HA_FLOAT:
21853 case HA_DOUBLE:
21854 return false;
21855 case HA_VECT64:
21856 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21857 case HA_VECT128:
21858 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21859 case HA_UNKNOWN:
21860 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21861 case 64:
21862 Base = HA_VECT64;
21863 return true;
21864 case 128:
21865 Base = HA_VECT128;
21866 return true;
21867 default:
21868 return false;
21869 }
21870 }
21871 }
21872
21873 return (Members > 0 && Members <= 4);
21874}
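// Illustrative examples: struct { float x, y, z; } is a homogeneous aggregate
// (Base = HA_FLOAT, Members = 3), as is float[4] with Members = 4, whereas
// struct { float f; double d; } mixes base types and float[5] exceeds the
// four-member limit, so both of those are rejected.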
21875
21876/// Return the correct alignment for the current calling convention.
21878 Type *ArgTy, const DataLayout &DL) const {
21879 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21880 if (!ArgTy->isVectorTy())
21881 return ABITypeAlign;
21882
21883 // Avoid over-aligning vector parameters. It would require realigning the
21884 // stack and waste space for no real benefit.
21885 MaybeAlign StackAlign = DL.getStackAlignment();
21886 assert(StackAlign && "data layout string is missing stack alignment");
21887 return std::min(ABITypeAlign, *StackAlign);
21888}
21889
21890/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21891/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21892/// passing according to AAPCS rules.
21894 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21895 const DataLayout &DL) const {
21896 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21897 CallingConv::ARM_AAPCS_VFP)
21898 return false;
21899
21900 HABaseType Base = HA_UNKNOWN;
21901 uint64_t Members = 0;
21902 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21903 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21904
21905 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21906 return IsHA || IsIntArray;
21907}
21908
21910 const Constant *PersonalityFn) const {
21911 // Platforms which do not use SjLj EH may return values in these registers
21912 // via the personality function.
21914 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21915}
21916
21918 const Constant *PersonalityFn) const {
21919 // Platforms which do not use SjLj EH may return values in these registers
21920 // via the personality function.
21922 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21923}
21924
21925void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21926 // Update IsSplitCSR in ARMFunctionInfo.
21927 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21928 AFI->setIsSplitCSR(true);
21929}
21930
21931void ARMTargetLowering::insertCopiesSplitCSR(
21932 MachineBasicBlock *Entry,
21933 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21934 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21935 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21936 if (!IStart)
21937 return;
21938
21939 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21940 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21941 MachineBasicBlock::iterator MBBI = Entry->begin();
21942 for (const MCPhysReg *I = IStart; *I; ++I) {
21943 const TargetRegisterClass *RC = nullptr;
21944 if (ARM::GPRRegClass.contains(*I))
21945 RC = &ARM::GPRRegClass;
21946 else if (ARM::DPRRegClass.contains(*I))
21947 RC = &ARM::DPRRegClass;
21948 else
21949 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21950
21951 Register NewVR = MRI->createVirtualRegister(RC);
21952 // Create copy from CSR to a virtual register.
21953 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21954 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21955 // nounwind. If we want to generalize this later, we may need to emit
21956 // CFI pseudo-instructions.
21957 assert(Entry->getParent()->getFunction().hasFnAttribute(
21958 Attribute::NoUnwind) &&
21959 "Function should be nounwind in insertCopiesSplitCSR!");
21960 Entry->addLiveIn(*I);
21961 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21962 .addReg(*I);
21963
21964 // Insert the copy-back instructions right before the terminator.
21965 for (auto *Exit : Exits)
21966 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21967 TII->get(TargetOpcode::COPY), *I)
21968 .addReg(NewVR);
21969 }
21970}
21971
21976
21977bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
21978 return Subtarget->hasMVEIntegerOps();
21979}
21980
21983 auto *VTy = dyn_cast<FixedVectorType>(Ty);
21984 if (!VTy)
21985 return false;
21986
21987 auto *ScalarTy = VTy->getScalarType();
21988 unsigned NumElements = VTy->getNumElements();
21989
21990 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21991 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
21992 return false;
21993
21994 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
21995 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
21996 return Subtarget->hasMVEFloatOps();
21997
21999 return false;
22000
22001 return Subtarget->hasMVEIntegerOps() &&
22002 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22003 ScalarTy->isIntegerTy(32));
22004}
22005
22008 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22009 Value *Accumulator) const {
22010
22012
22013 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22014
22015 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22016
22017 if (TyWidth > 128) {
22018 int Stride = Ty->getNumElements() / 2;
22019 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22020 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22021 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22022 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22023
22024 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22025 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22026 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22027 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22028 Value *LowerSplitAcc = nullptr;
22029 Value *UpperSplitAcc = nullptr;
22030
22031 if (Accumulator) {
22032 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22033 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22034 }
22035
22036 auto *LowerSplitInt = createComplexDeinterleavingIR(
22037 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22038 auto *UpperSplitInt = createComplexDeinterleavingIR(
22039 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22040
22041 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22042 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22043 }
22044
22045 auto *IntTy = Type::getInt32Ty(B.getContext());
22046
22047 ConstantInt *ConstRotation = nullptr;
22048 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22049 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22050
22051 if (Accumulator)
22052 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22053 {ConstRotation, Accumulator, InputB, InputA});
22054 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22055 {ConstRotation, InputB, InputA});
22056 }
22057
22058 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22059 // 1 means the value is not halved.
22060 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22061
22063 ConstRotation = ConstantInt::get(IntTy, 0);
22065 ConstRotation = ConstantInt::get(IntTy, 1);
22066
22067 if (!ConstRotation)
22068 return nullptr; // Invalid rotation for arm_mve_vcaddq
22069
22070 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22071 {ConstHalving, ConstRotation, InputA, InputB});
22072 }
22073
22074 return nullptr;
22075}
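// Illustrative sketch of the MVE path above (intrinsic mangling assumed): a
// partial complex multiply with rotation 90 on <8 x half> inputs becomes
// roughly
//   %r = call <8 x half> @llvm.arm.mve.vcmulq.v8f16(i32 1, <8 x half> %b, <8 x half> %a)
// and, when an accumulator is present, the vcmlaq form with the accumulator
// inserted after the rotation operand.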
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool isNegatedInteger(SDValue Op)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
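As a rough illustration of what such a mapping involves, here is a minimal sketch assuming the standard ISD::CondCode and ARMCC::CondCodes enums; the helper name mapIntCondCode is hypothetical, and the in-tree IntCCToARMCC also deals with additional (e.g. unordered) codes.

#include "Utils/ARMBaseInfo.h"          // ARMCC::CondCodes (ARM backend internal)
#include "llvm/CodeGen/ISDOpcodes.h"    // ISD::CondCode
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

// Hypothetical sketch, not the in-tree implementation.
static ARMCC::CondCodes mapIntCondCode(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;  // unsigned higher
  case ISD::SETUGE: return ARMCC::HS;  // unsigned higher or same
  case ISD::SETULT: return ARMCC::LO;  // unsigned lower
  case ISD::SETULE: return ARMCC::LS;  // unsigned lower or same
  default:          llvm_unreachable("unexpected integer condition code");
  }
}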
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
This file declares a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, Value *Offset, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
static bool isSigned(unsigned int Opcode)
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
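For reference, a minimal StringSwitch usage sketch; the helper name condSuffixToIndex and its mapping are illustrative, not taken from this file.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// Map a textual condition suffix to a small index, or -1 if unrecognised.
static int condSuffixToIndex(llvm::StringRef Suffix) {
  return llvm::StringSwitch<int>(Suffix)
      .Case("eq", 0)
      .Case("ne", 1)
      .Case("hs", 2)
      .Case("lo", 3)
      .Default(-1);
}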
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
static constexpr int Concat[]
bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5999
APInt bitcastToAPInt() const
Definition APFloat.h:1353
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1332
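A small self-contained sketch of the APFloat queries listed above; the value 4.0 and the helper name hasExactReciprocal are illustrative only.

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"

using namespace llvm;

// 4.0 has the exact reciprocal 0.25, so getExactInverse succeeds and the
// inverse's IEEE-754 bit pattern can be inspected via bitcastToAPInt().
static bool hasExactReciprocal() {
  APFloat Val(4.0), Inv(0.0);
  bool Exact = Val.getExactInverse(&Inv);
  APInt Bits = Inv.bitcastToAPInt();        // 64-bit double bit pattern
  return Exact && Bits.getBitWidth() == 64;
}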
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1201
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1761
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:475
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
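A short, self-contained sketch exercising a few of the APInt operations above; the values are arbitrary and the function name apintExamples is illustrative.

#include "llvm/ADT/APInt.h"

using namespace llvm;

static void apintExamples() {
  APInt Mask = APInt::getHighBitsSet(32, 8);         // 0xFF000000
  (void)Mask.countr_zero();                          // 24
  (void)Mask.popcount();                             // 8

  APInt Eight(32, 8);
  (void)Eight.isPowerOf2();                          // true
  (void)Eight.logBase2();                            // 3

  APInt Splat = APInt::getSplat(64, APInt(8, 0x0F)); // 0x0F byte repeated
  (void)Splat.isSubsetOf(APInt::getAllOnes(64));     // true
}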
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
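A minimal ArrayRef sketch; firstElementIsZero and the sample mask are illustrative only.

#include "llvm/ADT/ArrayRef.h"

using namespace llvm;

// ArrayRef is a non-owning view; size()/empty() describe the viewed storage.
static bool firstElementIsZero(ArrayRef<int> Mask) {
  return !Mask.empty() && Mask[0] == 0;
}

static bool arrayRefDemo() {
  int ShuffleMask[] = {0, 2, 1, 3};       // ArrayRef binds directly to C arrays
  return firstElementIsZero(ShuffleMask); // true
}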
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:277
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:198
bool isBigEndian() const
Definition DataLayout.h:199
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:228
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:286
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
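A minimal DenseMap sketch showing the usual find()/end() membership idiom; names and values are illustrative.

#include "llvm/ADT/DenseMap.h"

using namespace llvm;

static bool hasKey(const DenseMap<int, int> &M, int Key) {
  return M.find(Key) != M.end();
}

static void denseMapDemo() {
  DenseMap<int, int> Offsets;
  Offsets[4] = 16;              // operator[] default-constructs, then assigns
  (void)hasKey(Offsets, 4);     // true
  (void)Offsets.size();         // 1
  (void)Offsets.empty();        // false
}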
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
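A minimal FixedVectorType sketch; numLanesOfV4I32 is an illustrative helper, not part of this file.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Builds the IR type <4 x i32> and reads back its lane count.
static unsigned numLanesOfV4I32(LLVMContext &Ctx) {
  FixedVectorType *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  return VecTy->getNumElements();   // 4
}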
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
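The MVT helpers above compose directly; a short illustrative sketch with no target state involved:
  // <2 x i32> is a 64-bit vector whose scalar element type is i32.
  llvm::MVT VT = llvm::MVT::getVectorVT(llvm::MVT::i32, 2);
  assert(VT.is64BitVector());
  assert(VT.getVectorElementType() == llvm::MVT::i32);
  assert(VT.getFixedSizeInBits() == 64);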
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
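These MachineBasicBlock operations are the raw material of custom-inserter hooks. A hedged sketch of the usual "split after MI and rewire the CFG" pattern, where MBB (a MachineBasicBlock *) and MI (a MachineInstr &) are placeholders provided by the caller:
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(MBB->getBasicBlock());
  MF->insert(std::next(MachineFunction::iterator(MBB)), ContBB);
  // Move everything after MI into the continuation block, then fix the edges.
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);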
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a call frame.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
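A short sketch of the MachineFrameInfo calls above as they might appear while lowering arguments; MF, ArgSize and ArgOffset are hypothetical values, not ones computed in this file:
  MachineFrameInfo &MFI = MF.getFrameInfo();
  // Incoming stack argument: a fixed slot at a known offset from the incoming SP.
  int FixedFI = MFI.CreateFixedObject(ArgSize, ArgOffset, /*IsImmutable=*/true);
  // A statically sized temporary with 8-byte alignment.
  int TmpFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
  int64_t TmpSize = MFI.getObjectSize(TmpFI);
  (void)FixedFI; (void)TmpSize;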
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
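The MachineInstrBuilder methods above are normally chained off BuildMI (listed later in this index). A hedged sketch, with MBB, MI, DL, TII, Opcode, DestReg and SrcReg all standing in for values the real code already has:
  // Emit "DestReg = <Opcode> SrcReg, #4" at the insertion point MI.
  BuildMI(*MBB, MI, DL, TII->get(Opcode), DestReg)
      .addReg(SrcReg)
      .addImm(4);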
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
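In DAG combines these accessors are what gets inspected before a load is rewritten; a minimal sketch assuming Ld is a LoadSDNode * the caller has already matched:
  // Only consider simple (non-atomic, non-volatile), unindexed loads.
  if (Ld->isSimple() && Ld->isUnindexed()) {
    SDValue Chain = Ld->getChain();
    SDValue Base  = Ld->getBasePtr();
    EVT MemVT     = Ld->getMemoryVT();
    Align A       = Ld->getAlign();
    // ... decide from MemVT and A whether a replacement node is profitable ...
    (void)Chain; (void)Base; (void)MemVT; (void)A;
  }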
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
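A typical use of these SDNode/SDValue accessors is a small peephole inside a routine that returns an SDValue; a hedged sketch on a placeholder value N:
  // Fold (add x, 0) -> x, illustrating opcode/operand/constant queries.
  if (N.getOpcode() == ISD::ADD && N.getNumOperands() == 2 &&
      isNullConstant(N.getOperand(1)))
    return N.getOperand(0);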
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
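Most SelectionDAG factories above share one shape: opcode, SDLoc, result type and operands in, SDValue out. A small hedged sketch; DAG, dl, VT, A and B are placeholders from an enclosing lowering routine:
  // Build NOT(A + B) << 4 from primitive nodes.
  SDValue Sum  = DAG.getNode(ISD::ADD, dl, VT, A, B);
  SDValue Not  = DAG.getNOT(dl, Sum, VT);
  SDValue Four = DAG.getConstant(4, dl, VT);
  SDValue Shl  = DAG.getNode(ISD::SHL, dl, VT, Not, Four);
  (void)Shl;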
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:197
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
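A combined sketch of the small-container operations listed above; Op0 is a placeholder SDValue:
  llvm::SmallVector<SDValue, 8> Ops;
  llvm::SmallPtrSet<const SDNode *, 16> Visited;
  if (Visited.insert(Op0.getNode()).second)   // true only on first insertion
    Ops.push_back(Op0);                       // spills to the heap past 8 elements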
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:135
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
const unsigned char * bytes_begin() const
Definition StringRef.h:132
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
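The TargetLoweringBase setters above are what a target's ISelLowering constructor uses to describe its operations; a hedged fragment in that style, with Subtarget standing in for the target's subtarget object (the specific choices are illustrative, not a statement of ARM's actual configuration):
  addRegisterClass(MVT::i32, &ARM::GPRRegClass);        // i32 lives in core registers
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);   // expand combined div/rem
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); // custom-lowered by the target
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);      // no truncating f64 store
  setSchedulingPreference(Sched::Hybrid);
  computeRegisterProperties(Subtarget->getRegisterInfo());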
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:437
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
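A compact sketch of the IR Type queries above, assuming an LLVMContext &Ctx (a placeholder name):
  llvm::Type *I16 = llvm::Type::getInt16Ty(Ctx);
  assert(I16->isIntegerTy() && I16->getScalarSizeInBits() == 16);
  llvm::Type *V8I16 = llvm::FixedVectorType::get(I16, 8);
  assert(V8I16->isVectorTy() && V8I16->getScalarType() == I16);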
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:130
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
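The ARM_AM immediate helpers above return the encoding on success and -1 when a value cannot be represented; a hedged sketch of the usual checks:
  // 0xFF00 is 0xFF rotated, so it fits an ARM-mode shifter_operand immediate.
  bool ARMImmOK = ARM_AM::getSOImmVal(0xFF00) != -1;
  // Thumb-2 has its own encoding (e.g. the replicated 0x00ab00ab pattern).
  bool T2ImmOK = ARM_AM::getT2SOImmVal(0x00FF00FF) != -1;
  (void)ARMImmOK; (void)T2ImmOK;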
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
void stable_sort(R &&Range)
Definition STLExtras.h:2038
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
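A minimal sketch of these range wrappers applied to a shuffle mask (values chosen only for illustration):
  SmallVector<int, 8> Mask = {0, 2, 4, 6};
  bool AllEven  = all_of(Mask, [](int M) { return M % 2 == 0; }); // true
  bool HasUndef = any_of(Mask, [](int M) { return M < 0; });      // false
  auto It       = find(Mask, 4);  // iterator to the third element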
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:279
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
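A hedged sketch of how a CCAssignFn such as CC_ARM_AAPCS is consumed, assuming CallConv, isVarArg, MF, DAG and Ins exist as they would inside a typical LowerFormalArguments implementation:
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CC_ARM_AAPCS);  // fills ArgLocs with locations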
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:264
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1518
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
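A sketch of the standard pattern for emitting a predicable ARM instruction with these helpers, assuming MBB, MI, DL, TII, DestReg and SrcReg come from the surrounding lowering code:
  BuildMI(MBB, MI, DL, TII->get(ARM::MOVr), DestReg)
      .addReg(SrcReg)
      .add(predOps(ARMCC::AL)) // "always" predicate, no predicate register
      .add(condCodeOp());      // optional CPSR definition left empty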
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:276
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
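A few worked values for these bit-manipulation helpers (assertions shown only as a sketch):
  assert(isPowerOf2_32(64u) && Log2_32(64u) == 6);
  assert(isMask_32(0x000000FFu) && isShiftedMask_32(0x00FF0000u));
  assert(countr_zero(0x8u) == 3 && countl_zero(0x8u) == 28);
  assert(isUIntN(12, 4095u) && !isUIntN(12, 4096u));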
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
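A hedged sketch of how these constant predicates are typically used on an operand N1 (an SDValue) inside a combine:
  if (isNullConstant(N1)) {
    // x op 0
  } else if (isAllOnesConstant(N1)) {
    // x op -1
  } else if (ConstantSDNode *C = isConstOrConstSplat(N1)) {
    uint64_t Imm = C->getZExtValue(); // scalar constant or splat element
    (void)Imm;
  }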
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:603
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
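Two worked values for the alignment helpers, shown only as a sketch:
  uint64_t Padded = alignTo(10, Align(8));         // 16
  Align Common    = commonAlignment(Align(16), 4); // Align(4): the offset limits it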
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
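A minimal sketch of the mask this helper produces:
  SmallVector<int, 16> Mask =
      createSequentialMask(/*Start=*/2, /*NumInts=*/4, /*NumUndefs=*/2);
  // Mask == {2, 3, 4, 5, -1, -1}  (-1 denotes an undef lane)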
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:207
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:760
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
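A short sketch exercising the EVT queries above, assuming Ctx is an LLVMContext:
  EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4); // v4i32
  assert(VT.is128BitVector() && VT.getVectorNumElements() == 4);
  assert(VT.isInteger() && VT.getScalarSizeInBits() == 32);
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx); // v2i32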
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
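A hedged sketch of propagating KnownBits through an addition and a zero extension:
  KnownBits A = KnownBits::makeConstant(APInt(8, 0x0F));
  KnownBits B(8);                        // nothing known about B yet
  assert(B.isUnknown());
  KnownBits Sum  = KnownBits::add(A, B); // still width 8; B keeps the sum unknown
  KnownBits Wide = Sum.zext(16);         // the extended top bits become known zero
  assert(Wide.getBitWidth() == 16);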
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
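A sketch of tagging a load with one of these pointer-info factories, assuming DAG, dl, Chain and CPAddr exist as in a typical lowering routine:
  SDValue Load = DAG.getLoad(
      MVT::i32, dl, Chain, CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));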
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
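A hedged sketch of the usual CallLoweringInfo setter chain inside a TargetLowering method; RetTy, PtrVT, dl, Chain and DAG are assumed from the surrounding code, and the callee symbol name is purely hypothetical:
  TargetLowering::ArgListTy Args;        // arguments would be appended here
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, RetTy,
      DAG.getExternalSymbol("__hypothetical_helper", PtrVT), std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);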
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...