1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
86#include "llvm/IR/Type.h"
87#include "llvm/IR/User.h"
88#include "llvm/IR/Value.h"
89#include "llvm/MC/MCInstrDesc.h"
91#include "llvm/MC/MCSchedule.h"
98#include "llvm/Support/Debug.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <limits>
112#include <optional>
113#include <tuple>
114#include <utility>
115#include <vector>
116
117using namespace llvm;
118
119#define DEBUG_TYPE "arm-isel"
120
121STATISTIC(NumTailCalls, "Number of tail calls");
122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124STATISTIC(NumConstpoolPromoted,
125 "Number of constants with their storage promoted into constant pools");
126
127static cl::opt<bool>
128ARMInterworking("arm-interworking", cl::Hidden,
129 cl::desc("Enable / disable ARM interworking (for debugging only)"),
130 cl::init(true));
131
133 "arm-promote-constant", cl::Hidden,
134 cl::desc("Enable / disable promotion of unnamed_addr constants into "
135 "constant pools"),
136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
138 "arm-promote-constant-max-size", cl::Hidden,
139 cl::desc("Maximum size of constant to promote into a constant pool"),
140 cl::init(64));
142 "arm-promote-constant-max-total", cl::Hidden,
143 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144 cl::init(128));
145
147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149 cl::init(2));
150
152 "arm-max-base-updates-to-check", cl::Hidden,
153 cl::desc("Maximum number of base-updates to check generating postindex."),
154 cl::init(64));
155
156/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
157constexpr MVT FlagsVT = MVT::i32;
158
159// The APCS parameter registers.
160static const MCPhysReg GPRArgRegs[] = {
161 ARM::R0, ARM::R1, ARM::R2, ARM::R3
162};
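// Under both APCS and AAPCS, the first four word-sized integer arguments are
// passed in r0-r3; anything beyond that goes on the stack.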
163
165 SelectionDAG &DAG, const SDLoc &DL) {
167 assert(Arg.ArgVT.bitsLT(MVT::i32));
168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
169 SDValue Ext =
171 MVT::i32, Trunc);
172 return Ext;
173}
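// This helper (handleCMSEValue, used from LowerCallResult below) re-extends
// values narrower than 32 bits returned from CMSE non-secure calls, since the
// callee's own extension cannot be trusted.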
174
175void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
176 if (VT != PromotedLdStVT) {
178 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
179
181 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
182 }
183
184 MVT ElemTy = VT.getVectorElementType();
185 if (ElemTy != MVT::f64)
189 if (ElemTy == MVT::i32) {
194 } else {
199 }
208 if (VT.isInteger()) {
212 }
213
214 // Neon does not support vector divide/remainder operations.
223
224 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
225 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
227 setOperationAction(Opcode, VT, Legal);
228 if (!VT.isFloatingPoint())
229 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
230 setOperationAction(Opcode, VT, Legal);
231}
232
233void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPRRegClass);
235 addTypeForNEON(VT, MVT::f64);
236}
237
238void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
239 addRegisterClass(VT, &ARM::DPairRegClass);
240 addTypeForNEON(VT, MVT::v2f64);
241}
242
243void ARMTargetLowering::setAllExpand(MVT VT) {
244 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
246
247 // We support these really simple operations even on types where all
248 // the actual arithmetic has to be broken down into simpler
249 // operations or turned into library calls.
254}
255
256void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
257 LegalizeAction Action) {
258 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
260 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
261}
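// For example, addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal) further down marks
// any-, zero- and sign-extending loads from v4i8 to v4i32 as legal in one go.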
262
263void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
264 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
265
266 for (auto VT : IntTypes) {
267 addRegisterClass(VT, &ARM::MQPRRegClass);
297
298 // No native support for these.
308
309 // Vector reductions
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre and Post inc are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
337 }
338 }
339
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether or not we have MVE.fp
359
360 // Pre and Post inc are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
367 }
368
369 if (HasMVEFP) {
382
383 // No native support for these.
398 }
399 }
400
401 // Custom-expand smaller-than-legal vector reductions to prevent false zero
402 // items being added.
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
413 // MVE integer-only / float support. Only FP data processing on the FP vector
414 // types is inhibited at the integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v4i8 to v4i16 or v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre and Post inc on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
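// In short: the 128-bit MVE vector types live in MQPR, the i1 predicate types
// live in VCCR, and anything the hardware cannot do natively is expanded or
// custom-lowered above.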
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (TT.isOSBinFormatMachO()) {
523 // Use VFP for Thumb libfuncs if available.
524 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
525 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
526 // clang-format off
527 static const struct {
528 const RTLIB::Libcall Op;
529 const RTLIB::LibcallImpl Impl;
530 } LibraryCalls[] = {
531 // Single-precision floating-point arithmetic.
532 { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
533 { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
534 { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
535 { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
536
537 // Double-precision floating-point arithmetic.
538 { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
539 { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
540 { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
541 { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
542
543 // Single-precision comparisons.
544 { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
545 { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
546 { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
547 { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
548 { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
549 { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
550 { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
551
552 // Double-precision comparisons.
553 { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
554 { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
555 { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
556 { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
557 { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
558 { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
559 { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
560
561 // Floating-point to integer conversions.
562 // i64 conversions are done via library routines even when generating VFP
563 // instructions, so use the same ones.
564 { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
565 { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
566 { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
567 { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
568
569 // Conversions between floating types.
570 { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
571 { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
572
573 // Integer to floating-point conversions.
574 // i64 conversions are done via library routines even when generating VFP
575 // instructions, so use the same ones.
576 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
577 // e.g., __floatunsidf vs. __floatunssidfvfp.
578 { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
579 { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
580 { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
581 { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
582 };
583 // clang-format on
584
585 for (const auto &LC : LibraryCalls)
586 setLibcallImpl(LC.Op, LC.Impl);
587 }
588 }
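  // With the table above, a runtime call for e.g. RTLIB::ADD_F32 is emitted as
  // __addsf3vfp rather than the default __addsf3 (and similarly for the other
  // entries).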
589
590 if (Subtarget->isThumb1Only())
591 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
592 else
593 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
594
595 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
596 Subtarget->hasFPRegs()) {
597 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
598 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
599
604
605 if (!Subtarget->hasVFP2Base())
606 setAllExpand(MVT::f32);
607 if (!Subtarget->hasFP64())
608 setAllExpand(MVT::f64);
609 }
610
611 if (Subtarget->hasFullFP16()) {
612 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
615
618 }
619
620 if (Subtarget->hasBF16()) {
621 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
622 setAllExpand(MVT::bf16);
623 if (!Subtarget->hasFullFP16())
625 } else {
630 }
631
633 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
634 setTruncStoreAction(VT, InnerVT, Expand);
635 addAllExtLoads(VT, InnerVT, Expand);
636 }
637
640
642 }
643
644 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
646
647 if (!Subtarget->hasV8_1MMainlineOps())
649
652
655
656 if (Subtarget->hasMVEIntegerOps())
657 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
658
659 // Combine low-overhead loop intrinsics so that we can lower i1 types.
660 if (Subtarget->hasLOB()) {
662 }
663
664 if (Subtarget->hasNEON()) {
665 addDRTypeForNEON(MVT::v2f32);
666 addDRTypeForNEON(MVT::v8i8);
667 addDRTypeForNEON(MVT::v4i16);
668 addDRTypeForNEON(MVT::v2i32);
669 addDRTypeForNEON(MVT::v1i64);
670
671 addQRTypeForNEON(MVT::v4f32);
672 addQRTypeForNEON(MVT::v2f64);
673 addQRTypeForNEON(MVT::v16i8);
674 addQRTypeForNEON(MVT::v8i16);
675 addQRTypeForNEON(MVT::v4i32);
676 addQRTypeForNEON(MVT::v2i64);
677
678 if (Subtarget->hasFullFP16()) {
679 addQRTypeForNEON(MVT::v8f16);
680 addDRTypeForNEON(MVT::v4f16);
681 }
682
683 if (Subtarget->hasBF16()) {
684 addQRTypeForNEON(MVT::v8bf16);
685 addDRTypeForNEON(MVT::v4bf16);
686 }
687 }
688
689 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
690 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
691 // none of Neon, MVE or VFP supports any arithmetic operations on it.
692 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
693 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
694 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
695 // FIXME: Code duplication: FDIV and FREM are expanded always, see
696 // ARMTargetLowering::addTypeForNEON method for details.
697 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
698 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
699 // FIXME: Create unittest.
700 // In other words, find a case where "copysign" appears in the DAG with
701 // vector operands.
703 // FIXME: Code duplication: SETCC has custom operation action, see
704 // ARMTargetLowering::addTypeForNEON method for details.
706 // FIXME: Create unittest for FNEG and for FABS.
707 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
708 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
710 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
711 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
712 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
713 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
714 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
717 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
726 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
727 }
728
729 if (Subtarget->hasNEON()) {
730 // Likewise for v4f32. But keep in mind that vadd, vsub and vmul are
731 // natively supported for v4f32.
733 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
734 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
735 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
736 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
737 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
740 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
749
750 // Mark the v2f32 versions of these operations as Expand too.
752 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
753 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
754 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
755 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
756 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
759 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
768
771 setOperationAction(Op, MVT::v4f16, Expand);
772 setOperationAction(Op, MVT::v8f16, Expand);
773 }
774
775 // Neon does not support some operations on v1i64 and v2i64 types.
776 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
777 // Custom handling for some quad-vector types to detect VMULL.
778 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
779 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
780 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
781 // Custom handling for some vector types to avoid expensive expansions
782 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
784 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
786 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
787 // a destination type that is wider than the source, and nor does
788 // it have a FP_TO_[SU]INT instruction with a narrower destination than
789 // source.
798
801
802 // NEON does not have single instruction CTPOP for vectors with element
803 // types wider than 8-bits. However, custom lowering can leverage the
804 // v8i8/v16i8 vcnt instruction.
811
812 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
813 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
814
815 // NEON does not have single instruction CTTZ for vectors.
817 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
818 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
819 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
820
821 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
822 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
823 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
824 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
825
830
835
839 }
840
841 // NEON only has FMA instructions as of VFP4.
842 if (!Subtarget->hasVFP4Base()) {
843 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
844 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
845 }
846
849
850 // It is legal to extload from v4i8 to v4i16 or v4i32.
851 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
852 MVT::v2i32}) {
857 }
858 }
859
860 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
861 MVT::v4i32}) {
866 }
867 }
868
869 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
876 }
877 if (Subtarget->hasMVEIntegerOps()) {
880 ISD::SETCC});
881 }
882 if (Subtarget->hasMVEFloatOps()) {
884 }
885
886 if (!Subtarget->hasFP64()) {
887 // When targeting a floating-point unit with only single-precision
888 // operations, f64 is legal for the few double-precision instructions which
889 // are present. However, no double-precision operations other than moves,
890 // loads and stores are provided by the hardware.
929 }
930
931 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
934 if (Subtarget->hasFullFP16()) {
937 }
938 }
939
940 if (!Subtarget->hasFP16()) {
943 }
944
946
947 // ARM does not have floating-point extending loads.
948 for (MVT VT : MVT::fp_valuetypes()) {
949 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
950 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
951 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
952 }
953
954 // ... or truncating stores
955 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
956 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
957 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
958 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
959 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
960
961 // ARM does not have i1 sign extending load.
962 for (MVT VT : MVT::integer_valuetypes())
964
965 // ARM supports all 4 flavors of integer indexed load / store.
966 if (!Subtarget->isThumb1Only()) {
967 for (unsigned im = (unsigned)ISD::PRE_INC;
969 setIndexedLoadAction(im, MVT::i1, Legal);
970 setIndexedLoadAction(im, MVT::i8, Legal);
971 setIndexedLoadAction(im, MVT::i16, Legal);
972 setIndexedLoadAction(im, MVT::i32, Legal);
973 setIndexedStoreAction(im, MVT::i1, Legal);
974 setIndexedStoreAction(im, MVT::i8, Legal);
975 setIndexedStoreAction(im, MVT::i16, Legal);
976 setIndexedStoreAction(im, MVT::i32, Legal);
977 }
978 } else {
979 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
982 }
983
988
991 if (Subtarget->hasDSP()) {
1000 }
1001 if (Subtarget->hasBaseDSP()) {
1004 }
1005
1006 // i64 operation support.
1009 if (Subtarget->isThumb1Only()) {
1012 }
1013 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1014 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1016
1026
1027 // MVE lowers 64 bit shifts to lsll and lsrl
1028 // assuming that ISD::SRL and SRA of i64 are already marked custom
1029 if (Subtarget->hasMVEIntegerOps())
1031
1032 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1033 if (Subtarget->isThumb1Only()) {
1037 }
1038
1039 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1041
1042 // ARM does not have ROTL.
1047 }
1049 // TODO: These two should be set to LibCall, but this currently breaks
1050 // the Linux kernel build. See #101786.
1053 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1056 }
1057
1058 // @llvm.readcyclecounter requires the Performance Monitors extension.
1059 // Default to the 0 expansion on unsupported platforms.
1060 // FIXME: Technically there are older ARM CPUs that have
1061 // implementation-specific ways of obtaining this information.
1062 if (Subtarget->hasPerfMon())
1064
1065 // Only ARMv6 has BSWAP.
1066 if (!Subtarget->hasV6Ops())
1068
1069 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1070 : Subtarget->hasDivideInARMMode();
1071 if (!hasDivide) {
1072 // These are expanded into libcalls if the cpu doesn't have HW divider.
1075 }
1076
1077 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1080
1083 }
1084
1087
1088 // Register based DivRem for AEABI (RTABI 4.2)
1089 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1090 TT.isTargetMuslAEABI() || TT.isOSWindows()) {
1093 HasStandaloneRem = false;
1094
1099 } else {
1102 }
1103
1108
1109 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1111
1112 // Use the default implementation.
1114 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1116 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1119
1120 if (TT.isOSWindows())
1122 else
1124
1125 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1126 // the default expansion.
1127 InsertFencesForAtomic = false;
1128 if (Subtarget->hasAnyDataBarrier() &&
1129 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1130 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1131 // to ldrex/strex loops already.
1133 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1135
1136 // On v8, we have particularly efficient implementations of atomic fences
1137 // if they can be combined with nearby atomic loads and stores.
1138 if (!Subtarget->hasAcquireRelease() ||
1139 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1140 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1141 InsertFencesForAtomic = true;
1142 }
1143 } else {
1144 // If there's anything we can use as a barrier, go through custom lowering
1145 // for ATOMIC_FENCE.
1146 // If target has DMB in thumb, Fences can be inserted.
1147 if (Subtarget->hasDataBarrier())
1148 InsertFencesForAtomic = true;
1149
1151 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1152
1153 // Set them all for libcall, which will force libcalls.
1166 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1167 // Unordered/Monotonic case.
1168 if (!InsertFencesForAtomic) {
1171 }
1172 }
1173
1174 // Compute supported atomic widths.
1175 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1176 // For targets where __sync_* routines are reliably available, we use them
1177 // if necessary.
1178 //
1179 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1180 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1181 //
1182 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1183 // such targets should provide __sync_* routines, which use the ARM mode
1184 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1185 // encoding; see ARMISD::MEMBARRIER_MCR.)
1187 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1188 Subtarget->hasForced32BitAtomics()) {
1189 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1191 } else {
1192 // We can't assume anything about other targets; just use libatomic
1193 // routines.
1195 }
1196
1198
1200
1201 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1202 if (!Subtarget->hasV6Ops()) {
1205 }
1207
1208 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1209 !Subtarget->isThumb1Only()) {
1210 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1211 // iff target supports vfp2.
1221 }
1222
1223 // We want to custom lower some of our intrinsics.
1228
1238 if (Subtarget->hasFullFP16()) {
1242 }
1243
1245
1248 if (Subtarget->hasFullFP16())
1252 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1253
1254 // We don't support sin/cos/fmod/copysign/pow
1263 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1264 !Subtarget->isThumb1Only()) {
1267 }
1270
1271 if (!Subtarget->hasVFP4Base()) {
1274 }
1275
1276 // Various VFP goodness
1277 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1278 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1279 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1282 }
1283
1284 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1285 if (!Subtarget->hasFP16()) {
1288 }
1289
1290 // Strict floating-point comparisons need custom lowering.
1297 }
1298
1299 // Use __sincos_stret if available.
1300 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1301 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1304 }
1305
1306 // FP-ARMv8 implements a lot of rounding-like FP operations.
1307 if (Subtarget->hasFPARMv8Base()) {
1317 if (Subtarget->hasNEON()) {
1322 }
1323
1324 if (Subtarget->hasFP64()) {
1334 }
1335 }
1336
1337 // FP16 often need to be promoted to call lib functions
1338 if (Subtarget->hasFullFP16()) {
1353
1361 }
1362
1363 if (Subtarget->hasNEON()) {
1364 // vmin and vmax aren't available in a scalar form, so we can use
1365 // a NEON instruction with an undef lane instead.
1374
1375 if (Subtarget->hasV8Ops()) {
1376 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1377 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1378 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1379 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1382 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1383 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1384 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1385 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1386 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1387 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1388 }
1389
1390 if (Subtarget->hasFullFP16()) {
1395
1400
1401 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1402 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1403 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1404 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1407 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1408 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1409 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1410 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1411 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1412 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1413 }
1414 }
1415
1416 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1417 // it, but it's just a wrapper around ldexp.
1418 if (TT.isOSWindows()) {
1420 if (isOperationExpand(Op, MVT::f32))
1421 setOperationAction(Op, MVT::f32, Promote);
1422 }
1423
1424 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1425 // isn't legal.
1427 if (isOperationExpand(Op, MVT::f16))
1428 setOperationAction(Op, MVT::f16, Promote);
1429
1430 // We have target-specific dag combine patterns for the following nodes:
1431 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1434
1435 if (Subtarget->hasMVEIntegerOps())
1437
1438 if (Subtarget->hasV6Ops())
1440 if (Subtarget->isThumb1Only())
1442 // Attempt to lower smin/smax to ssat/usat
1443 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1444 Subtarget->isThumb2()) {
1446 }
1447
1449
1450 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1451 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1453 else
1455
1456 //// temporary - rewrite interface to use type
1459 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1461 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1463
1464 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1465 // are at least 4 bytes aligned.
1467
1468 // Prefer likely predicted branches to selects on out-of-order cores.
1469 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1470
1473 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1474
1475 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1476}
1477
1479 return Subtarget->useSoftFloat();
1480}
1481
1483 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1484}
1485
1486// FIXME: It might make sense to define the representative register class as the
1487// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1488// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1489// SPR's representative would be DPR_VFP2. This should work well if register
1490// pressure tracking were modified such that a register use would increment the
1491// pressure of the register class's representative and all of its super
1492// classes' representatives transitively. We have not implemented this because
1493// of the difficulty prior to coalescing of modeling operand register classes
1494// due to the common occurrence of cross class copies and subregister insertions
1495// and extractions.
1496std::pair<const TargetRegisterClass *, uint8_t>
1498 MVT VT) const {
1499 const TargetRegisterClass *RRC = nullptr;
1500 uint8_t Cost = 1;
1501 switch (VT.SimpleTy) {
1502 default:
1504 // Use DPR as representative register class for all floating point
1505 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1506 // the cost is 1 for both f32 and f64.
1507 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1508 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1509 RRC = &ARM::DPRRegClass;
1510 // When NEON is used for SP, only half of the register file is available
1511 // because operations that define both SP and DP results will be constrained
1512 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1513 // coalescing by double-counting the SP regs. See the FIXME above.
1514 if (Subtarget->useNEONForSinglePrecisionFP())
1515 Cost = 2;
1516 break;
1517 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1518 case MVT::v4f32: case MVT::v2f64:
1519 RRC = &ARM::DPRRegClass;
1520 Cost = 2;
1521 break;
1522 case MVT::v4i64:
1523 RRC = &ARM::DPRRegClass;
1524 Cost = 4;
1525 break;
1526 case MVT::v8i64:
1527 RRC = &ARM::DPRRegClass;
1528 Cost = 8;
1529 break;
1530 }
1531 return std::make_pair(RRC, Cost);
1532}
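// For example, v8i64 spans eight consecutive D registers, hence the cost of 8
// against the DPR representative class; plain f32/f64 cost 1 (or 2 when NEON
// is used for single precision, as noted above).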
1533
1534const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1535#define MAKE_CASE(V) \
1536 case V: \
1537 return #V;
1538 switch ((ARMISD::NodeType)Opcode) {
1540 break;
1743#undef MAKE_CASE
1744 }
1745 return nullptr;
1746}
1747
1749 EVT VT) const {
1750 if (!VT.isVector())
1751 return getPointerTy(DL);
1752
1753 // MVE has a predicate register.
1754 if ((Subtarget->hasMVEIntegerOps() &&
1755 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1756 VT == MVT::v16i8)) ||
1757 (Subtarget->hasMVEFloatOps() &&
1758 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1759 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1761}
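// So with MVE, vector compares produce the narrow predicate types
// (v16i1/v8i1/v4i1/v2i1) that live in VCCR rather than full-width vectors.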
1762
1763/// getRegClassFor - Return the register class that should be used for the
1764/// specified value type.
1765const TargetRegisterClass *
1766ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1767 (void)isDivergent;
1768 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1769 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1770 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1771 // MVE Q registers.
1772 if (Subtarget->hasNEON()) {
1773 if (VT == MVT::v4i64)
1774 return &ARM::QQPRRegClass;
1775 if (VT == MVT::v8i64)
1776 return &ARM::QQQQPRRegClass;
1777 }
1778 if (Subtarget->hasMVEIntegerOps()) {
1779 if (VT == MVT::v4i64)
1780 return &ARM::MQQPRRegClass;
1781 if (VT == MVT::v8i64)
1782 return &ARM::MQQQQPRRegClass;
1783 }
1785}
1786
1787// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1788// source/dest is aligned and the copy size is large enough. We therefore want
1789// to align such objects passed to memory intrinsics.
1791 Align &PrefAlign) const {
1792 if (!isa<MemIntrinsic>(CI))
1793 return false;
1794 MinSize = 8;
1795 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1796 // cycle faster than 4-byte aligned LDM.
1797 PrefAlign =
1798 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1799 return true;
1800}
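// In effect, memcpy/memset-style intrinsics of at least 8 bytes get their
// pointer arguments aligned to 8 bytes on v6+ non-M-class cores (4 bytes
// elsewhere), which makes the faster aligned LDM/STM forms usable.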
1801
1802// Create a fast isel object.
1803FastISel *
1805 const TargetLibraryInfo *libInfo) const {
1806 return ARM::createFastISel(funcInfo, libInfo);
1807}
1808
1810 unsigned NumVals = N->getNumValues();
1811 if (!NumVals)
1812 return Sched::RegPressure;
1813
1814 for (unsigned i = 0; i != NumVals; ++i) {
1815 EVT VT = N->getValueType(i);
1816 if (VT == MVT::Glue || VT == MVT::Other)
1817 continue;
1818 if (VT.isFloatingPoint() || VT.isVector())
1819 return Sched::ILP;
1820 }
1821
1822 if (!N->isMachineOpcode())
1823 return Sched::RegPressure;
1824
1825 // Loads are scheduled for latency even if their instruction itinerary
1826 // is not available.
1827 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1828 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1829
1830 if (MCID.getNumDefs() == 0)
1831 return Sched::RegPressure;
1832 if (!Itins->isEmpty() &&
1833 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1834 return Sched::ILP;
1835
1836 return Sched::RegPressure;
1837}
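// Summary of the heuristic: prefer ILP scheduling for nodes producing FP or
// vector values, or machine nodes whose first def has a latency above 2
// cycles; otherwise schedule for register pressure.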
1838
1839//===----------------------------------------------------------------------===//
1840// Lowering Code
1841//===----------------------------------------------------------------------===//
1842
1843static bool isSRL16(const SDValue &Op) {
1844 if (Op.getOpcode() != ISD::SRL)
1845 return false;
1846 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1847 return Const->getZExtValue() == 16;
1848 return false;
1849}
1850
1851static bool isSRA16(const SDValue &Op) {
1852 if (Op.getOpcode() != ISD::SRA)
1853 return false;
1854 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1855 return Const->getZExtValue() == 16;
1856 return false;
1857}
1858
1859static bool isSHL16(const SDValue &Op) {
1860 if (Op.getOpcode() != ISD::SHL)
1861 return false;
1862 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1863 return Const->getZExtValue() == 16;
1864 return false;
1865}
1866
1867// Check for a signed 16-bit value. We special-case SRA because it keeps
1868// things simpler when also looking for SRAs that aren't sign-extending a
1869// smaller value. Without the check, we'd need to take extra care with
1870// checking order for some operations.
1871static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1872 if (isSRA16(Op))
1873 return isSHL16(Op.getOperand(0));
1874 return DAG.ComputeNumSignBits(Op) == 17;
1875}
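// Note: for a 32-bit value, ComputeNumSignBits(Op) == 17 means the top 17
// bits are known to be copies of the sign bit, i.e. the value fits in a
// signed 16-bit range.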
1876
1877/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1879 switch (CC) {
1880 default: llvm_unreachable("Unknown condition code!");
1881 case ISD::SETNE: return ARMCC::NE;
1882 case ISD::SETEQ: return ARMCC::EQ;
1883 case ISD::SETGT: return ARMCC::GT;
1884 case ISD::SETGE: return ARMCC::GE;
1885 case ISD::SETLT: return ARMCC::LT;
1886 case ISD::SETLE: return ARMCC::LE;
1887 case ISD::SETUGT: return ARMCC::HI;
1888 case ISD::SETUGE: return ARMCC::HS;
1889 case ISD::SETULT: return ARMCC::LO;
1890 case ISD::SETULE: return ARMCC::LS;
1891 }
1892}
1893
1894/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1896 ARMCC::CondCodes &CondCode2) {
1897 CondCode2 = ARMCC::AL;
1898 switch (CC) {
1899 default: llvm_unreachable("Unknown FP condition!");
1900 case ISD::SETEQ:
1901 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1902 case ISD::SETGT:
1903 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1904 case ISD::SETGE:
1905 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1906 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1907 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1908 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1909 case ISD::SETO: CondCode = ARMCC::VC; break;
1910 case ISD::SETUO: CondCode = ARMCC::VS; break;
1911 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1912 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1913 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1914 case ISD::SETLT:
1915 case ISD::SETULT: CondCode = ARMCC::LT; break;
1916 case ISD::SETLE:
1917 case ISD::SETULE: CondCode = ARMCC::LE; break;
1918 case ISD::SETNE:
1919 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1920 }
1921}
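// Predicates with no single ARM equivalent (e.g. SETONE, SETUEQ) come back as
// a pair of condition codes; the caller then tests both, typically with two
// predicated instructions.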
1922
1923//===----------------------------------------------------------------------===//
1924// Calling Convention Implementation
1925//===----------------------------------------------------------------------===//
1926
1927/// getEffectiveCallingConv - Get the effective calling convention, taking into
1928/// account the presence of floating-point hardware and calling convention
1929/// limitations, such as support for variadic functions.
1931ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1932 bool isVarArg) const {
1933 switch (CC) {
1934 default:
1935 report_fatal_error("Unsupported calling convention");
1938 case CallingConv::GHC:
1940 return CC;
1946 case CallingConv::Swift:
1949 case CallingConv::C:
1950 case CallingConv::Tail:
1951 if (!getTM().isAAPCS_ABI())
1952 return CallingConv::ARM_APCS;
1953 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1954 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1955 !isVarArg)
1957 else
1959 case CallingConv::Fast:
1961 if (!getTM().isAAPCS_ABI()) {
1962 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1963 return CallingConv::Fast;
1964 return CallingConv::ARM_APCS;
1965 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1966 !isVarArg)
1968 else
1970 }
1971}
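// Roughly: on an AAPCS target with FP registers and a hard-float ABI, a
// non-variadic C call is treated as ARM_AAPCS_VFP; otherwise it falls back to
// ARM_AAPCS (or ARM_APCS on pre-AAPCS targets).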
1972
1974 bool isVarArg) const {
1975 return CCAssignFnForNode(CC, false, isVarArg);
1976}
1977
1979 bool isVarArg) const {
1980 return CCAssignFnForNode(CC, true, isVarArg);
1981}
1982
1983/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1984/// CallingConvention.
1985CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1986 bool Return,
1987 bool isVarArg) const {
1988 switch (getEffectiveCallingConv(CC, isVarArg)) {
1989 default:
1990 report_fatal_error("Unsupported calling convention");
1992 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1994 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1996 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1997 case CallingConv::Fast:
1998 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1999 case CallingConv::GHC:
2000 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2002 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2004 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2006 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2007 }
2008}
2009
2010SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2011 MVT LocVT, MVT ValVT, SDValue Val) const {
2012 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2013 Val);
2014 if (Subtarget->hasFullFP16()) {
2015 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2016 } else {
2017 Val = DAG.getNode(ISD::TRUNCATE, dl,
2018 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2019 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2020 }
2021 return Val;
2022}
2023
2024SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2025 MVT LocVT, MVT ValVT,
2026 SDValue Val) const {
2027 if (Subtarget->hasFullFP16()) {
2028 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2029 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2030 } else {
2031 Val = DAG.getNode(ISD::BITCAST, dl,
2032 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2033 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2034 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2035 }
2036 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2037}
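// MoveToHPR/MoveFromHPR convert between an f16/bf16 value and the wider i32
// (or f32) location type used by the calling convention: a single
// VMOVhr/VMOVrh when +fullfp16 is available, otherwise a bitcast plus integer
// truncate/extend.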
2038
2039/// LowerCallResult - Lower the result values of a call into the
2040/// appropriate copies out of appropriate physical registers.
2041SDValue ARMTargetLowering::LowerCallResult(
2042 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2043 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2044 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2045 SDValue ThisVal, bool isCmseNSCall) const {
2046 // Assign locations to each value returned by this call.
2048 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2049 *DAG.getContext());
2050 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2051
2052 // Copy all of the result registers out of their specified physreg.
2053 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2054 CCValAssign VA = RVLocs[i];
2055
2056 // Pass 'this' value directly from the argument to return value, to avoid
2057 // reg unit interference
2058 if (i == 0 && isThisReturn) {
2059 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2060 "unexpected return calling convention register assignment");
2061 InVals.push_back(ThisVal);
2062 continue;
2063 }
2064
2065 SDValue Val;
2066 if (VA.needsCustom() &&
2067 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2068 // Handle f64 or half of a v2f64.
2069 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2070 InGlue);
2071 Chain = Lo.getValue(1);
2072 InGlue = Lo.getValue(2);
2073 VA = RVLocs[++i]; // skip ahead to next loc
2074 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2075 InGlue);
2076 Chain = Hi.getValue(1);
2077 InGlue = Hi.getValue(2);
2078 if (!Subtarget->isLittle())
2079 std::swap (Lo, Hi);
2080 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2081
2082 if (VA.getLocVT() == MVT::v2f64) {
2083 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2084 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2085 DAG.getConstant(0, dl, MVT::i32));
2086
2087 VA = RVLocs[++i]; // skip ahead to next loc
2088 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2089 Chain = Lo.getValue(1);
2090 InGlue = Lo.getValue(2);
2091 VA = RVLocs[++i]; // skip ahead to next loc
2092 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2093 Chain = Hi.getValue(1);
2094 InGlue = Hi.getValue(2);
2095 if (!Subtarget->isLittle())
2096 std::swap (Lo, Hi);
2097 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2098 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2099 DAG.getConstant(1, dl, MVT::i32));
2100 }
2101 } else {
2102 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2103 InGlue);
2104 Chain = Val.getValue(1);
2105 InGlue = Val.getValue(2);
2106 }
2107
2108 switch (VA.getLocInfo()) {
2109 default: llvm_unreachable("Unknown loc info!");
2110 case CCValAssign::Full: break;
2111 case CCValAssign::BCvt:
2112 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2113 break;
2114 }
2115
2116 // f16 arguments have their size extended to 4 bytes and are passed as if
2117 // they had been copied to the LSBs of a 32-bit register.
2118 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
2119 if (VA.needsCustom() &&
2120 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2121 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2122
2123 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2124 // is less than 32 bits must be sign- or zero-extended after the call for
2125 // security reasons. Although the ABI mandates an extension done by the
2126 // callee, the latter cannot be trusted to follow the rules of the ABI.
2127 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2128 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2129 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2130 Val = handleCMSEValue(Val, Arg, DAG, dl);
2131
2132 InVals.push_back(Val);
2133 }
2134
2135 return Chain;
2136}
2137
2138std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2139 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2140 bool IsTailCall, int SPDiff) const {
2141 SDValue DstAddr;
2142 MachinePointerInfo DstInfo;
2143 int32_t Offset = VA.getLocMemOffset();
2145
2146 if (IsTailCall) {
2147 Offset += SPDiff;
2148 auto PtrVT = getPointerTy(DAG.getDataLayout());
2149 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2150 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2151 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2152 DstInfo =
2154 } else {
2155 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2156 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2157 StackPtr, PtrOff);
2158 DstInfo =
2160 }
2161
2162 return std::make_pair(DstAddr, DstInfo);
2163}
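// For tail calls the destination is a fixed object in the caller's frame
// (offset by SPDiff); for normal calls it is simply SP plus the location's
// offset in the outgoing argument area.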
2164
2165// Returns the type of copying which is required to set up a byval argument to
2166// a tail-called function. This isn't needed for non-tail calls, because they
2167// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
2168// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2169// optimised to zero copies when forwarding an argument from the caller's
2170// caller (NoCopy).
2171ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2172 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2175
2176 // Globals are always safe to copy from.
2177 if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
2178 return CopyOnce;
2179
2180 // Can only analyse frame index nodes, conservatively assume we need a
2181 // temporary.
2182 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2183 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2184 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2185 return CopyViaTemp;
2186
2187 int SrcFI = SrcFrameIdxNode->getIndex();
2188 int DstFI = DstFrameIdxNode->getIndex();
2189 assert(MFI.isFixedObjectIndex(DstFI) &&
2190 "byval passed in non-fixed stack slot");
2191
2192 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2193 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2194
2195 // If the source is in the local frame, then the copy to the argument memory
2196 // is always valid.
2197 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2198 if (!FixedSrc ||
2199 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2200 return CopyOnce;
2201
2202 // In the case of byval arguments split between registers and the stack,
2203 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2204 // stack portion, but the Src SDValue will refer to the full value, including
2205 // the local stack memory that the register portion gets stored into. We only
2206 // need to compare them for equality, so normalise on the full value version.
2207 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2208 DstOffset -= RegSize;
2209
2210 // If the value is already in the correct location, then no copying is
2211 // needed. If not, then we need to copy via a temporary.
2212 if (SrcOffset == DstOffset)
2213 return NoCopy;
2214 else
2215 return CopyViaTemp;
2216}
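// Example of the NoCopy case: forwarding an incoming byval argument unchanged
// to a sibling call leaves source and destination at the same fixed stack
// offset, so no copy is emitted at all.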
2217
2218void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2219 SDValue Chain, SDValue &Arg,
2220 RegsToPassVector &RegsToPass,
2221 CCValAssign &VA, CCValAssign &NextVA,
2222 SDValue &StackPtr,
2223 SmallVectorImpl<SDValue> &MemOpChains,
2224 bool IsTailCall,
2225 int SPDiff) const {
2226 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2227 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2228 unsigned id = Subtarget->isLittle() ? 0 : 1;
2229 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2230
2231 if (NextVA.isRegLoc())
2232 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2233 else {
2234 assert(NextVA.isMemLoc());
2235 if (!StackPtr.getNode())
2236 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2238
2239 SDValue DstAddr;
2240 MachinePointerInfo DstInfo;
2241 std::tie(DstAddr, DstInfo) =
2242 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2243 MemOpChains.push_back(
2244 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2245 }
2246}
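// An f64 (or half of a v2f64) is split by VMOVRRD into two i32 halves: the
// first goes into VA's register, the second either into NextVA's register or,
// if NextVA is a memory location, into its stack slot.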
2247
2248static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2249 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2251}
2252
2253/// LowerCall - Lowering a call into a callseq_start <-
2254/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2255/// nodes.
2256SDValue
2257ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2258 SmallVectorImpl<SDValue> &InVals) const {
2259 SelectionDAG &DAG = CLI.DAG;
2260 SDLoc &dl = CLI.DL;
2262 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2264 SDValue Chain = CLI.Chain;
2265 SDValue Callee = CLI.Callee;
2266 bool &isTailCall = CLI.IsTailCall;
2267 CallingConv::ID CallConv = CLI.CallConv;
2268 bool doesNotRet = CLI.DoesNotReturn;
2269 bool isVarArg = CLI.IsVarArg;
2270 const CallBase *CB = CLI.CB;
2271
2276 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2277 bool isThisReturn = false;
2278 bool isCmseNSCall = false;
2279 bool isSibCall = false;
2280 bool PreferIndirect = false;
2281 bool GuardWithBTI = false;
2282
2283 // Analyze operands of the call, assigning locations to each operand.
2285 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2286 *DAG.getContext());
2287 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2288
2289 // Lower 'returns_twice' calls to a pseudo-instruction.
2290 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2291 !Subtarget->noBTIAtReturnTwice())
2292 GuardWithBTI = AFI->branchTargetEnforcement();
2293
2294 // Set type id for call site info.
2295 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2296 CSInfo = MachineFunction::CallSiteInfo(*CB);
2297
2298 // Determine whether this is a non-secure function call.
2299 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2300 isCmseNSCall = true;
2301
2302 // Disable tail calls if they're not supported.
2303 if (!Subtarget->supportsTailCall())
2304 isTailCall = false;
2305
2306 // For both the non-secure calls and the returns from a CMSE entry function,
2307 // the function needs to do some extra work after the call, or before the
2308 // return, respectively, so it cannot end with a tail call.
2309 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2310 isTailCall = false;
2311
2312 if (isa<GlobalAddressSDNode>(Callee)) {
2313 // If we're optimizing for minimum size and the function is called three or
2314 // more times in this block, we can improve codesize by calling indirectly
2315 // as BLXr has a 16-bit encoding.
2316 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2317 if (CLI.CB) {
2318 auto *BB = CLI.CB->getParent();
2319 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2320 count_if(GV->users(), [&BB](const User *U) {
2321 return isa<Instruction>(U) &&
2322 cast<Instruction>(U)->getParent() == BB;
2323 }) > 2;
2324 }
2325 }
2326 if (isTailCall) {
2327 // Check if it's really possible to do a tail call.
2328 isTailCall =
2329 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2330
2331 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2332 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2333 isSibCall = true;
2334
2335 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2336 // detected sibcalls.
2337 if (isTailCall)
2338 ++NumTailCalls;
2339 }
2340
2341 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2342 report_fatal_error("failed to perform tail call elimination on a call "
2343 "site marked musttail");
2344
2345 // Get a count of how many bytes are to be pushed on the stack.
2346 unsigned NumBytes = CCInfo.getStackSize();
2347
2348 // SPDiff is the byte offset of the call's argument area from the callee's.
2349 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2350 // by this amount for a tail call. In a sibling call it must be 0 because the
2351 // caller will deallocate the entire stack and the callee still expects its
2352 // arguments to begin at SP+0. Completely unused for non-tail calls.
2353 int SPDiff = 0;
2354
2355 if (isTailCall && !isSibCall) {
2356 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2357 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2358
2359 // Since callee will pop argument stack as a tail call, we must keep the
2360 // popped size 16-byte aligned.
2361 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2362 assert(StackAlign && "data layout string is missing stack alignment");
2363 NumBytes = alignTo(NumBytes, *StackAlign);
2364
2365 // SPDiff will be negative if this tail call requires more space than we
2366 // would automatically have in our incoming argument space. Positive if we
2367 // can actually shrink the stack.
2368 SPDiff = NumReusableBytes - NumBytes;
2369
2370 // If this call requires more stack than we have available from
2371 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2372 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2373 AFI->setArgRegsSaveSize(-SPDiff);
2374 }
2375
2376 if (isSibCall) {
2377 // For sibling tail calls, memory operands are available in our caller's stack.
2378 NumBytes = 0;
2379 } else {
2380 // Adjust the stack pointer for the new arguments...
2381 // These operations are automatically eliminated by the prolog/epilog pass
2382 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2383 }
2384
2386 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2387
2388 RegsToPassVector RegsToPass;
2389 SmallVector<SDValue, 8> MemOpChains;
2390
2391 // If we are doing a tail-call, any byval arguments will be written to stack
2392 // space which was used for incoming arguments. If any of the values being used
2393 // are incoming byval arguments to this function, then they might be
2394 // overwritten by the stores of the outgoing arguments. To avoid this, we
2395 // need to make a temporary copy of them in local stack space, then copy back
2396 // to the argument area.
2397 DenseMap<unsigned, SDValue> ByValTemporaries;
2398 SDValue ByValTempChain;
2399 if (isTailCall) {
2400 SmallVector<SDValue, 8> ByValCopyChains;
2401 for (const CCValAssign &VA : ArgLocs) {
2402 unsigned ArgIdx = VA.getValNo();
2403 SDValue Src = OutVals[ArgIdx];
2404 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2405
2406 if (!Flags.isByVal())
2407 continue;
2408
2409 SDValue Dst;
2410 MachinePointerInfo DstInfo;
2411 std::tie(Dst, DstInfo) =
2412 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2413 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2414
2415 if (Copy == NoCopy) {
2416 // If the argument is already at the correct offset on the stack
2417 // (because we are forwarding a byval argument from our caller), we
2418 // don't need any copying.
2419 continue;
2420 } else if (Copy == CopyOnce) {
2421 // If the argument is in our local stack frame, no other argument
2422 // preparation can clobber it, so we can copy it to the final location
2423 // later.
2424 ByValTemporaries[ArgIdx] = Src;
2425 } else {
2426 assert(Copy == CopyViaTemp && "unexpected enum value");
2427 // If we might be copying this argument from the outgoing argument
2428 // stack area, we need to copy via a temporary in the local stack
2429 // frame.
2430 int TempFrameIdx = MFI.CreateStackObject(
2431 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2432 SDValue Temp =
2433 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2434
2435 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2436 SDValue AlignNode =
2437 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2438
2439 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2440 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2441 ByValCopyChains.push_back(
2442 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2443 ByValTemporaries[ArgIdx] = Temp;
2444 }
2445 }
2446 if (!ByValCopyChains.empty())
2447 ByValTempChain =
2448 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2449 }
2450
2451 // During a tail call, stores to the argument area must happen after all of
2452 // the function's incoming arguments have been loaded because they may alias.
2453 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2454 // there's no point in doing so repeatedly so this tracks whether that's
2455 // happened yet.
2456 bool AfterFormalArgLoads = false;
2457
2458 // Walk the register/memloc assignments, inserting copies/loads. In the case
2459 // of tail call optimization, arguments are handled later.
2460 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2461 i != e;
2462 ++i, ++realArgIdx) {
2463 CCValAssign &VA = ArgLocs[i];
2464 SDValue Arg = OutVals[realArgIdx];
2465 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2466 bool isByVal = Flags.isByVal();
2467
2468 // Promote the value if needed.
2469 switch (VA.getLocInfo()) {
2470 default: llvm_unreachable("Unknown loc info!");
2471 case CCValAssign::Full: break;
2472 case CCValAssign::SExt:
2473 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2474 break;
2475 case CCValAssign::ZExt:
2476 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2477 break;
2478 case CCValAssign::AExt:
2479 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2480 break;
2481 case CCValAssign::BCvt:
2482 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2483 break;
2484 }
2485
2486 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2487 Chain = DAG.getStackArgumentTokenFactor(Chain);
2488 if (ByValTempChain)
2489 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2490 ByValTempChain);
2491 AfterFormalArgLoads = true;
2492 }
2493
2494 // f16 arguments have their size extended to 4 bytes and passed as if they
2495 // had been copied to the LSBs of a 32-bit register.
2496 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2497 if (VA.needsCustom() &&
2498 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2499 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2500 } else {
2501 // f16 arguments could have been extended prior to argument lowering.
2502 // Mask such arguments if this is a CMSE nonsecure call.
2503 auto ArgVT = Outs[realArgIdx].ArgVT;
2504 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2505 auto LocBits = VA.getLocVT().getSizeInBits();
2506 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2507 SDValue Mask =
2508 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2509 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2510 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2511 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2512 }
2513 }
2514
2515 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2516 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2517 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2518 DAG.getConstant(0, dl, MVT::i32));
2519 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2520 DAG.getConstant(1, dl, MVT::i32));
2521
2522 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2523 StackPtr, MemOpChains, isTailCall, SPDiff);
2524
2525 VA = ArgLocs[++i]; // skip ahead to next loc
2526 if (VA.isRegLoc()) {
2527 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2528 StackPtr, MemOpChains, isTailCall, SPDiff);
2529 } else {
2530 assert(VA.isMemLoc());
2531 SDValue DstAddr;
2532 MachinePointerInfo DstInfo;
2533 std::tie(DstAddr, DstInfo) =
2534 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2535 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2536 }
2537 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2538 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2539 StackPtr, MemOpChains, isTailCall, SPDiff);
2540 } else if (VA.isRegLoc()) {
2541 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2542 Outs[0].VT == MVT::i32) {
2543 assert(VA.getLocVT() == MVT::i32 &&
2544 "unexpected calling convention register assignment");
2545 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2546 "unexpected use of 'returned'");
2547 isThisReturn = true;
2548 }
2549 const TargetOptions &Options = DAG.getTarget().Options;
2550 if (Options.EmitCallSiteInfo)
2551 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2552 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2553 } else if (isByVal) {
2554 assert(VA.isMemLoc());
2555 unsigned offset = 0;
2556
2557 // True if this byval aggregate will be split between registers
2558 // and memory.
2559 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2560 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2561
2562 SDValue ByValSrc;
2563 bool NeedsStackCopy;
2564 if (auto It = ByValTemporaries.find(realArgIdx);
2565 It != ByValTemporaries.end()) {
2566 ByValSrc = It->second;
2567 NeedsStackCopy = true;
2568 } else {
2569 ByValSrc = Arg;
2570 NeedsStackCopy = !isTailCall;
2571 }
2572
2573 // If part of the argument is in registers, load them.
2574 if (CurByValIdx < ByValArgsCount) {
2575 unsigned RegBegin, RegEnd;
2576 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2577
2578 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2579 unsigned int i, j;
2580 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2581 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2582 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2583 SDValue Load =
2584 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2585 DAG.InferPtrAlign(AddArg));
2586 MemOpChains.push_back(Load.getValue(1));
2587 RegsToPass.push_back(std::make_pair(j, Load));
2588 }
2589
2590 // If the parameter size exceeds the register area, the "offset" value
2591 // helps us calculate the stack slot for the remaining part properly.
2592 offset = RegEnd - RegBegin;
2593
2594 CCInfo.nextInRegsParam();
2595 }
2596
2597 // If the memory part of the argument isn't already in the correct place
2598 // (which can happen with tail calls), copy it into the argument area.
2599 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2600 auto PtrVT = getPointerTy(DAG.getDataLayout());
2601 SDValue Dst;
2602 MachinePointerInfo DstInfo;
2603 std::tie(Dst, DstInfo) =
2604 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2605 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2606 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2607 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2608 MVT::i32);
2609 SDValue AlignNode =
2610 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2611
2612 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2613 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2614 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2615 Ops));
2616 }
2617 } else {
2618 assert(VA.isMemLoc());
2619 SDValue DstAddr;
2620 MachinePointerInfo DstInfo;
2621 std::tie(DstAddr, DstInfo) =
2622 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2623
2624 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2625 MemOpChains.push_back(Store);
2626 }
2627 }
2628
2629 if (!MemOpChains.empty())
2630 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2631
2632 // Build a sequence of copy-to-reg nodes chained together with token chain
2633 // and flag operands which copy the outgoing args into the appropriate regs.
2634 SDValue InGlue;
2635 for (const auto &[Reg, N] : RegsToPass) {
2636 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2637 InGlue = Chain.getValue(1);
2638 }
2639
2640 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2641 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2642 // node so that legalize doesn't hack it.
2643 bool isDirect = false;
2644
2645 const TargetMachine &TM = getTargetMachine();
2646 const GlobalValue *GVal = nullptr;
2647 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2648 GVal = G->getGlobal();
2649 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2650
2651 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2652 bool isLocalARMFunc = false;
2653 auto PtrVt = getPointerTy(DAG.getDataLayout());
2654
2655 if (Subtarget->genLongCalls()) {
2656 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2657 "long-calls codegen is not position independent!");
2658 // Handle a global address or an external symbol. If it's not one of
2659 // those, the target's already in a register, so we don't need to do
2660 // anything extra.
2661 if (isa<GlobalAddressSDNode>(Callee)) {
2662 if (Subtarget->genExecuteOnly()) {
2663 if (Subtarget->useMovt())
2664 ++NumMovwMovt;
2665 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2666 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2667 } else {
2668 // Create a constant pool entry for the callee address
2669 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2670 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2671 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2672
2673 // Get the address of the callee into a register
2674 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2675 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2676 Callee = DAG.getLoad(
2677 PtrVt, dl, DAG.getEntryNode(), Addr,
2678 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2679 }
2680 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2681 const char *Sym = S->getSymbol();
2682
2683 if (Subtarget->genExecuteOnly()) {
2684 if (Subtarget->useMovt())
2685 ++NumMovwMovt;
2686 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2687 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2688 } else {
2689 // Create a constant pool entry for the callee address
2690 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2691 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2692 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2693
2694 // Get the address of the callee into a register
2695 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2696 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2697 Callee = DAG.getLoad(
2698 PtrVt, dl, DAG.getEntryNode(), Addr,
2699 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2700 }
2701 }
2702 } else if (isa<GlobalAddressSDNode>(Callee)) {
2703 if (!PreferIndirect) {
2704 isDirect = true;
2705 bool isDef = GVal->isStrongDefinitionForLinker();
2706
2707 // ARM call to a local ARM function is predicable.
2708 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2709 // tBX takes a register source operand.
2710 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2711 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2712 Callee = DAG.getNode(
2713 ARMISD::WrapperPIC, dl, PtrVt,
2714 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2715 Callee = DAG.getLoad(
2716 PtrVt, dl, DAG.getEntryNode(), Callee,
2720 } else if (Subtarget->isTargetCOFF()) {
2721 assert(Subtarget->isTargetWindows() &&
2722 "Windows is the only supported COFF target");
2723 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2724 if (GVal->hasDLLImportStorageClass())
2725 TargetFlags = ARMII::MO_DLLIMPORT;
2726 else if (!TM.shouldAssumeDSOLocal(GVal))
2727 TargetFlags = ARMII::MO_COFFSTUB;
2728 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2729 TargetFlags);
2730 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2731 Callee =
2732 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2733 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2734 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2735 } else {
2736 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2737 }
2738 }
2739 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2740 isDirect = true;
2741 // tBX takes a register source operand.
2742 const char *Sym = S->getSymbol();
2743 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2744 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2745 ARMConstantPoolValue *CPV =
2746 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2747 ARMPCLabelIndex, 4);
2748 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2749 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2750 Callee = DAG.getLoad(
2751 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2752 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2753 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2754 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2755 } else {
2756 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2757 }
2758 }
2759
2760 if (isCmseNSCall) {
2761 assert(!isARMFunc && !isDirect &&
2762 "Cannot handle call to ARM function or direct call");
2763 if (NumBytes > 0) {
2764 DAG.getContext()->diagnose(
2766 "call to non-secure function would require "
2767 "passing arguments on stack",
2768 dl.getDebugLoc()));
2769 }
2770 if (isStructRet) {
2773 "call to non-secure function would return value through pointer",
2774 dl.getDebugLoc()));
2775 }
2776 }
2777
2778 // FIXME: handle tail calls differently.
2779 unsigned CallOpc;
2780 if (Subtarget->isThumb()) {
2781 if (GuardWithBTI)
2782 CallOpc = ARMISD::t2CALL_BTI;
2783 else if (isCmseNSCall)
2784 CallOpc = ARMISD::tSECALL;
2785 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2786 CallOpc = ARMISD::CALL_NOLINK;
2787 else
2788 CallOpc = ARMISD::CALL;
2789 } else {
2790 if (!isDirect && !Subtarget->hasV5TOps())
2791 CallOpc = ARMISD::CALL_NOLINK;
2792 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2793 // Emit regular call when code size is the priority
2794 !Subtarget->hasMinSize())
2795 // "mov lr, pc; b _foo" to avoid confusing the RSP
2796 CallOpc = ARMISD::CALL_NOLINK;
2797 else
2798 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2799 }
2800
2801 // We don't usually want to end the call-sequence here because we would tidy
2802 // the frame up *after* the call, however in the ABI-changing tail-call case
2803 // we've carefully laid out the parameters so that when sp is reset they'll be
2804 // in the correct location.
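// That is, for a non-sibling tail call we close the call sequence here,
// before emitting TC_RETURN below, instead of tidying the frame up after
// the call.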
2805 if (isTailCall && !isSibCall) {
2806 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2807 InGlue = Chain.getValue(1);
2808 }
2809
2810 std::vector<SDValue> Ops;
2811 Ops.push_back(Chain);
2812 Ops.push_back(Callee);
2813
2814 if (isTailCall) {
2815 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2816 }
2817
2818 // Add argument registers to the end of the list so that they are known live
2819 // into the call.
2820 for (const auto &[Reg, N] : RegsToPass)
2821 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2822
2823 // Add a register mask operand representing the call-preserved registers.
2824 const uint32_t *Mask;
2825 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2826 if (isThisReturn) {
2827 // For 'this' returns, use the R0-preserving mask if applicable
2828 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2829 if (!Mask) {
2830 // Set isThisReturn to false if the calling convention is not one that
2831 // allows 'returned' to be modeled in this way, so LowerCallResult does
2832 // not try to pass 'this' straight through
2833 isThisReturn = false;
2834 Mask = ARI->getCallPreservedMask(MF, CallConv);
2835 }
2836 } else
2837 Mask = ARI->getCallPreservedMask(MF, CallConv);
2838
2839 assert(Mask && "Missing call preserved mask for calling convention");
2840 Ops.push_back(DAG.getRegisterMask(Mask));
2841
2842 if (InGlue.getNode())
2843 Ops.push_back(InGlue);
2844
2845 if (isTailCall) {
2846 MF.getFrameInfo().setHasTailCall();
2847 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2848 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2849 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2850 return Ret;
2851 }
2852
2853 // Returns a chain and a flag for retval copy to use.
2854 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2855 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2856 InGlue = Chain.getValue(1);
2857 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2858
2859 // If we're guaranteeing tail-calls will be honoured, the callee must
2860 // pop its own argument stack on return. But this call is *not* a tail call so
2861 // we need to undo that after it returns to restore the status-quo.
2862 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2863 uint64_t CalleePopBytes =
2864 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2865
2866 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2867 if (!Ins.empty())
2868 InGlue = Chain.getValue(1);
2869
2870 // Handle result values, copying them out of physregs into vregs that we
2871 // return.
2872 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2873 InVals, isThisReturn,
2874 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2875}
2876
2877/// HandleByVal - Every parameter *after* a byval parameter is passed
2878/// on the stack. Remember the next parameter register to allocate,
2879 /// and then confiscate the rest of the parameter registers to ensure
2880/// this.
2881void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2882 Align Alignment) const {
2883 // Byval (as with any stack) slots are always at least 4 byte aligned.
2884 Alignment = std::max(Alignment, Align(4));
2885
2886 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2887 if (!Reg)
2888 return;
2889
2890 unsigned AlignInRegs = Alignment.value() / 4;
2891 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2892 for (unsigned i = 0; i < Waste; ++i)
2893 Reg = State->AllocateReg(GPRArgRegs);
2894
2895 if (!Reg)
2896 return;
2897
2898 unsigned Excess = 4 * (ARM::R4 - Reg);
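// Illustrative example: for an 8-byte-aligned byval whose first free register
// is R1, AlignInRegs == 2, so one register (R1) is wasted; the byval then
// starts in R2 and Excess == 8 bytes of register space remain.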
2899
2900 // Special case when NSAA != SP and the parameter size is greater than the
2901 // size of all remaining GPR regs. In that case we can't split the parameter;
2902 // we must send it to the stack. We also must set the NCRN to R4, so we waste
2903 // all remaining registers.
2904 const unsigned NSAAOffset = State->getStackSize();
2905 if (NSAAOffset != 0 && Size > Excess) {
2906 while (State->AllocateReg(GPRArgRegs))
2907 ;
2908 return;
2909 }
2910
2911 // The first register for the byval parameter is the first register that
2912 // wasn't allocated before this method call, so it would be "reg".
2913 // If the parameter is small enough to be saved in the range [reg, r4), then
2914 // the end (first-after-last) register would be reg + param-size-in-regs;
2915 // otherwise the parameter is split between registers and the stack, and the
2916 // end register is r4 in this case.
2917 unsigned ByValRegBegin = Reg;
2918 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2919 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2920 // Note, the first register was already allocated at the beginning of the
2921 // function; here we allocate the remaining registers we need.
2922 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2923 State->AllocateReg(GPRArgRegs);
2924 // A byval parameter that is split between registers and memory needs its
2925 // size truncated here.
2926 // In the case where the entire structure fits in registers, we set the
2927 // size in memory to zero.
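// For example, a 12-byte byval starting in R2 has Excess == 8, so 4 bytes
// remain in memory; a byval that fits entirely in registers ends up with
// Size == 0.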
2928 Size = std::max<int>(Size - Excess, 0);
2929}
2930
2931/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2932/// for tail call optimization. Targets which want to do tail call
2933/// optimization should implement this function. Note that this function also
2934/// processes musttail calls, so when this function returns false on a valid
2935/// musttail call, a fatal backend error occurs.
2936bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2937 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2938 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2939 CallingConv::ID CalleeCC = CLI.CallConv;
2940 SDValue Callee = CLI.Callee;
2941 bool isVarArg = CLI.IsVarArg;
2942 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2943 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2944 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2945 const SelectionDAG &DAG = CLI.DAG;
2946 MachineFunction &MF = DAG.getMachineFunction();
2947 const Function &CallerF = MF.getFunction();
2948 CallingConv::ID CallerCC = CallerF.getCallingConv();
2949
2950 assert(Subtarget->supportsTailCall());
2951
2952 // Indirect tail-calls require a register to hold the target address. That
2953 // register must be:
2954 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2955 // * Not callee-saved, so must be one of r0-r3 or r12.
2956 // * Not used to hold an argument to the tail-called function, which might be
2957 // in r0-r3.
2958 // * Not used to hold the return address authentication code, which is in r12
2959 // if enabled.
2960 // Sometimes, no register matches all of these conditions, so we can't do a
2961 // tail-call.
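// For example, an indirect tail call on a Thumb1-only target that passes
// arguments in all of r0-r3 leaves no allocatable register for the target
// address (r12 is not usable there), so we fall back to a normal call.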
2962 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2963 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2964 ARM::R3};
2965 if (!(Subtarget->isThumb1Only() ||
2967 AddressRegisters.insert(ARM::R12);
2968 for (const CCValAssign &AL : ArgLocs)
2969 if (AL.isRegLoc())
2970 AddressRegisters.erase(AL.getLocReg());
2971 if (AddressRegisters.empty()) {
2972 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2973 return false;
2974 }
2975 }
2976
2977 // Look for obvious safe cases to perform tail call optimization that do not
2978 // require ABI changes. This is what gcc calls sibcall.
2979
2980 // Exception-handling functions need a special set of instructions to indicate
2981 // a return to the hardware. Tail-calling another function would probably
2982 // break this.
2983 if (CallerF.hasFnAttribute("interrupt")) {
2984 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2985 return false;
2986 }
2987
2988 if (canGuaranteeTCO(CalleeCC,
2989 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2990 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2991 << " (guaranteed tail-call CC)\n");
2992 return CalleeCC == CallerCC;
2993 }
2994
2995 // Also avoid sibcall optimization if either caller or callee uses struct
2996 // return semantics.
2997 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2998 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2999 if (isCalleeStructRet != isCallerStructRet) {
3000 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
3001 return false;
3002 }
3003
3004 // Externally-defined functions with weak linkage should not be
3005 // tail-called on ARM when the OS does not support dynamic
3006 // pre-emption of symbols, as the AAELF spec requires normal calls
3007 // to undefined weak functions to be replaced with a NOP or jump to the
3008 // next instruction. The behaviour of branch instructions in this
3009 // situation (as used for tail calls) is implementation-defined, so we
3010 // cannot rely on the linker replacing the tail call with a return.
3011 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3012 const GlobalValue *GV = G->getGlobal();
3013 const Triple &TT = getTargetMachine().getTargetTriple();
3014 if (GV->hasExternalWeakLinkage() &&
3015 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
3016 TT.isOSBinFormatMachO())) {
3017 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
3018 return false;
3019 }
3020 }
3021
3022 // Check that the call results are passed in the same way.
3023 LLVMContext &C = *DAG.getContext();
3024 if (!CCState::resultsCompatible(
3025 getEffectiveCallingConv(CalleeCC, isVarArg),
3026 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3027 CCAssignFnForReturn(CalleeCC, isVarArg),
3028 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
3029 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
3030 return false;
3031 }
3032 // The callee has to preserve all registers the caller needs to preserve.
3033 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3034 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3035 if (CalleeCC != CallerCC) {
3036 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3037 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
3038 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
3039 return false;
3040 }
3041 }
3042
3043 // If Caller's vararg argument has been split between registers and stack, do
3044 // not perform tail call, since part of the argument is in caller's local
3045 // frame.
3046 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3047 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
3048 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
3049 return false;
3050 }
3051
3052 // If the callee takes no arguments then go on to check the results of the
3053 // call.
3054 const MachineRegisterInfo &MRI = MF.getRegInfo();
3055 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3056 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3057 return false;
3058 }
3059
3060 // If the stack arguments for this call do not fit into our own save area then
3061 // the call cannot be made tail.
3062 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3063 return false;
3064
3065 LLVM_DEBUG(dbgs() << "true\n");
3066 return true;
3067}
3068
3069bool
3070ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3071 MachineFunction &MF, bool isVarArg,
3072 const SmallVectorImpl<ISD::OutputArg> &Outs,
3073 LLVMContext &Context, const Type *RetTy) const {
3074 SmallVector<CCValAssign, 16> RVLocs;
3075 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3076 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3077}
3078
3079 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3080 const SDLoc &DL, SelectionDAG &DAG) {
3081 const MachineFunction &MF = DAG.getMachineFunction();
3082 const Function &F = MF.getFunction();
3083
3084 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3085
3086 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3087 // version of the "preferred return address". These offsets affect the return
3088 // instruction if this is a return from PL1 without hypervisor extensions.
3089 // IRQ/FIQ: +4 "subs pc, lr, #4"
3090 // SWI: 0 "subs pc, lr, #0"
3091 // ABORT: +4 "subs pc, lr, #4"
3092 // UNDEF: +4/+2 "subs pc, lr, #0"
3093 // UNDEF varies depending on where the exception came from ARM or Thumb
3094 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3095
3096 int64_t LROffset;
3097 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3098 IntKind == "ABORT")
3099 LROffset = 4;
3100 else if (IntKind == "SWI" || IntKind == "UNDEF")
3101 LROffset = 0;
3102 else
3103 report_fatal_error("Unsupported interrupt attribute. If present, value "
3104 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3105
3106 RetOps.insert(RetOps.begin() + 1,
3107 DAG.getConstant(LROffset, DL, MVT::i32, false));
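// RetOps is now (Chain, LROffset, copied return registers, ...), which feeds
// the INTRET_GLUE node below; the offset selects the "subs pc, lr, #N" form.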
3108
3109 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3110}
3111
3112SDValue
3113ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3114 bool isVarArg,
3116 const SmallVectorImpl<SDValue> &OutVals,
3117 const SDLoc &dl, SelectionDAG &DAG) const {
3118 // CCValAssign - represent the assignment of the return value to a location.
3119 SmallVector<CCValAssign, 16> RVLocs;
3120
3121 // CCState - Info about the registers and stack slots.
3122 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3123 *DAG.getContext());
3124
3125 // Analyze outgoing return values.
3126 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3127
3128 SDValue Glue;
3129 SmallVector<SDValue, 4> RetOps;
3130 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3131 bool isLittleEndian = Subtarget->isLittle();
3132
3133 MachineFunction &MF = DAG.getMachineFunction();
3134 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3135 AFI->setReturnRegsCount(RVLocs.size());
3136
3137 // Report error if cmse entry function returns structure through first ptr arg.
3138 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3139 // Note: using an empty SDLoc(), as the first line of the function is a
3140 // better place to report than the last line.
3143 "secure entry function would return value through pointer",
3144 SDLoc().getDebugLoc()));
3145 }
3146
3147 // Copy the result values into the output registers.
3148 for (unsigned i = 0, realRVLocIdx = 0;
3149 i != RVLocs.size();
3150 ++i, ++realRVLocIdx) {
3151 CCValAssign &VA = RVLocs[i];
3152 assert(VA.isRegLoc() && "Can only return in registers!");
3153
3154 SDValue Arg = OutVals[realRVLocIdx];
3155 bool ReturnF16 = false;
3156
3157 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3158 // Half-precision return values can be returned like this:
3159 //
3160 // t11 f16 = fadd ...
3161 // t12: i16 = bitcast t11
3162 // t13: i32 = zero_extend t12
3163 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3164 //
3165 // to avoid code generation for bitcasts, we simply set Arg to the node
3166 // that produces the f16 value, t11 in this case.
3167 //
3168 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3169 SDValue ZE = Arg.getOperand(0);
3170 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3171 SDValue BC = ZE.getOperand(0);
3172 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3173 Arg = BC.getOperand(0);
3174 ReturnF16 = true;
3175 }
3176 }
3177 }
3178 }
3179
3180 switch (VA.getLocInfo()) {
3181 default: llvm_unreachable("Unknown loc info!");
3182 case CCValAssign::Full: break;
3183 case CCValAssign::BCvt:
3184 if (!ReturnF16)
3185 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3186 break;
3187 }
3188
3189 // Mask f16 arguments if this is a CMSE nonsecure entry.
3190 auto RetVT = Outs[realRVLocIdx].ArgVT;
3191 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3192 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3193 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3194 } else {
3195 auto LocBits = VA.getLocVT().getSizeInBits();
3196 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3197 SDValue Mask =
3198 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3199 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3200 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3201 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3202 }
3203 }
3204
3205 if (VA.needsCustom() &&
3206 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3207 if (VA.getLocVT() == MVT::v2f64) {
3208 // Extract the first half and return it in two registers.
3209 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3210 DAG.getConstant(0, dl, MVT::i32));
3211 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3212 DAG.getVTList(MVT::i32, MVT::i32), Half);
3213
3214 Chain =
3215 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3216 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3217 Glue = Chain.getValue(1);
3218 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3219 VA = RVLocs[++i]; // skip ahead to next loc
3220 Chain =
3221 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3222 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3223 Glue = Chain.getValue(1);
3224 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3225 VA = RVLocs[++i]; // skip ahead to next loc
3226
3227 // Extract the 2nd half and fall through to handle it as an f64 value.
3228 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3229 DAG.getConstant(1, dl, MVT::i32));
3230 }
3231 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3232 // available.
3233 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3234 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3235 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3236 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3237 Glue = Chain.getValue(1);
3238 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3239 VA = RVLocs[++i]; // skip ahead to next loc
3240 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3241 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3242 } else
3243 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3244
3245 // Guarantee that all emitted copies are glued together, so they are not
3246 // scheduled apart from one another.
3247 Glue = Chain.getValue(1);
3248 RetOps.push_back(DAG.getRegister(
3249 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3250 }
3251 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3252 const MCPhysReg *I =
3253 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3254 if (I) {
3255 for (; *I; ++I) {
3256 if (ARM::GPRRegClass.contains(*I))
3257 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3258 else if (ARM::DPRRegClass.contains(*I))
3260 else
3261 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3262 }
3263 }
3264
3265 // Update chain and glue.
3266 RetOps[0] = Chain;
3267 if (Glue.getNode())
3268 RetOps.push_back(Glue);
3269
3270 // CPUs which aren't M-class use a special sequence to return from
3271 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3272 // though we use "subs pc, lr, #N").
3273 //
3274 // M-class CPUs actually use a normal return sequence with a special
3275 // (hardware-provided) value in LR, so the normal code path works.
3276 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3277 !Subtarget->isMClass()) {
3278 if (Subtarget->isThumb1Only())
3279 report_fatal_error("interrupt attribute is not supported in Thumb1");
3280 return LowerInterruptReturn(RetOps, dl, DAG);
3281 }
3282
3285 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3286}
3287
3288bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3289 if (N->getNumValues() != 1)
3290 return false;
3291 if (!N->hasNUsesOfValue(1, 0))
3292 return false;
3293
3294 SDValue TCChain = Chain;
3295 SDNode *Copy = *N->user_begin();
3296 if (Copy->getOpcode() == ISD::CopyToReg) {
3297 // If the copy has a glue operand, we conservatively assume it isn't safe to
3298 // perform a tail call.
3299 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3300 return false;
3301 TCChain = Copy->getOperand(0);
3302 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3303 SDNode *VMov = Copy;
3304 // f64 returned in a pair of GPRs.
3305 SmallPtrSet<SDNode*, 2> Copies;
3306 for (SDNode *U : VMov->users()) {
3307 if (U->getOpcode() != ISD::CopyToReg)
3308 return false;
3309 Copies.insert(U);
3310 }
3311 if (Copies.size() > 2)
3312 return false;
3313
3314 for (SDNode *U : VMov->users()) {
3315 SDValue UseChain = U->getOperand(0);
3316 if (Copies.count(UseChain.getNode()))
3317 // Second CopyToReg
3318 Copy = U;
3319 else {
3320 // We are at the top of this chain.
3321 // If the copy has a glue operand, we conservatively assume it
3322 // isn't safe to perform a tail call.
3323 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3324 return false;
3325 // First CopyToReg
3326 TCChain = UseChain;
3327 }
3328 }
3329 } else if (Copy->getOpcode() == ISD::BITCAST) {
3330 // f32 returned in a single GPR.
3331 if (!Copy->hasOneUse())
3332 return false;
3333 Copy = *Copy->user_begin();
3334 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3335 return false;
3336 // If the copy has a glue operand, we conservatively assume it isn't safe to
3337 // perform a tail call.
3338 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3339 return false;
3340 TCChain = Copy->getOperand(0);
3341 } else {
3342 return false;
3343 }
3344
3345 bool HasRet = false;
3346 for (const SDNode *U : Copy->users()) {
3347 if (U->getOpcode() != ARMISD::RET_GLUE &&
3348 U->getOpcode() != ARMISD::INTRET_GLUE)
3349 return false;
3350 HasRet = true;
3351 }
3352
3353 if (!HasRet)
3354 return false;
3355
3356 Chain = TCChain;
3357 return true;
3358}
3359
3360bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3361 if (!Subtarget->supportsTailCall())
3362 return false;
3363
3364 if (!CI->isTailCall())
3365 return false;
3366
3367 return true;
3368}
3369
3370 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3371 // values first, and pass the low and high parts through.
3372 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3373 SDLoc DL(Op);
3374 SDValue WriteValue = Op->getOperand(2);
3375
3376 // This function is only supposed to be called for i64 type argument.
3377 assert(WriteValue.getValueType() == MVT::i64
3378 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3379
3380 SDValue Lo, Hi;
3381 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3382 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3383 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3384}
3385
3386// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3387// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3388// one of the above mentioned nodes. It has to be wrapped because otherwise
3389// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3390// be used to form addressing mode. These wrapped nodes will be selected
3391// into MOVi.
3392SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3393 SelectionDAG &DAG) const {
3394 EVT PtrVT = Op.getValueType();
3395 // FIXME there is no actual debug info here
3396 SDLoc dl(Op);
3397 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3398 SDValue Res;
3399
3400 // When generating execute-only code Constant Pools must be promoted to the
3401 // global data section. It's a bit ugly that we can't share them across basic
3402 // blocks, but this way we guarantee that execute-only behaves correctly with
3403 // position-independent addressing modes.
3404 if (Subtarget->genExecuteOnly()) {
3405 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3406 auto *T = CP->getType();
3407 auto C = const_cast<Constant*>(CP->getConstVal());
3408 auto M = DAG.getMachineFunction().getFunction().getParent();
3409 auto GV = new GlobalVariable(
3410 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3413 Twine(AFI->createPICLabelUId())
3414 );
3415 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3416 dl, PtrVT);
3417 return LowerGlobalAddress(GA, DAG);
3418 }
3419
3420 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3421 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3422 Align CPAlign = CP->getAlign();
3423 if (Subtarget->isThumb1Only())
3424 CPAlign = std::max(CPAlign, Align(4));
3425 if (CP->isMachineConstantPoolEntry())
3426 Res =
3427 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3428 else
3429 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3430 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3431}
3432
3433 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3434 // If we don't have a 32-bit pc-relative branch instruction then the jump
3435 // table consists of block addresses. Usually this is inline, but for
3436 // execute-only it must be placed out-of-line.
3437 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3438 return MachineJumpTableInfo::EK_BlockAddress;
3439 return MachineJumpTableInfo::EK_Inline;
3440}
3441
3442SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3443 SelectionDAG &DAG) const {
3444 MachineFunction &MF = DAG.getMachineFunction();
3445 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3446 unsigned ARMPCLabelIndex = 0;
3447 SDLoc DL(Op);
3448 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3449 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3450 SDValue CPAddr;
3451 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3452 if (!IsPositionIndependent) {
3453 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3454 } else {
3455 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3456 ARMPCLabelIndex = AFI->createPICLabelUId();
3457 ARMConstantPoolValue *CPV =
3458 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3459 ARMCP::CPBlockAddress, PCAdj);
3460 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3461 }
3462 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3463 SDValue Result = DAG.getLoad(
3464 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3465 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3466 if (!IsPositionIndependent)
3467 return Result;
3468 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3469 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3470}
3471
3472/// Convert a TLS address reference into the correct sequence of loads
3473/// and calls to compute the variable's address for Darwin, and return an
3474/// SDValue containing the final node.
3475
3476/// Darwin only has one TLS scheme which must be capable of dealing with the
3477/// fully general situation, in the worst case. This means:
3478/// + "extern __thread" declaration.
3479/// + Defined in a possibly unknown dynamic library.
3480///
3481/// The general system is that each __thread variable has a [3 x i32] descriptor
3482/// which contains information used by the runtime to calculate the address. The
3483/// only part of this the compiler needs to know about is the first word, which
3484/// contains a function pointer that must be called with the address of the
3485/// entire descriptor in "r0".
3486///
3487/// Since this descriptor may be in a different unit, in general access must
3488/// proceed along the usual ARM rules. A common sequence to produce is:
3489///
3490/// movw rT1, :lower16:_var$non_lazy_ptr
3491/// movt rT1, :upper16:_var$non_lazy_ptr
3492/// ldr r0, [rT1]
3493/// ldr rT2, [r0]
3494/// blx rT2
3495/// [...address now in r0...]
3496SDValue
3497ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3498 SelectionDAG &DAG) const {
3499 assert(Subtarget->isTargetDarwin() &&
3500 "This function expects a Darwin target");
3501 SDLoc DL(Op);
3502
3503 // The first step is to get the address of the actual global symbol. This is
3504 // where the TLS descriptor lives.
3504 // the TLS descriptor lives.
3505 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3506
3507 // The first entry in the descriptor is a function pointer that we must call
3508 // to obtain the address of the variable.
3509 SDValue Chain = DAG.getEntryNode();
3510 SDValue FuncTLVGet = DAG.getLoad(
3511 MVT::i32, DL, Chain, DescAddr,
3515 Chain = FuncTLVGet.getValue(1);
3516
3518 MachineFrameInfo &MFI = F.getFrameInfo();
3519 MFI.setAdjustsStack(true);
3520
3521 // TLS calls preserve all registers except those that absolutely must be
3522 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3523 // silly).
3524 auto TRI =
3526 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3528
3529 // Finally, we can make the call. This is just a degenerate version of a
3530 // normal AArch64 call node: r0 takes the address of the descriptor, and
3531 // returns the address of the variable in this thread.
3532 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3533 Chain =
3534 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3535 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3536 DAG.getRegisterMask(Mask), Chain.getValue(1));
3537 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3538}
3539
3540SDValue
3541ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3542 SelectionDAG &DAG) const {
3543 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3544
3545 SDValue Chain = DAG.getEntryNode();
3546 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3547 SDLoc DL(Op);
3548
3549 // Load the current TEB (thread environment block)
3550 SDValue Ops[] = {Chain,
3551 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3552 DAG.getTargetConstant(15, DL, MVT::i32),
3553 DAG.getTargetConstant(0, DL, MVT::i32),
3554 DAG.getTargetConstant(13, DL, MVT::i32),
3555 DAG.getTargetConstant(0, DL, MVT::i32),
3556 DAG.getTargetConstant(2, DL, MVT::i32)};
3557 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3558 DAG.getVTList(MVT::i32, MVT::Other), Ops);
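// The operands above encode "mrc p15, #0, <Rt>, c13, c0, #2", i.e. a read of
// TPIDRURW, which Windows on ARM uses to hold the TEB pointer.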
3559
3560 SDValue TEB = CurrentTEB.getValue(0);
3561 Chain = CurrentTEB.getValue(1);
3562
3563 // Load the ThreadLocalStoragePointer from the TEB
3564 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3565 SDValue TLSArray =
3566 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3567 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3568
3569 // The pointer to the thread's TLS data area is found at the TLS index,
3570 // scaled by 4, as an offset into the TLSArray.
3571
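// In effect the address computed below is:
// TLS = *(TLSArray + _tls_index * 4); result = TLS + SECREL(offset of GV).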
3572 // Load the TLS index from the C runtime
3573 SDValue TLSIndex =
3574 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3575 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3576 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3577
3578 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3579 DAG.getConstant(2, DL, MVT::i32));
3580 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3581 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3583
3584 // Get the offset of the start of the .tls section (section base)
3585 const auto *GA = cast<GlobalAddressSDNode>(Op);
3586 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3587 SDValue Offset = DAG.getLoad(
3588 PtrVT, DL, Chain,
3589 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3590 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3592
3593 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3594}
3595
3596// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3597SDValue
3598ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3599 SelectionDAG &DAG) const {
3600 SDLoc dl(GA);
3601 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3602 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3603 MachineFunction &MF = DAG.getMachineFunction();
3604 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3605 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3607 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3608 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3609 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3610 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3611 Argument = DAG.getLoad(
3612 PtrVT, dl, DAG.getEntryNode(), Argument,
3614 SDValue Chain = Argument.getValue(1);
3615
3616 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3617 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3618
3619 // call __tls_get_addr.
3621 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3622
3623 // FIXME: is there useful debug info available here?
3625 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3627 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3628
3629 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3630 return CallResult.first;
3631}
3632
3633// Lower ISD::GlobalTLSAddress using the "initial exec" or
3634// "local exec" model.
3635SDValue
3636ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3637 SelectionDAG &DAG,
3638 TLSModel::Model model) const {
3639 const GlobalValue *GV = GA->getGlobal();
3640 SDLoc dl(GA);
3642 SDValue Chain = DAG.getEntryNode();
3643 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3644 // Get the Thread Pointer
3645 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3646
3647 if (model == TLSModel::InitialExec) {
3650 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3651 // Initial exec model.
3652 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3654 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3656 true);
3657 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3658 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3659 Offset = DAG.getLoad(
3660 PtrVT, dl, Chain, Offset,
3662 Chain = Offset.getValue(1);
3663
3664 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3665 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3666
3667 Offset = DAG.getLoad(
3668 PtrVT, dl, Chain, Offset,
3670 } else {
3671 // local exec model
3672 assert(model == TLSModel::LocalExec);
3675 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3676 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3677 Offset = DAG.getLoad(
3678 PtrVT, dl, Chain, Offset,
3680 }
3681
3682 // The address of the thread local variable is the add of the thread
3683 // pointer with the offset of the variable.
3684 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3685}
3686
3687SDValue
3688ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3689 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3690 if (DAG.getTarget().useEmulatedTLS())
3691 return LowerToTLSEmulatedModel(GA, DAG);
3692
3693 if (Subtarget->isTargetDarwin())
3694 return LowerGlobalTLSAddressDarwin(Op, DAG);
3695
3696 if (Subtarget->isTargetWindows())
3697 return LowerGlobalTLSAddressWindows(Op, DAG);
3698
3699 // TODO: implement the "local dynamic" model
3700 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3701 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3702
3703 switch (model) {
3704 case TLSModel::GeneralDynamic:
3705 case TLSModel::LocalDynamic:
3706 return LowerToTLSGeneralDynamicModel(GA, DAG);
3707 case TLSModel::InitialExec:
3708 case TLSModel::LocalExec:
3709 return LowerToTLSExecModels(GA, DAG, model);
3710 }
3711 llvm_unreachable("bogus TLS model");
3712}
3713
3714/// Return true if all users of V are within function F, looking through
3715/// ConstantExprs.
3716static bool allUsersAreInFunction(const Value *V, const Function *F) {
3717 SmallVector<const User*,4> Worklist(V->users());
3718 while (!Worklist.empty()) {
3719 auto *U = Worklist.pop_back_val();
3720 if (isa<ConstantExpr>(U)) {
3721 append_range(Worklist, U->users());
3722 continue;
3723 }
3724
3725 auto *I = dyn_cast<Instruction>(U);
3726 if (!I || I->getParent()->getParent() != F)
3727 return false;
3728 }
3729 return true;
3730}
3731
3733 const GlobalValue *GV, SelectionDAG &DAG,
3734 EVT PtrVT, const SDLoc &dl) {
3735 // If we're creating a pool entry for a constant global with unnamed address,
3736 // and the global is small enough, we can emit it inline into the constant pool
3737 // to save ourselves an indirection.
3738 //
3739 // This is a win if the constant is only used in one function (so it doesn't
3740 // need to be duplicated) or duplicating the constant wouldn't increase code
3741 // size (implying the constant is no larger than 4 bytes).
3742 const Function &F = DAG.getMachineFunction().getFunction();
3743
3744 // We rely on this decision to inline being idempotent and unrelated to the
3745 // use-site. We know that if we inline a variable at one use site, we'll
3746 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3747 // doesn't know about this optimization, so bail out if it's enabled; otherwise
3748 // we could decide to inline here (and thus never emit the GV) while fast-isel
3749 // generated code still requires the GV.
3752 return SDValue();
3753
3754 auto *GVar = dyn_cast<GlobalVariable>(GV);
3755 if (!GVar || !GVar->hasInitializer() ||
3756 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3757 !GVar->hasLocalLinkage())
3758 return SDValue();
3759
3760 // If we inline a value that contains relocations, we move the relocations
3761 // from .data to .text. This is not allowed in position-independent code.
3762 auto *Init = GVar->getInitializer();
3763 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3764 Init->needsDynamicRelocation())
3765 return SDValue();
3766
3767 // The constant islands pass can only really deal with alignment requests
3768 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3769 // any type wanting greater alignment requirements than 4 bytes. We also
3770 // can only promote constants that are multiples of 4 bytes in size or
3771 // are paddable to a multiple of 4. Currently we only try to pad constants
3772 // that are strings, for simplicity.
3773 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3774 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3775 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3776 unsigned RequiredPadding = 4 - (Size % 4);
3777 bool PaddingPossible =
3778 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3779 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3780 Size == 0)
3781 return SDValue();
3782
3783 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
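// For example, a 6-byte string initializer has RequiredPadding == 2 and is
// padded to 8 bytes; a size that is already a multiple of 4 gives
// RequiredPadding == 4, which means no padding is added.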
3786
3787 // We can't bloat the constant pool too much, else the ConstantIslands pass
3788 // may fail to converge. If we haven't promoted this global yet (it may have
3789 // multiple uses), and promoting it would increase the constant pool size (Sz
3790 // > 4), ensure we have space to do so up to MaxTotal.
3791 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3792 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3794 return SDValue();
3795
3796 // This is only valid if all users are in a single function; we can't clone
3797 // the constant in general. The LLVM IR unnamed_addr allows merging
3798 // constants, but not cloning them.
3799 //
3800 // We could potentially allow cloning if we could prove all uses of the
3801 // constant in the current function don't care about the address, like
3802 // printf format strings. But that isn't implemented for now.
3803 if (!allUsersAreInFunction(GVar, &F))
3804 return SDValue();
3805
3806 // We're going to inline this global. Pad it out if needed.
3807 if (RequiredPadding != 4) {
3808 StringRef S = CDAInit->getAsString();
3809
3811 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3812 while (RequiredPadding--)
3813 V.push_back(0);
3815 }
3816
3817 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3818 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3819 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3822 PaddedSize - 4);
3823 }
3824 ++NumConstpoolPromoted;
3825 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3826}
3827
3828 static bool isReadOnly(const GlobalValue *GV) {
3829 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3830 if (!(GV = GA->getAliaseeObject()))
3831 return false;
3832 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3833 return V->isConstant();
3834 return isa<Function>(GV);
3835}
3836
3837SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3838 SelectionDAG &DAG) const {
3839 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3840 default: llvm_unreachable("unknown object format");
3841 case Triple::COFF:
3842 return LowerGlobalAddressWindows(Op, DAG);
3843 case Triple::ELF:
3844 return LowerGlobalAddressELF(Op, DAG);
3845 case Triple::MachO:
3846 return LowerGlobalAddressDarwin(Op, DAG);
3847 }
3848}
3849
3850SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3851 SelectionDAG &DAG) const {
3852 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3853 SDLoc dl(Op);
3854 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3855 bool IsRO = isReadOnly(GV);
3856
3857 // promoteToConstantPool only if not generating XO text section
3858 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3859 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3860 return V;
3861
3862 if (isPositionIndependent()) {
3864 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3865 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3866 if (!GV->isDSOLocal())
3867 Result =
3868 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3870 return Result;
3871 } else if (Subtarget->isROPI() && IsRO) {
3872 // PC-relative.
3873 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3874 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3875 return Result;
3876 } else if (Subtarget->isRWPI() && !IsRO) {
3877 // SB-relative.
3878 SDValue RelAddr;
3879 if (Subtarget->useMovt()) {
3880 ++NumMovwMovt;
3881 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3882 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3883 } else { // use literal pool for address constant
3886 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3887 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3888 RelAddr = DAG.getLoad(
3889 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3891 }
3892 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3893 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3894 return Result;
3895 }
3896
3897 // If we have T2 ops, we can materialize the address directly via movt/movw
3898 // pair. This is always cheaper. If we need to generate execute-only code and
3899 // we only have Thumb1 available, we can't use a constant pool and are forced to
3900 // use immediate relocations.
3901 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3902 if (Subtarget->useMovt())
3903 ++NumMovwMovt;
3904 // FIXME: Once remat is capable of dealing with instructions with register
3905 // operands, expand this into two nodes.
3906 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3907 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3908 } else {
3909 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3910 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3911 return DAG.getLoad(
3912 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3914 }
3915}
3916
3917SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3918 SelectionDAG &DAG) const {
3919 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3920 "ROPI/RWPI not currently supported for Darwin");
3921 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3922 SDLoc dl(Op);
3923 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3924
3925 if (Subtarget->useMovt())
3926 ++NumMovwMovt;
3927
3928 // FIXME: Once remat is capable of dealing with instructions with register
3929 // operands, expand this into multiple nodes
3930 unsigned Wrapper =
3931 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3932
3933 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3934 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3935
3936 if (Subtarget->isGVIndirectSymbol(GV))
3937 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3938 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3939 return Result;
3940}
3941
3942SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3943 SelectionDAG &DAG) const {
3944 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3945 assert(Subtarget->useMovt() &&
3946 "Windows on ARM expects to use movw/movt");
3947 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3948 "ROPI/RWPI not currently supported for Windows");
3949
3950 const TargetMachine &TM = getTargetMachine();
3951 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3952 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3953 if (GV->hasDLLImportStorageClass())
3954 TargetFlags = ARMII::MO_DLLIMPORT;
3955 else if (!TM.shouldAssumeDSOLocal(GV))
3956 TargetFlags = ARMII::MO_COFFSTUB;
3957 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3958 SDValue Result;
3959 SDLoc DL(Op);
3960
3961 ++NumMovwMovt;
3962
3963 // FIXME: Once remat is capable of dealing with instructions with register
3964 // operands, expand this into two nodes.
3965 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3966 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3967 TargetFlags));
3968 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3969 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3970 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3971 return Result;
3972}
3973
3974SDValue
3975ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3976 SDLoc dl(Op);
3977 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3978 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3979 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3980 Op.getOperand(1), Val);
3981}
3982
3983SDValue
3984ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3985 SDLoc dl(Op);
3986 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3987 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3988}
3989
3990SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3991 SelectionDAG &DAG) const {
3992 SDLoc dl(Op);
3993 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3994 Op.getOperand(0));
3995}
3996
3997SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3998 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3999 unsigned IntNo =
4000 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4001 switch (IntNo) {
4002 default:
4003 return SDValue(); // Don't custom lower most intrinsics.
4004 case Intrinsic::arm_gnu_eabi_mcount: {
4005 MachineFunction &MF = DAG.getMachineFunction();
4006 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4007 SDLoc dl(Op);
4008 SDValue Chain = Op.getOperand(0);
4009 // call "\01__gnu_mcount_nc"
4010 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4011 const uint32_t *Mask =
4012 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
4013 assert(Mask && "Missing call preserved mask for calling convention");
4014 // Mark LR an implicit live-in.
4015 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4016 SDValue ReturnAddress =
4017 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4018 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4019 SDValue Callee =
4020 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4021 SDValue RegisterMask = DAG.getRegisterMask(Mask);
4022 if (Subtarget->isThumb())
4023 return SDValue(
4024 DAG.getMachineNode(
4025 ARM::tBL_PUSHLR, dl, ResultTys,
4026 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4027 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4028 0);
4029 return SDValue(
4030 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4031 {ReturnAddress, Callee, RegisterMask, Chain}),
4032 0);
4033 }
4034 }
4035}
4036
4037SDValue
4038ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4039 const ARMSubtarget *Subtarget) const {
4040 unsigned IntNo = Op.getConstantOperandVal(0);
4041 SDLoc dl(Op);
4042 switch (IntNo) {
4043 default: return SDValue(); // Don't custom lower most intrinsics.
4044 case Intrinsic::thread_pointer: {
4045 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4046 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4047 }
4048 case Intrinsic::arm_cls: {
4049 const SDValue &Operand = Op.getOperand(1);
4050 const EVT VTy = Op.getValueType();
4051 SDValue SRA =
4052 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4053 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4054 SDValue SHL =
4055 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4056 SDValue OR =
4057 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4058 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4059 return Result;
4060 }
4061 case Intrinsic::arm_cls64: {
4062 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4063 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
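// For example, with x = 0x00000000000000FF: cls(hi) = cls(0) = 31, and since
// hi == 0 the result is 31 + clz(0xFF) = 31 + 24 = 55, the number of leading
// bits that match the (zero) sign bit. With hi = 0x00000001 instead,
// cls(hi) = 30 != 31, so the result is simply 30.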
4064 const SDValue &Operand = Op.getOperand(1);
4065 const EVT VTy = Op.getValueType();
4066 SDValue Lo, Hi;
4067 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4068 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4069 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4070 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4071 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4072 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4073 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4074 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4075 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4076 SDValue CheckLo =
4077 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4078 SDValue HiIsZero =
4079 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4080 SDValue AdjustedLo =
4081 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4082 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4083 SDValue Result =
4084 DAG.getSelect(dl, VTy, CheckLo,
4085 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4086 return Result;
4087 }
4088 case Intrinsic::eh_sjlj_lsda: {
4089 MachineFunction &MF = DAG.getMachineFunction();
4090 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4091 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4092 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4093 SDValue CPAddr;
4094 bool IsPositionIndependent = isPositionIndependent();
4095 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4096 ARMConstantPoolValue *CPV =
4097 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4098 ARMCP::CPLSDA, PCAdj);
4099 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4100 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4101 SDValue Result = DAG.getLoad(
4102 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4103 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
4104
4105 if (IsPositionIndependent) {
4106 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4107 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4108 }
4109 return Result;
4110 }
4111 case Intrinsic::arm_neon_vabs:
4112 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4113 Op.getOperand(1));
4114 case Intrinsic::arm_neon_vabds:
4115 if (Op.getValueType().isInteger())
4116 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4117 Op.getOperand(1), Op.getOperand(2));
4118 return SDValue();
4119 case Intrinsic::arm_neon_vabdu:
4120 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4121 Op.getOperand(1), Op.getOperand(2));
4122 case Intrinsic::arm_neon_vmulls:
4123 case Intrinsic::arm_neon_vmullu: {
4124 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4125 ? ARMISD::VMULLs : ARMISD::VMULLu;
4126 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4127 Op.getOperand(1), Op.getOperand(2));
4128 }
4129 case Intrinsic::arm_neon_vminnm:
4130 case Intrinsic::arm_neon_vmaxnm: {
4131 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4132 ? ISD::FMINNUM : ISD::FMAXNUM;
4133 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4134 Op.getOperand(1), Op.getOperand(2));
4135 }
4136 case Intrinsic::arm_neon_vminu:
4137 case Intrinsic::arm_neon_vmaxu: {
4138 if (Op.getValueType().isFloatingPoint())
4139 return SDValue();
4140 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4141 ? ISD::UMIN : ISD::UMAX;
4142 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4143 Op.getOperand(1), Op.getOperand(2));
4144 }
4145 case Intrinsic::arm_neon_vmins:
4146 case Intrinsic::arm_neon_vmaxs: {
4147 // v{min,max}s is overloaded between signed integers and floats.
4148 if (!Op.getValueType().isFloatingPoint()) {
4149 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4150 ? ISD::SMIN : ISD::SMAX;
4151 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4152 Op.getOperand(1), Op.getOperand(2));
4153 }
4154 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4155 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4156 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4157 Op.getOperand(1), Op.getOperand(2));
4158 }
4159 case Intrinsic::arm_neon_vtbl1:
4160 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4161 Op.getOperand(1), Op.getOperand(2));
4162 case Intrinsic::arm_neon_vtbl2:
4163 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4164 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4165 case Intrinsic::arm_mve_pred_i2v:
4166 case Intrinsic::arm_mve_pred_v2i:
4167 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4168 Op.getOperand(1));
4169 case Intrinsic::arm_mve_vreinterpretq:
4170 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4171 Op.getOperand(1));
4172 case Intrinsic::arm_mve_lsll:
4173 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4174 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4175 case Intrinsic::arm_mve_asrl:
4176 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4177 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4178 }
4179}
4180
4181static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4182 const ARMSubtarget *Subtarget) {
4183 SDLoc dl(Op);
4184 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4185 if (SSID == SyncScope::SingleThread)
4186 return Op;
4187
4188 if (!Subtarget->hasDataBarrier()) {
4189 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4190 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4191 // here.
4192 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4193 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4194 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4195 DAG.getConstant(0, dl, MVT::i32));
4196 }
4197
4198 AtomicOrdering Ord =
4199 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4200 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4201 if (Subtarget->isMClass()) {
4202 // Only a full system barrier exists in the M-class architectures.
4203 Domain = ARM_MB::SY;
4204 } else if (Subtarget->preferISHSTBarriers() &&
4205 Ord == AtomicOrdering::Release) {
4206 // Swift happens to implement ISHST barriers in a way that's compatible with
4207 // Release semantics but weaker than ISH so we'd be fools not to use
4208 // it. Beware: other processors probably don't!
4209 Domain = ARM_MB::ISHST;
4210 }
4211
4212 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4213 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4214 DAG.getConstant(Domain, dl, MVT::i32));
4215}
4216
4217static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4218 const ARMSubtarget *Subtarget) {
4219 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4220 if (!(Subtarget->isThumb2() ||
4221 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4222 // Just preserve the chain.
4223 return Op.getOperand(0);
4224
4225 SDLoc dl(Op);
4226 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4227 if (!isRead &&
4228 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4229 // ARMv7 with MP extension has PLDW.
4230 return Op.getOperand(0);
4231
4232 unsigned isData = Op.getConstantOperandVal(4);
4233 if (Subtarget->isThumb()) {
4234 // Invert the bits.
4235 isRead = ~isRead & 1;
4236 isData = ~isData & 1;
4237 }
4238
4239 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4240 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4241 DAG.getConstant(isData, dl, MVT::i32));
4242}
4243
4244static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4245 MachineFunction &MF = DAG.getMachineFunction();
4246 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4247
4248 // vastart just stores the address of the VarArgsFrameIndex slot into the
4249 // memory location argument.
4250 SDLoc dl(Op);
4251 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4252 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4253 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4254 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4255 MachinePointerInfo(SV));
4256}
4257
4258SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4259 CCValAssign &NextVA,
4260 SDValue &Root,
4261 SelectionDAG &DAG,
4262 const SDLoc &dl) const {
4263 MachineFunction &MF = DAG.getMachineFunction();
4264 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4265
4266 const TargetRegisterClass *RC;
4267 if (AFI->isThumb1OnlyFunction())
4268 RC = &ARM::tGPRRegClass;
4269 else
4270 RC = &ARM::GPRRegClass;
4271
4272 // Transform the arguments stored in physical registers into virtual ones.
4273 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4274 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4275
4276 SDValue ArgValue2;
4277 if (NextVA.isMemLoc()) {
4278 MachineFrameInfo &MFI = MF.getFrameInfo();
4279 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4280
4281 // Create load node to retrieve arguments from the stack.
4282 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4283 ArgValue2 = DAG.getLoad(
4284 MVT::i32, dl, Root, FIN,
4285 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4286 } else {
4287 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4288 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4289 }
4290 if (!Subtarget->isLittle())
4291 std::swap (ArgValue, ArgValue2);
4292 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4293}
4294
4295// The remaining GPRs hold either the beginning of variable-argument
4296// data, or the beginning of an aggregate passed by value (usually
4297// byval). Either way, we allocate stack slots adjacent to the data
4298// provided by our caller, and store the unallocated registers there.
4299// If this is a variadic function, the va_list pointer will begin with
4300// these values; otherwise, this reassembles a (byval) structure that
4301// was split between registers and memory.
4302 // Return: The frame index that the registers were stored into.
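// For example, in a variadic function whose named arguments occupy r0-r1, the
// unallocated r2 and r3 are stored into stack slots adjacent to the
// caller-provided stack arguments, so va_arg can walk all of the variadic
// arguments contiguously.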
4303int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4304 const SDLoc &dl, SDValue &Chain,
4305 const Value *OrigArg,
4306 unsigned InRegsParamRecordIdx,
4307 int ArgOffset, unsigned ArgSize) const {
4308 // Currently, two use-cases are possible:
4309 // Case #1. Non-var-args function, and we meet the first byval parameter.
4310 // Set up the first unallocated register as the first byval register;
4311 // eat all remaining registers
4312 // (these two actions are performed by the HandleByVal method).
4313 // Then, here, we initialize the stack frame with
4314 // "store-reg" instructions.
4315 // Case #2. Var-args function that doesn't contain byval parameters.
4316 // The same: eat all remaining unallocated registers and
4317 // initialize the stack frame.
4318
4319 MachineFunction &MF = DAG.getMachineFunction();
4320 MachineFrameInfo &MFI = MF.getFrameInfo();
4321 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4322 unsigned RBegin, REnd;
4323 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4324 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4325 } else {
4326 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4327 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4328 REnd = ARM::R4;
4329 }
4330
4331 if (REnd != RBegin)
4332 ArgOffset = -4 * (ARM::R4 - RBegin);
4333
4334 auto PtrVT = getPointerTy(DAG.getDataLayout());
4335 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4336 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4337
4338 SmallVector<SDValue, 4> MemOps;
4339 const TargetRegisterClass *RC =
4340 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4341
4342 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4343 Register VReg = MF.addLiveIn(Reg, RC);
4344 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4345 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4346 MachinePointerInfo(OrigArg, 4 * i));
4347 MemOps.push_back(Store);
4348 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4349 }
4350
4351 if (!MemOps.empty())
4352 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4353 return FrameIndex;
4354}
4355
4356 // Set up the stack frame that the va_list pointer will start from.
4357void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4358 const SDLoc &dl, SDValue &Chain,
4359 unsigned ArgOffset,
4360 unsigned TotalArgRegsSaveSize,
4361 bool ForceMutable) const {
4362 MachineFunction &MF = DAG.getMachineFunction();
4363 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4364
4365 // Try to store any remaining integer argument regs
4366 // to their spots on the stack so that they may be loaded by dereferencing
4367 // the result of va_next.
4368 // If there are no regs to be stored, just point the address after the last
4369 // argument passed via the stack.
4370 int FrameIndex = StoreByValRegs(
4371 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4372 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4373 AFI->setVarArgsFrameIndex(FrameIndex);
4374}
4375
4376bool ARMTargetLowering::splitValueIntoRegisterParts(
4377 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4378 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4379 EVT ValueVT = Val.getValueType();
4380 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4381 unsigned ValueBits = ValueVT.getSizeInBits();
4382 unsigned PartBits = PartVT.getSizeInBits();
4383 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4384 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4385 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4386 Parts[0] = Val;
4387 return true;
4388 }
4389 return false;
4390}
4391
4392SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4393 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4394 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4395 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4396 unsigned ValueBits = ValueVT.getSizeInBits();
4397 unsigned PartBits = PartVT.getSizeInBits();
4398 SDValue Val = Parts[0];
4399
4400 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4401 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4402 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4403 return Val;
4404 }
4405 return SDValue();
4406}
4407
4408SDValue ARMTargetLowering::LowerFormalArguments(
4409 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4410 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4411 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4412 MachineFunction &MF = DAG.getMachineFunction();
4413 MachineFrameInfo &MFI = MF.getFrameInfo();
4414
4415 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4416
4417 // Assign locations to all of the incoming arguments.
4418 SmallVector<CCValAssign, 16> ArgLocs;
4419 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4420 *DAG.getContext());
4421 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4422
4423 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4424 unsigned CurArgIdx = 0;
4425
4426 // Initially ArgRegsSaveSize is zero.
4427 // Then we increase this value each time we meet a byval parameter.
4428 // We also increase this value in the case of a varargs function.
4429 AFI->setArgRegsSaveSize(0);
4430
4431 // Calculate the amount of stack space that we need to allocate to store
4432 // byval and variadic arguments that are passed in registers.
4433 // We need to know this before we allocate the first byval or variadic
4434 // argument, as they will be allocated a stack slot below the CFA (Canonical
4435 // Frame Address, the stack pointer at entry to the function).
4436 unsigned ArgRegBegin = ARM::R4;
4437 for (const CCValAssign &VA : ArgLocs) {
4438 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4439 break;
4440
4441 unsigned Index = VA.getValNo();
4442 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4443 if (!Flags.isByVal())
4444 continue;
4445
4446 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4447 unsigned RBegin, REnd;
4448 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4449 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4450
4451 CCInfo.nextInRegsParam();
4452 }
4453 CCInfo.rewindByValRegsInfo();
4454
4455 int lastInsIndex = -1;
4456 if (isVarArg && MFI.hasVAStart()) {
4457 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4458 if (RegIdx != std::size(GPRArgRegs))
4459 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4460 }
4461
4462 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4463 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4464 auto PtrVT = getPointerTy(DAG.getDataLayout());
4465
4466 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4467 CCValAssign &VA = ArgLocs[i];
4468 if (Ins[VA.getValNo()].isOrigArg()) {
4469 std::advance(CurOrigArg,
4470 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4471 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4472 }
4473 // Arguments stored in registers.
4474 if (VA.isRegLoc()) {
4475 EVT RegVT = VA.getLocVT();
4476 SDValue ArgValue;
4477
4478 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4479 // f64 and vector types are split up into multiple registers or
4480 // combinations of registers and stack slots.
4481 SDValue ArgValue1 =
4482 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4483 VA = ArgLocs[++i]; // skip ahead to next loc
4484 SDValue ArgValue2;
4485 if (VA.isMemLoc()) {
4486 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4487 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4488 ArgValue2 = DAG.getLoad(
4489 MVT::f64, dl, Chain, FIN,
4491 } else {
4492 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4493 }
4494 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4495 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4496 ArgValue1, DAG.getIntPtrConstant(0, dl));
4497 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4498 ArgValue2, DAG.getIntPtrConstant(1, dl));
4499 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4500 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4501 } else {
4502 const TargetRegisterClass *RC;
4503
4504 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4505 RC = &ARM::HPRRegClass;
4506 else if (RegVT == MVT::f32)
4507 RC = &ARM::SPRRegClass;
4508 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4509 RegVT == MVT::v4bf16)
4510 RC = &ARM::DPRRegClass;
4511 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4512 RegVT == MVT::v8bf16)
4513 RC = &ARM::QPRRegClass;
4514 else if (RegVT == MVT::i32)
4515 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4516 : &ARM::GPRRegClass;
4517 else
4518 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4519
4520 // Transform the arguments in physical registers into virtual ones.
4521 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4522 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4523
4524 // If this value is passed in r0 and has the returned attribute (e.g.
4525 // C++ 'structors), record this fact for later use.
4526 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4527 AFI->setPreservesR0();
4528 }
4529 }
4530
4531 // If this is an 8 or 16-bit value, it is really passed promoted
4532 // to 32 bits. Insert an assert[sz]ext to capture this, then
4533 // truncate to the right size.
4534 switch (VA.getLocInfo()) {
4535 default: llvm_unreachable("Unknown loc info!");
4536 case CCValAssign::Full: break;
4537 case CCValAssign::BCvt:
4538 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4539 break;
4540 }
4541
4542 // f16 arguments have their size extended to 4 bytes and passed as if they
4543 // had been copied to the LSBs of a 32-bit register.
4544 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4545 if (VA.needsCustom() &&
4546 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4547 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4548
4549 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4550 // less than 32 bits must be sign- or zero-extended in the callee for
4551 // security reasons. Although the ABI mandates an extension done by the
4552 // caller, the latter cannot be trusted to follow the rules of the ABI.
4553 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4554 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4555 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4556 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4557
4558 InVals.push_back(ArgValue);
4559 } else { // VA.isRegLoc()
4560 // Only arguments passed on the stack should make it here.
4561 assert(VA.isMemLoc());
4562 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4563
4564 int index = VA.getValNo();
4565
4566 // Some Ins[] entries become multiple ArgLoc[] entries.
4567 // Process them only once.
4568 if (index != lastInsIndex)
4569 {
4570 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4571 // FIXME: For now, all byval parameter objects are marked mutable.
4572 // This can be changed with more analysis.
4573 // In case of tail call optimization, mark all arguments mutable, since
4574 // they could be overwritten by the lowering of arguments in the case of
4575 // a tail call.
4576 if (Flags.isByVal()) {
4577 assert(Ins[index].isOrigArg() &&
4578 "Byval arguments cannot be implicit");
4579 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4580
4581 int FrameIndex = StoreByValRegs(
4582 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4583 VA.getLocMemOffset(), Flags.getByValSize());
4584 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4585 CCInfo.nextInRegsParam();
4586 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4587 VA.getValVT() == MVT::bf16)) {
4588 // f16 and bf16 values are passed in the least-significant half of
4589 // a 4 byte stack slot. This is done as-if the extension was done
4590 // in a 32-bit register, so the actual bytes used for the value
4591 // differ between little and big endian.
4592 assert(VA.getLocVT().getSizeInBits() == 32);
4593 unsigned FIOffset = VA.getLocMemOffset();
4594 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4595 FIOffset, true);
4596
4597 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4598 if (DAG.getDataLayout().isBigEndian())
4599 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4600
4601 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4602 MachinePointerInfo::getFixedStack(
4603 DAG.getMachineFunction(), FI)));
4604
4605 } else {
4606 unsigned FIOffset = VA.getLocMemOffset();
4607 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4608 FIOffset, true);
4609
4610 // Create load nodes to retrieve arguments from the stack.
4611 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4612 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4613 MachinePointerInfo::getFixedStack(
4614 DAG.getMachineFunction(), FI)));
4615 }
4616 lastInsIndex = index;
4617 }
4618 }
4619 }
4620
4621 // varargs
4622 if (isVarArg && MFI.hasVAStart()) {
4623 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4624 TotalArgRegsSaveSize);
4625 if (AFI->isCmseNSEntryFunction()) {
4628 "secure entry function must not be variadic", dl.getDebugLoc()));
4629 }
4630 }
4631
4632 unsigned StackArgSize = CCInfo.getStackSize();
4633 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4634 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4635 // The only way to guarantee a tail call is if the callee restores its
4636 // argument area, but it must also keep the stack aligned when doing so.
4637 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4638 assert(StackAlign && "data layout string is missing stack alignment");
4639 StackArgSize = alignTo(StackArgSize, *StackAlign);
4640
4641 AFI->setArgumentStackToRestore(StackArgSize);
4642 }
4643 AFI->setArgumentStackSize(StackArgSize);
4644
4645 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4648 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4649 }
4650
4651 return Chain;
4652}
4653
4654/// isFloatingPointZero - Return true if this is +0.0.
4655static bool isFloatingPointZero(SDValue Op) {
4656 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4657 return CFP->getValueAPF().isPosZero();
4658 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4659 // Maybe this has already been legalized into the constant pool?
4660 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4661 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4662 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4663 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4664 return CFP->getValueAPF().isPosZero();
4665 }
4666 } else if (Op->getOpcode() == ISD::BITCAST &&
4667 Op->getValueType(0) == MVT::f64) {
4668 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4669 // created by LowerConstantFP().
4670 SDValue BitcastOp = Op->getOperand(0);
4671 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4672 isNullConstant(BitcastOp->getOperand(0)))
4673 return true;
4674 }
4675 return false;
4676}
4677
4678/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4679/// the given operands.
4680SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4681 SDValue &ARMcc, SelectionDAG &DAG,
4682 const SDLoc &dl) const {
4683 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4684 unsigned C = RHSC->getZExtValue();
4685 if (!isLegalICmpImmediate((int32_t)C)) {
4686 // Constant does not fit, try adjusting it by one.
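// For example, "x < 257" (SETLT with C = 0x101, which is not a valid ARM
// modified immediate) becomes "x <= 256" (SETLE with C - 1 = 0x100, which is).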
4687 switch (CC) {
4688 default: break;
4689 case ISD::SETLT:
4690 case ISD::SETGE:
4691 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4692 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4693 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4694 }
4695 break;
4696 case ISD::SETULT:
4697 case ISD::SETUGE:
4698 if (C != 0 && isLegalICmpImmediate(C-1)) {
4699 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4700 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4701 }
4702 break;
4703 case ISD::SETLE:
4704 case ISD::SETGT:
4705 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4706 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4707 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4708 }
4709 break;
4710 case ISD::SETULE:
4711 case ISD::SETUGT:
4712 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4713 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4714 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4715 }
4716 break;
4717 }
4718 }
4719 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4720 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4721 // In ARM and Thumb-2, the compare instructions can shift their second
4722 // operand.
4723 CC = ISD::getSetCCSwappedOperands(CC);
4724 std::swap(LHS, RHS);
4725 }
4726
4727 // Thumb1 has very limited immediate modes, so turning an "and" into a
4728 // shift can save multiple instructions.
4729 //
4730 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4731 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4732 // own. If it's the operand to an unsigned comparison with an immediate,
4733 // we can eliminate one of the shifts: we transform
4734 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4735 //
4736 // We avoid transforming cases which aren't profitable due to encoding
4737 // details:
4738 //
4739 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4740 // would not; in that case, we're essentially trading one immediate load for
4741 // another.
4742 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4743 // 3. C2 is zero; we have other code for this special case.
4744 //
4745 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4746 // instruction, since the AND is always one instruction anyway, but we could
4747 // use narrow instructions in some cases.
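// For example, "(x & 0x3ff) == 0x100" becomes "((x << 22) >> 22) == 0x100" and
// then "(x << 22) == (0x100 << 22)", i.e. "(x << 22) == 0x40000000", saving
// the materialization of the 0x3ff mask constant.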
4748 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4749 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4750 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4751 !isSignedIntSetCC(CC)) {
4752 unsigned Mask = LHS.getConstantOperandVal(1);
4753 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4754 uint64_t RHSV = RHSC->getZExtValue();
4755 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4756 unsigned ShiftBits = llvm::countl_zero(Mask);
4757 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4758 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4759 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4760 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4761 }
4762 }
4763 }
4764
4765 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4766 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4767 // way a cmp would.
4768 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4769 // some tweaks to the heuristics for the previous and->shift transform.
4770 // FIXME: Optimize cases where the LHS isn't a shift.
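// For example, "(x << 2) > 0x80000000U" becomes a single "lsls" of x by 3: the
// carry flag receives bit 31 of (x << 2), Z is set only when the remaining low
// bits of (x << 2) are all zero, and "C && !Z" is exactly the HI condition
// used below.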
4771 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4772 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4773 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4774 LHS.getConstantOperandVal(1) < 31) {
4775 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4776 SDValue Shift =
4777 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4778 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4779 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4780 return Shift.getValue(1);
4781 }
4782
4783 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4784
4785 // If the RHS is a constant zero then the V (overflow) flag will never be
4786 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4787 // simpler for other passes (like the peephole optimiser) to deal with.
4788 if (isNullConstant(RHS)) {
4789 switch (CondCode) {
4790 default: break;
4791 case ARMCC::GE:
4792 CondCode = ARMCC::PL;
4793 break;
4794 case ARMCC::LT:
4795 CondCode = ARMCC::MI;
4796 break;
4797 }
4798 }
4799
4800 ARMISD::NodeType CompareType;
4801 switch (CondCode) {
4802 default:
4803 CompareType = ARMISD::CMP;
4804 break;
4805 case ARMCC::EQ:
4806 case ARMCC::NE:
4807 // Uses only Z Flag
4808 CompareType = ARMISD::CMPZ;
4809 break;
4810 }
4811 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4812 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4813}
4814
4815/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4816SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4817 SelectionDAG &DAG, const SDLoc &dl,
4818 bool Signaling) const {
4819 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4820 SDValue Flags;
4821 if (!isFloatingPointZero(RHS))
4822 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4823 LHS, RHS);
4824 else
4825 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4826 FlagsVT, LHS);
4827 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4828}
4829
4830// This function returns three things: the arithmetic computation itself
4831// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4832// comparison and the condition code define the case in which the arithmetic
4833// computation *does not* overflow.
4834std::pair<SDValue, SDValue>
4835ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4836 SDValue &ARMcc) const {
4837 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4838
4839 SDValue Value, OverflowCmp;
4840 SDValue LHS = Op.getOperand(0);
4841 SDValue RHS = Op.getOperand(1);
4842 SDLoc dl(Op);
4843
4844 // FIXME: We are currently always generating CMPs because we don't support
4845 // generating CMN through the backend. This is not as good as the natural
4846 // CMP case because it causes a register dependency and cannot be folded
4847 // later.
4848
4849 switch (Op.getOpcode()) {
4850 default:
4851 llvm_unreachable("Unknown overflow instruction!");
4852 case ISD::SADDO:
4853 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4854 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4855 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4856 break;
4857 case ISD::UADDO:
4858 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4859 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4860 // We do not use it in the USUBO case as Value may not be used.
4861 Value = DAG.getNode(ARMISD::ADDC, dl,
4862 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4863 .getValue(0);
4864 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4865 break;
4866 case ISD::SSUBO:
4867 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4868 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4869 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4870 break;
4871 case ISD::USUBO:
4872 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4873 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4874 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4875 break;
4876 case ISD::UMULO:
4877 // We generate a UMUL_LOHI and then check if the high word is 0.
4878 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4879 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4880 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4881 LHS, RHS);
4882 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4883 DAG.getConstant(0, dl, MVT::i32));
4884 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4885 break;
4886 case ISD::SMULO:
4887 // We generate a SMUL_LOHI and then check if all the bits of the high word
4888 // are the same as the sign bit of the low word.
4889 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4890 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4891 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4892 LHS, RHS);
4893 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4894 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4895 Value.getValue(0),
4896 DAG.getConstant(31, dl, MVT::i32)));
4897 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4898 break;
4899 } // switch (...)
4900
4901 return std::make_pair(Value, OverflowCmp);
4902}
4903
4904SDValue
4905ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4906 // Let legalize expand this if it isn't a legal type yet.
4907 if (!isTypeLegal(Op.getValueType()))
4908 return SDValue();
4909
4910 SDValue Value, OverflowCmp;
4911 SDValue ARMcc;
4912 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4913 SDLoc dl(Op);
4914 // We use 0 and 1 as false and true values.
4915 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4916 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4917 EVT VT = Op.getValueType();
4918
4919 SDValue Overflow =
4920 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4921
4922 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4923 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4924}
4925
4926static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4927 SelectionDAG &DAG) {
4928 SDLoc DL(BoolCarry);
4929 EVT CarryVT = BoolCarry.getValueType();
4930
4931 // This converts the boolean value carry into the carry flag by doing
4932 // ARMISD::SUBC Carry, 1
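// Carry == 1 computes 1 - 1 = 0 with no borrow, so the ARM C flag (an
// inverted borrow) ends up set; Carry == 0 borrows and leaves C clear, so the
// boolean value maps directly onto the flag.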
4933 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4934 DAG.getVTList(CarryVT, MVT::i32),
4935 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4936 return Carry.getValue(1);
4937}
4938
4939static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4940 SelectionDAG &DAG) {
4941 SDLoc DL(Flags);
4942
4943 // Now convert the carry flag into a boolean carry. We do this
4944 // using ARMISD::ADDE 0, 0, Carry
4945 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4946 DAG.getConstant(0, DL, MVT::i32),
4947 DAG.getConstant(0, DL, MVT::i32), Flags);
4948}
4949
4950SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4951 SelectionDAG &DAG) const {
4952 // Let legalize expand this if it isn't a legal type yet.
4953 if (!isTypeLegal(Op.getValueType()))
4954 return SDValue();
4955
4956 SDValue LHS = Op.getOperand(0);
4957 SDValue RHS = Op.getOperand(1);
4958 SDLoc dl(Op);
4959
4960 EVT VT = Op.getValueType();
4961 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4962 SDValue Value;
4963 SDValue Overflow;
4964 switch (Op.getOpcode()) {
4965 default:
4966 llvm_unreachable("Unknown overflow instruction!");
4967 case ISD::UADDO:
4968 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4969 // Convert the carry flag into a boolean value.
4970 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4971 break;
4972 case ISD::USUBO: {
4973 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4974 // Convert the carry flag into a boolean value.
4975 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4976 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4977 // value. So compute 1 - C.
4978 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4979 DAG.getConstant(1, dl, MVT::i32), Overflow);
4980 break;
4981 }
4982 }
4983
4984 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4985}
4986
4987static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4988 const ARMSubtarget *Subtarget) {
4989 EVT VT = Op.getValueType();
4990 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4991 return SDValue();
4992 if (!VT.isSimple())
4993 return SDValue();
4994
4995 unsigned NewOpcode;
4996 switch (VT.getSimpleVT().SimpleTy) {
4997 default:
4998 return SDValue();
4999 case MVT::i8:
5000 switch (Op->getOpcode()) {
5001 case ISD::UADDSAT:
5002 NewOpcode = ARMISD::UQADD8b;
5003 break;
5004 case ISD::SADDSAT:
5005 NewOpcode = ARMISD::QADD8b;
5006 break;
5007 case ISD::USUBSAT:
5008 NewOpcode = ARMISD::UQSUB8b;
5009 break;
5010 case ISD::SSUBSAT:
5011 NewOpcode = ARMISD::QSUB8b;
5012 break;
5013 }
5014 break;
5015 case MVT::i16:
5016 switch (Op->getOpcode()) {
5017 case ISD::UADDSAT:
5018 NewOpcode = ARMISD::UQADD16b;
5019 break;
5020 case ISD::SADDSAT:
5021 NewOpcode = ARMISD::QADD16b;
5022 break;
5023 case ISD::USUBSAT:
5024 NewOpcode = ARMISD::UQSUB16b;
5025 break;
5026 case ISD::SSUBSAT:
5027 NewOpcode = ARMISD::QSUB16b;
5028 break;
5029 }
5030 break;
5031 }
5032
5033 SDLoc dl(Op);
5034 SDValue Add =
5035 DAG.getNode(NewOpcode, dl, MVT::i32,
5036 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5037 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5038 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5039}
5040
5041SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5042 SDValue Cond = Op.getOperand(0);
5043 SDValue SelectTrue = Op.getOperand(1);
5044 SDValue SelectFalse = Op.getOperand(2);
5045 SDLoc dl(Op);
5046 unsigned Opc = Cond.getOpcode();
5047
5048 if (Cond.getResNo() == 1 &&
5049 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5050 Opc == ISD::USUBO)) {
5051 if (!isTypeLegal(Cond->getValueType(0)))
5052 return SDValue();
5053
5054 SDValue Value, OverflowCmp;
5055 SDValue ARMcc;
5056 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5057 EVT VT = Op.getValueType();
5058
5059 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
5060 }
5061
5062 // Convert:
5063 //
5064 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5065 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5066 //
5067 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5068 const ConstantSDNode *CMOVTrue =
5069 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5070 const ConstantSDNode *CMOVFalse =
5071 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5072
5073 if (CMOVTrue && CMOVFalse) {
5074 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5075 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5076
5077 SDValue True;
5078 SDValue False;
5079 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5080 True = SelectTrue;
5081 False = SelectFalse;
5082 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5083 True = SelectFalse;
5084 False = SelectTrue;
5085 }
5086
5087 if (True.getNode() && False.getNode())
5088 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
5089 Cond.getOperand(3), DAG);
5090 }
5091 }
5092
5093 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5094 // undefined bits before doing a full-word comparison with zero.
5095 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5096 DAG.getConstant(1, dl, Cond.getValueType()));
5097
5098 return DAG.getSelectCC(dl, Cond,
5099 DAG.getConstant(0, dl, Cond.getValueType()),
5100 SelectTrue, SelectFalse, ISD::SETNE);
5101}
5102
5103static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5104 bool &swpCmpOps, bool &swpVselOps) {
5105 // Start by selecting the GE condition code for opcodes that return true for
5106 // 'equality'
5107 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5108 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5109 CondCode = ARMCC::GE;
5110
5111 // and GT for opcodes that return false for 'equality'.
5112 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5113 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5114 CondCode = ARMCC::GT;
5115
5116 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5117 // to swap the compare operands.
5118 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5119 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5120 swpCmpOps = true;
5121
5122 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5123 // If we have an unordered opcode, we need to swap the operands to the VSEL
5124 // instruction (effectively negating the condition).
5125 //
5126 // This also has the effect of swapping which one of 'less' or 'greater'
5127 // returns true, so we also swap the compare operands. It also switches
5128 // whether we return true for 'equality', so we compensate by picking the
5129 // opposite condition code to our original choice.
5130 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5131 CC == ISD::SETUGT) {
5132 swpCmpOps = !swpCmpOps;
5133 swpVselOps = !swpVselOps;
5134 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5135 }
5136
5137 // 'ordered' is 'anything but unordered', so use the VS condition code and
5138 // swap the VSEL operands.
5139 if (CC == ISD::SETO) {
5140 CondCode = ARMCC::VS;
5141 swpVselOps = true;
5142 }
5143
5144 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5145 // code and swap the VSEL operands. Also do this if we don't care about the
5146 // unordered case.
5147 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5148 CondCode = ARMCC::EQ;
5149 swpVselOps = true;
5150 }
5151}
5152
5153SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5154 SDValue TrueVal, SDValue ARMcc,
5155 SDValue Flags, SelectionDAG &DAG) const {
5156 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5157 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5158 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5159 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5160 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5161
5162 SDValue TrueLow = TrueVal.getValue(0);
5163 SDValue TrueHigh = TrueVal.getValue(1);
5164 SDValue FalseLow = FalseVal.getValue(0);
5165 SDValue FalseHigh = FalseVal.getValue(1);
5166
5167 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5168 ARMcc, Flags);
5169 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5170 ARMcc, Flags);
5171
5172 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5173 }
5174 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5175}
5176
5177static bool isGTorGE(ISD::CondCode CC) {
5178 return CC == ISD::SETGT || CC == ISD::SETGE;
5179}
5180
5181static bool isLTorLE(ISD::CondCode CC) {
5182 return CC == ISD::SETLT || CC == ISD::SETLE;
5183}
5184
5185// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5186// All of these conditions (and their <= and >= counterparts) will do:
5187// x < k ? k : x
5188// x > k ? x : k
5189// k < x ? x : k
5190// k > x ? k : x
5191static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5192 const SDValue TrueVal, const SDValue FalseVal,
5193 const ISD::CondCode CC, const SDValue K) {
5194 return (isGTorGE(CC) &&
5195 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5196 (isLTorLE(CC) &&
5197 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5198}
5199
5200// Check if two chained conditionals could be converted into SSAT or USAT.
5201//
5202// SSAT can replace a set of two conditional selectors that bound a number to an
5203// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5204//
5205// x < -k ? -k : (x > k ? k : x)
5206// x < -k ? -k : (x < k ? x : k)
5207// x > -k ? (x > k ? k : x) : -k
5208// x < k ? (x < -k ? -k : x) : k
5209// etc.
5210//
5211// LLVM canonicalizes these to either a min(max()) or a max(min())
5212// pattern. This function tries to match one of these and will return a SSAT
5213// node if successful.
5214//
5215 // USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
5216// is a power of 2.
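// For example, with k = 127 the clamp "x < -128 ? -128 : (x > 127 ? 127 : x)"
// bounds x to [-128, 127] and can become an SSAT with an 8-bit saturation
// width, while "x < 0 ? 0 : (x > 255 ? 255 : x)" bounds x to [0, 255] and can
// become a USAT with an 8-bit width.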
5217static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5218 EVT VT = Op.getValueType();
5219 SDValue V1 = Op.getOperand(0);
5220 SDValue K1 = Op.getOperand(1);
5221 SDValue TrueVal1 = Op.getOperand(2);
5222 SDValue FalseVal1 = Op.getOperand(3);
5223 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5224
5225 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5226 if (Op2.getOpcode() != ISD::SELECT_CC)
5227 return SDValue();
5228
5229 SDValue V2 = Op2.getOperand(0);
5230 SDValue K2 = Op2.getOperand(1);
5231 SDValue TrueVal2 = Op2.getOperand(2);
5232 SDValue FalseVal2 = Op2.getOperand(3);
5233 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5234
5235 SDValue V1Tmp = V1;
5236 SDValue V2Tmp = V2;
5237
5238 // Check that the registers and the constants match a max(min()) or min(max())
5239 // pattern
5240 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5241 K2 != FalseVal2 ||
5242 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5243 return SDValue();
5244
5245 // Check that the constant in the lower-bound check is
5246 // the opposite of the constant in the upper-bound check
5247 // in 1's complement.
5248 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5249 return SDValue();
5250
5251 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5252 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5253 int64_t PosVal = std::max(Val1, Val2);
5254 int64_t NegVal = std::min(Val1, Val2);
5255
5256 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5257 !isPowerOf2_64(PosVal + 1))
5258 return SDValue();
5259
5260 // Handle the difference between USAT (unsigned) and SSAT (signed)
5261 // saturation
5262 // At this point, PosVal is guaranteed to be positive
5263 uint64_t K = PosVal;
5264 SDLoc dl(Op);
5265 if (Val1 == ~Val2)
5266 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5267 DAG.getConstant(llvm::countr_one(K), dl, VT));
5268 if (NegVal == 0)
5269 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5270 DAG.getConstant(llvm::countr_one(K), dl, VT));
5271
5272 return SDValue();
5273}
5274
5275// Check if a condition of the type x < k ? k : x can be converted into a
5276// bit operation instead of conditional moves.
5277// Currently this is allowed given:
5278// - The conditions and values match up
5279// - k is 0 or -1 (all ones)
5280 // This function will not check the last condition; that's up to the caller.
5281 // It returns true if the transformation can be made, and in that case
5282// returns x in V, and k in SatK.
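// For example, "x < 0 ? 0 : x" (max(x, 0)) becomes "x & ~(x >> 31)", and
// "x < -1 ? -1 : x" (max(x, -1)) becomes "x | (x >> 31)", where the
// arithmetic shift builds an all-ones mask for negative x.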
5283static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5284 SDValue &SatK)
5285{
5286 SDValue LHS = Op.getOperand(0);
5287 SDValue RHS = Op.getOperand(1);
5288 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5289 SDValue TrueVal = Op.getOperand(2);
5290 SDValue FalseVal = Op.getOperand(3);
5291
5292 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5293 ? &RHS
5294 : nullptr;
5295
5296 // No constant operation in comparison, early out
5297 if (!K)
5298 return false;
5299
5300 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5301 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5302 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5303
5304 // If the constant on left and right side, or variable on left and right,
5305 // does not match, early out
5306 if (*K != KTmp || V != VTmp)
5307 return false;
5308
5309 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5310 SatK = *K;
5311 return true;
5312 }
5313
5314 return false;
5315}
5316
5317bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5318 if (VT == MVT::f32)
5319 return !Subtarget->hasVFP2Base();
5320 if (VT == MVT::f64)
5321 return !Subtarget->hasFP64();
5322 if (VT == MVT::f16)
5323 return !Subtarget->hasFullFP16();
5324 return false;
5325}
5326
5327SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5328 EVT VT = Op.getValueType();
5329 SDLoc dl(Op);
5330
5331 // Try to convert two saturating conditional selects into a single SSAT
5332 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5333 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5334 return SatValue;
5335
5336 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5337 // into more efficient bit operations, which is possible when k is 0 or -1
5338 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5339 // single instructions. On Thumb the shift and the bit operation will be two
5340 // instructions.
5341 // Only allow this transformation on full-width (32-bit) operations
5342 SDValue LowerSatConstant;
5343 SDValue SatValue;
5344 if (VT == MVT::i32 &&
5345 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5346 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5347 DAG.getConstant(31, dl, VT));
5348 if (isNullConstant(LowerSatConstant)) {
5349 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5350 DAG.getAllOnesConstant(dl, VT));
5351 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5352 } else if (isAllOnesConstant(LowerSatConstant))
5353 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5354 }
5355
5356 SDValue LHS = Op.getOperand(0);
5357 SDValue RHS = Op.getOperand(1);
5358 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5359 SDValue TrueVal = Op.getOperand(2);
5360 SDValue FalseVal = Op.getOperand(3);
5361 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5362 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5363 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5364 if (Op.getValueType().isInteger()) {
5365
5366 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5367 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5368 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5369 // Both require fewer instructions than compare and conditional select.
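// For example, with lhs = -7 the arithmetic shift gives all ones; for the
// setgt (smax) case it is inverted to 0 and the AND yields 0 = max(-7, 0),
// while for the setlt (smin) case the AND yields -7 = min(-7, 0).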
5370 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5371 RHSC->isZero() && CFVal && CFVal->isZero() &&
5372 LHS.getValueType() == RHS.getValueType()) {
5373 EVT VT = LHS.getValueType();
5374 SDValue Shift =
5375 DAG.getNode(ISD::SRA, dl, VT, LHS,
5376 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5377
5378 if (CC == ISD::SETGT)
5379 Shift = DAG.getNOT(dl, Shift, VT);
5380
5381 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5382 }
5383 }
5384
5385 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5386 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5387 unsigned TVal = CTVal->getZExtValue();
5388 unsigned FVal = CFVal->getZExtValue();
5389 unsigned Opcode = 0;
5390
5391 if (TVal == ~FVal) {
5392 Opcode = ARMISD::CSINV;
5393 } else if (TVal == ~FVal + 1) {
5394 Opcode = ARMISD::CSNEG;
5395 } else if (TVal + 1 == FVal) {
5396 Opcode = ARMISD::CSINC;
5397 } else if (TVal == FVal + 1) {
5398 Opcode = ARMISD::CSINC;
5399 std::swap(TrueVal, FalseVal);
5400 std::swap(TVal, FVal);
5401 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5402 }
5403
5404 if (Opcode) {
5405 // If one of the constants is cheaper than another, materialise the
5406 // cheaper one and let the csel generate the other.
5407 if (Opcode != ARMISD::CSINC &&
5408 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5409 std::swap(TrueVal, FalseVal);
5410 std::swap(TVal, FVal);
5411 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5412 }
5413
5414 // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
5415 // condition to get there. CSINC is not invertible like the other two
5416 // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
5417 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5418 std::swap(TrueVal, FalseVal);
5419 std::swap(TVal, FVal);
5420 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5421 }
5422
5423 // Drop FVal; we can recover it by inverting/negating TVal.
5424 FalseVal = TrueVal;
5425
5426 SDValue ARMcc;
5427 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5428 EVT VT = TrueVal.getValueType();
5429 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5430 }
5431 }
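// Illustrative example for the CSEL-family lowering above (a sketch, not from
// the source): (select_cc setlt, x, y, 4, 5) has TVal + 1 == FVal, so it maps
// to ARMISD::CSINC with both operands materialised as 4: the result is 4 when
// the condition holds and 4 + 1 == 5 otherwise, avoiding a second constant
// and a conditional move.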
5432
5433 if (isUnsupportedFloatingType(LHS.getValueType())) {
5434 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5435
5436 // If softenSetCCOperands only returned one value, we should compare it to
5437 // zero.
5438 if (!RHS.getNode()) {
5439 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5440 CC = ISD::SETNE;
5441 }
5442 }
5443
5444 if (LHS.getValueType() == MVT::i32) {
5445 // Try to generate VSEL on ARMv8.
5446 // The VSEL instruction can't use all the usual ARM condition
5447 // codes: it only has two bits to select the condition code, so it's
5448 // constrained to use only GE, GT, VS and EQ.
5449 //
5450 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5451 // swap the operands of the previous compare instruction (effectively
5452 // inverting the compare condition, swapping 'less' and 'greater') and
5453 // sometimes need to swap the operands to the VSEL (which inverts the
5454 // condition in the sense of firing whenever the previous condition didn't)
5455 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5456 TrueVal.getValueType() == MVT::f32 ||
5457 TrueVal.getValueType() == MVT::f64)) {
5458 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5459 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5460 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5461 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5462 std::swap(TrueVal, FalseVal);
5463 }
5464 }
5465
5466 SDValue ARMcc;
5467 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5468 // Choose GE over PL, which vsel does not support
5469 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5470 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5471 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5472 }
5473
5474 ARMCC::CondCodes CondCode, CondCode2;
5475 FPCCToARMCC(CC, CondCode, CondCode2);
5476
5477 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5478 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5479 // must use VSEL (limited condition codes), due to not having conditional f16
5480 // moves.
5481 if (Subtarget->hasFPARMv8Base() &&
5482 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5483 (TrueVal.getValueType() == MVT::f16 ||
5484 TrueVal.getValueType() == MVT::f32 ||
5485 TrueVal.getValueType() == MVT::f64)) {
5486 bool swpCmpOps = false;
5487 bool swpVselOps = false;
5488 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5489
5490 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5491 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5492 if (swpCmpOps)
5493 std::swap(LHS, RHS);
5494 if (swpVselOps)
5495 std::swap(TrueVal, FalseVal);
5496 }
5497 }
5498
5499 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5500 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5501 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5502 if (CondCode2 != ARMCC::AL) {
5503 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5504 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5505 }
5506 return Result;
5507}
5508
5509/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5510/// to morph to an integer compare sequence.
5511static bool canChangeToInt(SDValue Op, bool &SeenZero,
5512 const ARMSubtarget *Subtarget) {
5513 SDNode *N = Op.getNode();
5514 if (!N->hasOneUse())
5515 // Otherwise it requires moving the value from fp to integer registers.
5516 return false;
5517 if (!N->getNumValues())
5518 return false;
5519 EVT VT = Op.getValueType();
5520 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5521 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5522 // vmrs are very slow, e.g. cortex-a8.
5523 return false;
5524
5525 if (isFloatingPointZero(Op)) {
5526 SeenZero = true;
5527 return true;
5528 }
5529 return ISD::isNormalLoad(N);
5530}
5531
5532static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5533 if (isFloatingPointZero(Op))
5534 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5535
5536 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5537 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5538 Ld->getPointerInfo(), Ld->getAlign(),
5539 Ld->getMemOperand()->getFlags());
5540
5541 llvm_unreachable("Unknown VFP cmp argument!");
5542}
5543
5544static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5545 SDValue &RetVal1, SDValue &RetVal2) {
5546 SDLoc dl(Op);
5547
5548 if (isFloatingPointZero(Op)) {
5549 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5550 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5551 return;
5552 }
5553
5554 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5555 SDValue Ptr = Ld->getBasePtr();
5556 RetVal1 =
5557 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5558 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5559
5560 EVT PtrType = Ptr.getValueType();
5561 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5562 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5563 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5564 Ld->getPointerInfo().getWithOffset(4),
5565 commonAlignment(Ld->getAlign(), 4),
5566 Ld->getMemOperand()->getFlags());
5567 return;
5568 }
5569
5570 llvm_unreachable("Unknown VFP cmp argument!");
5571}
5572
5573/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5574/// f32 and even f64 comparisons to integer ones.
5575SDValue
5576ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5577 SDValue Chain = Op.getOperand(0);
5578 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5579 SDValue LHS = Op.getOperand(2);
5580 SDValue RHS = Op.getOperand(3);
5581 SDValue Dest = Op.getOperand(4);
5582 SDLoc dl(Op);
5583
5584 bool LHSSeenZero = false;
5585 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5586 bool RHSSeenZero = false;
5587 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5588 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5589 // If unsafe fp math optimization is enabled and there are no other uses of
5590 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5591 // to an integer comparison.
5592 if (CC == ISD::SETOEQ)
5593 CC = ISD::SETEQ;
5594 else if (CC == ISD::SETUNE)
5595 CC = ISD::SETNE;
5596
5597 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5598 SDValue ARMcc;
5599 if (LHS.getValueType() == MVT::f32) {
5600 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5601 bitcastf32Toi32(LHS, DAG), Mask);
5602 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5603 bitcastf32Toi32(RHS, DAG), Mask);
5604 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5605 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5606 Cmp);
5607 }
5608
5609 SDValue LHS1, LHS2;
5610 SDValue RHS1, RHS2;
5611 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5612 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5613 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5614 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5615 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5616 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5617 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5618 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5619 }
5620
5621 return SDValue();
5622}
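// Example of the transform above (a sketch under the stated unsafe-fp-math
// assumption): for "br (setoeq f32 %x, 0.0)", both operands are bitcast to
// i32 and masked with 0x7fffffff so that +0.0 and -0.0 compare equal, and the
// branch is then emitted on an integer CMP instead of a VCMP + VMRS sequence.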
5623
5624SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5625 SDValue Chain = Op.getOperand(0);
5626 SDValue Cond = Op.getOperand(1);
5627 SDValue Dest = Op.getOperand(2);
5628 SDLoc dl(Op);
5629
5630 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5631 // instruction.
5632 unsigned Opc = Cond.getOpcode();
5633 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5634 !Subtarget->isThumb1Only();
5635 if (Cond.getResNo() == 1 &&
5636 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5637 Opc == ISD::USUBO || OptimizeMul)) {
5638 // Only lower legal XALUO ops.
5639 if (!isTypeLegal(Cond->getValueType(0)))
5640 return SDValue();
5641
5642 // The actual operation with overflow check.
5643 SDValue Value, OverflowCmp;
5644 SDValue ARMcc;
5645 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5646
5647 // Reverse the condition code.
5648 ARMCC::CondCodes CondCode =
5649 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5650 CondCode = ARMCC::getOppositeCondition(CondCode);
5651 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5652
5653 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5654 OverflowCmp);
5655 }
5656
5657 return SDValue();
5658}
5659
5660SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5661 SDValue Chain = Op.getOperand(0);
5662 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5663 SDValue LHS = Op.getOperand(2);
5664 SDValue RHS = Op.getOperand(3);
5665 SDValue Dest = Op.getOperand(4);
5666 SDLoc dl(Op);
5667
5668 if (isUnsupportedFloatingType(LHS.getValueType())) {
5669 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5670
5671 // If softenSetCCOperands only returned one value, we should compare it to
5672 // zero.
5673 if (!RHS.getNode()) {
5674 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5675 CC = ISD::SETNE;
5676 }
5677 }
5678
5679 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5680 // instruction.
5681 unsigned Opc = LHS.getOpcode();
5682 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5683 !Subtarget->isThumb1Only();
5684 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5685 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5686 Opc == ISD::USUBO || OptimizeMul) &&
5687 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5688 // Only lower legal XALUO ops.
5689 if (!isTypeLegal(LHS->getValueType(0)))
5690 return SDValue();
5691
5692 // The actual operation with overflow check.
5693 SDValue Value, OverflowCmp;
5694 SDValue ARMcc;
5695 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5696
5697 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5698 // Reverse the condition code.
5699 ARMCC::CondCodes CondCode =
5700 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5701 CondCode = ARMCC::getOppositeCondition(CondCode);
5702 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5703 }
5704
5705 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5706 OverflowCmp);
5707 }
5708
5709 if (LHS.getValueType() == MVT::i32) {
5710 SDValue ARMcc;
5711 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5712 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5713 }
5714
5715 if (getTargetMachine().Options.UnsafeFPMath &&
5716 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5717 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5718 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5719 return Result;
5720 }
5721
5722 ARMCC::CondCodes CondCode, CondCode2;
5723 FPCCToARMCC(CC, CondCode, CondCode2);
5724
5725 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5726 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5727 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5728 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5729 if (CondCode2 != ARMCC::AL) {
5730 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5731 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5732 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5733 }
5734 return Res;
5735}
5736
5737SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5738 SDValue Chain = Op.getOperand(0);
5739 SDValue Table = Op.getOperand(1);
5740 SDValue Index = Op.getOperand(2);
5741 SDLoc dl(Op);
5742
5743 EVT PTy = getPointerTy(DAG.getDataLayout());
5744 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5745 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5746 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5747 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5748 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5749 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5750 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5751 // which does another jump to the destination. This also makes it easier
5752 // to translate it to TBB / TBH later (Thumb2 only).
5753 // FIXME: This might not work if the function is extremely large.
5754 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5755 Addr, Op.getOperand(2), JTI);
5756 }
5757 if (isPositionIndependent() || Subtarget->isROPI()) {
5758 Addr =
5759 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5760 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5761 Chain = Addr.getValue(1);
5762 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5763 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5764 } else {
5765 Addr =
5766 DAG.getLoad(PTy, dl, Chain, Addr,
5767 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5768 Chain = Addr.getValue(1);
5769 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5770 }
5771}
5772
5773static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5774 EVT VT = Op.getValueType();
5775 SDLoc dl(Op);
5776
5777 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5778 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5779 return Op;
5780 return DAG.UnrollVectorOp(Op.getNode());
5781 }
5782
5783 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5784
5785 EVT NewTy;
5786 const EVT OpTy = Op.getOperand(0).getValueType();
5787 if (OpTy == MVT::v4f32)
5788 NewTy = MVT::v4i32;
5789 else if (OpTy == MVT::v4f16 && HasFullFP16)
5790 NewTy = MVT::v4i16;
5791 else if (OpTy == MVT::v8f16 && HasFullFP16)
5792 NewTy = MVT::v8i16;
5793 else
5794 llvm_unreachable("Invalid type for custom lowering!");
5795
5796 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5797 return DAG.UnrollVectorOp(Op.getNode());
5798
5799 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5800 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5801}
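// Example of the custom vector lowering above (illustration only): an
// fp_to_sint from v4f32 to v4i16 is first performed in the wider v4i32 type
// and the result is then truncated to v4i16; v4f16/v8f16 sources take the
// analogous path only when the subtarget has full fp16 support.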
5802
5803SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5804 EVT VT = Op.getValueType();
5805 if (VT.isVector())
5806 return LowerVectorFP_TO_INT(Op, DAG);
5807
5808 bool IsStrict = Op->isStrictFPOpcode();
5809 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5810
5811 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5812 RTLIB::Libcall LC;
5813 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5814 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5815 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5816 Op.getValueType());
5817 else
5818 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5819 Op.getValueType());
5820 SDLoc Loc(Op);
5821 MakeLibCallOptions CallOptions;
5822 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5823 SDValue Result;
5824 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5825 CallOptions, Loc, Chain);
5826 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5827 }
5828
5829 // FIXME: Remove this when we have strict fp instruction selection patterns
5830 if (IsStrict) {
5831 SDLoc Loc(Op);
5832 SDValue Result =
5833 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5834 : ISD::FP_TO_UINT,
5835 Loc, Op.getValueType(), SrcVal);
5836 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5837 }
5838
5839 return Op;
5840}
5841
5842static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5843 const ARMSubtarget *Subtarget) {
5844 EVT VT = Op.getValueType();
5845 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5846 EVT FromVT = Op.getOperand(0).getValueType();
5847
5848 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5849 return Op;
5850 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5851 Subtarget->hasFP64())
5852 return Op;
5853 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5854 Subtarget->hasFullFP16())
5855 return Op;
5856 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5857 Subtarget->hasMVEFloatOps())
5858 return Op;
5859 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5860 Subtarget->hasMVEFloatOps())
5861 return Op;
5862
5863 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5864 return SDValue();
5865
5866 SDLoc DL(Op);
5867 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5868 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5869 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5870 DAG.getValueType(VT.getScalarType()));
5871 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5872 DAG.getConstant((1 << BW) - 1, DL, VT));
5873 if (IsSigned)
5874 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5875 DAG.getSignedConstant(-(1 << BW), DL, VT));
5876 return Max;
5877}
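// Worked bounds for the saturation above (illustration only): for an unsigned
// saturate to i8, BW is 8 and the UMIN clamp is (1 << 8) - 1 = 255; for a
// signed saturate to i8, BW is 7, so the value is clamped to
// [-(1 << 7), (1 << 7) - 1] = [-128, 127] by the SMIN/SMAX pair.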
5878
5879static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5880 EVT VT = Op.getValueType();
5881 SDLoc dl(Op);
5882
5883 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5884 if (VT.getVectorElementType() == MVT::f32)
5885 return Op;
5886 return DAG.UnrollVectorOp(Op.getNode());
5887 }
5888
5889 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5890 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5891 "Invalid type for custom lowering!");
5892
5893 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5894
5895 EVT DestVecType;
5896 if (VT == MVT::v4f32)
5897 DestVecType = MVT::v4i32;
5898 else if (VT == MVT::v4f16 && HasFullFP16)
5899 DestVecType = MVT::v4i16;
5900 else if (VT == MVT::v8f16 && HasFullFP16)
5901 DestVecType = MVT::v8i16;
5902 else
5903 return DAG.UnrollVectorOp(Op.getNode());
5904
5905 unsigned CastOpc;
5906 unsigned Opc;
5907 switch (Op.getOpcode()) {
5908 default: llvm_unreachable("Invalid opcode!");
5909 case ISD::SINT_TO_FP:
5910 CastOpc = ISD::SIGN_EXTEND;
5911 Opc = ISD::SINT_TO_FP;
5912 break;
5913 case ISD::UINT_TO_FP:
5914 CastOpc = ISD::ZERO_EXTEND;
5915 Opc = ISD::UINT_TO_FP;
5916 break;
5917 }
5918
5919 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5920 return DAG.getNode(Opc, dl, VT, Op);
5921}
5922
5923SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5924 EVT VT = Op.getValueType();
5925 if (VT.isVector())
5926 return LowerVectorINT_TO_FP(Op, DAG);
5927 if (isUnsupportedFloatingType(VT)) {
5928 RTLIB::Libcall LC;
5929 if (Op.getOpcode() == ISD::SINT_TO_FP)
5930 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5931 Op.getValueType());
5932 else
5933 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5934 Op.getValueType());
5935 MakeLibCallOptions CallOptions;
5936 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5937 CallOptions, SDLoc(Op)).first;
5938 }
5939
5940 return Op;
5941}
5942
5943SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5944 // Implement fcopysign with a fabs and a conditional fneg.
5945 SDValue Tmp0 = Op.getOperand(0);
5946 SDValue Tmp1 = Op.getOperand(1);
5947 SDLoc dl(Op);
5948 EVT VT = Op.getValueType();
5949 EVT SrcVT = Tmp1.getValueType();
5950 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5951 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5952 bool UseNEON = !InGPR && Subtarget->hasNEON();
5953
5954 if (UseNEON) {
5955 // Use VBSL to copy the sign bit.
5956 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5957 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5958 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5959 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5960 if (VT == MVT::f64)
5961 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5962 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5963 DAG.getConstant(32, dl, MVT::i32));
5964 else /*if (VT == MVT::f32)*/
5965 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5966 if (SrcVT == MVT::f32) {
5967 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5968 if (VT == MVT::f64)
5969 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5970 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5971 DAG.getConstant(32, dl, MVT::i32));
5972 } else if (VT == MVT::f32)
5973 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5974 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5975 DAG.getConstant(32, dl, MVT::i32));
5976 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5977 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5978
5979 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5980 dl, MVT::i32);
5981 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5982 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5983 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5984
5985 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5986 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5987 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5988 if (VT == MVT::f32) {
5989 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5990 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5991 DAG.getConstant(0, dl, MVT::i32));
5992 } else {
5993 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5994 }
5995
5996 return Res;
5997 }
5998
5999 // Bitcast operand 1 to i32.
6000 if (SrcVT == MVT::f64)
6001 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6002 Tmp1).getValue(1);
6003 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6004
6005 // Or in the signbit with integer operations.
6006 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6007 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6008 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6009 if (VT == MVT::f32) {
6010 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6011 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6012 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6013 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6014 }
6015
6016 // f64: Or the high part with signbit and then combine two parts.
6017 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6018 Tmp0);
6019 SDValue Lo = Tmp0.getValue(0);
6020 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6021 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6022 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6023}
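// A minimal scalar sketch of the integer path above. The helper below is
// hypothetical (illustration only, not part of the lowering); it mirrors the
// mask/OR sequence built for the f32-in-GPR case.
[[maybe_unused]] static inline uint32_t sketchCopySignF32Bits(uint32_t MagBits,
                                                              uint32_t SignBits) {
  // Keep the magnitude (exponent and mantissa) bits of the first value and
  // take only the sign bit of the second, as the ISD::AND/ISD::OR nodes do.
  return (MagBits & 0x7fffffffu) | (SignBits & 0x80000000u);
}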
6024
6025SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6026 MachineFunction &MF = DAG.getMachineFunction();
6027 MachineFrameInfo &MFI = MF.getFrameInfo();
6028 MFI.setReturnAddressIsTaken(true);
6029
6030 EVT VT = Op.getValueType();
6031 SDLoc dl(Op);
6032 unsigned Depth = Op.getConstantOperandVal(0);
6033 if (Depth) {
6034 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6035 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6036 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6037 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6038 MachinePointerInfo());
6039 }
6040
6041 // Return LR, which contains the return address. Mark it an implicit live-in.
6042 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6043 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6044}
6045
6046SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6047 const ARMBaseRegisterInfo &ARI =
6048 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6049 MachineFunction &MF = DAG.getMachineFunction();
6050 MachineFrameInfo &MFI = MF.getFrameInfo();
6051 MFI.setFrameAddressIsTaken(true);
6052
6053 EVT VT = Op.getValueType();
6054 SDLoc dl(Op); // FIXME probably not meaningful
6055 unsigned Depth = Op.getConstantOperandVal(0);
6056 Register FrameReg = ARI.getFrameRegister(MF);
6057 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6058 while (Depth--)
6059 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6060 MachinePointerInfo());
6061 return FrameAddr;
6062}
6063
6064// FIXME? Maybe this could be a TableGen attribute on some registers and
6065// this table could be generated automatically from RegInfo.
6066Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6067 const MachineFunction &MF) const {
6068 return StringSwitch<Register>(RegName)
6069 .Case("sp", ARM::SP)
6070 .Default(Register());
6071}
6072
6073// The result is a 64-bit value, so split it into two 32-bit values and return
6074// them as a pair of values.
6075static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6076 SelectionDAG &DAG) {
6077 SDLoc DL(N);
6078
6079 // This function is only supposed to be called for i64 type destination.
6080 assert(N->getValueType(0) == MVT::i64
6081 && "ExpandREAD_REGISTER called for non-i64 type result.");
6082
6083 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6084 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6085 N->getOperand(0),
6086 N->getOperand(1));
6087
6088 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6089 Read.getValue(1)));
6090 Results.push_back(Read.getValue(2)); // Chain
6091}
6092
6093/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6094/// When \p DstVT, the destination type of \p BC, is on the vector
6095/// register bank and the source of bitcast, \p Op, operates on the same bank,
6096/// it might be possible to combine them, such that everything stays on the
6097/// vector register bank.
6098/// \return The node that would replace \p BC, if the combine
6099/// is possible.
6100static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6101 SelectionDAG &DAG) {
6102 SDValue Op = BC->getOperand(0);
6103 EVT DstVT = BC->getValueType(0);
6104
6105 // The only vector instruction that can produce a scalar (remember,
6106 // since the bitcast was about to be turned into VMOVDRR, the source
6107 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6108 // Moreover, we can do this combine only if there is one use.
6109 // Finally, if the destination type is not a vector, there is not
6111 // much point in forcing everything onto the vector bank.
6111 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6112 !Op.hasOneUse())
6113 return SDValue();
6114
6115 // If the index is not constant, we will introduce an additional
6116 // multiply that will stick.
6117 // Give up in that case.
6118 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6119 if (!Index)
6120 return SDValue();
6121 unsigned DstNumElt = DstVT.getVectorNumElements();
6122
6123 // Compute the new index.
6124 const APInt &APIntIndex = Index->getAPIntValue();
6125 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6126 NewIndex *= APIntIndex;
6127 // Check if the new constant index fits into i32.
6128 if (NewIndex.getBitWidth() > 32)
6129 return SDValue();
6130
6131 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6132 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6133 SDLoc dl(Op);
6134 SDValue ExtractSrc = Op.getOperand(0);
6135 EVT VecVT = EVT::getVectorVT(
6136 *DAG.getContext(), DstVT.getScalarType(),
6137 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6138 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6139 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6140 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6141}
6142
6143/// ExpandBITCAST - If the target supports VFP, this function is called to
6144/// expand a bit convert where either the source or destination type is i64 to
6145/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6146/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6147/// vectors), since the legalizer won't know what to do with that.
6148SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6149 const ARMSubtarget *Subtarget) const {
6150 SDLoc dl(N);
6151 SDValue Op = N->getOperand(0);
6152
6153 // This function is only supposed to be called for i16 and i64 types, either
6154 // as the source or destination of the bit convert.
6155 EVT SrcVT = Op.getValueType();
6156 EVT DstVT = N->getValueType(0);
6157
6158 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6159 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6160 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6161 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6162
6163 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6164 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6165 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6166 Op = DAG.getBitcast(MVT::f16, Op);
6167 return DAG.getNode(
6168 ISD::TRUNCATE, SDLoc(N), DstVT,
6169 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6170 }
6171
6172 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6173 return SDValue();
6174
6175 // Turn i64->f64 into VMOVDRR.
6176 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6177 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6178 // if we can combine the bitcast with its source.
6179 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6180 return Val;
6181 SDValue Lo, Hi;
6182 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6183 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6184 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6185 }
6186
6187 // Turn f64->i64 into VMOVRRD.
6188 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6189 SDValue Cvt;
6190 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6191 SrcVT.getVectorNumElements() > 1)
6192 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6193 DAG.getVTList(MVT::i32, MVT::i32),
6194 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6195 else
6196 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6197 DAG.getVTList(MVT::i32, MVT::i32), Op);
6198 // Merge the pieces into a single i64 value.
6199 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6200 }
6201
6202 return SDValue();
6203}
6204
6205/// getZeroVector - Returns a vector of specified type with all zero elements.
6206/// Zero vectors are used to represent vector negation and in those cases
6207/// will be implemented with the NEON VNEG instruction. However, VNEG does
6208/// not support i64 elements, so sometimes the zero vectors will need to be
6209/// explicitly constructed. Regardless, use a canonical VMOV to create the
6210/// zero vector.
6211static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6212 assert(VT.isVector() && "Expected a vector type");
6213 // The canonical modified immediate encoding of a zero vector is....0!
6214 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6215 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6216 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6217 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6218}
6219
6220/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6221/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6222SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6223 SelectionDAG &DAG) const {
6224 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6225 EVT VT = Op.getValueType();
6226 unsigned VTBits = VT.getSizeInBits();
6227 SDLoc dl(Op);
6228 SDValue ShOpLo = Op.getOperand(0);
6229 SDValue ShOpHi = Op.getOperand(1);
6230 SDValue ShAmt = Op.getOperand(2);
6231 SDValue ARMcc;
6232 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6233
6234 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6235
6236 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6237 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6238 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6239 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6240 DAG.getConstant(VTBits, dl, MVT::i32));
6241 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6242 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6243 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6244 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6245 ISD::SETGE, ARMcc, DAG, dl);
6246 SDValue Lo =
6247 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6248
6249 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6250 SDValue HiBigShift = Opc == ISD::SRA
6251 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6252 DAG.getConstant(VTBits - 1, dl, VT))
6253 : DAG.getConstant(0, dl, VT);
6254 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6255 ISD::SETGE, ARMcc, DAG, dl);
6256 SDValue Hi =
6257 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6258
6259 SDValue Ops[2] = { Lo, Hi };
6260 return DAG.getMergeValues(Ops, dl);
6261}
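// Worked example for the double-word right shift above (illustration only),
// with VTBits == 32: when amt < 32 the low word is
// (lo >> amt) | (hi << (32 - amt)) and the high word is hi shifted by amt
// (arithmetic for SRA_PARTS, logical for SRL_PARTS); when amt >= 32 the CMOVs
// keyed on (amt - 32) >= 0 instead select hi shifted by (amt - 32) for the low
// word and either hi >> 31 (SRA) or 0 (SRL) for the high word.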
6262
6263/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6264/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6265SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6266 SelectionDAG &DAG) const {
6267 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6268 EVT VT = Op.getValueType();
6269 unsigned VTBits = VT.getSizeInBits();
6270 SDLoc dl(Op);
6271 SDValue ShOpLo = Op.getOperand(0);
6272 SDValue ShOpHi = Op.getOperand(1);
6273 SDValue ShAmt = Op.getOperand(2);
6274 SDValue ARMcc;
6275
6276 assert(Op.getOpcode() == ISD::SHL_PARTS);
6277 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6278 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6279 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6280 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6281 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6282
6283 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6284 DAG.getConstant(VTBits, dl, MVT::i32));
6285 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6286 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6287 ISD::SETGE, ARMcc, DAG, dl);
6288 SDValue Hi =
6289 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6290
6291 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6292 ISD::SETGE, ARMcc, DAG, dl);
6293 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6294 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6295 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6296
6297 SDValue Ops[2] = { Lo, Hi };
6298 return DAG.getMergeValues(Ops, dl);
6299}
6300
6301SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6302 SelectionDAG &DAG) const {
6303 // The rounding mode is in bits 23:22 of the FPSCR.
6304 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
6305 // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
6306 // so that the shift + and get folded into a bitfield extract.
6307 SDLoc dl(Op);
6308 SDValue Chain = Op.getOperand(0);
6309 SDValue Ops[] = {Chain,
6310 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6311
6312 SDValue FPSCR =
6313 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6314 Chain = FPSCR.getValue(1);
6315 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6316 DAG.getConstant(1U << 22, dl, MVT::i32));
6317 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6318 DAG.getConstant(22, dl, MVT::i32));
6319 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6320 DAG.getConstant(3, dl, MVT::i32));
6321 return DAG.getMergeValues({And, Chain}, dl);
6322}
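// Worked example for the formula above (illustration only): if FPSCR[23:22]
// is 3 (round toward zero), adding 1 << 22 increments that field modulo 4 to
// 0, and the shift/AND extracts 0, which is the FLT_ROUNDS encoding for
// round-toward-zero; likewise a field value of 0 (round to nearest) yields 1.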
6323
6324SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6325 SelectionDAG &DAG) const {
6326 SDLoc DL(Op);
6327 SDValue Chain = Op->getOperand(0);
6328 SDValue RMValue = Op->getOperand(1);
6329
6330 // The rounding mode is in bits 23:22 of the FPSCR.
6331 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6332 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6333 // (((arg - 1) & 3) << 22).
6334 //
6335 // It is expected that the argument of llvm.set.rounding is within the
6336 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6337 // responsibility of the code that generated llvm.set.rounding to ensure this
6338 // condition.
6339
6340 // Calculate new value of FPSCR[23:22].
6341 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6342 DAG.getConstant(1, DL, MVT::i32));
6343 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6344 DAG.getConstant(0x3, DL, MVT::i32));
6345 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6346 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6347
6348 // Get current value of FPSCR.
6349 SDValue Ops[] = {Chain,
6350 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6351 SDValue FPSCR =
6352 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6353 Chain = FPSCR.getValue(1);
6354 FPSCR = FPSCR.getValue(0);
6355
6356 // Put new rounding mode into FPSCR[23:22].
6357 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6358 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6359 DAG.getConstant(RMMask, DL, MVT::i32));
6360 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6361 SDValue Ops2[] = {
6362 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6363 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6364}
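// Worked example for the mapping above (illustration only): an argument of 1
// (round to nearest) gives ((1 - 1) & 3) << 22 == 0, i.e. FPSCR rounding field
// 0b00, while an argument of 0 (round toward zero) gives (0 - 1) & 3 == 3,
// i.e. field 0b11, the inverse of the GET_ROUNDING mapping above.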
6365
6366SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6367 SelectionDAG &DAG) const {
6368 SDLoc DL(Op);
6369 SDValue Chain = Op->getOperand(0);
6370 SDValue Mode = Op->getOperand(1);
6371
6372 // Generate nodes to build:
6373 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6374 SDValue Ops[] = {Chain,
6375 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6376 SDValue FPSCR =
6377 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6378 Chain = FPSCR.getValue(1);
6379 FPSCR = FPSCR.getValue(0);
6380
6381 SDValue FPSCRMasked =
6382 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6383 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6384 SDValue InputMasked =
6385 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6386 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6387 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6388
6389 SDValue Ops2[] = {
6390 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6391 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6392}
6393
6394SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6395 SelectionDAG &DAG) const {
6396 SDLoc DL(Op);
6397 SDValue Chain = Op->getOperand(0);
6398
6399 // To get the default FP mode all control bits are cleared:
6400 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6401 SDValue Ops[] = {Chain,
6402 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6403 SDValue FPSCR =
6404 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6405 Chain = FPSCR.getValue(1);
6406 FPSCR = FPSCR.getValue(0);
6407
6408 SDValue FPSCRMasked = DAG.getNode(
6409 ISD::AND, DL, MVT::i32, FPSCR,
6410 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6411 SDValue Ops2[] = {Chain,
6412 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6413 FPSCRMasked};
6414 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6415}
6416
6417static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6418 const ARMSubtarget *ST) {
6419 SDLoc dl(N);
6420 EVT VT = N->getValueType(0);
6421 if (VT.isVector() && ST->hasNEON()) {
6422
6423 // Compute the least significant set bit: LSB = X & -X
6424 SDValue X = N->getOperand(0);
6425 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6426 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6427
6428 EVT ElemTy = VT.getVectorElementType();
6429
6430 if (ElemTy == MVT::i8) {
6431 // Compute with: cttz(x) = ctpop(lsb - 1)
6432 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6433 DAG.getTargetConstant(1, dl, ElemTy));
6434 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6435 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6436 }
6437
6438 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6439 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6440 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6441 unsigned NumBits = ElemTy.getSizeInBits();
6442 SDValue WidthMinus1 =
6443 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6444 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6445 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6446 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6447 }
6448
6449 // Compute with: cttz(x) = ctpop(lsb - 1)
6450
6451 // Compute LSB - 1.
6452 SDValue Bits;
6453 if (ElemTy == MVT::i64) {
6454 // Load constant 0xffff'ffff'ffff'ffff to register.
6455 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6456 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6457 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6458 } else {
6459 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6460 DAG.getTargetConstant(1, dl, ElemTy));
6461 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6462 }
6463 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6464 }
6465
6466 if (!ST->hasV6T2Ops())
6467 return SDValue();
6468
6469 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6470 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6471}
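// A minimal scalar sketch of the ctpop-based formula above. The helper below
// is hypothetical (illustration only, not part of the lowering); it returns
// 32 for X == 0.
[[maybe_unused]] static inline unsigned sketchCTTZViaCTPOP(uint32_t X) {
  // X & -X isolates the lowest set bit; subtracting 1 turns it into a mask of
  // exactly the trailing-zero positions, whose population count is cttz(X).
  uint32_t TrailingMask = (X & (0u - X)) - 1u;
  unsigned Count = 0;
  for (uint32_t M = TrailingMask; M; M &= M - 1)
    ++Count;
  return Count;
}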
6472
6473static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6474 const ARMSubtarget *ST) {
6475 EVT VT = N->getValueType(0);
6476 SDLoc DL(N);
6477
6478 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6479 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6480 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6481 "Unexpected type for custom ctpop lowering");
6482
6483 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6484 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6485 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6486 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6487
6488 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6489 unsigned EltSize = 8;
6490 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6491 while (EltSize != VT.getScalarSizeInBits()) {
6492 SmallVector<SDValue, 8> Ops;
6493 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6494 TLI.getPointerTy(DAG.getDataLayout())));
6495 Ops.push_back(Res);
6496
6497 EltSize *= 2;
6498 NumElts /= 2;
6499 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6500 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6501 }
6502
6503 return Res;
6504}
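// Example of the widening loop above (illustration only): a v4i32 CTPOP
// starts as an 8-bit CTPOP on v16i8 (VCNT on NEON); one arm.neon.vpaddlu
// pairwise add then produces v8i16 partial sums and a second produces the
// final v4i32 counts. Each iteration doubles EltSize and halves NumElts until
// the element size matches the original type.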
6505
6506/// getVShiftImm - Check if this is a valid build_vector for the immediate
6507/// operand of a vector shift operation, where all the elements of the
6508/// build_vector must have the same constant integer value.
6509static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6510 // Ignore bit_converts.
6511 while (Op.getOpcode() == ISD::BITCAST)
6512 Op = Op.getOperand(0);
6513 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6514 APInt SplatBits, SplatUndef;
6515 unsigned SplatBitSize;
6516 bool HasAnyUndefs;
6517 if (!BVN ||
6518 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6519 ElementBits) ||
6520 SplatBitSize > ElementBits)
6521 return false;
6522 Cnt = SplatBits.getSExtValue();
6523 return true;
6524}
6525
6526/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6527/// operand of a vector shift left operation. That value must be in the range:
6528/// 0 <= Value < ElementBits for a left shift; or
6529/// 0 <= Value <= ElementBits for a long left shift.
6530static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6531 assert(VT.isVector() && "vector shift count is not a vector type");
6532 int64_t ElementBits = VT.getScalarSizeInBits();
6533 if (!getVShiftImm(Op, ElementBits, Cnt))
6534 return false;
6535 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6536}
6537
6538/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6539/// operand of a vector shift right operation. For a shift opcode, the value
6540/// is positive, but for an intrinsic the value must be negative. The
6541/// absolute value must be in the range:
6542/// 1 <= |Value| <= ElementBits for a right shift; or
6543/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6544static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6545 int64_t &Cnt) {
6546 assert(VT.isVector() && "vector shift count is not a vector type");
6547 int64_t ElementBits = VT.getScalarSizeInBits();
6548 if (!getVShiftImm(Op, ElementBits, Cnt))
6549 return false;
6550 if (!isIntrinsic)
6551 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6552 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6553 Cnt = -Cnt;
6554 return true;
6555 }
6556 return false;
6557}
6558
6559static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6560 const ARMSubtarget *ST) {
6561 EVT VT = N->getValueType(0);
6562 SDLoc dl(N);
6563 int64_t Cnt;
6564
6565 if (!VT.isVector())
6566 return SDValue();
6567
6568 // We essentially have two forms here: shift by an immediate and shift by a
6569 // vector register (there are also shifts by a GPR, but those are just handled
6570 // with a tablegen pattern). We cannot easily match shift by an immediate in
6571 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6572 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6573 // signed or unsigned, and a negative shift indicates a shift right).
6574 if (N->getOpcode() == ISD::SHL) {
6575 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6576 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6577 DAG.getConstant(Cnt, dl, MVT::i32));
6578 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6579 N->getOperand(1));
6580 }
6581
6582 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6583 "unexpected vector shift opcode");
6584
6585 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6586 unsigned VShiftOpc =
6587 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6588 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6589 DAG.getConstant(Cnt, dl, MVT::i32));
6590 }
6591
6592 // Other right shifts we don't have operations for (we use a shift left by a
6593 // negative number).
6594 EVT ShiftVT = N->getOperand(1).getValueType();
6595 SDValue NegatedCount = DAG.getNode(
6596 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6597 unsigned VShiftOpc =
6598 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6599 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6600}
6601
6602static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6603 const ARMSubtarget *ST) {
6604 EVT VT = N->getValueType(0);
6605 SDLoc dl(N);
6606
6607 // We can get here for a node like i32 = ISD::SHL i32, i64
6608 if (VT != MVT::i64)
6609 return SDValue();
6610
6611 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6612 N->getOpcode() == ISD::SHL) &&
6613 "Unknown shift to lower!");
6614
6615 unsigned ShOpc = N->getOpcode();
6616 if (ST->hasMVEIntegerOps()) {
6617 SDValue ShAmt = N->getOperand(1);
6618 unsigned ShPartsOpc = ARMISD::LSLL;
6619 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6620
6621 // If the shift amount is zero, at least 32, or wider than 64 bits, then do
6622 // the default optimisation
6623 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6624 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6625 return SDValue();
6626
6627 // Extract the lower 32 bits of the shift amount if it's not an i32
6628 if (ShAmt->getValueType(0) != MVT::i32)
6629 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6630
6631 if (ShOpc == ISD::SRL) {
6632 if (!Con)
6633 // There is no t2LSRLr instruction so negate and perform an lsll if the
6634 // shift amount is in a register, emulating a right shift.
6635 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6636 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6637 else
6638 // Else generate an lsrl on the immediate shift amount
6639 ShPartsOpc = ARMISD::LSRL;
6640 } else if (ShOpc == ISD::SRA)
6641 ShPartsOpc = ARMISD::ASRL;
6642
6643 // Split Lower/Upper 32 bits of the destination/source
6644 SDValue Lo, Hi;
6645 std::tie(Lo, Hi) =
6646 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6647 // Generate the shift operation as computed above
6648 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6649 ShAmt);
6650 // The upper 32 bits come from the second return value of lsll
6651 Hi = SDValue(Lo.getNode(), 1);
6652 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6653 }
6654
6655 // We only lower SRA and SRL by 1 here; all others use generic lowering.
6656 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6657 return SDValue();
6658
6659 // If we are in thumb mode, we don't have RRX.
6660 if (ST->isThumb1Only())
6661 return SDValue();
6662
6663 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6664 SDValue Lo, Hi;
6665 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6666
6667 // First, build an LSRS1/ASRS1 op, which shifts the top part by one and
6668 // captures the shifted-out bit in the carry flag.
6669 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6670 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6671
6672 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6673 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6674
6675 // Merge the pieces into a single i64 value.
6676 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6677}
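// Worked example for the non-MVE path above (illustration only): for an i64
// logical shift right by 1, the high word goes through LSRS1, which shifts it
// right by one and leaves the shifted-out bit in the carry flag, and the low
// word goes through RRX, which shifts it right by one while shifting that
// carry into its top bit; the two halves are then reassembled as an i64 pair.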
6678
6679static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6680 const ARMSubtarget *ST) {
6681 bool Invert = false;
6682 bool Swap = false;
6683 unsigned Opc = ARMCC::AL;
6684
6685 SDValue Op0 = Op.getOperand(0);
6686 SDValue Op1 = Op.getOperand(1);
6687 SDValue CC = Op.getOperand(2);
6688 EVT VT = Op.getValueType();
6689 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6690 SDLoc dl(Op);
6691
6692 EVT CmpVT;
6693 if (ST->hasNEON())
6694 CmpVT = VT.changeVectorElementTypeToInteger();
6695 else {
6696 assert(ST->hasMVEIntegerOps() &&
6697 "No hardware support for integer vector comparison!");
6698
6699 if (Op.getValueType().getVectorElementType() != MVT::i1)
6700 return SDValue();
6701
6702 // Make sure we expand floating point setcc to scalar if we do not have
6703 // mve.fp, so that we can handle them from there.
6704 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6705 return SDValue();
6706
6707 CmpVT = VT;
6708 }
6709
6710 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6711 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6712 // Special-case integer 64-bit equality comparisons. They aren't legal,
6713 // but they can be lowered with a few vector instructions.
6714 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6715 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6716 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6717 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6718 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6719 DAG.getCondCode(ISD::SETEQ));
6720 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6721 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6722 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6723 if (SetCCOpcode == ISD::SETNE)
6724 Merged = DAG.getNOT(dl, Merged, CmpVT);
6725 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6726 return Merged;
6727 }
6728
6729 if (CmpVT.getVectorElementType() == MVT::i64)
6730 // 64-bit comparisons are not legal in general.
6731 return SDValue();
6732
6733 if (Op1.getValueType().isFloatingPoint()) {
6734 switch (SetCCOpcode) {
6735 default: llvm_unreachable("Illegal FP comparison");
6736 case ISD::SETUNE:
6737 case ISD::SETNE:
6738 if (ST->hasMVEFloatOps()) {
6739 Opc = ARMCC::NE; break;
6740 } else {
6741 Invert = true; [[fallthrough]];
6742 }
6743 case ISD::SETOEQ:
6744 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6745 case ISD::SETOLT:
6746 case ISD::SETLT: Swap = true; [[fallthrough]];
6747 case ISD::SETOGT:
6748 case ISD::SETGT: Opc = ARMCC::GT; break;
6749 case ISD::SETOLE:
6750 case ISD::SETLE: Swap = true; [[fallthrough]];
6751 case ISD::SETOGE:
6752 case ISD::SETGE: Opc = ARMCC::GE; break;
6753 case ISD::SETUGE: Swap = true; [[fallthrough]];
6754 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6755 case ISD::SETUGT: Swap = true; [[fallthrough]];
6756 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6757 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6758 case ISD::SETONE: {
6759 // Expand this to (OLT | OGT).
6760 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6761 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6762 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6763 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6764 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6765 if (Invert)
6766 Result = DAG.getNOT(dl, Result, VT);
6767 return Result;
6768 }
6769 case ISD::SETUO: Invert = true; [[fallthrough]];
6770 case ISD::SETO: {
6771 // Expand this to (OLT | OGE).
6772 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6773 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6774 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6775 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6776 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6777 if (Invert)
6778 Result = DAG.getNOT(dl, Result, VT);
6779 return Result;
6780 }
6781 }
6782 } else {
6783 // Integer comparisons.
6784 switch (SetCCOpcode) {
6785 default: llvm_unreachable("Illegal integer comparison");
6786 case ISD::SETNE:
6787 if (ST->hasMVEIntegerOps()) {
6788 Opc = ARMCC::NE; break;
6789 } else {
6790 Invert = true; [[fallthrough]];
6791 }
6792 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6793 case ISD::SETLT: Swap = true; [[fallthrough]];
6794 case ISD::SETGT: Opc = ARMCC::GT; break;
6795 case ISD::SETLE: Swap = true; [[fallthrough]];
6796 case ISD::SETGE: Opc = ARMCC::GE; break;
6797 case ISD::SETULT: Swap = true; [[fallthrough]];
6798 case ISD::SETUGT: Opc = ARMCC::HI; break;
6799 case ISD::SETULE: Swap = true; [[fallthrough]];
6800 case ISD::SETUGE: Opc = ARMCC::HS; break;
6801 }
6802
6803 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6804 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6805 SDValue AndOp;
6806 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6807 AndOp = Op0;
6808 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6809 AndOp = Op1;
6810
6811 // Ignore bitconvert.
6812 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6813 AndOp = AndOp.getOperand(0);
6814
6815 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6816 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6817 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6818 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6819 if (!Invert)
6820 Result = DAG.getNOT(dl, Result, VT);
6821 return Result;
6822 }
6823 }
6824 }
6825
6826 if (Swap)
6827 std::swap(Op0, Op1);
6828
6829 // If one of the operands is a constant vector zero, attempt to fold the
6830 // comparison to a specialized compare-against-zero form.
6832 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6833 Opc == ARMCC::NE)) {
6834 if (Opc == ARMCC::GE)
6835 Opc = ARMCC::LE;
6836 else if (Opc == ARMCC::GT)
6837 Opc = ARMCC::LT;
6838 std::swap(Op0, Op1);
6839 }
6840
6841 SDValue Result;
6842 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6843 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6844 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6845 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6846 DAG.getConstant(Opc, dl, MVT::i32));
6847 else
6848 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6849 DAG.getConstant(Opc, dl, MVT::i32));
6850
6851 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6852
6853 if (Invert)
6854 Result = DAG.getNOT(dl, Result, VT);
6855
6856 return Result;
6857}
6858
6859static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6860 SDValue LHS = Op.getOperand(0);
6861 SDValue RHS = Op.getOperand(1);
6862 SDValue Carry = Op.getOperand(2);
6863 SDValue Cond = Op.getOperand(3);
6864 SDLoc DL(Op);
6865
6866 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6867
6868 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6869 // have to invert the carry first.
6870 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6871 DAG.getConstant(1, DL, MVT::i32), Carry);
6872 // This converts the boolean value carry into the carry flag.
6873 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6874
6875 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6876 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6877
6878 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6879 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6880 SDValue ARMcc = DAG.getConstant(
6881 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6882 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6883 Cmp.getValue(1));
6884}
6885
6886/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6887/// valid vector constant for a NEON or MVE instruction with a "modified
6888/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6889static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6890 unsigned SplatBitSize, SelectionDAG &DAG,
6891 const SDLoc &dl, EVT &VT, EVT VectorVT,
6892 VMOVModImmType type) {
6893 unsigned OpCmode, Imm;
6894 bool is128Bits = VectorVT.is128BitVector();
6895
6896 // SplatBitSize is set to the smallest size that splats the vector, so a
6897 // zero vector will always have SplatBitSize == 8. However, NEON modified
6898 // immediate instructions other than VMOV do not support the 8-bit encoding
6899 // of a zero vector, and the default encoding of zero is supposed to be the
6900 // 32-bit version.
6901 if (SplatBits == 0)
6902 SplatBitSize = 32;
6903
6904 switch (SplatBitSize) {
6905 case 8:
6906 if (type != VMOVModImm)
6907 return SDValue();
6908 // Any 1-byte value is OK. Op=0, Cmode=1110.
6909 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6910 OpCmode = 0xe;
6911 Imm = SplatBits;
6912 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6913 break;
6914
6915 case 16:
6916 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6917 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6918 if ((SplatBits & ~0xff) == 0) {
6919 // Value = 0x00nn: Op=x, Cmode=100x.
6920 OpCmode = 0x8;
6921 Imm = SplatBits;
6922 break;
6923 }
6924 if ((SplatBits & ~0xff00) == 0) {
6925 // Value = 0xnn00: Op=x, Cmode=101x.
6926 OpCmode = 0xa;
6927 Imm = SplatBits >> 8;
6928 break;
6929 }
6930 return SDValue();
6931
6932 case 32:
6933 // NEON's 32-bit VMOV supports splat values where:
6934 // * only one byte is nonzero, or
6935 // * the least significant byte is 0xff and the second byte is nonzero, or
6936 // * the least significant 2 bytes are 0xff and the third is nonzero.
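// For example, 0x00004500 has only byte 1 nonzero and is encoded below with
// Cmode=001x, Imm=0x45, while 0x000045ff has its low byte equal to 0xff and
// is encoded with Cmode=1100, Imm=0x45.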
6937 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6938 if ((SplatBits & ~0xff) == 0) {
6939 // Value = 0x000000nn: Op=x, Cmode=000x.
6940 OpCmode = 0;
6941 Imm = SplatBits;
6942 break;
6943 }
6944 if ((SplatBits & ~0xff00) == 0) {
6945 // Value = 0x0000nn00: Op=x, Cmode=001x.
6946 OpCmode = 0x2;
6947 Imm = SplatBits >> 8;
6948 break;
6949 }
6950 if ((SplatBits & ~0xff0000) == 0) {
6951 // Value = 0x00nn0000: Op=x, Cmode=010x.
6952 OpCmode = 0x4;
6953 Imm = SplatBits >> 16;
6954 break;
6955 }
6956 if ((SplatBits & ~0xff000000) == 0) {
6957 // Value = 0xnn000000: Op=x, Cmode=011x.
6958 OpCmode = 0x6;
6959 Imm = SplatBits >> 24;
6960 break;
6961 }
6962
6963 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6964 if (type == OtherModImm) return SDValue();
6965
6966 if ((SplatBits & ~0xffff) == 0 &&
6967 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6968 // Value = 0x0000nnff: Op=x, Cmode=1100.
6969 OpCmode = 0xc;
6970 Imm = SplatBits >> 8;
6971 break;
6972 }
6973
6974 // cmode == 0b1101 is not supported for MVE VMVN
6975 if (type == MVEVMVNModImm)
6976 return SDValue();
6977
6978 if ((SplatBits & ~0xffffff) == 0 &&
6979 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6980 // Value = 0x00nnffff: Op=x, Cmode=1101.
6981 OpCmode = 0xd;
6982 Imm = SplatBits >> 16;
6983 break;
6984 }
6985
6986 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6987 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6988 // VMOV.I32. A (very) minor optimization would be to replicate the value
6989 // and fall through here to test for a valid 64-bit splat. But, then the
6990 // caller would also need to check and handle the change in size.
6991 return SDValue();
6992
6993 case 64: {
6994 if (type != VMOVModImm)
6995 return SDValue();
6996 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
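// For example, the splat 0x00ff00ff00ff00ff (bytes alternating 0xff and 0)
// yields Imm = 0b01010101 = 0x55 from the loop below.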
6997 uint64_t BitMask = 0xff;
6998 unsigned ImmMask = 1;
6999 Imm = 0;
7000 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7001 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7002 Imm |= ImmMask;
7003 } else if ((SplatBits & BitMask) != 0) {
7004 return SDValue();
7005 }
7006 BitMask <<= 8;
7007 ImmMask <<= 1;
7008 }
7009
7010 // Op=1, Cmode=1110.
7011 OpCmode = 0x1e;
7012 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7013 break;
7014 }
7015
7016 default:
7017 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7018 }
7019
7020 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7021 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7022}
7023
7024SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7025 const ARMSubtarget *ST) const {
7026 EVT VT = Op.getValueType();
7027 bool IsDouble = (VT == MVT::f64);
7028 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7029 const APFloat &FPVal = CFP->getValueAPF();
7030
7031 // Prevent floating-point constants from using literal loads
7032 // when execute-only is enabled.
7033 if (ST->genExecuteOnly()) {
7034 // We shouldn't trigger this for v6m execute-only
7035 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7036 "Unexpected architecture");
7037
7038 // If we can represent the constant as an immediate, don't lower it
7039 if (isFPImmLegal(FPVal, VT))
7040 return Op;
7041 // Otherwise, construct as integer, and move to float register
7042 APInt INTVal = FPVal.bitcastToAPInt();
7043 SDLoc DL(CFP);
7044 switch (VT.getSimpleVT().SimpleTy) {
7045 default:
7046 llvm_unreachable("Unknown floating point type!");
7047 break;
7048 case MVT::f64: {
7049 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7050 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7051 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7052 }
7053 case MVT::f32:
7054 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7055 DAG.getConstant(INTVal, DL, MVT::i32));
7056 }
7057 }
7058
7059 if (!ST->hasVFP3Base())
7060 return SDValue();
7061
7062 // Use the default (constant pool) lowering for double constants when we have
7063 // an SP-only FPU
7064 if (IsDouble && !Subtarget->hasFP64())
7065 return SDValue();
7066
7067 // Try splatting with a VMOV.f32...
7068 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7069
7070 if (ImmVal != -1) {
7071 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7072 // We have code in place to select a valid ConstantFP already, no need to
7073 // do any mangling.
7074 return Op;
7075 }
7076
7077 // It's a float and we are trying to use NEON operations where
7078 // possible. Lower it to a splat followed by an extract.
7079 SDLoc DL(Op);
7080 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7081 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7082 NewVal);
7083 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7084 DAG.getConstant(0, DL, MVT::i32));
7085 }
7086
7087 // The rest of our options are NEON only, make sure that's allowed before
7088 // proceeding.
7089 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7090 return SDValue();
7091
7092 EVT VMovVT;
7093 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7094
7095 // It wouldn't really be worth bothering for doubles except for one very
7096 // important value, which does happen to match: 0.0. So make sure we don't do
7097 // anything stupid.
7098 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7099 return SDValue();
7100
7101 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7102 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7103 VMovVT, VT, VMOVModImm);
7104 if (NewVal != SDValue()) {
7105 SDLoc DL(Op);
7106 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7107 NewVal);
7108 if (IsDouble)
7109 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7110
7111 // It's a float: cast and extract a vector element.
7112 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7113 VecConstant);
7114 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7115 DAG.getConstant(0, DL, MVT::i32));
7116 }
7117
7118 // Finally, try a VMVN.i32
7119 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7120 VT, VMVNModImm);
7121 if (NewVal != SDValue()) {
7122 SDLoc DL(Op);
7123 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7124
7125 if (IsDouble)
7126 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7127
7128 // It's a float: cast and extract a vector element.
7129 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7130 VecConstant);
7131 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7132 DAG.getConstant(0, DL, MVT::i32));
7133 }
7134
7135 return SDValue();
7136}
7137
7138 // Check if a VEXT instruction can handle the shuffle mask when the
7139// vector sources of the shuffle are the same.
7140static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7141 unsigned NumElts = VT.getVectorNumElements();
7142
7143 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7144 if (M[0] < 0)
7145 return false;
7146
7147 Imm = M[0];
7148
7149 // If this is a VEXT shuffle, the immediate value is the index of the first
7150 // element. The other shuffle indices must be the successive elements after
7151 // the first one.
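// e.g. for a single-source v4i32 shuffle, mask <2, 3, 0, 1> is accepted with
// Imm = 2: the expected index simply wraps from 3 back around to 0.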
7152 unsigned ExpectedElt = Imm;
7153 for (unsigned i = 1; i < NumElts; ++i) {
7154 // Increment the expected index. If it wraps around, just follow it
7155 // back to index zero and keep going.
7156 ++ExpectedElt;
7157 if (ExpectedElt == NumElts)
7158 ExpectedElt = 0;
7159
7160 if (M[i] < 0) continue; // ignore UNDEF indices
7161 if (ExpectedElt != static_cast<unsigned>(M[i]))
7162 return false;
7163 }
7164
7165 return true;
7166}
7167
7168static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7169 bool &ReverseVEXT, unsigned &Imm) {
7170 unsigned NumElts = VT.getVectorNumElements();
7171 ReverseVEXT = false;
7172
7173 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7174 if (M[0] < 0)
7175 return false;
7176
7177 Imm = M[0];
7178
7179 // If this is a VEXT shuffle, the immediate value is the index of the first
7180 // element. The other shuffle indices must be the successive elements after
7181 // the first one.
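// e.g. for v4i32 sources, mask <6, 7, 0, 1> wraps past the end of the
// concatenated operands, so ReverseVEXT is set and Imm becomes 6 - 4 = 2.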
7182 unsigned ExpectedElt = Imm;
7183 for (unsigned i = 1; i < NumElts; ++i) {
7184 // Increment the expected index. If it wraps around, it may still be
7185 // a VEXT but the source vectors must be swapped.
7186 ExpectedElt += 1;
7187 if (ExpectedElt == NumElts * 2) {
7188 ExpectedElt = 0;
7189 ReverseVEXT = true;
7190 }
7191
7192 if (M[i] < 0) continue; // ignore UNDEF indices
7193 if (ExpectedElt != static_cast<unsigned>(M[i]))
7194 return false;
7195 }
7196
7197 // Adjust the index value if the source operands will be swapped.
7198 if (ReverseVEXT)
7199 Imm -= NumElts;
7200
7201 return true;
7202}
7203
7204static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7205 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7206 // range, then 0 is placed into the resulting vector. So pretty much any mask
7207 // of 8 elements can work here.
7208 return VT == MVT::v8i8 && M.size() == 8;
7209}
7210
7211static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7212 unsigned Index) {
7213 if (Mask.size() == Elements * 2)
7214 return Index / Elements;
7215 return Mask[Index] == 0 ? 0 : 1;
7216}
7217
7218// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7219// checking that pairs of elements in the shuffle mask represent the same index
7220// in each vector, incrementing the expected index by 2 at each step.
7221// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7222// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7223// v2={e,f,g,h}
7224// WhichResult gives the offset for each element in the mask based on which
7225// of the two results it belongs to.
7226//
7227// The transpose can be represented either as:
7228// result1 = shufflevector v1, v2, result1_shuffle_mask
7229// result2 = shufflevector v1, v2, result2_shuffle_mask
7230// where v1/v2 and the shuffle masks have the same number of elements
7231// (here WhichResult (see below) indicates which result is being checked)
7232//
7233// or as:
7234// results = shufflevector v1, v2, shuffle_mask
7235// where both results are returned in one vector and the shuffle mask has twice
7236 // as many elements as v1/v2 (here WhichResult will always be 0 if true); here we
7237 // want to check the low half and high half of the shuffle mask as if it were
7238 // the other case.
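// e.g. for v4i32 inputs, the combined 8-element mask <0, 4, 2, 6, 1, 5, 3, 7>
// encodes both transpose results at once, and WhichResult is forced to 0.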
7239static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7240 unsigned EltSz = VT.getScalarSizeInBits();
7241 if (EltSz == 64)
7242 return false;
7243
7244 unsigned NumElts = VT.getVectorNumElements();
7245 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7246 return false;
7247
7248 // If the mask is twice as long as the input vector then we need to check the
7249 // upper and lower parts of the mask with a matching value for WhichResult
7250 // FIXME: A mask with only even values will be rejected in case the first
7251 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7252 // M[0] is used to determine WhichResult
7253 for (unsigned i = 0; i < M.size(); i += NumElts) {
7254 WhichResult = SelectPairHalf(NumElts, M, i);
7255 for (unsigned j = 0; j < NumElts; j += 2) {
7256 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7257 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7258 return false;
7259 }
7260 }
7261
7262 if (M.size() == NumElts*2)
7263 WhichResult = 0;
7264
7265 return true;
7266}
7267
7268/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7269/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7270/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7271static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7272 unsigned EltSz = VT.getScalarSizeInBits();
7273 if (EltSz == 64)
7274 return false;
7275
7276 unsigned NumElts = VT.getVectorNumElements();
7277 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7278 return false;
7279
7280 for (unsigned i = 0; i < M.size(); i += NumElts) {
7281 WhichResult = SelectPairHalf(NumElts, M, i);
7282 for (unsigned j = 0; j < NumElts; j += 2) {
7283 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7284 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7285 return false;
7286 }
7287 }
7288
7289 if (M.size() == NumElts*2)
7290 WhichResult = 0;
7291
7292 return true;
7293}
7294
7295// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7296// that the mask elements are either all even and in steps of size 2 or all odd
7297// and in steps of size 2.
7298// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7299// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7300// v2={e,f,g,h}
7301// Requires similar checks to that of isVTRNMask with
7302 // respect to how the results are returned.
7303static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7304 unsigned EltSz = VT.getScalarSizeInBits();
7305 if (EltSz == 64)
7306 return false;
7307
7308 unsigned NumElts = VT.getVectorNumElements();
7309 if (M.size() != NumElts && M.size() != NumElts*2)
7310 return false;
7311
7312 for (unsigned i = 0; i < M.size(); i += NumElts) {
7313 WhichResult = SelectPairHalf(NumElts, M, i);
7314 for (unsigned j = 0; j < NumElts; ++j) {
7315 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7316 return false;
7317 }
7318 }
7319
7320 if (M.size() == NumElts*2)
7321 WhichResult = 0;
7322
7323 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7324 if (VT.is64BitVector() && EltSz == 32)
7325 return false;
7326
7327 return true;
7328}
7329
7330/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7331/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7332/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7333static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7334 unsigned EltSz = VT.getScalarSizeInBits();
7335 if (EltSz == 64)
7336 return false;
7337
7338 unsigned NumElts = VT.getVectorNumElements();
7339 if (M.size() != NumElts && M.size() != NumElts*2)
7340 return false;
7341
7342 unsigned Half = NumElts / 2;
7343 for (unsigned i = 0; i < M.size(); i += NumElts) {
7344 WhichResult = SelectPairHalf(NumElts, M, i);
7345 for (unsigned j = 0; j < NumElts; j += Half) {
7346 unsigned Idx = WhichResult;
7347 for (unsigned k = 0; k < Half; ++k) {
7348 int MIdx = M[i + j + k];
7349 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7350 return false;
7351 Idx += 2;
7352 }
7353 }
7354 }
7355
7356 if (M.size() == NumElts*2)
7357 WhichResult = 0;
7358
7359 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7360 if (VT.is64BitVector() && EltSz == 32)
7361 return false;
7362
7363 return true;
7364}
7365
7366// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7367// that pairs of elements of the shufflemask represent the same index in each
7368// vector incrementing sequentially through the vectors.
7369// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7370// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7371// v2={e,f,g,h}
7372 // Requires similar checks to that of isVTRNMask with respect to how results
7373// are returned.
7374static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7375 unsigned EltSz = VT.getScalarSizeInBits();
7376 if (EltSz == 64)
7377 return false;
7378
7379 unsigned NumElts = VT.getVectorNumElements();
7380 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7381 return false;
7382
7383 for (unsigned i = 0; i < M.size(); i += NumElts) {
7384 WhichResult = SelectPairHalf(NumElts, M, i);
7385 unsigned Idx = WhichResult * NumElts / 2;
7386 for (unsigned j = 0; j < NumElts; j += 2) {
7387 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7388 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7389 return false;
7390 Idx += 1;
7391 }
7392 }
7393
7394 if (M.size() == NumElts*2)
7395 WhichResult = 0;
7396
7397 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7398 if (VT.is64BitVector() && EltSz == 32)
7399 return false;
7400
7401 return true;
7402}
7403
7404/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7405/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7406/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7407static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7408 unsigned EltSz = VT.getScalarSizeInBits();
7409 if (EltSz == 64)
7410 return false;
7411
7412 unsigned NumElts = VT.getVectorNumElements();
7413 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7414 return false;
7415
7416 for (unsigned i = 0; i < M.size(); i += NumElts) {
7417 WhichResult = SelectPairHalf(NumElts, M, i);
7418 unsigned Idx = WhichResult * NumElts / 2;
7419 for (unsigned j = 0; j < NumElts; j += 2) {
7420 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7421 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7422 return false;
7423 Idx += 1;
7424 }
7425 }
7426
7427 if (M.size() == NumElts*2)
7428 WhichResult = 0;
7429
7430 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7431 if (VT.is64BitVector() && EltSz == 32)
7432 return false;
7433
7434 return true;
7435}
7436
7437/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7438/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7439static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7440 unsigned &WhichResult,
7441 bool &isV_UNDEF) {
7442 isV_UNDEF = false;
7443 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7444 return ARMISD::VTRN;
7445 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7446 return ARMISD::VUZP;
7447 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7448 return ARMISD::VZIP;
7449
7450 isV_UNDEF = true;
7451 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7452 return ARMISD::VTRN;
7453 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7454 return ARMISD::VUZP;
7455 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7456 return ARMISD::VZIP;
7457
7458 return 0;
7459}
7460
7461 /// \return true if this is a reverse operation on a vector.
7462static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7463 unsigned NumElts = VT.getVectorNumElements();
7464 // Make sure the mask has the right size.
7465 if (NumElts != M.size())
7466 return false;
7467
7468 // Look for <15, ..., 3, -1, 1, 0>.
7469 for (unsigned i = 0; i != NumElts; ++i)
7470 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7471 return false;
7472
7473 return true;
7474}
7475
7476static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7477 unsigned NumElts = VT.getVectorNumElements();
7478 // Make sure the mask has the right size.
7479 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7480 return false;
7481
7482 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7483 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7484 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7485 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7486 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7487 int Ofs = Top ? 1 : 0;
7488 int Upper = SingleSource ? 0 : NumElts;
7489 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7490 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7491 return false;
7492 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7493 return false;
7494 }
7495 return true;
7496}
7497
7498static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7499 unsigned NumElts = VT.getVectorNumElements();
7500 // Make sure the mask has the right size.
7501 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7502 return false;
7503
7504 // If Top
7505 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7506 // This inserts Input2 into Input1
7507 // else if not Top
7508 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7509 // This inserts Input1 into Input2
7510 unsigned Offset = Top ? 0 : 1;
7511 unsigned N = SingleSource ? 0 : NumElts;
7512 for (unsigned i = 0; i < NumElts; i += 2) {
7513 if (M[i] >= 0 && M[i] != (int)i)
7514 return false;
7515 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7516 return false;
7517 }
7518
7519 return true;
7520}
7521
7522static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7523 unsigned NumElts = ToVT.getVectorNumElements();
7524 if (NumElts != M.size())
7525 return false;
7526
7527 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7528 // looking for patterns of:
7529 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7530 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7531
7532 unsigned Off0 = rev ? NumElts / 2 : 0;
7533 unsigned Off1 = rev ? 0 : NumElts / 2;
7534 for (unsigned i = 0; i < NumElts; i += 2) {
7535 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7536 return false;
7537 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7538 return false;
7539 }
7540
7541 return true;
7542}
7543
7544// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7545// from a pair of inputs. For example:
7546// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7547// FP_ROUND(EXTRACT_ELT(Y, 0),
7548// FP_ROUND(EXTRACT_ELT(X, 1),
7549// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7550 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7551 const ARMSubtarget *ST) {
7552 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7553 if (!ST->hasMVEFloatOps())
7554 return SDValue();
7555
7556 SDLoc dl(BV);
7557 EVT VT = BV.getValueType();
7558 if (VT != MVT::v8f16)
7559 return SDValue();
7560
7561 // We are looking for a buildvector of fptrunc elements, where all the
7562 // elements are interleavingly extracted from two sources. Check the first two
7563 // items are valid enough and extract some info from them (they are checked
7564 // properly in the loop below).
7565 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7566 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7567 !isa<ConstantSDNode>(BV.getOperand(0).getOperand(0).getOperand(1)))
7568 return SDValue();
7569 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7570 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7571 !isa<ConstantSDNode>(BV.getOperand(1).getOperand(0).getOperand(1)))
7572 return SDValue();
7573 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7574 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7575 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7576 return SDValue();
7577
7578 // Check all the values in the BuildVector line up with our expectations.
7579 for (unsigned i = 1; i < 4; i++) {
7580 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7581 return Trunc.getOpcode() == ISD::FP_ROUND &&
7582 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7583 Trunc.getOperand(0).getOperand(0) == Op &&
7584 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7585 };
7586 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7587 return SDValue();
7588 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7589 return SDValue();
7590 }
7591
7592 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7593 DAG.getConstant(0, dl, MVT::i32));
7594 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7595 DAG.getConstant(1, dl, MVT::i32));
7596}
7597
7598// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7599// from a single input on alternating lanes. For example:
7600// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7601// FP_ROUND(EXTRACT_ELT(X, 2),
7602// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7603 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7604 const ARMSubtarget *ST) {
7605 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7606 if (!ST->hasMVEFloatOps())
7607 return SDValue();
7608
7609 SDLoc dl(BV);
7610 EVT VT = BV.getValueType();
7611 if (VT != MVT::v4f32)
7612 return SDValue();
7613
7614 // We are looking for a buildvector of fpext elements, where all the
7615 // elements are alternating lanes from a single source. For example <0,2,4,6>
7616 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7617 // info from them (they are checked properly in the loop below).
7618 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7619 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7620 return SDValue();
7621 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7622 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7623 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7624 return SDValue();
7625
7626 // Check all the values in the BuildVector line up with our expectations.
7627 for (unsigned i = 1; i < 4; i++) {
7628 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7629 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7630 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7631 Trunc.getOperand(0).getOperand(0) == Op &&
7632 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7633 };
7634 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7635 return SDValue();
7636 }
7637
7638 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7639 DAG.getConstant(Offset, dl, MVT::i32));
7640}
7641
7642// If N is an integer constant that can be moved into a register in one
7643// instruction, return an SDValue of such a constant (will become a MOV
7644// instruction). Otherwise return null.
7645 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7646 const ARMSubtarget *ST, const SDLoc &dl) {
7647 uint64_t Val;
7648 if (!isa<ConstantSDNode>(N))
7649 return SDValue();
7650 Val = N->getAsZExtVal();
7651
7652 if (ST->isThumb1Only()) {
7653 if (Val <= 255 || ~Val <= 255)
7654 return DAG.getConstant(Val, dl, MVT::i32);
7655 } else {
7656 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7657 return DAG.getConstant(Val, dl, MVT::i32);
7658 }
7659 return SDValue();
7660}
7661
7662 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7663 const ARMSubtarget *ST) {
7664 SDLoc dl(Op);
7665 EVT VT = Op.getValueType();
7666
7667 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7668
7669 unsigned NumElts = VT.getVectorNumElements();
7670 unsigned BoolMask;
7671 unsigned BitsPerBool;
7672 if (NumElts == 2) {
7673 BitsPerBool = 8;
7674 BoolMask = 0xff;
7675 } else if (NumElts == 4) {
7676 BitsPerBool = 4;
7677 BoolMask = 0xf;
7678 } else if (NumElts == 8) {
7679 BitsPerBool = 2;
7680 BoolMask = 0x3;
7681 } else if (NumElts == 16) {
7682 BitsPerBool = 1;
7683 BoolMask = 0x1;
7684 } else
7685 return SDValue();
7686
7687 // If this is a single value copied into all lanes (a splat), we can just sign
7688 // extend that single value
7689 SDValue FirstOp = Op.getOperand(0);
7690 if (!isa<ConstantSDNode>(FirstOp) &&
7691 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7692 return U.get().isUndef() || U.get() == FirstOp;
7693 })) {
7694 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7695 DAG.getValueType(MVT::i1));
7696 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7697 }
7698
7699 // First create base with bits set where known
7700 unsigned Bits32 = 0;
7701 for (unsigned i = 0; i < NumElts; ++i) {
7702 SDValue V = Op.getOperand(i);
7703 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7704 continue;
7705 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7706 if (BitSet)
7707 Bits32 |= BoolMask << (i * BitsPerBool);
7708 }
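// e.g. for a v4i1 constant <1, 0, 1, 1>, BitsPerBool is 4, so the loop above
// produces Bits32 = 0xff0f.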
7709
7710 // Add in unknown nodes
7711 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7712 DAG.getConstant(Bits32, dl, MVT::i32));
7713 for (unsigned i = 0; i < NumElts; ++i) {
7714 SDValue V = Op.getOperand(i);
7715 if (isa<ConstantSDNode>(V) || V.isUndef())
7716 continue;
7717 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7718 DAG.getConstant(i, dl, MVT::i32));
7719 }
7720
7721 return Base;
7722}
7723
7724 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7725 const ARMSubtarget *ST) {
7726 if (!ST->hasMVEIntegerOps())
7727 return SDValue();
7728
7729 // We are looking for a buildvector where each element is Op[0] + i*N
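// e.g. the buildvector (x, x+2, x+4, x+6) becomes a VIDUP with base x and
// increment N = 2.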
7730 EVT VT = Op.getValueType();
7731 SDValue Op0 = Op.getOperand(0);
7732 unsigned NumElts = VT.getVectorNumElements();
7733
7734 // Get the increment value from operand 1
7735 SDValue Op1 = Op.getOperand(1);
7736 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7737 !isa<ConstantSDNode>(Op1.getOperand(1)))
7738 return SDValue();
7739 unsigned N = Op1.getConstantOperandVal(1);
7740 if (N != 1 && N != 2 && N != 4 && N != 8)
7741 return SDValue();
7742
7743 // Check that each other operand matches
7744 for (unsigned I = 2; I < NumElts; I++) {
7745 SDValue OpI = Op.getOperand(I);
7746 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7747 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7748 OpI.getConstantOperandVal(1) != I * N)
7749 return SDValue();
7750 }
7751
7752 SDLoc DL(Op);
7753 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7754 DAG.getConstant(N, DL, MVT::i32));
7755}
7756
7757// Returns true if the operation N can be treated as qr instruction variant at
7758// operand Op.
7759static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7760 switch (N->getOpcode()) {
7761 case ISD::ADD:
7762 case ISD::MUL:
7763 case ISD::SADDSAT:
7764 case ISD::UADDSAT:
7765 case ISD::AVGFLOORS:
7766 case ISD::AVGFLOORU:
7767 return true;
7768 case ISD::SUB:
7769 case ISD::SSUBSAT:
7770 case ISD::USUBSAT:
7771 return N->getOperand(1).getNode() == Op;
7772 case ISD::INTRINSIC_WO_CHAIN:
7773 switch (N->getConstantOperandVal(0)) {
7774 case Intrinsic::arm_mve_add_predicated:
7775 case Intrinsic::arm_mve_mul_predicated:
7776 case Intrinsic::arm_mve_qadd_predicated:
7777 case Intrinsic::arm_mve_vhadd:
7778 case Intrinsic::arm_mve_hadd_predicated:
7779 case Intrinsic::arm_mve_vqdmulh:
7780 case Intrinsic::arm_mve_qdmulh_predicated:
7781 case Intrinsic::arm_mve_vqrdmulh:
7782 case Intrinsic::arm_mve_qrdmulh_predicated:
7783 case Intrinsic::arm_mve_vqdmull:
7784 case Intrinsic::arm_mve_vqdmull_predicated:
7785 return true;
7786 case Intrinsic::arm_mve_sub_predicated:
7787 case Intrinsic::arm_mve_qsub_predicated:
7788 case Intrinsic::arm_mve_vhsub:
7789 case Intrinsic::arm_mve_hsub_predicated:
7790 return N->getOperand(2).getNode() == Op;
7791 default:
7792 return false;
7793 }
7794 default:
7795 return false;
7796 }
7797}
7798
7799// If this is a case we can't handle, return null and let the default
7800// expansion code take care of it.
7801SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7802 const ARMSubtarget *ST) const {
7803 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7804 SDLoc dl(Op);
7805 EVT VT = Op.getValueType();
7806
7807 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7808 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7809
7810 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7811 return R;
7812
7813 APInt SplatBits, SplatUndef;
7814 unsigned SplatBitSize;
7815 bool HasAnyUndefs;
7816 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7817 if (SplatUndef.isAllOnes())
7818 return DAG.getUNDEF(VT);
7819
7820 // If all the users of this constant splat are qr instruction variants,
7821 // generate a vdup of the constant.
7822 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7823 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7824 all_of(BVN->users(),
7825 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7826 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7827 : SplatBitSize == 16 ? MVT::v8i16
7828 : MVT::v16i8;
7829 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7830 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7831 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7832 }
7833
7834 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7835 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7836 // Check if an immediate VMOV works.
7837 EVT VmovVT;
7838 SDValue Val =
7839 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7840 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7841
7842 if (Val.getNode()) {
7843 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7844 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7845 }
7846
7847 // Try an immediate VMVN.
7848 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7849 Val = isVMOVModifiedImm(
7850 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7851 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7852 if (Val.getNode()) {
7853 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7854 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7855 }
7856
7857 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7858 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7859 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7860 if (ImmVal != -1) {
7861 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7862 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7863 }
7864 }
7865
7866 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7867 // type.
7868 if (ST->hasMVEIntegerOps() &&
7869 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7870 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7871 : SplatBitSize == 16 ? MVT::v8i16
7872 : MVT::v16i8;
7873 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7874 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7875 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7876 }
7877 }
7878 }
7879
7880 // Scan through the operands to see if only one value is used.
7881 //
7882 // As an optimisation, even if more than one value is used it may be more
7883 // profitable to splat with one value and then change some lanes.
7884 //
7885 // Heuristically we decide to do this if the vector has a "dominant" value,
7886 // defined as splatted to more than half of the lanes.
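// e.g. in a v4i32 build vector <x, x, x, y>, x is dominant (3 of 4 lanes), so
// x is splatted with a VDUP and the one differing lane is inserted afterwards.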
7887 unsigned NumElts = VT.getVectorNumElements();
7888 bool isOnlyLowElement = true;
7889 bool usesOnlyOneValue = true;
7890 bool hasDominantValue = false;
7891 bool isConstant = true;
7892
7893 // Map of the number of times a particular SDValue appears in the
7894 // element list.
7895 DenseMap<SDValue, unsigned> ValueCounts;
7896 SDValue Value;
7897 for (unsigned i = 0; i < NumElts; ++i) {
7898 SDValue V = Op.getOperand(i);
7899 if (V.isUndef())
7900 continue;
7901 if (i > 0)
7902 isOnlyLowElement = false;
7903 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7904 isConstant = false;
7905
7906 unsigned &Count = ValueCounts[V];
7907
7908 // Is this value dominant? (takes up more than half of the lanes)
7909 if (++Count > (NumElts / 2)) {
7910 hasDominantValue = true;
7911 Value = V;
7912 }
7913 }
7914 if (ValueCounts.size() != 1)
7915 usesOnlyOneValue = false;
7916 if (!Value.getNode() && !ValueCounts.empty())
7917 Value = ValueCounts.begin()->first;
7918
7919 if (ValueCounts.empty())
7920 return DAG.getUNDEF(VT);
7921
7922 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7923 // Keep going if we are hitting this case.
7924 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7925 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7926
7927 unsigned EltSize = VT.getScalarSizeInBits();
7928
7929 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7930 // i32 and try again.
7931 if (hasDominantValue && EltSize <= 32) {
7932 if (!isConstant) {
7933 SDValue N;
7934
7935 // If we are VDUPing a value that comes directly from a vector, that will
7936 // cause an unnecessary move to and from a GPR, where instead we could
7937 // just use VDUPLANE. We can only do this if the lane being extracted
7938 // is at a constant index, as the VDUP from lane instructions only have
7939 // constant-index forms.
7940 ConstantSDNode *constIndex;
7941 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7942 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7943 // We need to create a new undef vector to use for the VDUPLANE if the
7944 // size of the vector from which we get the value is different than the
7945 // size of the vector that we need to create. We will insert the element
7946 // such that the register coalescer will remove unnecessary copies.
7947 if (VT != Value->getOperand(0).getValueType()) {
7948 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7949 VT.getVectorNumElements();
7950 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7951 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7952 Value, DAG.getConstant(index, dl, MVT::i32)),
7953 DAG.getConstant(index, dl, MVT::i32));
7954 } else
7955 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7956 Value->getOperand(0), Value->getOperand(1));
7957 } else
7958 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7959
7960 if (!usesOnlyOneValue) {
7961 // The dominant value was splatted as 'N', but we now have to insert
7962 // all differing elements.
7963 for (unsigned I = 0; I < NumElts; ++I) {
7964 if (Op.getOperand(I) == Value)
7965 continue;
7966 SmallVector<SDValue, 3> Ops;
7967 Ops.push_back(N);
7968 Ops.push_back(Op.getOperand(I));
7969 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7970 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7971 }
7972 }
7973 return N;
7974 }
7975 if (VT.getVectorElementType().isFloatingPoint()) {
7976 SmallVector<SDValue, 8> Ops;
7977 MVT FVT = VT.getVectorElementType().getSimpleVT();
7978 assert(FVT == MVT::f32 || FVT == MVT::f16);
7979 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7980 for (unsigned i = 0; i < NumElts; ++i)
7981 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7982 Op.getOperand(i)));
7983 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7984 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7985 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7986 if (Val.getNode())
7987 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7988 }
7989 if (usesOnlyOneValue) {
7990 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7991 if (isConstant && Val.getNode())
7992 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7993 }
7994 }
7995
7996 // If all elements are constants and the case above didn't get hit, fall back
7997 // to the default expansion, which will generate a load from the constant
7998 // pool.
7999 if (isConstant)
8000 return SDValue();
8001
8002 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8003 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8004 // length <= 2.
8005 if (NumElts >= 4)
8006 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8007 return shuffle;
8008
8009 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8010 // VCVT's
8011 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8012 return VCVT;
8013 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8014 return VCVT;
8015
8016 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8017 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8018 // into two 64-bit vectors; we might discover a better way to lower it.
8019 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8020 EVT ExtVT = VT.getVectorElementType();
8021 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8022 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8023 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8024 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8025 SDValue Upper =
8026 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8027 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8028 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8029 if (Lower && Upper)
8030 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8031 }
8032
8033 // Vectors with 32- or 64-bit elements can be built by directly assigning
8034 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8035 // will be legalized.
8036 if (EltSize >= 32) {
8037 // Do the expansion with floating-point types, since that is what the VFP
8038 // registers are defined to use, and since i64 is not legal.
8039 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8040 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8041 SmallVector<SDValue, 8> Ops;
8042 for (unsigned i = 0; i < NumElts; ++i)
8043 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8044 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8045 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8046 }
8047
8048 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8049 // know the default expansion would otherwise fall back on something even
8050 // worse. For a vector with one or two non-undef values, that's
8051 // scalar_to_vector for the elements followed by a shuffle (provided the
8052 // shuffle is valid for the target) and materialization element by element
8053 // on the stack followed by a load for everything else.
8054 if (!isConstant && !usesOnlyOneValue) {
8055 SDValue Vec = DAG.getUNDEF(VT);
8056 for (unsigned i = 0 ; i < NumElts; ++i) {
8057 SDValue V = Op.getOperand(i);
8058 if (V.isUndef())
8059 continue;
8060 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8061 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8062 }
8063 return Vec;
8064 }
8065
8066 return SDValue();
8067}
8068
8069// Gather data to see if the operation can be modelled as a
8070// shuffle in combination with VEXTs.
8071SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8072 SelectionDAG &DAG) const {
8073 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8074 SDLoc dl(Op);
8075 EVT VT = Op.getValueType();
8076 unsigned NumElts = VT.getVectorNumElements();
8077
8078 struct ShuffleSourceInfo {
8079 SDValue Vec;
8080 unsigned MinElt = std::numeric_limits<unsigned>::max();
8081 unsigned MaxElt = 0;
8082
8083 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8084 // be compatible with the shuffle we intend to construct. As a result
8085 // ShuffleVec will be some sliding window into the original Vec.
8086 SDValue ShuffleVec;
8087
8088 // Code should guarantee that element i in Vec starts at element "WindowBase
8089 // + i * WindowScale" in ShuffleVec.
8090 int WindowBase = 0;
8091 int WindowScale = 1;
8092
8093 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8094
8095 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8096 };
8097
8098 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8099 // node.
8100 SmallVector<ShuffleSourceInfo, 2> Sources;
8101 for (unsigned i = 0; i < NumElts; ++i) {
8102 SDValue V = Op.getOperand(i);
8103 if (V.isUndef())
8104 continue;
8105 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8106 // A shuffle can only come from building a vector from various
8107 // elements of other vectors.
8108 return SDValue();
8109 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8110 // Furthermore, shuffles require a constant mask, whereas extractelts
8111 // accept variable indices.
8112 return SDValue();
8113 }
8114
8115 // Add this element source to the list if it's not already there.
8116 SDValue SourceVec = V.getOperand(0);
8117 auto Source = llvm::find(Sources, SourceVec);
8118 if (Source == Sources.end())
8119 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8120
8121 // Update the minimum and maximum lane number seen.
8122 unsigned EltNo = V.getConstantOperandVal(1);
8123 Source->MinElt = std::min(Source->MinElt, EltNo);
8124 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8125 }
8126
8127 // Currently only do something sane when at most two source vectors
8128 // are involved.
8129 if (Sources.size() > 2)
8130 return SDValue();
8131
8132 // Find out the smallest element size among result and two sources, and use
8133 // it as element size to build the shuffle_vector.
8134 EVT SmallestEltTy = VT.getVectorElementType();
8135 for (auto &Source : Sources) {
8136 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8137 if (SrcEltTy.bitsLT(SmallestEltTy))
8138 SmallestEltTy = SrcEltTy;
8139 }
8140 unsigned ResMultiplier =
8141 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8142 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8143 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8144
8145 // If the source vector is too wide or too narrow, we may nevertheless be able
8146 // to construct a compatible shuffle either by concatenating it with UNDEF or
8147 // extracting a suitable range of elements.
8148 for (auto &Src : Sources) {
8149 EVT SrcVT = Src.ShuffleVec.getValueType();
8150
8151 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8152 uint64_t VTSize = VT.getFixedSizeInBits();
8153 if (SrcVTSize == VTSize)
8154 continue;
8155
8156 // This stage of the search produces a source with the same element type as
8157 // the original, but with a total width matching the BUILD_VECTOR output.
8158 EVT EltVT = SrcVT.getVectorElementType();
8159 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8160 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8161
8162 if (SrcVTSize < VTSize) {
8163 if (2 * SrcVTSize != VTSize)
8164 return SDValue();
8165 // We can pad out the smaller vector for free, so if it's part of a
8166 // shuffle...
8167 Src.ShuffleVec =
8168 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8169 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8170 continue;
8171 }
8172
8173 if (SrcVTSize != 2 * VTSize)
8174 return SDValue();
8175
8176 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8177 // Span too large for a VEXT to cope
8178 return SDValue();
8179 }
8180
8181 if (Src.MinElt >= NumSrcElts) {
8182 // The extraction can just take the second half
8183 Src.ShuffleVec =
8184 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8185 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8186 Src.WindowBase = -NumSrcElts;
8187 } else if (Src.MaxElt < NumSrcElts) {
8188 // The extraction can just take the first half
8189 Src.ShuffleVec =
8190 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8191 DAG.getConstant(0, dl, MVT::i32));
8192 } else {
8193 // An actual VEXT is needed
8194 SDValue VEXTSrc1 =
8195 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8196 DAG.getConstant(0, dl, MVT::i32));
8197 SDValue VEXTSrc2 =
8198 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8199 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8200
8201 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8202 VEXTSrc2,
8203 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8204 Src.WindowBase = -Src.MinElt;
8205 }
8206 }
8207
8208 // Another possible incompatibility occurs from the vector element types. We
8209 // can fix this by bitcasting the source vectors to the same type we intend
8210 // for the shuffle.
8211 for (auto &Src : Sources) {
8212 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8213 if (SrcEltTy == SmallestEltTy)
8214 continue;
8215 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8216 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8217 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8218 Src.WindowBase *= Src.WindowScale;
8219 }
8220
8221 // Final check before we try to actually produce a shuffle.
8222 LLVM_DEBUG({
8223 for (auto Src : Sources)
8224 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8225 });
8226
8227 // The stars all align; our next step is to produce the mask for the shuffle.
8228 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8229 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8230 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8231 SDValue Entry = Op.getOperand(i);
8232 if (Entry.isUndef())
8233 continue;
8234
8235 auto Src = llvm::find(Sources, Entry.getOperand(0));
8236 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8237
8238 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8239 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8240 // segment.
8241 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8242 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8243 VT.getScalarSizeInBits());
8244 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8245
8246 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8247 // starting at the appropriate offset.
8248 int *LaneMask = &Mask[i * ResMultiplier];
8249
8250 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8251 ExtractBase += NumElts * (Src - Sources.begin());
8252 for (int j = 0; j < LanesDefined; ++j)
8253 LaneMask[j] = ExtractBase + j;
8254 }
8255
8256
8257 // We can't handle more than two sources. This should have already
8258 // been checked before this point.
8259 assert(Sources.size() <= 2 && "Too many sources!");
8260
8261 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8262 for (unsigned i = 0; i < Sources.size(); ++i)
8263 ShuffleOps[i] = Sources[i].ShuffleVec;
8264
8265 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8266 ShuffleOps[1], Mask, DAG);
8267 if (!Shuffle)
8268 return SDValue();
8269 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8270}
8271
8272 enum ShuffleOpCodes {
8273 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8274 OP_VREV,
8275 OP_VDUP0,
8276 OP_VDUP1,
8277 OP_VDUP2,
8278 OP_VDUP3,
8279 OP_VEXT1,
8280 OP_VEXT2,
8281 OP_VEXT3,
8282 OP_VUZPL, // VUZP, left result
8283 OP_VUZPR, // VUZP, right result
8284 OP_VZIPL, // VZIP, left result
8285 OP_VZIPR, // VZIP, right result
8286 OP_VTRNL, // VTRN, left result
8287 OP_VTRNR // VTRN, right result
8288 };
8289
8290static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8291 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8292 switch (OpNum) {
8293 case OP_COPY:
8294 case OP_VREV:
8295 case OP_VDUP0:
8296 case OP_VDUP1:
8297 case OP_VDUP2:
8298 case OP_VDUP3:
8299 return true;
8300 }
8301 return false;
8302}
8303
8304/// isShuffleMaskLegal - Targets can use this to indicate that they only
8305/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8306/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8307/// are assumed to be legal.
8308 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8309 if (VT.getVectorNumElements() == 4 &&
8310 (VT.is128BitVector() || VT.is64BitVector())) {
8311 unsigned PFIndexes[4];
8312 for (unsigned i = 0; i != 4; ++i) {
8313 if (M[i] < 0)
8314 PFIndexes[i] = 8;
8315 else
8316 PFIndexes[i] = M[i];
8317 }
8318
8319 // Compute the index in the perfect shuffle table.
8320 unsigned PFTableIndex =
8321 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8322 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8323 unsigned Cost = (PFEntry >> 30);
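// (Each PFIndexes[i] is in the range 0-8, with 8 standing for an undef lane,
// hence the base-9 packing used to index the perfect shuffle table.)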
8324
8325 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8326 return true;
8327 }
8328
8329 bool ReverseVEXT, isV_UNDEF;
8330 unsigned Imm, WhichResult;
8331
8332 unsigned EltSize = VT.getScalarSizeInBits();
8333 if (EltSize >= 32 ||
8334 ShuffleVectorSDNode::isSplatMask(M) ||
8335 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8336 isVREVMask(M, VT, 64) ||
8337 isVREVMask(M, VT, 32) ||
8338 isVREVMask(M, VT, 16))
8339 return true;
8340 else if (Subtarget->hasNEON() &&
8341 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8342 isVTBLMask(M, VT) ||
8343 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8344 return true;
8345 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8346 isReverseMask(M, VT))
8347 return true;
8348 else if (Subtarget->hasMVEIntegerOps() &&
8349 (isVMOVNMask(M, VT, true, false) ||
8350 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8351 return true;
8352 else if (Subtarget->hasMVEIntegerOps() &&
8353 (isTruncMask(M, VT, false, false) ||
8354 isTruncMask(M, VT, false, true) ||
8355 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8356 return true;
8357 else
8358 return false;
8359}
8360
8361/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8362/// the specified operations to build the shuffle.
8363static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8364 SDValue RHS, SelectionDAG &DAG,
8365 const SDLoc &dl) {
8366 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8367 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8368 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
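// (Decoded from PFEntry above: bits [31:30] hold the cost, [29:26] the
// shuffle opcode, [25:13] the LHS table entry and [12:0] the RHS entry.)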
8369
8370 if (OpNum == OP_COPY) {
8371 if (LHSID == (1*9+2)*9+3) return LHS;
8372 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8373 return RHS;
8374 }
8375
8376 SDValue OpLHS, OpRHS;
8377 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8378 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8379 EVT VT = OpLHS.getValueType();
8380
8381 switch (OpNum) {
8382 default: llvm_unreachable("Unknown shuffle opcode!");
8383 case OP_VREV:
8384 // VREV divides the vector in half and swaps within the half.
8385 if (VT.getScalarSizeInBits() == 32)
8386 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8387 // vrev <4 x i16> -> VREV32
8388 if (VT.getScalarSizeInBits() == 16)
8389 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8390 // vrev <4 x i8> -> VREV16
8391 assert(VT.getScalarSizeInBits() == 8);
8392 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8393 case OP_VDUP0:
8394 case OP_VDUP1:
8395 case OP_VDUP2:
8396 case OP_VDUP3:
8397 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8398 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8399 case OP_VEXT1:
8400 case OP_VEXT2:
8401 case OP_VEXT3:
8402 return DAG.getNode(ARMISD::VEXT, dl, VT,
8403 OpLHS, OpRHS,
8404 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8405 case OP_VUZPL:
8406 case OP_VUZPR:
8407 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8408 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8409 case OP_VZIPL:
8410 case OP_VZIPR:
8411 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8412 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8413 case OP_VTRNL:
8414 case OP_VTRNR:
8415 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8416 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8417 }
8418}
8419
8420 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8421 ArrayRef<int> ShuffleMask,
8422 SelectionDAG &DAG) {
8423 // Check to see if we can use the VTBL instruction.
8424 SDValue V1 = Op.getOperand(0);
8425 SDValue V2 = Op.getOperand(1);
8426 SDLoc DL(Op);
8427
8428 SmallVector<SDValue, 8> VTBLMask;
8429 for (int I : ShuffleMask)
8430 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8431
8432 if (V2.getNode()->isUndef())
8433 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8434 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8435
8436 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8437 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8438}
8439
8440 static SDValue LowerReverse_VECTOR_SHUFFLE_v16i8_v8i16(SDValue Op, SelectionDAG &DAG) {
8441 SDLoc DL(Op);
8442 EVT VT = Op.getValueType();
8443
8444 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8445 "Expect an v8i16/v16i8 type");
8446 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8447 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8448 // extract the first 8 bytes into the top double word and the last 8 bytes
8449 // into the bottom double word, through a new vector shuffle that will be
8450 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8451 std::vector<int> NewMask;
8452 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8453 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8454 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8455 NewMask.push_back(i);
8456 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8457}
8458
8459 static EVT getVectorTyFromPredicateVector(EVT VT) {
8460 switch (VT.getSimpleVT().SimpleTy) {
8461 case MVT::v2i1:
8462 return MVT::v2f64;
8463 case MVT::v4i1:
8464 return MVT::v4i32;
8465 case MVT::v8i1:
8466 return MVT::v8i16;
8467 case MVT::v16i1:
8468 return MVT::v16i8;
8469 default:
8470 llvm_unreachable("Unexpected vector predicate type");
8471 }
8472}
8473
8474static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8475 SelectionDAG &DAG) {
8476 // Converting from boolean predicates to integers involves creating a vector
8477 // of all ones or all zeroes and selecting the lanes based upon the real
8478 // predicate.
8479 SDValue AllOnes =
8480 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8481 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8482
8483 SDValue AllZeroes =
8484 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8485 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8486
8487 // Get full vector type from predicate type
8489 EVT NewVT = getVectorTyFromPredicateVector(VT);
8490 SDValue RecastV1;
8491 // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
8492 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8493 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8494 // since we know in hardware the sizes are really the same.
8495 if (VT != MVT::v16i1)
8496 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8497 else
8498 RecastV1 = Pred;
8499
8500 // Select either all ones or zeroes depending upon the real predicate bits.
8501 SDValue PredAsVector =
8502 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8503
8504 // Recast our new predicate-as-integer v16i8 vector into something
8505 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8506 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8507}
8508
8509static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8510 const ARMSubtarget *ST) {
8511 EVT VT = Op.getValueType();
8512 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8513 ArrayRef<int> ShuffleMask = SVN->getMask();
8514
8515 assert(ST->hasMVEIntegerOps() &&
8516 "No support for vector shuffle of boolean predicates");
8517
8518 SDValue V1 = Op.getOperand(0);
8519 SDValue V2 = Op.getOperand(1);
8520 SDLoc dl(Op);
8521 if (isReverseMask(ShuffleMask, VT)) {
8522 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8523 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8524 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8525 DAG.getConstant(16, dl, MVT::i32));
8526 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8527 }
8528
8529 // Until we can come up with optimised cases for every single vector
8530 // shuffle in existence we have chosen the least painful strategy. This is
8531 // to essentially promote the boolean predicate to an 8-bit integer, where
8532 // each predicate represents a byte. Then we fall back on a normal integer
8533 // vector shuffle and convert the result back into a predicate vector. In
8534 // many cases the generated code might be even better than scalar code
8535 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8536 // fields in a register into 8 other arbitrary 2-bit fields!
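  // Illustrative sketch: a v4i1 predicate is promoted to a v4i32 whose lanes
  // are all-ones or all-zeroes, that v4i32 is shuffled as a normal vector,
  // and a VCMPZ(NE) against zero below turns the result back into a v4i1.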
8537 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8538 EVT NewVT = PredAsVector1.getValueType();
8539 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8540 : PromoteMVEPredVector(dl, V2, VT, DAG);
8541 assert(PredAsVector2.getValueType() == NewVT &&
8542 "Expected identical vector type in expanded i1 shuffle!");
8543
8544 // Do the shuffle!
8545 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8546 PredAsVector2, ShuffleMask);
8547
8548 // Now return the result of comparing the shuffled vector with zero,
8549 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8550 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8551 if (VT == MVT::v2i1) {
8552 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8553 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8554 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8555 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8556 }
8557 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8558 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8559}
8560
8561static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8562 ArrayRef<int> ShuffleMask,
8563 SelectionDAG &DAG) {
8564 // Attempt to lower the vector shuffle using as many whole register movs as
8565 // possible. This is useful for types smaller than 32 bits, which would
8566 // often otherwise become a series of GPR movs.
8567 SDLoc dl(Op);
8568 EVT VT = Op.getValueType();
8569 if (VT.getScalarSizeInBits() >= 32)
8570 return SDValue();
8571
8572 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8573 "Unexpected vector type");
8574 int NumElts = VT.getVectorNumElements();
8575 int QuarterSize = NumElts / 4;
8576 // The four final parts of the vector, as i32's
8577 SDValue Parts[4];
8578
8579 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8580 // <u,u,u,u>), returning the vmov lane index
8581 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8582 // Detect which mov lane this would be from the first non-undef element.
8583 int MovIdx = -1;
8584 for (int i = 0; i < Length; i++) {
8585 if (ShuffleMask[Start + i] >= 0) {
8586 if (ShuffleMask[Start + i] % Length != i)
8587 return -1;
8588 MovIdx = ShuffleMask[Start + i] / Length;
8589 break;
8590 }
8591 }
8592 // If all items are undef, leave this for other combines
8593 if (MovIdx == -1)
8594 return -1;
8595 // Check the remaining values are the correct part of the same mov
8596 for (int i = 1; i < Length; i++) {
8597 if (ShuffleMask[Start + i] >= 0 &&
8598 (ShuffleMask[Start + i] / Length != MovIdx ||
8599 ShuffleMask[Start + i] % Length != i))
8600 return -1;
8601 }
8602 return MovIdx;
8603 };
8604
8605 for (int Part = 0; Part < 4; ++Part) {
8606 // Does this part look like a mov
8607 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8608 if (Elt != -1) {
8609 SDValue Input = Op->getOperand(0);
8610 if (Elt >= 4) {
8611 Input = Op->getOperand(1);
8612 Elt -= 4;
8613 }
8614 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8615 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8616 DAG.getConstant(Elt, dl, MVT::i32));
8617 }
8618 }
8619
8620 // Nothing interesting found, just return
8621 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8622 return SDValue();
8623
8624 // The other parts need to be built with the old shuffle vector, cast to a
8625 // v4i32 and extract_vector_elts
8626 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8627 SmallVector<int, 16> NewShuffleMask;
8628 for (int Part = 0; Part < 4; ++Part)
8629 for (int i = 0; i < QuarterSize; i++)
8630 NewShuffleMask.push_back(
8631 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8632 SDValue NewShuffle = DAG.getVectorShuffle(
8633 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8634 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8635
8636 for (int Part = 0; Part < 4; ++Part)
8637 if (!Parts[Part])
8638 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8639 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8640 }
8641 // Build a vector out of the various parts and bitcast it back to the original
8642 // type.
8643 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8644 return DAG.getBitcast(VT, NewVec);
8645}
8646
8647static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8648 ArrayRef<int> ShuffleMask,
8649 SelectionDAG &DAG) {
8650 SDValue V1 = Op.getOperand(0);
8651 SDValue V2 = Op.getOperand(1);
8652 EVT VT = Op.getValueType();
8653 unsigned NumElts = VT.getVectorNumElements();
8654
8655 // A one-off identity mask is one that is mostly an identity mask from a
8656 // single source but contains a single element out-of-place, either from a
8657 // different vector or from another position in the same vector. As opposed to
8658 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8659 // pair directly.
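  // Illustrative example: the mask <0,1,2,7> on v4i32 inputs is an identity
  // of V1 except for lane 3, so it becomes
  // insert_vector_elt(V1, extract_vector_elt(V2, 3), 3).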
8660 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8661 int &OffElement) {
8662 OffElement = -1;
8663 int NonUndef = 0;
8664 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8665 if (Mask[i] == -1)
8666 continue;
8667 NonUndef++;
8668 if (Mask[i] != i + BaseOffset) {
8669 if (OffElement == -1)
8670 OffElement = i;
8671 else
8672 return false;
8673 }
8674 }
8675 return NonUndef > 2 && OffElement != -1;
8676 };
8677 int OffElement;
8678 SDValue VInput;
8679 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8680 VInput = V1;
8681 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8682 VInput = V2;
8683 else
8684 return SDValue();
8685
8686 SDLoc dl(Op);
8687 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8688 ? MVT::i32
8689 : VT.getScalarType();
8690 SDValue Elt = DAG.getNode(
8691 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8692 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8693 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8694 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8695 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8696}
8697
8698static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8699 const ARMSubtarget *ST) {
8700 SDValue V1 = Op.getOperand(0);
8701 SDValue V2 = Op.getOperand(1);
8702 SDLoc dl(Op);
8703 EVT VT = Op.getValueType();
8704 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8705 unsigned EltSize = VT.getScalarSizeInBits();
8706
8707 if (ST->hasMVEIntegerOps() && EltSize == 1)
8708 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8709
8710 // Convert shuffles that are directly supported on NEON to target-specific
8711 // DAG nodes, instead of keeping them as shuffles and matching them again
8712 // during code selection. This is more efficient and avoids the possibility
8713 // of inconsistencies between legalization and selection.
8714 // FIXME: floating-point vectors should be canonicalized to integer vectors
8715 // of the same size so that they get CSEd properly.
8716 ArrayRef<int> ShuffleMask = SVN->getMask();
8717
8718 if (EltSize <= 32) {
8719 if (SVN->isSplat()) {
8720 int Lane = SVN->getSplatIndex();
8721 // If this is undef splat, generate it via "just" vdup, if possible.
8722 if (Lane == -1) Lane = 0;
8723
8724 // Test if V1 is a SCALAR_TO_VECTOR.
8725 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8726 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8727 }
8728 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8729 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8730 // reaches it).
8731 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8732 !isa<ConstantSDNode>(V1.getOperand(0))) {
8733 bool IsScalarToVector = true;
8734 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8735 if (!V1.getOperand(i).isUndef()) {
8736 IsScalarToVector = false;
8737 break;
8738 }
8739 if (IsScalarToVector)
8740 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8741 }
8742 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8743 DAG.getConstant(Lane, dl, MVT::i32));
8744 }
8745
8746 bool ReverseVEXT = false;
8747 unsigned Imm = 0;
8748 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8749 if (ReverseVEXT)
8750 std::swap(V1, V2);
8751 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8752 DAG.getConstant(Imm, dl, MVT::i32));
8753 }
8754
8755 if (isVREVMask(ShuffleMask, VT, 64))
8756 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8757 if (isVREVMask(ShuffleMask, VT, 32))
8758 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8759 if (isVREVMask(ShuffleMask, VT, 16))
8760 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8761
8762 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8763 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8764 DAG.getConstant(Imm, dl, MVT::i32));
8765 }
8766
8767 // Check for Neon shuffles that modify both input vectors in place.
8768 // If both results are used, i.e., if there are two shuffles with the same
8769 // source operands and with masks corresponding to both results of one of
8770 // these operations, DAG memoization will ensure that a single node is
8771 // used for both shuffles.
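    // Illustrative example: VZIP on <a0,a1,a2,a3> and <b0,b1,b2,b3> produces
    // the pair <a0,b0,a1,b1> and <a2,b2,a3,b3>; a mask matching either half
    // selects the corresponding result value of the two-result node.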
8772 unsigned WhichResult = 0;
8773 bool isV_UNDEF = false;
8774 if (ST->hasNEON()) {
8775 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8776 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8777 if (isV_UNDEF)
8778 V2 = V1;
8779 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8780 .getValue(WhichResult);
8781 }
8782 }
8783 if (ST->hasMVEIntegerOps()) {
8784 if (isVMOVNMask(ShuffleMask, VT, false, false))
8785 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8786 DAG.getConstant(0, dl, MVT::i32));
8787 if (isVMOVNMask(ShuffleMask, VT, true, false))
8788 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8789 DAG.getConstant(1, dl, MVT::i32));
8790 if (isVMOVNMask(ShuffleMask, VT, true, true))
8791 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8792 DAG.getConstant(1, dl, MVT::i32));
8793 }
8794
8795 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8796 // shuffles that produce a result larger than their operands with:
8797 // shuffle(concat(v1, undef), concat(v2, undef))
8798 // ->
8799 // shuffle(concat(v1, v2), undef)
8800 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8801 //
8802 // This is useful in the general case, but there are special cases where
8803 // native shuffles produce larger results: the two-result ops.
8804 //
8805 // Look through the concat when lowering them:
8806 // shuffle(concat(v1, v2), undef)
8807 // ->
8808 // concat(VZIP(v1, v2):0, :1)
8809 //
8810 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8811 SDValue SubV1 = V1->getOperand(0);
8812 SDValue SubV2 = V1->getOperand(1);
8813 EVT SubVT = SubV1.getValueType();
8814
8815 // We expect these to have been canonicalized to -1.
8816 assert(llvm::all_of(ShuffleMask, [&](int i) {
8817 return i < (int)VT.getVectorNumElements();
8818 }) && "Unexpected shuffle index into UNDEF operand!");
8819
8820 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8821 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8822 if (isV_UNDEF)
8823 SubV2 = SubV1;
8824 assert((WhichResult == 0) &&
8825 "In-place shuffle of concat can only have one result!");
8826 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8827 SubV1, SubV2);
8828 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8829 Res.getValue(1));
8830 }
8831 }
8832 }
8833
8834 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8835 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8836 return V;
8837
8838 for (bool Top : {false, true}) {
8839 for (bool SingleSource : {false, true}) {
8840 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8841 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8842 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8843 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8844 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8845 SingleSource ? V1 : V2);
8846 if (Top) {
8847 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8848 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8849 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8850 }
8851 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8852 }
8853 }
8854 }
8855 }
8856
8857 // If the shuffle is not directly supported and it has 4 elements, use
8858 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8859 unsigned NumElts = VT.getVectorNumElements();
8860 if (NumElts == 4) {
8861 unsigned PFIndexes[4];
8862 for (unsigned i = 0; i != 4; ++i) {
8863 if (ShuffleMask[i] < 0)
8864 PFIndexes[i] = 8;
8865 else
8866 PFIndexes[i] = ShuffleMask[i];
8867 }
8868
8869 // Compute the index in the perfect shuffle table.
8870 unsigned PFTableIndex =
8871 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8872 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8873 unsigned Cost = (PFEntry >> 30);
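    // Illustrative example: the mask <0,1,4,5> gives PFTableIndex
    // 0*729 + 1*81 + 4*9 + 5 = 122; the top two bits of the table entry hold
    // the cost and the remaining bits encode the two operand shuffles that
    // GeneratePerfectShuffle recursively expands.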
8874
8875 if (Cost <= 4) {
8876 if (ST->hasNEON())
8877 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8878 else if (isLegalMVEShuffleOp(PFEntry)) {
8879 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8880 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8881 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8882 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8883 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8884 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8885 }
8886 }
8887 }
8888
8889 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8890 if (EltSize >= 32) {
8891 // Do the expansion with floating-point types, since that is what the VFP
8892 // registers are defined to use, and since i64 is not legal.
8893 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8894 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8895 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8896 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8897 SmallVector<SDValue, 8> Ops;
8898 for (unsigned i = 0; i < NumElts; ++i) {
8899 if (ShuffleMask[i] < 0)
8900 Ops.push_back(DAG.getUNDEF(EltVT));
8901 else
8902 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8903 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8904 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8905 dl, MVT::i32)));
8906 }
8907 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8908 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8909 }
8910
8911 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8912 isReverseMask(ShuffleMask, VT))
8913 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8914
8915 if (ST->hasNEON() && VT == MVT::v8i8)
8916 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8917 return NewOp;
8918
8919 if (ST->hasMVEIntegerOps())
8920 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8921 return NewOp;
8922
8923 return SDValue();
8924}
8925
8926static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8927 const ARMSubtarget *ST) {
8928 EVT VecVT = Op.getOperand(0).getValueType();
8929 SDLoc dl(Op);
8930
8931 assert(ST->hasMVEIntegerOps() &&
8932 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8933
8934 SDValue Conv =
8935 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8936 unsigned Lane = Op.getConstantOperandVal(2);
8937 unsigned LaneWidth =
8938 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8939 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8940 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8941 Op.getOperand(1), DAG.getValueType(MVT::i1));
8942 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8943 DAG.getConstant(~Mask, dl, MVT::i32));
8944 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8945}
8946
8947SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8948 SelectionDAG &DAG) const {
8949 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8950 SDValue Lane = Op.getOperand(2);
8951 if (!isa<ConstantSDNode>(Lane))
8952 return SDValue();
8953
8954 SDValue Elt = Op.getOperand(1);
8955 EVT EltVT = Elt.getValueType();
8956
8957 if (Subtarget->hasMVEIntegerOps() &&
8958 Op.getValueType().getScalarSizeInBits() == 1)
8959 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8960
8961 if (getTypeAction(*DAG.getContext(), EltVT) ==
8962 TargetLowering::TypePromoteFloat) {
8963 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8964 // but the type system will try to do that if we don't intervene.
8965 // Reinterpret any such vector-element insertion as one with the
8966 // corresponding integer types.
8967
8968 SDLoc dl(Op);
8969
8970 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8971 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8972 TargetLowering::TypePromoteFloat);
8973
8974 SDValue VecIn = Op.getOperand(0);
8975 EVT VecVT = VecIn.getValueType();
8976 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8977 VecVT.getVectorNumElements());
8978
8979 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8980 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8981 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8982 IVecIn, IElt, Lane);
8983 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8984 }
8985
8986 return Op;
8987}
8988
8989static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8990 const ARMSubtarget *ST) {
8991 EVT VecVT = Op.getOperand(0).getValueType();
8992 SDLoc dl(Op);
8993
8994 assert(ST->hasMVEIntegerOps() &&
8995 "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
8996
8997 SDValue Conv =
8998 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8999 unsigned Lane = Op.getConstantOperandVal(1);
9000 unsigned LaneWidth =
9001 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9002 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9003 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9004 return Shift;
9005}
9006
9007static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9008 const ARMSubtarget *ST) {
9009 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9010 SDValue Lane = Op.getOperand(1);
9011 if (!isa<ConstantSDNode>(Lane))
9012 return SDValue();
9013
9014 SDValue Vec = Op.getOperand(0);
9015 EVT VT = Vec.getValueType();
9016
9017 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9018 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9019
9020 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9021 SDLoc dl(Op);
9022 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9023 }
9024
9025 return Op;
9026}
9027
9028static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9029 const ARMSubtarget *ST) {
9030 SDLoc dl(Op);
9031 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9032 "Unexpected custom CONCAT_VECTORS lowering");
9033 assert(isPowerOf2_32(Op.getNumOperands()) &&
9034 "Unexpected custom CONCAT_VECTORS lowering");
9035 assert(ST->hasMVEIntegerOps() &&
9036 "CONCAT_VECTORS lowering only supported for MVE");
9037
9038 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9039 EVT Op1VT = V1.getValueType();
9040 EVT Op2VT = V2.getValueType();
9041 assert(Op1VT == Op2VT && "Operand types don't match!");
9042 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9043 "Unexpected i1 concat operations!");
9044 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9045
9046 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9047 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9048
9049 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9050 // promoted to v8i16, etc.
9051 MVT ElType =
9052 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9053 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9054
9055 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9056 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9057 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9058 // ConcatVT.
9059 SDValue ConVec =
9060 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9061 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9062 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9063 }
9064
9065 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9066 // to be the right size for the destination. For example, if Op1 is v4i1
9067 // then the promoted vector is v4i32. The result of concatenation gives a
9068 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9069 // needs truncating to i16 and inserting in the result.
9070 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9071 EVT NewVT = NewV.getValueType();
9072 EVT ConcatVT = ConVec.getValueType();
9073 unsigned ExtScale = 1;
9074 if (NewVT == MVT::v2f64) {
9075 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9076 ExtScale = 2;
9077 }
9078 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9079 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9080 DAG.getIntPtrConstant(i * ExtScale, dl));
9081 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9082 DAG.getConstant(j, dl, MVT::i32));
9083 }
9084 return ConVec;
9085 };
9086 unsigned j = 0;
9087 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9088 ConVec = ExtractInto(NewV1, ConVec, j);
9089 ConVec = ExtractInto(NewV2, ConVec, j);
9090
9091 // Now return the result of comparing the subvector with zero, which will
9092 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9093 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9094 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9095 };
9096
9097 // Concat each pair of subvectors and pack into the lower half of the array.
9098 SmallVector<SDValue> ConcatOps(Op->ops());
9099 while (ConcatOps.size() > 1) {
9100 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9101 SDValue V1 = ConcatOps[I];
9102 SDValue V2 = ConcatOps[I + 1];
9103 ConcatOps[I / 2] = ConcatPair(V1, V2);
9104 }
9105 ConcatOps.resize(ConcatOps.size() / 2);
9106 }
9107 return ConcatOps[0];
9108}
9109
9110static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9111 const ARMSubtarget *ST) {
9112 EVT VT = Op->getValueType(0);
9113 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9114 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9115
9116 // The only time a CONCAT_VECTORS operation can have legal types is when
9117 // two 64-bit vectors are concatenated to a 128-bit vector.
9118 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9119 "unexpected CONCAT_VECTORS");
9120 SDLoc dl(Op);
9121 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9122 SDValue Op0 = Op.getOperand(0);
9123 SDValue Op1 = Op.getOperand(1);
9124 if (!Op0.isUndef())
9125 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9126 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9127 DAG.getIntPtrConstant(0, dl));
9128 if (!Op1.isUndef())
9129 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9130 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9131 DAG.getIntPtrConstant(1, dl));
9132 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9133}
9134
9135static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9136 const ARMSubtarget *ST) {
9137 SDValue V1 = Op.getOperand(0);
9138 SDValue V2 = Op.getOperand(1);
9139 SDLoc dl(Op);
9140 EVT VT = Op.getValueType();
9141 EVT Op1VT = V1.getValueType();
9142 unsigned NumElts = VT.getVectorNumElements();
9143 unsigned Index = V2->getAsZExtVal();
9144
9145 assert(VT.getScalarSizeInBits() == 1 &&
9146 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9147 assert(ST->hasMVEIntegerOps() &&
9148 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9149
9150 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9151
9152 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9153 // promoted to v8i16, etc.
9154
9155 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9156
9157 if (NumElts == 2) {
9158 EVT SubVT = MVT::v4i32;
9159 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9160 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9161 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9162 DAG.getIntPtrConstant(i, dl));
9163 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9164 DAG.getConstant(j, dl, MVT::i32));
9165 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9166 DAG.getConstant(j + 1, dl, MVT::i32));
9167 }
9168 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9169 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9170 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9171 }
9172
9173 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9174 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9175 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9176 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9177 DAG.getIntPtrConstant(i, dl));
9178 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9179 DAG.getConstant(j, dl, MVT::i32));
9180 }
9181
9182 // Now return the result of comparing the subvector with zero,
9183 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9184 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9185 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9186}
9187
9188// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9189static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9190 const ARMSubtarget *ST) {
9191 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9192 EVT VT = N->getValueType(0);
9193 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9194 "Expected a vector i1 type!");
9195 SDValue Op = N->getOperand(0);
9196 EVT FromVT = Op.getValueType();
9197 SDLoc DL(N);
9198
9199 SDValue And =
9200 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9201 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9202 DAG.getCondCode(ISD::SETNE));
9203}
9204
9205static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9206 const ARMSubtarget *Subtarget) {
9207 if (!Subtarget->hasMVEIntegerOps())
9208 return SDValue();
9209
9210 EVT ToVT = N->getValueType(0);
9211 if (ToVT.getScalarType() == MVT::i1)
9212 return LowerTruncatei1(N, DAG, Subtarget);
9213
9214 // MVE does not have a single instruction to perform the truncation of a v4i32
9215 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9216 // Most of the instructions in MVE follow the 'Beats' system, where moving
9217 // values from different lanes is usually something that the instructions
9218 // avoid.
9219 //
9220 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9221 // which take the top/bottom half of a larger lane and extend it (or do the
9222 // opposite, truncating into the top/bottom lane from a larger lane). Note
9223 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9224 // bottom 16 bits from each vector lane. This works really well with T/B
9225 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9226 // to change order.
9227 //
9228 // But truncates and sext/zext are always going to be fairly common from llvm.
9229 // We have several options for how to deal with them:
9230 // - Wherever possible combine them into an instruction that makes them
9231 // "free". This includes loads/stores, which can perform the trunc as part
9232 // of the memory operation. Or certain shuffles that can be turned into
9233 // VMOVN/VMOVL.
9234 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9235 // trunc(mul(sext(a), sext(b))) may become
9236 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9237 // this case can use VMULL). This is performed in the
9238 // MVELaneInterleavingPass.
9239 // - Otherwise we have an option. By default we would expand the
9240 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9241 // registers. One for each vector lane in the vector. This can obviously be
9242 // very expensive.
9243 // - The other option is to use the fact that loads/store can extend/truncate
9244 // to turn a trunc into two truncating stack stores and a stack reload. This
9245 // becomes 3 back-to-back memory operations, but at least that is less than
9246 // all the insert/extracts.
9247 //
9248 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9249 // are either optimized where they can be, or eventually lowered into stack
9250 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9251 // too early, where other instructions would be better, and stops us from
9252 // having to reconstruct multiple buildvector shuffles into loads/stores.
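  // Illustrative example: a trunc of v8i32 to v8i16 is split below into two
  // v4i32 halves and emitted as MVETRUNC(Lo, Hi), which can later be matched
  // to VMOVNB/VMOVNT pairs or, failing that, lowered via the stack.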
9253 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9254 return SDValue();
9255 EVT FromVT = N->getOperand(0).getValueType();
9256 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9257 return SDValue();
9258
9259 SDValue Lo, Hi;
9260 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9261 SDLoc DL(N);
9262 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9263}
9264
9265static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9266 const ARMSubtarget *Subtarget) {
9267 if (!Subtarget->hasMVEIntegerOps())
9268 return SDValue();
9269
9270 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9271
9272 EVT ToVT = N->getValueType(0);
9273 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9274 return SDValue();
9275 SDValue Op = N->getOperand(0);
9276 EVT FromVT = Op.getValueType();
9277 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9278 return SDValue();
9279
9280 SDLoc DL(N);
9281 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9282 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9283 ExtVT = MVT::v8i16;
9284
9285 unsigned Opcode =
9286 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9287 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9288 SDValue Ext1 = Ext.getValue(1);
9289
9290 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9291 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9292 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9293 }
9294
9295 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9296}
9297
9298/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9299/// element has been zero/sign-extended, depending on the isSigned parameter,
9300/// from an integer type half its size.
9301static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9302 bool isSigned) {
9303 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9304 EVT VT = N->getValueType(0);
9305 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9306 SDNode *BVN = N->getOperand(0).getNode();
9307 if (BVN->getValueType(0) != MVT::v4i32 ||
9308 BVN->getOpcode() != ISD::BUILD_VECTOR)
9309 return false;
9310 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9311 unsigned HiElt = 1 - LoElt;
9312 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9313 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9314 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9315 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9316 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9317 return false;
9318 if (isSigned) {
9319 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9320 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9321 return true;
9322 } else {
9323 if (Hi0->isZero() && Hi1->isZero())
9324 return true;
9325 }
9326 return false;
9327 }
9328
9329 if (N->getOpcode() != ISD::BUILD_VECTOR)
9330 return false;
9331
9332 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9333 SDNode *Elt = N->getOperand(i).getNode();
9334 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9335 unsigned EltSize = VT.getScalarSizeInBits();
9336 unsigned HalfSize = EltSize / 2;
9337 if (isSigned) {
9338 if (!isIntN(HalfSize, C->getSExtValue()))
9339 return false;
9340 } else {
9341 if (!isUIntN(HalfSize, C->getZExtValue()))
9342 return false;
9343 }
9344 continue;
9345 }
9346 return false;
9347 }
9348
9349 return true;
9350}
9351
9352/// isSignExtended - Check if a node is a vector value that is sign-extended
9353/// or a constant BUILD_VECTOR with sign-extended elements.
9354static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9355 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9356 return true;
9357 if (isExtendedBUILD_VECTOR(N, DAG, true))
9358 return true;
9359 return false;
9360}
9361
9362/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9363/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9364static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9365 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9366 ISD::isZEXTLoad(N))
9367 return true;
9368 if (isExtendedBUILD_VECTOR(N, DAG, false))
9369 return true;
9370 return false;
9371}
9372
9373static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9374 if (OrigVT.getSizeInBits() >= 64)
9375 return OrigVT;
9376
9377 assert(OrigVT.isSimple() && "Expecting a simple value type");
9378
9379 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9380 switch (OrigSimpleTy) {
9381 default: llvm_unreachable("Unexpected Vector Type");
9382 case MVT::v2i8:
9383 case MVT::v2i16:
9384 return MVT::v2i32;
9385 case MVT::v4i8:
9386 return MVT::v4i16;
9387 }
9388}
9389
9390/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9391/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9392/// We insert the required extension here to get the vector to fill a D register.
9393static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9394 const EVT &OrigTy,
9395 const EVT &ExtTy,
9396 unsigned ExtOpcode) {
9397 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9398 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9399 // 64-bits we need to insert a new extension so that it will be 64-bits.
9400 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9401 if (OrigTy.getSizeInBits() >= 64)
9402 return N;
9403
9404 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9405 EVT NewVT = getExtensionTo64Bits(OrigTy);
9406
9407 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9408}
9409
9410/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9411/// does not do any sign/zero extension. If the original vector is less
9412/// than 64 bits, an appropriate extension will be added after the load to
9413/// reach a total size of 64 bits. We have to add the extension separately
9414/// because ARM does not have a sign/zero extending load for vectors.
9415static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9416 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9417
9418 // The load already has the right type.
9419 if (ExtendedTy == LD->getMemoryVT())
9420 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9421 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9422 LD->getMemOperand()->getFlags());
9423
9424 // We need to create a zextload/sextload. We cannot just create a load
9425 // followed by a zext/zext node because LowerMUL is also run during normal
9426 // operation legalization where we can't create illegal types.
9427 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9428 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9429 LD->getMemoryVT(), LD->getAlign(),
9430 LD->getMemOperand()->getFlags());
9431}
9432
9433/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9434/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9435/// the unextended value. The unextended vector should be 64 bits so that it can
9436/// be used as an operand to a VMULL instruction. If the original vector size
9437 /// before extension is less than 64 bits we add an extension to resize
9438/// the vector to 64 bits.
9439static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9440 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9441 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9442 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9443 N->getOperand(0)->getValueType(0),
9444 N->getValueType(0),
9445 N->getOpcode());
9446
9447 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9448 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9449 "Expected extending load");
9450
9451 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9452 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9453 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9454 SDValue extLoad =
9455 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9456 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9457
9458 return newLoad;
9459 }
9460
9461 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9462 // have been legalized as a BITCAST from v4i32.
9463 if (N->getOpcode() == ISD::BITCAST) {
9464 SDNode *BVN = N->getOperand(0).getNode();
9465 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9466 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9467 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9468 return DAG.getBuildVector(
9469 MVT::v2i32, SDLoc(N),
9470 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9471 }
9472 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9473 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9474 EVT VT = N->getValueType(0);
9475 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9476 unsigned NumElts = VT.getVectorNumElements();
9477 MVT TruncVT = MVT::getIntegerVT(EltSize);
9478 SmallVector<SDValue, 8> Ops;
9479 SDLoc dl(N);
9480 for (unsigned i = 0; i != NumElts; ++i) {
9481 const APInt &CInt = N->getConstantOperandAPInt(i);
9482 // Element types smaller than 32 bits are not legal, so use i32 elements.
9483 // The values are implicitly truncated so sext vs. zext doesn't matter.
9484 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9485 }
9486 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9487}
9488
9489static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9490 unsigned Opcode = N->getOpcode();
9491 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9492 SDNode *N0 = N->getOperand(0).getNode();
9493 SDNode *N1 = N->getOperand(1).getNode();
9494 return N0->hasOneUse() && N1->hasOneUse() &&
9495 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9496 }
9497 return false;
9498}
9499
9500static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9501 unsigned Opcode = N->getOpcode();
9502 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9503 SDNode *N0 = N->getOperand(0).getNode();
9504 SDNode *N1 = N->getOperand(1).getNode();
9505 return N0->hasOneUse() && N1->hasOneUse() &&
9506 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9507 }
9508 return false;
9509}
9510
9511static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9512 // Multiplications are only custom-lowered for 128-bit vectors so that
9513 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9514 EVT VT = Op.getValueType();
9515 assert(VT.is128BitVector() && VT.isInteger() &&
9516 "unexpected type for custom-lowering ISD::MUL");
9517 SDNode *N0 = Op.getOperand(0).getNode();
9518 SDNode *N1 = Op.getOperand(1).getNode();
9519 unsigned NewOpc = 0;
9520 bool isMLA = false;
9521 bool isN0SExt = isSignExtended(N0, DAG);
9522 bool isN1SExt = isSignExtended(N1, DAG);
9523 if (isN0SExt && isN1SExt)
9524 NewOpc = ARMISD::VMULLs;
9525 else {
9526 bool isN0ZExt = isZeroExtended(N0, DAG);
9527 bool isN1ZExt = isZeroExtended(N1, DAG);
9528 if (isN0ZExt && isN1ZExt)
9529 NewOpc = ARMISD::VMULLu;
9530 else if (isN1SExt || isN1ZExt) {
9531 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9532 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9533 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9534 NewOpc = ARMISD::VMULLs;
9535 isMLA = true;
9536 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9537 NewOpc = ARMISD::VMULLu;
9538 isMLA = true;
9539 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9540 std::swap(N0, N1);
9541 NewOpc = ARMISD::VMULLu;
9542 isMLA = true;
9543 }
9544 }
9545
9546 if (!NewOpc) {
9547 if (VT == MVT::v2i64)
9548 // Fall through to expand this. It is not legal.
9549 return SDValue();
9550 else
9551 // Other vector multiplications are legal.
9552 return Op;
9553 }
9554 }
9555
9556 // Legalize to a VMULL instruction.
9557 SDLoc DL(Op);
9558 SDValue Op0;
9559 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9560 if (!isMLA) {
9561 Op0 = SkipExtensionForVMULL(N0, DAG);
9562 assert(Op0.getValueType().is64BitVector() &&
9563 Op1.getValueType().is64BitVector() &&
9564 "unexpected types for extended operands to VMULL");
9565 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9566 }
9567
9568 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9569 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9570 // vmull q0, d4, d6
9571 // vmlal q0, d5, d6
9572 // is faster than
9573 // vaddl q0, d4, d5
9574 // vmovl q1, d6
9575 // vmul q0, q0, q1
9576 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9577 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9578 EVT Op1VT = Op1.getValueType();
9579 return DAG.getNode(N0->getOpcode(), DL, VT,
9580 DAG.getNode(NewOpc, DL, VT,
9581 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9582 DAG.getNode(NewOpc, DL, VT,
9583 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9584}
9585
9586static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9587 SelectionDAG &DAG) {
9588 // TODO: Should this propagate fast-math-flags?
9589
9590 // Convert to float
9591 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9592 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9593 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9594 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9595 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9596 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9597 // Get reciprocal estimate.
9598 // float4 recip = vrecpeq_f32(yf);
9599 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9600 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9601 Y);
9602 // Because char has a smaller range than uchar, we can actually get away
9603 // without any newton steps. This requires that we use a weird bias
9604 // of 0xb000, however (again, this has been exhaustively tested).
9605 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9606 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9607 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9608 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9609 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9610 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9611 // Convert back to short.
9612 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9613 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9614 return X;
9615}
9616
9617static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9618 SelectionDAG &DAG) {
9619 // TODO: Should this propagate fast-math-flags?
9620
9621 SDValue N2;
9622 // Convert to float.
9623 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9624 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9625 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9626 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9627 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9628 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9629
9630 // Use reciprocal estimate and one refinement step.
9631 // float4 recip = vrecpeq_f32(yf);
9632 // recip *= vrecpsq_f32(yf, recip);
9633 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9634 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9635 N1);
9636 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9637 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9638 N1, N2);
9639 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9640 // Because short has a smaller range than ushort, we can actually get away
9641 // with only a single newton step. This requires that we use a weird bias
9642 // of 0x89, however (again, this has been exhaustively tested).
9643 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9644 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9645 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9646 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9647 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9648 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9649 // Convert back to integer and return.
9650 // return vmovn_s32(vcvt_s32_f32(result));
9651 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9652 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9653 return N0;
9654}
9655
9656static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9657 const ARMSubtarget *ST) {
9658 EVT VT = Op.getValueType();
9659 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9660 "unexpected type for custom-lowering ISD::SDIV");
9661
9662 SDLoc dl(Op);
9663 SDValue N0 = Op.getOperand(0);
9664 SDValue N1 = Op.getOperand(1);
9665 SDValue N2, N3;
9666
9667 if (VT == MVT::v8i8) {
9668 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9669 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9670
9671 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9672 DAG.getIntPtrConstant(4, dl));
9673 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9674 DAG.getIntPtrConstant(4, dl));
9675 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9676 DAG.getIntPtrConstant(0, dl));
9677 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9678 DAG.getIntPtrConstant(0, dl));
9679
9680 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9681 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9682
9683 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9684 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9685
9686 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9687 return N0;
9688 }
9689 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9690}
9691
9692static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9693 const ARMSubtarget *ST) {
9694 // TODO: Should this propagate fast-math-flags?
9695 EVT VT = Op.getValueType();
9696 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9697 "unexpected type for custom-lowering ISD::UDIV");
9698
9699 SDLoc dl(Op);
9700 SDValue N0 = Op.getOperand(0);
9701 SDValue N1 = Op.getOperand(1);
9702 SDValue N2, N3;
9703
9704 if (VT == MVT::v8i8) {
9705 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9706 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9707
9708 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9709 DAG.getIntPtrConstant(4, dl));
9710 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9711 DAG.getIntPtrConstant(4, dl));
9712 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9713 DAG.getIntPtrConstant(0, dl));
9714 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9715 DAG.getIntPtrConstant(0, dl));
9716
9717 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9718 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9719
9720 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9721 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9722
9723 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9724 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9725 MVT::i32),
9726 N0);
9727 return N0;
9728 }
9729
9730 // v4i16 sdiv ... Convert to float.
9731 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9732 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9733 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9734 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9735 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9736 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9737
9738 // Use reciprocal estimate and two refinement steps.
9739 // float4 recip = vrecpeq_f32(yf);
9740 // recip *= vrecpsq_f32(yf, recip);
9741 // recip *= vrecpsq_f32(yf, recip);
9742 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9743 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9744 BN1);
9745 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9746 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9747 BN1, N2);
9748 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9749 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9750 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9751 BN1, N2);
9752 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9753 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9754 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9755 // and that it will never cause us to return an answer too large).
9756 // float4 result = as_float4(as_int4(xf*recip) + 2);
9757 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9758 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9759 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9760 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9761 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9762 // Convert back to integer and return.
9763 // return vmovn_u32(vcvt_s32_f32(result));
9764 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9765 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9766 return N0;
9767}
9768
9769static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9770 SDNode *N = Op.getNode();
9771 EVT VT = N->getValueType(0);
9772 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9773
9774 SDValue Carry = Op.getOperand(2);
9775
9776 SDLoc DL(Op);
9777
9778 SDValue Result;
9779 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9780 // This converts the boolean value carry into the carry flag.
9781 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9782
9783 // Do the addition proper using the carry flag we wanted.
9784 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9785 Op.getOperand(1), Carry);
9786
9787 // Now convert the carry flag into a boolean value.
9788 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9789 } else {
9790 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9791 // have to invert the carry first.
9792 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9793 DAG.getConstant(1, DL, MVT::i32), Carry);
9794 // This converts the boolean value carry into the carry flag.
9795 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9796
9797 // Do the subtraction proper using the carry flag we wanted.
9798 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9799 Op.getOperand(1), Carry);
9800
9801 // Now convert the carry flag into a boolean value.
9802 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9803 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9804 // by ISD::USUBO_CARRY, so compute 1 - C.
9805 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9806 DAG.getConstant(1, DL, MVT::i32), Carry);
9807 }
9808
9809 // Return both values.
9810 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9811}
9812
9813SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9814 assert(Subtarget->isTargetDarwin());
9815
9816 // For iOS, we want to call an alternative entry point: __sincos_stret,
9817 // return values are passed via sret.
9818 SDLoc dl(Op);
9819 SDValue Arg = Op.getOperand(0);
9820 EVT ArgVT = Arg.getValueType();
9821 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9822 auto PtrVT = getPointerTy(DAG.getDataLayout());
9823
9824 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9825
9826 // Pair of floats / doubles used to pass the result.
9827 Type *RetTy = StructType::get(ArgTy, ArgTy);
9828 auto &DL = DAG.getDataLayout();
9829
9830 ArgListTy Args;
9831 bool ShouldUseSRet = getTM().isAPCS_ABI();
9832 SDValue SRet;
9833 if (ShouldUseSRet) {
9834 // Create stack object for sret.
9835 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9836 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9837 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9838 SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9839
9840 ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
9841 Entry.IsSExt = false;
9842 Entry.IsZExt = false;
9843 Entry.IsSRet = true;
9844 Args.push_back(Entry);
9845 RetTy = Type::getVoidTy(*DAG.getContext());
9846 }
9847
9848 Args.emplace_back(Arg, ArgTy);
9849
9850 RTLIB::Libcall LC =
9851 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9852 const char *LibcallName = getLibcallName(LC);
9853 CallingConv::ID CC = getLibcallCallingConv(LC);
9854 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9855
9856 TargetLowering::CallLoweringInfo CLI(DAG);
9857 CLI.setDebugLoc(dl)
9858 .setChain(DAG.getEntryNode())
9859 .setCallee(CC, RetTy, Callee, std::move(Args))
9860 .setDiscardResult(ShouldUseSRet);
9861 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9862
9863 if (!ShouldUseSRet)
9864 return CallResult.first;
9865
9866 SDValue LoadSin =
9867 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9868
9869 // Address of cos field.
9870 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9871 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9872 SDValue LoadCos =
9873 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9874
9875 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9876 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9877 LoadSin.getValue(0), LoadCos.getValue(0));
9878}
9879
9880SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9881 bool Signed,
9882 SDValue &Chain) const {
9883 EVT VT = Op.getValueType();
9884 assert((VT == MVT::i32 || VT == MVT::i64) &&
9885 "unexpected type for custom lowering DIV");
9886 SDLoc dl(Op);
9887
9888 const auto &DL = DAG.getDataLayout();
9889 RTLIB::Libcall LC;
9890 if (Signed)
9891 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9892 else
9893 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9894
9895 const char *Name = getLibcallName(LC);
9896 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9897
9898 ARMTargetLowering::ArgListTy Args;
9899
9900 for (auto AI : {1, 0}) {
9901 SDValue Operand = Op.getOperand(AI);
9902 Args.emplace_back(Operand,
9903 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9904 }
9905
9906 CallLoweringInfo CLI(DAG);
9907 CLI.setDebugLoc(dl)
9908 .setChain(Chain)
9909 .setCallee(CallingConv::ARM_AAPCS, Type::getInt64Ty(*DAG.getContext()),
9910 ES, std::move(Args));
9911
9912 return LowerCallTo(CLI).first;
9913}
9914
9915// This is a code size optimisation: return the original SDIV node to
9916// DAGCombiner when we don't want to expand SDIV into a sequence of
9917// instructions, and an empty node otherwise which will cause the
9918// SDIV to be expanded in DAGCombine.
9919SDValue
9920ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9921 SelectionDAG &DAG,
9922 SmallVectorImpl<SDNode *> &Created) const {
9923 // TODO: Support SREM
9924 if (N->getOpcode() != ISD::SDIV)
9925 return SDValue();
9926
9927 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9928 const bool MinSize = ST.hasMinSize();
9929 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9930 : ST.hasDivideInARMMode();
9931
9932 // Don't touch vector types; rewriting this may lead to scalarizing
9933 // the int divs.
9934 if (N->getOperand(0).getValueType().isVector())
9935 return SDValue();
9936
9937 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9938 // hwdiv support for this to be really profitable.
9939 if (!(MinSize && HasDivide))
9940 return SDValue();
9941
9942 // ARM mode is a bit simpler than Thumb: we can handle large power
9943 // of 2 immediates with 1 mov instruction; no further checks required,
9944 // just return the sdiv node.
9945 if (!ST.isThumb())
9946 return SDValue(N, 0);
9947
9948 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9949 // and thus lose the code size benefits of a MOVS that requires only 2.
9950 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9951 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9952 if (Divisor.sgt(128))
9953 return SDValue();
9954
9955 return SDValue(N, 0);
9956}
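// Illustrative note on the size heuristic above: with minsize and hardware
// divide, "sdiv i32 %x, 8" keeps the single SDIV instruction instead of the
// generic shift/add expansion. In Thumb, a divisor such as 1024 would need a
// wide 32-bit MOV to materialize, so the node is not kept and the generic
// power-of-two expansion is used instead.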
9957
9958SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9959 bool Signed) const {
9960 assert(Op.getValueType() == MVT::i32 &&
9961 "unexpected type for custom lowering DIV");
9962 SDLoc dl(Op);
9963
9964 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9965 DAG.getEntryNode(), Op.getOperand(1));
9966
9967 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9968}
9969
9970static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9971 SDLoc DL(N);
9972 SDValue Op = N->getOperand(1);
9973 if (N->getValueType(0) == MVT::i32)
9974 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9975 SDValue Lo, Hi;
9976 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9977 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9978 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9979}
9980
9981void ARMTargetLowering::ExpandDIV_Windows(
9982 SDValue Op, SelectionDAG &DAG, bool Signed,
9983 SmallVectorImpl<SDValue> &Results) const {
9984 const auto &DL = DAG.getDataLayout();
9985
9986 assert(Op.getValueType() == MVT::i64 &&
9987 "unexpected type for custom lowering DIV");
9988 SDLoc dl(Op);
9989
9990 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9991
9992 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9993
9994 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9995 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9996 DAG.getConstant(32, dl, getPointerTy(DL)));
9997 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9998
9999 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10000}
10001
10002static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
10003 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10004 EVT MemVT = LD->getMemoryVT();
10005 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10006 MemVT == MVT::v16i1) &&
10007 "Expected a predicate type!");
10008 assert(MemVT == Op.getValueType());
10009 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10010 "Expected a non-extending load");
10011 assert(LD->isUnindexed() && "Expected an unindexed load");
10012
10013 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
10014 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10015 // need to make sure that 8/4/2 bits are actually loaded into the correct
10016 // place, which means loading the value and then shuffling the values into
10017 // the bottom bits of the predicate.
10018 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
10019 // for BE).
10020 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
10021 // a natural VMSR(load), so needs to be reversed.
10022
10023 SDLoc dl(Op);
10024 SDValue Load = DAG.getExtLoad(
10025 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10026 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10027 LD->getMemOperand());
10028 SDValue Val = Load;
10029 if (DAG.getDataLayout().isBigEndian())
10030 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10031 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10032 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10033 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10034 if (MemVT != MVT::v16i1)
10035 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10036 DAG.getConstant(0, dl, MVT::i32));
10037 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10038}
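// Illustrative expansion for a little-endian v4i1 predicate load:
//   %bits = i32 extending load (memory type i4)   ; widened scalar load
//   %p    = PREDICATE_CAST %bits : v16i1          ; move bits into VPR lanes
//   %r    = extract_subvector %p, 0 : v4i1        ; keep the low 4 lanes
// Big-endian additionally bit-reverses and shifts %bits, as above.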
10039
10040void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10041 SelectionDAG &DAG) const {
10042 LoadSDNode *LD = cast<LoadSDNode>(N);
10043 EVT MemVT = LD->getMemoryVT();
10044 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10045
10046 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10047 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10048 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10049 SDLoc dl(N);
10050 SDValue Result = DAG.getMemIntrinsicNode(
10051 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10052 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10053 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10054 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10055 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10056 Results.append({Pair, Result.getValue(2)});
10057 }
10058}
10059
10060static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10061 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10062 EVT MemVT = ST->getMemoryVT();
10063 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10064 MemVT == MVT::v16i1) &&
10065 "Expected a predicate type!");
10066 assert(MemVT == ST->getValue().getValueType());
10067 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10068 assert(ST->isUnindexed() && "Expected an unindexed store");
10069
10070 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10071 // top bits unset and a scalar store.
10072 SDLoc dl(Op);
10073 SDValue Build = ST->getValue();
10074 if (MemVT != MVT::v16i1) {
10075 SmallVector<SDValue, 16> Ops;
10076 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10077 unsigned Elt = DAG.getDataLayout().isBigEndian()
10078 ? MemVT.getVectorNumElements() - I - 1
10079 : I;
10080 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10081 DAG.getConstant(Elt, dl, MVT::i32)));
10082 }
10083 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10084 Ops.push_back(DAG.getUNDEF(MVT::i32));
10085 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10086 }
10087 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10088 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10089 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10090 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10091 DAG.getConstant(16, dl, MVT::i32));
10092 return DAG.getTruncStore(
10093 ST->getChain(), dl, GRP, ST->getBasePtr(),
10094 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10095 ST->getMemOperand());
10096}
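// Illustrative expansion for a little-endian v4i1 predicate store: the four
// lanes are re-packed into a v16i1 build_vector (upper lanes undef), moved to
// a GPR with PREDICATE_CAST, and written back with a truncating scalar store
// of the low 4 bits, so only the bits belonging to the narrow predicate reach
// memory.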
10097
10098static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10099 const ARMSubtarget *Subtarget) {
10100 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10101 EVT MemVT = ST->getMemoryVT();
10102 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10103
10104 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10105 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10106 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10107 SDNode *N = Op.getNode();
10108 SDLoc dl(N);
10109
10110 SDValue Lo = DAG.getNode(
10111 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10112 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10113 MVT::i32));
10114 SDValue Hi = DAG.getNode(
10115 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10116 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10117 MVT::i32));
10118
10119 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10120 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10121 MemVT, ST->getMemOperand());
10122 } else if (Subtarget->hasMVEIntegerOps() &&
10123 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10124 MemVT == MVT::v16i1))) {
10125 return LowerPredicateStore(Op, DAG);
10126 }
10127
10128 return SDValue();
10129}
10130
10131static bool isZeroVector(SDValue N) {
10132 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10133 (N->getOpcode() == ARMISD::VMOVIMM &&
10134 isNullConstant(N->getOperand(0))));
10135}
10136
10137static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10138 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10139 MVT VT = Op.getSimpleValueType();
10140 SDValue Mask = N->getMask();
10141 SDValue PassThru = N->getPassThru();
10142 SDLoc dl(Op);
10143
10144 if (isZeroVector(PassThru))
10145 return Op;
10146
10147 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10148 // zero too, and other values are lowered to a select.
10149 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10150 DAG.getTargetConstant(0, dl, MVT::i32));
10151 SDValue NewLoad = DAG.getMaskedLoad(
10152 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10153 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10154 N->getExtensionType(), N->isExpandingLoad());
10155 SDValue Combo = NewLoad;
10156 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10157 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10158 isZeroVector(PassThru->getOperand(0));
10159 if (!PassThru.isUndef() && !PassThruIsCastZero)
10160 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10161 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10162}
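// Illustrative note: MVE masked loads produce zero in the inactive lanes, so
// a masked load whose passthru is already zero (or an undef/zero-cast value)
// is used directly; any other passthru is patched back in afterwards with
//   %r = vselect %mask, %masked_load, %passthru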
10163
10164static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10165 const ARMSubtarget *ST) {
10166 if (!ST->hasMVEIntegerOps())
10167 return SDValue();
10168
10169 SDLoc dl(Op);
10170 unsigned BaseOpcode = 0;
10171 switch (Op->getOpcode()) {
10172 default: llvm_unreachable("Expected VECREDUCE opcode");
10173 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10174 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10175 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10176 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10177 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10178 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10179 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10180 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10181 }
10182
10183 SDValue Op0 = Op->getOperand(0);
10184 EVT VT = Op0.getValueType();
10185 EVT EltVT = VT.getVectorElementType();
10186 unsigned NumElts = VT.getVectorNumElements();
10187 unsigned NumActiveLanes = NumElts;
10188
10189 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10190 NumActiveLanes == 2) &&
10191 "Only expected a power 2 vector size");
10192
10193 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10194 // allows us to easily extract vector elements from the lanes.
10195 while (NumActiveLanes > 4) {
10196 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10197 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10198 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10199 NumActiveLanes /= 2;
10200 }
10201
10202 SDValue Res;
10203 if (NumActiveLanes == 4) {
10204 // The remaining 4 elements are summed sequentially
10205 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10206 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10207 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10208 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10209 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10210 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10211 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10212 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10213 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10214 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10215 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10216 } else {
10217 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10218 DAG.getConstant(0, dl, MVT::i32));
10219 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10220 DAG.getConstant(1, dl, MVT::i32));
10221 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10222 }
10223
10224 // Result type may be wider than element type.
10225 if (EltVT != Op->getValueType(0))
10226 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10227 return Res;
10228}
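// Illustrative tree for a v8i16 VECREDUCE_MUL with the strategy above:
//   %r1 = mul %v, VREV32(%v)        ; 8 lanes -> 4 partial products
//   %e0..%e3 = extract lanes 0,2,4,6 of %r1
//   %r = (%e0*%e1) * (%e2*%e3)      ; scalar ops on the remaining 4 values
// The result is any-extended if the reduction type is wider than the element.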
10229
10230static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10231 const ARMSubtarget *ST) {
10232 if (!ST->hasMVEFloatOps())
10233 return SDValue();
10234 return LowerVecReduce(Op, DAG, ST);
10235}
10236
10237static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10238 const ARMSubtarget *ST) {
10239 if (!ST->hasNEON())
10240 return SDValue();
10241
10242 SDLoc dl(Op);
10243 SDValue Op0 = Op->getOperand(0);
10244 EVT VT = Op0.getValueType();
10245 EVT EltVT = VT.getVectorElementType();
10246
10247 unsigned PairwiseIntrinsic = 0;
10248 switch (Op->getOpcode()) {
10249 default:
10250 llvm_unreachable("Expected VECREDUCE opcode");
10251 case ISD::VECREDUCE_UMIN:
10252 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10253 break;
10254 case ISD::VECREDUCE_UMAX:
10255 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10256 break;
10257 case ISD::VECREDUCE_SMIN:
10258 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10259 break;
10260 case ISD::VECREDUCE_SMAX:
10261 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10262 break;
10263 }
10264 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10265
10266 unsigned NumElts = VT.getVectorNumElements();
10267 unsigned NumActiveLanes = NumElts;
10268
10269 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10270 NumActiveLanes == 2) &&
10271 "Only expected a power 2 vector size");
10272
10273 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10274 if (VT.is128BitVector()) {
10275 SDValue Lo, Hi;
10276 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10277 VT = Lo.getValueType();
10278 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10279 NumActiveLanes /= 2;
10280 }
10281
10282 // Use pairwise reductions until one lane remains
10283 while (NumActiveLanes > 1) {
10284 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10285 NumActiveLanes /= 2;
10286 }
10287
10288 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10289 DAG.getConstant(0, dl, MVT::i32));
10290
10291 // Result type may be wider than element type.
10292 if (EltVT != Op.getValueType()) {
10293 unsigned Extend = 0;
10294 switch (Op->getOpcode()) {
10295 default:
10296 llvm_unreachable("Expected VECREDUCE opcode");
10297 case ISD::VECREDUCE_UMIN:
10298 case ISD::VECREDUCE_UMAX:
10299 Extend = ISD::ZERO_EXTEND;
10300 break;
10301 case ISD::VECREDUCE_SMIN:
10302 case ISD::VECREDUCE_SMAX:
10303 Extend = ISD::SIGN_EXTEND;
10304 break;
10305 }
10306 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10307 }
10308 return Res;
10309}
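// Illustrative lowering for a v4i32 VECREDUCE_UMIN with NEON:
//   {lo, hi} = split %v              ; two 64-bit halves
//   %d = vpmin.u32 lo, hi            ; 4 -> 2 candidates
//   %d = vpmin.u32 %d, %d            ; 2 -> 1 candidate in lane 0
//   %r = extract lane 0, zero-extended if the result type is wider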
10310
10311static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10312 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10313 // Acquire/Release load/store is not legal for targets without a dmb or
10314 // equivalent available.
10315 return SDValue();
10316
10317 // Monotonic load/store is legal for all targets.
10318 return Op;
10319}
10320
10321static void ReplaceREADCYCLECOUNTER(SDNode *N,
10322 SmallVectorImpl<SDValue> &Results,
10323 SelectionDAG &DAG,
10324 const ARMSubtarget *Subtarget) {
10325 SDLoc DL(N);
10326 // Under Power Management extensions, the cycle-count is:
10327 // mrc p15, #0, <Rt>, c9, c13, #0
10328 SDValue Ops[] = { N->getOperand(0), // Chain
10329 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10330 DAG.getTargetConstant(15, DL, MVT::i32),
10331 DAG.getTargetConstant(0, DL, MVT::i32),
10332 DAG.getTargetConstant(9, DL, MVT::i32),
10333 DAG.getTargetConstant(13, DL, MVT::i32),
10334 DAG.getTargetConstant(0, DL, MVT::i32)
10335 };
10336
10337 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10338 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10339 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10340 DAG.getConstant(0, DL, MVT::i32)));
10341 Results.push_back(Cycles32.getValue(1));
10342}
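// Illustrative result: on targets with the Performance Monitors extension the
// i64 READCYCLECOUNTER becomes
//   mrc p15, #0, Rt, c9, c13, #0   ; 32-bit cycle counter
// zero-extended to 64 bits by pairing it with constant 0 for the high half.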
10343
10344static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10345 SDValue V1) {
10346 SDLoc dl(V0.getNode());
10347 SDValue RegClass =
10348 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10349 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10350 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10351 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10352 return SDValue(
10353 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10354}
10355
10356static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10357 SDLoc dl(V.getNode());
10358 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10359 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10360 if (isBigEndian)
10361 std::swap(VLo, VHi);
10362 return createGPRPairNode2xi32(DAG, VLo, VHi);
10363}
10364
10365static void ReplaceCMP_SWAP_64Results(SDNode *N,
10366 SmallVectorImpl<SDValue> &Results,
10367 SelectionDAG &DAG) {
10368 assert(N->getValueType(0) == MVT::i64 &&
10369 "AtomicCmpSwap on types less than 64 should be legal");
10370 SDValue Ops[] = {
10371 createGPRPairNode2xi32(DAG, N->getOperand(1),
10372 DAG.getUNDEF(MVT::i32)), // pointer, temp
10373 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10374 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10375 N->getOperand(0), // chain in
10376 };
10377 SDNode *CmpSwap = DAG.getMachineNode(
10378 ARM::CMP_SWAP_64, SDLoc(N),
10379 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10380
10381 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10382 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10383
10384 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10385
10386 SDValue Lo =
10387 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10388 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10389 SDValue Hi =
10390 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10391 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10392 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10393 Results.push_back(SDValue(CmpSwap, 2));
10394}
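// Illustrative note: the 64-bit cmpxchg is kept as the CMP_SWAP_64 pseudo so
// that register allocation can place the expected/new values in even/odd GPR
// pairs (the REG_SEQUENCE of gsub_0/gsub_1 above). The pseudo is typically
// expanded later to an LDREXD/STREXD loop, and the i64 result is rebuilt from
// the two 32-bit halves with endian-aware subregister extracts.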
10395
10396SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10397 SDLoc dl(Op);
10398 EVT VT = Op.getValueType();
10399 SDValue Chain = Op.getOperand(0);
10400 SDValue LHS = Op.getOperand(1);
10401 SDValue RHS = Op.getOperand(2);
10402 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10403 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10404
10405 // If we don't have instructions of this float type then soften to a libcall
10406 // and use SETCC instead.
10407 if (isUnsupportedFloatingType(LHS.getValueType())) {
10408 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10409 Chain, IsSignaling);
10410 if (!RHS.getNode()) {
10411 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10412 CC = ISD::SETNE;
10413 }
10414 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10415 DAG.getCondCode(CC));
10416 return DAG.getMergeValues({Result, Chain}, dl);
10417 }
10418
10419 ARMCC::CondCodes CondCode, CondCode2;
10420 FPCCToARMCC(CC, CondCode, CondCode2);
10421
10422 SDValue True = DAG.getConstant(1, dl, VT);
10423 SDValue False = DAG.getConstant(0, dl, VT);
10424 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10425 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10426 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10427 if (CondCode2 != ARMCC::AL) {
10428 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10429 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10430 }
10431 return DAG.getMergeValues({Result, Chain}, dl);
10432}
10433
10434SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10435 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10436
10437 EVT VT = getPointerTy(DAG.getDataLayout());
10438 int FI = MFI.CreateFixedObject(4, 0, false);
10439 return DAG.getFrameIndex(FI, VT);
10440}
10441
10442SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10443 SelectionDAG &DAG) const {
10444 SDLoc DL(Op);
10445 MakeLibCallOptions CallOptions;
10446 MVT SVT = Op.getOperand(0).getSimpleValueType();
10447 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10448 SDValue Res =
10449 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10450 return DAG.getBitcast(MVT::i32, Res);
10451}
10452
10453SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10454 SDLoc dl(Op);
10455 SDValue LHS = Op.getOperand(0);
10456 SDValue RHS = Op.getOperand(1);
10457
10458 // Determine if this is signed or unsigned comparison
10459 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10460
10461 // Special case for Thumb1 UCMP only
10462 if (!IsSigned && Subtarget->isThumb1Only()) {
10463 // For Thumb unsigned comparison, use this sequence:
10464 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10465 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10466 // cmp r1, r0 ; compare RHS with LHS
10467 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10468 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10469
10470 // First subtraction: LHS - RHS
10471 SDValue Sub1WithFlags = DAG.getNode(
10472 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10473 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10474 SDValue Flags1 = Sub1WithFlags.getValue(1);
10475
10476 // SUBE: Sub1Result - Sub1Result - !carry
10477 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10478 SDValue Sbc1 =
10479 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10480 Sub1Result, Sub1Result, Flags1);
10481 SDValue Sbc1Result = Sbc1.getValue(0);
10482
10483 // Second comparison: RHS vs LHS (reverse comparison)
10484 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10485
10486 // SUBE: RHS - RHS - !carry
10487 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10488 SDValue Sbc2 = DAG.getNode(
10489 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10490 SDValue Sbc2Result = Sbc2.getValue(0);
10491
10492 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10493 SDValue Result =
10494 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10495 if (Op.getValueType() != MVT::i32)
10496 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10497
10498 return Result;
10499 }
10500
10501 // For the ARM assembly pattern:
10502 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10503 // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
10504 // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10505 // ; if LHS == RHS, result remains 0 from the subs
10507
10508 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10509 unsigned Opcode = ARMISD::SUBC;
10510
10511 // Check if RHS is a subtraction against 0: (0 - X)
10512 if (RHS.getOpcode() == ISD::SUB) {
10513 SDValue SubLHS = RHS.getOperand(0);
10514 SDValue SubRHS = RHS.getOperand(1);
10515
10516 // Check if it's 0 - X
10517 if (isNullConstant(SubLHS)) {
10518 bool CanUseAdd = false;
10519 if (IsSigned) {
10520 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10521 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10522 .getSignedMinValue()
10523 .isMinSignedValue()) {
10524 CanUseAdd = true;
10525 }
10526 } else {
10527 // For UCMP: only if X is known to never be zero
10528 if (DAG.isKnownNeverZero(SubRHS)) {
10529 CanUseAdd = true;
10530 }
10531 }
10532
10533 if (CanUseAdd) {
10534 Opcode = ARMISD::ADDC;
10535 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10536 // LHS - (0 - X)
10537 }
10538 }
10539 }
10540
10541 // Generate the operation with flags
10542 SDValue OpWithFlags =
10543 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10544
10545 SDValue OpResult = OpWithFlags.getValue(0);
10546 SDValue Flags = OpWithFlags.getValue(1);
10547
10548 // Constants for conditional moves
10549 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10550 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10551
10552 // Select condition codes based on signed vs unsigned
10553 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10554 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10555
10556 // First conditional move: if greater than, set to 1
10557 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10558 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10559 GTCondValue, Flags);
10560
10561 // Second conditional move: if less than, set to -1
10562 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10563 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10564 LTCondValue, Flags);
10565
10566 if (Op.getValueType() != MVT::i32)
10567 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10568
10569 return Result2;
10570}
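// Illustrative trace for scmp(i32 5, i32 7) in the non-Thumb1 path above:
// subs computes 5-7 and sets the flags; GT is false so the 1 is not selected,
// LT is true so the second CMOV yields -1, matching the <0 / 0 / >0 contract
// of llvm.scmp.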
10571
10572SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10573 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10574 switch (Op.getOpcode()) {
10575 default: llvm_unreachable("Don't know how to custom lower this!");
10576 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10577 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10578 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10579 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10580 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10581 case ISD::SELECT: return LowerSELECT(Op, DAG);
10582 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10583 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10584 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10585 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10586 case ISD::VASTART: return LowerVASTART(Op, DAG);
10587 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10588 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10589 case ISD::SINT_TO_FP:
10590 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10593 case ISD::FP_TO_SINT:
10594 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10595 case ISD::FP_TO_SINT_SAT:
10596 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10597 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10598 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10599 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10600 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10601 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10602 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10603 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10604 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10605 Subtarget);
10606 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10607 case ISD::SHL:
10608 case ISD::SRL:
10609 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10610 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10611 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10612 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10613 case ISD::SRL_PARTS:
10614 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10615 case ISD::CTTZ:
10616 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10617 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10618 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10619 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10620 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10621 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10622 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10623 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10624 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10625 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10626 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10627 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10628 case ISD::SIGN_EXTEND:
10629 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10630 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10631 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10632 case ISD::SET_FPMODE:
10633 return LowerSET_FPMODE(Op, DAG);
10634 case ISD::RESET_FPMODE:
10635 return LowerRESET_FPMODE(Op, DAG);
10636 case ISD::MUL: return LowerMUL(Op, DAG);
10637 case ISD::SDIV:
10638 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10639 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10640 return LowerSDIV(Op, DAG, Subtarget);
10641 case ISD::UDIV:
10642 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10643 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10644 return LowerUDIV(Op, DAG, Subtarget);
10645 case ISD::UADDO_CARRY:
10646 case ISD::USUBO_CARRY:
10647 return LowerUADDSUBO_CARRY(Op, DAG);
10648 case ISD::SADDO:
10649 case ISD::SSUBO:
10650 return LowerSignedALUO(Op, DAG);
10651 case ISD::UADDO:
10652 case ISD::USUBO:
10653 return LowerUnsignedALUO(Op, DAG);
10654 case ISD::SADDSAT:
10655 case ISD::SSUBSAT:
10656 case ISD::UADDSAT:
10657 case ISD::USUBSAT:
10658 return LowerADDSUBSAT(Op, DAG, Subtarget);
10659 case ISD::LOAD:
10660 return LowerPredicateLoad(Op, DAG);
10661 case ISD::STORE:
10662 return LowerSTORE(Op, DAG, Subtarget);
10663 case ISD::MLOAD:
10664 return LowerMLOAD(Op, DAG);
10665 case ISD::VECREDUCE_MUL:
10666 case ISD::VECREDUCE_AND:
10667 case ISD::VECREDUCE_OR:
10668 case ISD::VECREDUCE_XOR:
10669 return LowerVecReduce(Op, DAG, Subtarget);
10670 case ISD::VECREDUCE_FADD:
10671 case ISD::VECREDUCE_FMUL:
10672 case ISD::VECREDUCE_FMIN:
10673 case ISD::VECREDUCE_FMAX:
10674 return LowerVecReduceF(Op, DAG, Subtarget);
10675 case ISD::VECREDUCE_UMIN:
10676 case ISD::VECREDUCE_UMAX:
10677 case ISD::VECREDUCE_SMIN:
10678 case ISD::VECREDUCE_SMAX:
10679 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10680 case ISD::ATOMIC_LOAD:
10681 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10682 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10683 case ISD::SDIVREM:
10684 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10685 case ISD::DYNAMIC_STACKALLOC:
10686 if (Subtarget->isTargetWindows())
10687 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10688 llvm_unreachable("Don't know how to custom lower this!");
10689 case ISD::STRICT_FP_ROUND:
10690 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10691 case ISD::STRICT_FP_EXTEND:
10692 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10693 case ISD::STRICT_FSETCC:
10694 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10695 case ISD::SPONENTRY:
10696 return LowerSPONENTRY(Op, DAG);
10697 case ISD::FP_TO_BF16:
10698 return LowerFP_TO_BF16(Op, DAG);
10699 case ARMISD::WIN__DBZCHK: return SDValue();
10700 case ISD::UCMP:
10701 case ISD::SCMP:
10702 return LowerCMP(Op, DAG);
10703 }
10704}
10705
10706static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10707 SelectionDAG &DAG) {
10708 unsigned IntNo = N->getConstantOperandVal(0);
10709 unsigned Opc = 0;
10710 if (IntNo == Intrinsic::arm_smlald)
10711 Opc = ARMISD::SMLALD;
10712 else if (IntNo == Intrinsic::arm_smlaldx)
10713 Opc = ARMISD::SMLALDX;
10714 else if (IntNo == Intrinsic::arm_smlsld)
10715 Opc = ARMISD::SMLSLD;
10716 else if (IntNo == Intrinsic::arm_smlsldx)
10717 Opc = ARMISD::SMLSLDX;
10718 else
10719 return;
10720
10721 SDLoc dl(N);
10722 SDValue Lo, Hi;
10723 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10724
10725 SDValue LongMul = DAG.getNode(Opc, dl,
10726 DAG.getVTList(MVT::i32, MVT::i32),
10727 N->getOperand(1), N->getOperand(2),
10728 Lo, Hi);
10729 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10730 LongMul.getValue(0), LongMul.getValue(1)));
10731}
10732
10733/// ReplaceNodeResults - Replace the results of node with an illegal result
10734/// type with new values built out of custom code.
10735void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10736 SmallVectorImpl<SDValue> &Results,
10737 SelectionDAG &DAG) const {
10738 SDValue Res;
10739 switch (N->getOpcode()) {
10740 default:
10741 llvm_unreachable("Don't know how to custom expand this!");
10742 case ISD::READ_REGISTER:
10743 ExpandREAD_REGISTER(N, Results, DAG);
10744 break;
10745 case ISD::BITCAST:
10746 Res = ExpandBITCAST(N, DAG, Subtarget);
10747 break;
10748 case ISD::SRL:
10749 case ISD::SRA:
10750 case ISD::SHL:
10751 Res = Expand64BitShift(N, DAG, Subtarget);
10752 break;
10753 case ISD::SREM:
10754 case ISD::UREM:
10755 Res = LowerREM(N, DAG);
10756 break;
10757 case ISD::SDIVREM:
10758 case ISD::UDIVREM:
10759 Res = LowerDivRem(SDValue(N, 0), DAG);
10760 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10761 Results.push_back(Res.getValue(0));
10762 Results.push_back(Res.getValue(1));
10763 return;
10764 case ISD::SADDSAT:
10765 case ISD::SSUBSAT:
10766 case ISD::UADDSAT:
10767 case ISD::USUBSAT:
10768 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10769 break;
10770 case ISD::READCYCLECOUNTER:
10771 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10772 return;
10773 case ISD::UDIV:
10774 case ISD::SDIV:
10775 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10776 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10777 Results);
10778 case ISD::ATOMIC_CMP_SWAP:
10779 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10780 return;
10781 case ISD::INTRINSIC_WO_CHAIN:
10782 return ReplaceLongIntrinsic(N, Results, DAG);
10783 case ISD::LOAD:
10784 LowerLOAD(N, Results, DAG);
10785 break;
10786 case ISD::TRUNCATE:
10787 Res = LowerTruncate(N, DAG, Subtarget);
10788 break;
10789 case ISD::SIGN_EXTEND:
10790 case ISD::ZERO_EXTEND:
10791 Res = LowerVectorExtend(N, DAG, Subtarget);
10792 break;
10793 case ISD::FP_TO_SINT_SAT:
10794 case ISD::FP_TO_UINT_SAT:
10795 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10796 break;
10797 }
10798 if (Res.getNode())
10799 Results.push_back(Res);
10800}
10801
10802//===----------------------------------------------------------------------===//
10803// ARM Scheduler Hooks
10804//===----------------------------------------------------------------------===//
10805
10806/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10807/// registers the function context.
10808void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10809 MachineBasicBlock *MBB,
10810 MachineBasicBlock *DispatchBB,
10811 int FI) const {
10812 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10813 "ROPI/RWPI not currently supported with SjLj");
10814 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10815 DebugLoc dl = MI.getDebugLoc();
10816 MachineFunction *MF = MBB->getParent();
10817 MachineRegisterInfo *MRI = &MF->getRegInfo();
10818 MachineConstantPool *MCP = MF->getConstantPool();
10819 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10820 const Function &F = MF->getFunction();
10821
10822 bool isThumb = Subtarget->isThumb();
10823 bool isThumb2 = Subtarget->isThumb2();
10824
10825 unsigned PCLabelId = AFI->createPICLabelUId();
10826 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10827 ARMConstantPoolValue *CPV =
10828 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10829 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10830
10831 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10832 : &ARM::GPRRegClass;
10833
10834 // Grab constant pool and fixed stack memory operands.
10835 MachineMemOperand *CPMMO =
10836 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10837 MachineMemOperand::MOLoad, 4, Align(4));
10838
10839 MachineMemOperand *FIMMOSt =
10840 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10841 MachineMemOperand::MOStore, 4, Align(4));
10842
10843 // Load the address of the dispatch MBB into the jump buffer.
10844 if (isThumb2) {
10845 // Incoming value: jbuf
10846 // ldr.n r5, LCPI1_1
10847 // orr r5, r5, #1
10848 // add r5, pc
10849 // str r5, [$jbuf, #+4] ; &jbuf[1]
10850 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10851 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10853 .addMemOperand(CPMMO)
10855 // Set the low bit because of thumb mode.
10856 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10857 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10858 .addReg(NewVReg1, RegState::Kill)
10859 .addImm(0x01)
10861 .add(condCodeOp());
10862 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10863 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10864 .addReg(NewVReg2, RegState::Kill)
10865 .addImm(PCLabelId);
10866 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10867 .addReg(NewVReg3, RegState::Kill)
10868 .addFrameIndex(FI)
10869 .addImm(36) // &jbuf[1] :: pc
10870 .addMemOperand(FIMMOSt)
10872 } else if (isThumb) {
10873 // Incoming value: jbuf
10874 // ldr.n r1, LCPI1_4
10875 // add r1, pc
10876 // mov r2, #1
10877 // orrs r1, r2
10878 // add r2, $jbuf, #+4 ; &jbuf[1]
10879 // str r1, [r2]
10880 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10881 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10883 .addMemOperand(CPMMO)
10885 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10886 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10887 .addReg(NewVReg1, RegState::Kill)
10888 .addImm(PCLabelId);
10889 // Set the low bit because of thumb mode.
10890 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10891 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10892 .addReg(ARM::CPSR, RegState::Define)
10893 .addImm(1)
10895 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10896 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10897 .addReg(ARM::CPSR, RegState::Define)
10898 .addReg(NewVReg2, RegState::Kill)
10899 .addReg(NewVReg3, RegState::Kill)
10901 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10902 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10903 .addFrameIndex(FI)
10904 .addImm(36); // &jbuf[1] :: pc
10905 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10906 .addReg(NewVReg4, RegState::Kill)
10907 .addReg(NewVReg5, RegState::Kill)
10908 .addImm(0)
10909 .addMemOperand(FIMMOSt)
10911 } else {
10912 // Incoming value: jbuf
10913 // ldr r1, LCPI1_1
10914 // add r1, pc, r1
10915 // str r1, [$jbuf, #+4] ; &jbuf[1]
10916 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10917 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10919 .addImm(0)
10920 .addMemOperand(CPMMO)
10922 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10923 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10924 .addReg(NewVReg1, RegState::Kill)
10925 .addImm(PCLabelId)
10927 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10928 .addReg(NewVReg2, RegState::Kill)
10929 .addFrameIndex(FI)
10930 .addImm(36) // &jbuf[1] :: pc
10931 .addMemOperand(FIMMOSt)
10933 }
10934}
10935
10936void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10937 MachineBasicBlock *MBB) const {
10938 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10939 DebugLoc dl = MI.getDebugLoc();
10940 MachineFunction *MF = MBB->getParent();
10941 MachineRegisterInfo *MRI = &MF->getRegInfo();
10942 MachineFrameInfo &MFI = MF->getFrameInfo();
10943 int FI = MFI.getFunctionContextIndex();
10944
10945 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10946 : &ARM::GPRnopcRegClass;
10947
10948 // Get a mapping of the call site numbers to all of the landing pads they're
10949 // associated with.
10950 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10951 unsigned MaxCSNum = 0;
10952 for (MachineBasicBlock &BB : *MF) {
10953 if (!BB.isEHPad())
10954 continue;
10955
10956 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10957 // pad.
10958 for (MachineInstr &II : BB) {
10959 if (!II.isEHLabel())
10960 continue;
10961
10962 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10963 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10964
10965 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10966 for (unsigned Idx : CallSiteIdxs) {
10967 CallSiteNumToLPad[Idx].push_back(&BB);
10968 MaxCSNum = std::max(MaxCSNum, Idx);
10969 }
10970 break;
10971 }
10972 }
10973
10974 // Get an ordered list of the machine basic blocks for the jump table.
10975 std::vector<MachineBasicBlock*> LPadList;
10976 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10977 LPadList.reserve(CallSiteNumToLPad.size());
10978 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10979 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10980 for (MachineBasicBlock *MBB : MBBList) {
10981 LPadList.push_back(MBB);
10982 InvokeBBs.insert_range(MBB->predecessors());
10983 }
10984 }
10985
10986 assert(!LPadList.empty() &&
10987 "No landing pad destinations for the dispatch jump table!");
10988
10989 // Create the jump table and associated information.
10990 MachineJumpTableInfo *JTI =
10991 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10992 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10993
10994 // Create the MBBs for the dispatch code.
10995
10996 // Shove the dispatch's address into the return slot in the function context.
10997 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10998 DispatchBB->setIsEHPad();
10999
11000 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11001
11002 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
11003 DispatchBB->addSuccessor(TrapBB);
11004
11005 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11006 DispatchBB->addSuccessor(DispContBB);
11007
11008 // Insert the MBBs.
11009 MF->insert(MF->end(), DispatchBB);
11010 MF->insert(MF->end(), DispContBB);
11011 MF->insert(MF->end(), TrapBB);
11012
11013 // Insert code into the entry block that creates and registers the function
11014 // context.
11015 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11016
11017 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11018 MachinePointerInfo::getFixedStack(*MF, FI),
11019 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
11020
11021 MachineInstrBuilder MIB;
11022 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11023
11024 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11025 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11026
11027 // Add a register mask with no preserved registers. This results in all
11028 // registers being marked as clobbered. This can't work if the dispatch block
11029 // is in a Thumb1 function and is linked with ARM code which uses the FP
11030 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11032
11033 bool IsPositionIndependent = isPositionIndependent();
11034 unsigned NumLPads = LPadList.size();
11035 if (Subtarget->isThumb2()) {
11036 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11037 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11038 .addFrameIndex(FI)
11039 .addImm(4)
11040 .addMemOperand(FIMMOLd)
11042
11043 if (NumLPads < 256) {
11044 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11045 .addReg(NewVReg1)
11046 .addImm(LPadList.size())
11048 } else {
11049 Register VReg1 = MRI->createVirtualRegister(TRC);
11050 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11051 .addImm(NumLPads & 0xFFFF)
11053
11054 unsigned VReg2 = VReg1;
11055 if ((NumLPads & 0xFFFF0000) != 0) {
11056 VReg2 = MRI->createVirtualRegister(TRC);
11057 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11058 .addReg(VReg1)
11059 .addImm(NumLPads >> 16)
11061 }
11062
11063 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11064 .addReg(NewVReg1)
11065 .addReg(VReg2)
11067 }
11068
11069 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11070 .addMBB(TrapBB)
11072 .addReg(ARM::CPSR);
11073
11074 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11075 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11076 .addJumpTableIndex(MJTI)
11078
11079 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11080 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11081 .addReg(NewVReg3, RegState::Kill)
11082 .addReg(NewVReg1)
11085 .add(condCodeOp());
11086
11087 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11088 .addReg(NewVReg4, RegState::Kill)
11089 .addReg(NewVReg1)
11090 .addJumpTableIndex(MJTI);
11091 } else if (Subtarget->isThumb()) {
11092 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11093 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11094 .addFrameIndex(FI)
11095 .addImm(1)
11096 .addMemOperand(FIMMOLd)
11098
11099 if (NumLPads < 256) {
11100 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11101 .addReg(NewVReg1)
11102 .addImm(NumLPads)
11104 } else {
11105 MachineConstantPool *ConstantPool = MF->getConstantPool();
11106 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11107 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11108
11109 // MachineConstantPool wants an explicit alignment.
11110 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11111 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11112
11113 Register VReg1 = MRI->createVirtualRegister(TRC);
11114 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11115 .addReg(VReg1, RegState::Define)
11118 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11119 .addReg(NewVReg1)
11120 .addReg(VReg1)
11122 }
11123
11124 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11125 .addMBB(TrapBB)
11127 .addReg(ARM::CPSR);
11128
11129 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11130 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11131 .addReg(ARM::CPSR, RegState::Define)
11132 .addReg(NewVReg1)
11133 .addImm(2)
11135
11136 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11137 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11138 .addJumpTableIndex(MJTI)
11140
11141 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11142 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11143 .addReg(ARM::CPSR, RegState::Define)
11144 .addReg(NewVReg2, RegState::Kill)
11145 .addReg(NewVReg3)
11147
11148 MachineMemOperand *JTMMOLd =
11149 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11150 MachineMemOperand::MOLoad, 4, Align(4));
11151
11152 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11153 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11154 .addReg(NewVReg4, RegState::Kill)
11155 .addImm(0)
11156 .addMemOperand(JTMMOLd)
11158
11159 unsigned NewVReg6 = NewVReg5;
11160 if (IsPositionIndependent) {
11161 NewVReg6 = MRI->createVirtualRegister(TRC);
11162 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11163 .addReg(ARM::CPSR, RegState::Define)
11164 .addReg(NewVReg5, RegState::Kill)
11165 .addReg(NewVReg3)
11167 }
11168
11169 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11170 .addReg(NewVReg6, RegState::Kill)
11171 .addJumpTableIndex(MJTI);
11172 } else {
11173 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11174 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11175 .addFrameIndex(FI)
11176 .addImm(4)
11177 .addMemOperand(FIMMOLd)
11179
11180 if (NumLPads < 256) {
11181 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11182 .addReg(NewVReg1)
11183 .addImm(NumLPads)
11185 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11186 Register VReg1 = MRI->createVirtualRegister(TRC);
11187 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11188 .addImm(NumLPads & 0xFFFF)
11190
11191 unsigned VReg2 = VReg1;
11192 if ((NumLPads & 0xFFFF0000) != 0) {
11193 VReg2 = MRI->createVirtualRegister(TRC);
11194 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11195 .addReg(VReg1)
11196 .addImm(NumLPads >> 16)
11198 }
11199
11200 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11201 .addReg(NewVReg1)
11202 .addReg(VReg2)
11204 } else {
11205 MachineConstantPool *ConstantPool = MF->getConstantPool();
11206 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11207 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11208
11209 // MachineConstantPool wants an explicit alignment.
11210 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11211 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11212
11213 Register VReg1 = MRI->createVirtualRegister(TRC);
11214 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11215 .addReg(VReg1, RegState::Define)
11217 .addImm(0)
11219 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11220 .addReg(NewVReg1)
11221 .addReg(VReg1, RegState::Kill)
11223 }
11224
11225 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11226 .addMBB(TrapBB)
11228 .addReg(ARM::CPSR);
11229
11230 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11231 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11232 .addReg(NewVReg1)
11235 .add(condCodeOp());
11236 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11237 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11238 .addJumpTableIndex(MJTI)
11240
11241 MachineMemOperand *JTMMOLd =
11242 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11243 MachineMemOperand::MOLoad, 4, Align(4));
11244 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11245 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11246 .addReg(NewVReg3, RegState::Kill)
11247 .addReg(NewVReg4)
11248 .addImm(0)
11249 .addMemOperand(JTMMOLd)
11251
11252 if (IsPositionIndependent) {
11253 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11254 .addReg(NewVReg5, RegState::Kill)
11255 .addReg(NewVReg4)
11256 .addJumpTableIndex(MJTI);
11257 } else {
11258 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11259 .addReg(NewVReg5, RegState::Kill)
11260 .addJumpTableIndex(MJTI);
11261 }
11262 }
11263
11264 // Add the jump table entries as successors to the MBB.
11265 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11266 for (MachineBasicBlock *CurMBB : LPadList) {
11267 if (SeenMBBs.insert(CurMBB).second)
11268 DispContBB->addSuccessor(CurMBB);
11269 }
11270
11271 // N.B. the order the invoke BBs are processed in doesn't matter here.
11272 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11273 SmallVector<MachineBasicBlock*, 64> MBBLPads;
11274 for (MachineBasicBlock *BB : InvokeBBs) {
11275
11276 // Remove the landing pad successor from the invoke block and replace it
11277 // with the new dispatch block.
11278 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11279 while (!Successors.empty()) {
11280 MachineBasicBlock *SMBB = Successors.pop_back_val();
11281 if (SMBB->isEHPad()) {
11282 BB->removeSuccessor(SMBB);
11283 MBBLPads.push_back(SMBB);
11284 }
11285 }
11286
11287 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11288 BB->normalizeSuccProbs();
11289
11290 // Find the invoke call and mark all of the callee-saved registers as
11291 // 'implicit defined' so that they're spilled. This prevents code from
11292 // moving instructions to before the EH block, where they will never be
11293 // executed.
11294 for (MachineBasicBlock::reverse_iterator
11295 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11296 if (!II->isCall()) continue;
11297
11298 DenseSet<unsigned> DefRegs;
11299 for (MachineInstr::mop_iterator
11300 OI = II->operands_begin(), OE = II->operands_end();
11301 OI != OE; ++OI) {
11302 if (!OI->isReg()) continue;
11303 DefRegs.insert(OI->getReg());
11304 }
11305
11306 MachineInstrBuilder MIB(*MF, &*II);
11307
11308 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11309 unsigned Reg = SavedRegs[i];
11310 if (Subtarget->isThumb2() &&
11311 !ARM::tGPRRegClass.contains(Reg) &&
11312 !ARM::hGPRRegClass.contains(Reg))
11313 continue;
11314 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11315 continue;
11316 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11317 continue;
11318 if (!DefRegs.contains(Reg))
11319 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
11320 }
11321
11322 break;
11323 }
11324 }
11325
11326 // Mark all former landing pads as non-landing pads. The dispatch is the only
11327 // landing pad now.
11328 for (MachineBasicBlock *MBBLPad : MBBLPads)
11329 MBBLPad->setIsEHPad(false);
11330
11331 // The instruction is gone now.
11332 MI.eraseFromParent();
11333}
11334
11335static
11336MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11337 for (MachineBasicBlock *S : MBB->successors())
11338 if (S != Succ)
11339 return S;
11340 llvm_unreachable("Expecting a BB with two successors!");
11341}
11342
11343/// Return the load opcode for a given load size. If load size >= 8,
11344/// neon opcode will be returned.
11345static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11346 if (LdSize >= 8)
11347 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11348 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11349 if (IsThumb1)
11350 return LdSize == 4 ? ARM::tLDRi
11351 : LdSize == 2 ? ARM::tLDRHi
11352 : LdSize == 1 ? ARM::tLDRBi : 0;
11353 if (IsThumb2)
11354 return LdSize == 4 ? ARM::t2LDR_POST
11355 : LdSize == 2 ? ARM::t2LDRH_POST
11356 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11357 return LdSize == 4 ? ARM::LDR_POST_IMM
11358 : LdSize == 2 ? ARM::LDRH_POST
11359 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11360}
11361
11362/// Return the store opcode for a given store size. If store size >= 8,
11363/// neon opcode will be returned.
11364static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11365 if (StSize >= 8)
11366 return StSize == 16 ? ARM::VST1q32wb_fixed
11367 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11368 if (IsThumb1)
11369 return StSize == 4 ? ARM::tSTRi
11370 : StSize == 2 ? ARM::tSTRHi
11371 : StSize == 1 ? ARM::tSTRBi : 0;
11372 if (IsThumb2)
11373 return StSize == 4 ? ARM::t2STR_POST
11374 : StSize == 2 ? ARM::t2STRH_POST
11375 : StSize == 1 ? ARM::t2STRB_POST : 0;
11376 return StSize == 4 ? ARM::STR_POST_IMM
11377 : StSize == 2 ? ARM::STRH_POST
11378 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11379}
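// Example of how the two tables above are used: a 16-byte aligned copy unit
// selects VLD1q32wb_fixed/VST1q32wb_fixed (post-incrementing NEON), while a
// Thumb2 4-byte unit uses t2LDR_POST/t2STR_POST; sizes the tables do not
// cover return 0 and are rejected by the callers' asserts.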
11380
11381/// Emit a post-increment load operation with given size. The instructions
11382/// will be added to BB at Pos.
11383static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11384 const TargetInstrInfo *TII, const DebugLoc &dl,
11385 unsigned LdSize, unsigned Data, unsigned AddrIn,
11386 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11387 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11388 assert(LdOpc != 0 && "Should have a load opcode");
11389 if (LdSize >= 8) {
11390 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11391 .addReg(AddrOut, RegState::Define)
11392 .addReg(AddrIn)
11393 .addImm(0)
11395 } else if (IsThumb1) {
11396 // load + update AddrIn
11397 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11398 .addReg(AddrIn)
11399 .addImm(0)
11401 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11402 .add(t1CondCodeOp())
11403 .addReg(AddrIn)
11404 .addImm(LdSize)
11406 } else if (IsThumb2) {
11407 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11408 .addReg(AddrOut, RegState::Define)
11409 .addReg(AddrIn)
11410 .addImm(LdSize)
11412 } else { // arm
11413 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11414 .addReg(AddrOut, RegState::Define)
11415 .addReg(AddrIn)
11416 .addReg(0)
11417 .addImm(LdSize)
11419 }
11420}
11421
11422/// Emit a post-increment store operation with given size. The instructions
11423/// will be added to BB at Pos.
11424static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11425 const TargetInstrInfo *TII, const DebugLoc &dl,
11426 unsigned StSize, unsigned Data, unsigned AddrIn,
11427 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11428 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11429 assert(StOpc != 0 && "Should have a store opcode");
11430 if (StSize >= 8) {
11431 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11432 .addReg(AddrIn)
11433 .addImm(0)
11434 .addReg(Data)
11436 } else if (IsThumb1) {
11437 // store + update AddrIn
11438 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11439 .addReg(Data)
11440 .addReg(AddrIn)
11441 .addImm(0)
11443 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11444 .add(t1CondCodeOp())
11445 .addReg(AddrIn)
11446 .addImm(StSize)
11448 } else if (IsThumb2) {
11449 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11450 .addReg(Data)
11451 .addReg(AddrIn)
11452 .addImm(StSize)
11454 } else { // arm
11455 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11456 .addReg(Data)
11457 .addReg(AddrIn)
11458 .addReg(0)
11459 .addImm(StSize)
11461 }
11462}
11463
11464MachineBasicBlock *
11465ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11466 MachineBasicBlock *BB) const {
11467 // This pseudo instruction has 3 operands: dst, src, size
11468 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11469 // Otherwise, we will generate unrolled scalar copies.
11470 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11471 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11472 MachineFunction::iterator It = ++BB->getIterator();
11473
11474 Register dest = MI.getOperand(0).getReg();
11475 Register src = MI.getOperand(1).getReg();
11476 unsigned SizeVal = MI.getOperand(2).getImm();
11477 unsigned Alignment = MI.getOperand(3).getImm();
11478 DebugLoc dl = MI.getDebugLoc();
11479
11480 MachineFunction *MF = BB->getParent();
11481 MachineRegisterInfo &MRI = MF->getRegInfo();
11482 unsigned UnitSize = 0;
11483 const TargetRegisterClass *TRC = nullptr;
11484 const TargetRegisterClass *VecTRC = nullptr;
11485
11486 bool IsThumb1 = Subtarget->isThumb1Only();
11487 bool IsThumb2 = Subtarget->isThumb2();
11488 bool IsThumb = Subtarget->isThumb();
11489
11490 if (Alignment & 1) {
11491 UnitSize = 1;
11492 } else if (Alignment & 2) {
11493 UnitSize = 2;
11494 } else {
11495 // Check whether we can use NEON instructions.
11496 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11497 Subtarget->hasNEON()) {
11498 if ((Alignment % 16 == 0) && SizeVal >= 16)
11499 UnitSize = 16;
11500 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11501 UnitSize = 8;
11502 }
11503 // Can't use NEON instructions.
11504 if (UnitSize == 0)
11505 UnitSize = 4;
11506 }
11507
11508 // Select the correct opcode and register class for unit size load/store
11509 bool IsNeon = UnitSize >= 8;
11510 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11511 if (IsNeon)
11512 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11513 : UnitSize == 8 ? &ARM::DPRRegClass
11514 : nullptr;
11515
11516 unsigned BytesLeft = SizeVal % UnitSize;
11517 unsigned LoopSize = SizeVal - BytesLeft;
11518
11519 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11520 // Use LDR and STR to copy.
11521 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11522 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11523 unsigned srcIn = src;
11524 unsigned destIn = dest;
11525 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11526 Register srcOut = MRI.createVirtualRegister(TRC);
11527 Register destOut = MRI.createVirtualRegister(TRC);
11528 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11529 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11530 IsThumb1, IsThumb2);
11531 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11532 IsThumb1, IsThumb2);
11533 srcIn = srcOut;
11534 destIn = destOut;
11535 }
11536
11537 // Handle the leftover bytes with LDRB and STRB.
11538 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11539 // [destOut] = STRB_POST(scratch, destIn, 1)
11540 for (unsigned i = 0; i < BytesLeft; i++) {
11541 Register srcOut = MRI.createVirtualRegister(TRC);
11542 Register destOut = MRI.createVirtualRegister(TRC);
11543 Register scratch = MRI.createVirtualRegister(TRC);
11544 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11545 IsThumb1, IsThumb2);
11546 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11547 IsThumb1, IsThumb2);
11548 srcIn = srcOut;
11549 destIn = destOut;
11550 }
11551 MI.eraseFromParent(); // The instruction is gone now.
11552 return BB;
11553 }
11554
11555 // Expand the pseudo op to a loop.
11556 // thisMBB:
11557 // ...
11558 // movw varEnd, # --> with thumb2
11559 // movt varEnd, #
11560 // ldrcp varEnd, idx --> without thumb2
11561 // fallthrough --> loopMBB
11562 // loopMBB:
11563 // PHI varPhi, varEnd, varLoop
11564 // PHI srcPhi, src, srcLoop
11565 // PHI destPhi, dst, destLoop
11566 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11567 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11568 // subs varLoop, varPhi, #UnitSize
11569 // bne loopMBB
11570 // fallthrough --> exitMBB
11571 // exitMBB:
11572 // epilogue to handle left-over bytes
11573 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11574 // [destOut] = STRB_POST(scratch, destLoop, 1)
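// Illustrative sketch (not part of the source): with Thumb2 + NEON and
// UnitSize = 16, the loop described above is intended to end up roughly as
//     movw    rCnt, #LoopSize
//   .Lloop:
//     vld1.8  {d0, d1}, [rSrc]!
//     vst1.8  {d0, d1}, [rDst]!
//     subs    rCnt, rCnt, #16
//     bne     .Lloop
// followed by ldrb/strb post-increment copies for any leftover bytes; the
// register names here are placeholders.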
11575 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11576 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11577 MF->insert(It, loopMBB);
11578 MF->insert(It, exitMBB);
11579
11580 // Set the call frame size on entry to the new basic blocks.
11581 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11582 loopMBB->setCallFrameSize(CallFrameSize);
11583 exitMBB->setCallFrameSize(CallFrameSize);
11584
11585 // Transfer the remainder of BB and its successor edges to exitMBB.
11586 exitMBB->splice(exitMBB->begin(), BB,
11587 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11589
11590 // Load an immediate to varEnd.
11591 Register varEnd = MRI.createVirtualRegister(TRC);
11592 if (Subtarget->useMovt()) {
11593 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11594 varEnd)
11595 .addImm(LoopSize);
11596 } else if (Subtarget->genExecuteOnly()) {
11597 assert(IsThumb && "Non-thumb expected to have used movt");
11598 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11599 } else {
11601 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11602 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11603
11604 // MachineConstantPool wants an explicit alignment.
11605 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11606 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11607 MachineMemOperand *CPMMO =
11610
11611 if (IsThumb)
11612 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11613 .addReg(varEnd, RegState::Define)
11616 .addMemOperand(CPMMO);
11617 else
11618 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11619 .addReg(varEnd, RegState::Define)
11621 .addImm(0)
11623 .addMemOperand(CPMMO);
11624 }
11625 BB->addSuccessor(loopMBB);
11626
11627 // Generate the loop body:
11628 // varPhi = PHI(varLoop, varEnd)
11629 // srcPhi = PHI(srcLoop, src)
11630 // destPhi = PHI(destLoop, dst)
11631 MachineBasicBlock *entryBB = BB;
11632 BB = loopMBB;
11633 Register varLoop = MRI.createVirtualRegister(TRC);
11634 Register varPhi = MRI.createVirtualRegister(TRC);
11635 Register srcLoop = MRI.createVirtualRegister(TRC);
11636 Register srcPhi = MRI.createVirtualRegister(TRC);
11637 Register destLoop = MRI.createVirtualRegister(TRC);
11638 Register destPhi = MRI.createVirtualRegister(TRC);
11639
11640 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11641 .addReg(varLoop).addMBB(loopMBB)
11642 .addReg(varEnd).addMBB(entryBB);
11643 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11644 .addReg(srcLoop).addMBB(loopMBB)
11645 .addReg(src).addMBB(entryBB);
11646 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11647 .addReg(destLoop).addMBB(loopMBB)
11648 .addReg(dest).addMBB(entryBB);
11649
11650 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11651 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11652 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11653 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11654 IsThumb1, IsThumb2);
11655 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11656 IsThumb1, IsThumb2);
11657
11658 // Decrement loop variable by UnitSize.
11659 if (IsThumb1) {
11660 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11661 .add(t1CondCodeOp())
11662 .addReg(varPhi)
11663 .addImm(UnitSize)
11665 } else {
11667 BuildMI(*BB, BB->end(), dl,
11668 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11669 MIB.addReg(varPhi)
11670 .addImm(UnitSize)
11672 .add(condCodeOp());
11673 MIB->getOperand(5).setReg(ARM::CPSR);
11674 MIB->getOperand(5).setIsDef(true);
11675 }
11676 BuildMI(*BB, BB->end(), dl,
11677 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11678 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11679
11680 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11681 BB->addSuccessor(loopMBB);
11682 BB->addSuccessor(exitMBB);
11683
11684 // Add epilogue to handle BytesLeft.
11685 BB = exitMBB;
11686 auto StartOfExit = exitMBB->begin();
11687
11688 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11689 // [destOut] = STRB_POST(scratch, destLoop, 1)
11690 unsigned srcIn = srcLoop;
11691 unsigned destIn = destLoop;
11692 for (unsigned i = 0; i < BytesLeft; i++) {
11693 Register srcOut = MRI.createVirtualRegister(TRC);
11694 Register destOut = MRI.createVirtualRegister(TRC);
11695 Register scratch = MRI.createVirtualRegister(TRC);
11696 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11697 IsThumb1, IsThumb2);
11698 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11699 IsThumb1, IsThumb2);
11700 srcIn = srcOut;
11701 destIn = destOut;
11702 }
11703
11704 MI.eraseFromParent(); // The instruction is gone now.
11705 return BB;
11706}
11707
11708 MachineBasicBlock *
11709 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11710 MachineBasicBlock *MBB) const {
11712 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11713 DebugLoc DL = MI.getDebugLoc();
11714
11715 assert(Subtarget->isTargetWindows() &&
11716 "__chkstk is only supported on Windows");
11717 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11718
11719 // __chkstk takes the number of words to allocate on the stack in R4, and
11720 // returns the stack adjustment in number of bytes in R4. This will not
11721 // clobber any other registers (other than the obvious lr).
11722 //
11723 // Although, technically, IP should be considered a register which may be
11724 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11725 // thumb-2 environment, so there is no interworking required. As a result, we
11726 // do not expect a veneer to be emitted by the linker, clobbering IP.
11727 //
11728 // Each module receives its own copy of __chkstk, so no import thunk is
11729 // required, again, ensuring that IP is not clobbered.
11730 //
11731 // Finally, although some linkers may theoretically provide a trampoline for
11732 // out of range calls (which is quite common due to a 32M range limitation of
11733 // branches for Thumb), we can generate the long-call version via
11734 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11735 // IP.
11736
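// Illustrative emitted sequence (not from the source), small code model, with
// the word count already in r4:
//     bl    __chkstk        ; probes the stack, returns the byte count in r4
//     sub.w sp, sp, r4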
11737 switch (TM.getCodeModel()) {
11738 case CodeModel::Tiny:
11739 llvm_unreachable("Tiny code model not available on ARM.");
11740 case CodeModel::Small:
11741 case CodeModel::Medium:
11742 case CodeModel::Kernel:
11743 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11745 .addExternalSymbol("__chkstk")
11748 .addReg(ARM::R12,
11750 .addReg(ARM::CPSR,
11752 break;
11753 case CodeModel::Large: {
11755 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11756
11757 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11758 .addExternalSymbol("__chkstk");
11761 .addReg(Reg, RegState::Kill)
11764 .addReg(ARM::R12,
11766 .addReg(ARM::CPSR,
11768 break;
11769 }
11770 }
11771
11772 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11773 .addReg(ARM::SP, RegState::Kill)
11774 .addReg(ARM::R4, RegState::Kill)
11777 .add(condCodeOp());
11778
11779 MI.eraseFromParent();
11780 return MBB;
11781}
11782
11783 MachineBasicBlock *
11784 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11785 MachineBasicBlock *MBB) const {
11786 DebugLoc DL = MI.getDebugLoc();
11787 MachineFunction *MF = MBB->getParent();
11788 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11789
11791 MF->insert(++MBB->getIterator(), ContBB);
11792 ContBB->splice(ContBB->begin(), MBB,
11793 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11795 MBB->addSuccessor(ContBB);
11796
11798 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11799 MF->push_back(TrapBB);
11800 MBB->addSuccessor(TrapBB);
11801
11802 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11803 .addReg(MI.getOperand(0).getReg())
11804 .addImm(0)
11806 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11807 .addMBB(TrapBB)
11809 .addReg(ARM::CPSR);
11810
11811 MI.eraseFromParent();
11812 return ContBB;
11813}
11814
11815// The CPSR operand of SelectItr might be missing a kill marker
11816// because there were multiple uses of CPSR, and ISel didn't know
11817// which to mark. Figure out whether SelectItr should have had a
11818// kill marker, and set it if it should. Returns the correct kill
11819// marker value.
11820 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11821 MachineBasicBlock* BB,
11822 const TargetRegisterInfo* TRI) {
11823 // Scan forward through BB for a use/def of CPSR.
11824 MachineBasicBlock::iterator miI(std::next(SelectItr));
11825 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11826 const MachineInstr& mi = *miI;
11827 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11828 return false;
11829 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11830 break; // Should have kill-flag - update below.
11831 }
11832
11833 // If we hit the end of the block, check whether CPSR is live into a
11834 // successor.
11835 if (miI == BB->end()) {
11836 for (MachineBasicBlock *Succ : BB->successors())
11837 if (Succ->isLiveIn(ARM::CPSR))
11838 return false;
11839 }
11840
11841 // We found a def, or hit the end of the basic block and CPSR wasn't live
11842 // out. SelectMI should have a kill flag on CPSR.
11843 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11844 return true;
11845}
11846
11847/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11848/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
11849 static Register genTPEntry(MachineBasicBlock *TpEntry,
11850 MachineBasicBlock *TpLoopBody,
11851 MachineBasicBlock *TpExit, Register OpSizeReg,
11852 const TargetInstrInfo *TII, DebugLoc Dl,
11853 MachineRegisterInfo &MRI) {
11854 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
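// Illustrative: for n = 100 bytes this gives (100 + 15) >> 4 = 7 iterations,
// with the final iteration tail-predicated down to the remaining 4 lanes.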
11855 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11856 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11857 .addUse(OpSizeReg)
11858 .addImm(15)
11860 .addReg(0);
11861
11862 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11863 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11864 .addUse(AddDestReg, RegState::Kill)
11865 .addImm(4)
11867 .addReg(0);
11868
11869 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11870 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11871 .addUse(LsrDestReg, RegState::Kill);
11872
11873 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11874 .addUse(TotalIterationsReg)
11875 .addMBB(TpExit);
11876
11877 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11878 .addMBB(TpLoopBody)
11880
11881 return TotalIterationsReg;
11882}
11883
11884/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11885/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11886/// loops.
11887static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11888 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11889 const TargetInstrInfo *TII, DebugLoc Dl,
11890 MachineRegisterInfo &MRI, Register OpSrcReg,
11891 Register OpDestReg, Register ElementCountReg,
11892 Register TotalIterationsReg, bool IsMemcpy) {
11893 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11894 // array, loop iteration counter, predication counter.
11895
11896 Register SrcPhiReg, CurrSrcReg;
11897 if (IsMemcpy) {
11898 // Current position in the src array
11899 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11900 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11901 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11902 .addUse(OpSrcReg)
11903 .addMBB(TpEntry)
11904 .addUse(CurrSrcReg)
11905 .addMBB(TpLoopBody);
11906 }
11907
11908 // Current position in the dest array
11909 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11910 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11911 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11912 .addUse(OpDestReg)
11913 .addMBB(TpEntry)
11914 .addUse(CurrDestReg)
11915 .addMBB(TpLoopBody);
11916
11917 // Current loop counter
11918 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11919 Register RemainingLoopIterationsReg =
11920 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11921 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11922 .addUse(TotalIterationsReg)
11923 .addMBB(TpEntry)
11924 .addUse(RemainingLoopIterationsReg)
11925 .addMBB(TpLoopBody);
11926
11927 // Predication counter
11928 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11929 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11930 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11931 .addUse(ElementCountReg)
11932 .addMBB(TpEntry)
11933 .addUse(RemainingElementsReg)
11934 .addMBB(TpLoopBody);
11935
11936 // Pass predication counter to VCTP
11937 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11938 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11939 .addUse(PredCounterPhiReg)
11941 .addReg(0)
11942 .addReg(0);
11943
11944 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11945 .addUse(PredCounterPhiReg)
11946 .addImm(16)
11948 .addReg(0);
11949
11950 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11951 Register SrcValueReg;
11952 if (IsMemcpy) {
11953 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11954 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11955 .addDef(CurrSrcReg)
11956 .addDef(SrcValueReg)
11957 .addReg(SrcPhiReg)
11958 .addImm(16)
11960 .addUse(VccrReg)
11961 .addReg(0);
11962 } else
11963 SrcValueReg = OpSrcReg;
11964
11965 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11966 .addDef(CurrDestReg)
11967 .addUse(SrcValueReg)
11968 .addReg(DestPhiReg)
11969 .addImm(16)
11971 .addUse(VccrReg)
11972 .addReg(0);
11973
11974 // Add the pseudoInstrs for decrementing the loop counter and marking the
11975 // end:t2DoLoopDec and t2DoLoopEnd
11976 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11977 .addUse(LoopCounterPhiReg)
11978 .addImm(1);
11979
11980 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11981 .addUse(RemainingLoopIterationsReg)
11982 .addMBB(TpLoopBody);
11983
11984 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11985 .addMBB(TpExit)
11987}
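// Illustrative sketch (not from the source): once later passes (e.g.
// ARMLowOverheadLoops) consume the pseudos built above, a memcpy expansion is
// intended to look roughly like
//     wlstp.8   lr, rSize, .Lexit
//   .Lloop:
//     vldrb.u8  q0, [rSrc], #16
//     vstrb.8   q0, [rDst], #16
//     letp      lr, .Lloop
//   .Lexit:
// where the register names are placeholders.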
11988
11989 MachineBasicBlock *
11990 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
11991 MachineBasicBlock *BB) const {
11992 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11993 DebugLoc dl = MI.getDebugLoc();
11994 bool isThumb2 = Subtarget->isThumb2();
11995 switch (MI.getOpcode()) {
11996 default: {
11997 MI.print(errs());
11998 llvm_unreachable("Unexpected instr type to insert");
11999 }
12000
12001 // Thumb1 post-indexed loads are really just single-register LDMs.
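// (Illustrative: a post-indexed load such as "ldr r0, [r1], #4" is equivalent
// to "ldm r1!, {r0}", which is why the pseudo below is rewritten to
// tLDMIA_UPD.)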
12002 case ARM::tLDR_postidx: {
12003 MachineOperand Def(MI.getOperand(1));
12004 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12005 .add(Def) // Rn_wb
12006 .add(MI.getOperand(2)) // Rn
12007 .add(MI.getOperand(3)) // PredImm
12008 .add(MI.getOperand(4)) // PredReg
12009 .add(MI.getOperand(0)) // Rt
12010 .cloneMemRefs(MI);
12011 MI.eraseFromParent();
12012 return BB;
12013 }
12014
12015 case ARM::MVE_MEMCPYLOOPINST:
12016 case ARM::MVE_MEMSETLOOPINST: {
12017
12018 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12019 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12020 // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
12021 // adds the relevant instructions in the TP loop Body for generation of a
12022 // WLSTP loop.
12023
12024 // Below is relevant portion of the CFG after the transformation.
12025 // The Machine Basic Blocks are shown along with branch conditions (in
12026 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12027 // portion of the CFG and may not necessarily be the entry/exit of the
12028 // function.
12029
12030 // (Relevant) CFG after transformation:
12031 //              TP entry MBB
12032 //                    |
12033 //                    |-----------------|
12034 //                 (n <= 0)          (n > 0)
12035 //                    |                 |
12036 //                    |        TP loop Body MBB <--|
12037 //                    |                 |          |
12038 //                     \                |__________|
12039 //                      \              /
12040 //                        TP exit MBB
12041
12042 MachineFunction *MF = BB->getParent();
12043 MachineFunctionProperties &Properties = MF->getProperties();
12045
12046 Register OpDestReg = MI.getOperand(0).getReg();
12047 Register OpSrcReg = MI.getOperand(1).getReg();
12048 Register OpSizeReg = MI.getOperand(2).getReg();
12049
12050 // Allocate the required MBBs and add to parent function.
12051 MachineBasicBlock *TpEntry = BB;
12052 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12053 MachineBasicBlock *TpExit;
12054
12055 MF->push_back(TpLoopBody);
12056
12057 // If any instructions are present in the current block after
12058 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12059 // move the instructions into the newly created exit block. If there are no
12060 // instructions add an explicit branch to the FallThrough block and then
12061 // split.
12062 //
12063 // The split is required for two reasons:
12064 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12065 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12066 // need to be updated. splitAt() already handles this.
12067 TpExit = BB->splitAt(MI, false);
12068 if (TpExit == BB) {
12069 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12070 "block containing memcpy/memset Pseudo");
12071 TpExit = BB->getFallThrough();
12072 BuildMI(BB, dl, TII->get(ARM::t2B))
12073 .addMBB(TpExit)
12075 TpExit = BB->splitAt(MI, false);
12076 }
12077
12078 // Add logic for iteration count
12079 Register TotalIterationsReg =
12080 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12081
12082 // Add the vectorized (and predicated) loads/store instructions
12083 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12084 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12085 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12086
12087 // Required to avoid conflict with the MachineVerifier during testing.
12088 Properties.resetNoPHIs();
12089
12090 // Connect the blocks
12091 TpEntry->addSuccessor(TpLoopBody);
12092 TpLoopBody->addSuccessor(TpLoopBody);
12093 TpLoopBody->addSuccessor(TpExit);
12094
12095 // Reorder for a more natural layout
12096 TpLoopBody->moveAfter(TpEntry);
12097 TpExit->moveAfter(TpLoopBody);
12098
12099 // Finally, remove the memcpy Pseudo Instruction
12100 MI.eraseFromParent();
12101
12102 // Return the exit block as it may contain other instructions requiring a
12103 // custom inserter
12104 return TpExit;
12105 }
12106
12107 // The Thumb2 pre-indexed stores have the same MI operands, they just
12108 // define them differently in the .td files from the isel patterns, so
12109 // they need pseudos.
12110 case ARM::t2STR_preidx:
12111 MI.setDesc(TII->get(ARM::t2STR_PRE));
12112 return BB;
12113 case ARM::t2STRB_preidx:
12114 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12115 return BB;
12116 case ARM::t2STRH_preidx:
12117 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12118 return BB;
12119
12120 case ARM::STRi_preidx:
12121 case ARM::STRBi_preidx: {
12122 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12123 : ARM::STRB_PRE_IMM;
12124 // Decode the offset.
12125 unsigned Offset = MI.getOperand(4).getImm();
12126 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12128 if (isSub)
12129 Offset = -Offset;
12130
12131 MachineMemOperand *MMO = *MI.memoperands_begin();
12132 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12133 .add(MI.getOperand(0)) // Rn_wb
12134 .add(MI.getOperand(1)) // Rt
12135 .add(MI.getOperand(2)) // Rn
12136 .addImm(Offset) // offset (skip GPR==zero_reg)
12137 .add(MI.getOperand(5)) // pred
12138 .add(MI.getOperand(6))
12139 .addMemOperand(MMO);
12140 MI.eraseFromParent();
12141 return BB;
12142 }
12143 case ARM::STRr_preidx:
12144 case ARM::STRBr_preidx:
12145 case ARM::STRH_preidx: {
12146 unsigned NewOpc;
12147 switch (MI.getOpcode()) {
12148 default: llvm_unreachable("unexpected opcode!");
12149 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12150 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12151 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12152 }
12153 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12154 for (const MachineOperand &MO : MI.operands())
12155 MIB.add(MO);
12156 MI.eraseFromParent();
12157 return BB;
12158 }
12159
12160 case ARM::tMOVCCr_pseudo: {
12161 // To "insert" a SELECT_CC instruction, we actually have to insert the
12162 // diamond control-flow pattern. The incoming instruction knows the
12163 // destination vreg to set, the condition code register to branch on, the
12164 // true/false values to select between, and a branch opcode to use.
12165 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12167
12168 // thisMBB:
12169 // ...
12170 // TrueVal = ...
12171 // cmpTY ccX, r1, r2
12172 // bCC copy1MBB
12173 // fallthrough --> copy0MBB
12174 MachineBasicBlock *thisMBB = BB;
12175 MachineFunction *F = BB->getParent();
12176 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12177 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12178 F->insert(It, copy0MBB);
12179 F->insert(It, sinkMBB);
12180
12181 // Set the call frame size on entry to the new basic blocks.
12182 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12183 copy0MBB->setCallFrameSize(CallFrameSize);
12184 sinkMBB->setCallFrameSize(CallFrameSize);
12185
12186 // Check whether CPSR is live past the tMOVCCr_pseudo.
12187 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12188 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12189 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12190 copy0MBB->addLiveIn(ARM::CPSR);
12191 sinkMBB->addLiveIn(ARM::CPSR);
12192 }
12193
12194 // Transfer the remainder of BB and its successor edges to sinkMBB.
12195 sinkMBB->splice(sinkMBB->begin(), BB,
12196 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12198
12199 BB->addSuccessor(copy0MBB);
12200 BB->addSuccessor(sinkMBB);
12201
12202 BuildMI(BB, dl, TII->get(ARM::tBcc))
12203 .addMBB(sinkMBB)
12204 .addImm(MI.getOperand(3).getImm())
12205 .addReg(MI.getOperand(4).getReg());
12206
12207 // copy0MBB:
12208 // %FalseValue = ...
12209 // # fallthrough to sinkMBB
12210 BB = copy0MBB;
12211
12212 // Update machine-CFG edges
12213 BB->addSuccessor(sinkMBB);
12214
12215 // sinkMBB:
12216 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12217 // ...
12218 BB = sinkMBB;
12219 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12220 .addReg(MI.getOperand(1).getReg())
12221 .addMBB(copy0MBB)
12222 .addReg(MI.getOperand(2).getReg())
12223 .addMBB(thisMBB);
12224
12225 MI.eraseFromParent(); // The pseudo instruction is gone now.
12226 return BB;
12227 }
12228
12229 case ARM::BCCi64:
12230 case ARM::BCCZi64: {
12231 // If there is an unconditional branch to the other successor, remove it.
12232 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12233
12234 // Compare both parts that make up the double comparison separately for
12235 // equality.
12236 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12237
12238 Register LHS1 = MI.getOperand(1).getReg();
12239 Register LHS2 = MI.getOperand(2).getReg();
12240 if (RHSisZero) {
12241 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12242 .addReg(LHS1)
12243 .addImm(0)
12245 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12246 .addReg(LHS2).addImm(0)
12247 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12248 } else {
12249 Register RHS1 = MI.getOperand(3).getReg();
12250 Register RHS2 = MI.getOperand(4).getReg();
12251 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12252 .addReg(LHS1)
12253 .addReg(RHS1)
12255 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12256 .addReg(LHS2).addReg(RHS2)
12257 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12258 }
12259
12260 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12261 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12262 if (MI.getOperand(0).getImm() == ARMCC::NE)
12263 std::swap(destMBB, exitMBB);
12264
12265 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12266 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12267 if (isThumb2)
12268 BuildMI(BB, dl, TII->get(ARM::t2B))
12269 .addMBB(exitMBB)
12271 else
12272 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12273
12274 MI.eraseFromParent(); // The pseudo instruction is gone now.
12275 return BB;
12276 }
12277
12278 case ARM::Int_eh_sjlj_setjmp:
12279 case ARM::Int_eh_sjlj_setjmp_nofp:
12280 case ARM::tInt_eh_sjlj_setjmp:
12281 case ARM::t2Int_eh_sjlj_setjmp:
12282 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12283 return BB;
12284
12285 case ARM::Int_eh_sjlj_setup_dispatch:
12286 EmitSjLjDispatchBlock(MI, BB);
12287 return BB;
12288
12289 case ARM::ABS:
12290 case ARM::t2ABS: {
12291 // To insert an ABS instruction, we have to insert the
12292 // diamond control-flow pattern. The incoming instruction knows the
12293 // source vreg to test against 0, the destination vreg to set,
12294 // the condition code register to branch on, the
12295 // true/false values to select between, and a branch opcode to use.
12296 // It transforms
12297 // V1 = ABS V0
12298 // into
12299 // V2 = MOVS V0
12300 // BCC (branch to SinkBB if V0 >= 0)
12301 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
12302 // SinkBB: V1 = PHI(V2, V3)
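// Illustrative end result after if-conversion (not from the source), assuming
// the value lives in r0:
//     cmp   r0, #0
//     it    mi              ; Thumb2 only
//     rsbmi r0, r0, #0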
12303 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12305 MachineFunction *Fn = BB->getParent();
12306 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12307 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12308 Fn->insert(BBI, RSBBB);
12309 Fn->insert(BBI, SinkBB);
12310
12311 // Set the call frame size on entry to the new basic blocks.
12312 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12313 RSBBB->setCallFrameSize(CallFrameSize);
12314 SinkBB->setCallFrameSize(CallFrameSize);
12315
12316 Register ABSSrcReg = MI.getOperand(1).getReg();
12317 Register ABSDstReg = MI.getOperand(0).getReg();
12318 bool ABSSrcKIll = MI.getOperand(1).isKill();
12319 bool isThumb2 = Subtarget->isThumb2();
12321 // In Thumb mode S must not be specified if source register is the SP or
12322 // PC and if destination register is the SP, so restrict register class
12323 Register NewRsbDstReg = MRI.createVirtualRegister(
12324 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
12325
12326 // Transfer the remainder of BB and its successor edges to sinkMBB.
12327 SinkBB->splice(SinkBB->begin(), BB,
12328 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12330
12331 BB->addSuccessor(RSBBB);
12332 BB->addSuccessor(SinkBB);
12333
12334 // fall through to SinkMBB
12335 RSBBB->addSuccessor(SinkBB);
12336
12337 // insert a cmp at the end of BB
12338 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12339 .addReg(ABSSrcReg)
12340 .addImm(0)
12342
12343 // insert a bcc with opposite CC to ARMCC::MI at the end of BB
12344 BuildMI(BB, dl,
12345 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
12347
12348 // insert rsbri in RSBBB
12349 // Note: BCC and rsbri will be converted into predicated rsbmi
12350 // by if-conversion pass
12351 BuildMI(*RSBBB, RSBBB->begin(), dl,
12352 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
12353 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
12354 .addImm(0)
12356 .add(condCodeOp());
12357
12358 // insert PHI in SinkBB,
12359 // reuse ABSDstReg to not change uses of ABS instruction
12360 BuildMI(*SinkBB, SinkBB->begin(), dl,
12361 TII->get(ARM::PHI), ABSDstReg)
12362 .addReg(NewRsbDstReg).addMBB(RSBBB)
12363 .addReg(ABSSrcReg).addMBB(BB);
12364
12365 // remove ABS instruction
12366 MI.eraseFromParent();
12367
12368 // return last added BB
12369 return SinkBB;
12370 }
12371 case ARM::COPY_STRUCT_BYVAL_I32:
12372 ++NumLoopByVals;
12373 return EmitStructByval(MI, BB);
12374 case ARM::WIN__CHKSTK:
12375 return EmitLowered__chkstk(MI, BB);
12376 case ARM::WIN__DBZCHK:
12377 return EmitLowered__dbzchk(MI, BB);
12378 }
12379}
12380
12381/// Attaches vregs to MEMCPY that it will use as scratch registers
12382/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12383/// instead of as a custom inserter because we need the use list from the SDNode.
12384static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12385 MachineInstr &MI, const SDNode *Node) {
12386 bool isThumb1 = Subtarget->isThumb1Only();
12387
12388 MachineFunction *MF = MI.getParent()->getParent();
12390 MachineInstrBuilder MIB(*MF, MI);
12391
12392 // If the new dst/src is unused mark it as dead.
12393 if (!Node->hasAnyUseOfValue(0)) {
12394 MI.getOperand(0).setIsDead(true);
12395 }
12396 if (!Node->hasAnyUseOfValue(1)) {
12397 MI.getOperand(1).setIsDead(true);
12398 }
12399
12400 // The MEMCPY both defines and kills the scratch registers.
12401 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12402 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12403 : &ARM::GPRRegClass);
12405 }
12406}
12407
12408 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12409 SDNode *Node) const {
12410 if (MI.getOpcode() == ARM::MEMCPY) {
12411 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12412 return;
12413 }
12414
12415 const MCInstrDesc *MCID = &MI.getDesc();
12416 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12417 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12418 // operand is still set to noreg. If needed, set the optional operand's
12419 // register to CPSR, and remove the redundant implicit def.
12420 //
12421 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12422
12423 // Rename pseudo opcodes.
12424 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12425 unsigned ccOutIdx;
12426 if (NewOpc) {
12427 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12428 MCID = &TII->get(NewOpc);
12429
12430 assert(MCID->getNumOperands() ==
12431 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12432 && "converted opcode should be the same except for cc_out"
12433 " (and, on Thumb1, pred)");
12434
12435 MI.setDesc(*MCID);
12436
12437 // Add the optional cc_out operand
12438 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12439
12440 // On Thumb1, move all input operands to the end, then add the predicate
12441 if (Subtarget->isThumb1Only()) {
12442 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12443 MI.addOperand(MI.getOperand(1));
12444 MI.removeOperand(1);
12445 }
12446
12447 // Restore the ties
12448 for (unsigned i = MI.getNumOperands(); i--;) {
12449 const MachineOperand& op = MI.getOperand(i);
12450 if (op.isReg() && op.isUse()) {
12451 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12452 if (DefIdx != -1)
12453 MI.tieOperands(DefIdx, i);
12454 }
12455 }
12456
12458 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12459 ccOutIdx = 1;
12460 } else
12461 ccOutIdx = MCID->getNumOperands() - 1;
12462 } else
12463 ccOutIdx = MCID->getNumOperands() - 1;
12464
12465 // Any ARM instruction that sets the 's' bit should specify an optional
12466 // "cc_out" operand in the last operand position.
12467 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12468 assert(!NewOpc && "Optional cc_out operand required");
12469 return;
12470 }
12471 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12472 // since we already have an optional CPSR def.
12473 bool definesCPSR = false;
12474 bool deadCPSR = false;
12475 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12476 ++i) {
12477 const MachineOperand &MO = MI.getOperand(i);
12478 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12479 definesCPSR = true;
12480 if (MO.isDead())
12481 deadCPSR = true;
12482 MI.removeOperand(i);
12483 break;
12484 }
12485 }
12486 if (!definesCPSR) {
12487 assert(!NewOpc && "Optional cc_out operand required");
12488 return;
12489 }
12490 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12491 if (deadCPSR) {
12492 assert(!MI.getOperand(ccOutIdx).getReg() &&
12493 "expect uninitialized optional cc_out operand");
12494 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12495 if (!Subtarget->isThumb1Only())
12496 return;
12497 }
12498
12499 // If this instruction was defined with an optional CPSR def and its dag node
12500 // had a live implicit CPSR def, then activate the optional CPSR def.
12501 MachineOperand &MO = MI.getOperand(ccOutIdx);
12502 MO.setReg(ARM::CPSR);
12503 MO.setIsDef(true);
12504}
12505
12506//===----------------------------------------------------------------------===//
12507// ARM Optimization Hooks
12508//===----------------------------------------------------------------------===//
12509
12510// Helper function that checks if N is a null or all ones constant.
12511static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12512 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
12513}
12514
12515// Return true if N is conditionally 0 or all ones.
12516// Detects these expressions where cc is an i1 value:
12517//
12518// (select cc 0, y) [AllOnes=0]
12519// (select cc y, 0) [AllOnes=0]
12520// (zext cc) [AllOnes=0]
12521// (sext cc) [AllOnes=0/1]
12522// (select cc -1, y) [AllOnes=1]
12523// (select cc y, -1) [AllOnes=1]
12524//
12525// Invert is set when N is the null/all ones constant when CC is false.
12526// OtherOp is set to the alternative value of N.
12527 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12528 SDValue &CC, bool &Invert,
12529 SDValue &OtherOp,
12530 SelectionDAG &DAG) {
12531 switch (N->getOpcode()) {
12532 default: return false;
12533 case ISD::SELECT: {
12534 CC = N->getOperand(0);
12535 SDValue N1 = N->getOperand(1);
12536 SDValue N2 = N->getOperand(2);
12537 if (isZeroOrAllOnes(N1, AllOnes)) {
12538 Invert = false;
12539 OtherOp = N2;
12540 return true;
12541 }
12542 if (isZeroOrAllOnes(N2, AllOnes)) {
12543 Invert = true;
12544 OtherOp = N1;
12545 return true;
12546 }
12547 return false;
12548 }
12549 case ISD::ZERO_EXTEND:
12550 // (zext cc) can never be the all ones value.
12551 if (AllOnes)
12552 return false;
12553 [[fallthrough]];
12554 case ISD::SIGN_EXTEND: {
12555 SDLoc dl(N);
12556 EVT VT = N->getValueType(0);
12557 CC = N->getOperand(0);
12558 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12559 return false;
12560 Invert = !AllOnes;
12561 if (AllOnes)
12562 // When looking for an AllOnes constant, N is an sext, and the 'other'
12563 // value is 0.
12564 OtherOp = DAG.getConstant(0, dl, VT);
12565 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12566 // When looking for a 0 constant, N can be zext or sext.
12567 OtherOp = DAG.getConstant(1, dl, VT);
12568 else
12569 OtherOp = DAG.getAllOnesConstant(dl, VT);
12570 return true;
12571 }
12572 }
12573}
12574
12575// Combine a constant select operand into its use:
12576//
12577// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12578// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12579// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12580// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12581// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12582//
12583// The transform is rejected if the select doesn't have a constant operand that
12584// is null, or all ones when AllOnes is set.
12585//
12586// Also recognize sext/zext from i1:
12587//
12588// (add (zext cc), x) -> (select cc (add x, 1), x)
12589// (add (sext cc), x) -> (select cc (add x, -1), x)
12590//
12591// These transformations eventually create predicated instructions.
12592//
12593// @param N The node to transform.
12594// @param Slct The N operand that is a select.
12595// @param OtherOp The other N operand (x above).
12596// @param DCI Context.
12597// @param AllOnes Require the select constant to be all ones instead of null.
12598// @returns The new node, or SDValue() on failure.
12599static
12600 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12601 TargetLowering::DAGCombinerInfo &DCI,
12602 bool AllOnes = false) {
12603 SelectionDAG &DAG = DCI.DAG;
12604 EVT VT = N->getValueType(0);
12605 SDValue NonConstantVal;
12606 SDValue CCOp;
12607 bool SwapSelectOps;
12608 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12609 NonConstantVal, DAG))
12610 return SDValue();
12611
12612 // Slct is now known to be the desired identity constant when CC is true.
12613 SDValue TrueVal = OtherOp;
12614 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12615 OtherOp, NonConstantVal);
12616 // Unless SwapSelectOps says CC should be false.
12617 if (SwapSelectOps)
12618 std::swap(TrueVal, FalseVal);
12619
12620 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12621 CCOp, TrueVal, FalseVal);
12622}
12623
12624// Attempt combineSelectAndUse on each operand of a commutative operator N.
12625static
12626 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12627 TargetLowering::DAGCombinerInfo &DCI) {
12628 SDValue N0 = N->getOperand(0);
12629 SDValue N1 = N->getOperand(1);
12630 if (N0.getNode()->hasOneUse())
12631 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12632 return Result;
12633 if (N1.getNode()->hasOneUse())
12634 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12635 return Result;
12636 return SDValue();
12637}
12638
12639 static bool IsVUZPShuffleNode(SDNode *N) {
12640 // VUZP shuffle node.
12641 if (N->getOpcode() == ARMISD::VUZP)
12642 return true;
12643
12644 // "VUZP" on i32 is an alias for VTRN.
12645 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12646 return true;
12647
12648 return false;
12649}
12650
12653 const ARMSubtarget *Subtarget) {
12654 // Look for ADD(VUZP.0, VUZP.1).
12655 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12656 N0 == N1)
12657 return SDValue();
12658
12659 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12660 if (!N->getValueType(0).is64BitVector())
12661 return SDValue();
12662
12663 // Generate vpadd.
12664 SelectionDAG &DAG = DCI.DAG;
12665 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12666 SDLoc dl(N);
12667 SDNode *Unzip = N0.getNode();
12668 EVT VT = N->getValueType(0);
12669
12671 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12672 TLI.getPointerTy(DAG.getDataLayout())));
12673 Ops.push_back(Unzip->getOperand(0));
12674 Ops.push_back(Unzip->getOperand(1));
12675
12676 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12677}
12678
12681 const ARMSubtarget *Subtarget) {
12682 // Check for two extended operands.
12683 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12684 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12685 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12686 N1.getOpcode() == ISD::ZERO_EXTEND))
12687 return SDValue();
12688
12689 SDValue N00 = N0.getOperand(0);
12690 SDValue N10 = N1.getOperand(0);
12691
12692 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12693 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12694 N00 == N10)
12695 return SDValue();
12696
12697 // We only recognize Q register paddl here; this can't be reached until
12698 // after type legalization.
12699 if (!N00.getValueType().is64BitVector() ||
12701 return SDValue();
12702
12703 // Generate vpaddl.
12704 SelectionDAG &DAG = DCI.DAG;
12705 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12706 SDLoc dl(N);
12707 EVT VT = N->getValueType(0);
12708
12710 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12711 unsigned Opcode;
12712 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12713 Opcode = Intrinsic::arm_neon_vpaddls;
12714 else
12715 Opcode = Intrinsic::arm_neon_vpaddlu;
12716 Ops.push_back(DAG.getConstant(Opcode, dl,
12717 TLI.getPointerTy(DAG.getDataLayout())));
12718 EVT ElemTy = N00.getValueType().getVectorElementType();
12719 unsigned NumElts = VT.getVectorNumElements();
12720 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12721 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12722 N00.getOperand(0), N00.getOperand(1));
12723 Ops.push_back(Concat);
12724
12725 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12726}
12727
12728// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12729// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12730// much easier to match.
12731static SDValue
12734 const ARMSubtarget *Subtarget) {
12735 // Only perform optimization if after legalize, and if NEON is available. We
12736 // also expect both operands to be BUILD_VECTORs.
12737 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12738 || N0.getOpcode() != ISD::BUILD_VECTOR
12739 || N1.getOpcode() != ISD::BUILD_VECTOR)
12740 return SDValue();
12741
12742 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12743 EVT VT = N->getValueType(0);
12744 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12745 return SDValue();
12746
12747 // Check that the vector operands are of the right form.
12748 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12749 // operands, where N is the size of the formed vector.
12750 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12751 // index such that we have a pairwise add pattern.
12752
12753 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12755 return SDValue();
12756 SDValue Vec = N0->getOperand(0)->getOperand(0);
12757 SDNode *V = Vec.getNode();
12758 unsigned nextIndex = 0;
12759
12760 // For each operands to the ADD which are BUILD_VECTORs,
12761 // check to see if each of their operands are an EXTRACT_VECTOR with
12762 // the same vector and appropriate index.
12763 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12766
12767 SDValue ExtVec0 = N0->getOperand(i);
12768 SDValue ExtVec1 = N1->getOperand(i);
12769
12770 // First operand is the vector, verify it's the same.
12771 if (V != ExtVec0->getOperand(0).getNode() ||
12772 V != ExtVec1->getOperand(0).getNode())
12773 return SDValue();
12774
12775 // Second is the constant, verify it's correct.
12776 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12777 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12778
12779 // For the constant, we want to see all the even or all the odd.
12780 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12781 || C1->getZExtValue() != nextIndex+1)
12782 return SDValue();
12783
12784 // Increment index.
12785 nextIndex+=2;
12786 } else
12787 return SDValue();
12788 }
12789
12790 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12791 // we're using the entire input vector, otherwise there's a size/legality
12792 // mismatch somewhere.
12793 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12795 return SDValue();
12796
12797 // Create VPADDL node.
12798 SelectionDAG &DAG = DCI.DAG;
12799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12800
12801 SDLoc dl(N);
12802
12803 // Build operand list.
12805 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12806 TLI.getPointerTy(DAG.getDataLayout())));
12807
12808 // Input is the vector.
12809 Ops.push_back(Vec);
12810
12811 // Get widened type and narrowed type.
12812 MVT widenType;
12813 unsigned numElem = VT.getVectorNumElements();
12814
12815 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12816 switch (inputLaneType.getSimpleVT().SimpleTy) {
12817 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12818 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12819 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12820 default:
12821 llvm_unreachable("Invalid vector element type for padd optimization.");
12822 }
12823
12824 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12825 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12826 return DAG.getNode(ExtOp, dl, VT, tmp);
12827}
12828
12830 if (V->getOpcode() == ISD::UMUL_LOHI ||
12831 V->getOpcode() == ISD::SMUL_LOHI)
12832 return V;
12833 return SDValue();
12834}
12835
12836static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12838 const ARMSubtarget *Subtarget) {
12839 if (!Subtarget->hasBaseDSP())
12840 return SDValue();
12841
12842 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12843 // accumulates the product into a 64-bit value. The 16-bit values will
12844 // be sign extended somehow or SRA'd into 32-bit values
12845 // (addc (adde (mul 16bit, 16bit), lo), hi)
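// (Background, not taken from this file: SMLALBB RdLo, RdHi, Rn, Rm performs
//  RdHi:RdLo += sext(Rn[15:0]) * sext(Rm[15:0]); the BT/TB/TT variants use
//  the top halfword of the corresponding operand instead.)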
12846 SDValue Mul = AddcNode->getOperand(0);
12847 SDValue Lo = AddcNode->getOperand(1);
12848 if (Mul.getOpcode() != ISD::MUL) {
12849 Lo = AddcNode->getOperand(0);
12850 Mul = AddcNode->getOperand(1);
12851 if (Mul.getOpcode() != ISD::MUL)
12852 return SDValue();
12853 }
12854
12855 SDValue SRA = AddeNode->getOperand(0);
12856 SDValue Hi = AddeNode->getOperand(1);
12857 if (SRA.getOpcode() != ISD::SRA) {
12858 SRA = AddeNode->getOperand(1);
12859 Hi = AddeNode->getOperand(0);
12860 if (SRA.getOpcode() != ISD::SRA)
12861 return SDValue();
12862 }
12863 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12864 if (Const->getZExtValue() != 31)
12865 return SDValue();
12866 } else
12867 return SDValue();
12868
12869 if (SRA.getOperand(0) != Mul)
12870 return SDValue();
12871
12872 SelectionDAG &DAG = DCI.DAG;
12873 SDLoc dl(AddcNode);
12874 unsigned Opcode = 0;
12875 SDValue Op0;
12876 SDValue Op1;
12877
12878 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12879 Opcode = ARMISD::SMLALBB;
12880 Op0 = Mul.getOperand(0);
12881 Op1 = Mul.getOperand(1);
12882 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12883 Opcode = ARMISD::SMLALBT;
12884 Op0 = Mul.getOperand(0);
12885 Op1 = Mul.getOperand(1).getOperand(0);
12886 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12887 Opcode = ARMISD::SMLALTB;
12888 Op0 = Mul.getOperand(0).getOperand(0);
12889 Op1 = Mul.getOperand(1);
12890 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12891 Opcode = ARMISD::SMLALTT;
12892 Op0 = Mul->getOperand(0).getOperand(0);
12893 Op1 = Mul->getOperand(1).getOperand(0);
12894 }
12895
12896 if (!Op0 || !Op1)
12897 return SDValue();
12898
12899 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12900 Op0, Op1, Lo, Hi);
12901 // Replace the ADDs' nodes uses by the MLA node's values.
12902 SDValue HiMLALResult(SMLAL.getNode(), 1);
12903 SDValue LoMLALResult(SMLAL.getNode(), 0);
12904
12905 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12906 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12907
12908 // Return original node to notify the driver to stop replacing.
12909 SDValue resNode(AddcNode, 0);
12910 return resNode;
12911}
12912
12913 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12914 TargetLowering::DAGCombinerInfo &DCI,
12915 const ARMSubtarget *Subtarget) {
12916 // Look for multiply add opportunities.
12917 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12918 // each add node consumes a value from ISD::UMUL_LOHI and there is
12919 // a glue link from the first add to the second add.
12920 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12921 // a S/UMLAL instruction.
12922 //                 UMUL_LOHI
12923 //                / :lo    \ :hi
12924 //               V          \          [no multiline comment]
12925 //   loAdd ->  ADDC          |
12926 //                \ :carry  /
12927 //                 V       V
12928 //                  ADDE <- hiAdd
12929 //
12930 // In the special case where only the higher part of a signed result is used
12931 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12932 // a constant with the exact value of 0x80000000, we recognize we are dealing
12933 // with a "rounded multiply and add" (or subtract) and transform it into
12934 // either an ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
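// (Background, not taken from this file: SMMLAR Rd, Rn, Rm, Ra computes
//  Rd = ((Ra << 32) + Rn * Rm + 0x80000000) >> 32, i.e. the rounded high word
//  of the accumulate; SMMLSR is the subtracting counterpart.)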
12935
12936 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12937 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12938 "Expect an ADDE or SUBE");
12939
12940 assert(AddeSubeNode->getNumOperands() == 3 &&
12941 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12942 "ADDE node has the wrong inputs");
12943
12944 // Check that we are chained to the right ADDC or SUBC node.
12945 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12946 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12947 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12948 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12949 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12950 return SDValue();
12951
12952 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12953 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12954
12955 // Check if the two operands are from the same mul_lohi node.
12956 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12957 return SDValue();
12958
12959 assert(AddcSubcNode->getNumValues() == 2 &&
12960 AddcSubcNode->getValueType(0) == MVT::i32 &&
12961 "Expect ADDC with two result values. First: i32");
12962
12963 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12964 // may be an SMLAL which multiplies two 16-bit values.
12965 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12966 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12967 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12968 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12969 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12970 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12971
12972 // Check for the triangle shape.
12973 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12974 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12975
12976 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12977 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12978 return SDValue();
12979
12980 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12981 bool IsLeftOperandMUL = false;
12982 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12983 if (MULOp == SDValue())
12984 MULOp = findMUL_LOHI(AddeSubeOp1);
12985 else
12986 IsLeftOperandMUL = true;
12987 if (MULOp == SDValue())
12988 return SDValue();
12989
12990 // Figure out the right opcode.
12991 unsigned Opc = MULOp->getOpcode();
12992 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12993
12994 // Figure out the high and low input values to the MLAL node.
12995 SDValue *HiAddSub = nullptr;
12996 SDValue *LoMul = nullptr;
12997 SDValue *LowAddSub = nullptr;
12998
12999 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
13000 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
13001 return SDValue();
13002
13003 if (IsLeftOperandMUL)
13004 HiAddSub = &AddeSubeOp1;
13005 else
13006 HiAddSub = &AddeSubeOp0;
13007
13008 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
13009 // whose low result is fed to the ADDC/SUBC we are checking.
13010
13011 if (AddcSubcOp0 == MULOp.getValue(0)) {
13012 LoMul = &AddcSubcOp0;
13013 LowAddSub = &AddcSubcOp1;
13014 }
13015 if (AddcSubcOp1 == MULOp.getValue(0)) {
13016 LoMul = &AddcSubcOp1;
13017 LowAddSub = &AddcSubcOp0;
13018 }
13019
13020 if (!LoMul)
13021 return SDValue();
13022
13023 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
13024 // the replacement below will create a cycle.
13025 if (AddcSubcNode == HiAddSub->getNode() ||
13026 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
13027 return SDValue();
13028
13029 // Create the merged node.
13030 SelectionDAG &DAG = DCI.DAG;
13031
13032 // Start building operand list.
13034 Ops.push_back(LoMul->getOperand(0));
13035 Ops.push_back(LoMul->getOperand(1));
13036
13037 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
13038 // the case, we must be doing signed multiplication and only use the higher
13039 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
13040 // addition or subtraction with the value of 0x80000000.
13041 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
13042 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
13043 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
13044 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
13045 0x80000000) {
13046 Ops.push_back(*HiAddSub);
13047 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
13048 FinalOpc = ARMISD::SMMLSR;
13049 } else {
13050 FinalOpc = ARMISD::SMMLAR;
13051 }
13052 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
13053 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
13054
13055 return SDValue(AddeSubeNode, 0);
13056 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
13057 // SMMLS is generated during instruction selection and the rest of this
13058 // function can not handle the case where AddcSubcNode is a SUBC.
13059 return SDValue();
13060
13061 // Finish building the operand list for {U/S}MLAL
13062 Ops.push_back(*LowAddSub);
13063 Ops.push_back(*HiAddSub);
13064
13065 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13066 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13067
13068 // Replace the ADDs' nodes uses by the MLA node's values.
13069 SDValue HiMLALResult(MLALNode.getNode(), 1);
13070 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13071
13072 SDValue LoMLALResult(MLALNode.getNode(), 0);
13073 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13074
13075 // Return original node to notify the driver to stop replacing.
13076 return SDValue(AddeSubeNode, 0);
13077}
13078
13081 const ARMSubtarget *Subtarget) {
13082 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13083 // While trying to combine for the other MLAL nodes, first search for the
13084 // chance to use UMAAL. Check if Addc uses a node which has already
13085 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13086 // as the addend, and it's handled in PerformUMLALCombine.
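// (Background, not taken from this file: UMAAL RdLo, RdHi, Rn, Rm computes
//  RdHi:RdLo = Rn * Rm + RdLo + RdHi, so one unsigned 32x32->64 multiply can
//  absorb two independent 32-bit addends, which is the pattern matched here.)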
13087
13088 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13089 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13090
13091 // Check that we have a glued ADDC node.
13092 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13093 if (AddcNode->getOpcode() != ARMISD::ADDC)
13094 return SDValue();
13095
13096 // Find the converted UMAAL or quit if it doesn't exist.
13097 SDNode *UmlalNode = nullptr;
13098 SDValue AddHi;
13099 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13100 UmlalNode = AddcNode->getOperand(0).getNode();
13101 AddHi = AddcNode->getOperand(1);
13102 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13103 UmlalNode = AddcNode->getOperand(1).getNode();
13104 AddHi = AddcNode->getOperand(0);
13105 } else {
13106 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13107 }
13108
13109 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13110 // the ADDC as well as Zero.
13111 if (!isNullConstant(UmlalNode->getOperand(3)))
13112 return SDValue();
13113
13114 if ((isNullConstant(AddeNode->getOperand(0)) &&
13115 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13116 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13117 isNullConstant(AddeNode->getOperand(1)))) {
13118 SelectionDAG &DAG = DCI.DAG;
13119 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13120 UmlalNode->getOperand(2), AddHi };
13121 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13122 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13123
13124 // Replace the ADDs' nodes uses by the UMAAL node's values.
13125 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13126 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13127
13128 // Return original node to notify the driver to stop replacing.
13129 return SDValue(AddeNode, 0);
13130 }
13131 return SDValue();
13132}
13133
13135 const ARMSubtarget *Subtarget) {
13136 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13137 return SDValue();
13138
13139 // Check that we have a pair of ADDC and ADDE as operands.
13140 // Both addends of the ADDE must be zero.
13141 SDNode* AddcNode = N->getOperand(2).getNode();
13142 SDNode* AddeNode = N->getOperand(3).getNode();
13143 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13144 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13145 isNullConstant(AddeNode->getOperand(0)) &&
13146 isNullConstant(AddeNode->getOperand(1)) &&
13147 (AddeNode->getOperand(2).getNode() == AddcNode))
13148 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13149 DAG.getVTList(MVT::i32, MVT::i32),
13150 {N->getOperand(0), N->getOperand(1),
13151 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13152 else
13153 return SDValue();
13154}
13155
13156static SDValue PerformAddcSubcCombine(SDNode *N,
13157 TargetLowering::DAGCombinerInfo &DCI,
13158 const ARMSubtarget *Subtarget) {
13159 SelectionDAG &DAG(DCI.DAG);
13160
13161 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13162 // (SUBC (ADDE 0, 0, C), 1) -> C
13163 SDValue LHS = N->getOperand(0);
13164 SDValue RHS = N->getOperand(1);
13165 if (LHS->getOpcode() == ARMISD::ADDE &&
13166 isNullConstant(LHS->getOperand(0)) &&
13167 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13168 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13169 }
13170 }
13171
13172 if (Subtarget->isThumb1Only()) {
13173 SDValue RHS = N->getOperand(1);
13174 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13175 int32_t imm = C->getSExtValue();
13176 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13177 SDLoc DL(N);
13178 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13179 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13180 : ARMISD::ADDC;
13181 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13182 }
13183 }
13184 }
13185
13186 return SDValue();
13187}
13188
13189static SDValue PerformAddeSubeCombine(SDNode *N,
13190 TargetLowering::DAGCombinerInfo &DCI,
13191 const ARMSubtarget *Subtarget) {
13192 if (Subtarget->isThumb1Only()) {
13193 SelectionDAG &DAG = DCI.DAG;
13194 SDValue RHS = N->getOperand(1);
13195 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13196 int64_t imm = C->getSExtValue();
13197 if (imm < 0) {
13198 SDLoc DL(N);
13199
13200 // The with-carry-in form matches bitwise not instead of the negation.
13201 // Effectively, the inverse interpretation of the carry flag already
13202 // accounts for part of the negation.
13203 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13204
13205 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13206 : ARMISD::ADDE;
13207 return DAG.getNode(Opcode, DL, N->getVTList(),
13208 N->getOperand(0), RHS, N->getOperand(2));
13209 }
13210 }
13211 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13212 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13213 }
13214 return SDValue();
13215}
13216
13217static SDValue PerformSELECTCombine(SDNode *N,
13218 TargetLowering::DAGCombinerInfo &DCI,
13219 const ARMSubtarget *Subtarget) {
13220 if (!Subtarget->hasMVEIntegerOps())
13221 return SDValue();
13222
13223 SDLoc dl(N);
13224 SDValue SetCC;
13225 SDValue LHS;
13226 SDValue RHS;
13227 ISD::CondCode CC;
13228 SDValue TrueVal;
13229 SDValue FalseVal;
13230
13231 if (N->getOpcode() == ISD::SELECT &&
13232 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13233 SetCC = N->getOperand(0);
13234 LHS = SetCC->getOperand(0);
13235 RHS = SetCC->getOperand(1);
13236 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13237 TrueVal = N->getOperand(1);
13238 FalseVal = N->getOperand(2);
13239 } else if (N->getOpcode() == ISD::SELECT_CC) {
13240 LHS = N->getOperand(0);
13241 RHS = N->getOperand(1);
13242 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13243 TrueVal = N->getOperand(2);
13244 FalseVal = N->getOperand(3);
13245 } else {
13246 return SDValue();
13247 }
13248
13249 unsigned int Opcode = 0;
13250 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13251 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13252 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13253 Opcode = ARMISD::VMINVu;
13254 if (CC == ISD::SETUGT)
13255 std::swap(TrueVal, FalseVal);
13256 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13257 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13258 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13259 Opcode = ARMISD::VMINVs;
13260 if (CC == ISD::SETGT)
13261 std::swap(TrueVal, FalseVal);
13262 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13263 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13264 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13265 Opcode = ARMISD::VMAXVu;
13266 if (CC == ISD::SETULT)
13267 std::swap(TrueVal, FalseVal);
13268 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13269 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13270 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13271 Opcode = ARMISD::VMAXVs;
13272 if (CC == ISD::SETLT)
13273 std::swap(TrueVal, FalseVal);
13274 } else
13275 return SDValue();
13276
13277 // Normalise to the right hand side being the vector reduction
13278 switch (TrueVal->getOpcode()) {
13279 case ISD::VECREDUCE_UMIN:
13280 case ISD::VECREDUCE_SMIN:
13281 case ISD::VECREDUCE_UMAX:
13282 case ISD::VECREDUCE_SMAX:
13283 std::swap(LHS, RHS);
13284 std::swap(TrueVal, FalseVal);
13285 break;
13286 }
13287
13288 EVT VectorType = FalseVal->getOperand(0).getValueType();
13289
13290 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13291 VectorType != MVT::v4i32)
13292 return SDValue();
13293
13294 EVT VectorScalarType = VectorType.getVectorElementType();
13295
13296 // The values being selected must also be the ones being compared
13297 if (TrueVal != LHS || FalseVal != RHS)
13298 return SDValue();
13299
13300 EVT LeftType = LHS->getValueType(0);
13301 EVT RightType = RHS->getValueType(0);
13302
13303 // The types must match the reduced type too
13304 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13305 return SDValue();
13306
13307 // Legalise the scalar to an i32
13308 if (VectorScalarType != MVT::i32)
13309 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13310
13311 // Generate the reduction as an i32 for legalisation purposes
13312 auto Reduction =
13313 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13314
13315 // The result isn't actually an i32 so truncate it back to its original type
13316 if (VectorScalarType != MVT::i32)
13317 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13318
13319 return Reduction;
13320}
13321
13322// A special combine for the vqdmulh family of instructions. This is one of the
13323// potential set of patterns that could match this instruction. The base pattern
13324// you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13325// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13326// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13327// the max is unnecessary.
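// Illustrative example: with i16 elements the DAG matched below is
//   smin (sra (mul (sext x), (sext y)), 15), 32767
// which is rewritten to an ARMISD::VQDMULH node (MVE vqdmulh.s16).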
13328static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13329 EVT VT = N->getValueType(0);
13330 SDValue Shft;
13331 ConstantSDNode *Clamp;
13332
13333 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13334 return SDValue();
13335
13336 if (N->getOpcode() == ISD::SMIN) {
13337 Shft = N->getOperand(0);
13338 Clamp = isConstOrConstSplat(N->getOperand(1));
13339 } else if (N->getOpcode() == ISD::VSELECT) {
13340 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13341 SDValue Cmp = N->getOperand(0);
13342 if (Cmp.getOpcode() != ISD::SETCC ||
13343 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13344 Cmp.getOperand(0) != N->getOperand(1) ||
13345 Cmp.getOperand(1) != N->getOperand(2))
13346 return SDValue();
13347 Shft = N->getOperand(1);
13348 Clamp = isConstOrConstSplat(N->getOperand(2));
13349 } else
13350 return SDValue();
13351
13352 if (!Clamp)
13353 return SDValue();
13354
13355 MVT ScalarType;
13356 int ShftAmt = 0;
13357 switch (Clamp->getSExtValue()) {
13358 case (1 << 7) - 1:
13359 ScalarType = MVT::i8;
13360 ShftAmt = 7;
13361 break;
13362 case (1 << 15) - 1:
13363 ScalarType = MVT::i16;
13364 ShftAmt = 15;
13365 break;
13366 case (1ULL << 31) - 1:
13367 ScalarType = MVT::i32;
13368 ShftAmt = 31;
13369 break;
13370 default:
13371 return SDValue();
13372 }
13373
13374 if (Shft.getOpcode() != ISD::SRA)
13375 return SDValue();
13376 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13377 if (!N1 || N1->getSExtValue() != ShftAmt)
13378 return SDValue();
13379
13380 SDValue Mul = Shft.getOperand(0);
13381 if (Mul.getOpcode() != ISD::MUL)
13382 return SDValue();
13383
13384 SDValue Ext0 = Mul.getOperand(0);
13385 SDValue Ext1 = Mul.getOperand(1);
13386 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13387 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13388 return SDValue();
13389 EVT VecVT = Ext0.getOperand(0).getValueType();
13390 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13391 return SDValue();
13392 if (Ext1.getOperand(0).getValueType() != VecVT ||
13393 VecVT.getScalarType() != ScalarType ||
13394 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13395 return SDValue();
13396
13397 SDLoc DL(Mul);
13398 unsigned LegalLanes = 128 / (ShftAmt + 1);
13399 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13400 // For types smaller than legal vectors extend to be legal and only use needed
13401 // lanes.
13402 if (VecVT.getSizeInBits() < 128) {
13403 EVT ExtVecVT =
13404 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13405 VecVT.getVectorNumElements());
13406 SDValue Inp0 =
13407 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13408 SDValue Inp1 =
13409 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13410 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13411 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13412 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13413 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13414 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13415 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13416 }
13417
13418 // For larger types, split into legal sized chunks.
13419 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13420 unsigned NumParts = VecVT.getSizeInBits() / 128;
13421 SmallVector<SDValue> Parts;
13422 for (unsigned I = 0; I < NumParts; ++I) {
13423 SDValue Inp0 =
13424 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13425 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13426 SDValue Inp1 =
13427 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13428 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13429 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13430 Parts.push_back(VQDMULH);
13431 }
13432 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13433 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13434}
13435
13436static SDValue PerformVSELECTCombine(SDNode *N,
13437 TargetLowering::DAGCombinerInfo &DCI,
13438 const ARMSubtarget *Subtarget) {
13439 if (!Subtarget->hasMVEIntegerOps())
13440 return SDValue();
13441
13442 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13443 return V;
13444
13445 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13446 //
13447 // We need to re-implement this optimization here as the implementation in the
13448 // Target-Independent DAGCombiner does not handle the kind of constant we make
13449 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13450 // good reason, allowing truncation there would break other targets).
13451 //
13452 // Currently, this is only done for MVE, as it's the only target that benefits
13453 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13454 if (N->getOperand(0).getOpcode() != ISD::XOR)
13455 return SDValue();
13456 SDValue XOR = N->getOperand(0);
13457
13458 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13459 // It is important to check with truncation allowed as the BUILD_VECTORs we
13460 // generate in those situations will truncate their operands.
13461 ConstantSDNode *Const =
13462 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13463 /*AllowTruncation*/ true);
13464 if (!Const || !Const->isOne())
13465 return SDValue();
13466
13467 // Rewrite into vselect(cond, rhs, lhs).
13468 SDValue Cond = XOR->getOperand(0);
13469 SDValue LHS = N->getOperand(1);
13470 SDValue RHS = N->getOperand(2);
13471 EVT Type = N->getValueType(0);
13472 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13473}
13474
13475// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
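// e.g. a v4i1 node of this shape becomes llvm.arm.mve.vctp32(n); the switch
// below picks the vctp8/16/32/64 intrinsic from the vector element count.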
13476static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13477 TargetLowering::DAGCombinerInfo &DCI,
13478 const ARMSubtarget *Subtarget) {
13479 SDValue Op0 = N->getOperand(0);
13480 SDValue Op1 = N->getOperand(1);
13481 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13482 EVT VT = N->getValueType(0);
13483
13484 if (!Subtarget->hasMVEIntegerOps() ||
13485 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13486 return SDValue();
13487
13488 if (CC == ISD::SETUGE) {
13489 std::swap(Op0, Op1);
13490 CC = ISD::SETULT;
13491 }
13492
13493 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13494 Op0->getOpcode() != ISD::BUILD_VECTOR)
13495 return SDValue();
13496
13497 // Check first operand is BuildVector of 0,1,2,...
13498 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13499 if (!Op0.getOperand(I).isUndef() &&
13500 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13501 Op0.getConstantOperandVal(I) == I))
13502 return SDValue();
13503 }
13504
13505 // The second is a Splat of Op1S
13506 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13507 if (!Op1S)
13508 return SDValue();
13509
13510 unsigned Opc;
13511 switch (VT.getVectorNumElements()) {
13512 case 2:
13513 Opc = Intrinsic::arm_mve_vctp64;
13514 break;
13515 case 4:
13516 Opc = Intrinsic::arm_mve_vctp32;
13517 break;
13518 case 8:
13519 Opc = Intrinsic::arm_mve_vctp16;
13520 break;
13521 case 16:
13522 Opc = Intrinsic::arm_mve_vctp8;
13523 break;
13524 default:
13525 return SDValue();
13526 }
13527
13528 SDLoc DL(N);
13529 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13530 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13531 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13532}
13533
13534/// PerformADDECombine - Target-specific dag combine transform from
13535/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13536/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
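/// (Illustration: the ADDC/ADDE pair produced when a 64-bit add of a
/// 32x32->64 multiply is split into 32-bit halves is re-folded here into a
/// single UMLAL/UMAAL node.)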
13537static SDValue PerformADDECombine(SDNode *N,
13538 TargetLowering::DAGCombinerInfo &DCI,
13539 const ARMSubtarget *Subtarget) {
13540 // Only ARM and Thumb2 support UMLAL/SMLAL.
13541 if (Subtarget->isThumb1Only())
13542 return PerformAddeSubeCombine(N, DCI, Subtarget);
13543
13544 // Only perform the checks after legalize when the pattern is available.
13545 if (DCI.isBeforeLegalize()) return SDValue();
13546
13547 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13548}
13549
13550/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13551/// operands N0 and N1. This is a helper for PerformADDCombine that is
13552/// called with the default operands, and if that fails, with commuted
13553/// operands.
13554static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13555 TargetLowering::DAGCombinerInfo &DCI,
13556 const ARMSubtarget *Subtarget){
13557 // Attempt to create vpadd for this add.
13558 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13559 return Result;
13560
13561 // Attempt to create vpaddl for this add.
13562 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13563 return Result;
13564 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13565 Subtarget))
13566 return Result;
13567
13568 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13569 if (N0.getNode()->hasOneUse())
13570 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13571 return Result;
13572 return SDValue();
13573}
13574
13575static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13576 EVT VT = N->getValueType(0);
13577 SDValue N0 = N->getOperand(0);
13578 SDValue N1 = N->getOperand(1);
13579 SDLoc dl(N);
13580
13581 auto IsVecReduce = [](SDValue Op) {
13582 switch (Op.getOpcode()) {
13583 case ISD::VECREDUCE_ADD:
13584 case ARMISD::VADDVs:
13585 case ARMISD::VADDVu:
13586 case ARMISD::VMLAVs:
13587 case ARMISD::VMLAVu:
13588 return true;
13589 }
13590 return false;
13591 };
13592
13593 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13594 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13595 // add(add(X, vecreduce(Y)), vecreduce(Z))
13596 // to make better use of vaddva style instructions.
13597 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13598 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13599 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13600 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13601 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13602 }
13603 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13604 // add(add(add(A, C), reduce(B)), reduce(D))
13605 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13606 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13607 unsigned N0RedOp = 0;
13608 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13609 N0RedOp = 1;
13610 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13611 return SDValue();
13612 }
13613
13614 unsigned N1RedOp = 0;
13615 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13616 N1RedOp = 1;
13617 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13618 return SDValue();
13619
13620 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13621 N1.getOperand(1 - N1RedOp));
13622 SDValue Add1 =
13623 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13624 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13625 }
13626 return SDValue();
13627 };
13628 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13629 return R;
13630 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13631 return R;
13632
13633 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13634 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13635 // by ascending load offsets. This can help cores prefetch if the order of
13636 // loads is more predictable.
13637 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13638 // Check if two reductions are known to load data where one is before/after
13639 // another. Return negative if N0 loads data before N1, positive if N1 is
13640 // before N0, and 0 if nothing is known.
13641 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13642 // Look through to the first operand of a MUL, for the VMLA case.
13643 // Currently only looks at the first operand, in the hope they are equal.
13644 if (N0.getOpcode() == ISD::MUL)
13645 N0 = N0.getOperand(0);
13646 if (N1.getOpcode() == ISD::MUL)
13647 N1 = N1.getOperand(0);
13648
13649 // Return true if the two operands are loads to the same object and the
13650 // offset of the first is known to be less than the offset of the second.
13651 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13652 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13653 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13654 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13655 Load1->isIndexed())
13656 return 0;
13657
13658 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13659 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13660
13661 if (!BaseLocDecomp0.getBase() ||
13662 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13663 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13664 return 0;
13665 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13666 return -1;
13667 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13668 return 1;
13669 return 0;
13670 };
13671
13672 SDValue X;
13673 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13674 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13675 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13676 N0.getOperand(1).getOperand(0));
13677 if (IsBefore < 0) {
13678 X = N0.getOperand(0);
13679 N0 = N0.getOperand(1);
13680 } else if (IsBefore > 0) {
13681 X = N0.getOperand(1);
13682 N0 = N0.getOperand(0);
13683 } else
13684 return SDValue();
13685 } else if (IsVecReduce(N0.getOperand(0))) {
13686 X = N0.getOperand(1);
13687 N0 = N0.getOperand(0);
13688 } else if (IsVecReduce(N0.getOperand(1))) {
13689 X = N0.getOperand(0);
13690 N0 = N0.getOperand(1);
13691 } else
13692 return SDValue();
13693 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13694 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13695 // Note this is backward to how you would expect. We create
13696 // add(reduce(load + 16), reduce(load + 0)) so that the
13697 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13698 // the X as VADDV(load + 0)
13699 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13700 } else
13701 return SDValue();
13702
13703 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13704 return SDValue();
13705
13706 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13707 return SDValue();
13708
13709 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13710 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13711 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13712 };
13713 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13714 return R;
13715 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13716 return R;
13717 return SDValue();
13718}
13719
13720static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13721 const ARMSubtarget *Subtarget) {
13722 if (!Subtarget->hasMVEIntegerOps())
13723 return SDValue();
13724
13725 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13726 return R;
13727
13728 EVT VT = N->getValueType(0);
13729 SDValue N0 = N->getOperand(0);
13730 SDValue N1 = N->getOperand(1);
13731 SDLoc dl(N);
13732
13733 if (VT != MVT::i64)
13734 return SDValue();
13735
13736 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13737 // will look like:
13738 // t1: i32,i32 = ARMISD::VADDLVs x
13739 // t2: i64 = build_pair t1, t1:1
13740 // t3: i64 = add t2, y
13741 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13742 // the add to be simplified separately.
13743 // We also need to check for sext / zext and commutative adds.
13744 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13745 SDValue NB) {
13746 if (NB->getOpcode() != ISD::BUILD_PAIR)
13747 return SDValue();
13748 SDValue VecRed = NB->getOperand(0);
13749 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13750 VecRed.getResNo() != 0 ||
13751 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13752 return SDValue();
13753
13754 if (VecRed->getOpcode() == OpcodeA) {
13755 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13756 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13757 VecRed.getOperand(0), VecRed.getOperand(1));
13758 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13759 }
13760
13761 SmallVector<SDValue, 4> Ops(2);
13762 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13763
13764 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13765 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13766 Ops.push_back(VecRed->getOperand(I));
13767 SDValue Red =
13768 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13769 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13770 SDValue(Red.getNode(), 1));
13771 };
13772
13773 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13774 return M;
13775 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13776 return M;
13777 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13778 return M;
13779 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13780 return M;
13781 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13782 return M;
13783 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13784 return M;
13785 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13786 return M;
13787 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13788 return M;
13789 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13790 return M;
13791 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13792 return M;
13793 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13794 return M;
13795 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13796 return M;
13797 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13798 return M;
13799 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13800 return M;
13801 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13802 return M;
13803 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13804 return M;
13805 return SDValue();
13806}
13807
13808bool
13809ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13810 CombineLevel Level) const {
13811 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13812 N->getOpcode() == ISD::SRL) &&
13813 "Expected shift op");
13814
13815 SDValue ShiftLHS = N->getOperand(0);
13816 if (!ShiftLHS->hasOneUse())
13817 return false;
13818
13819 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13820 !ShiftLHS.getOperand(0)->hasOneUse())
13821 return false;
13822
13823 if (Level == BeforeLegalizeTypes)
13824 return true;
13825
13826 if (N->getOpcode() != ISD::SHL)
13827 return true;
13828
13829 if (Subtarget->isThumb1Only()) {
13830 // Avoid making expensive immediates by commuting shifts. (This logic
13831 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13832 // for free.)
13833 if (N->getOpcode() != ISD::SHL)
13834 return true;
13835 SDValue N1 = N->getOperand(0);
13836 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13837 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13838 return true;
13839 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13840 if (Const->getAPIntValue().ult(256))
13841 return false;
13842 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13843 Const->getAPIntValue().sgt(-256))
13844 return false;
13845 }
13846 return true;
13847 }
13848
13849 // Turn off commute-with-shift transform after legalization, so it doesn't
13850 // conflict with PerformSHLSimplify. (We could try to detect when
13851 // PerformSHLSimplify would trigger more precisely, but it isn't
13852 // really necessary.)
13853 return false;
13854}
13855
13856bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13857 const SDNode *N) const {
13858 assert(N->getOpcode() == ISD::XOR &&
13859 (N->getOperand(0).getOpcode() == ISD::SHL ||
13860 N->getOperand(0).getOpcode() == ISD::SRL) &&
13861 "Expected XOR(SHIFT) pattern");
13862
13863 // Only commute if the entire NOT mask is a hidden shifted mask.
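  // e.g. for (xor (shl x, 8), 0xffffff00) the NOT mask covers exactly the
  // bits produced by the shift (MaskIdx == 8, MaskLen == 24), so the xor can
  // be commuted with the shl without affecting any surviving bit.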
13864 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13865 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13866 if (XorC && ShiftC) {
13867 unsigned MaskIdx, MaskLen;
13868 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13869 unsigned ShiftAmt = ShiftC->getZExtValue();
13870 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13871 if (N->getOperand(0).getOpcode() == ISD::SHL)
13872 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13873 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13874 }
13875 }
13876
13877 return false;
13878}
13879
13880bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13881 const SDNode *N, CombineLevel Level) const {
13882 assert(((N->getOpcode() == ISD::SHL &&
13883 N->getOperand(0).getOpcode() == ISD::SRL) ||
13884 (N->getOpcode() == ISD::SRL &&
13885 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13886 "Expected shift-shift mask");
13887
13888 if (!Subtarget->isThumb1Only())
13889 return true;
13890
13891 if (Level == BeforeLegalizeTypes)
13892 return true;
13893
13894 return false;
13895}
13896
13897bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13898 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13899 SDValue Y) const {
13900 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13901 SelectOpcode == ISD::VSELECT;
13902}
13903
13904bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13905 if (!Subtarget->hasNEON()) {
13906 if (Subtarget->isThumb1Only())
13907 return VT.getScalarSizeInBits() <= 32;
13908 return true;
13909 }
13910 return VT.isScalarInteger();
13911}
13912
13913bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13914 EVT VT) const {
13915 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13916 return false;
13917
13918 switch (FPVT.getSimpleVT().SimpleTy) {
13919 case MVT::f16:
13920 return Subtarget->hasVFP2Base();
13921 case MVT::f32:
13922 return Subtarget->hasVFP2Base();
13923 case MVT::f64:
13924 return Subtarget->hasFP64();
13925 case MVT::v4f32:
13926 case MVT::v8f16:
13927 return Subtarget->hasMVEFloatOps();
13928 default:
13929 return false;
13930 }
13931}
13932
13933static SDValue PerformSHLSimplify(SDNode *N,
13934 TargetLowering::DAGCombinerInfo &DCI,
13935 const ARMSubtarget *ST) {
13936 // Allow the generic combiner to identify potential bswaps.
13937 if (DCI.isBeforeLegalize())
13938 return SDValue();
13939
13940 // DAG combiner will fold:
13941 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13942 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13943 // Other code patterns that can be also be modified have the following form:
13944 // b + ((a << 1) | 510)
13945 // b + ((a << 1) & 510)
13946 // b + ((a << 1) ^ 510)
13947 // b + ((a << 1) + 510)
13948
13949 // Many instructions can perform the shift for free, but it requires both
13950 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13951 // instruction will be needed. So, unfold back to the original pattern if:
13952 // - c1 and c2 are small enough that they don't require mov imms.
13953 // - the user(s) of the node can perform an shl
13954
13955 // No shifted operands for 16-bit instructions.
13956 if (ST->isThumb() && ST->isThumb1Only())
13957 return SDValue();
13958
13959 // Check that all the users could perform the shl themselves.
13960 for (auto *U : N->users()) {
13961 switch(U->getOpcode()) {
13962 default:
13963 return SDValue();
13964 case ISD::SUB:
13965 case ISD::ADD:
13966 case ISD::AND:
13967 case ISD::OR:
13968 case ISD::XOR:
13969 case ISD::SETCC:
13970 case ARMISD::CMP:
13971 // Check that the user isn't already using a constant because there
13972 // aren't any instructions that support an immediate operand and a
13973 // shifted operand.
13974 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13975 isa<ConstantSDNode>(U->getOperand(1)))
13976 return SDValue();
13977
13978 // Check that it's not already using a shift.
13979 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13980 U->getOperand(1).getOpcode() == ISD::SHL)
13981 return SDValue();
13982 break;
13983 }
13984 }
13985
13986 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13987 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13988 return SDValue();
13989
13990 if (N->getOperand(0).getOpcode() != ISD::SHL)
13991 return SDValue();
13992
13993 SDValue SHL = N->getOperand(0);
13994
13995 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13996 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13997 if (!C1ShlC2 || !C2)
13998 return SDValue();
13999
14000 APInt C2Int = C2->getAPIntValue();
14001 APInt C1Int = C1ShlC2->getAPIntValue();
14002 unsigned C2Width = C2Int.getBitWidth();
14003 if (C2Int.uge(C2Width))
14004 return SDValue();
14005 uint64_t C2Value = C2Int.getZExtValue();
14006
14007 // Check that performing a lshr will not lose any information.
14008 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
14009 if ((C1Int & Mask) != C1Int)
14010 return SDValue();
14011
14012 // Shift the first constant.
14013 C1Int.lshrInPlace(C2Int);
14014
14015 // The immediates are encoded as an 8-bit value that can be rotated.
14016 auto LargeImm = [](const APInt &Imm) {
14017 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
14018 return Imm.getBitWidth() - Zeros > 8;
14019 };
14020
14021 if (LargeImm(C1Int) || LargeImm(C2Int))
14022 return SDValue();
14023
14024 SelectionDAG &DAG = DCI.DAG;
14025 SDLoc dl(N);
14026 SDValue X = SHL.getOperand(0);
14027 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
14028 DAG.getConstant(C1Int, dl, MVT::i32));
14029 // Shift left to compensate for the lshr of C1Int.
14030 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
14031
14032 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
14033 SHL.dump(); N->dump());
14034 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
14035 return Res;
14036}
14037
14038
14039/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
14040///
14041static SDValue PerformADDCombine(SDNode *N,
14042 TargetLowering::DAGCombinerInfo &DCI,
14043 const ARMSubtarget *Subtarget) {
14044 SDValue N0 = N->getOperand(0);
14045 SDValue N1 = N->getOperand(1);
14046
14047 // Only works one way, because it needs an immediate operand.
14048 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14049 return Result;
14050
14051 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
14052 return Result;
14053
14054 // First try with the default operand order.
14055 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
14056 return Result;
14057
14058 // If that didn't work, try again with the operands commuted.
14059 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
14060}
14061
14062// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14063// providing -X is as cheap as X (currently, just a constant).
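// (CSINC yields either X or Y+1 depending on CC; negating gives either -X or
// -(Y+1) == ~Y, which is exactly the pair of values CSINV(-X, Y, CC) yields.)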
14064static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14065 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14066 return SDValue();
14067 SDValue CSINC = N->getOperand(1);
14068 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14069 return SDValue();
14070
14071 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14072 if (!X)
14073 return SDValue();
14074
14075 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14076 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14077 CSINC.getOperand(0)),
14078 CSINC.getOperand(1), CSINC.getOperand(2),
14079 CSINC.getOperand(3));
14080}
14081
14082/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14083///
14084static SDValue PerformSUBCombine(SDNode *N,
14085 TargetLowering::DAGCombinerInfo &DCI,
14086 const ARMSubtarget *Subtarget) {
14087 SDValue N0 = N->getOperand(0);
14088 SDValue N1 = N->getOperand(1);
14089
14090 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14091 if (N1.getNode()->hasOneUse())
14092 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14093 return Result;
14094
14095 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14096 return R;
14097
14098 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14099 return SDValue();
14100
14101 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14102 // so that we can readily pattern match more mve instructions which can use
14103 // a scalar operand.
14104 SDValue VDup = N->getOperand(1);
14105 if (VDup->getOpcode() != ARMISD::VDUP)
14106 return SDValue();
14107
14108 SDValue VMov = N->getOperand(0);
14109 if (VMov->getOpcode() == ISD::BITCAST)
14110 VMov = VMov->getOperand(0);
14111
14112 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14113 return SDValue();
14114
14115 SDLoc dl(N);
14116 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14117 DCI.DAG.getConstant(0, dl, MVT::i32),
14118 VDup->getOperand(0));
14119 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14120}
14121
14122/// PerformVMULCombine
14123/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14124/// special multiplier accumulator forwarding.
14125/// vmul d3, d0, d2
14126/// vmla d3, d1, d2
14127/// is faster than
14128/// vadd d3, d0, d1
14129/// vmul d3, d3, d2
14130// However, for (A + B) * (A + B),
14131// vadd d2, d0, d1
14132// vmul d3, d0, d2
14133// vmla d3, d1, d2
14134// is slower than
14135// vadd d2, d0, d1
14136// vmul d3, d2, d2
14137static SDValue PerformVMULCombine(SDNode *N,
14138 TargetLowering::DAGCombinerInfo &DCI,
14139 const ARMSubtarget *Subtarget) {
14140 if (!Subtarget->hasVMLxForwarding())
14141 return SDValue();
14142
14143 SelectionDAG &DAG = DCI.DAG;
14144 SDValue N0 = N->getOperand(0);
14145 SDValue N1 = N->getOperand(1);
14146 unsigned Opcode = N0.getOpcode();
14147 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14148 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14149 Opcode = N1.getOpcode();
14150 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14151 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14152 return SDValue();
14153 std::swap(N0, N1);
14154 }
14155
14156 if (N0 == N1)
14157 return SDValue();
14158
14159 EVT VT = N->getValueType(0);
14160 SDLoc DL(N);
14161 SDValue N00 = N0->getOperand(0);
14162 SDValue N01 = N0->getOperand(1);
14163 return DAG.getNode(Opcode, DL, VT,
14164 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14165 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14166}
14167
14168static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14169 const ARMSubtarget *Subtarget) {
14170 EVT VT = N->getValueType(0);
14171 if (VT != MVT::v2i64)
14172 return SDValue();
14173
14174 SDValue N0 = N->getOperand(0);
14175 SDValue N1 = N->getOperand(1);
14176
14177 auto IsSignExt = [&](SDValue Op) {
14178 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14179 return SDValue();
14180 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14181 if (VT.getScalarSizeInBits() == 32)
14182 return Op->getOperand(0);
14183 return SDValue();
14184 };
14185 auto IsZeroExt = [&](SDValue Op) {
14186 // Zero extends are a little more awkward. At the point we are matching
14187 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14188 // That might be before or after a bitcast depending on how the and is
14189 // placed. Because this has to look through bitcasts, it is currently only
14190 // supported on LE.
14191 if (!Subtarget->isLittle())
14192 return SDValue();
14193
14194 SDValue And = Op;
14195 if (And->getOpcode() == ISD::BITCAST)
14196 And = And->getOperand(0);
14197 if (And->getOpcode() != ISD::AND)
14198 return SDValue();
14199 SDValue Mask = And->getOperand(1);
14200 if (Mask->getOpcode() == ISD::BITCAST)
14201 Mask = Mask->getOperand(0);
14202
14203 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14204 Mask.getValueType() != MVT::v4i32)
14205 return SDValue();
14206 if (isAllOnesConstant(Mask->getOperand(0)) &&
14207 isNullConstant(Mask->getOperand(1)) &&
14208 isAllOnesConstant(Mask->getOperand(2)) &&
14209 isNullConstant(Mask->getOperand(3)))
14210 return And->getOperand(0);
14211 return SDValue();
14212 };
14213
14214 SDLoc dl(N);
14215 if (SDValue Op0 = IsSignExt(N0)) {
14216 if (SDValue Op1 = IsSignExt(N1)) {
14217 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14218 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14219 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14220 }
14221 }
14222 if (SDValue Op0 = IsZeroExt(N0)) {
14223 if (SDValue Op1 = IsZeroExt(N1)) {
14224 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14225 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14226 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14227 }
14228 }
14229
14230 return SDValue();
14231}
14232
14233static SDValue PerformMULCombine(SDNode *N,
14234 TargetLowering::DAGCombinerInfo &DCI,
14235 const ARMSubtarget *Subtarget) {
14236 SelectionDAG &DAG = DCI.DAG;
14237
14238 EVT VT = N->getValueType(0);
14239 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14240 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14241
14242 if (Subtarget->isThumb1Only())
14243 return SDValue();
14244
14245 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14246 return SDValue();
14247
14248 if (VT.is64BitVector() || VT.is128BitVector())
14249 return PerformVMULCombine(N, DCI, Subtarget);
14250 if (VT != MVT::i32)
14251 return SDValue();
14252
14253 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14254 if (!C)
14255 return SDValue();
14256
14257 int64_t MulAmt = C->getSExtValue();
14258 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14259
14260 ShiftAmt = ShiftAmt & (32 - 1);
14261 SDValue V = N->getOperand(0);
14262 SDLoc DL(N);
14263
14264 SDValue Res;
14265 MulAmt >>= ShiftAmt;
14266
14267 if (MulAmt >= 0) {
14268 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14269 // (mul x, 2^N + 1) => (add (shl x, N), x)
14270 Res = DAG.getNode(ISD::ADD, DL, VT,
14271 V,
14272 DAG.getNode(ISD::SHL, DL, VT,
14273 V,
14274 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14275 MVT::i32)));
14276 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14277 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14278 Res = DAG.getNode(ISD::SUB, DL, VT,
14279 DAG.getNode(ISD::SHL, DL, VT,
14280 V,
14281 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14282 MVT::i32)),
14283 V);
14284 } else
14285 return SDValue();
14286 } else {
14287 uint64_t MulAmtAbs = -MulAmt;
14288 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14289 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14290 Res = DAG.getNode(ISD::SUB, DL, VT,
14291 V,
14292 DAG.getNode(ISD::SHL, DL, VT,
14293 V,
14294 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14295 MVT::i32)));
14296 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14297 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14298 Res = DAG.getNode(ISD::ADD, DL, VT,
14299 V,
14300 DAG.getNode(ISD::SHL, DL, VT,
14301 V,
14302 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14303 MVT::i32)));
14304 Res = DAG.getNode(ISD::SUB, DL, VT,
14305 DAG.getConstant(0, DL, MVT::i32), Res);
14306 } else
14307 return SDValue();
14308 }
14309
14310 if (ShiftAmt != 0)
14311 Res = DAG.getNode(ISD::SHL, DL, VT,
14312 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14313
14314 // Do not add new nodes to DAG combiner worklist.
14315 DCI.CombineTo(N, Res, false);
14316 return SDValue();
14317}
14318
14319static SDValue CombineANDShift(SDNode *N,
14320 TargetLowering::DAGCombinerInfo &DCI,
14321 const ARMSubtarget *Subtarget) {
14322 // Allow DAGCombine to pattern-match before we touch the canonical form.
14323 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14324 return SDValue();
14325
14326 if (N->getValueType(0) != MVT::i32)
14327 return SDValue();
14328
14329 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14330 if (!N1C)
14331 return SDValue();
14332
14333 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14334 // Don't transform uxtb/uxth.
14335 if (C1 == 255 || C1 == 65535)
14336 return SDValue();
14337
14338 SDNode *N0 = N->getOperand(0).getNode();
14339 if (!N0->hasOneUse())
14340 return SDValue();
14341
14342 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14343 return SDValue();
14344
14345 bool LeftShift = N0->getOpcode() == ISD::SHL;
14346
14347 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14348 if (!N01C)
14349 return SDValue();
14350
14351 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14352 if (!C2 || C2 >= 32)
14353 return SDValue();
14354
14355 // Clear irrelevant bits in the mask.
14356 if (LeftShift)
14357 C1 &= (-1U << C2);
14358 else
14359 C1 &= (-1U >> C2);
14360
14361 SelectionDAG &DAG = DCI.DAG;
14362 SDLoc DL(N);
14363
14364 // We have a pattern of the form "(and (shl x, c2) c1)" or
14365 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14366 // transform to a pair of shifts, to save materializing c1.
14367
14368 // First pattern: right shift, then mask off leading bits.
14369 // FIXME: Use demanded bits?
14370 if (!LeftShift && isMask_32(C1)) {
14371 uint32_t C3 = llvm::countl_zero(C1);
14372 if (C2 < C3) {
14373 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14374 DAG.getConstant(C3 - C2, DL, MVT::i32));
14375 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14376 DAG.getConstant(C3, DL, MVT::i32));
14377 }
14378 }
14379
14380 // First pattern, reversed: left shift, then mask off trailing bits.
14381 if (LeftShift && isMask_32(~C1)) {
14382 uint32_t C3 = llvm::countr_zero(C1);
14383 if (C2 < C3) {
14384 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14385 DAG.getConstant(C3 - C2, DL, MVT::i32));
14386 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14387 DAG.getConstant(C3, DL, MVT::i32));
14388 }
14389 }
14390
14391 // Second pattern: left shift, then mask off leading bits.
14392 // FIXME: Use demanded bits?
14393 if (LeftShift && isShiftedMask_32(C1)) {
14394 uint32_t Trailing = llvm::countr_zero(C1);
14395 uint32_t C3 = llvm::countl_zero(C1);
14396 if (Trailing == C2 && C2 + C3 < 32) {
14397 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14398 DAG.getConstant(C2 + C3, DL, MVT::i32));
14399 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14400 DAG.getConstant(C3, DL, MVT::i32));
14401 }
14402 }
14403
14404 // Second pattern, reversed: right shift, then mask off trailing bits.
14405 // FIXME: Handle other patterns of known/demanded bits.
14406 if (!LeftShift && isShiftedMask_32(C1)) {
14407 uint32_t Leading = llvm::countl_zero(C1);
14408 uint32_t C3 = llvm::countr_zero(C1);
14409 if (Leading == C2 && C2 + C3 < 32) {
14410 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14411 DAG.getConstant(C2 + C3, DL, MVT::i32));
14412 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14413 DAG.getConstant(C3, DL, MVT::i32));
14414 }
14415 }
14416
14417 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14418 // if "c1 >> c2" is a cheaper immediate than "c1"
14419 if (LeftShift &&
14420 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14421
14422 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14423 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14424 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14425 DAG.getConstant(C2, DL, MVT::i32));
14426 }
14427
14428 return SDValue();
14429}
14430
14431static SDValue PerformANDCombine(SDNode *N,
14432 TargetLowering::DAGCombinerInfo &DCI,
14433 const ARMSubtarget *Subtarget) {
14434 // Attempt to use immediate-form VBIC
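  // (An AND with a splatted constant C becomes VBIC #imm when ~C is
  // representable as a VMOV-style modified immediate, e.g.
  // and x, #0xffffff00 -> vbic.i32 x, #0xff.)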
14435 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14436 SDLoc dl(N);
14437 EVT VT = N->getValueType(0);
14438 SelectionDAG &DAG = DCI.DAG;
14439
14440 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14441 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14442 return SDValue();
14443
14444 APInt SplatBits, SplatUndef;
14445 unsigned SplatBitSize;
14446 bool HasAnyUndefs;
14447 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14448 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14449 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14450 SplatBitSize == 64) {
14451 EVT VbicVT;
14452 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14453 SplatUndef.getZExtValue(), SplatBitSize,
14454 DAG, dl, VbicVT, VT, OtherModImm);
14455 if (Val.getNode()) {
14456 SDValue Input =
14457 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14458 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14459 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14460 }
14461 }
14462 }
14463
14464 if (!Subtarget->isThumb1Only()) {
14465 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14466 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14467 return Result;
14468
14469 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14470 return Result;
14471 }
14472
14473 if (Subtarget->isThumb1Only())
14474 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14475 return Result;
14476
14477 return SDValue();
14478}
14479
14480// Try combining OR nodes to SMULWB, SMULWT.
14481static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14482 TargetLowering::DAGCombinerInfo &DCI,
14483 const ARMSubtarget *Subtarget) {
14484 if (!Subtarget->hasV6Ops() ||
14485 (Subtarget->isThumb() &&
14486 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14487 return SDValue();
14488
14489 SDValue SRL = OR->getOperand(0);
14490 SDValue SHL = OR->getOperand(1);
14491
14492 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14493 SRL = OR->getOperand(1);
14494 SHL = OR->getOperand(0);
14495 }
14496 if (!isSRL16(SRL) || !isSHL16(SHL))
14497 return SDValue();
14498
14499 // The first operands to the shifts need to be the two results from the
14500 // same smul_lohi node.
14501 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14502 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14503 return SDValue();
14504
14505 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14506 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14507 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14508 return SDValue();
14509
14510 // Now we have:
14511 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14512 // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14513 // For SMULWB the 16-bit value will be sign extended somehow.
14514 // For SMULWT only the SRA is required.
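  // (SMULWB multiplies a 32-bit operand by the bottom 16 bits of the other
  // and keeps the top 32 bits of the 48-bit product; SMULWT does the same
  // with the top 16 bits, which is why only the SRA #16 is needed there.)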
14515 // Check both sides of SMUL_LOHI
14516 SDValue OpS16 = SMULLOHI->getOperand(0);
14517 SDValue OpS32 = SMULLOHI->getOperand(1);
14518
14519 SelectionDAG &DAG = DCI.DAG;
14520 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14521 OpS16 = OpS32;
14522 OpS32 = SMULLOHI->getOperand(0);
14523 }
14524
14525 SDLoc dl(OR);
14526 unsigned Opcode = 0;
14527 if (isS16(OpS16, DAG))
14528 Opcode = ARMISD::SMULWB;
14529 else if (isSRA16(OpS16)) {
14530 Opcode = ARMISD::SMULWT;
14531 OpS16 = OpS16->getOperand(0);
14532 }
14533 else
14534 return SDValue();
14535
14536 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14537 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14538 return SDValue(OR, 0);
14539}
14540
14541static SDValue PerformORCombineToBFI(SDNode *N,
14542 TargetLowering::DAGCombinerInfo &DCI,
14543 const ARMSubtarget *Subtarget) {
14544 // BFI is only available on V6T2+
14545 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14546 return SDValue();
14547
14548 EVT VT = N->getValueType(0);
14549 SDValue N0 = N->getOperand(0);
14550 SDValue N1 = N->getOperand(1);
14551 SelectionDAG &DAG = DCI.DAG;
14552 SDLoc DL(N);
14553 // 1) or (and A, mask), val => ARMbfi A, val, mask
14554 // iff (val & mask) == val
14555 //
14556 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14557 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14558 // && mask == ~mask2
14559 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14560 // && ~mask == mask2
14561 // (i.e., copy a bitfield value into another bitfield of the same width)
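  // For example, case (1) turns (or (and A, 0xffff00ff), 0x3400) into
  // (ARMbfi A, 0x34, 0xffff00ff), i.e. insert the value 0x34 into bits
  // [15:8] of A.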
14562
14563 if (VT != MVT::i32)
14564 return SDValue();
14565
14566 SDValue N00 = N0.getOperand(0);
14567
14568 // The value and the mask need to be constants so we can verify this is
14569 // actually a bitfield set. If the mask is 0xffff, we can do better
14570 // via a movt instruction, so don't use BFI in that case.
14571 SDValue MaskOp = N0.getOperand(1);
14572 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14573 if (!MaskC)
14574 return SDValue();
14575 unsigned Mask = MaskC->getZExtValue();
14576 if (Mask == 0xffff)
14577 return SDValue();
14578 SDValue Res;
14579 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14580 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14581 if (N1C) {
14582 unsigned Val = N1C->getZExtValue();
14583 if ((Val & ~Mask) != Val)
14584 return SDValue();
14585
14586 if (ARM::isBitFieldInvertedMask(Mask)) {
14587 Val >>= llvm::countr_zero(~Mask);
14588
14589 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14590 DAG.getConstant(Val, DL, MVT::i32),
14591 DAG.getConstant(Mask, DL, MVT::i32));
14592
14593 DCI.CombineTo(N, Res, false);
14594 // Return value from the original node to inform the combiner that N is
14595 // now dead.
14596 return SDValue(N, 0);
14597 }
14598 } else if (N1.getOpcode() == ISD::AND) {
14599 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14600 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14601 if (!N11C)
14602 return SDValue();
14603 unsigned Mask2 = N11C->getZExtValue();
14604
14605 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14606 // as is to match.
14607 if (ARM::isBitFieldInvertedMask(Mask) &&
14608 (Mask == ~Mask2)) {
14609 // The pack halfword instruction works better for masks that fit it,
14610 // so use that when it's available.
14611 if (Subtarget->hasDSP() &&
14612 (Mask == 0xffff || Mask == 0xffff0000))
14613 return SDValue();
14614 // 2a
14615 unsigned amt = llvm::countr_zero(Mask2);
14616 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14617 DAG.getConstant(amt, DL, MVT::i32));
14618 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14619 DAG.getConstant(Mask, DL, MVT::i32));
14620 DCI.CombineTo(N, Res, false);
14621 // Return value from the original node to inform the combiner that N is
14622 // now dead.
14623 return SDValue(N, 0);
14624 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14625 (~Mask == Mask2)) {
14626 // The pack halfword instruction works better for masks that fit it,
14627 // so use that when it's available.
14628 if (Subtarget->hasDSP() &&
14629 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14630 return SDValue();
14631 // 2b
14632 unsigned lsb = llvm::countr_zero(Mask);
14633 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14634 DAG.getConstant(lsb, DL, MVT::i32));
14635 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14636 DAG.getConstant(Mask2, DL, MVT::i32));
14637 DCI.CombineTo(N, Res, false);
14638 // Return value from the original node to inform the combiner that N is
14639 // now dead.
14640 return SDValue(N, 0);
14641 }
14642 }
14643
14644 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14645 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14646 ARM::isBitFieldInvertedMask(~Mask)) {
14647 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14648 // where lsb(mask) == #shamt and masked bits of B are known zero.
14649 SDValue ShAmt = N00.getOperand(1);
14650 unsigned ShAmtC = ShAmt->getAsZExtVal();
14651 unsigned LSB = llvm::countr_zero(Mask);
14652 if (ShAmtC != LSB)
14653 return SDValue();
14654
14655 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14656 DAG.getConstant(~Mask, DL, MVT::i32));
14657
14658 DCI.CombineTo(N, Res, false);
14659 // Return value from the original node to inform the combiner that N is
14660 // now dead.
14661 return SDValue(N, 0);
14662 }
14663
14664 return SDValue();
14665}
14666
14667static bool isValidMVECond(unsigned CC, bool IsFloat) {
14668 switch (CC) {
14669 case ARMCC::EQ:
14670 case ARMCC::NE:
14671 case ARMCC::LE:
14672 case ARMCC::GT:
14673 case ARMCC::GE:
14674 case ARMCC::LT:
14675 return true;
14676 case ARMCC::HS:
14677 case ARMCC::HI:
14678 return !IsFloat;
14679 default:
14680 return false;
14681 };
14682}
14683
14684static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14685 if (N->getOpcode() == ARMISD::VCMP)
14686 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14687 else if (N->getOpcode() == ARMISD::VCMPZ)
14688 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14689 else
14690 llvm_unreachable("Not a VCMP/VCMPZ!");
14691}
14692
14693static bool CanInvertMVEVCMP(SDValue N) {
14694 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14695 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14696}
14697
14698static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14699 const ARMSubtarget *Subtarget) {
14700 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14701 // together with predicates
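  // (This is De Morgan: A | B == ~(~A & ~B), and inverting a VCMP is free
  // because its condition code can simply be flipped.)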
14702 EVT VT = N->getValueType(0);
14703 SDLoc DL(N);
14704 SDValue N0 = N->getOperand(0);
14705 SDValue N1 = N->getOperand(1);
14706
14707 auto IsFreelyInvertable = [&](SDValue V) {
14708 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14709 return CanInvertMVEVCMP(V);
14710 return false;
14711 };
14712
14713 // At least one operand must be freely invertable.
14714 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14715 return SDValue();
14716
14717 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14718 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14719 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14720 return DAG.getLogicalNOT(DL, And, VT);
14721}
14722
14723/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14724static SDValue PerformORCombine(SDNode *N,
14725 TargetLowering::DAGCombinerInfo &DCI,
14726 const ARMSubtarget *Subtarget) {
14727 // Attempt to use immediate-form VORR
14728 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14729 SDLoc dl(N);
14730 EVT VT = N->getValueType(0);
14731 SelectionDAG &DAG = DCI.DAG;
14732
14733 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14734 return SDValue();
14735
14736 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14737 VT == MVT::v8i1 || VT == MVT::v16i1))
14738 return PerformORCombine_i1(N, DAG, Subtarget);
14739
14740 APInt SplatBits, SplatUndef;
14741 unsigned SplatBitSize;
14742 bool HasAnyUndefs;
14743 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14744 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14745 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14746 SplatBitSize == 64) {
14747 EVT VorrVT;
14748 SDValue Val =
14749 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14750 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14751 if (Val.getNode()) {
14752 SDValue Input =
14753 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14754 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14755 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14756 }
14757 }
14758 }
14759
14760 if (!Subtarget->isThumb1Only()) {
14761 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14762 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14763 return Result;
14764 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14765 return Result;
14766 }
14767
14768 SDValue N0 = N->getOperand(0);
14769 SDValue N1 = N->getOperand(1);
14770
14771 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14772 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14773 DCI.isBeforeLegalizeOps()) {
14774
14775 // The code below optimizes (or (and X, Y), Z).
14776 // The AND operand needs to have a single user to make these optimizations
14777 // profitable.
14778 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14779 return SDValue();
14780
14781 APInt SplatUndef;
14782 unsigned SplatBitSize;
14783 bool HasAnyUndefs;
14784
14785 APInt SplatBits0, SplatBits1;
14786 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14787 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14788 // Ensure that the second operand of both ANDs is a constant splat.
14789 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14790 HasAnyUndefs) && !HasAnyUndefs) {
14791 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14792 HasAnyUndefs) && !HasAnyUndefs) {
14793 // Ensure that the bit widths of the constants are the same and that
14794 // the splat arguments are logical inverses as per the pattern we
14795 // are trying to simplify.
14796 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14797 SplatBits0 == ~SplatBits1) {
14798 // Canonicalize the vector type to make instruction selection
14799 // simpler.
14800 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14801 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14802 N0->getOperand(1),
14803 N0->getOperand(0),
14804 N1->getOperand(0));
14805 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14806 }
14807 }
14808 }
14809 }
14810
14811 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14812 // reasonable.
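 // For example (illustrative masks):
 //   (or (and x, 0xFFFF0000), (and y, 0x0000FFFF))
 // writes the low 16 bits of y into the low half of x, which a single BFI
 // (insert a 16-bit field at bit 0) can express without the two ANDs and the OR.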
14813 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14814 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14815 return Res;
14816 }
14817
14818 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14819 return Result;
14820
14821 return SDValue();
14822}
14823
14824static SDValue PerformXORCombine(SDNode *N,
14825 TargetLowering::DAGCombinerInfo &DCI,
14826 const ARMSubtarget *Subtarget) {
14827 EVT VT = N->getValueType(0);
14828 SelectionDAG &DAG = DCI.DAG;
14829
14831 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14831 return SDValue();
14832
14833 if (!Subtarget->isThumb1Only()) {
14834 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14835 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14836 return Result;
14837
14838 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14839 return Result;
14840 }
14841
14842 if (Subtarget->hasMVEIntegerOps()) {
14843 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14844 SDValue N0 = N->getOperand(0);
14845 SDValue N1 = N->getOperand(1);
14846 const TargetLowering *TLI = Subtarget->getTargetLowering();
14847 if (TLI->isConstTrueVal(N1) &&
14848 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14849 if (CanInvertMVEVCMP(N0)) {
14850 SDLoc DL(N0);
14851 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14852
14853 SmallVector<SDValue, 4> Ops;
14854 Ops.push_back(N0->getOperand(0));
14855 if (N0->getOpcode() == ARMISD::VCMP)
14856 Ops.push_back(N0->getOperand(1));
14857 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14858 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14859 }
14860 }
14861 }
14862
14863 return SDValue();
14864}
14865
14866// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14867// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14868// their position in "to" (Rd).
14869static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14870 assert(N->getOpcode() == ARMISD::BFI);
14871
14872 SDValue From = N->getOperand(1);
14873 ToMask = ~N->getConstantOperandAPInt(2);
14874 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14875
14876 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14877 // #C in the base of the SHR.
14878 if (From->getOpcode() == ISD::SRL &&
14879 isa<ConstantSDNode>(From->getOperand(1))) {
14880 APInt Shift = From->getConstantOperandAPInt(1);
14881 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14882 FromMask <<= Shift.getLimitedValue(31);
14883 From = From->getOperand(0);
14884 }
14885
14886 return From;
14887}
14888
14889// If A and B contain one contiguous set of bits, does A | B == A . B?
14890//
14891// Neither A nor B may be zero.
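// For example (illustrative values): A = 0b1100 and B = 0b0011 concatenate
// properly (A | B == 0b1111, no gap and no overlap), while A = 0b1100 and
// B = 0b0001 do not, because bit 1 would be left unset.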
14892static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14893 unsigned LastActiveBitInA = A.countr_zero();
14894 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14895 return LastActiveBitInA - 1 == FirstActiveBitInB;
14896}
14897
14898static SDValue FindBFIToCombineWith(SDNode *N) {
14899 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14900 APInt ToMask, FromMask;
14901 SDValue From = ParseBFI(N, ToMask, FromMask);
14902 SDValue To = N->getOperand(0);
14903
14904 SDValue V = To;
14905 if (V.getOpcode() != ARMISD::BFI)
14906 return SDValue();
14907
14908 APInt NewToMask, NewFromMask;
14909 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14910 if (NewFrom != From)
14911 return SDValue();
14912
14913 // Do the written bits conflict with any we've seen so far?
14914 if ((NewToMask & ToMask).getBoolValue())
14915 // Conflicting bits.
14916 return SDValue();
14917
14918 // Are the new bits contiguous when combined with the old bits?
14919 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14920 BitsProperlyConcatenate(FromMask, NewFromMask))
14921 return V;
14922 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14923 BitsProperlyConcatenate(NewFromMask, FromMask))
14924 return V;
14925
14926 return SDValue();
14927}
14928
14929static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14930 SDValue N0 = N->getOperand(0);
14931 SDValue N1 = N->getOperand(1);
14932
14933 if (N1.getOpcode() == ISD::AND) {
14934 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14935 // the bits being cleared by the AND are not demanded by the BFI.
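 // Illustrative example: in (bfi A, (and B, 0x00FF), 0xFFFFFF00) the BFI only
 // inserts the low 8 bits of its second operand, and the AND with 0x00FF keeps
 // exactly those bits, so the AND is redundant and can be dropped.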
14936 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14937 if (!N11C)
14938 return SDValue();
14939 unsigned InvMask = N->getConstantOperandVal(2);
14940 unsigned LSB = llvm::countr_zero(~InvMask);
14941 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14942 assert(Width <
14943 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14944 "undefined behavior");
14945 unsigned Mask = (1u << Width) - 1;
14946 unsigned Mask2 = N11C->getZExtValue();
14947 if ((Mask & (~Mask2)) == 0)
14948 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14949 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14950 return SDValue();
14951 }
14952
14953 // Look for another BFI to combine with.
14954 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14955 // We've found a BFI.
14956 APInt ToMask1, FromMask1;
14957 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14958
14959 APInt ToMask2, FromMask2;
14960 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14961 assert(From1 == From2);
14962 (void)From2;
14963
14964 // Create a new BFI, combining the two together.
14965 APInt NewFromMask = FromMask1 | FromMask2;
14966 APInt NewToMask = ToMask1 | ToMask2;
14967
14968 EVT VT = N->getValueType(0);
14969 SDLoc dl(N);
14970
14971 if (NewFromMask[0] == 0)
14972 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14973 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14974 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14975 DAG.getConstant(~NewToMask, dl, VT));
14976 }
14977
14978 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14979 // that lower bit insertions are performed first, providing that M1 and M2
14980 // do not overlap. This can allow multiple BFI instructions to be combined
14981 // together by the other folds above.
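 // For example (illustrative fields): if the inner BFI writes bits [23:16] and
 // the outer BFI writes bits [15:8], the rewrite moves the [15:8] insertion
 // innermost so the two adjacent fields can then be merged by the fold above.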
14982 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14983 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14984 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14985
14986 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14987 ToMask1.countl_zero() < ToMask2.countl_zero())
14988 return SDValue();
14989
14990 EVT VT = N->getValueType(0);
14991 SDLoc dl(N);
14992 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14993 N->getOperand(1), N->getOperand(2));
14994 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14995 N0.getOperand(2));
14996 }
14997
14998 return SDValue();
14999}
15000
15001// Check that N is CMPZ(CSINC(0, 0, CC, X)),
15002// or CMPZ(CMOV(1, 0, CC, X))
15003// return X if valid.
15004static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
15005 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15006 return SDValue();
15007 SDValue CSInc = Cmp->getOperand(0);
15008
15009 // Ignore any `And 1` nodes that may not yet have been removed. We are
15010 // looking for a value that produces 1/0, so these have no effect on the
15011 // code.
15012 while (CSInc.getOpcode() == ISD::AND &&
15013 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15014 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15015 CSInc = CSInc.getOperand(0);
15016
15017 if (CSInc.getOpcode() == ARMISD::CSINC &&
15018 isNullConstant(CSInc.getOperand(0)) &&
15019 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15020 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15021 return CSInc.getOperand(3);
15022 }
15023 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15024 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15025 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
15026 return CSInc.getOperand(3);
15027 }
15028 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15029 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15030 CC = ARMCC::getOppositeCondition(
15031 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
15032 return CSInc.getOperand(3);
15033 }
15034 return SDValue();
15035}
15036
15037static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
15038 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15039 // t92: flags = ARMISD::CMPZ t74, 0
15040 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15041 // t96: flags = ARMISD::CMPZ t93, 0
15042 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15043 ARMCC::CondCodes Cond;
15044 if (SDValue C = IsCMPZCSINC(N, Cond))
15045 if (Cond == ARMCC::EQ)
15046 return C;
15047 return SDValue();
15048}
15049
15050static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
15051 // Fold away an unnecessary CMPZ/CSINC
15052 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15053 // if C1==EQ -> CSXYZ A, B, C2, D
15054 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15055 ARMCC::CondCodes Cond;
15056 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15057 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15058 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15059 N->getOperand(1),
15060 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15061 if (N->getConstantOperandVal(2) == ARMCC::NE)
15062 return DAG.getNode(
15063 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15064 N->getOperand(1),
15065 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
15066 }
15067 return SDValue();
15068}
15069
15070/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15071/// ARMISD::VMOVRRD.
15072static SDValue PerformVMOVRRDCombine(SDNode *N,
15073 TargetLowering::DAGCombinerInfo &DCI,
15074 const ARMSubtarget *Subtarget) {
15075 // vmovrrd(vmovdrr x, y) -> x,y
15076 SDValue InDouble = N->getOperand(0);
15077 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15078 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15079
15080 // vmovrrd(load f64) -> (load i32), (load i32)
15081 SDNode *InNode = InDouble.getNode();
15082 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15083 InNode->getValueType(0) == MVT::f64 &&
15084 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15085 !cast<LoadSDNode>(InNode)->isVolatile()) {
15086 // TODO: Should this be done for non-FrameIndex operands?
15087 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15088
15089 SelectionDAG &DAG = DCI.DAG;
15090 SDLoc DL(LD);
15091 SDValue BasePtr = LD->getBasePtr();
15092 SDValue NewLD1 =
15093 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15094 LD->getAlign(), LD->getMemOperand()->getFlags());
15095
15096 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15097 DAG.getConstant(4, DL, MVT::i32));
15098
15099 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15100 LD->getPointerInfo().getWithOffset(4),
15101 commonAlignment(LD->getAlign(), 4),
15102 LD->getMemOperand()->getFlags());
15103
15104 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15105 if (DCI.DAG.getDataLayout().isBigEndian())
15106 std::swap (NewLD1, NewLD2);
15107 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15108 return Result;
15109 }
15110
15111 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15112 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15113 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15114 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15115 SDValue BV = InDouble.getOperand(0);
15116 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15117 // change lane order under big endian.
15118 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15119 while (
15120 (BV.getOpcode() == ISD::BITCAST ||
15121 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15122 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15123 BVSwap = BV.getOpcode() == ISD::BITCAST;
15124 BV = BV.getOperand(0);
15125 }
15126 if (BV.getValueType() != MVT::v4i32)
15127 return SDValue();
15128
15129 // Handle buildvectors, pulling out the correct lane depending on
15130 // endianness.
15131 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15132 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15133 SDValue Op0 = BV.getOperand(Offset);
15134 SDValue Op1 = BV.getOperand(Offset + 1);
15135 if (!Subtarget->isLittle() && BVSwap)
15136 std::swap(Op0, Op1);
15137
15138 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15139 }
15140
15141 // A chain of insert_vectors, grabbing the correct value of the chain of
15142 // inserts.
15143 SDValue Op0, Op1;
15144 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15145 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15146 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15147 Op0 = BV.getOperand(1);
15148 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15149 Op1 = BV.getOperand(1);
15150 }
15151 BV = BV.getOperand(0);
15152 }
15153 if (!Subtarget->isLittle() && BVSwap)
15154 std::swap(Op0, Op1);
15155 if (Op0 && Op1)
15156 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15157 }
15158
15159 return SDValue();
15160}
15161
15162/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15163/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15164static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
15165 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15166 SDValue Op0 = N->getOperand(0);
15167 SDValue Op1 = N->getOperand(1);
15168 if (Op0.getOpcode() == ISD::BITCAST)
15169 Op0 = Op0.getOperand(0);
15170 if (Op1.getOpcode() == ISD::BITCAST)
15171 Op1 = Op1.getOperand(0);
15172 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15173 Op0.getNode() == Op1.getNode() &&
15174 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15175 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15176 N->getValueType(0), Op0.getOperand(0));
15177 return SDValue();
15178}
15179
15180static SDValue PerformVMOVhrCombine(SDNode *N,
15181 TargetLowering::DAGCombinerInfo &DCI) {
15182 SDValue Op0 = N->getOperand(0);
15183
15184 // VMOVhr (VMOVrh (X)) -> X
15185 if (Op0->getOpcode() == ARMISD::VMOVrh)
15186 return Op0->getOperand(0);
15187
15188 // FullFP16: half values are passed in S-registers, and we don't
15189 // need any of the bitcasts and moves:
15190 //
15191 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15192 // t5: i32 = bitcast t2
15193 // t18: f16 = ARMISD::VMOVhr t5
15194 // =>
15195 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15196 if (Op0->getOpcode() == ISD::BITCAST) {
15197 SDValue Copy = Op0->getOperand(0);
15198 if (Copy.getValueType() == MVT::f32 &&
15199 Copy->getOpcode() == ISD::CopyFromReg) {
15200 bool HasGlue = Copy->getNumOperands() == 3;
15201 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15202 HasGlue ? Copy->getOperand(2) : SDValue()};
15203 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15204 SDValue NewCopy =
15205 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(Copy),
15206 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15207 ArrayRef(Ops, HasGlue ? 3 : 2));
15208
15209 // Update Users, Chains, and Potential Glue.
15210 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15211 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15212 if (HasGlue)
15213 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15214 NewCopy.getValue(2));
15215
15216 return NewCopy;
15217 }
15218 }
15219
15220 // fold (VMOVhr (load x)) -> (load (f16*)x)
15221 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15222 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15223 LN0->getMemoryVT() == MVT::i16) {
15224 SDValue Load =
15225 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15226 LN0->getBasePtr(), LN0->getMemOperand());
15227 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15228 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15229 return Load;
15230 }
15231 }
15232
15233 // Only the bottom 16 bits of the source register are used.
15234 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15235 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15236 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15237 return SDValue(N, 0);
15238
15239 return SDValue();
15240}
15241
15242static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15243 SDValue N0 = N->getOperand(0);
15244 EVT VT = N->getValueType(0);
15245
15246 // fold (VMOVrh (fpconst x)) -> const x
15247 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15248 APFloat V = C->getValueAPF();
15249 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15250 }
15251
15252 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15253 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15254 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15255
15256 SDValue Load =
15257 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15258 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15259 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15260 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15261 return Load;
15262 }
15263
15264 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15265 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15266 isa<ConstantSDNode>(N0->getOperand(1)))
15267 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15268 N0->getOperand(1));
15269
15270 return SDValue();
15271}
15272
15273/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15274/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15275/// i64 vector to have f64 elements, since the value can then be loaded
15276/// directly into a VFP register.
15277static bool hasNormalLoadOperand(SDNode *N) {
15278 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15279 for (unsigned i = 0; i < NumElts; ++i) {
15280 SDNode *Elt = N->getOperand(i).getNode();
15281 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15282 return true;
15283 }
15284 return false;
15285}
15286
15287/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15288/// ISD::BUILD_VECTOR.
15289static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15290 TargetLowering::DAGCombinerInfo &DCI,
15291 const ARMSubtarget *Subtarget) {
15292 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15293 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15294 // into a pair of GPRs, which is fine when the value is used as a scalar,
15295 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15296 SelectionDAG &DAG = DCI.DAG;
15297 if (N->getNumOperands() == 2)
15298 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15299 return RV;
15300
15301 // Load i64 elements as f64 values so that type legalization does not split
15302 // them up into i32 values.
15303 EVT VT = N->getValueType(0);
15304 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15305 return SDValue();
15306 SDLoc dl(N);
15307 SmallVector<SDValue, 8> Ops;
15308 unsigned NumElts = VT.getVectorNumElements();
15309 for (unsigned i = 0; i < NumElts; ++i) {
15310 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15311 Ops.push_back(V);
15312 // Make the DAGCombiner fold the bitcast.
15313 DCI.AddToWorklist(V.getNode());
15314 }
15315 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15316 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15317 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15318}
15319
15320/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15321static SDValue
15322PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15323 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15324 // At that time, we may have inserted bitcasts from integer to float.
15325 // If these bitcasts have survived DAGCombine, change the lowering of this
15326 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15327 // force to use floating point types.
15328
15329 // Make sure we can change the type of the vector.
15330 // This is possible iff:
15331 // 1. The vector is only used in a bitcast to a integer type. I.e.,
15332 // 1.1. Vector is used only once.
15333 // 1.2. Use is a bit convert to an integer type.
15334 // 2. The size of its operands are 32-bits (64-bits are not legal).
15335 EVT VT = N->getValueType(0);
15336 EVT EltVT = VT.getVectorElementType();
15337
15338 // Check 1.1. and 2.
15339 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15340 return SDValue();
15341
15342 // By construction, the input type must be float.
15343 assert(EltVT == MVT::f32 && "Unexpected type!");
15344
15345 // Check 1.2.
15346 SDNode *Use = *N->user_begin();
15347 if (Use->getOpcode() != ISD::BITCAST ||
15348 Use->getValueType(0).isFloatingPoint())
15349 return SDValue();
15350
15351 // Check profitability.
15352 // Model is, if more than half of the relevant operands are bitcast from
15353 // i32, turn the build_vector into a sequence of insert_vector_elt.
15354 // Relevant operands are everything that is not statically
15355 // (i.e., at compile time) bitcasted.
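 // For example (illustrative): in a 4-element ARMISD::BUILD_VECTOR with one
 // constant operand there are 3 relevant operands, so at least 2 of them must
 // come from an i32 bitcast before the rewrite below is considered profitable.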
15356 unsigned NumOfBitCastedElts = 0;
15357 unsigned NumElts = VT.getVectorNumElements();
15358 unsigned NumOfRelevantElts = NumElts;
15359 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15360 SDValue Elt = N->getOperand(Idx);
15361 if (Elt->getOpcode() == ISD::BITCAST) {
15362 // Assume only bit cast to i32 will go away.
15363 if (Elt->getOperand(0).getValueType() == MVT::i32)
15364 ++NumOfBitCastedElts;
15365 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15366 // Constants are statically casted, thus do not count them as
15367 // relevant operands.
15368 --NumOfRelevantElts;
15369 }
15370
15371 // Check if more than half of the elements require a non-free bitcast.
15372 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15373 return SDValue();
15374
15375 SelectionDAG &DAG = DCI.DAG;
15376 // Create the new vector type.
15377 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15378 // Check if the type is legal.
15379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15380 if (!TLI.isTypeLegal(VecVT))
15381 return SDValue();
15382
15383 // Combine:
15384 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15385 // => BITCAST INSERT_VECTOR_ELT
15386 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15387 // (BITCAST EN), N.
15388 SDValue Vec = DAG.getUNDEF(VecVT);
15389 SDLoc dl(N);
15390 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15391 SDValue V = N->getOperand(Idx);
15392 if (V.isUndef())
15393 continue;
15394 if (V.getOpcode() == ISD::BITCAST &&
15395 V->getOperand(0).getValueType() == MVT::i32)
15396 // Fold obvious case.
15397 V = V.getOperand(0);
15398 else {
15399 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15400 // Make the DAGCombiner fold the bitcasts.
15401 DCI.AddToWorklist(V.getNode());
15402 }
15403 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15404 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15405 }
15406 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15407 // Make the DAGCombiner fold the bitcasts.
15408 DCI.AddToWorklist(Vec.getNode());
15409 return Vec;
15410}
15411
15412static SDValue
15413PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15414 EVT VT = N->getValueType(0);
15415 SDValue Op = N->getOperand(0);
15416 SDLoc dl(N);
15417
15418 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15419 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15420 // If the valuetypes are the same, we can remove the cast entirely.
15421 if (Op->getOperand(0).getValueType() == VT)
15422 return Op->getOperand(0);
15423 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15424 }
15425
15426 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15427 // more VPNOT which might get folded as else predicates.
15428 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15429 SDValue X =
15430 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15431 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15432 DCI.DAG.getConstant(65535, dl, MVT::i32));
15433 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15434 }
15435
15436 // Only the bottom 16 bits of the source register are used.
15437 if (Op.getValueType() == MVT::i32) {
15438 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15439 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15440 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15441 return SDValue(N, 0);
15442 }
15443 return SDValue();
15444}
15445
15446static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15447 const ARMSubtarget *ST) {
15448 EVT VT = N->getValueType(0);
15449 SDValue Op = N->getOperand(0);
15450 SDLoc dl(N);
15451
15452 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15453 if (ST->isLittle())
15454 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15455
15456 // VT VECTOR_REG_CAST (VT Op) -> Op
15457 if (Op.getValueType() == VT)
15458 return Op;
15459 // VECTOR_REG_CAST undef -> undef
15460 if (Op.isUndef())
15461 return DAG.getUNDEF(VT);
15462
15463 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15464 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15465 // If the valuetypes are the same, we can remove the cast entirely.
15466 if (Op->getOperand(0).getValueType() == VT)
15467 return Op->getOperand(0);
15468 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15469 }
15470
15471 return SDValue();
15472}
15473
15474static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15475 const ARMSubtarget *Subtarget) {
15476 if (!Subtarget->hasMVEIntegerOps())
15477 return SDValue();
15478
15479 EVT VT = N->getValueType(0);
15480 SDValue Op0 = N->getOperand(0);
15481 SDValue Op1 = N->getOperand(1);
15482 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15483 SDLoc dl(N);
15484
15485 // vcmp X, 0, cc -> vcmpz X, cc
15486 if (isZeroVector(Op1))
15487 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15488
15489 unsigned SwappedCond = getSwappedCondition(Cond);
15490 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15491 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15492 if (isZeroVector(Op0))
15493 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15494 DAG.getConstant(SwappedCond, dl, MVT::i32));
15495 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15496 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15497 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15498 DAG.getConstant(SwappedCond, dl, MVT::i32));
15499 }
15500
15501 return SDValue();
15502}
15503
15504/// PerformInsertEltCombine - Target-specific dag combine xforms for
15505/// ISD::INSERT_VECTOR_ELT.
15506static SDValue PerformInsertEltCombine(SDNode *N,
15507 TargetLowering::DAGCombinerInfo &DCI) {
15508 // Bitcast an i64 load inserted into a vector to f64.
15509 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15510 EVT VT = N->getValueType(0);
15511 SDNode *Elt = N->getOperand(1).getNode();
15512 if (VT.getVectorElementType() != MVT::i64 ||
15513 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15514 return SDValue();
15515
15516 SelectionDAG &DAG = DCI.DAG;
15517 SDLoc dl(N);
15518 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15519 VT.getVectorNumElements());
15520 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15521 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15522 // Make the DAGCombiner fold the bitcasts.
15523 DCI.AddToWorklist(Vec.getNode());
15524 DCI.AddToWorklist(V.getNode());
15525 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15526 Vec, V, N->getOperand(2));
15527 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15528}
15529
15530// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15531// directly or bitcast to an integer if the original is a float vector.
15532// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15533// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
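// For example (illustrative lanes): extract(v4i32 x, 2) and extract(v4i32 x, 3)
// become the two i32 results of VMOVRRD(extract(v2f64 x', 1)), where x' is x
// viewed through a VECTOR_REG_CAST.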
15534static SDValue
15535PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15536 EVT VT = N->getValueType(0);
15537 SDLoc dl(N);
15538
15539 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15540 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15541 return SDValue();
15542
15543 SDValue Ext = SDValue(N, 0);
15544 if (Ext.getOpcode() == ISD::BITCAST &&
15545 Ext.getOperand(0).getValueType() == MVT::f32)
15546 Ext = Ext.getOperand(0);
15547 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15548 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15549 Ext.getConstantOperandVal(1) % 2 != 0)
15550 return SDValue();
15551 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15552 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15553 return SDValue();
15554
15555 SDValue Op0 = Ext.getOperand(0);
15556 EVT VecVT = Op0.getValueType();
15557 unsigned ResNo = Op0.getResNo();
15558 unsigned Lane = Ext.getConstantOperandVal(1);
15559 if (VecVT.getVectorNumElements() != 4)
15560 return SDValue();
15561
15562 // Find another extract, of Lane + 1
15563 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15564 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15565 isa<ConstantSDNode>(V->getOperand(1)) &&
15566 V->getConstantOperandVal(1) == Lane + 1 &&
15567 V->getOperand(0).getResNo() == ResNo;
15568 });
15569 if (OtherIt == Op0->users().end())
15570 return SDValue();
15571
15572 // For float extracts, we need to be converting to an i32 for both vector
15573 // lanes.
15574 SDValue OtherExt(*OtherIt, 0);
15575 if (OtherExt.getValueType() != MVT::i32) {
15576 if (!OtherExt->hasOneUse() ||
15577 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15578 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15579 return SDValue();
15580 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15581 }
15582
15583 // Convert the type to a f64 and extract with a VMOVRRD.
15584 SDValue F64 = DCI.DAG.getNode(
15585 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15586 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15587 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15588 SDValue VMOVRRD =
15589 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15590
15591 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15592 return VMOVRRD;
15593}
15594
15595static SDValue PerformExtractEltCombine(SDNode *N,
15596 TargetLowering::DAGCombinerInfo &DCI,
15597 const ARMSubtarget *ST) {
15598 SDValue Op0 = N->getOperand(0);
15599 EVT VT = N->getValueType(0);
15600 SDLoc dl(N);
15601
15602 // extract (vdup x) -> x
15603 if (Op0->getOpcode() == ARMISD::VDUP) {
15604 SDValue X = Op0->getOperand(0);
15605 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15606 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15607 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15608 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15609 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15610 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15611
15612 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15613 X = X->getOperand(0);
15614 if (X.getValueType() == VT)
15615 return X;
15616 }
15617
15618 // extract ARM_BUILD_VECTOR -> x
15619 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15620 isa<ConstantSDNode>(N->getOperand(1)) &&
15621 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15622 return Op0.getOperand(N->getConstantOperandVal(1));
15623 }
15624
15625 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15626 if (Op0.getValueType() == MVT::v4i32 &&
15627 isa<ConstantSDNode>(N->getOperand(1)) &&
15628 Op0.getOpcode() == ISD::BITCAST &&
15629 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
15630 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15631 SDValue BV = Op0.getOperand(0);
15632 unsigned Offset = N->getConstantOperandVal(1);
15633 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15634 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15635 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15636 }
15637
15638 // extract x, n; extract x, n+1 -> VMOVRRD x
15639 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15640 return R;
15641
15642 // extract (MVETrunc(x)) -> extract x
15643 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15644 unsigned Idx = N->getConstantOperandVal(1);
15645 unsigned Vec =
15646 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15647 unsigned SubIdx =
15648 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15649 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15650 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15651 }
15652
15653 return SDValue();
15654}
15655
15656static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15657 SDValue Op = N->getOperand(0);
15658 EVT VT = N->getValueType(0);
15659
15660 // sext_inreg(VGETLANEu) -> VGETLANEs
15661 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15662 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15663 Op.getOperand(0).getValueType().getScalarType())
15664 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15665 Op.getOperand(1));
15666
15667 return SDValue();
15668}
15669
15670static SDValue
15671PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15672 SDValue Vec = N->getOperand(0);
15673 SDValue SubVec = N->getOperand(1);
15674 uint64_t IdxVal = N->getConstantOperandVal(2);
15675 EVT VecVT = Vec.getValueType();
15676 EVT SubVT = SubVec.getValueType();
15677
15678 // Only do this for legal fixed vector types.
15679 if (!VecVT.isFixedLengthVector() ||
15680 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15681 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15682 return SDValue();
15683
15684 // Ignore widening patterns.
15685 if (IdxVal == 0 && Vec.isUndef())
15686 return SDValue();
15687
15688 // Subvector must be half the width and an "aligned" insertion.
15689 unsigned NumSubElts = SubVT.getVectorNumElements();
15690 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15691 (IdxVal != 0 && IdxVal != NumSubElts))
15692 return SDValue();
15693
15694 // Fold insert_subvector -> concat_vectors
15695 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15696 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15697 SDLoc DL(N);
15698 SDValue Lo, Hi;
15699 if (IdxVal == 0) {
15700 Lo = SubVec;
15701 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15702 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15703 } else {
15704 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15705 DCI.DAG.getVectorIdxConstant(0, DL));
15706 Hi = SubVec;
15707 }
15708 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15709}
15710
15711// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15712static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15713 SelectionDAG &DAG) {
15714 SDValue Trunc = N->getOperand(0);
15715 EVT VT = Trunc.getValueType();
15716 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15717 return SDValue();
15718
15719 SDLoc DL(Trunc);
15720 if (isVMOVNTruncMask(N->getMask(), VT, false))
15721 return DAG.getNode(
15722 ARMISD::VMOVN, DL, VT,
15723 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15724 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15725 DAG.getConstant(1, DL, MVT::i32));
15726 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15727 return DAG.getNode(
15728 ARMISD::VMOVN, DL, VT,
15729 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15730 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15731 DAG.getConstant(1, DL, MVT::i32));
15732 return SDValue();
15733}
15734
15735/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15736/// ISD::VECTOR_SHUFFLE.
15737static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15738 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15739 return R;
15740
15741 // The LLVM shufflevector instruction does not require the shuffle mask
15742 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15743 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15744 // operands do not match the mask length, they are extended by concatenating
15745 // them with undef vectors. That is probably the right thing for other
15746 // targets, but for NEON it is better to concatenate two double-register
15747 // size vector operands into a single quad-register size vector. Do that
15748 // transformation here:
15749 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15750 // shuffle(concat(v1, v2), undef)
15751 SDValue Op0 = N->getOperand(0);
15752 SDValue Op1 = N->getOperand(1);
15753 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15754 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15755 Op0.getNumOperands() != 2 ||
15756 Op1.getNumOperands() != 2)
15757 return SDValue();
15758 SDValue Concat0Op1 = Op0.getOperand(1);
15759 SDValue Concat1Op1 = Op1.getOperand(1);
15760 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15761 return SDValue();
15762 // Skip the transformation if any of the types are illegal.
15763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15764 EVT VT = N->getValueType(0);
15765 if (!TLI.isTypeLegal(VT) ||
15766 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15767 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15768 return SDValue();
15769
15770 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15771 Op0.getOperand(0), Op1.getOperand(0));
15772 // Translate the shuffle mask.
15773 SmallVector<int, 16> NewMask;
15774 unsigned NumElts = VT.getVectorNumElements();
15775 unsigned HalfElts = NumElts/2;
15776 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15777 for (unsigned n = 0; n < NumElts; ++n) {
15778 int MaskElt = SVN->getMaskElt(n);
15779 int NewElt = -1;
15780 if (MaskElt < (int)HalfElts)
15781 NewElt = MaskElt;
15782 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15783 NewElt = HalfElts + MaskElt - NumElts;
15784 NewMask.push_back(NewElt);
15785 }
15786 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15787 DAG.getUNDEF(VT), NewMask);
15788}
15789
15790/// Load/store instruction that can be merged with a base address
15791/// update
15792struct BaseUpdateTarget {
15793 SDNode *N;
15794 bool isIntrinsic;
15795 bool isStore;
15796 unsigned AddrOpIdx;
15797};
15798
15799struct BaseUpdateUser {
15800 /// Instruction that updates a pointer
15801 SDNode *N;
15802 /// Pointer increment operand
15803 SDValue Inc;
15804 /// Pointer increment value if it is a constant, or 0 otherwise
15805 unsigned ConstInc;
15806};
15807
15808static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15809 // Check that the add is independent of the load/store.
15810 // Otherwise, folding it would create a cycle. Search through Addr
15811 // as well, since the User may not be a direct user of Addr and
15812 // only share a base pointer.
15813 SmallPtrSet<const SDNode *, 32> Visited;
15814 SmallVector<const SDNode *, 16> Worklist;
15815 Worklist.push_back(N);
15816 Worklist.push_back(User);
15817 const unsigned MaxSteps = 1024;
15818 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15819 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15820 return false;
15821 return true;
15822}
15823
15824static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15825 struct BaseUpdateUser &User,
15826 bool SimpleConstIncOnly,
15827 TargetLowering::DAGCombinerInfo &DCI) {
15828 SelectionDAG &DAG = DCI.DAG;
15829 SDNode *N = Target.N;
15830 MemSDNode *MemN = cast<MemSDNode>(N);
15831 SDLoc dl(N);
15832
15833 // Find the new opcode for the updating load/store.
15834 bool isLoadOp = true;
15835 bool isLaneOp = false;
15836 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15837 // as an operand.
15838 bool hasAlignment = true;
15839 unsigned NewOpc = 0;
15840 unsigned NumVecs = 0;
15841 if (Target.isIntrinsic) {
15842 unsigned IntNo = N->getConstantOperandVal(1);
15843 switch (IntNo) {
15844 default:
15845 llvm_unreachable("unexpected intrinsic for Neon base update");
15846 case Intrinsic::arm_neon_vld1:
15847 NewOpc = ARMISD::VLD1_UPD;
15848 NumVecs = 1;
15849 break;
15850 case Intrinsic::arm_neon_vld2:
15851 NewOpc = ARMISD::VLD2_UPD;
15852 NumVecs = 2;
15853 break;
15854 case Intrinsic::arm_neon_vld3:
15855 NewOpc = ARMISD::VLD3_UPD;
15856 NumVecs = 3;
15857 break;
15858 case Intrinsic::arm_neon_vld4:
15859 NewOpc = ARMISD::VLD4_UPD;
15860 NumVecs = 4;
15861 break;
15862 case Intrinsic::arm_neon_vld1x2:
15863 NewOpc = ARMISD::VLD1x2_UPD;
15864 NumVecs = 2;
15865 hasAlignment = false;
15866 break;
15867 case Intrinsic::arm_neon_vld1x3:
15868 NewOpc = ARMISD::VLD1x3_UPD;
15869 NumVecs = 3;
15870 hasAlignment = false;
15871 break;
15872 case Intrinsic::arm_neon_vld1x4:
15873 NewOpc = ARMISD::VLD1x4_UPD;
15874 NumVecs = 4;
15875 hasAlignment = false;
15876 break;
15877 case Intrinsic::arm_neon_vld2dup:
15878 NewOpc = ARMISD::VLD2DUP_UPD;
15879 NumVecs = 2;
15880 break;
15881 case Intrinsic::arm_neon_vld3dup:
15882 NewOpc = ARMISD::VLD3DUP_UPD;
15883 NumVecs = 3;
15884 break;
15885 case Intrinsic::arm_neon_vld4dup:
15886 NewOpc = ARMISD::VLD4DUP_UPD;
15887 NumVecs = 4;
15888 break;
15889 case Intrinsic::arm_neon_vld2lane:
15890 NewOpc = ARMISD::VLD2LN_UPD;
15891 NumVecs = 2;
15892 isLaneOp = true;
15893 break;
15894 case Intrinsic::arm_neon_vld3lane:
15895 NewOpc = ARMISD::VLD3LN_UPD;
15896 NumVecs = 3;
15897 isLaneOp = true;
15898 break;
15899 case Intrinsic::arm_neon_vld4lane:
15900 NewOpc = ARMISD::VLD4LN_UPD;
15901 NumVecs = 4;
15902 isLaneOp = true;
15903 break;
15904 case Intrinsic::arm_neon_vst1:
15905 NewOpc = ARMISD::VST1_UPD;
15906 NumVecs = 1;
15907 isLoadOp = false;
15908 break;
15909 case Intrinsic::arm_neon_vst2:
15910 NewOpc = ARMISD::VST2_UPD;
15911 NumVecs = 2;
15912 isLoadOp = false;
15913 break;
15914 case Intrinsic::arm_neon_vst3:
15915 NewOpc = ARMISD::VST3_UPD;
15916 NumVecs = 3;
15917 isLoadOp = false;
15918 break;
15919 case Intrinsic::arm_neon_vst4:
15920 NewOpc = ARMISD::VST4_UPD;
15921 NumVecs = 4;
15922 isLoadOp = false;
15923 break;
15924 case Intrinsic::arm_neon_vst2lane:
15925 NewOpc = ARMISD::VST2LN_UPD;
15926 NumVecs = 2;
15927 isLoadOp = false;
15928 isLaneOp = true;
15929 break;
15930 case Intrinsic::arm_neon_vst3lane:
15931 NewOpc = ARMISD::VST3LN_UPD;
15932 NumVecs = 3;
15933 isLoadOp = false;
15934 isLaneOp = true;
15935 break;
15936 case Intrinsic::arm_neon_vst4lane:
15937 NewOpc = ARMISD::VST4LN_UPD;
15938 NumVecs = 4;
15939 isLoadOp = false;
15940 isLaneOp = true;
15941 break;
15942 case Intrinsic::arm_neon_vst1x2:
15943 NewOpc = ARMISD::VST1x2_UPD;
15944 NumVecs = 2;
15945 isLoadOp = false;
15946 hasAlignment = false;
15947 break;
15948 case Intrinsic::arm_neon_vst1x3:
15949 NewOpc = ARMISD::VST1x3_UPD;
15950 NumVecs = 3;
15951 isLoadOp = false;
15952 hasAlignment = false;
15953 break;
15954 case Intrinsic::arm_neon_vst1x4:
15955 NewOpc = ARMISD::VST1x4_UPD;
15956 NumVecs = 4;
15957 isLoadOp = false;
15958 hasAlignment = false;
15959 break;
15960 }
15961 } else {
15962 isLaneOp = true;
15963 switch (N->getOpcode()) {
15964 default:
15965 llvm_unreachable("unexpected opcode for Neon base update");
15966 case ARMISD::VLD1DUP:
15967 NewOpc = ARMISD::VLD1DUP_UPD;
15968 NumVecs = 1;
15969 break;
15970 case ARMISD::VLD2DUP:
15971 NewOpc = ARMISD::VLD2DUP_UPD;
15972 NumVecs = 2;
15973 break;
15974 case ARMISD::VLD3DUP:
15975 NewOpc = ARMISD::VLD3DUP_UPD;
15976 NumVecs = 3;
15977 break;
15978 case ARMISD::VLD4DUP:
15979 NewOpc = ARMISD::VLD4DUP_UPD;
15980 NumVecs = 4;
15981 break;
15982 case ISD::LOAD:
15983 NewOpc = ARMISD::VLD1_UPD;
15984 NumVecs = 1;
15985 isLaneOp = false;
15986 break;
15987 case ISD::STORE:
15988 NewOpc = ARMISD::VST1_UPD;
15989 NumVecs = 1;
15990 isLaneOp = false;
15991 isLoadOp = false;
15992 break;
15993 }
15994 }
15995
15996 // Find the size of memory referenced by the load/store.
15997 EVT VecTy;
15998 if (isLoadOp) {
15999 VecTy = N->getValueType(0);
16000 } else if (Target.isIntrinsic) {
16001 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
16002 } else {
16003 assert(Target.isStore &&
16004 "Node has to be a load, a store, or an intrinsic!");
16005 VecTy = N->getOperand(1).getValueType();
16006 }
16007
16008 bool isVLDDUPOp =
16009 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
16010 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
16011
16012 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16013 if (isLaneOp || isVLDDUPOp)
16014 NumBytes /= VecTy.getVectorNumElements();
16015
16016 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
16017 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
16018 // separate instructions that make it harder to use a non-constant update.
16019 return false;
16020 }
16021
16022 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16023 return false;
16024
16025 if (!isValidBaseUpdate(N, User.N))
16026 return false;
16027
16028 // OK, we found an ADD we can fold into the base update.
16029 // Now, create a _UPD node, taking care of not breaking alignment.
16030
16031 EVT AlignedVecTy = VecTy;
16032 Align Alignment = MemN->getAlign();
16033
16034 // If this is a less-than-standard-aligned load/store, change the type to
16035 // match the standard alignment.
16036 // The alignment is overlooked when selecting _UPD variants; and it's
16037 // easier to introduce bitcasts here than fix that.
16038 // There are 3 ways to get to this base-update combine:
16039 // - intrinsics: they are assumed to be properly aligned (to the standard
16040 // alignment of the memory type), so we don't need to do anything.
16041 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16042 // intrinsics, so, likewise, there's nothing to do.
16043 // - generic load/store instructions: the alignment is specified as an
16044 // explicit operand, rather than implicitly as the standard alignment
16045 // of the memory type (like the intrinsics). We need to change the
16046 // memory type to match the explicit alignment. That way, we don't
16047 // generate non-standard-aligned ARMISD::VLDx nodes.
16048 if (isa<LSBaseSDNode>(N)) {
16049 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16050 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16051 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16052 assert(!isLaneOp && "Unexpected generic load/store lane.");
16053 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16054 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16055 }
16056 // Don't set an explicit alignment on regular load/stores that we want
16057 // to transform to VLD/VST 1_UPD nodes.
16058 // This matches the behavior of regular load/stores, which only get an
16059 // explicit alignment if the MMO alignment is larger than the standard
16060 // alignment of the memory type.
16061 // Intrinsics, however, always get an explicit alignment, set to the
16062 // alignment of the MMO.
16063 Alignment = Align(1);
16064 }
16065
16066 // Create the new updating load/store node.
16067 // First, create an SDVTList for the new updating node's results.
16068 EVT Tys[6];
16069 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16070 unsigned n;
16071 for (n = 0; n < NumResultVecs; ++n)
16072 Tys[n] = AlignedVecTy;
16073 Tys[n++] = MVT::i32;
16074 Tys[n] = MVT::Other;
16075 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16076
16077 // Then, gather the new node's operands.
16078 SmallVector<SDValue, 8> Ops;
16079 Ops.push_back(N->getOperand(0)); // incoming chain
16080 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16081 Ops.push_back(User.Inc);
16082
16083 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16084 // Try to match the intrinsic's signature
16085 Ops.push_back(StN->getValue());
16086 } else {
16087 // Loads (and of course intrinsics) match the intrinsics' signature,
16088 // so just add all but the alignment operand.
16089 unsigned LastOperand =
16090 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16091 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16092 Ops.push_back(N->getOperand(i));
16093 }
16094
16095 // For all node types, the alignment operand is always the last one.
16096 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16097
16098 // If this is a non-standard-aligned STORE, the penultimate operand is the
16099 // stored value. Bitcast it to the aligned type.
16100 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16101 SDValue &StVal = Ops[Ops.size() - 2];
16102 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16103 }
16104
16105 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16106 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16107 MemN->getMemOperand());
16108
16109 // Update the uses.
16110 SmallVector<SDValue, 5> NewResults;
16111 for (unsigned i = 0; i < NumResultVecs; ++i)
16112 NewResults.push_back(SDValue(UpdN.getNode(), i));
16113
16114 // If this is a non-standard-aligned LOAD, the first result is the loaded
16115 // value. Bitcast it to the expected result type.
16116 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16117 SDValue &LdVal = NewResults[0];
16118 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16119 }
16120
16121 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16122 DCI.CombineTo(N, NewResults);
16123 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16124
16125 return true;
16126}
16127
16128// If (opcode ptr inc) is an ADD-like instruction, return the
16129// increment value. Otherwise return 0.
16130static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16131 SDValue Inc, const SelectionDAG &DAG) {
16132 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16133 if (!CInc)
16134 return 0;
16135
16136 switch (Opcode) {
16137 case ARMISD::VLD1_UPD:
16138 case ISD::ADD:
16139 return CInc->getZExtValue();
16140 case ISD::OR: {
16141 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16142 // (OR ptr inc) is the same as (ADD ptr inc)
16143 return CInc->getZExtValue();
16144 }
16145 return 0;
16146 }
16147 default:
16148 return 0;
16149 }
16150}
16151
16152static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
16153 switch (N->getOpcode()) {
16154 case ISD::ADD:
16155 case ISD::OR: {
16156 if (isa<ConstantSDNode>(N->getOperand(1))) {
16157 *Ptr = N->getOperand(0);
16158 *CInc = N->getOperand(1);
16159 return true;
16160 }
16161 return false;
16162 }
16163 case ARMISD::VLD1_UPD: {
16164 if (isa<ConstantSDNode>(N->getOperand(2))) {
16165 *Ptr = N->getOperand(1);
16166 *CInc = N->getOperand(2);
16167 return true;
16168 }
16169 return false;
16170 }
16171 default:
16172 return false;
16173 }
16174}
16175
16176/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16177/// NEON load/store intrinsics, and generic vector load/stores, to merge
16178/// base address updates.
16179/// For generic load/stores, the memory type is assumed to be a vector.
16180/// The caller is assumed to have checked legality.
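// A typical case (illustrative): a 16-byte NEON vld1 whose address is also
// used by "add rN, rBase, #16" can be rewritten as a single VLD1_UPD that
// produces both the loaded vector and the post-incremented base pointer.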
16181static SDValue CombineBaseUpdate(SDNode *N,
16182 TargetLowering::DAGCombinerInfo &DCI) {
16183 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16184 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16185 const bool isStore = N->getOpcode() == ISD::STORE;
16186 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16187 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16188
16189 // Limit the number of possible base-updates we look at to prevent degenerate
16190 // cases.
16191 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16192
16193 SDValue Addr = N->getOperand(AddrOpIdx);
16194
16195 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16196
16197 // Search for a use of the address operand that is an increment.
16198 for (SDUse &Use : Addr->uses()) {
16199 SDNode *User = Use.getUser();
16200 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16201 continue;
16202
16203 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16204 unsigned ConstInc =
16205 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16206
16207 if (ConstInc || User->getOpcode() == ISD::ADD) {
16208 BaseUpdates.push_back({User, Inc, ConstInc});
16209 if (BaseUpdates.size() >= MaxBaseUpdates)
16210 break;
16211 }
16212 }
16213
16214 // If the address is a constant pointer increment itself, find
16215 // another constant increment that has the same base operand
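 // For example (illustrative offsets): if Addr is "add base, #16" and another
 // user of base is "add base, #32", that user is recorded as a #16 increment
 // relative to Addr, so a post-increment matching the access size can still
 // be formed.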
16216 SDValue Base;
16217 SDValue CInc;
16218 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16219 unsigned Offset =
16220 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16221 for (SDUse &Use : Base->uses()) {
16222
16223 SDNode *User = Use.getUser();
16224 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16225 User->getNumOperands() != 2)
16226 continue;
16227
16228 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16229 unsigned UserOffset =
16230 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16231
16232 if (!UserOffset || UserOffset <= Offset)
16233 continue;
16234
16235 unsigned NewConstInc = UserOffset - Offset;
16236 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16237 BaseUpdates.push_back({User, NewInc, NewConstInc});
16238 if (BaseUpdates.size() >= MaxBaseUpdates)
16239 break;
16240 }
16241 }
16242
16243 // Try to fold the load/store with an update that matches memory
16244 // access size. This should work well for sequential loads.
16245 unsigned NumValidUpd = BaseUpdates.size();
16246 for (unsigned I = 0; I < NumValidUpd; I++) {
16247 BaseUpdateUser &User = BaseUpdates[I];
16248 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16249 return SDValue();
16250 }
16251
16252 // Try to fold with other users. Non-constant updates are considered
16253 // first, and constant updates are sorted to not break a sequence of
16254 // strided accesses (if there is any).
16255 llvm::stable_sort(BaseUpdates,
16256 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16257 return LHS.ConstInc < RHS.ConstInc;
16258 });
16259 for (BaseUpdateUser &User : BaseUpdates) {
16260 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16261 return SDValue();
16262 }
16263 return SDValue();
16264}
16265
16266static SDValue PerformVLDCombine(SDNode *N,
16267 TargetLowering::DAGCombinerInfo &DCI) {
16268 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16269 return SDValue();
16270
16271 return CombineBaseUpdate(N, DCI);
16272}
16273
16274static SDValue PerformMVEVLDCombine(SDNode *N,
16275 TargetLowering::DAGCombinerInfo &DCI) {
16276 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16277 return SDValue();
16278
16279 SelectionDAG &DAG = DCI.DAG;
16280 SDValue Addr = N->getOperand(2);
16281 MemSDNode *MemN = cast<MemSDNode>(N);
16282 SDLoc dl(N);
16283
16284 // For the stores, where there are multiple intrinsics we only actually want
16285 // to post-inc the last of them.
16286 unsigned IntNo = N->getConstantOperandVal(1);
16287 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16288 return SDValue();
16289 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16290 return SDValue();
16291
16292 // Search for a use of the address operand that is an increment.
16293 for (SDUse &Use : Addr->uses()) {
16294 SDNode *User = Use.getUser();
16295 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16296 continue;
16297
16298 // Check that the add is independent of the load/store. Otherwise, folding
16299 // it would create a cycle. We can avoid searching through Addr as it's a
16300 // predecessor to both.
16301 SmallPtrSet<const SDNode *, 32> Visited;
16302 SmallVector<const SDNode *, 16> Worklist;
16303 Visited.insert(Addr.getNode());
16304 Worklist.push_back(N);
16305 Worklist.push_back(User);
16306 const unsigned MaxSteps = 1024;
16307 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16308 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16309 continue;
16310
16311 // Find the new opcode for the updating load/store.
16312 bool isLoadOp = true;
16313 unsigned NewOpc = 0;
16314 unsigned NumVecs = 0;
16315 switch (IntNo) {
16316 default:
16317 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16318 case Intrinsic::arm_mve_vld2q:
16319 NewOpc = ARMISD::VLD2_UPD;
16320 NumVecs = 2;
16321 break;
16322 case Intrinsic::arm_mve_vld4q:
16323 NewOpc = ARMISD::VLD4_UPD;
16324 NumVecs = 4;
16325 break;
16326 case Intrinsic::arm_mve_vst2q:
16327 NewOpc = ARMISD::VST2_UPD;
16328 NumVecs = 2;
16329 isLoadOp = false;
16330 break;
16331 case Intrinsic::arm_mve_vst4q:
16332 NewOpc = ARMISD::VST4_UPD;
16333 NumVecs = 4;
16334 isLoadOp = false;
16335 break;
16336 }
16337
16338 // Find the size of memory referenced by the load/store.
16339 EVT VecTy;
16340 if (isLoadOp) {
16341 VecTy = N->getValueType(0);
16342 } else {
16343 VecTy = N->getOperand(3).getValueType();
16344 }
16345
16346 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16347
16348 // If the increment is a constant, it must match the memory ref size.
16349 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16350 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16351 if (!CInc || CInc->getZExtValue() != NumBytes)
16352 continue;
16353
16354 // Create the new updating load/store node.
16355 // First, create an SDVTList for the new updating node's results.
16356 EVT Tys[6];
16357 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16358 unsigned n;
16359 for (n = 0; n < NumResultVecs; ++n)
16360 Tys[n] = VecTy;
16361 Tys[n++] = MVT::i32;
16362 Tys[n] = MVT::Other;
16363 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16364
16365 // Then, gather the new node's operands.
16366 SmallVector<SDValue, 8> Ops;
16367 Ops.push_back(N->getOperand(0)); // incoming chain
16368 Ops.push_back(N->getOperand(2)); // ptr
16369 Ops.push_back(Inc);
16370
16371 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16372 Ops.push_back(N->getOperand(i));
16373
16374 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16375 MemN->getMemOperand());
16376
16377 // Update the uses.
16378 SmallVector<SDValue, 5> NewResults;
16379 for (unsigned i = 0; i < NumResultVecs; ++i)
16380 NewResults.push_back(SDValue(UpdN.getNode(), i));
16381
16382 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16383 DCI.CombineTo(N, NewResults);
16384 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16385
16386 break;
16387 }
16388
16389 return SDValue();
16390}
16391
16392/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16393/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16394/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16395/// return true.
16396static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16397 SelectionDAG &DAG = DCI.DAG;
16398 EVT VT = N->getValueType(0);
16399 // vldN-dup instructions only support 64-bit vectors for N > 1.
16400 if (!VT.is64BitVector())
16401 return false;
16402
16403 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16404 SDNode *VLD = N->getOperand(0).getNode();
16405 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16406 return false;
16407 unsigned NumVecs = 0;
16408 unsigned NewOpc = 0;
16409 unsigned IntNo = VLD->getConstantOperandVal(1);
16410 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16411 NumVecs = 2;
16412 NewOpc = ARMISD::VLD2DUP;
16413 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16414 NumVecs = 3;
16415 NewOpc = ARMISD::VLD3DUP;
16416 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16417 NumVecs = 4;
16418 NewOpc = ARMISD::VLD4DUP;
16419 } else {
16420 return false;
16421 }
16422
16423 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16424 // numbers match the load.
16425 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16426 for (SDUse &Use : VLD->uses()) {
16427 // Ignore uses of the chain result.
16428 if (Use.getResNo() == NumVecs)
16429 continue;
16430 SDNode *User = Use.getUser();
16431 if (User->getOpcode() != ARMISD::VDUPLANE ||
16432 VLDLaneNo != User->getConstantOperandVal(1))
16433 return false;
16434 }
16435
16436 // Create the vldN-dup node.
16437 EVT Tys[5];
16438 unsigned n;
16439 for (n = 0; n < NumVecs; ++n)
16440 Tys[n] = VT;
16441 Tys[n] = MVT::Other;
16442 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16443 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16444 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16445 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16446 Ops, VLDMemInt->getMemoryVT(),
16447 VLDMemInt->getMemOperand());
16448
16449 // Update the uses.
16450 for (SDUse &Use : VLD->uses()) {
16451 unsigned ResNo = Use.getResNo();
16452 // Ignore uses of the chain result.
16453 if (ResNo == NumVecs)
16454 continue;
16455 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16456 }
16457
16458 // Now the vldN-lane intrinsic is dead except for its chain result.
16459 // Update uses of the chain.
16460 std::vector<SDValue> VLDDupResults;
16461 for (unsigned n = 0; n < NumVecs; ++n)
16462 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16463 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16464 DCI.CombineTo(VLD, VLDDupResults);
16465
16466 return true;
16467}
16468
16469/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16470/// ARMISD::VDUPLANE.
16471static SDValue PerformVDUPLANECombine(SDNode *N,
16472 TargetLowering::DAGCombinerInfo &DCI,
16473 const ARMSubtarget *Subtarget) {
16474 SDValue Op = N->getOperand(0);
16475 EVT VT = N->getValueType(0);
16476
16477 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16478 if (Subtarget->hasMVEIntegerOps()) {
16479 EVT ExtractVT = VT.getVectorElementType();
16480 // We need to ensure we are creating a legal type.
16481 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16482 ExtractVT = MVT::i32;
16483 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16484 N->getOperand(0), N->getOperand(1));
16485 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16486 }
16487
16488 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16489 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16490 if (CombineVLDDUP(N, DCI))
16491 return SDValue(N, 0);
16492
16493 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16494 // redundant. Ignore bit_converts for now; element sizes are checked below.
16495 while (Op.getOpcode() == ISD::BITCAST)
16496 Op = Op.getOperand(0);
16497 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16498 return SDValue();
16499
16500 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16501 unsigned EltSize = Op.getScalarValueSizeInBits();
16502 // The canonical VMOV for a zero vector uses a 32-bit element size.
16503 unsigned Imm = Op.getConstantOperandVal(0);
16504 unsigned EltBits;
16505 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16506 EltSize = 8;
16507 if (EltSize > VT.getScalarSizeInBits())
16508 return SDValue();
16509
16510 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16511}
16512
16513/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16514static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16515 const ARMSubtarget *Subtarget) {
16516 SDValue Op = N->getOperand(0);
16517 SDLoc dl(N);
16518
16519 if (Subtarget->hasMVEIntegerOps()) {
16520 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16521 // need to come from a GPR.
16522 if (Op.getValueType() == MVT::f32)
16523 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16524 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16525 else if (Op.getValueType() == MVT::f16)
16526 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16527 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16528 }
16529
16530 if (!Subtarget->hasNEON())
16531 return SDValue();
16532
16533 // Match VDUP(LOAD) -> VLD1DUP.
16534 // We match this pattern here rather than waiting for isel because the
16535 // transform is only legal for unindexed loads.
16536 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16537 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16538 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16539 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16540 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16541 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16542 SDValue VLDDup =
16543 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16544 LD->getMemoryVT(), LD->getMemOperand());
16545 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16546 return VLDDup;
16547 }
16548
16549 return SDValue();
16550}
16551
16552static SDValue PerformLOADCombine(SDNode *N,
16553 TargetLowering::DAGCombinerInfo &DCI,
16554 const ARMSubtarget *Subtarget) {
16555 EVT VT = N->getValueType(0);
16556
16557 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16558 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16559 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16560 return CombineBaseUpdate(N, DCI);
16561
16562 return SDValue();
16563}
16564
16565// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16566// pack all of the elements in one place. Next, store to memory in fewer
16567// chunks.
16568static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16569 SelectionDAG &DAG) {
16570 SDValue StVal = St->getValue();
16571 EVT VT = StVal.getValueType();
16572 if (!St->isTruncatingStore() || !VT.isVector())
16573 return SDValue();
16574 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16575 EVT StVT = St->getMemoryVT();
16576 unsigned NumElems = VT.getVectorNumElements();
16577 assert(StVT != VT && "Cannot truncate to the same type");
16578 unsigned FromEltSz = VT.getScalarSizeInBits();
16579 unsigned ToEltSz = StVT.getScalarSizeInBits();
16580
16581 // The From and To element sizes and the element count must be powers of two
16582 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16583 return SDValue();
16584
16585 // We are going to use the original vector elt for storing.
16586 // Accumulated smaller vector elements must be a multiple of the store size.
16587 if (0 != (NumElems * FromEltSz) % ToEltSz)
16588 return SDValue();
16589
16590 unsigned SizeRatio = FromEltSz / ToEltSz;
16591 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16592
16593 // Create a type on which we perform the shuffle.
16594 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16595 NumElems * SizeRatio);
16596 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16597
16598 SDLoc DL(St);
16599 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16600 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16601 for (unsigned i = 0; i < NumElems; ++i)
16602 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16603 : i * SizeRatio;
16604
16605 // Can't shuffle using an illegal type.
16606 if (!TLI.isTypeLegal(WideVecVT))
16607 return SDValue();
16608
16609 SDValue Shuff = DAG.getVectorShuffle(
16610 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16611 // At this point all of the data is stored at the bottom of the
16612 // register. We now need to save it to mem.
16613
16614 // Find the largest store unit
16615 MVT StoreType = MVT::i8;
16616 for (MVT Tp : MVT::integer_valuetypes()) {
16617 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16618 StoreType = Tp;
16619 }
16620 // Didn't find a legal store type.
16621 if (!TLI.isTypeLegal(StoreType))
16622 return SDValue();
16623
16624 // Bitcast the original vector into a vector of store-size units
16625 EVT StoreVecVT =
16626 EVT::getVectorVT(*DAG.getContext(), StoreType,
16627 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16628 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16629 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16630 SmallVector<SDValue, 8> Chains;
16631 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16632 TLI.getPointerTy(DAG.getDataLayout()));
16633 SDValue BasePtr = St->getBasePtr();
16634
16635 // Perform one or more big stores into memory.
16636 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16637 for (unsigned I = 0; I < E; I++) {
16638 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16639 ShuffWide, DAG.getIntPtrConstant(I, DL));
16640 SDValue Ch =
16641 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16642 St->getAlign(), St->getMemOperand()->getFlags());
16643 BasePtr =
16644 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16645 Chains.push_back(Ch);
16646 }
16647 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16648}
16649
16650// Try taking a single vector store from an fpround (which would otherwise turn
16651// into an expensive buildvector) and splitting it into a series of narrowing
16652// stores.
16653static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16654 SelectionDAG &DAG) {
16655 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16656 return SDValue();
16657 SDValue Trunc = St->getValue();
16658 if (Trunc->getOpcode() != ISD::FP_ROUND)
16659 return SDValue();
16660 EVT FromVT = Trunc->getOperand(0).getValueType();
16661 EVT ToVT = Trunc.getValueType();
16662 if (!ToVT.isVector())
16663 return SDValue();
16665 EVT ToEltVT = ToVT.getVectorElementType();
16666 EVT FromEltVT = FromVT.getVectorElementType();
16667
16668 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16669 return SDValue();
16670
16671 unsigned NumElements = 4;
16672 if (FromVT.getVectorNumElements() % NumElements != 0)
16673 return SDValue();
16674
16675 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16676 // use the VMOVN over splitting the store. We are looking for patterns of:
16677 // !rev: 0 N 1 N+1 2 N+2 ...
16678 // rev: N 0 N+1 1 N+2 2 ...
16679 // The shuffle may either be a single source (in which case N = NumElts/2) or
16680 // two inputs extended with concat to the same size (in which case N =
16681 // NumElts).
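// e.g. for a single-source shuffle with a v8f16 result, N is 4 and a matching
// !rev mask begins <0, 4, 1, 5, ...>.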
16682 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16683 ArrayRef<int> M = SVN->getMask();
16684 unsigned NumElts = ToVT.getVectorNumElements();
16685 if (SVN->getOperand(1).isUndef())
16686 NumElts /= 2;
16687
16688 unsigned Off0 = Rev ? NumElts : 0;
16689 unsigned Off1 = Rev ? 0 : NumElts;
16690
16691 for (unsigned I = 0; I < NumElts; I += 2) {
16692 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16693 return false;
16694 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16695 return false;
16696 }
16697
16698 return true;
16699 };
16700
16701 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16702 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16703 return SDValue();
16704
16705 LLVMContext &C = *DAG.getContext();
16706 SDLoc DL(St);
16707 // Details about the old store
16708 SDValue Ch = St->getChain();
16709 SDValue BasePtr = St->getBasePtr();
16710 Align Alignment = St->getBaseAlign();
16711 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16712 AAMDNodes AAInfo = St->getAAInfo();
16713
16714 // We split the store into slices of NumElements. fp16 trunc stores are first
16715 // converted with a VCVT and then stored as truncating integer stores.
16716 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16717 EVT NewToVT = EVT::getVectorVT(
16718 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16719
16720 SmallVector<SDValue, 4> Stores;
16721 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16722 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16723 SDValue NewPtr =
16724 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16725
16726 SDValue Extract =
16727 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16728 DAG.getConstant(i * NumElements, DL, MVT::i32));
16729
16730 SDValue FPTrunc =
16731 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16732 Extract, DAG.getConstant(0, DL, MVT::i32));
16733 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16734
16735 SDValue Store = DAG.getTruncStore(
16736 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16737 NewToVT, Alignment, MMOFlags, AAInfo);
16738 Stores.push_back(Store);
16739 }
16740 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16741}
16742
16743// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16744// into an expensive buildvector) and splitting it into a series of narrowing
16745// stores.
16746static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16747 SelectionDAG &DAG) {
16748 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16749 return SDValue();
16750 SDValue Trunc = St->getValue();
16751 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16752 return SDValue();
16753 EVT FromVT = Trunc->getOperand(0).getValueType();
16754 EVT ToVT = Trunc.getValueType();
16755
16756 LLVMContext &C = *DAG.getContext();
16757 SDLoc DL(St);
16758 // Details about the old store
16759 SDValue Ch = St->getChain();
16760 SDValue BasePtr = St->getBasePtr();
16761 Align Alignment = St->getBaseAlign();
16762 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16763 AAMDNodes AAInfo = St->getAAInfo();
16764
16765 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16766 FromVT.getVectorNumElements());
16767
16768 SmallVector<SDValue, 4> Stores;
16769 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16770 unsigned NewOffset =
16771 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16772 SDValue NewPtr =
16773 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16774
16775 SDValue Extract = Trunc.getOperand(i);
16776 SDValue Store = DAG.getTruncStore(
16777 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16778 NewToVT, Alignment, MMOFlags, AAInfo);
16779 Stores.push_back(Store);
16780 }
16781 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16782}
16783
16784// Given a floating point store from an extracted vector, with an integer
16785// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16786// help reduce fp register pressure, avoids the fp extract and allows the use
16787// of more integer post-inc stores, which are not available with vstr.
16788static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16789 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16790 return SDValue();
16791 SDValue Extract = St->getValue();
16792 EVT VT = Extract.getValueType();
16793 // For now this only handles f16. It may be useful for f32 too, but that would
16794 // be bitcast(extract), not the VGETLANEu we check for here.
16795 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16796 return SDValue();
16797
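// Only reuse a VGETLANEu of the same vector and lane that already exists in the
// DAG; we do not create a new one here.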
16798 SDNode *GetLane =
16799 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16800 {Extract.getOperand(0), Extract.getOperand(1)});
16801 if (!GetLane)
16802 return SDValue();
16803
16804 LLVMContext &C = *DAG.getContext();
16805 SDLoc DL(St);
16806 // Create a new integer store to replace the existing floating point version.
16807 SDValue Ch = St->getChain();
16808 SDValue BasePtr = St->getBasePtr();
16809 Align Alignment = St->getBaseAlign();
16810 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16811 AAMDNodes AAInfo = St->getAAInfo();
16812 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16813 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16814 St->getPointerInfo(), NewToVT, Alignment,
16815 MMOFlags, AAInfo);
16816
16817 return Store;
16818}
16819
16820/// PerformSTORECombine - Target-specific dag combine xforms for
16821/// ISD::STORE.
16822static SDValue PerformSTORECombine(SDNode *N,
16823 TargetLowering::DAGCombinerInfo &DCI,
16824 const ARMSubtarget *Subtarget) {
16825 StoreSDNode *St = cast<StoreSDNode>(N);
16826 if (St->isVolatile())
16827 return SDValue();
16828 SDValue StVal = St->getValue();
16829 EVT VT = StVal.getValueType();
16830
16831 if (Subtarget->hasNEON())
16832 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16833 return Store;
16834
16835 if (Subtarget->hasMVEFloatOps())
16836 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16837 return NewToken;
16838
16839 if (Subtarget->hasMVEIntegerOps()) {
16840 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16841 return NewChain;
16842 if (SDValue NewToken =
16843 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16844 return NewToken;
16845 }
16846
16847 if (!ISD::isNormalStore(St))
16848 return SDValue();
16849
16850 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16851 // ARM stores of arguments in the same cache line.
16852 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16853 StVal.getNode()->hasOneUse()) {
16854 SelectionDAG &DAG = DCI.DAG;
16855 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16856 SDLoc DL(St);
16857 SDValue BasePtr = St->getBasePtr();
16858 SDValue NewST1 = DAG.getStore(
16859 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16860 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16861 St->getMemOperand()->getFlags());
16862
16863 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16864 DAG.getConstant(4, DL, MVT::i32));
16865 return DAG.getStore(NewST1.getValue(0), DL,
16866 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16867 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16868 St->getBaseAlign(), St->getMemOperand()->getFlags());
16869 }
16870
16871 if (StVal.getValueType() == MVT::i64 &&
16872 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16873
16874 // Bitcast an i64 store extracted from a vector to f64.
16875 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16876 SelectionDAG &DAG = DCI.DAG;
16877 SDLoc dl(StVal);
16878 SDValue IntVec = StVal.getOperand(0);
16879 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16880 IntVec.getValueType().getVectorNumElements());
16881 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16882 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16883 Vec, StVal.getOperand(1));
16884 dl = SDLoc(N);
16885 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16886 // Make the DAGCombiner fold the bitcasts.
16887 DCI.AddToWorklist(Vec.getNode());
16888 DCI.AddToWorklist(ExtElt.getNode());
16889 DCI.AddToWorklist(V.getNode());
16890 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16891 St->getPointerInfo(), St->getAlign(),
16892 St->getMemOperand()->getFlags(), St->getAAInfo());
16893 }
16894
16895 // If this is a legal vector store, try to combine it into a VST1_UPD.
16896 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16897 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16898 return CombineBaseUpdate(N, DCI);
16899
16900 return SDValue();
16901}
16902
16903/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16904/// can replace combinations of VMUL and VCVT (floating-point to integer)
16905/// when the VMUL has a constant operand that is a power of 2.
16906///
16907/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16908/// vmul.f32 d16, d17, d16
16909/// vcvt.s32.f32 d16, d16
16910/// becomes:
16911/// vcvt.s32.f32 d16, d16, #3
16912static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16913 const ARMSubtarget *Subtarget) {
16914 if (!Subtarget->hasNEON())
16915 return SDValue();
16916
16917 SDValue Op = N->getOperand(0);
16918 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16919 Op.getOpcode() != ISD::FMUL)
16920 return SDValue();
16921
16922 SDValue ConstVec = Op->getOperand(1);
16923 if (!isa<BuildVectorSDNode>(ConstVec))
16924 return SDValue();
16925
16926 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16927 uint32_t FloatBits = FloatTy.getSizeInBits();
16928 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16929 uint32_t IntBits = IntTy.getSizeInBits();
16930 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16931 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16932 // These instructions only exist converting from f32 to i32. We can handle
16933 // smaller integers by generating an extra truncate, but larger ones would
16934 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16935 // these instructions only support v2i32/v4i32 types.
16936 return SDValue();
16937 }
16938
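// The splatted constant must be a power of two 2^C with 1 <= C <= 32; C then
// becomes the #fbits operand of the fixed-point conversion (8.0 -> #3 in the
// example above).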
16939 BitVector UndefElements;
16940 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16941 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16942 if (C == -1 || C == 0 || C > 32)
16943 return SDValue();
16944
16945 SDLoc dl(N);
16946 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16947 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16948 Intrinsic::arm_neon_vcvtfp2fxu;
16949 SDValue FixConv = DAG.getNode(
16950 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16951 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16952 DAG.getConstant(C, dl, MVT::i32));
16953
16954 if (IntBits < FloatBits)
16955 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16956
16957 return FixConv;
16958}
16959
16960static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16961 const ARMSubtarget *Subtarget) {
16962 if (!Subtarget->hasMVEFloatOps())
16963 return SDValue();
16964
16965 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16966 // The second form can be more easily turned into a predicated vadd, and
16967 // possibly combined into a fma to become a predicated vfma.
16968 SDValue Op0 = N->getOperand(0);
16969 SDValue Op1 = N->getOperand(1);
16970 EVT VT = N->getValueType(0);
16971 SDLoc DL(N);
16972
16973 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16974 // which these VMOV's represent.
16975 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16976 if (Op.getOpcode() != ISD::BITCAST ||
16977 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16978 return false;
16979 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16980 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16981 return true;
16982 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16983 return true;
16984 return false;
16985 };
16986
16987 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16988 std::swap(Op0, Op1);
16989
16990 if (Op1.getOpcode() != ISD::VSELECT)
16991 return SDValue();
16992
16993 SDNodeFlags FaddFlags = N->getFlags();
16994 bool NSZ = FaddFlags.hasNoSignedZeros();
16995 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16996 return SDValue();
16997
16998 SDValue FAdd =
16999 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17000 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17001}
17002
17003static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
17004 SDValue LHS = N->getOperand(0);
17005 SDValue RHS = N->getOperand(1);
17006 EVT VT = N->getValueType(0);
17007 SDLoc DL(N);
17008
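// Folding the fadd into the VCMLA accumulator reorders the floating-point
// additions, so it is only done when reassociation is allowed.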
17009 if (!N->getFlags().hasAllowReassociation())
17010 return SDValue();
17011
17012 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
17013 auto ReassocComplex = [&](SDValue A, SDValue B) {
17014 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17015 return SDValue();
17016 unsigned Opc = A.getConstantOperandVal(0);
17017 if (Opc != Intrinsic::arm_mve_vcmlaq)
17018 return SDValue();
17019 SDValue VCMLA = DAG.getNode(
17020 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17021 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17022 A.getOperand(3), A.getOperand(4));
17023 VCMLA->setFlags(A->getFlags());
17024 return VCMLA;
17025 };
17026 if (SDValue R = ReassocComplex(LHS, RHS))
17027 return R;
17028 if (SDValue R = ReassocComplex(RHS, LHS))
17029 return R;
17030
17031 return SDValue();
17032}
17033
17034static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17035 const ARMSubtarget *Subtarget) {
17036 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17037 return S;
17038 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17039 return S;
17040 return SDValue();
17041}
17042
17043/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17044/// can replace combinations of VCVT (integer to floating-point) and VMUL
17045/// when the VMUL has a constant operand that is a power of 2.
17046///
17047/// Example (assume d17 = <float 0.125, float 0.125>):
17048/// vcvt.f32.s32 d16, d16
17049/// vmul.f32 d16, d16, d17
17050/// becomes:
17051/// vcvt.f32.s32 d16, d16, #3
17052static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17053 const ARMSubtarget *Subtarget) {
17054 if (!Subtarget->hasNEON())
17055 return SDValue();
17056
17057 SDValue Op = N->getOperand(0);
17058 unsigned OpOpcode = Op.getNode()->getOpcode();
17059 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17060 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17061 return SDValue();
17062
17063 SDValue ConstVec = N->getOperand(1);
17064 if (!isa<BuildVectorSDNode>(ConstVec))
17065 return SDValue();
17066
17067 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17068 uint32_t FloatBits = FloatTy.getSizeInBits();
17069 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17070 uint32_t IntBits = IntTy.getSizeInBits();
17071 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17072 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17073 // These instructions only exist converting from i32 to f32. We can handle
17074 // smaller integers by generating an extra extend, but larger ones would
17075 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17076 // these instructions only support v2i32/v4i32 types.
17077 return SDValue();
17078 }
17079
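// The constant must be the exact reciprocal of a power of two, 1/2^C with
// 1 <= C <= 32; C becomes the #fbits operand (0.125 -> #3 in the example above).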
17080 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17081 APFloat Recip(0.0f);
17082 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17083 return SDValue();
17084
17085 bool IsExact;
17086 APSInt IntVal(33);
17087 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17088 APFloat::opOK ||
17089 !IsExact)
17090 return SDValue();
17091
17092 int32_t C = IntVal.exactLogBase2();
17093 if (C == -1 || C == 0 || C > 32)
17094 return SDValue();
17095
17096 SDLoc DL(N);
17097 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17098 SDValue ConvInput = Op.getOperand(0);
17099 if (IntBits < FloatBits)
17100 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17101 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17102
17103 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17104 : Intrinsic::arm_neon_vcvtfxu2fp;
17105 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17106 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17107 DAG.getConstant(C, DL, MVT::i32));
17108}
17109
17110static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17111 const ARMSubtarget *ST) {
17112 if (!ST->hasMVEIntegerOps())
17113 return SDValue();
17114
17115 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17116 EVT ResVT = N->getValueType(0);
17117 SDValue N0 = N->getOperand(0);
17118 SDLoc dl(N);
17119
17120 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17121 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17122 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17123 N0.getValueType() == MVT::v16i8)) {
17124 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17125 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17126 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17127 }
17128
17129 // We are looking for something that will have illegal types if left alone,
17130 // but that we can convert to a single instruction under MVE. For example
17131 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17132 // or
17133 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17134
17135 // The legal cases are:
17136 // VADDV u/s 8/16/32
17137 // VMLAV u/s 8/16/32
17138 // VADDLV u/s 32
17139 // VMLALV u/s 16/32
17140
17141 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17142 // extend it and use v4i32 instead.
17143 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17144 EVT AVT = A.getValueType();
17145 return any_of(ExtTypes, [&](MVT Ty) {
17146 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17147 AVT.bitsLE(Ty);
17148 });
17149 };
17150 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17151 EVT AVT = A.getValueType();
17152 if (!AVT.is128BitVector())
17153 A = DAG.getNode(ExtendCode, dl,
17154 AVT.changeVectorElementType(MVT::getIntegerVT(
17155 128 / AVT.getVectorMinNumElements())),
17156 A);
17157 return A;
17158 };
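// IsVADDV matches a plain vecreduce.add of a single sext/zext and returns the
// (extended if necessary) input vector.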
17159 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17160 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17161 return SDValue();
17162 SDValue A = N0->getOperand(0);
17163 if (ExtTypeMatches(A, ExtTypes))
17164 return ExtendIfNeeded(A, ExtendCode);
17165 return SDValue();
17166 };
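// IsPredVADDV matches the same pattern wrapped in a vselect whose false operand
// is all zeros, i.e. a predicated reduction; the predicate is returned in Mask.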
17167 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17168 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17169 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17170 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17171 return SDValue();
17172 Mask = N0->getOperand(0);
17173 SDValue Ext = N0->getOperand(1);
17174 if (Ext->getOpcode() != ExtendCode)
17175 return SDValue();
17176 SDValue A = Ext->getOperand(0);
17177 if (ExtTypeMatches(A, ExtTypes))
17178 return ExtendIfNeeded(A, ExtendCode);
17179 return SDValue();
17180 };
17181 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17182 SDValue &A, SDValue &B) {
17183 // For a vmla we are trying to match a larger pattern:
17184 // ExtA = sext/zext A
17185 // ExtB = sext/zext B
17186 // Mul = mul ExtA, ExtB
17187 // vecreduce.add Mul
17188 // There might also be an extra extend between the mul and the addreduce, so
17189 // long as the bitwidth is high enough to make them equivalent (for example
17190 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17191 if (ResVT != RetTy)
17192 return false;
17193 SDValue Mul = N0;
17194 if (Mul->getOpcode() == ExtendCode &&
17195 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17196 ResVT.getScalarSizeInBits())
17197 Mul = Mul->getOperand(0);
17198 if (Mul->getOpcode() != ISD::MUL)
17199 return false;
17200 SDValue ExtA = Mul->getOperand(0);
17201 SDValue ExtB = Mul->getOperand(1);
17202 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17203 return false;
17204 A = ExtA->getOperand(0);
17205 B = ExtB->getOperand(0);
17206 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17207 A = ExtendIfNeeded(A, ExtendCode);
17208 B = ExtendIfNeeded(B, ExtendCode);
17209 return true;
17210 }
17211 return false;
17212 };
17213 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17214 SDValue &A, SDValue &B, SDValue &Mask) {
17215 // Same as the pattern above with a select for the zero predicated lanes
17216 // ExtA = sext/zext A
17217 // ExtB = sext/zext B
17218 // Mul = mul ExtA, ExtB
17219 // N0 = select Mask, Mul, 0
17220 // vecreduce.add N0
17221 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17222 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17223 return false;
17224 Mask = N0->getOperand(0);
17225 SDValue Mul = N0->getOperand(1);
17226 if (Mul->getOpcode() == ExtendCode &&
17227 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17228 ResVT.getScalarSizeInBits())
17229 Mul = Mul->getOperand(0);
17230 if (Mul->getOpcode() != ISD::MUL)
17231 return false;
17232 SDValue ExtA = Mul->getOperand(0);
17233 SDValue ExtB = Mul->getOperand(1);
17234 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17235 return false;
17236 A = ExtA->getOperand(0);
17237 B = ExtB->getOperand(0);
17238 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17239 A = ExtendIfNeeded(A, ExtendCode);
17240 B = ExtendIfNeeded(B, ExtendCode);
17241 return true;
17242 }
17243 return false;
17244 };
17245 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17246 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17247 // reductions. The operands are extended with MVEEXT, but as they are
17248 // reductions the lane orders do not matter. MVEEXT may be combined with
17249 // loads to produce two extending loads, or else they will be expanded to
17250 // VREV/VMOVL.
17251 EVT VT = Ops[0].getValueType();
17252 if (VT == MVT::v16i8) {
17253 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17254 "Unexpected illegal long reduction opcode");
17255 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17256
17257 SDValue Ext0 =
17258 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17259 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17260 SDValue Ext1 =
17261 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17262 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17263
17264 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17265 Ext0, Ext1);
17266 SDValue MLA1 =
17267 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17268 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17269 Ext0.getValue(1), Ext1.getValue(1));
17270 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17271 }
17272 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17273 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17274 SDValue(Node.getNode(), 1));
17275 };
17276
17277 SDValue A, B;
17278 SDValue Mask;
17279 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17280 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17281 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17282 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17283 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17284 A, B))
17285 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17286 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17287 A, B))
17288 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17289 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17290 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17291 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17292 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17293 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17294 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17295
17296 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17297 Mask))
17298 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17299 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17300 Mask))
17301 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17302 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17303 Mask))
17304 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17305 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17306 Mask))
17307 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17308 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17309 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17310 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17311 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17312 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17313 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17314
17315 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17316 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17317 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17318 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17319 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17320 return Create64bitNode(ARMISD::VADDLVs, {A});
17321 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17322 return Create64bitNode(ARMISD::VADDLVu, {A});
17323 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17324 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17325 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17326 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17327 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17328 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17329
17330 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17331 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17332 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17333 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17334 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17335 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17336 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17337 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17338 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17339 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17340 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17341 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17342 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17343 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17344
17345 // Some complications. We can get a case where the two inputs of the mul are
17346 // the same, in which case the sext of the result will have been helpfully
17347 // converted to a zext. Turn it back.
17348 SDValue Op = N0;
17349 if (Op->getOpcode() == ISD::VSELECT)
17350 Op = Op->getOperand(1);
17351 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17352 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17353 SDValue Mul = Op->getOperand(0);
17354 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17355 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17356 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17357 if (Op != N0)
17358 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17359 N0->getOperand(0), Ext, N0->getOperand(2));
17360 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17361 }
17362 }
17363
17364 return SDValue();
17365}
17366
17367// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17368// the lanes are used. Due to the reduction being commutative the shuffle can be
17369// removed.
17370static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17371 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17372 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17373 if (!Shuf || !Shuf->getOperand(1).isUndef())
17374 return SDValue();
17375
17376 // Check all elements are used once in the mask.
17377 ArrayRef<int> Mask = Shuf->getMask();
17378 APInt SetElts(Mask.size(), 0);
17379 for (int E : Mask) {
17380 if (E < 0 || E >= (int)Mask.size())
17381 return SDValue();
17382 SetElts.setBit(E);
17383 }
17384 if (!SetElts.isAllOnes())
17385 return SDValue();
17386
17387 if (N->getNumOperands() != VecOp + 1) {
17388 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17389 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17390 return SDValue();
17391 }
17392 SmallVector<SDValue> Ops;
17392
17394 for (SDValue Op : N->ops()) {
17395 if (Op.getValueType().isVector())
17396 Ops.push_back(Op.getOperand(0));
17397 else
17398 Ops.push_back(Op);
17399 }
17400 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17401}
17402
17405 SDValue Op0 = N->getOperand(0);
17406 SDValue Op1 = N->getOperand(1);
17407 unsigned IsTop = N->getConstantOperandVal(2);
17408
17409 // VMOVNT a undef -> a
17410 // VMOVNB a undef -> a
17411 // VMOVNB undef a -> a
17412 if (Op1->isUndef())
17413 return Op0;
17414 if (Op0->isUndef() && !IsTop)
17415 return Op1;
17416
17417 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17418 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17419 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17420 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17421 Op1->getConstantOperandVal(2) == 0)
17422 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17423 Op0, Op1->getOperand(1), N->getOperand(2));
17424
17425 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17426 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17427 // into the top or bottom lanes.
17428 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17429 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17430 APInt Op0DemandedElts =
17431 IsTop ? Op1DemandedElts
17432 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17433
17434 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17435 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17436 return SDValue(N, 0);
17437 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17438 return SDValue(N, 0);
17439
17440 return SDValue();
17441}
17442
17443static SDValue PerformVQMOVNCombine(SDNode *N,
17444 TargetLowering::DAGCombinerInfo &DCI) {
17445 SDValue Op0 = N->getOperand(0);
17446 unsigned IsTop = N->getConstantOperandVal(2);
17447
17448 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17449 APInt Op0DemandedElts =
17450 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17451 : APInt::getHighBitsSet(2, 1));
17452
17453 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17454 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17455 return SDValue(N, 0);
17456 return SDValue();
17457}
17458
17459static SDValue PerformVQDMULHCombine(SDNode *N,
17460 TargetLowering::DAGCombinerInfo &DCI) {
17461 EVT VT = N->getValueType(0);
17462 SDValue LHS = N->getOperand(0);
17463 SDValue RHS = N->getOperand(1);
17464
17465 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17466 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17467 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17468 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17469 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17470 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17471 SDLoc DL(N);
17472 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17473 LHS.getOperand(0), RHS.getOperand(0));
17474 SDValue UndefV = LHS.getOperand(1);
17475 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17476 }
17477 return SDValue();
17478}
17479
17480static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17481 SDLoc DL(N);
17482 SDValue Op0 = N->getOperand(0);
17483 SDValue Op1 = N->getOperand(1);
17484
17485 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17486 // uses of the intrinsics.
17487 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17488 int ShiftAmt = C->getSExtValue();
17489 if (ShiftAmt == 0) {
17490 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17491 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17492 return SDValue();
17493 }
17494
17495 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17496 unsigned NewOpcode =
17497 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17498 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17499 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17500 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17501 return NewShift;
17502 }
17503 }
17504
17505 return SDValue();
17506}
17507
17508/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17509SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17510 DAGCombinerInfo &DCI) const {
17511 SelectionDAG &DAG = DCI.DAG;
17512 unsigned IntNo = N->getConstantOperandVal(0);
17513 switch (IntNo) {
17514 default:
17515 // Don't do anything for most intrinsics.
17516 break;
17517
17518 // Vector shifts: check for immediate versions and lower them.
17519 // Note: This is done during DAG combining instead of DAG legalizing because
17520 // the build_vectors for 64-bit vector element shift counts are generally
17521 // not legal, and it is hard to see their values after they get legalized to
17522 // loads from a constant pool.
17523 case Intrinsic::arm_neon_vshifts:
17524 case Intrinsic::arm_neon_vshiftu:
17525 case Intrinsic::arm_neon_vrshifts:
17526 case Intrinsic::arm_neon_vrshiftu:
17527 case Intrinsic::arm_neon_vrshiftn:
17528 case Intrinsic::arm_neon_vqshifts:
17529 case Intrinsic::arm_neon_vqshiftu:
17530 case Intrinsic::arm_neon_vqshiftsu:
17531 case Intrinsic::arm_neon_vqshiftns:
17532 case Intrinsic::arm_neon_vqshiftnu:
17533 case Intrinsic::arm_neon_vqshiftnsu:
17534 case Intrinsic::arm_neon_vqrshiftns:
17535 case Intrinsic::arm_neon_vqrshiftnu:
17536 case Intrinsic::arm_neon_vqrshiftnsu: {
17537 EVT VT = N->getOperand(1).getValueType();
17538 int64_t Cnt;
17539 unsigned VShiftOpc = 0;
17540
17541 switch (IntNo) {
17542 case Intrinsic::arm_neon_vshifts:
17543 case Intrinsic::arm_neon_vshiftu:
17544 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17545 VShiftOpc = ARMISD::VSHLIMM;
17546 break;
17547 }
17548 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17549 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17550 : ARMISD::VSHRuIMM);
17551 break;
17552 }
17553 return SDValue();
17554
17555 case Intrinsic::arm_neon_vrshifts:
17556 case Intrinsic::arm_neon_vrshiftu:
17557 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17558 break;
17559 return SDValue();
17560
17561 case Intrinsic::arm_neon_vqshifts:
17562 case Intrinsic::arm_neon_vqshiftu:
17563 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17564 break;
17565 return SDValue();
17566
17567 case Intrinsic::arm_neon_vqshiftsu:
17568 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17569 break;
17570 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17571
17572 case Intrinsic::arm_neon_vrshiftn:
17573 case Intrinsic::arm_neon_vqshiftns:
17574 case Intrinsic::arm_neon_vqshiftnu:
17575 case Intrinsic::arm_neon_vqshiftnsu:
17576 case Intrinsic::arm_neon_vqrshiftns:
17577 case Intrinsic::arm_neon_vqrshiftnu:
17578 case Intrinsic::arm_neon_vqrshiftnsu:
17579 // Narrowing shifts require an immediate right shift.
17580 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17581 break;
17582 llvm_unreachable("invalid shift count for narrowing vector shift "
17583 "intrinsic");
17584
17585 default:
17586 llvm_unreachable("unhandled vector shift");
17587 }
17588
17589 switch (IntNo) {
17590 case Intrinsic::arm_neon_vshifts:
17591 case Intrinsic::arm_neon_vshiftu:
17592 // Opcode already set above.
17593 break;
17594 case Intrinsic::arm_neon_vrshifts:
17595 VShiftOpc = ARMISD::VRSHRsIMM;
17596 break;
17597 case Intrinsic::arm_neon_vrshiftu:
17598 VShiftOpc = ARMISD::VRSHRuIMM;
17599 break;
17600 case Intrinsic::arm_neon_vrshiftn:
17601 VShiftOpc = ARMISD::VRSHRNIMM;
17602 break;
17603 case Intrinsic::arm_neon_vqshifts:
17604 VShiftOpc = ARMISD::VQSHLsIMM;
17605 break;
17606 case Intrinsic::arm_neon_vqshiftu:
17607 VShiftOpc = ARMISD::VQSHLuIMM;
17608 break;
17609 case Intrinsic::arm_neon_vqshiftsu:
17610 VShiftOpc = ARMISD::VQSHLsuIMM;
17611 break;
17612 case Intrinsic::arm_neon_vqshiftns:
17613 VShiftOpc = ARMISD::VQSHRNsIMM;
17614 break;
17615 case Intrinsic::arm_neon_vqshiftnu:
17616 VShiftOpc = ARMISD::VQSHRNuIMM;
17617 break;
17618 case Intrinsic::arm_neon_vqshiftnsu:
17619 VShiftOpc = ARMISD::VQSHRNsuIMM;
17620 break;
17621 case Intrinsic::arm_neon_vqrshiftns:
17622 VShiftOpc = ARMISD::VQRSHRNsIMM;
17623 break;
17624 case Intrinsic::arm_neon_vqrshiftnu:
17625 VShiftOpc = ARMISD::VQRSHRNuIMM;
17626 break;
17627 case Intrinsic::arm_neon_vqrshiftnsu:
17628 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17629 break;
17630 }
17631
17632 SDLoc dl(N);
17633 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17634 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17635 }
17636
17637 case Intrinsic::arm_neon_vshiftins: {
17638 EVT VT = N->getOperand(1).getValueType();
17639 int64_t Cnt;
17640 unsigned VShiftOpc = 0;
17641
17642 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17643 VShiftOpc = ARMISD::VSLIIMM;
17644 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17645 VShiftOpc = ARMISD::VSRIIMM;
17646 else {
17647 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17648 }
17649
17650 SDLoc dl(N);
17651 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17652 N->getOperand(1), N->getOperand(2),
17653 DAG.getConstant(Cnt, dl, MVT::i32));
17654 }
17655
17656 case Intrinsic::arm_neon_vqrshifts:
17657 case Intrinsic::arm_neon_vqrshiftu:
17658 // No immediate versions of these to check for.
17659 break;
17660
17661 case Intrinsic::arm_neon_vbsl: {
17662 SDLoc dl(N);
17663 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17664 N->getOperand(2), N->getOperand(3));
17665 }
17666 case Intrinsic::arm_mve_vqdmlah:
17667 case Intrinsic::arm_mve_vqdmlash:
17668 case Intrinsic::arm_mve_vqrdmlah:
17669 case Intrinsic::arm_mve_vqrdmlash:
17670 case Intrinsic::arm_mve_vmla_n_predicated:
17671 case Intrinsic::arm_mve_vmlas_n_predicated:
17672 case Intrinsic::arm_mve_vqdmlah_predicated:
17673 case Intrinsic::arm_mve_vqdmlash_predicated:
17674 case Intrinsic::arm_mve_vqrdmlah_predicated:
17675 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17676 // These intrinsics all take an i32 scalar operand which is narrowed to the
17677 // size of a single lane of the vector type they return. So we don't need
17678 // any bits of that operand above that point, which allows us to eliminate
17679 // uxth/sxth.
17680 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17681 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17682 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17683 return SDValue();
17684 break;
17685 }
17686
17687 case Intrinsic::arm_mve_minv:
17688 case Intrinsic::arm_mve_maxv:
17689 case Intrinsic::arm_mve_minav:
17690 case Intrinsic::arm_mve_maxav:
17691 case Intrinsic::arm_mve_minv_predicated:
17692 case Intrinsic::arm_mve_maxv_predicated:
17693 case Intrinsic::arm_mve_minav_predicated:
17694 case Intrinsic::arm_mve_maxav_predicated: {
17695 // These intrinsics all take an i32 scalar operand which is narrowed to the
17696 // size of a single lane of the vector type they take as the other input.
17697 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17698 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17699 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17700 return SDValue();
17701 break;
17702 }
17703
17704 case Intrinsic::arm_mve_addv: {
17705 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17706 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17707 bool Unsigned = N->getConstantOperandVal(2);
17708 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17709 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17710 }
17711
17712 case Intrinsic::arm_mve_addlv:
17713 case Intrinsic::arm_mve_addlv_predicated: {
17714 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17715 // which recombines the two outputs into an i64
17716 bool Unsigned = N->getConstantOperandVal(2);
17717 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17718 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17719 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17720
17721 SmallVector<SDValue, 4> Ops;
17722 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17723 if (i != 2) // skip the unsigned flag
17724 Ops.push_back(N->getOperand(i));
17725
17726 SDLoc dl(N);
17727 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17728 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17729 val.getValue(1));
17730 }
17731 }
17732
17733 return SDValue();
17734}
17735
17736/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17737/// lowers them. As with the vector shift intrinsics, this is done during DAG
17738/// combining instead of DAG legalizing because the build_vectors for 64-bit
17739/// vector element shift counts are generally not legal, and it is hard to see
17740/// their values after they get legalized to loads from a constant pool.
17741static SDValue PerformShiftCombine(SDNode *N,
17742 TargetLowering::DAGCombinerInfo &DCI,
17743 const ARMSubtarget *ST) {
17744 SelectionDAG &DAG = DCI.DAG;
17745 EVT VT = N->getValueType(0);
17746
17747 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17748 N->getOperand(0)->getOpcode() == ISD::AND &&
17749 N->getOperand(0)->hasOneUse()) {
17750 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17751 return SDValue();
17752 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17753 // usually show up because instcombine prefers to canonicalize it to
17754 // (and (shl x, ShiftAmt), (shl AndMask, ShiftAmt)), but the shift can come
17755 // out of GEP lowering in some cases.
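// For example, (shl (and x, 0x3ff), 2) becomes (srl (shl x, 22), 20), avoiding
// materializing the mask constant.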
17756 SDValue N0 = N->getOperand(0);
17757 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17758 if (!ShiftAmtNode)
17759 return SDValue();
17760 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17761 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17762 if (!AndMaskNode)
17763 return SDValue();
17764 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17765 // Don't transform uxtb/uxth.
17766 if (AndMask == 255 || AndMask == 65535)
17767 return SDValue();
17768 if (isMask_32(AndMask)) {
17769 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17770 if (MaskedBits > ShiftAmt) {
17771 SDLoc DL(N);
17772 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17773 DAG.getConstant(MaskedBits, DL, MVT::i32));
17774 return DAG.getNode(
17775 ISD::SRL, DL, MVT::i32, SHL,
17776 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17777 }
17778 }
17779 }
17780
17781 // Nothing to be done for scalar shifts.
17782 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17783 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17784 return SDValue();
17785 if (ST->hasMVEIntegerOps())
17786 return SDValue();
17787
17788 int64_t Cnt;
17789
17790 switch (N->getOpcode()) {
17791 default: llvm_unreachable("unexpected shift opcode");
17792
17793 case ISD::SHL:
17794 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17795 SDLoc dl(N);
17796 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17797 DAG.getConstant(Cnt, dl, MVT::i32));
17798 }
17799 break;
17800
17801 case ISD::SRA:
17802 case ISD::SRL:
17803 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17804 unsigned VShiftOpc =
17805 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17806 SDLoc dl(N);
17807 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17808 DAG.getConstant(Cnt, dl, MVT::i32));
17809 }
17810 }
17811 return SDValue();
17812}
17813
17814// Look for a sign, zero or fp extend of a larger-than-legal load. This can be
17815// split into multiple extending loads, which are simpler to deal with than an
17816// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17817// to convert the type to an f32.
17818static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17819 SDValue N0 = N->getOperand(0);
17820 if (N0.getOpcode() != ISD::LOAD)
17821 return SDValue();
17822 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17823 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17824 LD->getExtensionType() != ISD::NON_EXTLOAD)
17825 return SDValue();
17826 EVT FromVT = LD->getValueType(0);
17827 EVT ToVT = N->getValueType(0);
17828 if (!ToVT.isVector())
17829 return SDValue();
17831 EVT ToEltVT = ToVT.getVectorElementType();
17832 EVT FromEltVT = FromVT.getVectorElementType();
17833
17834 unsigned NumElements = 0;
17835 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17836 NumElements = 4;
17837 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17838 NumElements = 4;
17839 if (NumElements == 0 ||
17840 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17841 FromVT.getVectorNumElements() % NumElements != 0 ||
17842 !isPowerOf2_32(NumElements))
17843 return SDValue();
17844
17845 LLVMContext &C = *DAG.getContext();
17846 SDLoc DL(LD);
17847 // Details about the old load
17848 SDValue Ch = LD->getChain();
17849 SDValue BasePtr = LD->getBasePtr();
17850 Align Alignment = LD->getBaseAlign();
17851 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17852 AAMDNodes AAInfo = LD->getAAInfo();
17853
17854 ISD::LoadExtType NewExtType =
17855 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17856 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17857 EVT NewFromVT = EVT::getVectorVT(
17858 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17859 EVT NewToVT = EVT::getVectorVT(
17860 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17861
17862 SmallVector<SDValue, 4> Loads;
17863 SmallVector<SDValue, 4> Chains;
17864 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17865 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17866 SDValue NewPtr =
17867 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17868
17869 SDValue NewLoad =
17870 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17871 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17872 Alignment, MMOFlags, AAInfo);
17873 Loads.push_back(NewLoad);
17874 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17875 }
17876
17877 // Float truncs need to be extended with VCVTBs into their floating point types.
17878 if (FromEltVT == MVT::f16) {
17879 SmallVector<SDValue, 4> Extends;
17880
17881 for (unsigned i = 0; i < Loads.size(); i++) {
17882 SDValue LoadBC =
17883 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17884 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17885 DAG.getConstant(0, DL, MVT::i32));
17886 Extends.push_back(FPExt);
17887 }
17888
17889 Loads = Extends;
17890 }
17891
17892 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17893 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17894 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17895}
17896
17897/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17898/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17899 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17900 const ARMSubtarget *ST) {
17901 SDValue N0 = N->getOperand(0);
17902
17903 // Check for sign- and zero-extensions of vector extract operations of 8- and
17904 // 16-bit vector elements. NEON and MVE support these directly. They are
17905 // handled during DAG combining because type legalization will promote them
17906 // to 32-bit types and it is messy to recognize the operations after that.
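// For example, (sext (extract_vector_elt v8i16:x, n)) with a constant n can be
// matched here directly to a VGETLANEs node.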
17907 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17908 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17909 SDValue Vec = N0.getOperand(0);
17910 SDValue Lane = N0.getOperand(1);
17911 EVT VT = N->getValueType(0);
17912 EVT EltVT = N0.getValueType();
17913 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17914
17915 if (VT == MVT::i32 &&
17916 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17917 TLI.isTypeLegal(Vec.getValueType()) &&
17918 isa<ConstantSDNode>(Lane)) {
17919
17920 unsigned Opc = 0;
17921 switch (N->getOpcode()) {
17922 default: llvm_unreachable("unexpected opcode");
17923 case ISD::SIGN_EXTEND:
17924 Opc = ARMISD::VGETLANEs;
17925 break;
17926 case ISD::ZERO_EXTEND:
17927 case ISD::ANY_EXTEND:
17928 Opc = ARMISD::VGETLANEu;
17929 break;
17930 }
17931 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17932 }
17933 }
17934
17935 if (ST->hasMVEIntegerOps())
17936 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17937 return NewLoad;
17938
17939 return SDValue();
17940}
17941
17942 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17943 const ARMSubtarget *ST) {
17944 if (ST->hasMVEFloatOps())
17945 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17946 return NewLoad;
17947
17948 return SDValue();
17949}
17950
17951// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17952// constant bounds.
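// For example, smin(smax(x, -128), 127) can become an SSAT and
// smin(smax(x, 0), 255) can become a USAT.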
17953 static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17954 const ARMSubtarget *Subtarget) {
17955 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17956 !Subtarget->isThumb2())
17957 return SDValue();
17958
17959 EVT VT = Op.getValueType();
17960 SDValue Op0 = Op.getOperand(0);
17961
17962 if (VT != MVT::i32 ||
17963 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17964 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17965 !isa<ConstantSDNode>(Op0.getOperand(1)))
17966 return SDValue();
17967
17968 SDValue Min = Op;
17969 SDValue Max = Op0;
17970 SDValue Input = Op0.getOperand(0);
17971 if (Min.getOpcode() == ISD::SMAX)
17972 std::swap(Min, Max);
17973
17974 APInt MinC = Min.getConstantOperandAPInt(1);
17975 APInt MaxC = Max.getConstantOperandAPInt(1);
17976
17977 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17978 !(MinC + 1).isPowerOf2())
17979 return SDValue();
17980
17981 SDLoc DL(Op);
17982 if (MinC == ~MaxC)
17983 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17984 DAG.getConstant(MinC.countr_one(), DL, VT));
17985 if (MaxC == 0)
17986 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17987 DAG.getConstant(MinC.countr_one(), DL, VT));
17988
17989 return SDValue();
17990}
17991
17992/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17993/// saturates.
17994 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
17995 const ARMSubtarget *ST) {
17996 EVT VT = N->getValueType(0);
17997 SDValue N0 = N->getOperand(0);
17998
17999 if (VT == MVT::i32)
18000 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
18001
18002 if (!ST->hasMVEIntegerOps())
18003 return SDValue();
18004
18005 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18006 return V;
18007
18008 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18009 return SDValue();
18010
18011 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18012 // Check one is a smin and the other is a smax
18013 if (Min->getOpcode() != ISD::SMIN)
18014 std::swap(Min, Max);
18015 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18016 return false;
18017
18018 APInt SaturateC;
18019 if (VT == MVT::v4i32)
18020 SaturateC = APInt(32, (1 << 15) - 1, true);
18021 else //if (VT == MVT::v8i16)
18022 SaturateC = APInt(16, (1 << 7) - 1, true);
18023
18024 APInt MinC, MaxC;
18025 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18026 MinC != SaturateC)
18027 return false;
18028 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18029 MaxC != ~SaturateC)
18030 return false;
18031 return true;
18032 };
18033
18034 if (IsSignedSaturate(N, N0.getNode())) {
18035 SDLoc DL(N);
18036 MVT ExtVT, HalfVT;
18037 if (VT == MVT::v4i32) {
18038 HalfVT = MVT::v8i16;
18039 ExtVT = MVT::v4i16;
18040 } else { // if (VT == MVT::v8i16)
18041 HalfVT = MVT::v16i8;
18042 ExtVT = MVT::v8i8;
18043 }
18044
18045 // Create a VQMOVNB with undef top lanes, then sign extended into the top
18046 // half. That extend will hopefully be removed if only the bottom bits are
18047 // demanded (through a truncating store, for example).
18048 SDValue VQMOVN =
18049 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18050 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18051 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18052 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18053 DAG.getValueType(ExtVT));
18054 }
18055
18056 auto IsUnsignedSaturate = [&](SDNode *Min) {
18057 // For unsigned, we just need to check for <= 0xffff
18058 if (Min->getOpcode() != ISD::UMIN)
18059 return false;
18060
18061 APInt SaturateC;
18062 if (VT == MVT::v4i32)
18063 SaturateC = APInt(32, (1 << 16) - 1, true);
18064 else //if (VT == MVT::v8i16)
18065 SaturateC = APInt(16, (1 << 8) - 1, true);
18066
18067 APInt MinC;
18068 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18069 MinC != SaturateC)
18070 return false;
18071 return true;
18072 };
18073
18074 if (IsUnsignedSaturate(N)) {
18075 SDLoc DL(N);
18076 MVT HalfVT;
18077 unsigned ExtConst;
18078 if (VT == MVT::v4i32) {
18079 HalfVT = MVT::v8i16;
18080 ExtConst = 0x0000FFFF;
18081 } else { //if (VT == MVT::v8i16)
18082 HalfVT = MVT::v16i8;
18083 ExtConst = 0x00FF;
18084 }
18085
18086 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18087 // an AND. That extend will hopefully be removed if only the bottom bits are
18088 // demanded (through a truncating store, for example).
18089 SDValue VQMOVN =
18090 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18091 DAG.getConstant(0, DL, MVT::i32));
18092 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18093 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18094 DAG.getConstant(ExtConst, DL, VT));
18095 }
18096
18097 return SDValue();
18098}
18099
18100 static const APInt *isPowerOf2Constant(SDValue V) {
18101 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
18102 if (!C)
18103 return nullptr;
18104 const APInt *CV = &C->getAPIntValue();
18105 return CV->isPowerOf2() ? CV : nullptr;
18106}
18107
18108 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
18109 // If we have a CMOV, OR and AND combination such as:
18110 // if (x & CN)
18111 // y |= CM;
18112 //
18113 // And:
18114 // * CN is a single bit;
18115 // * All bits covered by CM are known zero in y
18116 //
18117 // Then we can convert this into a sequence of BFI instructions. This will
18118 // always be a win if CM is a single bit, will always be no worse than the
18119 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18120 // three bits (due to the extra IT instruction).
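// For example (illustrative), for 'if (x & 4) y |= 0x30;' with bits 4-5 of y
// known zero, this emits two BFIs that copy bit 2 of x into bits 4 and 5 of y.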
18121
18122 SDValue Op0 = CMOV->getOperand(0);
18123 SDValue Op1 = CMOV->getOperand(1);
18124 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18125 SDValue CmpZ = CMOV->getOperand(3);
18126
18127 // The compare must be against zero.
18128 if (!isNullConstant(CmpZ->getOperand(1)))
18129 return SDValue();
18130
18131 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18132 SDValue And = CmpZ->getOperand(0);
18133 if (And->getOpcode() != ISD::AND)
18134 return SDValue();
18135 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18136 if (!AndC)
18137 return SDValue();
18138 SDValue X = And->getOperand(0);
18139
18140 if (CC == ARMCC::EQ) {
18141 // We're performing an "equal to zero" compare. Swap the operands so we
18142 // canonicalize on a "not equal to zero" compare.
18143 std::swap(Op0, Op1);
18144 } else {
18145 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18146 }
18147
18148 if (Op1->getOpcode() != ISD::OR)
18149 return SDValue();
18150
18151 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
18152 if (!OrC)
18153 return SDValue();
18154 SDValue Y = Op1->getOperand(0);
18155
18156 if (Op0 != Y)
18157 return SDValue();
18158
18159 // Now, is it profitable to continue?
18160 APInt OrCI = OrC->getAPIntValue();
18161 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18162 if (OrCI.popcount() > Heuristic)
18163 return SDValue();
18164
18165 // Lastly, can we determine that the bits defined by OrCI
18166 // are zero in Y?
18167 KnownBits Known = DAG.computeKnownBits(Y);
18168 if ((OrCI & Known.Zero) != OrCI)
18169 return SDValue();
18170
18171 // OK, we can do the combine.
18172 SDValue V = Y;
18173 SDLoc dl(X);
18174 EVT VT = X.getValueType();
18175 unsigned BitInX = AndC->logBase2();
18176
18177 if (BitInX != 0) {
18178 // We must shift X first.
18179 X = DAG.getNode(ISD::SRL, dl, VT, X,
18180 DAG.getConstant(BitInX, dl, VT));
18181 }
18182
18183 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18184 BitInY < NumActiveBits; ++BitInY) {
18185 if (OrCI[BitInY] == 0)
18186 continue;
18187 APInt Mask(VT.getSizeInBits(), 0);
18188 Mask.setBit(BitInY);
18189 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18190 // Confusingly, the operand is an *inverted* mask.
18191 DAG.getConstant(~Mask, dl, VT));
18192 }
18193
18194 return V;
18195}
18196
18197// Given N, the value controlling the conditional branch, search for the loop
18198// intrinsic, returning it, along with how the value is used. We need to handle
18199// patterns such as the following:
18200// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18201// (brcond (setcc (loop.decrement), 0, eq), exit)
18202// (brcond (setcc (loop.decrement), 0, ne), header)
18203 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
18204 bool &Negate) {
18205 switch (N->getOpcode()) {
18206 default:
18207 break;
18208 case ISD::XOR: {
18209 if (!isa<ConstantSDNode>(N.getOperand(1)))
18210 return SDValue();
18211 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18212 return SDValue();
18213 Negate = !Negate;
18214 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18215 }
18216 case ISD::SETCC: {
18217 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18218 if (!Const)
18219 return SDValue();
18220 if (Const->isZero())
18221 Imm = 0;
18222 else if (Const->isOne())
18223 Imm = 1;
18224 else
18225 return SDValue();
18226 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18227 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18228 }
18229 case ISD::INTRINSIC_W_CHAIN: {
18230 unsigned IntOp = N.getConstantOperandVal(1);
18231 if (IntOp != Intrinsic::test_start_loop_iterations &&
18232 IntOp != Intrinsic::loop_decrement_reg)
18233 return SDValue();
18234 return N;
18235 }
18236 }
18237 return SDValue();
18238}
18239
18240 static SDValue PerformHWLoopCombine(SDNode *N,
18241 TargetLowering::DAGCombinerInfo &DCI,
18242 const ARMSubtarget *ST) {
18243
18244 // The hwloop intrinsics that we're interested in are used for control-flow,
18245 // either for entering or exiting the loop:
18246 // - test.start.loop.iterations will test whether its operand is zero. If it
18247 // is zero, the proceeding branch should not enter the loop.
18248 // - loop.decrement.reg also tests whether its operand is zero. If it is
18249 // zero, the proceeding branch should not branch back to the beginning of
18250 // the loop.
18251 // So here, we need to check how the brcond is using the result of each
18252 // of the intrinsics to ensure that we're branching to the right place at the
18253 // right time.
18254
18255 ISD::CondCode CC;
18256 SDValue Cond;
18257 int Imm = 1;
18258 bool Negate = false;
18259 SDValue Chain = N->getOperand(0);
18260 SDValue Dest;
18261
18262 if (N->getOpcode() == ISD::BRCOND) {
18263 CC = ISD::SETEQ;
18264 Cond = N->getOperand(1);
18265 Dest = N->getOperand(2);
18266 } else {
18267 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18268 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18269 Cond = N->getOperand(2);
18270 Dest = N->getOperand(4);
18271 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18272 if (!Const->isOne() && !Const->isZero())
18273 return SDValue();
18274 Imm = Const->getZExtValue();
18275 } else
18276 return SDValue();
18277 }
18278
18279 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18280 if (!Int)
18281 return SDValue();
18282
18283 if (Negate)
18284 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18285
18286 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18287 return (CC == ISD::SETEQ && Imm == 0) ||
18288 (CC == ISD::SETNE && Imm == 1) ||
18289 (CC == ISD::SETLT && Imm == 1) ||
18290 (CC == ISD::SETULT && Imm == 1);
18291 };
18292
18293 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18294 return (CC == ISD::SETEQ && Imm == 1) ||
18295 (CC == ISD::SETNE && Imm == 0) ||
18296 (CC == ISD::SETGT && Imm == 0) ||
18297 (CC == ISD::SETUGT && Imm == 0) ||
18298 (CC == ISD::SETGE && Imm == 1) ||
18299 (CC == ISD::SETUGE && Imm == 1);
18300 };
18301
18302 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18303 "unsupported condition");
18304
18305 SDLoc dl(Int);
18306 SelectionDAG &DAG = DCI.DAG;
18307 SDValue Elements = Int.getOperand(2);
18308 unsigned IntOp = Int->getConstantOperandVal(1);
18309 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18310 "expected single br user");
18311 SDNode *Br = *N->user_begin();
18312 SDValue OtherTarget = Br->getOperand(1);
18313
18314 // Update the unconditional branch to branch to the given Dest.
18315 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18316 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18317 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18318 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18319 };
18320
18321 if (IntOp == Intrinsic::test_start_loop_iterations) {
18322 SDValue Res;
18323 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18324 // We expect this 'instruction' to branch when the counter is zero.
18325 if (IsTrueIfZero(CC, Imm)) {
18326 SDValue Ops[] = {Chain, Setup, Dest};
18327 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18328 } else {
18329 // The logic is the reverse of what we need for WLS, so find the other
18330 // basic block target: the target of the proceeding br.
18331 UpdateUncondBr(Br, Dest, DAG);
18332
18333 SDValue Ops[] = {Chain, Setup, OtherTarget};
18334 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18335 }
18336 // Update LR count to the new value
18337 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18338 // Update chain
18339 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18340 return Res;
18341 } else {
18342 SDValue Size =
18343 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18344 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18345 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18346 DAG.getVTList(MVT::i32, MVT::Other), Args);
18347 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18348
18349 // We expect this instruction to branch when the count is not zero.
18350 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18351
18352 // Update the unconditional branch to target the loop preheader if we've
18353 // found the condition has been reversed.
18354 if (Target == OtherTarget)
18355 UpdateUncondBr(Br, Dest, DAG);
18356
18357 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18358 SDValue(LoopDec.getNode(), 1), Chain);
18359
18360 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18361 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18362 }
18363 return SDValue();
18364}
18365
18366/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18367SDValue
18368 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18369 SDValue Cmp = N->getOperand(3);
18370 if (Cmp.getOpcode() != ARMISD::CMPZ)
18371 // Only looking at NE cases.
18372 return SDValue();
18373
18374 SDLoc dl(N);
18375 SDValue LHS = Cmp.getOperand(0);
18376 SDValue RHS = Cmp.getOperand(1);
18377 SDValue Chain = N->getOperand(0);
18378 SDValue BB = N->getOperand(1);
18379 SDValue ARMcc = N->getOperand(2);
18380 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18381
18382 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18383 // -> (brcond Chain BB CC Flags)
18384 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18385 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18386 LHS->getOperand(0)->hasOneUse() &&
18387 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18388 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18389 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18390 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18391 LHS->getOperand(0)->getOperand(2),
18392 LHS->getOperand(0)->getOperand(3));
18393 }
18394
18395 return SDValue();
18396}
18397
18398/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18399SDValue
18400 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18401 SDValue Cmp = N->getOperand(3);
18402 if (Cmp.getOpcode() != ARMISD::CMPZ)
18403 // Only looking at EQ and NE cases.
18404 return SDValue();
18405
18406 EVT VT = N->getValueType(0);
18407 SDLoc dl(N);
18408 SDValue LHS = Cmp.getOperand(0);
18409 SDValue RHS = Cmp.getOperand(1);
18410 SDValue FalseVal = N->getOperand(0);
18411 SDValue TrueVal = N->getOperand(1);
18412 SDValue ARMcc = N->getOperand(2);
18413 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18414
18415 // BFI is only available on V6T2+.
18416 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18417 SDValue R = PerformCMOVToBFICombine(N, DAG);
18418 if (R)
18419 return R;
18420 }
18421
18422 // Simplify
18423 // mov r1, r0
18424 // cmp r1, x
18425 // mov r0, y
18426 // moveq r0, x
18427 // to
18428 // cmp r0, x
18429 // movne r0, y
18430 //
18431 // mov r1, r0
18432 // cmp r1, x
18433 // mov r0, x
18434 // movne r0, y
18435 // to
18436 // cmp r0, x
18437 // movne r0, y
18438 /// FIXME: Turn this into a target neutral optimization?
18439 SDValue Res;
18440 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18441 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18442 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18443 SDValue ARMcc;
18444 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18445 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18446 }
18447
18448 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18449 // -> (cmov F T CC Flags)
18450 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18451 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18452 isNullConstant(RHS)) {
18453 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18454 LHS->getOperand(2), LHS->getOperand(3));
18455 }
18456
18457 if (!VT.isInteger())
18458 return SDValue();
18459
18460 // Fold away an unnecessary CMPZ/CMOV
18461 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18462 // if C1==EQ -> CMOV A, B, C2, D
18463 // if C1==NE -> CMOV A, B, NOT(C2), D
18464 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18465 N->getConstantOperandVal(2) == ARMCC::NE) {
18466 ARMCC::CondCodes Cond;
18467 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18468 if (N->getConstantOperandVal(2) == ARMCC::NE)
18469 Cond = ARMCC::getOppositeCondition(Cond);
18470 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18471 N->getOperand(1),
18472 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18473 }
18474 }
18475
18476 // Materialize a boolean comparison for integers so we can avoid branching.
18477 if (isNullConstant(FalseVal)) {
18478 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18479 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18480 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18481 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18482 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18483 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18484 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18485 DAG.getConstant(5, dl, MVT::i32));
18486 } else {
18487 // CMOV 0, 1, ==, (CMPZ x, y) ->
18488 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18489 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18490 //
18491 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18492 // x != y. In other words, a carry C == 1 when x == y, C == 0
18493 // otherwise.
18494 // The final UADDO_CARRY computes
18495 // x - y + (0 - (x - y)) + C == C
18496 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18497 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18498 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18499 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18500 // actually.
18501 SDValue Carry =
18502 DAG.getNode(ISD::SUB, dl, MVT::i32,
18503 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18504 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18505 }
18506 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18507 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18508 // This seems pointless but will allow us to combine it further below.
18509 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18510 SDValue Sub =
18511 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18512 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18513 Sub.getValue(1));
18514 FalseVal = Sub;
18515 }
18516 } else if (isNullConstant(TrueVal)) {
18517 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18518 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18519 // This seems pointless but will allow us to combine it further below
18520 // Note that we change == for != as this is the dual for the case above.
18521 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18522 SDValue Sub =
18523 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18524 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18525 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18526 Sub.getValue(1));
18527 FalseVal = Sub;
18528 }
18529 }
18530
18531 // On Thumb1, the DAG above may be further combined if z is a power of 2
18532 // (z == 2 ^ K).
18533 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18534 // t1 = (USUBO (SUB x, y), 1)
18535 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18536 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18537 //
18538 // This also handles the special case of comparing against zero; it's
18539 // essentially the same pattern, except there's no SUBC:
18540 // CMOV x, z, !=, (CMPZ x, 0) ->
18541 // t1 = (USUBO x, 1)
18542 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18543 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18544 const APInt *TrueConst;
18545 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18546 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18547 FalseVal.getOperand(1) == RHS) ||
18548 (FalseVal == LHS && isNullConstant(RHS))) &&
18549 (TrueConst = isPowerOf2Constant(TrueVal))) {
18550 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18551 unsigned ShiftAmount = TrueConst->logBase2();
18552 if (ShiftAmount)
18553 TrueVal = DAG.getConstant(1, dl, VT);
18554 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18555 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18556 Subc.getValue(1));
18557
18558 if (ShiftAmount)
18559 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18560 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18561 }
18562
18563 if (Res.getNode()) {
18564 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18565 // Capture demanded bits information that would be otherwise lost.
18566 if (Known.Zero == 0xfffffffe)
18567 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18568 DAG.getValueType(MVT::i1));
18569 else if (Known.Zero == 0xffffff00)
18570 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18571 DAG.getValueType(MVT::i8));
18572 else if (Known.Zero == 0xffff0000)
18573 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18574 DAG.getValueType(MVT::i16));
18575 }
18576
18577 return Res;
18578}
18579
18580 static SDValue PerformBITCASTCombine(SDNode *N,
18581 TargetLowering::DAGCombinerInfo &DCI,
18582 const ARMSubtarget *ST) {
18583 SelectionDAG &DAG = DCI.DAG;
18584 SDValue Src = N->getOperand(0);
18585 EVT DstVT = N->getValueType(0);
18586
18587 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18588 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18589 EVT SrcVT = Src.getValueType();
18590 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18591 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18592 }
18593
18594 // We may have a bitcast of something that has already had this bitcast
18595 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18596 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18597 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18598 Src.getValueType().getScalarSizeInBits())
18599 Src = Src.getOperand(0);
18600
18601 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18602 // would be generated is at least the width of the element type.
18603 EVT SrcVT = Src.getValueType();
18604 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18605 Src.getOpcode() == ARMISD::VMVNIMM ||
18606 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18607 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18608 DAG.getDataLayout().isBigEndian())
18609 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18610
18611 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18612 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18613 return R;
18614
18615 return SDValue();
18616}
18617
18618// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18619// node into stack operations after legalizeOps.
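// For example, an MVETRUNC of two v4i32 operands produces a single v8i16
// result holding the truncated lanes in order.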
18620 static SDValue PerformMVETruncCombine(SDNode *N,
18621 TargetLowering::DAGCombinerInfo &DCI) {
18622 SelectionDAG &DAG = DCI.DAG;
18623 EVT VT = N->getValueType(0);
18624 SDLoc DL(N);
18625
18626 // MVETrunc(Undef, Undef) -> Undef
18627 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18628 return DAG.getUNDEF(VT);
18629
18630 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18631 if (N->getNumOperands() == 2 &&
18632 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18633 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18634 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18635 N->getOperand(0).getOperand(1),
18636 N->getOperand(1).getOperand(0),
18637 N->getOperand(1).getOperand(1));
18638
18639 // MVETrunc(shuffle, shuffle) -> VMOVN
18640 if (N->getNumOperands() == 2 &&
18641 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18642 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18643 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18644 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18645
18646 if (S0->getOperand(0) == S1->getOperand(0) &&
18647 S0->getOperand(1) == S1->getOperand(1)) {
18648 // Construct complete shuffle mask
18649 SmallVector<int, 8> Mask(S0->getMask());
18650 Mask.append(S1->getMask().begin(), S1->getMask().end());
18651
18652 if (isVMOVNTruncMask(Mask, VT, false))
18653 return DAG.getNode(
18654 ARMISD::VMOVN, DL, VT,
18655 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18656 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18657 DAG.getConstant(1, DL, MVT::i32));
18658 if (isVMOVNTruncMask(Mask, VT, true))
18659 return DAG.getNode(
18660 ARMISD::VMOVN, DL, VT,
18661 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18662 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18663 DAG.getConstant(1, DL, MVT::i32));
18664 }
18665 }
18666
18667 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18668 // truncate to a buildvector to allow the generic optimisations to kick in.
18669 if (all_of(N->ops(), [](SDValue Op) {
18670 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18671 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18672 (Op.getOpcode() == ISD::BITCAST &&
18673 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18674 })) {
18675 SmallVector<SDValue, 8> Extracts;
18676 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18677 SDValue O = N->getOperand(Op);
18678 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18679 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18680 DAG.getConstant(i, DL, MVT::i32));
18681 Extracts.push_back(Ext);
18682 }
18683 }
18684 return DAG.getBuildVector(VT, DL, Extracts);
18685 }
18686
18687 // If we are late in the legalization process and nothing has optimised
18688 // the trunc to anything better, lower it to a stack store and reload,
18689 // performing the truncation whilst keeping the lanes in the correct order:
18690 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18691 if (!DCI.isAfterLegalizeDAG())
18692 return SDValue();
18693
18694 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18695 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18696 int NumIns = N->getNumOperands();
18697 assert((NumIns == 2 || NumIns == 4) &&
18698 "Expected 2 or 4 inputs to an MVETrunc");
18699 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18700 if (N->getNumOperands() == 4)
18701 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18702
18703 SmallVector<SDValue> Chains;
18704 for (int I = 0; I < NumIns; I++) {
18705 SDValue Ptr = DAG.getNode(
18706 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18707 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18708 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18709 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18710 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18711 Ptr, MPI, StoreVT, Align(4));
18712 Chains.push_back(Ch);
18713 }
18714
18715 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18716 MachinePointerInfo MPI =
18717 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18718 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18719}
18720
18721// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
18722 static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18723 SelectionDAG &DAG) {
18724 SDValue N0 = N->getOperand(0);
18725 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18726 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18727 return SDValue();
18728
18729 EVT FromVT = LD->getMemoryVT();
18730 EVT ToVT = N->getValueType(0);
18731 if (!ToVT.isVector())
18732 return SDValue();
18733 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18734 EVT ToEltVT = ToVT.getVectorElementType();
18735 EVT FromEltVT = FromVT.getVectorElementType();
18736
18737 unsigned NumElements = 0;
18738 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18739 NumElements = 4;
18740 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18741 NumElements = 8;
18742 assert(NumElements != 0);
18743
18744 ISD::LoadExtType NewExtType =
18745 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18746 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18747 LD->getExtensionType() != ISD::EXTLOAD &&
18748 LD->getExtensionType() != NewExtType)
18749 return SDValue();
18750
18751 LLVMContext &C = *DAG.getContext();
18752 SDLoc DL(LD);
18753 // Details about the old load
18754 SDValue Ch = LD->getChain();
18755 SDValue BasePtr = LD->getBasePtr();
18756 Align Alignment = LD->getBaseAlign();
18757 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18758 AAMDNodes AAInfo = LD->getAAInfo();
18759
18760 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18761 EVT NewFromVT = EVT::getVectorVT(
18762 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18763 EVT NewToVT = EVT::getVectorVT(
18764 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18765
18766 SmallVector<SDValue, 4> Loads;
18767 SmallVector<SDValue, 4> Chains;
18768 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18769 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18770 SDValue NewPtr =
18771 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18772
18773 SDValue NewLoad =
18774 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18775 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18776 Alignment, MMOFlags, AAInfo);
18777 Loads.push_back(NewLoad);
18778 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18779 }
18780
18781 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18782 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18783 return DAG.getMergeValues(Loads, DL);
18784}
18785
18786 // Perform combines for MVEEXT. If it has not been optimized to anything better
18787 // before lowering, it gets converted to a stack store and extloads performing
18788 // the extend whilst still keeping the same lane ordering.
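// For example, an MVESEXT of a v8i16 operand produces two v4i32 results
// covering the low and high halves of the input lanes.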
18789 static SDValue PerformMVEExtCombine(SDNode *N,
18790 TargetLowering::DAGCombinerInfo &DCI) {
18791 SelectionDAG &DAG = DCI.DAG;
18792 EVT VT = N->getValueType(0);
18793 SDLoc DL(N);
18794 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18795 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18796
18797 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18798 *DAG.getContext());
18799 auto Extend = [&](SDValue V) {
18800 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18801 return N->getOpcode() == ARMISD::MVESEXT
18802 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18803 DAG.getValueType(ExtVT))
18804 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18805 };
18806
18807 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18808 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18809 SDValue Ext = Extend(N->getOperand(0));
18810 return DAG.getMergeValues({Ext, Ext}, DL);
18811 }
18812
18813 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18814 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18815 ArrayRef<int> Mask = SVN->getMask();
18816 assert(Mask.size() == 2 * VT.getVectorNumElements());
18817 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18818 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18819 SDValue Op0 = SVN->getOperand(0);
18820 SDValue Op1 = SVN->getOperand(1);
18821
18822 auto CheckInregMask = [&](int Start, int Offset) {
18823 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18824 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18825 return false;
18826 return true;
18827 };
18828 SDValue V0 = SDValue(N, 0);
18829 SDValue V1 = SDValue(N, 1);
18830 if (CheckInregMask(0, 0))
18831 V0 = Extend(Op0);
18832 else if (CheckInregMask(0, 1))
18833 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18834 else if (CheckInregMask(0, Mask.size()))
18835 V0 = Extend(Op1);
18836 else if (CheckInregMask(0, Mask.size() + 1))
18837 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18838
18839 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18840 V1 = Extend(Op1);
18841 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18842 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18843 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18844 V1 = Extend(Op0);
18845 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18846 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18847
18848 if (V0.getNode() != N || V1.getNode() != N)
18849 return DAG.getMergeValues({V0, V1}, DL);
18850 }
18851
18852 // MVEEXT(load) -> extload, extload
18853 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18854 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18855 return L;
18856
18857 if (!DCI.isAfterLegalizeDAG())
18858 return SDValue();
18859
18860 // Lower to a stack store and reload:
18861 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18862 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18863 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18864 int NumOuts = N->getNumValues();
18865 assert((NumOuts == 2 || NumOuts == 4) &&
18866 "Expected 2 or 4 outputs to an MVEEXT");
18867 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18868 *DAG.getContext());
18869 if (N->getNumOperands() == 4)
18870 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18871
18872 MachinePointerInfo MPI =
18873 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18874 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18875 StackPtr, MPI, Align(4));
18876
18877 SmallVector<SDValue> Loads;
18878 for (int I = 0; I < NumOuts; I++) {
18879 SDValue Ptr = DAG.getNode(
18880 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18881 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18882 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18883 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18884 SDValue Load = DAG.getExtLoad(
18885 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18886 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18887 Loads.push_back(Load);
18888 }
18889
18890 return DAG.getMergeValues(Loads, DL);
18891}
18892
18893 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18894 DAGCombinerInfo &DCI) const {
18895 switch (N->getOpcode()) {
18896 default: break;
18897 case ISD::SELECT_CC:
18898 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18899 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18900 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18901 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18902 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18903 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18904 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18905 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18906 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18907 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18908 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18909 case ISD::BRCOND:
18910 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18911 case ARMISD::ADDC:
18912 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18913 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18914 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18915 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18916 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18917 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18918 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18919 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18920 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18922 case ISD::EXTRACT_VECTOR_ELT:
18923 return PerformExtractEltCombine(N, DCI, Subtarget);
18927 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18928 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18929 case ISD::FP_TO_SINT:
18930 case ISD::FP_TO_UINT:
18931 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18932 case ISD::FADD:
18933 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18934 case ISD::FMUL:
18935 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18936 case ISD::INTRINSIC_WO_CHAIN:
18937 return PerformIntrinsicCombine(N, DCI);
18938 case ISD::SHL:
18939 case ISD::SRA:
18940 case ISD::SRL:
18941 return PerformShiftCombine(N, DCI, Subtarget);
18942 case ISD::SIGN_EXTEND:
18943 case ISD::ZERO_EXTEND:
18944 case ISD::ANY_EXTEND:
18945 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18946 case ISD::FP_EXTEND:
18947 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18948 case ISD::SMIN:
18949 case ISD::UMIN:
18950 case ISD::SMAX:
18951 case ISD::UMAX:
18952 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18953 case ARMISD::CMOV:
18954 return PerformCMOVCombine(N, DCI.DAG);
18955 case ARMISD::BRCOND:
18956 return PerformBRCONDCombine(N, DCI.DAG);
18957 case ARMISD::CMPZ:
18958 return PerformCMPZCombine(N, DCI.DAG);
18959 case ARMISD::CSINC:
18960 case ARMISD::CSINV:
18961 case ARMISD::CSNEG:
18962 return PerformCSETCombine(N, DCI.DAG);
18963 case ISD::LOAD:
18964 return PerformLOADCombine(N, DCI, Subtarget);
18965 case ARMISD::VLD1DUP:
18966 case ARMISD::VLD2DUP:
18967 case ARMISD::VLD3DUP:
18968 case ARMISD::VLD4DUP:
18969 return PerformVLDCombine(N, DCI);
18970 case ARMISD::BUILD_VECTOR:
18971 return PerformARMBUILD_VECTORCombine(N, DCI);
18972 case ISD::BITCAST:
18973 return PerformBITCASTCombine(N, DCI, Subtarget);
18974 case ARMISD::PREDICATE_CAST:
18975 return PerformPREDICATE_CASTCombine(N, DCI);
18976 case ARMISD::VECTOR_REG_CAST:
18977 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18978 case ARMISD::MVETRUNC:
18979 return PerformMVETruncCombine(N, DCI);
18980 case ARMISD::MVESEXT:
18981 case ARMISD::MVEZEXT:
18982 return PerformMVEExtCombine(N, DCI);
18983 case ARMISD::VCMP:
18984 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18985 case ISD::VECREDUCE_ADD:
18986 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18987 case ARMISD::VADDVs:
18988 case ARMISD::VADDVu:
18989 case ARMISD::VADDLVs:
18990 case ARMISD::VADDLVu:
18991 case ARMISD::VADDLVAs:
18992 case ARMISD::VADDLVAu:
18993 case ARMISD::VMLAVs:
18994 case ARMISD::VMLAVu:
18995 case ARMISD::VMLALVs:
18996 case ARMISD::VMLALVu:
18997 case ARMISD::VMLALVAs:
18998 case ARMISD::VMLALVAu:
18999 return PerformReduceShuffleCombine(N, DCI.DAG);
19000 case ARMISD::VMOVN:
19001 return PerformVMOVNCombine(N, DCI);
19002 case ARMISD::VQMOVNs:
19003 case ARMISD::VQMOVNu:
19004 return PerformVQMOVNCombine(N, DCI);
19005 case ARMISD::VQDMULH:
19006 return PerformVQDMULHCombine(N, DCI);
19007 case ARMISD::ASRL:
19008 case ARMISD::LSRL:
19009 case ARMISD::LSLL:
19010 return PerformLongShiftCombine(N, DCI.DAG);
19011 case ARMISD::SMULWB: {
19012 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19013 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19014 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19015 return SDValue();
19016 break;
19017 }
19018 case ARMISD::SMULWT: {
19019 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19020 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19021 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19022 return SDValue();
19023 break;
19024 }
19025 case ARMISD::SMLALBB:
19026 case ARMISD::QADD16b:
19027 case ARMISD::QSUB16b:
19028 case ARMISD::UQADD16b:
19029 case ARMISD::UQSUB16b: {
19030 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19031 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19032 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19033 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19034 return SDValue();
19035 break;
19036 }
19037 case ARMISD::SMLALBT: {
19038 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19039 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19040 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19041 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19042 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19043 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19044 return SDValue();
19045 break;
19046 }
19047 case ARMISD::SMLALTB: {
19048 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19049 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19050 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19051 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19052 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19053 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19054 return SDValue();
19055 break;
19056 }
19057 case ARMISD::SMLALTT: {
19058 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19059 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19060 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19061 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19062 return SDValue();
19063 break;
19064 }
19065 case ARMISD::QADD8b:
19066 case ARMISD::QSUB8b:
19067 case ARMISD::UQADD8b:
19068 case ARMISD::UQSUB8b: {
19069 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19070 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19071 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19072 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19073 return SDValue();
19074 break;
19075 }
19076 case ARMISD::VBSP:
19077 if (N->getOperand(1) == N->getOperand(2))
19078 return N->getOperand(1);
19079 return SDValue();
19080 case ISD::INTRINSIC_VOID:
19081 case ISD::INTRINSIC_W_CHAIN:
19082 switch (N->getConstantOperandVal(1)) {
19083 case Intrinsic::arm_neon_vld1:
19084 case Intrinsic::arm_neon_vld1x2:
19085 case Intrinsic::arm_neon_vld1x3:
19086 case Intrinsic::arm_neon_vld1x4:
19087 case Intrinsic::arm_neon_vld2:
19088 case Intrinsic::arm_neon_vld3:
19089 case Intrinsic::arm_neon_vld4:
19090 case Intrinsic::arm_neon_vld2lane:
19091 case Intrinsic::arm_neon_vld3lane:
19092 case Intrinsic::arm_neon_vld4lane:
19093 case Intrinsic::arm_neon_vld2dup:
19094 case Intrinsic::arm_neon_vld3dup:
19095 case Intrinsic::arm_neon_vld4dup:
19096 case Intrinsic::arm_neon_vst1:
19097 case Intrinsic::arm_neon_vst1x2:
19098 case Intrinsic::arm_neon_vst1x3:
19099 case Intrinsic::arm_neon_vst1x4:
19100 case Intrinsic::arm_neon_vst2:
19101 case Intrinsic::arm_neon_vst3:
19102 case Intrinsic::arm_neon_vst4:
19103 case Intrinsic::arm_neon_vst2lane:
19104 case Intrinsic::arm_neon_vst3lane:
19105 case Intrinsic::arm_neon_vst4lane:
19106 return PerformVLDCombine(N, DCI);
19107 case Intrinsic::arm_mve_vld2q:
19108 case Intrinsic::arm_mve_vld4q:
19109 case Intrinsic::arm_mve_vst2q:
19110 case Intrinsic::arm_mve_vst4q:
19111 return PerformMVEVLDCombine(N, DCI);
19112 default: break;
19113 }
19114 break;
19115 }
19116 return SDValue();
19117}
19118
19119 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19120 EVT VT) const {
19121 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19122}
19123
19124 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19125 Align Alignment,
19126 MachineMemOperand::Flags,
19127 unsigned *Fast) const {
19128 // Depends on what it gets converted into if the type is weird.
19129 if (!VT.isSimple())
19130 return false;
19131
19132 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19133 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19134 auto Ty = VT.getSimpleVT().SimpleTy;
19135
19136 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19137 // Unaligned access can use (for example) LDRB, LDRH, LDR
19138 if (AllowsUnaligned) {
19139 if (Fast)
19140 *Fast = Subtarget->hasV7Ops();
19141 return true;
19142 }
19143 }
19144
19145 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19146 // For any little-endian targets with neon, we can support unaligned ld/st
19147 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19148 // A big-endian target may also explicitly support unaligned accesses
19149 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19150 if (Fast)
19151 *Fast = 1;
19152 return true;
19153 }
19154 }
19155
19156 if (!Subtarget->hasMVEIntegerOps())
19157 return false;
19158
19159 // These are for predicates
19160 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19161 Ty == MVT::v2i1)) {
19162 if (Fast)
19163 *Fast = 1;
19164 return true;
19165 }
19166
19167 // These are for truncated stores/narrowing loads. They are fine so long as
19168 // the alignment is at least the size of the item being loaded
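// For example, a v4i16 access only needs 2-byte alignment here, since each
// element loaded or stored is 16 bits wide.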
19169 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19170 Alignment >= VT.getScalarSizeInBits() / 8) {
19171 if (Fast)
19172 *Fast = true;
19173 return true;
19174 }
19175
19176 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19177 // VSTRW.U32 all store the vector register in exactly the same format, and
19178 // differ only in the range of their immediate offset field and the required
19179 // alignment. So there is always a store that can be used, regardless of
19180 // actual type.
19181 //
19182 // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
19183 // VREV64.8) pair and get the same effect. This will likely be better than
19184 // aligning the vector through the stack.
19185 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19186 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19187 Ty == MVT::v2f64) {
19188 if (Fast)
19189 *Fast = 1;
19190 return true;
19191 }
19192
19193 return false;
19194}
19195
19196 EVT ARMTargetLowering::getOptimalMemOpType(
19197 LLVMContext &Context, const MemOp &Op,
19198 const AttributeList &FuncAttributes) const {
19199 // See if we can use NEON instructions for this...
19200 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19201 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19202 unsigned Fast;
19203 if (Op.size() >= 16 &&
19204 (Op.isAligned(Align(16)) ||
19205 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19206 MachineMemOperand::MONone, &Fast) &&
19207 Fast))) {
19208 return MVT::v2f64;
19209 } else if (Op.size() >= 8 &&
19210 (Op.isAligned(Align(8)) ||
19211 (allowsMisalignedMemoryAccesses(
19212 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19213 Fast))) {
19214 return MVT::f64;
19215 }
19216 }
19217
19218 // Let the target-independent logic figure it out.
19219 return MVT::Other;
19220}
19221
19222// 64-bit integers are split into their high and low parts and held in two
19223// different registers, so the trunc is free since the low register can just
19224// be used.
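// For example, a trunc from i64 to i32 simply reads the register that already
// holds the low 32 bits, so no instruction is needed.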
19225bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19226 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19227 return false;
19228 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19229 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19230 return (SrcBits == 64 && DestBits == 32);
19231}
19232
19233 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19234 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19235 !DstVT.isInteger())
19236 return false;
19237 unsigned SrcBits = SrcVT.getSizeInBits();
19238 unsigned DestBits = DstVT.getSizeInBits();
19239 return (SrcBits == 64 && DestBits == 32);
19240}
19241
19242 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19243 if (Val.getOpcode() != ISD::LOAD)
19244 return false;
19245
19246 EVT VT1 = Val.getValueType();
19247 if (!VT1.isSimple() || !VT1.isInteger() ||
19248 !VT2.isSimple() || !VT2.isInteger())
19249 return false;
19250
19251 switch (VT1.getSimpleVT().SimpleTy) {
19252 default: break;
19253 case MVT::i1:
19254 case MVT::i8:
19255 case MVT::i16:
19256 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19257 return true;
19258 }
19259
19260 return false;
19261}
19262
19263 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19264 if (!VT.isSimple())
19265 return false;
19266
19267 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19268 // negate values directly (fneg is free). So, we don't want to let the DAG
19269 // combiner rewrite fneg into xors and some other instructions. For f16 and
19270 // FullFP16 argument passing, some bitcast nodes may be introduced,
19271 // triggering this DAG combine rewrite, so we are avoiding that with this.
19272 switch (VT.getSimpleVT().SimpleTy) {
19273 default: break;
19274 case MVT::f16:
19275 return Subtarget->hasFullFP16();
19276 }
19277
19278 return false;
19279}
19280
19281 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19282 if (!Subtarget->hasMVEIntegerOps())
19283 return nullptr;
19284 Type *SVIType = SVI->getType();
19285 Type *ScalarType = SVIType->getScalarType();
19286
19287 if (ScalarType->isFloatTy())
19288 return Type::getInt32Ty(SVIType->getContext());
19289 if (ScalarType->isHalfTy())
19290 return Type::getInt16Ty(SVIType->getContext());
19291 return nullptr;
19292}
19293
19294 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19295 EVT VT = ExtVal.getValueType();
19296
19297 if (!isTypeLegal(VT))
19298 return false;
19299
19300 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19301 if (Ld->isExpandingLoad())
19302 return false;
19303 }
19304
19305 if (Subtarget->hasMVEIntegerOps())
19306 return true;
19307
19308 // Don't create a loadext if we can fold the extension into a wide/long
19309 // instruction.
19310 // If there's more than one user instruction, the loadext is desirable no
19311 // matter what. There can be two uses by the same instruction.
19312 if (ExtVal->use_empty() ||
19313 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19314 return true;
19315
19316 SDNode *U = *ExtVal->user_begin();
19317 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19318 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19319 return false;
19320
19321 return true;
19322}
19323
19324 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19325 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19326 return false;
19327
19328 if (!isTypeLegal(EVT::getEVT(Ty1)))
19329 return false;
19330
19331 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19332
19333 // Assuming the caller doesn't have a zeroext or signext return parameter,
19334 // truncation all the way down to i1 is valid.
19335 return true;
19336}
19337
19338/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19339/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19340/// expanded to FMAs when this method returns true, otherwise fmuladd is
19341/// expanded to fmul + fadd.
19342///
19343/// ARM supports both fused and unfused multiply-add operations; we already
19344/// lower a pair of fmul and fadd to the latter so it's not clear that there
19345/// would be a gain or that the gain would be worthwhile enough to risk
19346/// correctness bugs.
19347///
19348/// For MVE, we set this to true as it helps simplify the need for some
19349/// patterns (and we don't have the non-fused floating point instruction).
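// For example, with MVE float ops a v4f32 fmuladd can be selected as a single
// fused VFMA rather than a separate multiply and add.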
19350bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19351 EVT VT) const {
19352 if (Subtarget->useSoftFloat())
19353 return false;
19354
19355 if (!VT.isSimple())
19356 return false;
19357
19358 switch (VT.getSimpleVT().SimpleTy) {
19359 case MVT::v4f32:
19360 case MVT::v8f16:
19361 return Subtarget->hasMVEFloatOps();
19362 case MVT::f16:
19363 return Subtarget->useFPVFMx16();
19364 case MVT::f32:
19365 return Subtarget->useFPVFMx();
19366 case MVT::f64:
19367 return Subtarget->useFPVFMx64();
19368 default:
19369 break;
19370 }
19371
19372 return false;
19373}
19374
19375static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19376 if (V < 0)
19377 return false;
19378
19379 unsigned Scale = 1;
19380 switch (VT.getSimpleVT().SimpleTy) {
19381 case MVT::i1:
19382 case MVT::i8:
19383 // Scale == 1;
19384 break;
19385 case MVT::i16:
19386 // Scale == 2;
19387 Scale = 2;
19388 break;
19389 default:
19390 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19391 // Scale == 4;
19392 Scale = 4;
19393 break;
19394 }
19395
19396 if ((V & (Scale - 1)) != 0)
19397 return false;
19398 return isUInt<5>(V / Scale);
19399}
19400
19401static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19402 const ARMSubtarget *Subtarget) {
19403 if (!VT.isInteger() && !VT.isFloatingPoint())
19404 return false;
19405 if (VT.isVector() && Subtarget->hasNEON())
19406 return false;
19407 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19408 !Subtarget->hasMVEFloatOps())
19409 return false;
19410
19411 bool IsNeg = false;
19412 if (V < 0) {
19413 IsNeg = true;
19414 V = -V;
19415 }
19416
19417 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19418
19419 // MVE: size * imm7
19420 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19421 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19422 case MVT::i32:
19423 case MVT::f32:
19424 return isShiftedUInt<7,2>(V);
19425 case MVT::i16:
19426 case MVT::f16:
19427 return isShiftedUInt<7,1>(V);
19428 case MVT::i8:
19429 return isUInt<7>(V);
19430 default:
19431 return false;
19432 }
19433 }
19434
19435 // half VLDR: 2 * imm8
19436 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19437 return isShiftedUInt<8, 1>(V);
19438 // VLDR and LDRD: 4 * imm8
19439 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19440 return isShiftedUInt<8, 2>(V);
19441
19442 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19443 // + imm12 or - imm8
19444 if (IsNeg)
19445 return isUInt<8>(V);
19446 return isUInt<12>(V);
19447 }
19448
19449 return false;
19450}
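// Worked examples for the checks above (annotation, not part of the upstream
// source):
//   MVE v4i32:            offsets must be a multiple of 4 below 512, so +508
//                         is accepted and +512 is rejected.
//   f16 with FPRegs16:    2 * imm8, i.e. offsets up to +510 in steps of 2.
//   scalar i32 (4 bytes): +imm12 (0..4095) when positive, -imm8 (down to
//                         -255) when negative.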
19451
19452/// isLegalAddressImmediate - Return true if the integer value can be used
19453/// as the offset of the target addressing mode for load / store of the
19454/// given type.
19455static bool isLegalAddressImmediate(int64_t V, EVT VT,
19456 const ARMSubtarget *Subtarget) {
19457 if (V == 0)
19458 return true;
19459
19460 if (!VT.isSimple())
19461 return false;
19462
19463 if (Subtarget->isThumb1Only())
19464 return isLegalT1AddressImmediate(V, VT);
19465 else if (Subtarget->isThumb2())
19466 return isLegalT2AddressImmediate(V, VT, Subtarget);
19467
19468 // ARM mode.
19469 if (V < 0)
19470 V = - V;
19471 switch (VT.getSimpleVT().SimpleTy) {
19472 default: return false;
19473 case MVT::i1:
19474 case MVT::i8:
19475 case MVT::i32:
19476 // +- imm12
19477 return isUInt<12>(V);
19478 case MVT::i16:
19479 // +- imm8
19480 return isUInt<8>(V);
19481 case MVT::f32:
19482 case MVT::f64:
19483 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19484 return false;
19485 return isShiftedUInt<8, 2>(V);
19486 }
19487}
19488
19489 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19490 EVT VT) const {
19491 int Scale = AM.Scale;
19492 if (Scale < 0)
19493 return false;
19494
19495 switch (VT.getSimpleVT().SimpleTy) {
19496 default: return false;
19497 case MVT::i1:
19498 case MVT::i8:
19499 case MVT::i16:
19500 case MVT::i32:
19501 if (Scale == 1)
19502 return true;
19503 // r + r << imm
19504 Scale = Scale & ~1;
19505 return Scale == 2 || Scale == 4 || Scale == 8;
19506 case MVT::i64:
19507 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19508 // version in Thumb mode.
19509 // r + r
19510 if (Scale == 1)
19511 return true;
19512 // r * 2 (this can be lowered to r + r).
19513 if (!AM.HasBaseReg && Scale == 2)
19514 return true;
19515 return false;
19516 case MVT::isVoid:
19517 // Note, we allow "void" uses (basically, uses that aren't loads or
19518 // stores), because arm allows folding a scale into many arithmetic
19519 // operations. This should be made more precise and revisited later.
19520
19521 // Allow r << imm, but the imm has to be a multiple of two.
19522 if (Scale & 1) return false;
19523 return isPowerOf2_32(Scale);
19524 }
19525}
19526
19527 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19528 EVT VT) const {
19529 const int Scale = AM.Scale;
19530
19531 // Negative scales are not supported in Thumb1.
19532 if (Scale < 0)
19533 return false;
19534
19535 // Thumb1 addressing modes do not support register scaling excepting the
19536 // following cases:
19537 // 1. Scale == 1 means no scaling.
19538 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19539 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19540}
19541
19542/// isLegalAddressingMode - Return true if the addressing mode represented
19543/// by AM is legal for this target, for a load/store of the specified type.
19544 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19545 const AddrMode &AM, Type *Ty,
19546 unsigned AS, Instruction *I) const {
19547 EVT VT = getValueType(DL, Ty, true);
19548 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19549 return false;
19550
19551 // Can never fold addr of global into load/store.
19552 if (AM.BaseGV)
19553 return false;
19554
19555 switch (AM.Scale) {
19556 case 0: // no scale reg, must be "r+i" or "r", or "i".
19557 break;
19558 default:
19559 // ARM doesn't support any R+R*scale+imm addr modes.
19560 if (AM.BaseOffs)
19561 return false;
19562
19563 if (!VT.isSimple())
19564 return false;
19565
19566 if (Subtarget->isThumb1Only())
19567 return isLegalT1ScaledAddressingMode(AM, VT);
19568
19569 if (Subtarget->isThumb2())
19570 return isLegalT2ScaledAddressingMode(AM, VT);
19571
19572 int Scale = AM.Scale;
19573 switch (VT.getSimpleVT().SimpleTy) {
19574 default: return false;
19575 case MVT::i1:
19576 case MVT::i8:
19577 case MVT::i32:
19578 if (Scale < 0) Scale = -Scale;
19579 if (Scale == 1)
19580 return true;
19581 // r + r << imm
19582 return isPowerOf2_32(Scale & ~1);
19583 case MVT::i16:
19584 case MVT::i64:
19585 // r +/- r
19586 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19587 return true;
19588 // r * 2 (this can be lowered to r + r).
19589 if (!AM.HasBaseReg && Scale == 2)
19590 return true;
19591 return false;
19592
19593 case MVT::isVoid:
19594 // Note, we allow "void" uses (basically, uses that aren't loads or
19595 // stores), because arm allows folding a scale into many arithmetic
19596 // operations. This should be made more precise and revisited later.
19597
19598 // Allow r << imm, but the imm has to be a multiple of two.
19599 if (Scale & 1) return false;
19600 return isPowerOf2_32(Scale);
19601 }
19602 }
19603 return true;
19604}
19605
19606/// isLegalICmpImmediate - Return true if the specified immediate is legal
19607/// icmp immediate, that is the target has icmp instructions which can compare
19608/// a register against the immediate without having to materialize the
19609/// immediate into a register.
19610 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19611 // Thumb2 and ARM modes can use cmn for negative immediates.
19612 if (!Subtarget->isThumb())
19613 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19614 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19615 if (Subtarget->isThumb2())
19616 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19617 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19618 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19619 return Imm >= 0 && Imm <= 255;
19620}
19621
19622/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19623/// *or sub* immediate, that is the target has add or sub instructions which can
19624/// add a register with the immediate without having to materialize the
19625/// immediate into a register.
19626 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19627 // Same encoding for add/sub, just flip the sign.
19628 uint64_t AbsImm = AbsoluteValue(Imm);
19629 if (!Subtarget->isThumb())
19630 return ARM_AM::getSOImmVal(AbsImm) != -1;
19631 if (Subtarget->isThumb2())
19632 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19633 // Thumb1 only has 8-bit unsigned immediate.
19634 return AbsImm <= 255;
19635}
19636
19637// Return false to prevent folding
19638// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19639// if the folding leads to worse code.
19640 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19641 SDValue ConstNode) const {
19642 // Let the DAGCombiner decide for vector types and large types.
19643 const EVT VT = AddNode.getValueType();
19644 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19645 return true;
19646
19647 // It is worse if c0 is legal add immediate, while c1*c0 is not
19648 // and has to be composed by at least two instructions.
19649 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19650 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19651 const int64_t C0 = C0Node->getSExtValue();
19652 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19653 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19654 return true;
19655 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19656 return false;
19657
19658 // Default to true and let the DAGCombiner decide.
19659 return true;
19660}
19661
19662 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19663 bool isSEXTLoad, SDValue &Base,
19664 SDValue &Offset, bool &isInc,
19665 SelectionDAG &DAG) {
19666 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19667 return false;
19668
19669 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19670 // AddressingMode 3
19671 Base = Ptr->getOperand(0);
19672 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19673 int RHSC = (int)RHS->getZExtValue();
19674 if (RHSC < 0 && RHSC > -256) {
19675 assert(Ptr->getOpcode() == ISD::ADD);
19676 isInc = false;
19677 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19678 return true;
19679 }
19680 }
19681 isInc = (Ptr->getOpcode() == ISD::ADD);
19682 Offset = Ptr->getOperand(1);
19683 return true;
19684 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19685 // AddressingMode 2
19686 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19687 int RHSC = (int)RHS->getZExtValue();
19688 if (RHSC < 0 && RHSC > -0x1000) {
19689 assert(Ptr->getOpcode() == ISD::ADD);
19690 isInc = false;
19691 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19692 Base = Ptr->getOperand(0);
19693 return true;
19694 }
19695 }
19696
19697 if (Ptr->getOpcode() == ISD::ADD) {
19698 isInc = true;
19699 ARM_AM::ShiftOpc ShOpcVal =
19700 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19701 if (ShOpcVal != ARM_AM::no_shift) {
19702 Base = Ptr->getOperand(1);
19703 Offset = Ptr->getOperand(0);
19704 } else {
19705 Base = Ptr->getOperand(0);
19706 Offset = Ptr->getOperand(1);
19707 }
19708 return true;
19709 }
19710
19711 isInc = (Ptr->getOpcode() == ISD::ADD);
19712 Base = Ptr->getOperand(0);
19713 Offset = Ptr->getOperand(1);
19714 return true;
19715 }
19716
19717 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19718 return false;
19719}
19720
19721 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19722 bool isSEXTLoad, SDValue &Base,
19723 SDValue &Offset, bool &isInc,
19724 SelectionDAG &DAG) {
19725 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19726 return false;
19727
19728 Base = Ptr->getOperand(0);
19729 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19730 int RHSC = (int)RHS->getZExtValue();
19731 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19732 assert(Ptr->getOpcode() == ISD::ADD);
19733 isInc = false;
19734 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19735 return true;
19736 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19737 isInc = Ptr->getOpcode() == ISD::ADD;
19738 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19739 return true;
19740 }
19741 }
19742
19743 return false;
19744}
19745
19746static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19747 bool isSEXTLoad, bool IsMasked, bool isLE,
19748 SDValue &Base, SDValue &Offset,
19749 bool &isInc, SelectionDAG &DAG) {
19750 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19751 return false;
19752 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19753 return false;
19754
19755 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19756 // as opposed to a vldrw.32). This can allow extra addressing modes or
19757 // alignments for what is otherwise an equivalent instruction.
19758 bool CanChangeType = isLE && !IsMasked;
19759
19760 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19761 int RHSC = (int)RHS->getZExtValue();
19762
19763 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19764 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19765 assert(Ptr->getOpcode() == ISD::ADD);
19766 isInc = false;
19767 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19768 return true;
19769 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19770 isInc = Ptr->getOpcode() == ISD::ADD;
19771 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19772 return true;
19773 }
19774 return false;
19775 };
19776
19777 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19778 // (in BE/masked) type.
19779 Base = Ptr->getOperand(0);
19780 if (VT == MVT::v4i16) {
19781 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19782 return true;
19783 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19784 if (IsInRange(RHSC, 0x80, 1))
19785 return true;
19786 } else if (Alignment >= 4 &&
19787 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19788 IsInRange(RHSC, 0x80, 4))
19789 return true;
19790 else if (Alignment >= 2 &&
19791 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19792 IsInRange(RHSC, 0x80, 2))
19793 return true;
19794 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19795 return true;
19796 return false;
19797}
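// Example of what the helper above matches (annotation, not part of the
// upstream source): a v4i32 load whose pointer is "base + 64" can become a
// pre/post-indexed vldrw.u32 with writeback, because 64 is a multiple of 4
// and below the 7-bit scaled limit of 508; an offset of 514 would be
// rejected.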
19798
19799/// getPreIndexedAddressParts - returns true by value, base pointer and
19800/// offset pointer and addressing mode by reference if the node's address
19801/// can be legally represented as pre-indexed load / store address.
19802bool
19803 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19804 SDValue &Offset,
19805 ISD::MemIndexedMode &AM,
19806 SelectionDAG &DAG) const {
19807 if (Subtarget->isThumb1Only())
19808 return false;
19809
19810 EVT VT;
19811 SDValue Ptr;
19812 Align Alignment;
19813 bool isSEXTLoad = false;
19814 bool IsMasked = false;
19815 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19816 Ptr = LD->getBasePtr();
19817 VT = LD->getMemoryVT();
19818 Alignment = LD->getAlign();
19819 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19820 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19821 Ptr = ST->getBasePtr();
19822 VT = ST->getMemoryVT();
19823 Alignment = ST->getAlign();
19824 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19825 Ptr = LD->getBasePtr();
19826 VT = LD->getMemoryVT();
19827 Alignment = LD->getAlign();
19828 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19829 IsMasked = true;
19830 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19831 Ptr = ST->getBasePtr();
19832 VT = ST->getMemoryVT();
19833 Alignment = ST->getAlign();
19834 IsMasked = true;
19835 } else
19836 return false;
19837
19838 bool isInc;
19839 bool isLegal = false;
19840 if (VT.isVector())
19841 isLegal = Subtarget->hasMVEIntegerOps() &&
19842 getMVEIndexedAddressParts(
19843 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19844 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19845 else {
19846 if (Subtarget->isThumb2())
19847 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19848 Offset, isInc, DAG);
19849 else
19850 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19851 Offset, isInc, DAG);
19852 }
19853 if (!isLegal)
19854 return false;
19855
19856 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19857 return true;
19858}
19859
19860/// getPostIndexedAddressParts - returns true by value, base pointer and
19861/// offset pointer and addressing mode by reference if this node can be
19862/// combined with a load / store to form a post-indexed load / store.
19863 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19864 SDValue &Base,
19865 SDValue &Offset,
19866 ISD::MemIndexedMode &AM,
19867 SelectionDAG &DAG) const {
19868 EVT VT;
19869 SDValue Ptr;
19870 Align Alignment;
19871 bool isSEXTLoad = false, isNonExt;
19872 bool IsMasked = false;
19873 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19874 VT = LD->getMemoryVT();
19875 Ptr = LD->getBasePtr();
19876 Alignment = LD->getAlign();
19877 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19878 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19879 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19880 VT = ST->getMemoryVT();
19881 Ptr = ST->getBasePtr();
19882 Alignment = ST->getAlign();
19883 isNonExt = !ST->isTruncatingStore();
19884 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19885 VT = LD->getMemoryVT();
19886 Ptr = LD->getBasePtr();
19887 Alignment = LD->getAlign();
19888 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19889 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19890 IsMasked = true;
19891 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19892 VT = ST->getMemoryVT();
19893 Ptr = ST->getBasePtr();
19894 Alignment = ST->getAlign();
19895 isNonExt = !ST->isTruncatingStore();
19896 IsMasked = true;
19897 } else
19898 return false;
19899
19900 if (Subtarget->isThumb1Only()) {
19901 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19902 // must be non-extending/truncating, i32, with an offset of 4.
19903 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19904 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19905 return false;
19906 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19907 if (!RHS || RHS->getZExtValue() != 4)
19908 return false;
19909 if (Alignment < Align(4))
19910 return false;
19911
19912 Offset = Op->getOperand(1);
19913 Base = Op->getOperand(0);
19914 AM = ISD::POST_INC;
19915 return true;
19916 }
19917
19918 bool isInc;
19919 bool isLegal = false;
19920 if (VT.isVector())
19921 isLegal = Subtarget->hasMVEIntegerOps() &&
19922 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19923 Subtarget->isLittle(), Base, Offset,
19924 isInc, DAG);
19925 else {
19926 if (Subtarget->isThumb2())
19927 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19928 isInc, DAG);
19929 else
19930 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19931 isInc, DAG);
19932 }
19933 if (!isLegal)
19934 return false;
19935
19936 if (Ptr != Base) {
19937 // Swap base ptr and offset to catch more post-index load / store when
19938 // it's legal. In Thumb2 mode, offset must be an immediate.
19939 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19940 !Subtarget->isThumb2())
19941 std::swap(Base, Offset);
19942
19943 // Post-indexed load / store update the base pointer.
19944 if (Ptr != Base)
19945 return false;
19946 }
19947
19948 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19949 return true;
19950}
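// Illustrative result (annotation, not part of the upstream source): given a
// load of i32 from %a followed by an increment of %a by 32 bytes, the two can
// be combined into a single post-indexed access, roughly
// "ldr r0, [r1], #32", with the base register updated in place.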
19951
19952 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19953 KnownBits &Known,
19954 const APInt &DemandedElts,
19955 const SelectionDAG &DAG,
19956 unsigned Depth) const {
19957 unsigned BitWidth = Known.getBitWidth();
19958 Known.resetAll();
19959 switch (Op.getOpcode()) {
19960 default: break;
19961 case ARMISD::ADDC:
19962 case ARMISD::ADDE:
19963 case ARMISD::SUBC:
19964 case ARMISD::SUBE:
19965 // Special cases when we convert a carry to a boolean.
19966 if (Op.getResNo() == 0) {
19967 SDValue LHS = Op.getOperand(0);
19968 SDValue RHS = Op.getOperand(1);
19969 // (ADDE 0, 0, C) will give us a single bit.
19970 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19971 isNullConstant(RHS)) {
19972 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19973 return;
19974 }
19975 }
19976 break;
19977 case ARMISD::CMOV: {
19978 // Bits are known zero/one if known on the LHS and RHS.
19979 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19980 if (Known.isUnknown())
19981 return;
19982
19983 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19984 Known = Known.intersectWith(KnownRHS);
19985 return;
19986 }
19987 case ISD::INTRINSIC_W_CHAIN: {
19988 Intrinsic::ID IntID =
19989 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19990 switch (IntID) {
19991 default: return;
19992 case Intrinsic::arm_ldaex:
19993 case Intrinsic::arm_ldrex: {
19994 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19995 unsigned MemBits = VT.getScalarSizeInBits();
19996 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19997 return;
19998 }
19999 }
20000 }
20001 case ARMISD::BFI: {
20002 // Conservatively, we can recurse down the first operand
20003 // and just mask out all affected bits.
20004 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20005
20006 // The operand to BFI is already a mask suitable for removing the bits it
20007 // sets.
20008 const APInt &Mask = Op.getConstantOperandAPInt(2);
20009 Known.Zero &= Mask;
20010 Known.One &= Mask;
20011 return;
20012 }
20013 case ARMISD::VGETLANEs:
20014 case ARMISD::VGETLANEu: {
20015 const SDValue &SrcSV = Op.getOperand(0);
20016 EVT VecVT = SrcSV.getValueType();
20017 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20018 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20019 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20020 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20021 "VGETLANE index out of bounds");
20022 unsigned Idx = Pos->getZExtValue();
20023 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20024 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20025
20026 EVT VT = Op.getValueType();
20027 const unsigned DstSz = VT.getScalarSizeInBits();
20028 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20029 (void)SrcSz;
20030 assert(SrcSz == Known.getBitWidth());
20031 assert(DstSz > SrcSz);
20032 if (Op.getOpcode() == ARMISD::VGETLANEs)
20033 Known = Known.sext(DstSz);
20034 else {
20035 Known = Known.zext(DstSz);
20036 }
20037 assert(DstSz == Known.getBitWidth());
20038 break;
20039 }
20040 case ARMISD::VMOVrh: {
20041 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20042 assert(KnownOp.getBitWidth() == 16);
20043 Known = KnownOp.zext(32);
20044 break;
20045 }
20046 case ARMISD::CSINC:
20047 case ARMISD::CSINV:
20048 case ARMISD::CSNEG: {
20049 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20050 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20051
20052 // The result is either:
20053 // CSINC: KnownOp0 or KnownOp1 + 1
20054 // CSINV: KnownOp0 or ~KnownOp1
20055 // CSNEG: KnownOp0 or KnownOp1 * -1
20056 if (Op.getOpcode() == ARMISD::CSINC)
20057 KnownOp1 =
20058 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20059 else if (Op.getOpcode() == ARMISD::CSINV)
20060 std::swap(KnownOp1.Zero, KnownOp1.One);
20061 else if (Op.getOpcode() == ARMISD::CSNEG)
20062 KnownOp1 = KnownBits::mul(KnownOp1,
20063 KnownBits::makeConstant(APInt(32, -1)));
20064
20065 Known = KnownOp0.intersectWith(KnownOp1);
20066 break;
20067 }
20068 }
20069}
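// Worked example for the CMOV case above (annotation, not part of the
// upstream source): if one arm of the select is known to be 0x0000000F and
// the other 0x00000003, the intersection leaves the top 28 bits known zero
// and the low two bits known one; bits 2 and 3 become unknown because the
// two arms disagree there.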
20070
20072 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20073 TargetLoweringOpt &TLO) const {
20074 // Delay optimization, so we don't have to deal with illegal types, or block
20075 // optimizations.
20076 if (!TLO.LegalOps)
20077 return false;
20078
20079 // Only optimize AND for now.
20080 if (Op.getOpcode() != ISD::AND)
20081 return false;
20082
20083 EVT VT = Op.getValueType();
20084
20085 // Ignore vectors.
20086 if (VT.isVector())
20087 return false;
20088
20089 assert(VT == MVT::i32 && "Unexpected integer type");
20090
20091 // Make sure the RHS really is a constant.
20092 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20093 if (!C)
20094 return false;
20095
20096 unsigned Mask = C->getZExtValue();
20097
20098 unsigned Demanded = DemandedBits.getZExtValue();
20099 unsigned ShrunkMask = Mask & Demanded;
20100 unsigned ExpandedMask = Mask | ~Demanded;
20101
20102 // If the mask is all zeros, let the target-independent code replace the
20103 // result with zero.
20104 if (ShrunkMask == 0)
20105 return false;
20106
20107 // If the mask is all ones, erase the AND. (Currently, the target-independent
20108 // code won't do this, so we have to do it explicitly to avoid an infinite
20109 // loop in obscure cases.)
20110 if (ExpandedMask == ~0U)
20111 return TLO.CombineTo(Op, Op.getOperand(0));
20112
20113 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20114 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20115 };
20116 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20117 if (NewMask == Mask)
20118 return true;
20119 SDLoc DL(Op);
20120 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20121 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20122 return TLO.CombineTo(Op, NewOp);
20123 };
20124
20125 // Prefer uxtb mask.
20126 if (IsLegalMask(0xFF))
20127 return UseMask(0xFF);
20128
20129 // Prefer uxth mask.
20130 if (IsLegalMask(0xFFFF))
20131 return UseMask(0xFFFF);
20132
20133 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20134 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20135 if (ShrunkMask < 256)
20136 return UseMask(ShrunkMask);
20137
20138 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20139 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20140 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20141 return UseMask(ExpandedMask);
20142
20143 // Potential improvements:
20144 //
20145 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20146 // We could try to prefer Thumb1 immediates which can be lowered to a
20147 // two-instruction sequence.
20148 // We could try to recognize more legal ARM/Thumb2 immediates here.
20149
20150 return false;
20151}
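// Worked example (annotation, not part of the upstream source): for
// "and r0, r0, #0x1FF" where only the low 8 bits are demanded, the constant
// shrinks to 0xFF, which selects to a single uxtb instead of materializing
// 0x1FF; the analogous 0xFFFF case becomes uxth.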
20152
20153 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20154 SDValue Op, const APInt &OriginalDemandedBits,
20155 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20156 unsigned Depth) const {
20157 unsigned Opc = Op.getOpcode();
20158
20159 switch (Opc) {
20160 case ARMISD::ASRL:
20161 case ARMISD::LSRL: {
20162 // If this is result 0 and the other result is unused, see if the demand
20163 // bits allow us to shrink this long shift into a standard small shift in
20164 // the opposite direction.
20165 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20166 isa<ConstantSDNode>(Op->getOperand(2))) {
20167 unsigned ShAmt = Op->getConstantOperandVal(2);
20168 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20169 << (32 - ShAmt)))
20170 return TLO.CombineTo(
20171 Op, TLO.DAG.getNode(
20172 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20173 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20174 }
20175 break;
20176 }
20177 case ARMISD::VBICIMM: {
20178 SDValue Op0 = Op.getOperand(0);
20179 unsigned ModImm = Op.getConstantOperandVal(1);
20180 unsigned EltBits = 0;
20181 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20182 if ((OriginalDemandedBits & Mask) == 0)
20183 return TLO.CombineTo(Op, Op0);
20184 }
20185 }
20186
20187 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20188 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20189}
20190
20191//===----------------------------------------------------------------------===//
20192// ARM Inline Assembly Support
20193//===----------------------------------------------------------------------===//
20194
20195 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
20196 // Looking for "rev" which is V6+.
20197 if (!Subtarget->hasV6Ops())
20198 return false;
20199
20200 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
20201 StringRef AsmStr = IA->getAsmString();
20202 SmallVector<StringRef, 4> AsmPieces;
20203 SplitString(AsmStr, AsmPieces, ";\n");
20204
20205 switch (AsmPieces.size()) {
20206 default: return false;
20207 case 1:
20208 AsmStr = AsmPieces[0];
20209 AsmPieces.clear();
20210 SplitString(AsmStr, AsmPieces, " \t,");
20211
20212 // rev $0, $1
20213 if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" &&
20214 AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
20215 IA->getConstraintString().starts_with("=l,l")) {
20216 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
20217 if (Ty && Ty->getBitWidth() == 32)
20218 return IntrinsicLowering::LowerToByteSwap(CI);
20219 }
20220 break;
20221 }
20222
20223 return false;
20224}
20225
20226const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20227 // At this point, we have to lower this constraint to something else, so we
20228 // lower it to an "r" or "w". However, by doing this we will force the result
20229 // to be in register, while the X constraint is much more permissive.
20230 //
20231 // Although we are correct (we are free to emit anything, without
20232 // constraints), we might break use cases that would expect us to be more
20233 // efficient and emit something else.
20234 if (!Subtarget->hasVFP2Base())
20235 return "r";
20236 if (ConstraintVT.isFloatingPoint())
20237 return "w";
20238 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20239 (ConstraintVT.getSizeInBits() == 64 ||
20240 ConstraintVT.getSizeInBits() == 128))
20241 return "w";
20242
20243 return "r";
20244}
20245
20246/// getConstraintType - Given a constraint letter, return the type of
20247/// constraint it is for this target.
20248 ARMTargetLowering::ConstraintType
20249 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20250 unsigned S = Constraint.size();
20251 if (S == 1) {
20252 switch (Constraint[0]) {
20253 default: break;
20254 case 'l': return C_RegisterClass;
20255 case 'w': return C_RegisterClass;
20256 case 'h': return C_RegisterClass;
20257 case 'x': return C_RegisterClass;
20258 case 't': return C_RegisterClass;
20259 case 'j': return C_Immediate; // Constant for movw.
20260 // An address with a single base register. Due to the way we
20261 // currently handle addresses it is the same as an 'r' memory constraint.
20262 case 'Q': return C_Memory;
20263 }
20264 } else if (S == 2) {
20265 switch (Constraint[0]) {
20266 default: break;
20267 case 'T': return C_RegisterClass;
20268 // All 'U+' constraints are addresses.
20269 case 'U': return C_Memory;
20270 }
20271 }
20272 return TargetLowering::getConstraintType(Constraint);
20273}
20274
20275/// Examine constraint type and operand type and determine a weight value.
20276/// This object must already have been set up with the operand type
20277/// and the current alternative constraint selected.
20278 TargetLowering::ConstraintWeight
20279 ARMTargetLowering::getSingleConstraintMatchWeight(
20280 AsmOperandInfo &info, const char *constraint) const {
20281 ConstraintWeight weight = CW_Invalid;
20282 Value *CallOperandVal = info.CallOperandVal;
20283 // If we don't have a value, we can't do a match,
20284 // but allow it at the lowest weight.
20285 if (!CallOperandVal)
20286 return CW_Default;
20287 Type *type = CallOperandVal->getType();
20288 // Look at the constraint type.
20289 switch (*constraint) {
20290 default:
20291 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20292 break;
20293 case 'l':
20294 if (type->isIntegerTy()) {
20295 if (Subtarget->isThumb())
20296 weight = CW_SpecificReg;
20297 else
20298 weight = CW_Register;
20299 }
20300 break;
20301 case 'w':
20302 if (type->isFloatingPointTy())
20303 weight = CW_Register;
20304 break;
20305 }
20306 return weight;
20307}
20308
20309static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20310 if (PR == 0 || VT == MVT::Other)
20311 return false;
20312 return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
20313 (ARM::DPRRegClass.contains(PR) && VT != MVT::f64 &&
20314 !VT.is64BitVector());
20315}
20316
20317using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20318
20319 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20320 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20321 switch (Constraint.size()) {
20322 case 1:
20323 // GCC ARM Constraint Letters
20324 switch (Constraint[0]) {
20325 case 'l': // Low regs or general regs.
20326 if (Subtarget->isThumb())
20327 return RCPair(0U, &ARM::tGPRRegClass);
20328 return RCPair(0U, &ARM::GPRRegClass);
20329 case 'h': // High regs or no regs.
20330 if (Subtarget->isThumb())
20331 return RCPair(0U, &ARM::hGPRRegClass);
20332 break;
20333 case 'r':
20334 if (Subtarget->isThumb1Only())
20335 return RCPair(0U, &ARM::tGPRRegClass);
20336 return RCPair(0U, &ARM::GPRRegClass);
20337 case 'w':
20338 if (VT == MVT::Other)
20339 break;
20340 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20341 return RCPair(0U, &ARM::SPRRegClass);
20342 if (VT.getSizeInBits() == 64)
20343 return RCPair(0U, &ARM::DPRRegClass);
20344 if (VT.getSizeInBits() == 128)
20345 return RCPair(0U, &ARM::QPRRegClass);
20346 break;
20347 case 'x':
20348 if (VT == MVT::Other)
20349 break;
20350 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20351 return RCPair(0U, &ARM::SPR_8RegClass);
20352 if (VT.getSizeInBits() == 64)
20353 return RCPair(0U, &ARM::DPR_8RegClass);
20354 if (VT.getSizeInBits() == 128)
20355 return RCPair(0U, &ARM::QPR_8RegClass);
20356 break;
20357 case 't':
20358 if (VT == MVT::Other)
20359 break;
20360 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20361 return RCPair(0U, &ARM::SPRRegClass);
20362 if (VT.getSizeInBits() == 64)
20363 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20364 if (VT.getSizeInBits() == 128)
20365 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20366 break;
20367 }
20368 break;
20369
20370 case 2:
20371 if (Constraint[0] == 'T') {
20372 switch (Constraint[1]) {
20373 default:
20374 break;
20375 case 'e':
20376 return RCPair(0U, &ARM::tGPREvenRegClass);
20377 case 'o':
20378 return RCPair(0U, &ARM::tGPROddRegClass);
20379 }
20380 }
20381 break;
20382
20383 default:
20384 break;
20385 }
20386
20387 if (StringRef("{cc}").equals_insensitive(Constraint))
20388 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20389
20390 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20391 if (isIncompatibleReg(RCP.first, VT))
20392 return {0, nullptr};
20393 return RCP;
20394}
20395
20396/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20397/// vector. If it is invalid, don't add anything to Ops.
20398 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20399 StringRef Constraint,
20400 std::vector<SDValue> &Ops,
20401 SelectionDAG &DAG) const {
20402 SDValue Result;
20403
20404 // Currently only support length 1 constraints.
20405 if (Constraint.size() != 1)
20406 return;
20407
20408 char ConstraintLetter = Constraint[0];
20409 switch (ConstraintLetter) {
20410 default: break;
20411 case 'j':
20412 case 'I': case 'J': case 'K': case 'L':
20413 case 'M': case 'N': case 'O':
20414 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20415 if (!C)
20416 return;
20417
20418 int64_t CVal64 = C->getSExtValue();
20419 int CVal = (int) CVal64;
20420 // None of these constraints allow values larger than 32 bits. Check
20421 // that the value fits in an int.
20422 if (CVal != CVal64)
20423 return;
20424
20425 switch (ConstraintLetter) {
20426 case 'j':
20427 // Constant suitable for movw, must be between 0 and
20428 // 65535.
20429 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20430 if (CVal >= 0 && CVal <= 65535)
20431 break;
20432 return;
20433 case 'I':
20434 if (Subtarget->isThumb1Only()) {
20435 // This must be a constant between 0 and 255, for ADD
20436 // immediates.
20437 if (CVal >= 0 && CVal <= 255)
20438 break;
20439 } else if (Subtarget->isThumb2()) {
20440 // A constant that can be used as an immediate value in a
20441 // data-processing instruction.
20442 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20443 break;
20444 } else {
20445 // A constant that can be used as an immediate value in a
20446 // data-processing instruction.
20447 if (ARM_AM::getSOImmVal(CVal) != -1)
20448 break;
20449 }
20450 return;
20451
20452 case 'J':
20453 if (Subtarget->isThumb1Only()) {
20454 // This must be a constant between -255 and -1, for negated ADD
20455 // immediates. This can be used in GCC with an "n" modifier that
20456 // prints the negated value, for use with SUB instructions. It is
20457 // not useful otherwise but is implemented for compatibility.
20458 if (CVal >= -255 && CVal <= -1)
20459 break;
20460 } else {
20461 // This must be a constant between -4095 and 4095. It is not clear
20462 // what this constraint is intended for. Implemented for
20463 // compatibility with GCC.
20464 if (CVal >= -4095 && CVal <= 4095)
20465 break;
20466 }
20467 return;
20468
20469 case 'K':
20470 if (Subtarget->isThumb1Only()) {
20471 // A 32-bit value where only one byte has a nonzero value. Exclude
20472 // zero to match GCC. This constraint is used by GCC internally for
20473 // constants that can be loaded with a move/shift combination.
20474 // It is not useful otherwise but is implemented for compatibility.
20475 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20476 break;
20477 } else if (Subtarget->isThumb2()) {
20478 // A constant whose bitwise inverse can be used as an immediate
20479 // value in a data-processing instruction. This can be used in GCC
20480 // with a "B" modifier that prints the inverted value, for use with
20481 // BIC and MVN instructions. It is not useful otherwise but is
20482 // implemented for compatibility.
20483 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20484 break;
20485 } else {
20486 // A constant whose bitwise inverse can be used as an immediate
20487 // value in a data-processing instruction. This can be used in GCC
20488 // with a "B" modifier that prints the inverted value, for use with
20489 // BIC and MVN instructions. It is not useful otherwise but is
20490 // implemented for compatibility.
20491 if (ARM_AM::getSOImmVal(~CVal) != -1)
20492 break;
20493 }
20494 return;
20495
20496 case 'L':
20497 if (Subtarget->isThumb1Only()) {
20498 // This must be a constant between -7 and 7,
20499 // for 3-operand ADD/SUB immediate instructions.
20500 if (CVal >= -7 && CVal < 7)
20501 break;
20502 } else if (Subtarget->isThumb2()) {
20503 // A constant whose negation can be used as an immediate value in a
20504 // data-processing instruction. This can be used in GCC with an "n"
20505 // modifier that prints the negated value, for use with SUB
20506 // instructions. It is not useful otherwise but is implemented for
20507 // compatibility.
20508 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20509 break;
20510 } else {
20511 // A constant whose negation can be used as an immediate value in a
20512 // data-processing instruction. This can be used in GCC with an "n"
20513 // modifier that prints the negated value, for use with SUB
20514 // instructions. It is not useful otherwise but is implemented for
20515 // compatibility.
20516 if (ARM_AM::getSOImmVal(-CVal) != -1)
20517 break;
20518 }
20519 return;
20520
20521 case 'M':
20522 if (Subtarget->isThumb1Only()) {
20523 // This must be a multiple of 4 between 0 and 1020, for
20524 // ADD sp + immediate.
20525 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20526 break;
20527 } else {
20528 // A power of two or a constant between 0 and 32. This is used in
20529 // GCC for the shift amount on shifted register operands, but it is
20530 // useful in general for any shift amounts.
20531 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20532 break;
20533 }
20534 return;
20535
20536 case 'N':
20537 if (Subtarget->isThumb1Only()) {
20538 // This must be a constant between 0 and 31, for shift amounts.
20539 if (CVal >= 0 && CVal <= 31)
20540 break;
20541 }
20542 return;
20543
20544 case 'O':
20545 if (Subtarget->isThumb1Only()) {
20546 // This must be a multiple of 4 between -508 and 508, for
20547 // ADD/SUB sp = sp + immediate.
20548 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20549 break;
20550 }
20551 return;
20552 }
20553 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20554 break;
20555 }
20556
20557 if (Result.getNode()) {
20558 Ops.push_back(Result);
20559 return;
20560 }
20561 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20562}
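// Example of the constraint letters handled above (annotation, not part of
// the upstream source): on a Thumb1 subtarget,
//   asm("" :: "I"(200));   // accepted, 0..255
//   asm("" :: "I"(300));   // rejected, out of range, no operand is emitted
// The constant is only added to Ops when it satisfies the range check for
// the current subtarget; otherwise constraint lowering fails.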
20563
20564static RTLIB::Libcall getDivRemLibcall(
20565 const SDNode *N, MVT::SimpleValueType SVT) {
20566 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20567 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20568 "Unhandled Opcode in getDivRemLibcall");
20569 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20570 N->getOpcode() == ISD::SREM;
20571 RTLIB::Libcall LC;
20572 switch (SVT) {
20573 default: llvm_unreachable("Unexpected request for libcall!");
20574 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20575 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20576 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20577 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20578 }
20579 return LC;
20580}
20581
20583 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20584 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20585 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20586 "Unhandled Opcode in getDivRemArgList");
20587 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20588 N->getOpcode() == ISD::SREM;
20589 TargetLowering::ArgListTy Args;
20590 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20591 EVT ArgVT = N->getOperand(i).getValueType();
20592 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20593 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20594 Entry.IsSExt = isSigned;
20595 Entry.IsZExt = !isSigned;
20596 Args.push_back(Entry);
20597 }
20598 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20599 std::swap(Args[0], Args[1]);
20600 return Args;
20601}
20602
20603SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20604 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20605 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20606 Subtarget->isTargetWindows()) &&
20607 "Register-based DivRem lowering only");
20608 unsigned Opcode = Op->getOpcode();
20609 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20610 "Invalid opcode for Div/Rem lowering");
20611 bool isSigned = (Opcode == ISD::SDIVREM);
20612 EVT VT = Op->getValueType(0);
20613 SDLoc dl(Op);
20614
20615 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20616 SmallVector<SDValue> Result;
20617 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20618 SDValue Res0 =
20619 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20620 SDValue Res1 =
20621 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20622 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20623 {Res0, Res1});
20624 }
20625 }
20626
20627 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20628
20629 // If the target has hardware divide, use divide + multiply + subtract:
20630 // div = a / b
20631 // rem = a - b * div
20632 // return {div, rem}
20633 // This should be lowered into UDIV/SDIV + MLS later on.
20634 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20635 : Subtarget->hasDivideInARMMode();
20636 if (hasDivide && Op->getValueType(0).isSimple() &&
20637 Op->getSimpleValueType(0) == MVT::i32) {
20638 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20639 const SDValue Dividend = Op->getOperand(0);
20640 const SDValue Divisor = Op->getOperand(1);
20641 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20642 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20643 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20644
20645 SDValue Values[2] = {Div, Rem};
20646 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20647 }
20648
20649 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20650 VT.getSimpleVT().SimpleTy);
20651 SDValue InChain = DAG.getEntryNode();
20652
20654 DAG.getContext(),
20655 Subtarget);
20656
20659
20660 Type *RetTy = StructType::get(Ty, Ty);
20661
20662 if (Subtarget->isTargetWindows())
20663 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20664
20665 TargetLowering::CallLoweringInfo CLI(DAG);
20666 CLI.setDebugLoc(dl).setChain(InChain)
20667 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20668 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20669
20670 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20671 return CallInfo.first;
20672}
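// Illustrative lowering (annotation, not part of the upstream source): on an
// AEABI target without hardware divide, an i32 {sdiv, srem} pair becomes a
// single call to the RTABI divmod helper (e.g. __aeabi_idivmod), which
// returns quotient and remainder together; with hardware divide it becomes
// sdiv followed by mls instead, as noted in the comment above.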
20673
20674// Lowers REM using divmod helpers
20675// see RTABI section 4.2/4.3
20676SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20677 EVT VT = N->getValueType(0);
20678
20679 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20680 SmallVector<SDValue> Result;
20681 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20682 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20683 Result[0], Result[1]);
20684 }
20685
20686 // Build return types (div and rem)
20687 std::vector<Type*> RetTyParams;
20688 Type *RetTyElement;
20689
20690 switch (VT.getSimpleVT().SimpleTy) {
20691 default: llvm_unreachable("Unexpected request for libcall!");
20692 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20693 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20694 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20695 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20696 }
20697
20698 RetTyParams.push_back(RetTyElement);
20699 RetTyParams.push_back(RetTyElement);
20700 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20701 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20702
20703 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20704 SimpleTy);
20705 SDValue InChain = DAG.getEntryNode();
20706 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20707 Subtarget);
20708 bool isSigned = N->getOpcode() == ISD::SREM;
20709 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20710 getPointerTy(DAG.getDataLayout()));
20711
20712 if (Subtarget->isTargetWindows())
20713 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20714
20715 // Lower call
20716 CallLoweringInfo CLI(DAG);
20717 CLI.setChain(InChain)
20718 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20719 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
20720 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20721
20722 // Return second (rem) result operand (first contains div)
20723 SDNode *ResNode = CallResult.first.getNode();
20724 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20725 return ResNode->getOperand(1);
20726}
20727
20728SDValue
20729ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20730 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20731 SDLoc DL(Op);
20732
20733 // Get the inputs.
20734 SDValue Chain = Op.getOperand(0);
20735 SDValue Size = Op.getOperand(1);
20736
20738 "no-stack-arg-probe")) {
20739 MaybeAlign Align =
20740 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20741 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20742 Chain = SP.getValue(1);
20743 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20744 if (Align)
20745 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20746 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20747 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20748 SDValue Ops[2] = { SP, Chain };
20749 return DAG.getMergeValues(Ops, DL);
20750 }
20751
20752 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20753 DAG.getConstant(2, DL, MVT::i32));
20754
20755 SDValue Glue;
20756 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20757 Glue = Chain.getValue(1);
20758
20759 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20760 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20761
20762 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20763 Chain = NewSP.getValue(1);
20764
20765 SDValue Ops[2] = { NewSP, Chain };
20766 return DAG.getMergeValues(Ops, DL);
20767}
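// Sketch of the Windows path above (annotation, not part of the upstream
// source): the allocation size is converted to a word count (size >> 2),
// passed to the __chkstk-style probe in r4, and the adjusted stack pointer is
// re-read from SP afterwards, so a dynamic alloca probes the pages it
// commits instead of jumping the stack pointer past the guard page.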
20768
20769SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20770 bool IsStrict = Op->isStrictFPOpcode();
20771 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20772 const unsigned DstSz = Op.getValueType().getSizeInBits();
20773 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20774 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20775 "Unexpected type for custom-lowering FP_EXTEND");
20776
20777 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20778 "With both FP DP and 16, any FP conversion is legal!");
20779
20780 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20781 "With FP16, 16 to 32 conversion is legal!");
20782
20783 // Converting from 32 -> 64 is valid if we have FP64.
20784 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20785 // FIXME: Remove this when we have strict fp instruction selection patterns
20786 if (IsStrict) {
20787 SDLoc Loc(Op);
20788 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20789 Loc, Op.getValueType(), SrcVal);
20790 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20791 }
20792 return Op;
20793 }
20794
20795 // Either we are converting from 16 -> 64, without FP16 and/or
20796 // FP.double-precision or without Armv8-fp. So we must do it in two
20797 // steps.
20798 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20799 // without FP16. So we must do a function call.
20800 SDLoc Loc(Op);
20801 RTLIB::Libcall LC;
20802 MakeLibCallOptions CallOptions;
20803 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20804 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20805 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20806 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20807 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20808 if (Supported) {
20809 if (IsStrict) {
20810 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20811 {DstVT, MVT::Other}, {Chain, SrcVal});
20812 Chain = SrcVal.getValue(1);
20813 } else {
20814 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20815 }
20816 } else {
20817 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20818 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20819 "Unexpected type for custom-lowering FP_EXTEND");
20820 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20821 Loc, Chain);
20822 }
20823 }
20824
20825 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20826}
20827
20828SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20829 bool IsStrict = Op->isStrictFPOpcode();
20830
20831 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20832 EVT SrcVT = SrcVal.getValueType();
20833 EVT DstVT = Op.getValueType();
20834 const unsigned DstSz = Op.getValueType().getSizeInBits();
20835 const unsigned SrcSz = SrcVT.getSizeInBits();
20836 (void)DstSz;
20837 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20838 "Unexpected type for custom-lowering FP_ROUND");
20839
20840 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20841 "With both FP DP and 16, any FP conversion is legal!");
20842
20843 SDLoc Loc(Op);
20844
20845 // Instruction from 32 -> 16 if hasFP16 is valid
20846 if (SrcSz == 32 && Subtarget->hasFP16())
20847 return Op;
20848
20849 // Lib call from 32 -> 16 / 64 -> [32, 16]
20850 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20851 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20852 "Unexpected type for custom-lowering FP_ROUND");
20853 MakeLibCallOptions CallOptions;
20854 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20855 SDValue Result;
20856 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20857 Loc, Chain);
20858 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20859}
20860
20861bool
20862 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20863 // The ARM target isn't yet aware of offsets.
20864 return false;
20865}
20866
20867 bool ARM::isBitFieldInvertedMask(unsigned v) {
20868 if (v == 0xffffffff)
20869 return false;
20870
20871 // there can be 1's on either or both "outsides", all the "inside"
20872 // bits must be 0's
20873 return isShiftedMask_32(~v);
20874}
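// Illustrative values (annotation, not part of the upstream source):
//   0xF000000F -> true   (~v == 0x0FFFFFF0 is one contiguous run of ones)
//   0xFFFF00FF -> true   (clears the contiguous field at bits 8..15)
//   0xFF00FF00 -> false  (the zero bits are not contiguous)
// This is the shape matched when forming BFC/BFI-style bitfield operations.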
20875
20876/// isFPImmLegal - Returns true if the target can instruction select the
20877/// specified FP immediate natively. If false, the legalizer will
20878/// materialize the FP immediate as a load from a constant pool.
20879 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20880 bool ForCodeSize) const {
20881 if (!Subtarget->hasVFP3Base())
20882 return false;
20883 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20884 return ARM_AM::getFP16Imm(Imm) != -1;
20885 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20886 ARM_AM::getFP32FP16Imm(Imm) != -1)
20887 return true;
20888 if (VT == MVT::f32)
20889 return ARM_AM::getFP32Imm(Imm) != -1;
20890 if (VT == MVT::f64 && Subtarget->hasFP64())
20891 return ARM_AM::getFP64Imm(Imm) != -1;
20892 return false;
20893}
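// Illustrative values (annotation, not part of the upstream source): with
// VFPv3, simple constants such as 1.0, 0.5 and 31.0 fit the 8-bit VMOV
// floating-point immediate encoding and are materialized directly, whereas
// 0.1 cannot be encoded exactly and is instead loaded from the constant pool.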
20894
20895/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20896/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20897/// specified in the intrinsic calls.
20898 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20899 const CallInst &I,
20900 MachineFunction &MF,
20901 unsigned Intrinsic) const {
20902 switch (Intrinsic) {
20903 case Intrinsic::arm_neon_vld1:
20904 case Intrinsic::arm_neon_vld2:
20905 case Intrinsic::arm_neon_vld3:
20906 case Intrinsic::arm_neon_vld4:
20907 case Intrinsic::arm_neon_vld2lane:
20908 case Intrinsic::arm_neon_vld3lane:
20909 case Intrinsic::arm_neon_vld4lane:
20910 case Intrinsic::arm_neon_vld2dup:
20911 case Intrinsic::arm_neon_vld3dup:
20912 case Intrinsic::arm_neon_vld4dup: {
20914 // Conservatively set memVT to the entire set of vectors loaded.
20915 auto &DL = I.getDataLayout();
20916 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20917 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20918 Info.ptrVal = I.getArgOperand(0);
20919 Info.offset = 0;
20920 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20921 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20922 // volatile loads with NEON intrinsics not supported
20924 return true;
20925 }
20926 case Intrinsic::arm_neon_vld1x2:
20927 case Intrinsic::arm_neon_vld1x3:
20928 case Intrinsic::arm_neon_vld1x4: {
20930 // Conservatively set memVT to the entire set of vectors loaded.
20931 auto &DL = I.getDataLayout();
20932 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20933 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20934 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20935 Info.offset = 0;
20936 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20937 // volatile loads with NEON intrinsics not supported
20939 return true;
20940 }
20941 case Intrinsic::arm_neon_vst1:
20942 case Intrinsic::arm_neon_vst2:
20943 case Intrinsic::arm_neon_vst3:
20944 case Intrinsic::arm_neon_vst4:
20945 case Intrinsic::arm_neon_vst2lane:
20946 case Intrinsic::arm_neon_vst3lane:
20947 case Intrinsic::arm_neon_vst4lane: {
20949 // Conservatively set memVT to the entire set of vectors stored.
20950 auto &DL = I.getDataLayout();
20951 unsigned NumElts = 0;
20952 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20953 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20954 if (!ArgTy->isVectorTy())
20955 break;
20956 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20957 }
20958 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20959 Info.ptrVal = I.getArgOperand(0);
20960 Info.offset = 0;
20961 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20962 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20963 // volatile stores with NEON intrinsics not supported
20965 return true;
20966 }
20967 case Intrinsic::arm_neon_vst1x2:
20968 case Intrinsic::arm_neon_vst1x3:
20969 case Intrinsic::arm_neon_vst1x4: {
20971 // Conservatively set memVT to the entire set of vectors stored.
20972 auto &DL = I.getDataLayout();
20973 unsigned NumElts = 0;
20974 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20975 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20976 if (!ArgTy->isVectorTy())
20977 break;
20978 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20979 }
20980 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20981 Info.ptrVal = I.getArgOperand(0);
20982 Info.offset = 0;
20983 Info.align = I.getParamAlign(0).valueOrOne();
20984 // volatile stores with NEON intrinsics not supported
20986 return true;
20987 }
20988 case Intrinsic::arm_mve_vld2q:
20989 case Intrinsic::arm_mve_vld4q: {
20991 // Conservatively set memVT to the entire set of vectors loaded.
20992 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20993 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20994 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20995 Info.ptrVal = I.getArgOperand(0);
20996 Info.offset = 0;
20997 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20998 // volatile loads with MVE intrinsics not supported
21000 return true;
21001 }
21002 case Intrinsic::arm_mve_vst2q:
21003 case Intrinsic::arm_mve_vst4q: {
21005 // Conservatively set memVT to the entire set of vectors stored.
21006 Type *VecTy = I.getArgOperand(1)->getType();
21007 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21008 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21009 Info.ptrVal = I.getArgOperand(0);
21010 Info.offset = 0;
21011 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21012 // volatile stores with MVE intrinsics not supported
21014 return true;
21015 }
21016 case Intrinsic::arm_mve_vldr_gather_base:
21017 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21019 Info.ptrVal = nullptr;
21020 Info.memVT = MVT::getVT(I.getType());
21021 Info.align = Align(1);
21023 return true;
21024 }
21025 case Intrinsic::arm_mve_vldr_gather_base_wb:
21026 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21028 Info.ptrVal = nullptr;
21029 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21030 Info.align = Align(1);
21032 return true;
21033 }
21034 case Intrinsic::arm_mve_vldr_gather_offset:
21035 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21037 Info.ptrVal = nullptr;
21038 MVT DataVT = MVT::getVT(I.getType());
21039 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21040 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21041 DataVT.getVectorNumElements());
21042 Info.align = Align(1);
21044 return true;
21045 }
21046 case Intrinsic::arm_mve_vstr_scatter_base:
21047 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21049 Info.ptrVal = nullptr;
21050 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21051 Info.align = Align(1);
21053 return true;
21054 }
21055 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21056 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21058 Info.ptrVal = nullptr;
21059 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21060 Info.align = Align(1);
21062 return true;
21063 }
21064 case Intrinsic::arm_mve_vstr_scatter_offset:
21065 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21067 Info.ptrVal = nullptr;
21068 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21069 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21070 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21071 DataVT.getVectorNumElements());
21072 Info.align = Align(1);
21074 return true;
21075 }
21076 case Intrinsic::arm_ldaex:
21077 case Intrinsic::arm_ldrex: {
21078 auto &DL = I.getDataLayout();
21079 Type *ValTy = I.getParamElementType(0);
21081 Info.memVT = MVT::getVT(ValTy);
21082 Info.ptrVal = I.getArgOperand(0);
21083 Info.offset = 0;
21084 Info.align = DL.getABITypeAlign(ValTy);
21086 return true;
21087 }
21088 case Intrinsic::arm_stlex:
21089 case Intrinsic::arm_strex: {
21090 auto &DL = I.getDataLayout();
21091 Type *ValTy = I.getParamElementType(1);
21093 Info.memVT = MVT::getVT(ValTy);
21094 Info.ptrVal = I.getArgOperand(1);
21095 Info.offset = 0;
21096 Info.align = DL.getABITypeAlign(ValTy);
21098 return true;
21099 }
21100 case Intrinsic::arm_stlexd:
21101 case Intrinsic::arm_strexd:
21103 Info.memVT = MVT::i64;
21104 Info.ptrVal = I.getArgOperand(2);
21105 Info.offset = 0;
21106 Info.align = Align(8);
21108 return true;
21109
21110 case Intrinsic::arm_ldaexd:
21111 case Intrinsic::arm_ldrexd:
21113 Info.memVT = MVT::i64;
21114 Info.ptrVal = I.getArgOperand(0);
21115 Info.offset = 0;
21116 Info.align = Align(8);
21118 return true;
21119
21120 default:
21121 break;
21122 }
21123
21124 return false;
21125}
21126
21127/// Returns true if it is beneficial to convert a load of a constant
21128/// to just the constant itself.
21130 Type *Ty) const {
21131 assert(Ty->isIntegerTy());
21132
21133 unsigned Bits = Ty->getPrimitiveSizeInBits();
21134 if (Bits == 0 || Bits > 32)
21135 return false;
21136 return true;
21137}
21138
21140 unsigned Index) const {
21142 return false;
21143
21144 return (Index == 0 || Index == ResVT.getVectorNumElements());
21145}
21146
21148 ARM_MB::MemBOpt Domain) const {
21149 // First, if the target has no DMB, see what fallback we can use.
21150 if (!Subtarget->hasDataBarrier()) {
21151 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21152 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21153 // here.
21154 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
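// These operands encode coprocessor p15, opc1 0, CRn c7, CRm c10, opc2 5,
// i.e. roughly "mcr p15, 0, <Rt>, c7, c10, 5", the ARMv6 CP15 data memory
// barrier operation.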
21155 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21156 Builder.getInt32(0), Builder.getInt32(7),
21157 Builder.getInt32(10), Builder.getInt32(5)};
21158 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21159 } else {
21160 // Instead of using barriers, atomic accesses on these subtargets use
21161 // libcalls.
21162 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21163 }
21164 } else {
21165 // Only a full system barrier exists in the M-class architectures.
21166 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21167 Constant *CDomain = Builder.getInt32(Domain);
21168 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21169 }
21170}
21171
21172// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21174 Instruction *Inst,
21175 AtomicOrdering Ord) const {
21176 switch (Ord) {
21179 llvm_unreachable("Invalid fence: unordered/non-atomic");
21182 return nullptr; // Nothing to do
21184 if (!Inst->hasAtomicStore())
21185 return nullptr; // Nothing to do
21186 [[fallthrough]];
21189 if (Subtarget->preferISHSTBarriers())
21190 return makeDMB(Builder, ARM_MB::ISHST);
21191 // FIXME: add a comment with a link to documentation justifying this.
21192 else
21193 return makeDMB(Builder, ARM_MB::ISH);
21194 }
21195 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21196}
21197
21199 Instruction *Inst,
21200 AtomicOrdering Ord) const {
21201 switch (Ord) {
21204 llvm_unreachable("Invalid fence: unordered/not-atomic");
21207 return nullptr; // Nothing to do
21211 return makeDMB(Builder, ARM_MB::ISH);
21212 }
21213 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21214}
21215
21216 // Loads and stores less than 64 bits are already atomic; ones above that
21217 // are doomed anyway, so defer to the default libcall and blame the OS when
21218 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21219 // anything for those.
21222 bool has64BitAtomicStore;
21223 if (Subtarget->isMClass())
21224 has64BitAtomicStore = false;
21225 else if (Subtarget->isThumb())
21226 has64BitAtomicStore = Subtarget->hasV7Ops();
21227 else
21228 has64BitAtomicStore = Subtarget->hasV6Ops();
21229
21230 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21231 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21233}
21234
21235 // Loads and stores less than 64 bits are already atomic; ones above that
21236 // are doomed anyway, so defer to the default libcall and blame the OS when
21237 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21238 // anything for those.
21239// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21240// guarantee, see DDI0406C ARM architecture reference manual,
21241// sections A8.8.72-74 LDRD)
21244 bool has64BitAtomicLoad;
21245 if (Subtarget->isMClass())
21246 has64BitAtomicLoad = false;
21247 else if (Subtarget->isThumb())
21248 has64BitAtomicLoad = Subtarget->hasV7Ops();
21249 else
21250 has64BitAtomicLoad = Subtarget->hasV6Ops();
21251
21252 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21253 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21255}
21256
21257// For the real atomic operations, we have ldrex/strex up to 32 bits,
21258// and up to 64 bits on the non-M profiles
21261 if (AI->isFloatingPointOperation())
21263
21264 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21265 bool hasAtomicRMW;
21266 if (Subtarget->isMClass())
21267 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21268 else if (Subtarget->isThumb())
21269 hasAtomicRMW = Subtarget->hasV7Ops();
21270 else
21271 hasAtomicRMW = Subtarget->hasV6Ops();
21272 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21273 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21274 // implement atomicrmw without spilling. If the target address is also on
21275 // the stack and close enough to the spill slot, this can lead to a
21276 // situation where the monitor always gets cleared and the atomic operation
21277 // can never succeed. So at -O0 lower this operation to a CAS loop.
21278 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21281 }
21283}
21284
21285// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21286// bits, and up to 64 bits on the non-M profiles.
21289 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21290 // implement cmpxchg without spilling. If the address being exchanged is also
21291 // on the stack and close enough to the spill slot, this can lead to a
21292 // situation where the monitor always gets cleared and the atomic operation
21293 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21294 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21295 bool HasAtomicCmpXchg;
21296 if (Subtarget->isMClass())
21297 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21298 else if (Subtarget->isThumb())
21299 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21300 else
21301 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21302 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21303 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21306}
21307
21309 const Instruction *I) const {
21310 return InsertFencesForAtomic;
21311}
21312
21314 // ROPI/RWPI are not supported currently.
21315 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21316}
21317
21319 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21320 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21321 if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
21323
21324 // MSVC CRT has a global variable holding security cookie.
21325 M.getOrInsertGlobal("__security_cookie",
21326 PointerType::getUnqual(M.getContext()));
21327
21328 // MSVC CRT has a function to validate security cookie.
21329 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21330 getLibcallImplName(SecurityCheckCookieLibcall),
21331 Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
21332 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21333 F->addParamAttr(0, Attribute::AttrKind::InReg);
21334}
21335
21337 // MSVC CRT has a function to validate security cookie.
21338 RTLIB::LibcallImpl SecurityCheckCookie =
21339 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21340 if (SecurityCheckCookie != RTLIB::Unsupported)
21341 return M.getFunction(getLibcallImplName(SecurityCheckCookie));
21343}
21344
21346 unsigned &Cost) const {
21347 // If we do not have NEON, vector types are not natively supported.
21348 if (!Subtarget->hasNEON())
21349 return false;
21350
21351 // Floating point values and vector values map to the same register file.
21352 // Therefore, although we could do a store extract of a vector type, it is
21353 // better to leave this as a float store, since we have more freedom in the
21354 // addressing mode for those.
21355 if (VectorTy->isFPOrFPVectorTy())
21356 return false;
21357
21358 // If the index is unknown at compile time, this is very expensive to lower
21359 // and it is not possible to combine the store with the extract.
21360 if (!isa<ConstantInt>(Idx))
21361 return false;
21362
21363 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21364 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21365 // We can do a store + vector extract on any vector that fits perfectly in a D
21366 // or Q register.
21367 if (BitWidth == 64 || BitWidth == 128) {
21368 Cost = 0;
21369 return true;
21370 }
21371 return false;
21372}
21373
21375 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21376}
21377
21379 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21380}
21381
21383 const Instruction &AndI) const {
21384 if (!Subtarget->hasV7Ops())
21385 return false;
21386
21387 // Sink the `and` instruction only if the mask would fit into a modified
21388 // immediate operand.
21389 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21390 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21391 return false;
21392 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21393 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21394 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21395}
21396
21399 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21400 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21403 ExpansionFactor);
21404}
21405
21407 Value *Addr,
21408 AtomicOrdering Ord) const {
21409 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21410 bool IsAcquire = isAcquireOrStronger(Ord);
21411
21412 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21413 // intrinsic must return {i32, i32} and we have to recombine them into a
21414 // single i64 here.
21415 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21417 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21418
21419 Value *LoHi =
21420 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21421
21422 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21423 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21424 if (!Subtarget->isLittle())
21425 std::swap (Lo, Hi);
21426 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21427 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21428 return Builder.CreateOr(
21429 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21430 }
21431
21432 Type *Tys[] = { Addr->getType() };
21433 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21434 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21435
21436 CI->addParamAttr(
21437 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21438 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21439}
21440
21442 IRBuilderBase &Builder) const {
21443 if (!Subtarget->hasV7Ops())
21444 return;
21445 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21446}
21447
21449 Value *Val, Value *Addr,
21450 AtomicOrdering Ord) const {
21451 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21452 bool IsRelease = isReleaseOrStronger(Ord);
21453
21454 // Since the intrinsics must have legal type, the i64 intrinsics take two
21455 // parameters: "i32, i32". We must marshal Val into the appropriate form
21456 // before the call.
21457 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21459 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21460 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21461
21462 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21463 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21464 if (!Subtarget->isLittle())
21465 std::swap(Lo, Hi);
21466 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21467 }
21468
21469 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21470 Type *Tys[] = { Addr->getType() };
21472
21473 CallInst *CI = Builder.CreateCall(
21474 Strex, {Builder.CreateZExtOrBitCast(
21475 Val, Strex->getFunctionType()->getParamType(0)),
21476 Addr});
21477 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21478 Val->getType()));
21479 return CI;
21480}
21481
21482
21484 return Subtarget->isMClass();
21485}
21486
21487/// A helper function for determining the number of interleaved accesses we
21488/// will generate when lowering accesses of the given type.
21489unsigned
21491 const DataLayout &DL) const {
21492 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21493}
21494
21496 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21497 const DataLayout &DL) const {
21498
21499 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21500 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21501
21502 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21503 return false;
21504
21505 // Ensure the vector doesn't have f16 elements. Even though we could do an
21506 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21507 // f32.
21508 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21509 return false;
21510 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21511 return false;
21512
21513 // Ensure the number of vector elements is greater than 1.
21514 if (VecTy->getNumElements() < 2)
21515 return false;
21516
21517 // Ensure the element type is legal.
21518 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21519 return false;
21521 // And that the alignment is high enough under MVE.
21521 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21522 return false;
21523
21524 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21525 // 128 will be split into multiple interleaved accesses.
21526 if (Subtarget->hasNEON() && VecSize == 64)
21527 return true;
21528 return VecSize % 128 == 0;
21529}
21530
21532 if (Subtarget->hasNEON())
21533 return 4;
21534 if (Subtarget->hasMVEIntegerOps())
21537}
21538
21539/// Lower an interleaved load into a vldN intrinsic.
21540///
21541/// E.g. Lower an interleaved load (Factor = 2):
21542/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21543/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21544/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21545///
21546/// Into:
21547/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21548/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21549/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21551 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21552 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21553 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21554 "Invalid interleave factor");
21555 assert(!Shuffles.empty() && "Empty shufflevector input");
21556 assert(Shuffles.size() == Indices.size() &&
21557 "Unmatched number of shufflevectors and indices");
21558
21559 auto *LI = dyn_cast<LoadInst>(Load);
21560 if (!LI)
21561 return false;
21562 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21563
21564 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21565 Type *EltTy = VecTy->getElementType();
21566
21567 const DataLayout &DL = LI->getDataLayout();
21568 Align Alignment = LI->getAlign();
21569
21570 // Skip if we do not have NEON and skip illegal vector types. We can
21571 // "legalize" wide vector types into multiple interleaved accesses as long as
21572 // the vector types are divisible by 128.
21573 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21574 return false;
21575
21576 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21577
21578 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
21579 // load integer vectors first and then convert to pointer vectors.
21580 if (EltTy->isPointerTy())
21581 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21582
21583 IRBuilder<> Builder(LI);
21584
21585 // The base address of the load.
21586 Value *BaseAddr = LI->getPointerOperand();
21587
21588 if (NumLoads > 1) {
21589 // If we're going to generate more than one load, reset the sub-vector type
21590 // to something legal.
21591 VecTy = FixedVectorType::get(VecTy->getElementType(),
21592 VecTy->getNumElements() / NumLoads);
21593 }
21594
21595 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21596
21597 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21598 if (Subtarget->hasNEON()) {
21599 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21600 Type *Tys[] = {VecTy, PtrTy};
21601 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21602 Intrinsic::arm_neon_vld3,
21603 Intrinsic::arm_neon_vld4};
21604
21606 Ops.push_back(BaseAddr);
21607 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21608
21609 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21610 /*FMFSource=*/nullptr, "vldN");
21611 } else {
21612 assert((Factor == 2 || Factor == 4) &&
21613 "expected interleave factor of 2 or 4 for MVE");
21614 Intrinsic::ID LoadInts =
21615 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21616 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21617 Type *Tys[] = {VecTy, PtrTy};
21618
21620 Ops.push_back(BaseAddr);
21621 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21622 "vldN");
21623 }
21624 };
21625
21626 // Holds sub-vectors extracted from the load intrinsic return values. The
21627 // sub-vectors are associated with the shufflevector instructions they will
21628 // replace.
21630
21631 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21632 // If we're generating more than one load, compute the base address of
21633 // subsequent loads as an offset from the previous.
21634 if (LoadCount > 0)
21635 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21636 VecTy->getNumElements() * Factor);
21637
21638 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21639
21640 // Replace uses of each shufflevector with the corresponding vector loaded
21641 // by ldN.
21642 for (unsigned i = 0; i < Shuffles.size(); i++) {
21643 ShuffleVectorInst *SV = Shuffles[i];
21644 unsigned Index = Indices[i];
21645
21646 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21647
21648 // Convert the integer vector to pointer vector if the element is pointer.
21649 if (EltTy->isPointerTy())
21650 SubVec = Builder.CreateIntToPtr(
21651 SubVec,
21653
21654 SubVecs[SV].push_back(SubVec);
21655 }
21656 }
21657
21658 // Replace uses of the shufflevector instructions with the sub-vectors
21659 // returned by the load intrinsic. If a shufflevector instruction is
21660 // associated with more than one sub-vector, those sub-vectors will be
21661 // concatenated into a single wide vector.
21662 for (ShuffleVectorInst *SVI : Shuffles) {
21663 auto &SubVec = SubVecs[SVI];
21664 auto *WideVec =
21665 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21666 SVI->replaceAllUsesWith(WideVec);
21667 }
21668
21669 return true;
21670}
21671
21672/// Lower an interleaved store into a vstN intrinsic.
21673///
21674/// E.g. Lower an interleaved store (Factor = 3):
21675/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21676/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21677/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21678///
21679/// Into:
21680/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21681/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21682/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21683/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21684///
21685/// Note that the new shufflevectors will be removed and we'll only generate one
21686/// vst3 instruction in CodeGen.
21687///
21688/// Example for a more general valid mask (Factor 3). Lower:
21689/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21690/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21691/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21692///
21693/// Into:
21694/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21695/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21696/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21697/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21699 Value *LaneMask,
21700 ShuffleVectorInst *SVI,
21701 unsigned Factor,
21702 const APInt &GapMask) const {
21703 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21704 "Invalid interleave factor");
21705 auto *SI = dyn_cast<StoreInst>(Store);
21706 if (!SI)
21707 return false;
21708 assert(!LaneMask && GapMask.popcount() == Factor &&
21709 "Unexpected mask on store");
21710
21711 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21712 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21713
21714 unsigned LaneLen = VecTy->getNumElements() / Factor;
21715 Type *EltTy = VecTy->getElementType();
21716 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21717
21718 const DataLayout &DL = SI->getDataLayout();
21719 Align Alignment = SI->getAlign();
21720
21721 // Skip if we do not have NEON and skip illegal vector types. We can
21722 // "legalize" wide vector types into multiple interleaved accesses as long as
21723 // the vector types are divisible by 128.
21724 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21725 return false;
21726
21727 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21728
21729 Value *Op0 = SVI->getOperand(0);
21730 Value *Op1 = SVI->getOperand(1);
21731 IRBuilder<> Builder(SI);
21732
21733 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21734 // vectors to integer vectors.
21735 if (EltTy->isPointerTy()) {
21736 Type *IntTy = DL.getIntPtrType(EltTy);
21737
21738 // Convert to the corresponding integer vector.
21739 auto *IntVecTy =
21740 FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
21741 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21742 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21743
21744 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21745 }
21746
21747 // The base address of the store.
21748 Value *BaseAddr = SI->getPointerOperand();
21749
21750 if (NumStores > 1) {
21751 // If we're going to generate more than one store, reset the lane length
21752 // and sub-vector type to something legal.
21753 LaneLen /= NumStores;
21754 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21755 }
21756
21757 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21758
21759 auto Mask = SVI->getShuffleMask();
21760
21761 auto createStoreIntrinsic = [&](Value *BaseAddr,
21762 SmallVectorImpl<Value *> &Shuffles) {
21763 if (Subtarget->hasNEON()) {
21764 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21765 Intrinsic::arm_neon_vst3,
21766 Intrinsic::arm_neon_vst4};
21767 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21768 Type *Tys[] = {PtrTy, SubVecTy};
21769
21771 Ops.push_back(BaseAddr);
21772 append_range(Ops, Shuffles);
21773 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21774 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21775 } else {
21776 assert((Factor == 2 || Factor == 4) &&
21777 "expected interleave factor of 2 or 4 for MVE");
21778 Intrinsic::ID StoreInts =
21779 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21780 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21781 Type *Tys[] = {PtrTy, SubVecTy};
21782
21784 Ops.push_back(BaseAddr);
21785 append_range(Ops, Shuffles);
21786 for (unsigned F = 0; F < Factor; F++) {
21787 Ops.push_back(Builder.getInt32(F));
21788 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21789 Ops.pop_back();
21790 }
21791 }
21792 };
21793
21794 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21795 // If we're generating more than one store, compute the base address of
21796 // subsequent stores as an offset from the previous.
21797 if (StoreCount > 0)
21798 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21799 BaseAddr, LaneLen * Factor);
21800
21801 SmallVector<Value *, 4> Shuffles;
21802
21803 // Split the shufflevector operands into sub vectors for the new vstN call.
21804 for (unsigned i = 0; i < Factor; i++) {
21805 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21806 if (Mask[IdxI] >= 0) {
21807 Shuffles.push_back(Builder.CreateShuffleVector(
21808 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21809 } else {
21810 unsigned StartMask = 0;
21811 for (unsigned j = 1; j < LaneLen; j++) {
21812 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21813 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21814 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21815 break;
21816 }
21817 }
21818 // Note: If all elements in a chunk are undefs, StartMask=0!
21819 // Note: Filling undef gaps with random elements is ok, since
21820 // those elements were being written anyway (with undefs).
21821 // In the case of all undefs we default to using elements from 0.
21822 // Note: StartMask cannot be negative; that is checked in
21823 // isReInterleaveMask.
21824 Shuffles.push_back(Builder.CreateShuffleVector(
21825 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21826 }
21827 }
21828
21829 createStoreIntrinsic(BaseAddr, Shuffles);
21830 }
21831 return true;
21832}
21833
21841
21843 uint64_t &Members) {
21844 if (auto *ST = dyn_cast<StructType>(Ty)) {
21845 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21846 uint64_t SubMembers = 0;
21847 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21848 return false;
21849 Members += SubMembers;
21850 }
21851 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21852 uint64_t SubMembers = 0;
21853 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21854 return false;
21855 Members += SubMembers * AT->getNumElements();
21856 } else if (Ty->isFloatTy()) {
21857 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21858 return false;
21859 Members = 1;
21860 Base = HA_FLOAT;
21861 } else if (Ty->isDoubleTy()) {
21862 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21863 return false;
21864 Members = 1;
21865 Base = HA_DOUBLE;
21866 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21867 Members = 1;
21868 switch (Base) {
21869 case HA_FLOAT:
21870 case HA_DOUBLE:
21871 return false;
21872 case HA_VECT64:
21873 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21874 case HA_VECT128:
21875 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21876 case HA_UNKNOWN:
21877 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21878 case 64:
21879 Base = HA_VECT64;
21880 return true;
21881 case 128:
21882 Base = HA_VECT128;
21883 return true;
21884 default:
21885 return false;
21886 }
21887 }
21888 }
21889
21890 return (Members > 0 && Members <= 4);
21891}
21892
21893/// Return the correct alignment for the current calling convention.
21895 Type *ArgTy, const DataLayout &DL) const {
21896 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21897 if (!ArgTy->isVectorTy())
21898 return ABITypeAlign;
21899
21900 // Avoid over-aligning vector parameters. It would require realigning the
21901 // stack and waste space for no real benefit.
21902 MaybeAlign StackAlign = DL.getStackAlignment();
21903 assert(StackAlign && "data layout string is missing stack alignment");
21904 return std::min(ABITypeAlign, *StackAlign);
21905}
21906
21907/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21908/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21909/// passing according to AAPCS rules.
21911 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21912 const DataLayout &DL) const {
21913 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21915 return false;
21916
21918 uint64_t Members = 0;
21919 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21920 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21921
21922 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21923 return IsHA || IsIntArray;
21924}
21925
21927 const Constant *PersonalityFn) const {
21928 // Platforms which do not use SjLj EH may return values in these registers
21929 // via the personality function.
21931 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21932}
21933
21935 const Constant *PersonalityFn) const {
21936 // Platforms which do not use SjLj EH may return values in these registers
21937 // via the personality function.
21939 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21940}
21941
21942void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21943 // Update IsSplitCSR in ARMFunctionInfo.
21944 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21945 AFI->setIsSplitCSR(true);
21946}
21947
21948void ARMTargetLowering::insertCopiesSplitCSR(
21949 MachineBasicBlock *Entry,
21950 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21951 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21952 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21953 if (!IStart)
21954 return;
21955
21956 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21957 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21958 MachineBasicBlock::iterator MBBI = Entry->begin();
21959 for (const MCPhysReg *I = IStart; *I; ++I) {
21960 const TargetRegisterClass *RC = nullptr;
21961 if (ARM::GPRRegClass.contains(*I))
21962 RC = &ARM::GPRRegClass;
21963 else if (ARM::DPRRegClass.contains(*I))
21964 RC = &ARM::DPRRegClass;
21965 else
21966 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21967
21968 Register NewVR = MRI->createVirtualRegister(RC);
21969 // Create copy from CSR to a virtual register.
21970 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21971 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21972 // nounwind. If we want to generalize this later, we may need to emit
21973 // CFI pseudo-instructions.
21974 assert(Entry->getParent()->getFunction().hasFnAttribute(
21975 Attribute::NoUnwind) &&
21976 "Function should be nounwind in insertCopiesSplitCSR!");
21977 Entry->addLiveIn(*I);
21978 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21979 .addReg(*I);
21980
21981 // Insert the copy-back instructions right before the terminator.
21982 for (auto *Exit : Exits)
21983 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21984 TII->get(TargetOpcode::COPY), *I)
21985 .addReg(NewVR);
21986 }
21987}
21988
21992}
21993
21995 return Subtarget->hasMVEIntegerOps();
21996}
21997
22000 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22001 if (!VTy)
22002 return false;
22003
22004 auto *ScalarTy = VTy->getScalarType();
22005 unsigned NumElements = VTy->getNumElements();
22006
22007 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22008 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22009 return false;
22010
22011 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22012 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22013 return Subtarget->hasMVEFloatOps();
22014
22016 return false;
22017
22018 return Subtarget->hasMVEIntegerOps() &&
22019 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22020 ScalarTy->isIntegerTy(32));
22021}
22022
22025 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22026 Value *Accumulator) const {
22027
22028 FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
22029
22030 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22031
22032 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22033
22034 if (TyWidth > 128) {
22035 int Stride = Ty->getNumElements() / 2;
22036 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22037 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22038 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22039 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22040
22041 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22042 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22043 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22044 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22045 Value *LowerSplitAcc = nullptr;
22046 Value *UpperSplitAcc = nullptr;
22047
22048 if (Accumulator) {
22049 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22050 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22051 }
22052
22053 auto *LowerSplitInt = createComplexDeinterleavingIR(
22054 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22055 auto *UpperSplitInt = createComplexDeinterleavingIR(
22056 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22057
22058 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22059 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22060 }
22061
22062 auto *IntTy = Type::getInt32Ty(B.getContext());
22063
22064 ConstantInt *ConstRotation = nullptr;
22065 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22066 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22067
22068 if (Accumulator)
22069 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22070 {ConstRotation, Accumulator, InputB, InputA});
22071 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22072 {ConstRotation, InputB, InputA});
22073 }
22074
22075 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22076 // 1 means the value is not halved.
22077 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22078
22080 ConstRotation = ConstantInt::get(IntTy, 0);
22082 ConstRotation = ConstantInt::get(IntTy, 1);
22083
22084 if (!ConstRotation)
22085 return nullptr; // Invalid rotation for arm_mve_vcaddq
22086
22087 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22088 {ConstHalving, ConstRotation, InputA, InputB});
22089 }
22090
22091 return nullptr;
22092}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
@ HA_DOUBLE
@ HA_VECT128
@ HA_VECT64
@ HA_FLOAT
@ HA_UNKNOWN
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static const MCPhysReg GPRArgRegs[]
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
ShuffleOpCodes
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
constexpr MVT FlagsVT
Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
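A hedged sketch of how a caller inside this file might combine BuildVectorSDNode::isConstantSplat with isVMOVModifiedImm to test whether a splat can be materialized by a single VMOV immediate. The wrapper name is hypothetical, VMOVModImm is assumed to be the plain-VMOV enumerator of VMOVModImmType, and error handling is elided.

static bool splatFitsVMOVImm(BuildVectorSDNode *BVN, EVT VT, SelectionDAG &DAG,
                             const SDLoc &dl) {
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs) ||
      SplatBitSize > 64)
    return false;
  EVT VmovVT;
  SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                  SplatUndef.getZExtValue(), SplatBitSize,
                                  DAG, dl, VmovVT, VT, VMOVModImm);
  return Val.getNode() != nullptr; // Non-null: a single VMOV.iN suffices.
}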
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
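For orientation, the canonical two-result masks on a 4-element shuffle (indices 0-3 pick from the first source, 4-7 from the second) are: VTRN <0,4,2,6> / <1,5,3,7>, VUZP <0,2,4,6> / <1,3,5,7>, VZIP <0,4,1,5> / <2,6,3,7>. A hypothetical caller checking one of these masks might look like the following fragment; the surrounding context is assumed.

unsigned WhichResult;
bool isV_UNDEF;
static const int TrnMask[] = {0, 4, 2, 6}; // first result of a v4i32 VTRN
if (unsigned Opc = isNEONTwoResultShuffleMask(TrnMask, MVT::v4i32,
                                              WhichResult, isV_UNDEF)) {
  // Opc names the ARMISD node (VTRN here); WhichResult says which of the two
  // results this mask selects, and isV_UNDEF flags the "v, undef" form.
}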
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
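The scalar shape these combines recognize, written out as plain C++ for illustration only: a min/max clamp of a wider value into a narrower signed range followed by a truncate, which maps onto a single saturating instruction (SSAT/USAT, or a VQMOVN-style narrow for vectors).

#include <cstdint>

static int8_t clamp_to_i8(int32_t x) {
  if (x > INT8_MAX) x = INT8_MAX;  // min(x, 127)
  if (x < INT8_MIN) x = INT8_MIN;  // max(x, -128)
  return (int8_t)x;                // truncate after saturation
}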
This file defines a TargetTransformInfoImplBase-conforming object specific to the ARM target machine.
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
#define MAKE_CASE(V)
This file contains some templates that are useful if you are working with the STL at all.
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition: Debug.h:119
This file describes how to lower LLVM code to machine code.
bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition: APFloat.cpp:5999
APInt bitcastToAPInt() const
Definition: APFloat.h:1353
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition: APFloat.h:1332
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:651
unsigned logBase2() const
Definition: APInt.h:1761
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:475
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
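A small, self-contained usage sketch of the APInt helpers listed above, independent of the lowering code itself; the expected results are noted in comments.

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintExamples() {
  APInt Mask = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/8); // 0xFF000000
  unsigned TZ = Mask.countr_zero();                  // 24
  unsigned Pop = Mask.popcount();                    // 8
  APInt Splat = APInt::getSplat(64, APInt(8, 0x1f)); // 0x1f repeated 8 times
  bool Pow2 = Splat.isPowerOf2();                    // false
  APInt Low = APInt::getLowBitsSet(32, 12);
  bool Sub = Low.isSubsetOf(APInt::getAllOnes(32));  // true
  (void)TZ; (void)Pop; (void)Pow2; (void)Sub;
}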
An arbitrary precision integer that knows its signedness.
Definition: APSInt.h:24
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setPromotedConstpoolIncrease(int Sz)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
void setVarArgsFrameIndex(int Index)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
bool isTargetMachO() const
Definition: ARMSubtarget.h:346
bool useMovt() const
bool isTargetAEABI() const
Definition: ARMSubtarget.h:348
bool hasARMOps() const
Definition: ARMSubtarget.h:298
bool supportsTailCall() const
Definition: ARMSubtarget.h:399
const Triple & getTargetTriple() const
Definition: ARMSubtarget.h:330
bool hasVFP4Base() const
Definition: ARMSubtarget.h:306
const ARMBaseInstrInfo * getInstrInfo() const override
Definition: ARMSubtarget.h:235
bool isThumb1Only() const
Definition: ARMSubtarget.h:375
bool useFPVFMx() const
Definition: ARMSubtarget.h:315
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:307
bool isThumb2() const
Definition: ARMSubtarget.h:376
bool isTargetWindows() const
Definition: ARMSubtarget.h:342
bool isGVIndirectSymbol(const GlobalValue *GV) const
True if the GV will be accessed via an indirect symbol.
bool hasBaseDSP() const
Definition: ARMSubtarget.h:320
const ARMTargetLowering * getTargetLowering() const override
Definition: ARMSubtarget.h:239
bool isTargetDarwin() const
Definition: ARMSubtarget.h:335
const ARMBaseRegisterInfo * getRegisterInfo() const override
Definition: ARMSubtarget.h:247
bool hasVFP2Base() const
Definition: ARMSubtarget.h:304
bool isTargetAndroid() const
Definition: ARMSubtarget.h:365
bool isROPI() const
bool isTargetCOFF() const
Definition: ARMSubtarget.h:344
bool isTargetGNUAEABI() const
Definition: ARMSubtarget.h:350
bool hasVFP3Base() const
Definition: ARMSubtarget.h:305
bool useFPVFMx64() const
Definition: ARMSubtarget.h:319
unsigned getPreferBranchLogAlignment() const
Definition: ARMSubtarget.h:486
bool hasMinSize() const
Definition: ARMSubtarget.h:374
bool useNEONForSinglePrecisionFP() const
Definition: ARMSubtarget.h:300
bool hasAnyDataBarrier() const
Definition: ARMSubtarget.h:309
bool isRWPI() const
bool isLittle() const
Definition: ARMSubtarget.h:407
bool allowsUnalignedMem() const
Definition: ARMSubtarget.h:401
bool isTargetMuslAEABI() const
Definition: ARMSubtarget.h:352
bool useFPVFMx16() const
Definition: ARMSubtarget.h:318
bool isMClass() const
Definition: ARMSubtarget.h:377
bool useMulOps() const
Definition: ARMSubtarget.h:313
bool isTargetELF() const
Definition: ARMSubtarget.h:345
Align getDualLoadStoreAlignment() const
Definition: ARMSubtarget.h:443
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
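The equivalence rests on two's complement: xor x, -1 is ~x, which equals -x - 1, so y - (x ^ -1) = y + x + 1 = (x + 1) + y. A quick standalone check of that identity (an illustration, not part of the lowering code):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 7u, 0xFFFFFFFFu})
    for (uint32_t y : {0u, 3u, 0x80000000u})
      assert(y - (x ^ 0xFFFFFFFFu) == (x + 1u) + y); // sub-of-not == inc-then-add
  return 0;
}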
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy, Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
bool isFloatingPointOperation() const
Definition: Instructions.h:898
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
The address of a basic block.
Definition: Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
void rewindByValRegsInfo()
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
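A hedged sketch of the usual calling-convention analysis flow built from the CCState and CCValAssign interfaces listed here, written as it would appear inside an ARMTargetLowering member (so CCAssignFnForCall is in scope); CallConv, isVarArg, MF, DAG, and Ins are assumed from the surrounding argument-lowering context.

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
for (const CCValAssign &VA : ArgLocs) {
  if (VA.isRegLoc()) {
    // The argument arrives in VA.getLocReg(); copy it into a virtual register.
  } else {
    // The argument lives on the stack at VA.getLocMemOffset(); load it.
  }
}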
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getCalledOperand() const
Definition: InstrTypes.h:1340
AttributeList getAttributes() const
Return the attributes for this call.
Definition: InstrTypes.h:1424
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:277
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:198
bool isBigEndian() const
Definition: DataLayout.h:199
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition: DataLayout.h:228
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
Definition: DataLayout.cpp:988
StringRef getPrivateGlobalPrefix() const
Definition: DataLayout.h:286
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
A debug info location.
Definition: DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:165
unsigned size() const
Definition: DenseMap.h:108
bool empty() const
Definition: DenseMap.h:107
iterator begin()
Definition: DenseMap.h:78
iterator end()
Definition: DenseMap.h:81
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
Diagnostic information for unsupported feature in backend.
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
arg_iterator arg_begin()
Definition: Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through the first or second pointer argument.
Definition: Function.h:687
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition: Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
const GlobalValue * getGlobal() const
bool isDSOLocal() const
Definition: GlobalValue.h:307
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:531
bool hasDLLImportStorageClass() const
Definition: GlobalValue.h:280
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
Definition: GlobalValue.h:638
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2214
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1936
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2618
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2194
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
Value * CreateTruncOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2230
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
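For context, a minimal IRBuilder sketch of the de-interleaving shuffles that the interleaved-access lowering starts from (illustrative only; Builder and the wide vector load Wide are assumed to exist, and the value names are hypothetical):

SmallVector<int, 4> EvenMask = {0, 2, 4, 6};
SmallVector<int, 4> OddMask  = {1, 3, 5, 7};
Value *Poison = PoisonValue::get(Wide->getType());
Value *Lane0 = Builder.CreateShuffleVector(Wide, Poison, EvenMask, "lane0");
Value *Lane1 = Builder.CreateShuffleVector(Wide, Poison, OddMask, "lane1");
// lowerInterleavedLoad can replace this pair of shuffles with a single vldN.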
std::optional< unsigned > getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const
Return the cycle for the given class and operand.
bool isEmpty() const
Returns true if there are no itineraries.
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:180
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:603
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:238
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:240
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:249
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
Definition: MCInstrDesc.h:220
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:42
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:72
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
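The MemSDNode accessors above are what DAG combines typically consult before rewriting a memory access. A sketch of such a guard (the i32 type and word alignment are arbitrary choices for illustration):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Accept only plain (non-atomic, non-volatile) 32-bit accesses that are at
// least word aligned.
static bool isSimpleWordAlignedAccess(const llvm::MemSDNode *N) {
  using namespace llvm;
  if (!N->isSimple())
    return false;
  return N->getMemoryVT() == MVT::i32 && N->getAlign().value() >= 4;
}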
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition: Pass.cpp:140
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
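The use/user accessors above support the common pattern of scanning every user of a node. A sketch that checks whether every user is a plain store:

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool allUsersAreStores(llvm::SDNode *N) {
  using namespace llvm;
  if (N->use_empty())
    return false;
  for (SDNode *U : N->users())        // iterate the using nodes
    if (U->getOpcode() != ISD::STORE)
      return false;
  return true;
}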
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:719
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:777
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
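The SelectionDAG factory methods above all follow the same shape: pass an SDLoc, a result type, and the operands, and get back an SDValue. A small sketch that builds (X + 42) != 0; in real lowering code the setcc result type would come from TargetLowering::getSetCCResultType rather than being hard-coded to i1:

#include "llvm/CodeGen/SelectionDAG.h"

static llvm::SDValue buildAddCompare(llvm::SelectionDAG &DAG, llvm::SDValue X,
                                     const llvm::SDLoc &DL) {
  using namespace llvm;
  EVT VT = X.getValueType();
  SDValue C = DAG.getConstant(42, DL, VT);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, C);
  return DAG.getSetCC(DL, MVT::i1, Add, DAG.getConstant(0, DL, VT),
                      ISD::SETNE);
}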
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void reserve(size_type NewNumEntries)
Definition: SmallPtrSet.h:117
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
bool empty() const
Definition: SmallSet.h:169
bool erase(const T &V)
Definition: SmallSet.h:198
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
const unsigned char * bytes_end() const
Definition: StringRef.h:135
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
const unsigned char * bytes_begin() const
Definition: StringRef.h:132
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
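StringSwitch chains Case calls and ends with Default, evaluating to the first matching value. A tiny sketch (the constraint letters and codes are made up for illustration):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static int classifyConstraint(llvm::StringRef Constraint) {
  return llvm::StringSwitch<int>(Constraint)
      .Case("r", 0)  // general-purpose register
      .Case("w", 1)  // FP/SIMD register
      .Default(-1);  // unknown constraint
}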
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
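The configuration hooks above are normally called from a TargetLowering subclass constructor. A rough, hypothetical sketch of that pattern; HypotheticalTargetLowering is a made-up class deriving from TargetLowering, and the specific actions shown are illustrative rather than the actual ARM configuration:

// The hooks used below are protected members inherited from
// TargetLoweringBase, so this must live in a derived class.
void HypotheticalTargetLowering::configureISelHooks(
    const llvm::TargetRegisterInfo *TRI) {
  using namespace llvm;
  addRegisterClass(MVT::i32, &ARM::GPRRegClass);       // i32 lives in GPRs
  setOperationAction(ISD::SDIV, MVT::i32, LibCall);    // lower SDIV via libcall
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);  // custom-lower jump tables
  setSchedulingPreference(Sched::Hybrid);
  setMinFunctionAlignment(Align(4));
  computeRegisterProperties(TRI);  // compute derived properties last
}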
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition: Triple.h:435
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:264
LLVM_ABI void dump() const
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
Type * getElementType() const
Definition: DerivedTypes.h:463
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition: ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
Definition: ARMBaseInfo.h:242
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
Definition: ARMBaseInfo.h:288
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
Definition: ARMBaseInfo.h:270
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
Definition: ARMBaseInfo.h:275
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
Definition: ARMBaseInfo.h:266
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: ARMBaseInfo.h:263
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
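These encoding helpers return -1 when a value cannot be represented, so callers usually just compare against -1. A sketch, assuming the ARM addressing-mode helpers (MCTargetDesc/ARMAddressingModes.h) are in scope:

// True when Imm can be encoded directly as an ARM or Thumb-2
// shifter-operand immediate.
static bool isLegalShifterImmediate(unsigned Imm, bool IsThumb2) {
  using namespace llvm;
  return IsThumb2 ? ARM_AM::getT2SOImmVal(Imm) != -1
                  : ARM_AM::getSOImmVal(Imm) != -1;
}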
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
Definition: CallingConv.h:107
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
Definition: CallingConv.h:111
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
Definition: CallingConv.h:114
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:256
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:504
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1108
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1401
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1491
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1379
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1381
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1382
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1112
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1131
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:167
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1476
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1343
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1135
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1490
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:505
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1574
@ GlobalTLSAddress
Definition: ISDOpcodes.h:89
@ FrameIndex
Definition: ISDOpcodes.h:90
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:706
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1157
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1473
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ WRITE_REGISTER
Definition: ISDOpcodes.h:135
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1477
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:809
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1380
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1492
@ RegisterMask
Definition: ISDOpcodes.h:85
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1485
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1126
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1103
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1383
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ ConstantPool
Definition: ISDOpcodes.h:92
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1493
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition: ISDOpcodes.h:726
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:701
@ VECREDUCE_FMUL
Definition: ISDOpcodes.h:1474
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:122
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ BlockAddress
Definition: ISDOpcodes.h:94
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:713
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1724
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1640
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1691
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1671
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1642
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
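The load/store predicates above are the usual guards in front of a combine. A sketch that accepts only a plain load feeding a single user:

#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isCombinableLoad(llvm::SDValue Op) {
  using namespace llvm;
  SDNode *N = Op.getNode();
  // Only non-extending, unindexed, non-atomic, non-volatile loads with a
  // single user qualify; the cast is safe once isNormalLoad succeeds.
  return ISD::isNormalLoad(N) && Op.hasOneUse() &&
         cast<LoadSDNode>(N)->isSimple();
}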
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
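Each of these RTLIB helpers maps a pair of value types to a libcall enumerator, returning UNKNOWN_LIBCALL when no routine exists. A sketch of the usual lookup-and-check pattern (the helper name and assertion message are illustrative):

#include <cassert>

static llvm::RTLIB::Libcall chooseFPToSIntLibcall(llvm::EVT OpVT,
                                                  llvm::EVT RetVT) {
  using namespace llvm;
  RTLIB::Libcall LC = RTLIB::getFPTOSINT(OpVT, RetVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL && "unsupported FP_TO_SINT conversion");
  return LC;
}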
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:55
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
@ Length
Definition: DWP.cpp:477
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
@ Read
Definition: CodeGenData.h:108
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:260
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:264
ExceptionHandling
Definition: CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:252
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition: STLExtras.h:1563
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit version).
Definition: MathExtras.h:276
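A minimal usage sketch of these two mask predicates; the values below are hypothetical, chosen only to illustrate the difference:
#include "llvm/Support/MathExtras.h"
#include <cassert>

void maskPredicateDemo() {
  assert(llvm::isMask_32(0x0000FFFFu));         // contiguous ones anchored at bit 0
  assert(!llvm::isMask_32(0x0FF00000u));        // run of ones not starting at bit 0
  assert(llvm::isShiftedMask_32(0x0FF00000u));  // a single contiguous run anywhere
  assert(!llvm::isShiftedMask_32(0x0FF00F00u)); // two separate runs of ones
}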
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:157
bool isReleaseOrStronger(AtomicOrdering AO)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, or -1 if the value is zero.
Definition: MathExtras.h:336
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:203
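A small sketch of the bit-counting helpers above, using a hypothetical value (0x50 = 0b101'0000):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void bitCountDemo() {
  assert(llvm::countr_zero(0x50u) == 4);  // four trailing zeros
  assert(llvm::countr_one(0x0Fu) == 4);   // four trailing ones
  assert(llvm::countl_zero(0x50u) == 25); // 32-bit value; highest set bit is bit 6
  assert(llvm::Log2_32(0x50u) == 6);      // floor(log2(80)) == 6
}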
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector containing the elements of the range.
Definition: SmallVector.h:1300
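A short sketch combining to_vector, append_range, and stable_sort on a hypothetical array (not code from this file):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

void containerHelperDemo() {
  int Raw[] = {3, 1, 2};
  auto Vals = llvm::to_vector(Raw);  // copy the array into a SmallVector<int>
  llvm::append_range(Vals, Raw);     // append the same three elements again
  llvm::stable_sort(Vals);           // range-based std::stable_sort
  assert(Vals.size() == 6 && Vals.front() == 1 && Vals.back() == 3);
}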
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition: DAGCombine.h:15
@ BeforeLegalizeTypes
Definition: DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register, or 3 if a literal pool load is needed; if ForCodesize is specified, an approximate cost in bytes is returned.
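A hedged usage sketch; pickCheaperImm is a hypothetical helper, and the code assumes it is built inside the ARM backend (lib/Target/ARM) where ARMBaseInstrInfo.h declares these cost functions:
#include "ARMBaseInstrInfo.h"

unsigned pickCheaperImm(unsigned A, unsigned B, const llvm::ARMSubtarget *Subtarget) {
  // Prefer the immediate whose materialization sequence is estimated to be cheaper.
  return llvm::HasLowerConstantMaterializationCost(A, B, Subtarget) ? A : B;
}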
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition: MathExtras.h:597
bool isAcquireOrStronger(AtomicOrdering AO)
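A minimal sketch of the atomic-ordering predicates listed here (from llvm/Support/AtomicOrdering.h); the specific orderings are arbitrary examples:
#include "llvm/Support/AtomicOrdering.h"
#include <cassert>

void orderingDemo() {
  using llvm::AtomicOrdering;
  assert(llvm::isAcquireOrStronger(AtomicOrdering::SequentiallyConsistent));
  assert(!llvm::isReleaseOrStronger(AtomicOrdering::Monotonic));
  assert(llvm::isStrongerThanMonotonic(AtomicOrdering::Acquire));
}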
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1980
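A small sketch of the range-based wrappers (all_of, any_of, find, count_if) on hypothetical data:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

void rangeWrapperDemo() {
  llvm::SmallVector<int, 4> Vals = {2, 4, 6, 7};
  assert(llvm::all_of(Vals, [](int V) { return V > 0; }));
  assert(llvm::any_of(Vals, [](int V) { return V % 2 != 0; }));
  assert(llvm::find(Vals, 6) != Vals.end());
  assert(llvm::count_if(Vals, [](int V) { return V % 2 == 0; }) == 3);
}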
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:257
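A minimal sketch of isIntN together with its unsigned counterpart isUIntN (both from MathExtras.h), using arbitrary example values:
#include "llvm/Support/MathExtras.h"
#include <cassert>

void bitWidthFitDemo() {
  assert(llvm::isUIntN(8, 255));  // fits in an unsigned 8-bit field
  assert(!llvm::isUIntN(8, 256)); // one past the unsigned 8-bit range
  assert(llvm::isIntN(8, -128));  // fits in a signed 8-bit field
  assert(!llvm::isIntN(8, 128));  // out of the signed 8-bit range
}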
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
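A small sketch of the two alignment helpers with hypothetical values:
#include "llvm/Support/Alignment.h"
#include <cassert>

void alignmentDemo() {
  llvm::Align A(8);
  assert(llvm::alignTo(13, A) == 16);               // round 13 up to a multiple of 8
  assert(llvm::commonAlignment(A, 4).value() == 4); // a 4-byte offset only preserves 4-byte alignment
}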
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
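A hedged sketch of how predOps and condCodeOp are typically appended to a BuildMI chain; addARMPredicate is a hypothetical helper, and the code assumes it lives inside the ARM backend where ARMBaseInstrInfo.h is available:
#include "ARMBaseInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

void addARMPredicate(llvm::MachineInstrBuilder &MIB) {
  // Append the two standard predicate operands (AL = always execute), then an
  // empty cc_out operand for instructions that carry an optional S bit.
  MIB.add(llvm::predOps(llvm::ARMCC::AL)).add(llvm::condCodeOp());
}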
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize (the order of the elements within each block of the vector is reversed).
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
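A minimal sketch of the mask produced by createSequentialMask, with arbitrary parameters:
#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

void sequentialMaskDemo() {
  // Lanes 4..7 followed by two undef (-1) lanes: {4, 5, 6, 7, -1, -1}.
  llvm::SmallVector<int, 16> Mask = llvm::createSequentialMask(4, 4, 2);
  assert(Mask.size() == 6 && Mask[0] == 4 && Mask[5] == -1);
}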
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
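A hedged sketch of how these constant predicates are typically used in a DAG combine; trySimplifyOr is a hypothetical helper and is not code from this file:
#include "llvm/CodeGen/SelectionDAGNodes.h"

llvm::SDValue trySimplifyOr(llvm::SDNode *N) {
  llvm::SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
  if (llvm::isNullConstant(RHS))    // x | 0 --> x
    return LHS;
  if (llvm::isAllOnesConstant(RHS)) // x | ~0 --> ~0
    return RHS;
  return llvm::SDValue();           // no simplification found
}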
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition: Metadata.h:760
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:308
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition: ValueTypes.h:465
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:299
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is replaced by EltVT.
Definition: ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:303
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
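A small sketch of the EVT queries listed above; Ctx is an assumed LLVMContext and the types are arbitrary examples:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void evtDemo(llvm::LLVMContext &Ctx) {
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.is128BitVector() && V4F32.isFloatingPoint());
  assert(V4F32.getVectorNumElements() == 4 && V4F32.getScalarSizeInBits() == 32);
  // Same shape, but with i32 lanes instead of f32 lanes.
  llvm::EVT V4I32 = V4F32.changeVectorElementTypeToInteger();
  assert(V4I32.getVectorElementType() == llvm::MVT::i32);
}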
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:294
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:304
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
Definition: KnownBits.h:340
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:803
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:128
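A minimal sketch of composing KnownBits facts; the widths and constant are hypothetical:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsDemo() {
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 0x0F));
  llvm::KnownBits Z = K.zext(16);                   // upper 8 bits become known zero
  assert(Z.getBitWidth() == 16 && !Z.isUnknown());
  llvm::KnownBits Sum = llvm::KnownBits::add(Z, Z); // known bits of Z + Z
  assert(Sum.getBitWidth() == 16);
}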
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
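A hedged sketch of the MachinePointerInfo factories; describeSpillSlot is a hypothetical helper, and MF/FI are assumed to come from the caller:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

llvm::MachinePointerInfo describeSpillSlot(llvm::MachineFunction &MF, int FI) {
  // Refer to a fixed frame-index slot, then rebase the pointer info by 4 bytes.
  return llvm::MachinePointerInfo::getFixedStack(MF, FI).getWithOffset(4);
}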
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale. If BaseGV is null, there is no BaseGV.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
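A hedged sketch of describing a runtime-library call with CallLoweringInfo; emitLibCall is a hypothetical wrapper, the calling convention is an arbitrary example, and all inputs are assumed to come from the caller:
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include <utility>

std::pair<llvm::SDValue, llvm::SDValue>
emitLibCall(const llvm::TargetLowering &TLI, llvm::SelectionDAG &DAG,
            const llvm::SDLoc &dl, llvm::SDValue Chain, llvm::SDValue Callee,
            llvm::Type *RetTy, llvm::TargetLowering::ArgListTy &&Args) {
  llvm::TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(llvm::CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
      .setDiscardResult(false);
  return TLI.LowerCallTo(CLI); // returns {call result, output chain}
}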
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to the caller.
bool CombineTo(SDValue O, SDValue N)