LLVM 22.0.0git
ARMISelLowering.cpp
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
86#include "llvm/IR/Type.h"
87#include "llvm/IR/User.h"
88#include "llvm/IR/Value.h"
89#include "llvm/MC/MCInstrDesc.h"
91#include "llvm/MC/MCSchedule.h"
98#include "llvm/Support/Debug.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <limits>
112#include <optional>
113#include <tuple>
114#include <utility>
115#include <vector>
116
117using namespace llvm;
118
119#define DEBUG_TYPE "arm-isel"
120
121STATISTIC(NumTailCalls, "Number of tail calls");
122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124STATISTIC(NumConstpoolPromoted,
125 "Number of constants with their storage promoted into constant pools");
126
127static cl::opt<bool>
128ARMInterworking("arm-interworking", cl::Hidden,
129 cl::desc("Enable / disable ARM interworking (for debugging only)"),
130 cl::init(true));
131
133 "arm-promote-constant", cl::Hidden,
134 cl::desc("Enable / disable promotion of unnamed_addr constants into "
135 "constant pools"),
136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
138 "arm-promote-constant-max-size", cl::Hidden,
139 cl::desc("Maximum size of constant to promote into a constant pool"),
140 cl::init(64));
142 "arm-promote-constant-max-total", cl::Hidden,
143 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144 cl::init(128));
145
147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149 cl::init(2));
150
152 "arm-max-base-updates-to-check", cl::Hidden,
153 cl::desc("Maximum number of base-updates to check generating postindex."),
154 cl::init(64));
155
156/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
157constexpr MVT FlagsVT = MVT::i32;
158
159// The APCS parameter registers.
160static const MCPhysReg GPRArgRegs[] = {
161 ARM::R0, ARM::R1, ARM::R2, ARM::R3
162};
163
165 SelectionDAG &DAG, const SDLoc &DL) {
167 assert(Arg.ArgVT.bitsLT(MVT::i32));
168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
169 SDValue Ext =
171 MVT::i32, Trunc);
172 return Ext;
173}
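// (Illustrative note, not part of the upstream file.) For a CMSE non-secure
// call that returns e.g. an i8, the callee cannot be trusted to have extended
// the value in r0, so the caller re-extends it itself:
//   i32 (r0)  --trunc--> i8  --sext/zext (per signext/zeroext attr)--> i32
// The helper above performs exactly this truncate-then-extend sequence and is
// invoked from the call-result lowering further down in this file.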
174
175void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
176 if (VT != PromotedLdStVT) {
178 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
179
181 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
182 }
183
184 MVT ElemTy = VT.getVectorElementType();
185 if (ElemTy != MVT::f64)
189 if (ElemTy == MVT::i32) {
194 } else {
199 }
208 if (VT.isInteger()) {
212 }
213
214 // Neon does not support vector divide/remainder operations.
223
224 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
225 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
227 setOperationAction(Opcode, VT, Legal);
228 if (!VT.isFloatingPoint())
229 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
230 setOperationAction(Opcode, VT, Legal);
231}
232
233void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPRRegClass);
235 addTypeForNEON(VT, MVT::f64);
236}
237
238void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
239 addRegisterClass(VT, &ARM::DPairRegClass);
240 addTypeForNEON(VT, MVT::v2f64);
241}
242
243void ARMTargetLowering::setAllExpand(MVT VT) {
244 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
246
247 // We support these really simple operations even on types where all
248 // the actual arithmetic has to be broken down into simpler
249 // operations or turned into library calls.
254}
255
256void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
257 LegalizeAction Action) {
258 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
260 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
261}
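// (Illustrative example, not in the original source.) A call such as
//   addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
// marks EXTLOAD, ZEXTLOAD and SEXTLOAD from a v4i8 memory type to a v4i32
// result type as legal in one step; the MVE setup below uses this to
// advertise its widening vector loads.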
262
263void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
264 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
265
266 for (auto VT : IntTypes) {
267 addRegisterClass(VT, &ARM::MQPRRegClass);
297
298 // No native support for these.
308
309 // Vector reductions
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre and Post inc are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
337 }
338 }
339
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether we have MVE.fp or not
359
360 // Pre and Post inc are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
367 }
368
369 if (HasMVEFP) {
382
383 // No native support for these.
398 }
399 }
400
401 // Custom-expand smaller-than-legal vector reductions to prevent false zero
402 // items being added.
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
413 // MVE integer-only / float support. Only the FP data processing on the FP
414 // vector types is inhibited at the integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v4i8 to v4i16 or v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre and Post inc on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (TT.isOSBinFormatMachO()) {
523 // Uses VFP for Thumb libfuncs if available.
524 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
525 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
526 // clang-format off
527 static const struct {
528 const RTLIB::Libcall Op;
529 const RTLIB::LibcallImpl Impl;
530 } LibraryCalls[] = {
531 // Single-precision floating-point arithmetic.
532 { RTLIB::ADD_F32, RTLIB::__addsf3vfp },
533 { RTLIB::SUB_F32, RTLIB::__subsf3vfp },
534 { RTLIB::MUL_F32, RTLIB::__mulsf3vfp },
535 { RTLIB::DIV_F32, RTLIB::__divsf3vfp },
536
537 // Double-precision floating-point arithmetic.
538 { RTLIB::ADD_F64, RTLIB::__adddf3vfp },
539 { RTLIB::SUB_F64, RTLIB::__subdf3vfp },
540 { RTLIB::MUL_F64, RTLIB::__muldf3vfp },
541 { RTLIB::DIV_F64, RTLIB::__divdf3vfp },
542
543 // Single-precision comparisons.
544 { RTLIB::OEQ_F32, RTLIB::__eqsf2vfp },
545 { RTLIB::UNE_F32, RTLIB::__nesf2vfp },
546 { RTLIB::OLT_F32, RTLIB::__ltsf2vfp },
547 { RTLIB::OLE_F32, RTLIB::__lesf2vfp },
548 { RTLIB::OGE_F32, RTLIB::__gesf2vfp },
549 { RTLIB::OGT_F32, RTLIB::__gtsf2vfp },
550 { RTLIB::UO_F32, RTLIB::__unordsf2vfp },
551
552 // Double-precision comparisons.
553 { RTLIB::OEQ_F64, RTLIB::__eqdf2vfp },
554 { RTLIB::UNE_F64, RTLIB::__nedf2vfp },
555 { RTLIB::OLT_F64, RTLIB::__ltdf2vfp },
556 { RTLIB::OLE_F64, RTLIB::__ledf2vfp },
557 { RTLIB::OGE_F64, RTLIB::__gedf2vfp },
558 { RTLIB::OGT_F64, RTLIB::__gtdf2vfp },
559 { RTLIB::UO_F64, RTLIB::__unorddf2vfp },
560
561 // Floating-point to integer conversions.
562 // i64 conversions are done via library routines even when generating VFP
563 // instructions, so use the same ones.
564 { RTLIB::FPTOSINT_F64_I32, RTLIB::__fixdfsivfp },
565 { RTLIB::FPTOUINT_F64_I32, RTLIB::__fixunsdfsivfp },
566 { RTLIB::FPTOSINT_F32_I32, RTLIB::__fixsfsivfp },
567 { RTLIB::FPTOUINT_F32_I32, RTLIB::__fixunssfsivfp },
568
569 // Conversions between floating types.
570 { RTLIB::FPROUND_F64_F32, RTLIB::__truncdfsf2vfp },
571 { RTLIB::FPEXT_F32_F64, RTLIB::__extendsfdf2vfp },
572
573 // Integer to floating-point conversions.
574 // i64 conversions are done via library routines even when generating VFP
575 // instructions, so use the same ones.
576 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
577 // e.g., __floatunsidf vs. __floatunssidfvfp.
578 { RTLIB::SINTTOFP_I32_F64, RTLIB::__floatsidfvfp },
579 { RTLIB::UINTTOFP_I32_F64, RTLIB::__floatunssidfvfp },
580 { RTLIB::SINTTOFP_I32_F32, RTLIB::__floatsisfvfp },
581 { RTLIB::UINTTOFP_I32_F32, RTLIB::__floatunssisfvfp },
582 };
583 // clang-format on
584
585 for (const auto &LC : LibraryCalls)
586 setLibcallImpl(LC.Op, LC.Impl);
587 }
588 }
589
590 if (Subtarget->isThumb1Only())
591 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
592 else
593 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
594
595 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
596 Subtarget->hasFPRegs()) {
597 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
598 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
599
604
605 if (!Subtarget->hasVFP2Base())
606 setAllExpand(MVT::f32);
607 if (!Subtarget->hasFP64())
608 setAllExpand(MVT::f64);
609 }
610
611 if (Subtarget->hasFullFP16()) {
612 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
615
618 }
619
620 if (Subtarget->hasBF16()) {
621 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
622 setAllExpand(MVT::bf16);
623 if (!Subtarget->hasFullFP16())
625 } else {
630 }
631
633 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
634 setTruncStoreAction(VT, InnerVT, Expand);
635 addAllExtLoads(VT, InnerVT, Expand);
636 }
637
640
642 }
643
644 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
646
647 if (!Subtarget->hasV8_1MMainlineOps())
649
652
655
656 if (Subtarget->hasMVEIntegerOps())
657 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
658
659 // Combine low-overhead loop intrinsics so that we can lower i1 types.
660 if (Subtarget->hasLOB()) {
662 }
663
664 if (Subtarget->hasNEON()) {
665 addDRTypeForNEON(MVT::v2f32);
666 addDRTypeForNEON(MVT::v8i8);
667 addDRTypeForNEON(MVT::v4i16);
668 addDRTypeForNEON(MVT::v2i32);
669 addDRTypeForNEON(MVT::v1i64);
670
671 addQRTypeForNEON(MVT::v4f32);
672 addQRTypeForNEON(MVT::v2f64);
673 addQRTypeForNEON(MVT::v16i8);
674 addQRTypeForNEON(MVT::v8i16);
675 addQRTypeForNEON(MVT::v4i32);
676 addQRTypeForNEON(MVT::v2i64);
677
678 if (Subtarget->hasFullFP16()) {
679 addQRTypeForNEON(MVT::v8f16);
680 addDRTypeForNEON(MVT::v4f16);
681 }
682
683 if (Subtarget->hasBF16()) {
684 addQRTypeForNEON(MVT::v8bf16);
685 addDRTypeForNEON(MVT::v4bf16);
686 }
687 }
688
689 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
690 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
691 // none of Neon, MVE or VFP supports any arithmetic operations on it.
692 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
693 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
694 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
695 // FIXME: Code duplication: FDIV and FREM are expanded always, see
696 // ARMTargetLowering::addTypeForNEON method for details.
697 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
698 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
699 // FIXME: Create unittest.
700 // In other words, find a case in which "copysign" appears in a DAG with vector
701 // operands.
703 // FIXME: Code duplication: SETCC has custom operation action, see
704 // ARMTargetLowering::addTypeForNEON method for details.
706 // FIXME: Create unittest for FNEG and for FABS.
707 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
708 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
710 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
711 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
712 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
713 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
714 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
717 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
726 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
727 }
728
729 if (Subtarget->hasNEON()) {
730 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
731 // supported for v4f32.
733 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
734 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
735 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
736 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
737 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
740 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
749
750 // Mark v2f32 intrinsics.
752 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
753 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
754 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
755 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
756 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
759 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
768
771 setOperationAction(Op, MVT::v4f16, Expand);
772 setOperationAction(Op, MVT::v8f16, Expand);
773 }
774
775 // Neon does not support some operations on v1i64 and v2i64 types.
776 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
777 // Custom handling for some quad-vector types to detect VMULL.
778 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
779 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
780 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
781 // Custom handling for some vector types to avoid expensive expansions
782 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
784 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
786 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
787 // a destination type that is wider than the source, nor does
788 // it have a FP_TO_[SU]INT instruction with a narrower destination than
789 // source.
798
801
802 // NEON does not have single instruction CTPOP for vectors with element
803 // types wider than 8 bits. However, custom lowering can leverage the
804 // v8i8/v16i8 vcnt instruction.
811
812 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
813 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
814
815 // NEON does not have single instruction CTTZ for vectors.
817 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
818 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
819 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
820
821 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
822 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
823 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
824 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
825
830
835
839 }
840
841 // NEON only has FMA instructions as of VFP4.
842 if (!Subtarget->hasVFP4Base()) {
843 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
844 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
845 }
846
849
850 // It is legal to extload from v4i8 to v4i16 or v4i32.
851 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
852 MVT::v2i32}) {
857 }
858 }
859
860 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
861 MVT::v4i32}) {
866 }
867 }
868
869 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
876 }
877 if (Subtarget->hasMVEIntegerOps()) {
880 ISD::SETCC});
881 }
882 if (Subtarget->hasMVEFloatOps()) {
884 }
885
886 if (!Subtarget->hasFP64()) {
887 // When targeting a floating-point unit with only single-precision
888 // operations, f64 is legal for the few double-precision instructions which
889 // are present. However, no double-precision operations other than moves,
890 // loads and stores are provided by the hardware.
929 }
930
931 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
934 if (Subtarget->hasFullFP16()) {
937 }
938 }
939
940 if (!Subtarget->hasFP16()) {
943 }
944
946
947 // ARM does not have floating-point extending loads.
948 for (MVT VT : MVT::fp_valuetypes()) {
949 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
950 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
951 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
952 }
953
954 // ... or truncating stores
955 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
956 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
957 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
958 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
959 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
960
961 // ARM does not have i1 sign extending load.
962 for (MVT VT : MVT::integer_valuetypes())
964
965 // ARM supports all 4 flavors of integer indexed load / store.
966 if (!Subtarget->isThumb1Only()) {
967 for (unsigned im = (unsigned)ISD::PRE_INC;
969 setIndexedLoadAction(im, MVT::i1, Legal);
970 setIndexedLoadAction(im, MVT::i8, Legal);
971 setIndexedLoadAction(im, MVT::i16, Legal);
972 setIndexedLoadAction(im, MVT::i32, Legal);
973 setIndexedStoreAction(im, MVT::i1, Legal);
974 setIndexedStoreAction(im, MVT::i8, Legal);
975 setIndexedStoreAction(im, MVT::i16, Legal);
976 setIndexedStoreAction(im, MVT::i32, Legal);
977 }
978 } else {
979 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
982 }
983
988
991 if (Subtarget->hasDSP()) {
1000 }
1001 if (Subtarget->hasBaseDSP()) {
1004 }
1005
1006 // i64 operation support.
1009 if (Subtarget->isThumb1Only()) {
1012 }
1013 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1014 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1016
1026
1027 // MVE lowers 64-bit shifts to lsll and lsrl,
1028 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1029 if (Subtarget->hasMVEIntegerOps())
1031
1032 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1033 if (Subtarget->isThumb1Only()) {
1037 }
1038
1039 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1041
1042 // ARM does not have ROTL.
1047 }
1049 // TODO: These two should be set to LibCall, but this currently breaks
1050 // the Linux kernel build. See #101786.
1053 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1056 }
1057
1058 // @llvm.readcyclecounter requires the Performance Monitors extension.
1059 // Default to the 0 expansion on unsupported platforms.
1060 // FIXME: Technically there are older ARM CPUs that have
1061 // implementation-specific ways of obtaining this information.
1062 if (Subtarget->hasPerfMon())
1064
1065 // Only ARMv6 has BSWAP.
1066 if (!Subtarget->hasV6Ops())
1068
1069 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1070 : Subtarget->hasDivideInARMMode();
1071 if (!hasDivide) {
1072 // These are expanded into libcalls if the cpu doesn't have HW divider.
1075 }
1076
1077 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1080
1083 }
1084
1087
1088 // Register based DivRem for AEABI (RTABI 4.2)
1089 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1090 TT.isTargetMuslAEABI() || TT.isOSWindows()) {
1093 HasStandaloneRem = false;
1094
1099 } else {
1102 }
1103
1108
1109 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1111
1112 // Use the default implementation.
1114 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1116 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1119
1120 if (TT.isOSWindows())
1122 else
1124
1125 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1126 // the default expansion.
1127 InsertFencesForAtomic = false;
1128 if (Subtarget->hasAnyDataBarrier() &&
1129 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1130 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1131 // to ldrex/strex loops already.
1133 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1135
1136 // On v8, we have particularly efficient implementations of atomic fences
1137 // if they can be combined with nearby atomic loads and stores.
1138 if (!Subtarget->hasAcquireRelease() ||
1139 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1140 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1141 InsertFencesForAtomic = true;
1142 }
1143 } else {
1144 // If there's anything we can use as a barrier, go through custom lowering
1145 // for ATOMIC_FENCE.
1146 // If target has DMB in thumb, Fences can be inserted.
1147 if (Subtarget->hasDataBarrier())
1148 InsertFencesForAtomic = true;
1149
1151 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1152
1153 // Set them all for libcall, which will force libcalls.
1166 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1167 // Unordered/Monotonic case.
1168 if (!InsertFencesForAtomic) {
1171 }
1172 }
1173
1174 // Compute supported atomic widths.
1175 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1176 // For targets where __sync_* routines are reliably available, we use them
1177 // if necessary.
1178 //
1179 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1180 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1181 //
1182 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1183 // such targets should provide __sync_* routines, which use the ARM mode
1184 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1185 // encoding; see ARMISD::MEMBARRIER_MCR.)
1187 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1188 Subtarget->hasForced32BitAtomics()) {
1189 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1191 } else {
1192 // We can't assume anything about other targets; just use libatomic
1193 // routines.
1195 }
1196
1198
1200
1201 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1202 if (!Subtarget->hasV6Ops()) {
1205 }
1207
1208 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1209 !Subtarget->isThumb1Only()) {
1210 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1211 // iff target supports vfp2.
1221 }
1222
1223 // We want to custom lower some of our intrinsics.
1228
1238 if (Subtarget->hasFullFP16()) {
1242 }
1243
1245
1248 if (Subtarget->hasFullFP16())
1252 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1253
1254 // We don't support sin/cos/fmod/copysign/pow
1263 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1264 !Subtarget->isThumb1Only()) {
1267 }
1270
1271 if (!Subtarget->hasVFP4Base()) {
1274 }
1275
1276 // Various VFP goodness
1277 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1278 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1279 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1282 }
1283
1284 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1285 if (!Subtarget->hasFP16()) {
1288 }
1289
1290 // Strict floating-point comparisons need custom lowering.
1297 }
1298
1299 // Use __sincos_stret if available.
1300 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1301 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1304 }
1305
1306 // FP-ARMv8 implements a lot of rounding-like FP operations.
1307 if (Subtarget->hasFPARMv8Base()) {
1317 if (Subtarget->hasNEON()) {
1322 }
1323
1324 if (Subtarget->hasFP64()) {
1334 }
1335 }
1336
1337 // FP16 often need to be promoted to call lib functions
1338 if (Subtarget->hasFullFP16()) {
1353
1361 }
1362
1363 if (Subtarget->hasNEON()) {
1364 // vmin and vmax aren't available in a scalar form, so we can use
1365 // a NEON instruction with an undef lane instead.
1374
1375 if (Subtarget->hasV8Ops()) {
1376 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1377 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1378 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1379 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1382 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1383 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1384 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1385 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1386 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1387 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1388 }
1389
1390 if (Subtarget->hasFullFP16()) {
1395
1400
1401 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1402 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1403 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1404 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1407 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1408 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1409 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1410 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1411 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1412 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1413 }
1414 }
1415
1416 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1417 // it, but it's just a wrapper around ldexp.
1418 if (TT.isOSWindows()) {
1420 if (isOperationExpand(Op, MVT::f32))
1421 setOperationAction(Op, MVT::f32, Promote);
1422 }
1423
1424 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1425 // isn't legal.
1427 if (isOperationExpand(Op, MVT::f16))
1428 setOperationAction(Op, MVT::f16, Promote);
1429
1430 // We have target-specific dag combine patterns for the following nodes:
1431 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1434
1435 if (Subtarget->hasMVEIntegerOps())
1437
1438 if (Subtarget->hasV6Ops())
1440 if (Subtarget->isThumb1Only())
1442 // Attempt to lower smin/smax to ssat/usat
1443 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1444 Subtarget->isThumb2()) {
1446 }
1447
1449
1450 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1451 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1453 else
1455
1456 //// temporary - rewrite interface to use type
1459 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1461 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1463
1464 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1465 // are at least 4 bytes aligned.
1467
1468 // Prefer likely predicted branches to selects on out-of-order cores.
1469 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1470
1473 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1474
1475 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1476}
1477
1479 return Subtarget->useSoftFloat();
1480}
1481
1483 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1484}
1485
1486// FIXME: It might make sense to define the representative register class as the
1487// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1488 // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1489// SPR's representative would be DPR_VFP2. This should work well if register
1490// pressure tracking were modified such that a register use would increment the
1491 // pressure of the register class's representative and all of its super
1492// classes' representatives transitively. We have not implemented this because
1493// of the difficulty prior to coalescing of modeling operand register classes
1494// due to the common occurrence of cross class copies and subregister insertions
1495// and extractions.
1496std::pair<const TargetRegisterClass *, uint8_t>
1498 MVT VT) const {
1499 const TargetRegisterClass *RRC = nullptr;
1500 uint8_t Cost = 1;
1501 switch (VT.SimpleTy) {
1502 default:
1504 // Use DPR as representative register class for all floating point
1505 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1506 // the cost is 1 for both f32 and f64.
1507 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1508 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1509 RRC = &ARM::DPRRegClass;
1510 // When NEON is used for SP, only half of the register file is available
1511 // because operations that define both SP and DP results will be constrained
1512 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1513 // coalescing by double-counting the SP regs. See the FIXME above.
1514 if (Subtarget->useNEONForSinglePrecisionFP())
1515 Cost = 2;
1516 break;
1517 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1518 case MVT::v4f32: case MVT::v2f64:
1519 RRC = &ARM::DPRRegClass;
1520 Cost = 2;
1521 break;
1522 case MVT::v4i64:
1523 RRC = &ARM::DPRRegClass;
1524 Cost = 4;
1525 break;
1526 case MVT::v8i64:
1527 RRC = &ARM::DPRRegClass;
1528 Cost = 8;
1529 break;
1530 }
1531 return std::make_pair(RRC, Cost);
1532}
1533
1534const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1535#define MAKE_CASE(V) \
1536 case V: \
1537 return #V;
1538 switch ((ARMISD::NodeType)Opcode) {
1540 break;
1743#undef MAKE_CASE
1744 }
1745 return nullptr;
1746}
1747
1749 EVT VT) const {
1750 if (!VT.isVector())
1751 return getPointerTy(DL);
1752
1753 // MVE has a predicate register.
1754 if ((Subtarget->hasMVEIntegerOps() &&
1755 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1756 VT == MVT::v16i8)) ||
1757 (Subtarget->hasMVEFloatOps() &&
1758 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1759 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1761}
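// (Editorial illustration, not upstream text.) With MVE enabled, a compare
// such as (setcc v4i32, v4i32, setgt) therefore produces a v4i1 predicate
// value, which is assigned to the VCCR register class added above rather than
// being widened to an integer vector.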
1762
1763/// getRegClassFor - Return the register class that should be used for the
1764/// specified value type.
1765const TargetRegisterClass *
1766ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1767 (void)isDivergent;
1768 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1769 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1770 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1771 // MVE Q registers.
1772 if (Subtarget->hasNEON()) {
1773 if (VT == MVT::v4i64)
1774 return &ARM::QQPRRegClass;
1775 if (VT == MVT::v8i64)
1776 return &ARM::QQQQPRRegClass;
1777 }
1778 if (Subtarget->hasMVEIntegerOps()) {
1779 if (VT == MVT::v4i64)
1780 return &ARM::MQQPRRegClass;
1781 if (VT == MVT::v8i64)
1782 return &ARM::MQQQQPRRegClass;
1783 }
1785}
1786
1787 // memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1788// source/dest is aligned and the copy size is large enough. We therefore want
1789// to align such objects passed to memory intrinsics.
1791 Align &PrefAlign) const {
1792 if (!isa<MemIntrinsic>(CI))
1793 return false;
1794 MinSize = 8;
1795 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1796 // cycle faster than 4-byte aligned LDM.
1797 PrefAlign =
1798 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1799 return true;
1800}
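// (Illustrative sketch, not part of the source.) Generic IR preparation code
// queries this hook for calls like
//   call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 64, i1 false)
// and, for pointer arguments backed by objects of at least MinSize (8) bytes,
// may raise their alignment to PrefAlign so the expansion can use
// 8-byte-aligned LDM/STM sequences on v6-and-later non-M-class cores.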
1801
1802// Create a fast isel object.
1803FastISel *
1805 const TargetLibraryInfo *libInfo) const {
1806 return ARM::createFastISel(funcInfo, libInfo);
1807}
1808
1810 unsigned NumVals = N->getNumValues();
1811 if (!NumVals)
1812 return Sched::RegPressure;
1813
1814 for (unsigned i = 0; i != NumVals; ++i) {
1815 EVT VT = N->getValueType(i);
1816 if (VT == MVT::Glue || VT == MVT::Other)
1817 continue;
1818 if (VT.isFloatingPoint() || VT.isVector())
1819 return Sched::ILP;
1820 }
1821
1822 if (!N->isMachineOpcode())
1823 return Sched::RegPressure;
1824
1825 // Loads are scheduled for latency even if the instruction itinerary
1826 // is not available.
1827 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1828 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1829
1830 if (MCID.getNumDefs() == 0)
1831 return Sched::RegPressure;
1832 if (!Itins->isEmpty() &&
1833 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1834 return Sched::ILP;
1835
1836 return Sched::RegPressure;
1837}
1838
1839//===----------------------------------------------------------------------===//
1840// Lowering Code
1841//===----------------------------------------------------------------------===//
1842
1843static bool isSRL16(const SDValue &Op) {
1844 if (Op.getOpcode() != ISD::SRL)
1845 return false;
1846 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1847 return Const->getZExtValue() == 16;
1848 return false;
1849}
1850
1851static bool isSRA16(const SDValue &Op) {
1852 if (Op.getOpcode() != ISD::SRA)
1853 return false;
1854 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1855 return Const->getZExtValue() == 16;
1856 return false;
1857}
1858
1859static bool isSHL16(const SDValue &Op) {
1860 if (Op.getOpcode() != ISD::SHL)
1861 return false;
1862 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1863 return Const->getZExtValue() == 16;
1864 return false;
1865}
1866
1867// Check for a signed 16-bit value. We special case SRA because it makes it
1868 // simpler when also looking for SRAs that aren't sign-extending a
1869// smaller value. Without the check, we'd need to take extra care with
1870// checking order for some operations.
1871static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1872 if (isSRA16(Op))
1873 return isSHL16(Op.getOperand(0));
1874 return DAG.ComputeNumSignBits(Op) == 17;
1875}
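// (Illustrative note, not from the upstream file.) An i32 value with exactly
// 17 known sign bits is a sign-extended 16-bit quantity, e.g.
//   (sra (shl X, 16), 16)
// which the explicit SRA/SHL check above also recognises. These predicates are
// used by DAG combines later in this file that match operands of the DSP
// 16-bit multiply instructions (SMULBB/SMLABB and friends).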
1876
1877/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1879 switch (CC) {
1880 default: llvm_unreachable("Unknown condition code!");
1881 case ISD::SETNE: return ARMCC::NE;
1882 case ISD::SETEQ: return ARMCC::EQ;
1883 case ISD::SETGT: return ARMCC::GT;
1884 case ISD::SETGE: return ARMCC::GE;
1885 case ISD::SETLT: return ARMCC::LT;
1886 case ISD::SETLE: return ARMCC::LE;
1887 case ISD::SETUGT: return ARMCC::HI;
1888 case ISD::SETUGE: return ARMCC::HS;
1889 case ISD::SETULT: return ARMCC::LO;
1890 case ISD::SETULE: return ARMCC::LS;
1891 }
1892}
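// (Example, editor's illustration.) Lowering (brcond (setcc i32 %a, %b, setult))
// emits a CMP of %a and %b followed by a branch predicated on ARMCC::LO
// (unsigned lower); the table above simply selects the condition-code field of
// the eventual conditional instruction.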
1893
1894/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1896 ARMCC::CondCodes &CondCode2) {
1897 CondCode2 = ARMCC::AL;
1898 switch (CC) {
1899 default: llvm_unreachable("Unknown FP condition!");
1900 case ISD::SETEQ:
1901 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1902 case ISD::SETGT:
1903 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1904 case ISD::SETGE:
1905 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1906 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1907 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1908 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1909 case ISD::SETO: CondCode = ARMCC::VC; break;
1910 case ISD::SETUO: CondCode = ARMCC::VS; break;
1911 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1912 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1913 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1914 case ISD::SETLT:
1915 case ISD::SETULT: CondCode = ARMCC::LT; break;
1916 case ISD::SETLE:
1917 case ISD::SETULE: CondCode = ARMCC::LE; break;
1918 case ISD::SETNE:
1919 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1920 }
1921}
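// (Editorial illustration.) Some FP predicates have no single ARM condition
// and therefore set CondCode2: e.g. SETONE ("ordered and not equal") becomes
// MI followed by GT, so the lowering emits a second conditional instruction
// whenever CondCode2 != ARMCC::AL.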
1922
1923//===----------------------------------------------------------------------===//
1924// Calling Convention Implementation
1925//===----------------------------------------------------------------------===//
1926
1927/// getEffectiveCallingConv - Get the effective calling convention, taking into
1928 /// account the presence of floating-point hardware and calling convention
1929/// limitations, such as support for variadic functions.
1931ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1932 bool isVarArg) const {
1933 switch (CC) {
1934 default:
1935 report_fatal_error("Unsupported calling convention");
1938 case CallingConv::GHC:
1940 return CC;
1946 case CallingConv::Swift:
1949 case CallingConv::C:
1950 case CallingConv::Tail:
1951 if (!getTM().isAAPCS_ABI())
1952 return CallingConv::ARM_APCS;
1953 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1954 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1955 !isVarArg)
1957 else
1959 case CallingConv::Fast:
1961 if (!getTM().isAAPCS_ABI()) {
1962 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1963 return CallingConv::Fast;
1964 return CallingConv::ARM_APCS;
1965 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1966 !isVarArg)
1968 else
1970 }
1971}
1972
1974 bool isVarArg) const {
1975 return CCAssignFnForNode(CC, false, isVarArg);
1976}
1977
1979 bool isVarArg) const {
1980 return CCAssignFnForNode(CC, true, isVarArg);
1981}
1982
1983/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1984/// CallingConvention.
1985CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1986 bool Return,
1987 bool isVarArg) const {
1988 switch (getEffectiveCallingConv(CC, isVarArg)) {
1989 default:
1990 report_fatal_error("Unsupported calling convention");
1992 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1994 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1996 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1997 case CallingConv::Fast:
1998 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1999 case CallingConv::GHC:
2000 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2002 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2004 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2006 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2007 }
2008}
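// (Illustrative example, not from the source.) On an AAPCS target built for
// the hard-float ABI with VFP available, a non-variadic CallingConv::C call is
// mapped to ARM_AAPCS_VFP above, so CC_ARM_AAPCS_VFP places FP arguments in
// s0-s15/d0-d7; the same call made variadic falls back to ARM_AAPCS and passes
// FP values in core registers or on the stack instead.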
2009
2010SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2011 MVT LocVT, MVT ValVT, SDValue Val) const {
2012 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2013 Val);
2014 if (Subtarget->hasFullFP16()) {
2015 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2016 } else {
2017 Val = DAG.getNode(ISD::TRUNCATE, dl,
2018 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2019 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2020 }
2021 return Val;
2022}
2023
2024SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2025 MVT LocVT, MVT ValVT,
2026 SDValue Val) const {
2027 if (Subtarget->hasFullFP16()) {
2028 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2029 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2030 } else {
2031 Val = DAG.getNode(ISD::BITCAST, dl,
2032 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2033 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2034 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2035 }
2036 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2037}
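// (Sketch, editor's addition.) On a target without the full FP16 extension, an
// f16 value that must travel in an f32 location (hard-float ABI) goes through
// integer types: MoveFromHPR performs
//   f16 --bitcast--> i16 --zext--> i32 --bitcast--> f32
// and MoveToHPR reverses the trip on the receiving side; with +fullfp16 the
// single VMOVrh/VMOVhr nodes are used instead.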
2038
2039/// LowerCallResult - Lower the result values of a call into the
2040/// appropriate copies out of appropriate physical registers.
2041SDValue ARMTargetLowering::LowerCallResult(
2042 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2043 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2044 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2045 SDValue ThisVal, bool isCmseNSCall) const {
2046 // Assign locations to each value returned by this call.
2048 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2049 *DAG.getContext());
2050 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2051
2052 // Copy all of the result registers out of their specified physreg.
2053 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2054 CCValAssign VA = RVLocs[i];
2055
2056 // Pass 'this' value directly from the argument to return value, to avoid
2057 // reg unit interference
2058 if (i == 0 && isThisReturn) {
2059 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2060 "unexpected return calling convention register assignment");
2061 InVals.push_back(ThisVal);
2062 continue;
2063 }
2064
2065 SDValue Val;
2066 if (VA.needsCustom() &&
2067 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2068 // Handle f64 or half of a v2f64.
2069 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2070 InGlue);
2071 Chain = Lo.getValue(1);
2072 InGlue = Lo.getValue(2);
2073 VA = RVLocs[++i]; // skip ahead to next loc
2074 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2075 InGlue);
2076 Chain = Hi.getValue(1);
2077 InGlue = Hi.getValue(2);
2078 if (!Subtarget->isLittle())
2079 std::swap (Lo, Hi);
2080 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2081
2082 if (VA.getLocVT() == MVT::v2f64) {
2083 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2084 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2085 DAG.getConstant(0, dl, MVT::i32));
2086
2087 VA = RVLocs[++i]; // skip ahead to next loc
2088 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2089 Chain = Lo.getValue(1);
2090 InGlue = Lo.getValue(2);
2091 VA = RVLocs[++i]; // skip ahead to next loc
2092 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2093 Chain = Hi.getValue(1);
2094 InGlue = Hi.getValue(2);
2095 if (!Subtarget->isLittle())
2096 std::swap (Lo, Hi);
2097 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2098 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2099 DAG.getConstant(1, dl, MVT::i32));
2100 }
2101 } else {
2102 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2103 InGlue);
2104 Chain = Val.getValue(1);
2105 InGlue = Val.getValue(2);
2106 }
2107
2108 switch (VA.getLocInfo()) {
2109 default: llvm_unreachable("Unknown loc info!");
2110 case CCValAssign::Full: break;
2111 case CCValAssign::BCvt:
2112 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2113 break;
2114 }
2115
2116 // f16 arguments have their size extended to 4 bytes and passed as if they
2117 // had been copied to the LSBs of a 32-bit register.
2118 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
2119 if (VA.needsCustom() &&
2120 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2121 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2122
2123 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2124 // is less than 32 bits must be sign- or zero-extended after the call for
2125 // security reasons. Although the ABI mandates an extension done by the
2126 // callee, the latter cannot be trusted to follow the rules of the ABI.
2127 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2128 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2129 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2130 Val = handleCMSEValue(Val, Arg, DAG, dl);
2131
2132 InVals.push_back(Val);
2133 }
2134
2135 return Chain;
2136}
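// (Illustrative note, not upstream.) When an f64 is returned in core registers,
// its two halves arrive in consecutive GPRs (r0/r1) and are recombined above
// with ARMISD::VMOVDRR; a v2f64 return consumes four GPRs and is rebuilt with
// two VMOVDRRs plus INSERT_VECTOR_ELT, with the halves swapped on big-endian
// targets.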
2137
2138std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2139 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2140 bool IsTailCall, int SPDiff) const {
2141 SDValue DstAddr;
2142 MachinePointerInfo DstInfo;
2143 int32_t Offset = VA.getLocMemOffset();
2145
2146 if (IsTailCall) {
2147 Offset += SPDiff;
2148 auto PtrVT = getPointerTy(DAG.getDataLayout());
2149 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2150 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2151 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2152 DstInfo =
2154 } else {
2155 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2156 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2157 StackPtr, PtrOff);
2158 DstInfo =
2160 }
2161
2162 return std::make_pair(DstAddr, DstInfo);
2163}
2164
2165// Returns the type of copying which is required to set up a byval argument to
2166// a tail-called function. This isn't needed for non-tail calls, because they
2167 // always need the equivalent of CopyOnce, but tail-calls sometimes need two copies to
2168// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2169// optimised to zero copies when forwarding an argument from the caller's
2170// caller (NoCopy).
2171ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2172 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2175
2176 // Globals are always safe to copy from.
2177 if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
2178 return CopyOnce;
2179
2180 // Can only analyse frame index nodes, conservatively assume we need a
2181 // temporary.
2182 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2183 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2184 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2185 return CopyViaTemp;
2186
2187 int SrcFI = SrcFrameIdxNode->getIndex();
2188 int DstFI = DstFrameIdxNode->getIndex();
2189 assert(MFI.isFixedObjectIndex(DstFI) &&
2190 "byval passed in non-fixed stack slot");
2191
2192 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2193 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2194
2195 // If the source is in the local frame, then the copy to the argument memory
2196 // is always valid.
2197 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2198 if (!FixedSrc ||
2199 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2200 return CopyOnce;
2201
2202 // In the case of byval arguments split between registers and the stack,
2203 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2204 // stack portion, but the Src SDValue will refer to the full value, including
2205 // the local stack memory that the register portion gets stored into. We only
2206 // need to compare them for equality, so normalise on the full value version.
2207 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2208 DstOffset -= RegSize;
2209
2210 // If the value is already in the correct location, then no copying is
2211 // needed. If not, then we need to copy via a temporary.
2212 if (SrcOffset == DstOffset)
2213 return NoCopy;
2214 else
2215 return CopyViaTemp;
2216}
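// (Example, editor's sketch.) If a function simply forwards its own byval
// parameter to a tail call at the same fixed stack offset, the source and
// destination objects coincide and NoCopy is returned; if the byval source
// instead lives in the outgoing-argument area, where a direct copy could
// clobber another outgoing argument, CopyViaTemp forces a bounce through a
// temporary in the local frame.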
2217
2218void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2219 SDValue Chain, SDValue &Arg,
2220 RegsToPassVector &RegsToPass,
2221 CCValAssign &VA, CCValAssign &NextVA,
2222 SDValue &StackPtr,
2223 SmallVectorImpl<SDValue> &MemOpChains,
2224 bool IsTailCall,
2225 int SPDiff) const {
2226 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2227 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2228 unsigned id = Subtarget->isLittle() ? 0 : 1;
2229 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2230
2231 if (NextVA.isRegLoc())
2232 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2233 else {
2234 assert(NextVA.isMemLoc());
2235 if (!StackPtr.getNode())
2236 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2238
2239 SDValue DstAddr;
2240 MachinePointerInfo DstInfo;
2241 std::tie(DstAddr, DstInfo) =
2242 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2243 MemOpChains.push_back(
2244 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2245 }
2246}
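// (Illustrative note.) PassF64ArgInRegs splits an f64 with ARMISD::VMOVRRD
// into two i32 halves: one half goes in VA's register and the other in
// NextVA's register, or into the stack slot computed by computeAddrForCallArg
// when the second half is assigned to memory. The `id` selector above swaps
// the halves on big-endian targets so the in-memory layout stays correct.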
2247
2248static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2249 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2251}
2252
2253/// LowerCall - Lowering a call into a callseq_start <-
2254/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2255/// nodes.
2256SDValue
2257ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2258 SmallVectorImpl<SDValue> &InVals) const {
2259 SelectionDAG &DAG = CLI.DAG;
2260 SDLoc &dl = CLI.DL;
2262 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2264 SDValue Chain = CLI.Chain;
2265 SDValue Callee = CLI.Callee;
2266 bool &isTailCall = CLI.IsTailCall;
2267 CallingConv::ID CallConv = CLI.CallConv;
2268 bool doesNotRet = CLI.DoesNotReturn;
2269 bool isVarArg = CLI.IsVarArg;
2270 const CallBase *CB = CLI.CB;
2271
2276 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2277 bool isThisReturn = false;
2278 bool isCmseNSCall = false;
2279 bool isSibCall = false;
2280 bool PreferIndirect = false;
2281 bool GuardWithBTI = false;
2282
2283 // Analyze operands of the call, assigning locations to each operand.
2285 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2286 *DAG.getContext());
2287 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2288
2289 // Lower 'returns_twice' calls to a pseudo-instruction.
2290 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2291 !Subtarget->noBTIAtReturnTwice())
2292 GuardWithBTI = AFI->branchTargetEnforcement();
2293
2294 // Set type id for call site info.
2295 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2296 CSInfo = MachineFunction::CallSiteInfo(*CB);
2297
2298 // Determine whether this is a non-secure function call.
2299 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2300 isCmseNSCall = true;
2301
2302 // Disable tail calls if they're not supported.
2303 if (!Subtarget->supportsTailCall())
2304 isTailCall = false;
2305
2306 // For both the non-secure calls and the returns from a CMSE entry function,
2307 // the function needs to do some extra work after the call, or before the
2308 // return, respectively, thus it cannot end with a tail call
2309 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2310 isTailCall = false;
2311
2312 if (isa<GlobalAddressSDNode>(Callee)) {
2313 // If we're optimizing for minimum size and the function is called three or
2314 // more times in this block, we can improve codesize by calling indirectly
2315 // as BLXr has a 16-bit encoding.
2316 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2317 if (CLI.CB) {
2318 auto *BB = CLI.CB->getParent();
2319 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2320 count_if(GV->users(), [&BB](const User *U) {
2321 return isa<Instruction>(U) &&
2322 cast<Instruction>(U)->getParent() == BB;
2323 }) > 2;
2324 }
2325 }
2326 if (isTailCall) {
2327 // Check if it's really possible to do a tail call.
2328 isTailCall =
2329 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2330
2331 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2332 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2333 isSibCall = true;
2334
2335 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2336 // detected sibcalls.
2337 if (isTailCall)
2338 ++NumTailCalls;
2339 }
2340
2341 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2342 report_fatal_error("failed to perform tail call elimination on a call "
2343 "site marked musttail");
2344
2345 // Get a count of how many bytes are to be pushed on the stack.
2346 unsigned NumBytes = CCInfo.getStackSize();
2347
2348 // SPDiff is the byte offset of the call's argument area from the callee's.
2349 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2350 // by this amount for a tail call. In a sibling call it must be 0 because the
2351 // caller will deallocate the entire stack and the callee still expects its
2352 // arguments to begin at SP+0. Completely unused for non-tail calls.
2353 int SPDiff = 0;
2354
2355 if (isTailCall && !isSibCall) {
2356 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2357 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2358
2359 // Since callee will pop argument stack as a tail call, we must keep the
2360 // popped size 16-byte aligned.
2361 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2362 assert(StackAlign && "data layout string is missing stack alignment");
2363 NumBytes = alignTo(NumBytes, *StackAlign);
2364
2365 // SPDiff will be negative if this tail call requires more space than we
2366 // would automatically have in our incoming argument space. Positive if we
2367 // can actually shrink the stack.
2368 SPDiff = NumReusableBytes - NumBytes;
2369
2370 // If this call requires more stack than we have available from
2371 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2372 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2373 AFI->setArgRegsSaveSize(-SPDiff);
2374 }
2375
2376 if (isSibCall) {
2377 // For sibling tail calls, memory operands are available in our caller's stack.
2378 NumBytes = 0;
2379 } else {
2380 // Adjust the stack pointer for the new arguments...
2381 // These operations are automatically eliminated by the prolog/epilog pass
2382 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2383 }
2384
2386 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2387
2388 RegsToPassVector RegsToPass;
2389 SmallVector<SDValue, 8> MemOpChains;
2390
2391 // If we are doing a tail-call, any byval arguments will be written to stack
2392 // space which was used for incoming arguments. If any of the values being used
2393 // are incoming byval arguments to this function, then they might be
2394 // overwritten by the stores of the outgoing arguments. To avoid this, we
2395 // need to make a temporary copy of them in local stack space, then copy back
2396 // to the argument area.
2397 DenseMap<unsigned, SDValue> ByValTemporaries;
2398 SDValue ByValTempChain;
2399 if (isTailCall) {
2400 SmallVector<SDValue, 8> ByValCopyChains;
2401 for (const CCValAssign &VA : ArgLocs) {
2402 unsigned ArgIdx = VA.getValNo();
2403 SDValue Src = OutVals[ArgIdx];
2404 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2405
2406 if (!Flags.isByVal())
2407 continue;
2408
2409 SDValue Dst;
2410 MachinePointerInfo DstInfo;
2411 std::tie(Dst, DstInfo) =
2412 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2413 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2414
2415 if (Copy == NoCopy) {
2416 // If the argument is already at the correct offset on the stack
2417 // (because we are forwarding a byval argument from our caller), we
2418 // don't need any copying.
2419 continue;
2420 } else if (Copy == CopyOnce) {
2421 // If the argument is in our local stack frame, no other argument
2422 // preparation can clobber it, so we can copy it to the final location
2423 // later.
2424 ByValTemporaries[ArgIdx] = Src;
2425 } else {
2426 assert(Copy == CopyViaTemp && "unexpected enum value");
2427 // If we might be copying this argument from the outgoing argument
2428 // stack area, we need to copy via a temporary in the local stack
2429 // frame.
2430 int TempFrameIdx = MFI.CreateStackObject(
2431 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2432 SDValue Temp =
2433 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2434
2435 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2436 SDValue AlignNode =
2437 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2438
2439 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2440 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2441 ByValCopyChains.push_back(
2442 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2443 ByValTemporaries[ArgIdx] = Temp;
2444 }
2445 }
2446 if (!ByValCopyChains.empty())
2447 ByValTempChain =
2448 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2449 }
2450
2451 // During a tail call, stores to the argument area must happen after all of
2452 // the function's incoming arguments have been loaded because they may alias.
2453 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2454 // there's no point in doing so repeatedly, so this tracks whether that's
2455 // happened yet.
2456 bool AfterFormalArgLoads = false;
2457
2458 // Walk the register/memloc assignments, inserting copies/loads. In the case
2459 // of tail call optimization, arguments are handled later.
2460 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2461 i != e;
2462 ++i, ++realArgIdx) {
2463 CCValAssign &VA = ArgLocs[i];
2464 SDValue Arg = OutVals[realArgIdx];
2465 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2466 bool isByVal = Flags.isByVal();
2467
2468 // Promote the value if needed.
2469 switch (VA.getLocInfo()) {
2470 default: llvm_unreachable("Unknown loc info!");
2471 case CCValAssign::Full: break;
2472 case CCValAssign::SExt:
2473 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2474 break;
2475 case CCValAssign::ZExt:
2476 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2477 break;
2478 case CCValAssign::AExt:
2479 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2480 break;
2481 case CCValAssign::BCvt:
2482 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2483 break;
2484 }
2485
2486 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2487 Chain = DAG.getStackArgumentTokenFactor(Chain);
2488 if (ByValTempChain)
2489 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2490 ByValTempChain);
2491 AfterFormalArgLoads = true;
2492 }
2493
2494 // f16 arguments have their size extended to 4 bytes and passed as if they
2495 // had been copied to the LSBs of a 32-bit register.
2496 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
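// Illustrative case: an f16 value of 1.0 (bit pattern 0x3C00) passed with
// the soft-float ABI occupies the low 16 bits of an i32 GPR (the upper bits
// do not carry the value), while with the hard-float ABI it travels in the
// low half of an f32 register such as s0.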
2497 if (VA.needsCustom() &&
2498 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2499 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2500 } else {
2501 // f16 arguments could have been extended prior to argument lowering.
2502 // Mask these arguments if this is a CMSE nonsecure call.
2503 auto ArgVT = Outs[realArgIdx].ArgVT;
2504 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2505 auto LocBits = VA.getLocVT().getSizeInBits();
2506 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2507 SDValue Mask =
2508 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2509 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2510 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2511 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2512 }
2513 }
2514
2515 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2516 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2517 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2518 DAG.getConstant(0, dl, MVT::i32));
2519 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2520 DAG.getConstant(1, dl, MVT::i32));
2521
2522 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2523 StackPtr, MemOpChains, isTailCall, SPDiff);
2524
2525 VA = ArgLocs[++i]; // skip ahead to next loc
2526 if (VA.isRegLoc()) {
2527 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2528 StackPtr, MemOpChains, isTailCall, SPDiff);
2529 } else {
2530 assert(VA.isMemLoc());
2531 SDValue DstAddr;
2532 MachinePointerInfo DstInfo;
2533 std::tie(DstAddr, DstInfo) =
2534 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2535 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2536 }
2537 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2538 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2539 StackPtr, MemOpChains, isTailCall, SPDiff);
2540 } else if (VA.isRegLoc()) {
2541 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2542 Outs[0].VT == MVT::i32) {
2543 assert(VA.getLocVT() == MVT::i32 &&
2544 "unexpected calling convention register assignment");
2545 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2546 "unexpected use of 'returned'");
2547 isThisReturn = true;
2548 }
2549 const TargetOptions &Options = DAG.getTarget().Options;
2550 if (Options.EmitCallSiteInfo)
2551 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2552 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2553 } else if (isByVal) {
2554 assert(VA.isMemLoc());
2555 unsigned offset = 0;
2556
2557 // True if this byval aggregate will be split between registers
2558 // and memory.
2559 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2560 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2561
2562 SDValue ByValSrc;
2563 bool NeedsStackCopy;
2564 if (auto It = ByValTemporaries.find(realArgIdx);
2565 It != ByValTemporaries.end()) {
2566 ByValSrc = It->second;
2567 NeedsStackCopy = true;
2568 } else {
2569 ByValSrc = Arg;
2570 NeedsStackCopy = !isTailCall;
2571 }
2572
2573 // If part of the argument is in registers, load them.
2574 if (CurByValIdx < ByValArgsCount) {
2575 unsigned RegBegin, RegEnd;
2576 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2577
2578 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2579 unsigned int i, j;
2580 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2581 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2582 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2583 SDValue Load =
2584 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2585 DAG.InferPtrAlign(AddArg));
2586 MemOpChains.push_back(Load.getValue(1));
2587 RegsToPass.push_back(std::make_pair(j, Load));
2588 }
2589
2590 // If the parameter size exceeds the register area, the "offset" value
2591 // helps us to calculate the stack slot for the remaining part properly.
2592 offset = RegEnd - RegBegin;
2593
2594 CCInfo.nextInRegsParam();
2595 }
2596
2597 // If the memory part of the argument isn't already in the correct place
2598 // (which can happen with tail calls), copy it into the argument area.
2599 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2600 auto PtrVT = getPointerTy(DAG.getDataLayout());
2601 SDValue Dst;
2602 MachinePointerInfo DstInfo;
2603 std::tie(Dst, DstInfo) =
2604 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2605 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2606 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2607 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2608 MVT::i32);
2609 SDValue AlignNode =
2610 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2611
2612 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2613 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2614 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2615 Ops));
2616 }
2617 } else {
2618 assert(VA.isMemLoc());
2619 SDValue DstAddr;
2620 MachinePointerInfo DstInfo;
2621 std::tie(DstAddr, DstInfo) =
2622 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2623
2624 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2625 MemOpChains.push_back(Store);
2626 }
2627 }
2628
2629 if (!MemOpChains.empty())
2630 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2631
2632 // Build a sequence of copy-to-reg nodes chained together with token chain
2633 // and flag operands which copy the outgoing args into the appropriate regs.
2634 SDValue InGlue;
2635 for (const auto &[Reg, N] : RegsToPass) {
2636 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2637 InGlue = Chain.getValue(1);
2638 }
2639
2640 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2641 // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
2642 // node so that legalize doesn't hack it.
2643 bool isDirect = false;
2644
2645 const TargetMachine &TM = getTargetMachine();
2646 const GlobalValue *GVal = nullptr;
2647 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2648 GVal = G->getGlobal();
2649 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2650
2651 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2652 bool isLocalARMFunc = false;
2653 auto PtrVt = getPointerTy(DAG.getDataLayout());
2654
2655 if (Subtarget->genLongCalls()) {
2656 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2657 "long-calls codegen is not position independent!");
2658 // Handle a global address or an external symbol. If it's not one of
2659 // those, the target's already in a register, so we don't need to do
2660 // anything extra.
2661 if (isa<GlobalAddressSDNode>(Callee)) {
2662 if (Subtarget->genExecuteOnly()) {
2663 if (Subtarget->useMovt())
2664 ++NumMovwMovt;
2665 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2666 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2667 } else {
2668 // Create a constant pool entry for the callee address
2669 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2671 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2672
2673 // Get the address of the callee into a register
2674 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2675 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2676 Callee = DAG.getLoad(
2677 PtrVt, dl, DAG.getEntryNode(), Addr,
2679 }
2680 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2681 const char *Sym = S->getSymbol();
2682
2683 if (Subtarget->genExecuteOnly()) {
2684 if (Subtarget->useMovt())
2685 ++NumMovwMovt;
2686 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2687 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2688 } else {
2689 // Create a constant pool entry for the callee address
2690 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2692 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2693
2694 // Get the address of the callee into a register
2695 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2696 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2697 Callee = DAG.getLoad(
2698 PtrVt, dl, DAG.getEntryNode(), Addr,
2700 }
2701 }
2702 } else if (isa<GlobalAddressSDNode>(Callee)) {
2703 if (!PreferIndirect) {
2704 isDirect = true;
2705 bool isDef = GVal->isStrongDefinitionForLinker();
2706
2707 // ARM call to a local ARM function is predicable.
2708 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2709 // tBX takes a register source operand.
2710 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2711 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2712 Callee = DAG.getNode(
2713 ARMISD::WrapperPIC, dl, PtrVt,
2714 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2715 Callee = DAG.getLoad(
2716 PtrVt, dl, DAG.getEntryNode(), Callee,
2720 } else if (Subtarget->isTargetCOFF()) {
2721 assert(Subtarget->isTargetWindows() &&
2722 "Windows is the only supported COFF target");
2723 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2724 if (GVal->hasDLLImportStorageClass())
2725 TargetFlags = ARMII::MO_DLLIMPORT;
2726 else if (!TM.shouldAssumeDSOLocal(GVal))
2727 TargetFlags = ARMII::MO_COFFSTUB;
2728 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2729 TargetFlags);
2730 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2731 Callee =
2732 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2733 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2735 } else {
2736 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2737 }
2738 }
2739 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2740 isDirect = true;
2741 // tBX takes a register source operand.
2742 const char *Sym = S->getSymbol();
2743 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2744 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2747 ARMPCLabelIndex, 4);
2748 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2749 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2750 Callee = DAG.getLoad(
2751 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2753 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2754 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2755 } else {
2756 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2757 }
2758 }
2759
2760 if (isCmseNSCall) {
2761 assert(!isARMFunc && !isDirect &&
2762 "Cannot handle call to ARM function or direct call");
2763 if (NumBytes > 0) {
2764 DAG.getContext()->diagnose(
2766 "call to non-secure function would require "
2767 "passing arguments on stack",
2768 dl.getDebugLoc()));
2769 }
2770 if (isStructRet) {
2773 "call to non-secure function would return value through pointer",
2774 dl.getDebugLoc()));
2775 }
2776 }
2777
2778 // FIXME: handle tail calls differently.
2779 unsigned CallOpc;
2780 if (Subtarget->isThumb()) {
2781 if (GuardWithBTI)
2782 CallOpc = ARMISD::t2CALL_BTI;
2783 else if (isCmseNSCall)
2784 CallOpc = ARMISD::tSECALL;
2785 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2786 CallOpc = ARMISD::CALL_NOLINK;
2787 else
2788 CallOpc = ARMISD::CALL;
2789 } else {
2790 if (!isDirect && !Subtarget->hasV5TOps())
2791 CallOpc = ARMISD::CALL_NOLINK;
2792 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2793 // Emit regular call when code size is the priority
2794 !Subtarget->hasMinSize())
2795 // "mov lr, pc; b _foo" to avoid confusing the return stack predictor (RSP)
2796 CallOpc = ARMISD::CALL_NOLINK;
2797 else
2798 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2799 }
2800
2801 // We don't usually want to end the call-sequence here because we would tidy
2802 // the frame up *after* the call; however, in the ABI-changing tail-call case
2803 // we've carefully laid out the parameters so that when sp is reset they'll be
2804 // in the correct location.
2805 if (isTailCall && !isSibCall) {
2806 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2807 InGlue = Chain.getValue(1);
2808 }
2809
2810 std::vector<SDValue> Ops;
2811 Ops.push_back(Chain);
2812 Ops.push_back(Callee);
2813
2814 if (isTailCall) {
2815 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2816 }
2817
2818 // Add argument registers to the end of the list so that they are known live
2819 // into the call.
2820 for (const auto &[Reg, N] : RegsToPass)
2821 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2822
2823 // Add a register mask operand representing the call-preserved registers.
2824 const uint32_t *Mask;
2825 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2826 if (isThisReturn) {
2827 // For 'this' returns, use the R0-preserving mask if applicable
2828 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2829 if (!Mask) {
2830 // Set isThisReturn to false if the calling convention is not one that
2831 // allows 'returned' to be modeled in this way, so LowerCallResult does
2832 // not try to pass 'this' straight through
2833 isThisReturn = false;
2834 Mask = ARI->getCallPreservedMask(MF, CallConv);
2835 }
2836 } else
2837 Mask = ARI->getCallPreservedMask(MF, CallConv);
2838
2839 assert(Mask && "Missing call preserved mask for calling convention");
2840 Ops.push_back(DAG.getRegisterMask(Mask));
2841
2842 if (InGlue.getNode())
2843 Ops.push_back(InGlue);
2844
2845 if (isTailCall) {
2847 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2848 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2849 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2850 return Ret;
2851 }
2852
2853 // Returns a chain and a flag for retval copy to use.
2854 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2855 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2856 InGlue = Chain.getValue(1);
2857 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2858
2859 // If we're guaranteeing tail-calls will be honoured, the callee must
2860 // pop its own argument stack on return. But this call is *not* a tail call so
2861 // we need to undo that after it returns to restore the status-quo.
2862 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2863 uint64_t CalleePopBytes =
2864 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2865
2866 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2867 if (!Ins.empty())
2868 InGlue = Chain.getValue(1);
2869
2870 // Handle result values, copying them out of physregs into vregs that we
2871 // return.
2872 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2873 InVals, isThisReturn,
2874 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2875}
2876
2877/// HandleByVal - Every parameter *after* a byval parameter is passed
2878/// on the stack. Remember the next parameter register to allocate,
2879 /// and then confiscate the rest of the parameter registers to ensure
2880/// this.
2881void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2882 Align Alignment) const {
2883 // Byval (as with any stack) slots are always at least 4 byte aligned.
2884 Alignment = std::max(Alignment, Align(4));
2885
2887 if (!Reg)
2888 return;
2889
2890 unsigned AlignInRegs = Alignment.value() / 4;
2891 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2892 for (unsigned i = 0; i < Waste; ++i)
2893 Reg = State->AllocateReg(GPRArgRegs);
2894
2895 if (!Reg)
2896 return;
2897
2898 unsigned Excess = 4 * (ARM::R4 - Reg);
2899
2900 // Special case when NSAA != SP and the parameter size is greater than the
2901 // size of all remaining GPR regs. In that case we can't split the parameter;
2902 // we must send it to the stack. We also must set the NCRN to R4, so all
2903 // remaining registers are wasted.
2904 const unsigned NSAAOffset = State->getStackSize();
2905 if (NSAAOffset != 0 && Size > Excess) {
2906 while (State->AllocateReg(GPRArgRegs))
2907 ;
2908 return;
2909 }
2910
2911 // The first register for the byval parameter is the first register that
2912 // wasn't allocated before this method call, i.e. "reg".
2913 // If the parameter is small enough to fit in the range [reg, r4), then
2914 // the end (one past the last) register is reg + param-size-in-regs;
2915 // otherwise the parameter is split between registers and the stack, and
2916 // the end register is r4 in that case.
2917 unsigned ByValRegBegin = Reg;
2918 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2919 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2920 // Note that the first register was already allocated at the beginning of
2921 // the function; allocate the remaining registers we need here.
2922 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2923 State->AllocateReg(GPRArgRegs);
2924 // A byval parameter that is split between registers and memory needs its
2925 // size truncated here.
2926 // In the case where the entire structure fits in registers, we set the
2927 // size in memory to zero.
2928 Size = std::max<int>(Size - Excess, 0);
2929}
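// Worked example (hypothetical values): assume r0 is already taken by a
// preceding argument and nothing has been pushed onto the stack yet
// (NSAAOffset == 0). For a 20-byte byval with 4-byte alignment, Reg == r1,
// Excess == 4 * (r4 - r1) == 12, so r1-r3 are confiscated (ByValRegBegin ==
// r1, ByValRegEnd == r4) and Size is reduced to 20 - 12 == 8 bytes that
// remain in memory.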
2930
2931/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2932/// for tail call optimization. Targets which want to do tail call
2933/// optimization should implement this function. Note that this function also
2934/// processes musttail calls, so when this function returns false on a valid
2935/// musttail call, a fatal backend error occurs.
2936bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2938 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2939 CallingConv::ID CalleeCC = CLI.CallConv;
2940 SDValue Callee = CLI.Callee;
2941 bool isVarArg = CLI.IsVarArg;
2942 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2943 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2945 const SelectionDAG &DAG = CLI.DAG;
2947 const Function &CallerF = MF.getFunction();
2948 CallingConv::ID CallerCC = CallerF.getCallingConv();
2949
2950 assert(Subtarget->supportsTailCall());
2951
2952 // Indirect tail-calls require a register to hold the target address. That
2953 // register must be:
2954 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2955 // * Not callee-saved, so must be one of r0-r3 or r12.
2956 // * Not used to hold an argument to the tail-called function, which might be
2957 // in r0-r3.
2958 // * Not used to hold the return address authentication code, which is in r12
2959 // if enabled.
2960 // Sometimes, no register matches all of these conditions, so we can't do a
2961 // tail-call.
2962 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2963 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2964 ARM::R3};
2965 if (!(Subtarget->isThumb1Only() ||
2967 AddressRegisters.insert(ARM::R12);
2968 for (const CCValAssign &AL : ArgLocs)
2969 if (AL.isRegLoc())
2970 AddressRegisters.erase(AL.getLocReg());
2971 if (AddressRegisters.empty()) {
2972 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2973 return false;
2974 }
2975 }
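// Illustrative case of the check above: an indirect tail call on Thumb1
// whose callee takes four i32 arguments in r0-r3 is rejected, because r12
// is not added to AddressRegisters there and all of r0-r3 get erased,
// leaving no register free to hold the target address.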
2976
2977 // Look for obvious safe cases to perform tail call optimization that do not
2978 // require ABI changes. This is what gcc calls sibcall.
2979
2980 // Exception-handling functions need a special set of instructions to indicate
2981 // a return to the hardware. Tail-calling another function would probably
2982 // break this.
2983 if (CallerF.hasFnAttribute("interrupt")) {
2984 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2985 return false;
2986 }
2987
2988 if (canGuaranteeTCO(CalleeCC,
2989 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2990 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2991 << " (guaranteed tail-call CC)\n");
2992 return CalleeCC == CallerCC;
2993 }
2994
2995 // Also avoid sibcall optimization if either caller or callee uses struct
2996 // return semantics.
2997 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2998 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2999 if (isCalleeStructRet != isCallerStructRet) {
3000 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
3001 return false;
3002 }
3003
3004 // Externally-defined functions with weak linkage should not be
3005 // tail-called on ARM when the OS does not support dynamic
3006 // pre-emption of symbols, as the AAELF spec requires normal calls
3007 // to undefined weak functions to be replaced with a NOP or jump to the
3008 // next instruction. The behaviour of branch instructions in this
3009 // situation (as used for tail calls) is implementation-defined, so we
3010 // cannot rely on the linker replacing the tail call with a return.
3011 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3012 const GlobalValue *GV = G->getGlobal();
3014 if (GV->hasExternalWeakLinkage() &&
3015 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
3016 TT.isOSBinFormatMachO())) {
3017 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
3018 return false;
3019 }
3020 }
3021
3022 // Check that the call results are passed in the same way.
3023 LLVMContext &C = *DAG.getContext();
3025 getEffectiveCallingConv(CalleeCC, isVarArg),
3026 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3027 CCAssignFnForReturn(CalleeCC, isVarArg),
3028 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
3029 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
3030 return false;
3031 }
3032 // The callee has to preserve all registers the caller needs to preserve.
3033 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3034 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3035 if (CalleeCC != CallerCC) {
3036 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3037 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
3038 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
3039 return false;
3040 }
3041 }
3042
3043 // If Caller's vararg argument has been split between registers and stack, do
3044 // not perform a tail call, since part of the argument is in the caller's local
3045 // frame.
3046 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3047 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
3048 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
3049 return false;
3050 }
3051
3052 // If the callee takes no arguments then go on to check the results of the
3053 // call.
3054 const MachineRegisterInfo &MRI = MF.getRegInfo();
3055 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3056 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3057 return false;
3058 }
3059
3060 // If the stack arguments for this call do not fit into our own save area then
3061 // the call cannot be made tail.
3062 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3063 return false;
3064
3065 LLVM_DEBUG(dbgs() << "true\n");
3066 return true;
3067}
3068
3069bool
3070ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3071 MachineFunction &MF, bool isVarArg,
3073 LLVMContext &Context, const Type *RetTy) const {
3075 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3076 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3077}
3078
3080 const SDLoc &DL, SelectionDAG &DAG) {
3081 const MachineFunction &MF = DAG.getMachineFunction();
3082 const Function &F = MF.getFunction();
3083
3084 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3085
3086 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3087 // version of the "preferred return address". These offsets affect the return
3088 // instruction if this is a return from PL1 without hypervisor extensions.
3089 // IRQ/FIQ: +4 "subs pc, lr, #4"
3090 // SWI: 0 "subs pc, lr, #0"
3091 // ABORT: +4 "subs pc, lr, #4"
3092 // UNDEF: +4/+2 "subs pc, lr, #0"
3093 // UNDEF varies depending on whether the exception came from ARM or Thumb
3094 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3095
3096 int64_t LROffset;
3097 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3098 IntKind == "ABORT")
3099 LROffset = 4;
3100 else if (IntKind == "SWI" || IntKind == "UNDEF")
3101 LROffset = 0;
3102 else
3103 report_fatal_error("Unsupported interrupt attribute. If present, value "
3104 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3105
3106 RetOps.insert(RetOps.begin() + 1,
3107 DAG.getConstant(LROffset, DL, MVT::i32, false));
3108
3109 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3110}
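// As an illustration (hypothetical source): a handler declared with
// __attribute__((interrupt("IRQ"))) on a non-M-class core returns with
//   subs pc, lr, #4
// while one declared with interrupt("SWI") returns with
//   subs pc, lr, #0
// matching the LROffset values chosen above.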
3111
3112SDValue
3113ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3114 bool isVarArg,
3116 const SmallVectorImpl<SDValue> &OutVals,
3117 const SDLoc &dl, SelectionDAG &DAG) const {
3118 // CCValAssign - represent the assignment of the return value to a location.
3120
3121 // CCState - Info about the registers and stack slots.
3122 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3123 *DAG.getContext());
3124
3125 // Analyze outgoing return values.
3126 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3127
3128 SDValue Glue;
3130 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3131 bool isLittleEndian = Subtarget->isLittle();
3132
3135 AFI->setReturnRegsCount(RVLocs.size());
3136
3137 // Report error if cmse entry function returns structure through first ptr arg.
3138 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3139 // Note: using an empty SDLoc(), as the first line of the function is a
3140 // better place to report than the last line.
3143 "secure entry function would return value through pointer",
3144 SDLoc().getDebugLoc()));
3145 }
3146
3147 // Copy the result values into the output registers.
3148 for (unsigned i = 0, realRVLocIdx = 0;
3149 i != RVLocs.size();
3150 ++i, ++realRVLocIdx) {
3151 CCValAssign &VA = RVLocs[i];
3152 assert(VA.isRegLoc() && "Can only return in registers!");
3153
3154 SDValue Arg = OutVals[realRVLocIdx];
3155 bool ReturnF16 = false;
3156
3157 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3158 // Half-precision return values can be returned like this:
3159 //
3160 // t11 f16 = fadd ...
3161 // t12: i16 = bitcast t11
3162 // t13: i32 = zero_extend t12
3163 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3164 //
3165 // to avoid code generation for bitcasts, we simply set Arg to the node
3166 // that produces the f16 value, t11 in this case.
3167 //
3168 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3169 SDValue ZE = Arg.getOperand(0);
3170 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3171 SDValue BC = ZE.getOperand(0);
3172 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3173 Arg = BC.getOperand(0);
3174 ReturnF16 = true;
3175 }
3176 }
3177 }
3178 }
3179
3180 switch (VA.getLocInfo()) {
3181 default: llvm_unreachable("Unknown loc info!");
3182 case CCValAssign::Full: break;
3183 case CCValAssign::BCvt:
3184 if (!ReturnF16)
3185 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3186 break;
3187 }
3188
3189 // Mask f16 arguments if this is a CMSE nonsecure entry.
3190 auto RetVT = Outs[realRVLocIdx].ArgVT;
3191 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3192 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3193 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3194 } else {
3195 auto LocBits = VA.getLocVT().getSizeInBits();
3196 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3197 SDValue Mask =
3198 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3199 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3200 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3201 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3202 }
3203 }
3204
3205 if (VA.needsCustom() &&
3206 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3207 if (VA.getLocVT() == MVT::v2f64) {
3208 // Extract the first half and return it in two registers.
3209 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3210 DAG.getConstant(0, dl, MVT::i32));
3211 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3212 DAG.getVTList(MVT::i32, MVT::i32), Half);
3213
3214 Chain =
3215 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3216 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3217 Glue = Chain.getValue(1);
3218 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3219 VA = RVLocs[++i]; // skip ahead to next loc
3220 Chain =
3221 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3222 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3223 Glue = Chain.getValue(1);
3224 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3225 VA = RVLocs[++i]; // skip ahead to next loc
3226
3227 // Extract the 2nd half and fall through to handle it as an f64 value.
3228 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3229 DAG.getConstant(1, dl, MVT::i32));
3230 }
3231 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3232 // available.
3233 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3234 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3235 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3236 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3237 Glue = Chain.getValue(1);
3238 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3239 VA = RVLocs[++i]; // skip ahead to next loc
3240 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3241 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3242 } else
3243 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3244
3245 // Guarantee that all emitted copies are
3246 // stuck together (glued), so nothing else can be scheduled between them.
3247 Glue = Chain.getValue(1);
3248 RetOps.push_back(DAG.getRegister(
3249 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3250 }
3251 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3252 const MCPhysReg *I =
3253 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3254 if (I) {
3255 for (; *I; ++I) {
3256 if (ARM::GPRRegClass.contains(*I))
3257 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3258 else if (ARM::DPRRegClass.contains(*I))
3260 else
3261 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3262 }
3263 }
3264
3265 // Update chain and glue.
3266 RetOps[0] = Chain;
3267 if (Glue.getNode())
3268 RetOps.push_back(Glue);
3269
3270 // CPUs which aren't M-class use a special sequence to return from
3271 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3272 // though we use "subs pc, lr, #N").
3273 //
3274 // M-class CPUs actually use a normal return sequence with a special
3275 // (hardware-provided) value in LR, so the normal code path works.
3276 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3277 !Subtarget->isMClass()) {
3278 if (Subtarget->isThumb1Only())
3279 report_fatal_error("interrupt attribute is not supported in Thumb1");
3280 return LowerInterruptReturn(RetOps, dl, DAG);
3281 }
3282
3285 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3286}
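// For example, returning an f64 with a soft-float calling convention goes
// through the VMOVRRD path above: on a little-endian target the low 32 bits
// of the value end up in the first return register (r0) and the high 32
// bits in the second (r1), with the pair reversed for big-endian.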
3287
3288bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3289 if (N->getNumValues() != 1)
3290 return false;
3291 if (!N->hasNUsesOfValue(1, 0))
3292 return false;
3293
3294 SDValue TCChain = Chain;
3295 SDNode *Copy = *N->user_begin();
3296 if (Copy->getOpcode() == ISD::CopyToReg) {
3297 // If the copy has a glue operand, we conservatively assume it isn't safe to
3298 // perform a tail call.
3299 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3300 return false;
3301 TCChain = Copy->getOperand(0);
3302 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3303 SDNode *VMov = Copy;
3304 // f64 returned in a pair of GPRs.
3306 for (SDNode *U : VMov->users()) {
3307 if (U->getOpcode() != ISD::CopyToReg)
3308 return false;
3309 Copies.insert(U);
3310 }
3311 if (Copies.size() > 2)
3312 return false;
3313
3314 for (SDNode *U : VMov->users()) {
3315 SDValue UseChain = U->getOperand(0);
3316 if (Copies.count(UseChain.getNode()))
3317 // Second CopyToReg
3318 Copy = U;
3319 else {
3320 // We are at the top of this chain.
3321 // If the copy has a glue operand, we conservatively assume it
3322 // isn't safe to perform a tail call.
3323 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3324 return false;
3325 // First CopyToReg
3326 TCChain = UseChain;
3327 }
3328 }
3329 } else if (Copy->getOpcode() == ISD::BITCAST) {
3330 // f32 returned in a single GPR.
3331 if (!Copy->hasOneUse())
3332 return false;
3333 Copy = *Copy->user_begin();
3334 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3335 return false;
3336 // If the copy has a glue operand, we conservatively assume it isn't safe to
3337 // perform a tail call.
3338 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3339 return false;
3340 TCChain = Copy->getOperand(0);
3341 } else {
3342 return false;
3343 }
3344
3345 bool HasRet = false;
3346 for (const SDNode *U : Copy->users()) {
3347 if (U->getOpcode() != ARMISD::RET_GLUE &&
3348 U->getOpcode() != ARMISD::INTRET_GLUE)
3349 return false;
3350 HasRet = true;
3351 }
3352
3353 if (!HasRet)
3354 return false;
3355
3356 Chain = TCChain;
3357 return true;
3358}
3359
3360bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3361 if (!Subtarget->supportsTailCall())
3362 return false;
3363
3364 if (!CI->isTailCall())
3365 return false;
3366
3367 return true;
3368}
3369
3370 // Writing a 64-bit value, so it needs to be split into two 32-bit values
3371 // first, passing the low and high parts through.
3373 SDLoc DL(Op);
3374 SDValue WriteValue = Op->getOperand(2);
3375
3376 // This function is only supposed to be called for i64 type argument.
3377 assert(WriteValue.getValueType() == MVT::i64
3378 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3379
3380 SDValue Lo, Hi;
3381 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3382 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3383 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3384}
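// Illustrative example: an IR call to llvm.write_register.i64 naming some
// hypothetical 64-bit target register arrives here with an i64 value
// operand; SplitScalar produces its low and high i32 halves, which become
// the two value operands of the rebuilt WRITE_REGISTER node.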
3385
3386// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3387// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3388// one of the above mentioned nodes. It has to be wrapped because otherwise
3389// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3390// be used to form addressing mode. These wrapped nodes will be selected
3391// into MOVi.
3392SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3393 SelectionDAG &DAG) const {
3394 EVT PtrVT = Op.getValueType();
3395 // FIXME there is no actual debug info here
3396 SDLoc dl(Op);
3397 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3398 SDValue Res;
3399
3400 // When generating execute-only code Constant Pools must be promoted to the
3401 // global data section. It's a bit ugly that we can't share them across basic
3402 // blocks, but this way we guarantee that execute-only behaves correctly with
3403 // position-independent addressing modes.
3404 if (Subtarget->genExecuteOnly()) {
3405 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3406 auto *T = CP->getType();
3407 auto C = const_cast<Constant*>(CP->getConstVal());
3408 auto M = DAG.getMachineFunction().getFunction().getParent();
3409 auto GV = new GlobalVariable(
3410 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3413 Twine(AFI->createPICLabelUId())
3414 );
3415 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3416 dl, PtrVT);
3417 return LowerGlobalAddress(GA, DAG);
3418 }
3419
3420 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3421 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3422 Align CPAlign = CP->getAlign();
3423 if (Subtarget->isThumb1Only())
3424 CPAlign = std::max(CPAlign, Align(4));
3425 if (CP->isMachineConstantPoolEntry())
3426 Res =
3427 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3428 else
3429 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3430 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3431}
3432
3434 // If we don't have a 32-bit pc-relative branch instruction then the jump
3435 // table consists of block addresses. Usually this is inline, but for
3436 // execute-only it must be placed out-of-line.
3437 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3440}
3441
3442SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3443 SelectionDAG &DAG) const {
3446 unsigned ARMPCLabelIndex = 0;
3447 SDLoc DL(Op);
3448 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3449 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3450 SDValue CPAddr;
3451 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3452 if (!IsPositionIndependent) {
3453 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3454 } else {
3455 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3456 ARMPCLabelIndex = AFI->createPICLabelUId();
3458 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3459 ARMCP::CPBlockAddress, PCAdj);
3460 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3461 }
3462 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3463 SDValue Result = DAG.getLoad(
3464 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3466 if (!IsPositionIndependent)
3467 return Result;
3468 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3469 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3470}
3471
3472/// Convert a TLS address reference into the correct sequence of loads
3473/// and calls to compute the variable's address for Darwin, and return an
3474/// SDValue containing the final node.
3475
3476/// Darwin only has one TLS scheme which must be capable of dealing with the
3477/// fully general situation, in the worst case. This means:
3478/// + "extern __thread" declaration.
3479/// + Defined in a possibly unknown dynamic library.
3480///
3481/// The general system is that each __thread variable has a [3 x i32] descriptor
3482/// which contains information used by the runtime to calculate the address. The
3483/// only part of this the compiler needs to know about is the first word, which
3484/// contains a function pointer that must be called with the address of the
3485/// entire descriptor in "r0".
3486///
3487/// Since this descriptor may be in a different unit, in general access must
3488/// proceed along the usual ARM rules. A common sequence to produce is:
3489///
3490/// movw rT1, :lower16:_var$non_lazy_ptr
3491/// movt rT1, :upper16:_var$non_lazy_ptr
3492/// ldr r0, [rT1]
3493/// ldr rT2, [r0]
3494/// blx rT2
3495/// [...address now in r0...]
3496SDValue
3497ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3498 SelectionDAG &DAG) const {
3499 assert(Subtarget->isTargetDarwin() &&
3500 "This function expects a Darwin target");
3501 SDLoc DL(Op);
3502
3503 // The first step is to get the address of the actual global symbol. This is where
3504 // the TLS descriptor lives.
3505 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3506
3507 // The first entry in the descriptor is a function pointer that we must call
3508 // to obtain the address of the variable.
3509 SDValue Chain = DAG.getEntryNode();
3510 SDValue FuncTLVGet = DAG.getLoad(
3511 MVT::i32, DL, Chain, DescAddr,
3515 Chain = FuncTLVGet.getValue(1);
3516
3518 MachineFrameInfo &MFI = F.getFrameInfo();
3519 MFI.setAdjustsStack(true);
3520
3521 // TLS calls preserve all registers except those that absolutely must be
3522 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3523 // silly).
3524 auto TRI =
3526 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3528
3529 // Finally, we can make the call. This is just a degenerate version of a
3530 // normal ARM call node: r0 takes the address of the descriptor, and the call
3531 // returns the address of the variable in this thread.
3532 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3533 Chain =
3534 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3535 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3536 DAG.getRegisterMask(Mask), Chain.getValue(1));
3537 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3538}
3539
3540SDValue
3541ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3542 SelectionDAG &DAG) const {
3543 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3544
3545 SDValue Chain = DAG.getEntryNode();
3546 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3547 SDLoc DL(Op);
3548
3549 // Load the current TEB (thread environment block)
3550 SDValue Ops[] = {Chain,
3551 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3552 DAG.getTargetConstant(15, DL, MVT::i32),
3553 DAG.getTargetConstant(0, DL, MVT::i32),
3554 DAG.getTargetConstant(13, DL, MVT::i32),
3555 DAG.getTargetConstant(0, DL, MVT::i32),
3556 DAG.getTargetConstant(2, DL, MVT::i32)};
3557 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3558 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3559
3560 SDValue TEB = CurrentTEB.getValue(0);
3561 Chain = CurrentTEB.getValue(1);
3562
3563 // Load the ThreadLocalStoragePointer from the TEB
3564 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3565 SDValue TLSArray =
3566 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3567 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3568
3569 // The pointer to the thread's TLS data area is at the offset TLSIndex * 4
3570 // into the TLSArray.
3571
3572 // Load the TLS index from the C runtime
3573 SDValue TLSIndex =
3574 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3575 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3576 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3577
3578 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3579 DAG.getConstant(2, DL, MVT::i32));
3580 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3581 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3583
3584 // Get the offset of the start of the .tls section (section base)
3585 const auto *GA = cast<GlobalAddressSDNode>(Op);
3586 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3587 SDValue Offset = DAG.getLoad(
3588 PtrVT, DL, Chain,
3589 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3590 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3592
3593 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3594}
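// Roughly, the nodes built above correspond to a sequence like this (sketch
// only; register names are arbitrary and the literal-pool labels are made up):
//   mrc p15, #0, rT, c13, c0, #2   ; current TEB
//   ldr rA, [rT, #0x2c]            ; ThreadLocalStoragePointer (TLS array)
//   ldr rI, =_tls_index
//   ldr rI, [rI]
//   ldr rD, [rA, rI, lsl #2]       ; this module's TLS block
//   ldr rO, .LCPI_secrel           ; SECREL offset of the variable in .tls
//   add r0, rD, rO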
3595
3596// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3597SDValue
3598ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3599 SelectionDAG &DAG) const {
3600 SDLoc dl(GA);
3601 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3602 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3605 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3607 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3608 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3609 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3610 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3611 Argument = DAG.getLoad(
3612 PtrVT, dl, DAG.getEntryNode(), Argument,
3614 SDValue Chain = Argument.getValue(1);
3615
3616 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3617 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3618
3619 // call __tls_get_addr.
3621 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3622
3623 // FIXME: is there useful debug info available here?
3625 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3627 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3628
3629 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3630 return CallResult.first;
3631}
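// In other words, a general-dynamic access roughly becomes (sketch only;
// the literal-pool label is made up):
//   ldr r0, .LCPI_tlsgd      ; TLSGD descriptor offset from the literal pool
//   add r0, pc               ; the PIC_ADD above
//   bl  __tls_get_addr       ; returns the variable's address in r0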
3632
3633// Lower ISD::GlobalTLSAddress using the "initial exec" or
3634// "local exec" model.
3635SDValue
3636ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3637 SelectionDAG &DAG,
3638 TLSModel::Model model) const {
3639 const GlobalValue *GV = GA->getGlobal();
3640 SDLoc dl(GA);
3642 SDValue Chain = DAG.getEntryNode();
3643 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3644 // Get the Thread Pointer
3646
3647 if (model == TLSModel::InitialExec) {
3650 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3651 // Initial exec model.
3652 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3654 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3656 true);
3657 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3658 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3659 Offset = DAG.getLoad(
3660 PtrVT, dl, Chain, Offset,
3662 Chain = Offset.getValue(1);
3663
3664 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3665 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3666
3667 Offset = DAG.getLoad(
3668 PtrVT, dl, Chain, Offset,
3670 } else {
3671 // local exec model
3672 assert(model == TLSModel::LocalExec);
3675 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3676 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3677 Offset = DAG.getLoad(
3678 PtrVT, dl, Chain, Offset,
3680 }
3681
3682 // The address of the thread local variable is the add of the thread
3683 // pointer with the offset of the variable.
3684 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3685}
3686
3687SDValue
3688ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3689 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3690 if (DAG.getTarget().useEmulatedTLS())
3691 return LowerToTLSEmulatedModel(GA, DAG);
3692
3693 if (Subtarget->isTargetDarwin())
3694 return LowerGlobalTLSAddressDarwin(Op, DAG);
3695
3696 if (Subtarget->isTargetWindows())
3697 return LowerGlobalTLSAddressWindows(Op, DAG);
3698
3699 // TODO: implement the "local dynamic" model
3700 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3702
3703 switch (model) {
3706 return LowerToTLSGeneralDynamicModel(GA, DAG);
3709 return LowerToTLSExecModels(GA, DAG, model);
3710 }
3711 llvm_unreachable("bogus TLS model");
3712}
3713
3714/// Return true if all users of V are within function F, looking through
3715/// ConstantExprs.
3716static bool allUsersAreInFunction(const Value *V, const Function *F) {
3717 SmallVector<const User*,4> Worklist(V->users());
3718 while (!Worklist.empty()) {
3719 auto *U = Worklist.pop_back_val();
3720 if (isa<ConstantExpr>(U)) {
3721 append_range(Worklist, U->users());
3722 continue;
3723 }
3724
3725 auto *I = dyn_cast<Instruction>(U);
3726 if (!I || I->getParent()->getParent() != F)
3727 return false;
3728 }
3729 return true;
3730}
3731
3733 const GlobalValue *GV, SelectionDAG &DAG,
3734 EVT PtrVT, const SDLoc &dl) {
3735 // If we're creating a pool entry for a constant global with unnamed address,
3736 // and the global is small enough, we can emit it inline into the constant pool
3737 // to save ourselves an indirection.
3738 //
3739 // This is a win if the constant is only used in one function (so it doesn't
3740 // need to be duplicated) or duplicating the constant wouldn't increase code
3741 // size (implying the constant is no larger than 4 bytes).
3742 const Function &F = DAG.getMachineFunction().getFunction();
3743
3744 // We rely on this decision to inline being idempotent and unrelated to the
3745 // use-site. We know that if we inline a variable at one use site, we'll
3746 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3747 // doesn't know about this optimization, so bail out if it's enabled; otherwise
3748 // we could decide to inline here (and thus never emit the GV) while fast-isel
3749 // generated code still requires the GV.
3752 return SDValue();
3753
3754 auto *GVar = dyn_cast<GlobalVariable>(GV);
3755 if (!GVar || !GVar->hasInitializer() ||
3756 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3757 !GVar->hasLocalLinkage())
3758 return SDValue();
3759
3760 // If we inline a value that contains relocations, we move the relocations
3761 // from .data to .text. This is not allowed in position-independent code.
3762 auto *Init = GVar->getInitializer();
3763 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3764 Init->needsDynamicRelocation())
3765 return SDValue();
3766
3767 // The constant islands pass can only really deal with alignment requests
3768 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3769 // any type wanting greater alignment requirements than 4 bytes. We also
3770 // can only promote constants that are multiples of 4 bytes in size or
3771 // are paddable to a multiple of 4. Currently we only try to pad constants
3772 // that are strings for simplicity.
3773 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3774 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3775 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3776 unsigned RequiredPadding = 4 - (Size % 4);
3777 bool PaddingPossible =
3778 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3779 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3780 Size == 0)
3781 return SDValue();
3782
3783 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3786
3787 // We can't bloat the constant pool too much, else the ConstantIslands pass
3788 // may fail to converge. If we haven't promoted this global yet (it may have
3789 // multiple uses), and promoting it would increase the constant pool size (Sz
3790 // > 4), ensure we have space to do so up to MaxTotal.
3791 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3792 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3794 return SDValue();
3795
3796 // This is only valid if all users are in a single function; we can't clone
3797 // the constant in general. The LLVM IR unnamed_addr allows merging
3798 // constants, but not cloning them.
3799 //
3800 // We could potentially allow cloning if we could prove all uses of the
3801 // constant in the current function don't care about the address, like
3802 // printf format strings. But that isn't implemented for now.
3803 if (!allUsersAreInFunction(GVar, &F))
3804 return SDValue();
3805
3806 // We're going to inline this global. Pad it out if needed.
3807 if (RequiredPadding != 4) {
3808 StringRef S = CDAInit->getAsString();
3809
3811 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3812 while (RequiredPadding--)
3813 V.push_back(0);
3815 }
3816
3817 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3818 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3819 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3822 PaddedSize - 4);
3823 }
3824 ++NumConstpoolPromoted;
3825 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3826}
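// Illustrative case (hypothetical IR): a local, constant, unnamed_addr string
//   @.str = private unnamed_addr constant [10 x i8] c"hello wor\00"
// whose users are all in one function has Size == 10, so RequiredPadding == 2
// and PaddedSize == 12; the initializer is padded with two zero bytes and
// emitted directly into the constant pool, saving the indirection through
// the global's address.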
3827
3829 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3830 if (!(GV = GA->getAliaseeObject()))
3831 return false;
3832 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3833 return V->isConstant();
3834 return isa<Function>(GV);
3835}
3836
3837SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3838 SelectionDAG &DAG) const {
3839 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3840 default: llvm_unreachable("unknown object format");
3841 case Triple::COFF:
3842 return LowerGlobalAddressWindows(Op, DAG);
3843 case Triple::ELF:
3844 return LowerGlobalAddressELF(Op, DAG);
3845 case Triple::MachO:
3846 return LowerGlobalAddressDarwin(Op, DAG);
3847 }
3848}
3849
3850SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3851 SelectionDAG &DAG) const {
3852 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3853 SDLoc dl(Op);
3854 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3855 bool IsRO = isReadOnly(GV);
3856
3857 // promoteToConstantPool only if not generating XO text section
3858 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3859 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3860 return V;
3861
3862 if (isPositionIndependent()) {
3864 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3865 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3866 if (!GV->isDSOLocal())
3867 Result =
3868 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3870 return Result;
3871 } else if (Subtarget->isROPI() && IsRO) {
3872 // PC-relative.
3873 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3874 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3875 return Result;
3876 } else if (Subtarget->isRWPI() && !IsRO) {
3877 // SB-relative.
3878 SDValue RelAddr;
3879 if (Subtarget->useMovt()) {
3880 ++NumMovwMovt;
3881 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3882 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3883 } else { // use literal pool for address constant
3886 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3887 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3888 RelAddr = DAG.getLoad(
3889 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3891 }
3892 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3893 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3894 return Result;
3895 }
3896
3897 // If we have T2 ops, we can materialize the address directly via movt/movw
3898 // pair. This is always cheaper. If we need to generate Execute Only code, and we
3899 // only have Thumb1 available, we can't use a constant pool and are forced to
3900 // use immediate relocations.
3901 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3902 if (Subtarget->useMovt())
3903 ++NumMovwMovt;
3904 // FIXME: Once remat is capable of dealing with instructions with register
3905 // operands, expand this into two nodes.
3906 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3907 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3908 } else {
3909 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3910 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3911 return DAG.getLoad(
3912 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3914 }
3915}
3916
3917SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3918 SelectionDAG &DAG) const {
3919 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3920 "ROPI/RWPI not currently supported for Darwin");
3921 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3922 SDLoc dl(Op);
3923 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3924
3925 if (Subtarget->useMovt())
3926 ++NumMovwMovt;
3927
3928 // FIXME: Once remat is capable of dealing with instructions with register
3929 // operands, expand this into multiple nodes
3930 unsigned Wrapper =
3931 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3932
3933 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3934 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3935
3936 if (Subtarget->isGVIndirectSymbol(GV))
3937 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3938 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3939 return Result;
3940}
3941
3942SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3943 SelectionDAG &DAG) const {
3944 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3945 assert(Subtarget->useMovt() &&
3946 "Windows on ARM expects to use movw/movt");
3947 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3948 "ROPI/RWPI not currently supported for Windows");
3949
3950 const TargetMachine &TM = getTargetMachine();
3951 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3952 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3953 if (GV->hasDLLImportStorageClass())
3954 TargetFlags = ARMII::MO_DLLIMPORT;
3955 else if (!TM.shouldAssumeDSOLocal(GV))
3956 TargetFlags = ARMII::MO_COFFSTUB;
3957 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3958 SDValue Result;
3959 SDLoc DL(Op);
3960
3961 ++NumMovwMovt;
3962
3963 // FIXME: Once remat is capable of dealing with instructions with register
3964 // operands, expand this into two nodes.
3965 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3966 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3967 TargetFlags));
3968 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3969 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3970 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3971 return Result;
3972}
3973
3974SDValue
3975ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3976 SDLoc dl(Op);
3977 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3978 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3979 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3980 Op.getOperand(1), Val);
3981}
3982
3983SDValue
3984ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3985 SDLoc dl(Op);
3986 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3987 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3988}
3989
3990SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3991 SelectionDAG &DAG) const {
3992 SDLoc dl(Op);
3993 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3994 Op.getOperand(0));
3995}
3996
3997SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3998 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3999 unsigned IntNo =
4000 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4001 switch (IntNo) {
4002 default:
4003 return SDValue(); // Don't custom lower most intrinsics.
4004 case Intrinsic::arm_gnu_eabi_mcount: {
4005 MachineFunction &MF = DAG.getMachineFunction();
4006 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4007 SDLoc dl(Op);
4008 SDValue Chain = Op.getOperand(0);
4009 // call "\01__gnu_mcount_nc"
4010 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4011 const uint32_t *Mask =
4012 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
4013 assert(Mask && "Missing call preserved mask for calling convention");
4014 // Mark LR an implicit live-in.
4015 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4016 SDValue ReturnAddress =
4017 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4018 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4019 SDValue Callee =
4020 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4021 SDValue RegisterMask = DAG.getRegisterMask(Mask);
4022 if (Subtarget->isThumb())
4023 return SDValue(
4024 DAG.getMachineNode(
4025 ARM::tBL_PUSHLR, dl, ResultTys,
4026 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4027 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4028 0);
4029 return SDValue(
4030 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4031 {ReturnAddress, Callee, RegisterMask, Chain}),
4032 0);
4033 }
4034 }
4035}
4036
4037SDValue
4038ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4039 const ARMSubtarget *Subtarget) const {
4040 unsigned IntNo = Op.getConstantOperandVal(0);
4041 SDLoc dl(Op);
4042 switch (IntNo) {
4043 default: return SDValue(); // Don't custom lower most intrinsics.
4044 case Intrinsic::thread_pointer: {
4045 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4046 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4047 }
4048 case Intrinsic::arm_cls: {
4049 const SDValue &Operand = Op.getOperand(1);
4050 const EVT VTy = Op.getValueType();
4051 SDValue SRA =
4052 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4053 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4054 SDValue SHL =
4055 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4056 SDValue OR =
4057 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4058 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4059 return Result;
4060 }
4061 case Intrinsic::arm_cls64: {
4062 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4063 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
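// A couple of worked examples of the identity above (illustrative only):
//   x = 0x0000000300000000: cls(hi) = 29 != 31, so cls(x) = 29.
//   x = 0x000000000FFFFFFF: cls(hi) = 31 and hi == 0, so cls(x) = 31 + clz(lo) = 31 + 4 = 35.
//   x = 0xFFFFFFFFF0000000: cls(hi) = 31 and hi != 0, so cls(x) = 31 + clz(~lo) = 31 + 4 = 35.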
4064 const SDValue &Operand = Op.getOperand(1);
4065 const EVT VTy = Op.getValueType();
4066 SDValue Lo, Hi;
4067 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4068 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4069 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4070 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4071 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4072 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4073 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4074 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4075 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4076 SDValue CheckLo =
4077 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4078 SDValue HiIsZero =
4079 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4080 SDValue AdjustedLo =
4081 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4082 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4083 SDValue Result =
4084 DAG.getSelect(dl, VTy, CheckLo,
4085 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4086 return Result;
4087 }
4088 case Intrinsic::eh_sjlj_lsda: {
4089 MachineFunction &MF = DAG.getMachineFunction();
4090 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4091 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4092 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4093 SDValue CPAddr;
4094 bool IsPositionIndependent = isPositionIndependent();
4095 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4096 ARMConstantPoolValue *CPV =
4097 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4098 ARMCP::CPLSDA, PCAdj);
4099 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4100 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4101 SDValue Result = DAG.getLoad(
4102 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4103 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
4104
4105 if (IsPositionIndependent) {
4106 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4107 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4108 }
4109 return Result;
4110 }
4111 case Intrinsic::arm_neon_vabs:
4112 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4113 Op.getOperand(1));
4114 case Intrinsic::arm_neon_vabds:
4115 if (Op.getValueType().isInteger())
4116 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4117 Op.getOperand(1), Op.getOperand(2));
4118 return SDValue();
4119 case Intrinsic::arm_neon_vabdu:
4120 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4121 Op.getOperand(1), Op.getOperand(2));
4122 case Intrinsic::arm_neon_vmulls:
4123 case Intrinsic::arm_neon_vmullu: {
4124 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4125 ? ARMISD::VMULLs : ARMISD::VMULLu;
4126 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4127 Op.getOperand(1), Op.getOperand(2));
4128 }
4129 case Intrinsic::arm_neon_vminnm:
4130 case Intrinsic::arm_neon_vmaxnm: {
4131 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4132 ? ISD::FMINNUM : ISD::FMAXNUM;
4133 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4134 Op.getOperand(1), Op.getOperand(2));
4135 }
4136 case Intrinsic::arm_neon_vminu:
4137 case Intrinsic::arm_neon_vmaxu: {
4138 if (Op.getValueType().isFloatingPoint())
4139 return SDValue();
4140 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4141 ? ISD::UMIN : ISD::UMAX;
4142 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4143 Op.getOperand(1), Op.getOperand(2));
4144 }
4145 case Intrinsic::arm_neon_vmins:
4146 case Intrinsic::arm_neon_vmaxs: {
4147 // v{min,max}s is overloaded between signed integers and floats.
4148 if (!Op.getValueType().isFloatingPoint()) {
4149 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4150 ? ISD::SMIN : ISD::SMAX;
4151 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4152 Op.getOperand(1), Op.getOperand(2));
4153 }
4154 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4155 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4156 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4157 Op.getOperand(1), Op.getOperand(2));
4158 }
4159 case Intrinsic::arm_neon_vtbl1:
4160 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4161 Op.getOperand(1), Op.getOperand(2));
4162 case Intrinsic::arm_neon_vtbl2:
4163 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4164 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4165 case Intrinsic::arm_mve_pred_i2v:
4166 case Intrinsic::arm_mve_pred_v2i:
4167 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4168 Op.getOperand(1));
4169 case Intrinsic::arm_mve_vreinterpretq:
4170 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4171 Op.getOperand(1));
4172 case Intrinsic::arm_mve_lsll:
4173 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4174 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4175 case Intrinsic::arm_mve_asrl:
4176 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4177 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4178 }
4179}
4180
4181 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4182 const ARMSubtarget *Subtarget) {
4183 SDLoc dl(Op);
4184 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4185 if (SSID == SyncScope::SingleThread)
4186 return Op;
4187
4188 if (!Subtarget->hasDataBarrier()) {
4189 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4190 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4191 // here.
4192 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4193 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4194 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4195 DAG.getConstant(0, dl, MVT::i32));
4196 }
4197
4198 AtomicOrdering Ord =
4199 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4200 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4201 if (Subtarget->isMClass()) {
4202 // Only a full system barrier exists in the M-class architectures.
4203 Domain = ARM_MB::SY;
4204 } else if (Subtarget->preferISHSTBarriers() &&
4205 Ord == AtomicOrdering::Release) {
4206 // Swift happens to implement ISHST barriers in a way that's compatible with
4207 // Release semantics but weaker than ISH so we'd be fools not to use
4208 // it. Beware: other processors probably don't!
4209 Domain = ARM_MB::ISHST;
4210 }
4211
4212 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4213 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4214 DAG.getConstant(Domain, dl, MVT::i32));
4215}
4216
4217 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4218 const ARMSubtarget *Subtarget) {
4219 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4220 if (!(Subtarget->isThumb2() ||
4221 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4222 // Just preserve the chain.
4223 return Op.getOperand(0);
4224
4225 SDLoc dl(Op);
4226 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4227 if (!isRead &&
4228 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4229 // ARMv7 with MP extension has PLDW.
4230 return Op.getOperand(0);
4231
4232 unsigned isData = Op.getConstantOperandVal(4);
4233 if (Subtarget->isThumb()) {
4234 // Invert the bits.
4235 isRead = ~isRead & 1;
4236 isData = ~isData & 1;
4237 }
4238
4239 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4240 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4241 DAG.getConstant(isData, dl, MVT::i32));
4242}
4243
4244 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4245 MachineFunction &MF = DAG.getMachineFunction();
4246 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4247
4248 // vastart just stores the address of the VarArgsFrameIndex slot into the
4249 // memory location argument.
4250 SDLoc dl(Op);
4251 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4252 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4253 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4254 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4255 MachinePointerInfo(SV));
4256}
4257
4258SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4259 CCValAssign &NextVA,
4260 SDValue &Root,
4261 SelectionDAG &DAG,
4262 const SDLoc &dl) const {
4263 MachineFunction &MF = DAG.getMachineFunction();
4264 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4265
4266 const TargetRegisterClass *RC;
4267 if (AFI->isThumb1OnlyFunction())
4268 RC = &ARM::tGPRRegClass;
4269 else
4270 RC = &ARM::GPRRegClass;
4271
4272 // Transform the arguments stored in physical registers into virtual ones.
4273 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4274 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4275
4276 SDValue ArgValue2;
4277 if (NextVA.isMemLoc()) {
4278 MachineFrameInfo &MFI = MF.getFrameInfo();
4279 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4280
4281 // Create load node to retrieve arguments from the stack.
4282 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4283 ArgValue2 = DAG.getLoad(
4284 MVT::i32, dl, Root, FIN,
4285 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4286 } else {
4287 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4288 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4289 }
4290 if (!Subtarget->isLittle())
4291 std::swap (ArgValue, ArgValue2);
4292 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4293}
4294
4295// The remaining GPRs hold either the beginning of variable-argument
4296// data, or the beginning of an aggregate passed by value (usually
4297// byval). Either way, we allocate stack slots adjacent to the data
4298// provided by our caller, and store the unallocated registers there.
4299// If this is a variadic function, the va_list pointer will begin with
4300// these values; otherwise, this reassembles a (byval) structure that
4301// was split between registers and memory.
4302 // Return: The frame index the registers were stored into.
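// For example, in a variadic function whose named arguments occupy r0 and r1,
// r2 and r3 are spilled here into stack slots placed directly below the
// caller's stack-passed arguments, so that va_arg can walk every argument
// contiguously in memory.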
4303int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4304 const SDLoc &dl, SDValue &Chain,
4305 const Value *OrigArg,
4306 unsigned InRegsParamRecordIdx,
4307 int ArgOffset, unsigned ArgSize) const {
4308 // Currently, two use-cases are possible:
4309 // Case #1. Non-var-args function, and we meet first byval parameter.
4310 // Setup first unallocated register as first byval register;
4311 // eat all remaining registers
4312 // (these two actions are performed by HandleByVal method).
4313 // Then, here, we initialize stack frame with
4314 // "store-reg" instructions.
4315 // Case #2. Var-args function, that doesn't contain byval parameters.
4316 // The same: eat all remaining unallocated registers,
4317 // initialize stack frame.
4318
4319 MachineFunction &MF = DAG.getMachineFunction();
4320 MachineFrameInfo &MFI = MF.getFrameInfo();
4321 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4322 unsigned RBegin, REnd;
4323 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4324 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4325 } else {
4326 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4327 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4328 REnd = ARM::R4;
4329 }
4330
4331 if (REnd != RBegin)
4332 ArgOffset = -4 * (ARM::R4 - RBegin);
4333
4334 auto PtrVT = getPointerTy(DAG.getDataLayout());
4335 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4336 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4337
4338 SmallVector<SDValue, 4> MemOps;
4339 const TargetRegisterClass *RC =
4340 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4341
4342 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4343 Register VReg = MF.addLiveIn(Reg, RC);
4344 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4345 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4346 MachinePointerInfo(OrigArg, 4 * i));
4347 MemOps.push_back(Store);
4348 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4349 }
4350
4351 if (!MemOps.empty())
4352 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4353 return FrameIndex;
4354}
4355
4356 // Set up the stack frame that the va_list pointer will start from.
4357void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4358 const SDLoc &dl, SDValue &Chain,
4359 unsigned ArgOffset,
4360 unsigned TotalArgRegsSaveSize,
4361 bool ForceMutable) const {
4362 MachineFunction &MF = DAG.getMachineFunction();
4363 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4364
4365 // Try to store any remaining integer argument regs
4366 // to their spots on the stack so that they may be loaded by dereferencing
4367 // the result of va_next.
4368 // If there are no regs to be stored, just point the address after the last
4369 // argument passed via the stack.
4370 int FrameIndex = StoreByValRegs(
4371 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4372 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4373 AFI->setVarArgsFrameIndex(FrameIndex);
4374}
4375
4376bool ARMTargetLowering::splitValueIntoRegisterParts(
4377 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4378 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4379 EVT ValueVT = Val.getValueType();
4380 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4381 unsigned ValueBits = ValueVT.getSizeInBits();
4382 unsigned PartBits = PartVT.getSizeInBits();
4383 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4384 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4385 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4386 Parts[0] = Val;
4387 return true;
4388 }
4389 return false;
4390}
4391
4392SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4393 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4394 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4395 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4396 unsigned ValueBits = ValueVT.getSizeInBits();
4397 unsigned PartBits = PartVT.getSizeInBits();
4398 SDValue Val = Parts[0];
4399
4400 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4401 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4402 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4403 return Val;
4404 }
4405 return SDValue();
4406}
4407
4408SDValue ARMTargetLowering::LowerFormalArguments(
4409 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4410 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4411 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4412 MachineFunction &MF = DAG.getMachineFunction();
4413 MachineFrameInfo &MFI = MF.getFrameInfo();
4414
4415 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4416
4417 // Assign locations to all of the incoming arguments.
4418 SmallVector<CCValAssign, 16> ArgLocs;
4419 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4420 *DAG.getContext());
4421 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4422
4424 unsigned CurArgIdx = 0;
4425
4426 // Initially ArgRegsSaveSize is zero.
4428 // Then we increase this value each time we meet a byval parameter.
4428 // We also increase this value in case of varargs function.
4429 AFI->setArgRegsSaveSize(0);
4430
4431 // Calculate the amount of stack space that we need to allocate to store
4432 // byval and variadic arguments that are passed in registers.
4433 // We need to know this before we allocate the first byval or variadic
4434 // argument, as they will be allocated a stack slot below the CFA (Canonical
4435 // Frame Address, the stack pointer at entry to the function).
4436 unsigned ArgRegBegin = ARM::R4;
4437 for (const CCValAssign &VA : ArgLocs) {
4438 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4439 break;
4440
4441 unsigned Index = VA.getValNo();
4442 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4443 if (!Flags.isByVal())
4444 continue;
4445
4446 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4447 unsigned RBegin, REnd;
4448 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4449 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4450
4451 CCInfo.nextInRegsParam();
4452 }
4453 CCInfo.rewindByValRegsInfo();
4454
4455 int lastInsIndex = -1;
4456 if (isVarArg && MFI.hasVAStart()) {
4457 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4458 if (RegIdx != std::size(GPRArgRegs))
4459 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4460 }
4461
4462 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4463 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4464 auto PtrVT = getPointerTy(DAG.getDataLayout());
4465
4466 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4467 CCValAssign &VA = ArgLocs[i];
4468 if (Ins[VA.getValNo()].isOrigArg()) {
4469 std::advance(CurOrigArg,
4470 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4471 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4472 }
4473 // Arguments stored in registers.
4474 if (VA.isRegLoc()) {
4475 EVT RegVT = VA.getLocVT();
4476 SDValue ArgValue;
4477
4478 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4479 // f64 and vector types are split up into multiple registers or
4480 // combinations of registers and stack slots.
4481 SDValue ArgValue1 =
4482 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4483 VA = ArgLocs[++i]; // skip ahead to next loc
4484 SDValue ArgValue2;
4485 if (VA.isMemLoc()) {
4486 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4487 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4488 ArgValue2 = DAG.getLoad(
4489 MVT::f64, dl, Chain, FIN,
4490 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4491 } else {
4492 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4493 }
4494 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4495 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4496 ArgValue1, DAG.getIntPtrConstant(0, dl));
4497 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4498 ArgValue2, DAG.getIntPtrConstant(1, dl));
4499 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4500 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4501 } else {
4502 const TargetRegisterClass *RC;
4503
4504 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4505 RC = &ARM::HPRRegClass;
4506 else if (RegVT == MVT::f32)
4507 RC = &ARM::SPRRegClass;
4508 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4509 RegVT == MVT::v4bf16)
4510 RC = &ARM::DPRRegClass;
4511 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4512 RegVT == MVT::v8bf16)
4513 RC = &ARM::QPRRegClass;
4514 else if (RegVT == MVT::i32)
4515 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4516 : &ARM::GPRRegClass;
4517 else
4518 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4519
4520 // Transform the arguments in physical registers into virtual ones.
4521 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4522 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4523
4524 // If this value is passed in r0 and has the returned attribute (e.g.
4525 // C++ 'structors), record this fact for later use.
4526 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4527 AFI->setPreservesR0();
4528 }
4529 }
4530
4531 // If this is an 8 or 16-bit value, it is really passed promoted
4532 // to 32 bits. Insert an assert[sz]ext to capture this, then
4533 // truncate to the right size.
4534 switch (VA.getLocInfo()) {
4535 default: llvm_unreachable("Unknown loc info!");
4536 case CCValAssign::Full: break;
4537 case CCValAssign::BCvt:
4538 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4539 break;
4540 }
4541
4542 // f16 arguments have their size extended to 4 bytes and passed as if they
4543 // had been copied to the LSBs of a 32-bit register.
4544 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4545 if (VA.needsCustom() &&
4546 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4547 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4548
4549 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4550 // less than 32 bits must be sign- or zero-extended in the callee for
4551 // security reasons. Although the ABI mandates an extension done by the
4552 // caller, the latter cannot be trusted to follow the rules of the ABI.
4553 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4554 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4555 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4556 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4557
4558 InVals.push_back(ArgValue);
4559 } else { // VA.isRegLoc()
4560 // Only arguments passed on the stack should make it here.
4561 assert(VA.isMemLoc());
4562 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4563
4564 int index = VA.getValNo();
4565
4566 // Some Ins[] entries become multiple ArgLoc[] entries.
4567 // Process them only once.
4568 if (index != lastInsIndex)
4569 {
4570 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4571 // FIXME: For now, all byval parameter objects are marked mutable.
4572 // This can be changed with more analysis.
4573 // In case of tail call optimization, mark all arguments mutable,
4574 // since they could be overwritten by the lowering of arguments in case of
4575 // a tail call.
4576 if (Flags.isByVal()) {
4577 assert(Ins[index].isOrigArg() &&
4578 "Byval arguments cannot be implicit");
4579 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4580
4581 int FrameIndex = StoreByValRegs(
4582 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4583 VA.getLocMemOffset(), Flags.getByValSize());
4584 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4585 CCInfo.nextInRegsParam();
4586 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4587 VA.getValVT() == MVT::bf16)) {
4588 // f16 and bf16 values are passed in the least-significant half of
4589 // a 4 byte stack slot. This is done as-if the extension was done
4590 // in a 32-bit register, so the actual bytes used for the value
4591 // differ between little and big endian.
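// (On big-endian targets those least-significant bytes sit at the higher
// address within the slot, which is why the address is adjusted on
// big-endian below.)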
4592 assert(VA.getLocVT().getSizeInBits() == 32);
4593 unsigned FIOffset = VA.getLocMemOffset();
4594 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4595 FIOffset, true);
4596
4597 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4598 if (DAG.getDataLayout().isBigEndian())
4599 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4600
4601 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4602 MachinePointerInfo::getFixedStack(
4603 DAG.getMachineFunction(), FI)));
4604
4605 } else {
4606 unsigned FIOffset = VA.getLocMemOffset();
4607 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4608 FIOffset, true);
4609
4610 // Create load nodes to retrieve arguments from the stack.
4611 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4612 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4613 MachinePointerInfo::getFixedStack(
4614 DAG.getMachineFunction(), FI)));
4615 }
4616 lastInsIndex = index;
4617 }
4618 }
4619 }
4620
4621 // varargs
4622 if (isVarArg && MFI.hasVAStart()) {
4623 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4624 TotalArgRegsSaveSize);
4625 if (AFI->isCmseNSEntryFunction()) {
4626 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4627 DAG.getMachineFunction().getFunction(),
4628 "secure entry function must not be variadic", dl.getDebugLoc()));
4629 }
4630 }
4631
4632 unsigned StackArgSize = CCInfo.getStackSize();
4633 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4634 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4635 // The only way to guarantee a tail call is if the callee restores its
4636 // argument area, but it must also keep the stack aligned when doing so.
4637 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4638 assert(StackAlign && "data layout string is missing stack alignment");
4639 StackArgSize = alignTo(StackArgSize, *StackAlign);
4640
4641 AFI->setArgumentStackToRestore(StackArgSize);
4642 }
4643 AFI->setArgumentStackSize(StackArgSize);
4644
4645 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4646 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4647 DAG.getMachineFunction().getFunction(),
4648 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4649 }
4650
4651 return Chain;
4652}
4653
4654/// isFloatingPointZero - Return true if this is +0.0.
4655 static bool isFloatingPointZero(SDValue Op) {
4656 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4657 return CFP->getValueAPF().isPosZero();
4658 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4659 // Maybe this has already been legalized into the constant pool?
4660 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4661 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4662 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4663 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4664 return CFP->getValueAPF().isPosZero();
4665 }
4666 } else if (Op->getOpcode() == ISD::BITCAST &&
4667 Op->getValueType(0) == MVT::f64) {
4668 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4669 // created by LowerConstantFP().
4670 SDValue BitcastOp = Op->getOperand(0);
4671 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4672 isNullConstant(BitcastOp->getOperand(0)))
4673 return true;
4674 }
4675 return false;
4676}
4677
4678/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4679/// the given operands.
4680SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4681 SDValue &ARMcc, SelectionDAG &DAG,
4682 const SDLoc &dl) const {
4683 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4684 unsigned C = RHSC->getZExtValue();
4685 if (!isLegalICmpImmediate((int32_t)C)) {
4686 // Constant does not fit, try adjusting it by one.
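// For example, on Thumb1 (where cmp immediates are limited to 0..255),
// "x < 256" cannot be encoded directly, but the equivalent "x <= 255" can.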
4687 switch (CC) {
4688 default: break;
4689 case ISD::SETLT:
4690 case ISD::SETGE:
4691 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4692 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4693 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4694 }
4695 break;
4696 case ISD::SETULT:
4697 case ISD::SETUGE:
4698 if (C != 0 && isLegalICmpImmediate(C-1)) {
4699 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4700 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4701 }
4702 break;
4703 case ISD::SETLE:
4704 case ISD::SETGT:
4705 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4706 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4707 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4708 }
4709 break;
4710 case ISD::SETULE:
4711 case ISD::SETUGT:
4712 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4713 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4714 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4715 }
4716 break;
4717 }
4718 }
4719 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4720 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4721 // In ARM and Thumb-2, the compare instructions can shift their second
4722 // operand.
4723 CC = ISD::getSetCCSwappedOperands(CC);
4724 std::swap(LHS, RHS);
4725 }
4726
4727 // Thumb1 has very limited immediate modes, so turning an "and" into a
4728 // shift can save multiple instructions.
4729 //
4730 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4731 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4732 // own. If it's the operand to an unsigned comparison with an immediate,
4733 // we can eliminate one of the shifts: we transform
4734 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4735 //
4736 // We avoid transforming cases which aren't profitable due to encoding
4737 // details:
4738 //
4739 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4740 // would not; in that case, we're essentially trading one immediate load for
4741 // another.
4742 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4743 // 3. C2 is zero; we have other code for this special case.
4744 //
4745 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4746 // instruction, since the AND is always one instruction anyway, but we could
4747 // use narrow instructions in some cases.
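// For example, with C1 = 0x3ffff and C2 = 0x300 (which does not fit in a
// Thumb1 cmp immediate either before or after the transform),
// "(x & 0x3ffff) == 0x300" becomes "(x << 14) == (0x300 << 14)", trading the
// AND (two shifts on Thumb1) for a single shift.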
4748 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4749 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4750 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4751 !isSignedIntSetCC(CC)) {
4752 unsigned Mask = LHS.getConstantOperandVal(1);
4753 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4754 uint64_t RHSV = RHSC->getZExtValue();
4755 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4756 unsigned ShiftBits = llvm::countl_zero(Mask);
4757 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4758 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4759 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4760 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4761 }
4762 }
4763 }
4764
4765 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4766 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4767 // way a cmp would.
4768 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4769 // some tweaks to the heuristics for the previous and->shift transform.
4770 // FIXME: Optimize cases where the LHS isn't a shift.
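// For example, "(x << 3) > 0x80000000U" becomes "lsls x, #4": C receives bit 28
// of x (the would-be sign bit of x << 3) and Z is set only when the bits below
// it are all zero, so the HI condition (C set, Z clear) is exactly the original
// test.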
4771 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4772 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4773 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4774 LHS.getConstantOperandVal(1) < 31) {
4775 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4776 SDValue Shift =
4777 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4778 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4779 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4780 return Shift.getValue(1);
4781 }
4782
4783 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4784
4785 // If the RHS is a constant zero then the V (overflow) flag will never be
4786 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4787 // simpler for other passes (like the peephole optimiser) to deal with.
4788 if (isNullConstant(RHS)) {
4789 switch (CondCode) {
4790 default: break;
4791 case ARMCC::GE:
4792 CondCode = ARMCC::PL;
4793 break;
4794 case ARMCC::LT:
4795 CondCode = ARMCC::MI;
4796 break;
4797 }
4798 }
4799
4800 ARMISD::NodeType CompareType;
4801 switch (CondCode) {
4802 default:
4803 CompareType = ARMISD::CMP;
4804 break;
4805 case ARMCC::EQ:
4806 case ARMCC::NE:
4807 // Uses only Z Flag
4808 CompareType = ARMISD::CMPZ;
4809 break;
4810 }
4811 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4812 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4813}
4814
4815/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4816SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4817 SelectionDAG &DAG, const SDLoc &dl,
4818 bool Signaling) const {
4819 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4820 SDValue Flags;
4821 if (!isFloatingPointZero(RHS))
4822 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4823 LHS, RHS);
4824 else
4825 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4826 FlagsVT, LHS);
4827 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4828}
4829
4830// This function returns three things: the arithmetic computation itself
4831// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4832// comparison and the condition code define the case in which the arithmetic
4833// computation *does not* overflow.
4834std::pair<SDValue, SDValue>
4835ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4836 SDValue &ARMcc) const {
4837 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4838
4839 SDValue Value, OverflowCmp;
4840 SDValue LHS = Op.getOperand(0);
4841 SDValue RHS = Op.getOperand(1);
4842 SDLoc dl(Op);
4843
4844 // FIXME: We are currently always generating CMPs because we don't support
4845 // generating CMN through the backend. This is not as good as the natural
4846 // CMP case because it causes a register dependency and cannot be folded
4847 // later.
4848
4849 switch (Op.getOpcode()) {
4850 default:
4851 llvm_unreachable("Unknown overflow instruction!");
4852 case ISD::SADDO:
4853 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4854 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4855 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4856 break;
4857 case ISD::UADDO:
4858 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4859 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4860 // We do not use it in the USUBO case as Value may not be used.
4861 Value = DAG.getNode(ARMISD::ADDC, dl,
4862 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4863 .getValue(0);
4864 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4865 break;
4866 case ISD::SSUBO:
4867 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4868 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4869 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4870 break;
4871 case ISD::USUBO:
4872 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4873 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4874 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4875 break;
4876 case ISD::UMULO:
4877 // We generate a UMUL_LOHI and then check if the high word is 0.
4878 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4879 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4880 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4881 LHS, RHS);
4882 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4883 DAG.getConstant(0, dl, MVT::i32));
4884 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4885 break;
4886 case ISD::SMULO:
4887 // We generate a SMUL_LOHI and then check if all the bits of the high word
4888 // are the same as the sign bit of the low word.
4889 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4890 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4891 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4892 LHS, RHS);
4893 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4894 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4895 Value.getValue(0),
4896 DAG.getConstant(31, dl, MVT::i32)));
4897 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4898 break;
4899 } // switch (...)
4900
4901 return std::make_pair(Value, OverflowCmp);
4902}
4903
4904SDValue
4905ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4906 // Let legalize expand this if it isn't a legal type yet.
4907 if (!isTypeLegal(Op.getValueType()))
4908 return SDValue();
4909
4910 SDValue Value, OverflowCmp;
4911 SDValue ARMcc;
4912 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4913 SDLoc dl(Op);
4914 // We use 0 and 1 as false and true values.
4915 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4916 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4917 EVT VT = Op.getValueType();
4918
4919 SDValue Overflow =
4920 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4921
4922 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4923 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4924}
4925
4926 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4927 SelectionDAG &DAG) {
4928 SDLoc DL(BoolCarry);
4929 EVT CarryVT = BoolCarry.getValueType();
4930
4931 // This converts the boolean value carry into the carry flag by doing
4932 // ARMISD::SUBC Carry, 1
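// Subtracting 1 from a boolean 1 gives 0 with no borrow (C = 1 on ARM), while
// subtracting 1 from 0 borrows (C = 0), so the flag ends up equal to the input.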
4933 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4934 DAG.getVTList(CarryVT, MVT::i32),
4935 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4936 return Carry.getValue(1);
4937}
4938
4939 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4940 SelectionDAG &DAG) {
4941 SDLoc DL(Flags);
4942
4943 // Now convert the carry flag into a boolean carry. We do this
4944 // using ARMISD::ADDE 0, 0, Carry
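// 0 + 0 + carry simply materializes the carry flag as a 0/1 value of type VT.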
4945 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4946 DAG.getConstant(0, DL, MVT::i32),
4947 DAG.getConstant(0, DL, MVT::i32), Flags);
4948}
4949
4950SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4951 SelectionDAG &DAG) const {
4952 // Let legalize expand this if it isn't a legal type yet.
4953 if (!isTypeLegal(Op.getValueType()))
4954 return SDValue();
4955
4956 SDValue LHS = Op.getOperand(0);
4957 SDValue RHS = Op.getOperand(1);
4958 SDLoc dl(Op);
4959
4960 EVT VT = Op.getValueType();
4961 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4962 SDValue Value;
4963 SDValue Overflow;
4964 switch (Op.getOpcode()) {
4965 default:
4966 llvm_unreachable("Unknown overflow instruction!");
4967 case ISD::UADDO:
4968 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4969 // Convert the carry flag into a boolean value.
4970 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4971 break;
4972 case ISD::USUBO: {
4973 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4974 // Convert the carry flag into a boolean value.
4975 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4976 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4977 // value. So compute 1 - C.
4978 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4979 DAG.getConstant(1, dl, MVT::i32), Overflow);
4980 break;
4981 }
4982 }
4983
4984 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4985}
4986
4987 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4988 const ARMSubtarget *Subtarget) {
4989 EVT VT = Op.getValueType();
4990 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4991 return SDValue();
4992 if (!VT.isSimple())
4993 return SDValue();
4994
4995 unsigned NewOpcode;
4996 switch (VT.getSimpleVT().SimpleTy) {
4997 default:
4998 return SDValue();
4999 case MVT::i8:
5000 switch (Op->getOpcode()) {
5001 case ISD::UADDSAT:
5002 NewOpcode = ARMISD::UQADD8b;
5003 break;
5004 case ISD::SADDSAT:
5005 NewOpcode = ARMISD::QADD8b;
5006 break;
5007 case ISD::USUBSAT:
5008 NewOpcode = ARMISD::UQSUB8b;
5009 break;
5010 case ISD::SSUBSAT:
5011 NewOpcode = ARMISD::QSUB8b;
5012 break;
5013 }
5014 break;
5015 case MVT::i16:
5016 switch (Op->getOpcode()) {
5017 case ISD::UADDSAT:
5018 NewOpcode = ARMISD::UQADD16b;
5019 break;
5020 case ISD::SADDSAT:
5021 NewOpcode = ARMISD::QADD16b;
5022 break;
5023 case ISD::USUBSAT:
5024 NewOpcode = ARMISD::UQSUB16b;
5025 break;
5026 case ISD::SSUBSAT:
5027 NewOpcode = ARMISD::QSUB16b;
5028 break;
5029 }
5030 break;
5031 }
5032
5033 SDLoc dl(Op);
5034 SDValue Add =
5035 DAG.getNode(NewOpcode, dl, MVT::i32,
5036 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5037 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5038 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5039}
5040
5041SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5042 SDValue Cond = Op.getOperand(0);
5043 SDValue SelectTrue = Op.getOperand(1);
5044 SDValue SelectFalse = Op.getOperand(2);
5045 SDLoc dl(Op);
5046 unsigned Opc = Cond.getOpcode();
5047
5048 if (Cond.getResNo() == 1 &&
5049 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5050 Opc == ISD::USUBO)) {
5051 if (!isTypeLegal(Cond->getValueType(0)))
5052 return SDValue();
5053
5054 SDValue Value, OverflowCmp;
5055 SDValue ARMcc;
5056 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5057 EVT VT = Op.getValueType();
5058
5059 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
5060 }
5061
5062 // Convert:
5063 //
5064 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5065 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5066 //
5067 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5068 const ConstantSDNode *CMOVTrue =
5069 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5070 const ConstantSDNode *CMOVFalse =
5071 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5072
5073 if (CMOVTrue && CMOVFalse) {
5074 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5075 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5076
5077 SDValue True;
5078 SDValue False;
5079 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5080 True = SelectTrue;
5081 False = SelectFalse;
5082 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5083 True = SelectFalse;
5084 False = SelectTrue;
5085 }
5086
5087 if (True.getNode() && False.getNode())
5088 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
5089 Cond.getOperand(3), DAG);
5090 }
5091 }
5092
5093 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5094 // undefined bits before doing a full-word comparison with zero.
5095 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5096 DAG.getConstant(1, dl, Cond.getValueType()));
5097
5098 return DAG.getSelectCC(dl, Cond,
5099 DAG.getConstant(0, dl, Cond.getValueType()),
5100 SelectTrue, SelectFalse, ISD::SETNE);
5101}
5102
5103 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5104 bool &swpCmpOps, bool &swpVselOps) {
5105 // Start by selecting the GE condition code for opcodes that return true for
5106 // 'equality'
5107 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5108 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5109 CondCode = ARMCC::GE;
5110
5111 // and GT for opcodes that return false for 'equality'.
5112 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5113 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5114 CondCode = ARMCC::GT;
5115
5116 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5117 // to swap the compare operands.
5118 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5119 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5120 swpCmpOps = true;
5121
5122 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5123 // If we have an unordered opcode, we need to swap the operands to the VSEL
5124 // instruction (effectively negating the condition).
5125 //
5126 // This also has the effect of swapping which one of 'less' or 'greater'
5127 // returns true, so we also swap the compare operands. It also switches
5128 // whether we return true for 'equality', so we compensate by picking the
5129 // opposite condition code to our original choice.
5130 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5131 CC == ISD::SETUGT) {
5132 swpCmpOps = !swpCmpOps;
5133 swpVselOps = !swpVselOps;
5134 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5135 }
5136
5137 // 'ordered' is 'anything but unordered', so use the VS condition code and
5138 // swap the VSEL operands.
5139 if (CC == ISD::SETO) {
5140 CondCode = ARMCC::VS;
5141 swpVselOps = true;
5142 }
5143
5144 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5145 // code and swap the VSEL operands. Also do this if we don't care about the
5146 // unordered case.
5147 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5148 CondCode = ARMCC::EQ;
5149 swpVselOps = true;
5150 }
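// Worked example: for SETULT the blocks above pick GT, request a compare-operand
// swap for 'less', then (unordered group) undo that swap, swap the VSEL operands
// instead and relax GT to GE. The final select therefore yields the original
// 'true' value exactly when a < b or the operands are unordered, i.e. a <u b.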
5151}
5152
5153SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5154 SDValue TrueVal, SDValue ARMcc,
5155 SDValue Flags, SelectionDAG &DAG) const {
5156 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5157 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5158 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5159 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5160 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5161
5162 SDValue TrueLow = TrueVal.getValue(0);
5163 SDValue TrueHigh = TrueVal.getValue(1);
5164 SDValue FalseLow = FalseVal.getValue(0);
5165 SDValue FalseHigh = FalseVal.getValue(1);
5166
5167 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5168 ARMcc, Flags);
5169 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5170 ARMcc, Flags);
5171
5172 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5173 }
5174 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5175}
5176
5177static bool isGTorGE(ISD::CondCode CC) {
5178 return CC == ISD::SETGT || CC == ISD::SETGE;
5179}
5180
5181static bool isLTorLE(ISD::CondCode CC) {
5182 return CC == ISD::SETLT || CC == ISD::SETLE;
5183}
5184
5185// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5186// All of these conditions (and their <= and >= counterparts) will do:
5187// x < k ? k : x
5188// x > k ? x : k
5189// k < x ? x : k
5190// k > x ? k : x
5191static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5192 const SDValue TrueVal, const SDValue FalseVal,
5193 const ISD::CondCode CC, const SDValue K) {
5194 return (isGTorGE(CC) &&
5195 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5196 (isLTorLE(CC) &&
5197 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5198}
5199
5200// Check if two chained conditionals could be converted into SSAT or USAT.
5201//
5202// SSAT can replace a set of two conditional selectors that bound a number to an
5203// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5204//
5205// x < -k ? -k : (x > k ? k : x)
5206// x < -k ? -k : (x < k ? x : k)
5207// x > -k ? (x > k ? k : x) : -k
5208// x < k ? (x < -k ? -k : x) : k
5209// etc.
5210//
5211// LLVM canonicalizes these to either a min(max()) or a max(min())
5212 // pattern. This function tries to match one of these and will return an SSAT
5213// node if successful.
5214//
5215 // USAT works similarly to SSAT, but bounds the value to the interval [0, k], where k + 1
5216// is a power of 2.
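// For example, clamping x to [-128, 127] (k = 127, k + 1 = 2^7) matches the
// signed pattern below and becomes an SSAT, while clamping to [0, 255] matches
// the unsigned pattern and becomes a USAT.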
5217 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5218 EVT VT = Op.getValueType();
5219 SDValue V1 = Op.getOperand(0);
5220 SDValue K1 = Op.getOperand(1);
5221 SDValue TrueVal1 = Op.getOperand(2);
5222 SDValue FalseVal1 = Op.getOperand(3);
5223 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5224
5225 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5226 if (Op2.getOpcode() != ISD::SELECT_CC)
5227 return SDValue();
5228
5229 SDValue V2 = Op2.getOperand(0);
5230 SDValue K2 = Op2.getOperand(1);
5231 SDValue TrueVal2 = Op2.getOperand(2);
5232 SDValue FalseVal2 = Op2.getOperand(3);
5233 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5234
5235 SDValue V1Tmp = V1;
5236 SDValue V2Tmp = V2;
5237
5238 // Check that the registers and the constants match a max(min()) or min(max())
5239 // pattern
5240 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5241 K2 != FalseVal2 ||
5242 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5243 return SDValue();
5244
5245 // Check that the constant in the lower-bound check is
5246 // the opposite of the constant in the upper-bound check
5247 // in 1's complement.
5248 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5249 return SDValue();
5250
5251 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5252 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5253 int64_t PosVal = std::max(Val1, Val2);
5254 int64_t NegVal = std::min(Val1, Val2);
5255
5256 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5257 !isPowerOf2_64(PosVal + 1))
5258 return SDValue();
5259
5260 // Handle the difference between USAT (unsigned) and SSAT (signed)
5261 // saturation
5262 // At this point, PosVal is guaranteed to be positive
5263 uint64_t K = PosVal;
5264 SDLoc dl(Op);
5265 if (Val1 == ~Val2)
5266 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5267 DAG.getConstant(llvm::countr_one(K), dl, VT));
5268 if (NegVal == 0)
5269 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5270 DAG.getConstant(llvm::countr_one(K), dl, VT));
5271
5272 return SDValue();
5273}
5274
5275// Check if a condition of the type x < k ? k : x can be converted into a
5276// bit operation instead of conditional moves.
5277// Currently this is allowed given:
5278// - The conditions and values match up
5279// - k is 0 or -1 (all ones)
5280 // This function will not check the last condition; that's up to the caller.
5281 // It returns true if the transformation can be made, and in that case
5282// returns x in V, and k in SatK.
5283 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5284 SDValue &SatK)
5285{
5286 SDValue LHS = Op.getOperand(0);
5287 SDValue RHS = Op.getOperand(1);
5288 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5289 SDValue TrueVal = Op.getOperand(2);
5290 SDValue FalseVal = Op.getOperand(3);
5291
5292 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5293 ? &RHS
5294 : nullptr;
5295
5296 // No constant operand in the comparison, early out
5297 if (!K)
5298 return false;
5299
5300 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5301 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5302 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5303
5304 // If the constants on the left and right side, or the variables on the left
5305 // and right, do not match, early out
5306 if (*K != KTmp || V != VTmp)
5307 return false;
5308
5309 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5310 SatK = *K;
5311 return true;
5312 }
5313
5314 return false;
5315}
5316
5317bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5318 if (VT == MVT::f32)
5319 return !Subtarget->hasVFP2Base();
5320 if (VT == MVT::f64)
5321 return !Subtarget->hasFP64();
5322 if (VT == MVT::f16)
5323 return !Subtarget->hasFullFP16();
5324 return false;
5325}
5326
5327SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5328 EVT VT = Op.getValueType();
5329 SDLoc dl(Op);
5330
5331 // Try to convert two saturating conditional selects into a single SSAT
5332 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5333 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5334 return SatValue;
5335
5336 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5337 // into more efficient bit operations, which is possible when k is 0 or -1
5338 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5339 // single instructions. On Thumb the shift and the bit operation will be two
5340 // instructions.
5341 // Only allow this transformation on full-width (32-bit) operations
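// For example, with k = 0 the select "x < 0 ? 0 : x" (i.e. max(x, 0)) becomes
// "x & ~(x >> 31)", and with k = -1 the corresponding max(x, -1) becomes
// "x | (x >> 31)", where the shift is an arithmetic shift right.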
5342 SDValue LowerSatConstant;
5343 SDValue SatValue;
5344 if (VT == MVT::i32 &&
5345 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5346 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5347 DAG.getConstant(31, dl, VT));
5348 if (isNullConstant(LowerSatConstant)) {
5349 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5350 DAG.getAllOnesConstant(dl, VT));
5351 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5352 } else if (isAllOnesConstant(LowerSatConstant))
5353 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5354 }
5355
5356 SDValue LHS = Op.getOperand(0);
5357 SDValue RHS = Op.getOperand(1);
5358 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5359 SDValue TrueVal = Op.getOperand(2);
5360 SDValue FalseVal = Op.getOperand(3);
5361 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5362 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5363 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5364 if (Op.getValueType().isInteger()) {
5365
5366 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5367 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5368 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5369 // Both require less instructions than compare and conditional select.
5370 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5371 RHSC->isZero() && CFVal && CFVal->isZero() &&
5372 LHS.getValueType() == RHS.getValueType()) {
5373 EVT VT = LHS.getValueType();
5374 SDValue Shift =
5375 DAG.getNode(ISD::SRA, dl, VT, LHS,
5376 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5377
5378 if (CC == ISD::SETGT)
5379 Shift = DAG.getNOT(dl, Shift, VT);
5380
5381 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5382 }
5383 }
5384
5385 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5386 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5387 unsigned TVal = CTVal->getZExtValue();
5388 unsigned FVal = CFVal->getZExtValue();
5389 unsigned Opcode = 0;
5390
5391 if (TVal == ~FVal) {
5392 Opcode = ARMISD::CSINV;
5393 } else if (TVal == ~FVal + 1) {
5394 Opcode = ARMISD::CSNEG;
5395 } else if (TVal + 1 == FVal) {
5396 Opcode = ARMISD::CSINC;
5397 } else if (TVal == FVal + 1) {
5398 Opcode = ARMISD::CSINC;
5399 std::swap(TrueVal, FalseVal);
5400 std::swap(TVal, FVal);
5401 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5402 }
5403
5404 if (Opcode) {
5405 // If one of the constants is cheaper than another, materialise the
5406 // cheaper one and let the csel generate the other.
5407 if (Opcode != ARMISD::CSINC &&
5408 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5409 std::swap(TrueVal, FalseVal);
5410 std::swap(TVal, FVal);
5411 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5412 }
5413
5414 // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
5415 // condition to get there. CSINC is not invertible like the other two
5416 // (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
5417 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5418 std::swap(TrueVal, FalseVal);
5419 std::swap(TVal, FVal);
5420 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5421 }
5422
5423 // Drops F's value because we can get it by inverting/negating TVal.
5424 FalseVal = TrueVal;
5425
5426 SDValue ARMcc;
5427 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5428 EVT VT = TrueVal.getValueType();
5429 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5430 }
5431 }
5432
5433 if (isUnsupportedFloatingType(LHS.getValueType())) {
5434 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5435
5436 // If softenSetCCOperands only returned one value, we should compare it to
5437 // zero.
5438 if (!RHS.getNode()) {
5439 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5440 CC = ISD::SETNE;
5441 }
5442 }
5443
5444 if (LHS.getValueType() == MVT::i32) {
5445 // Try to generate VSEL on ARMv8.
5446 // The VSEL instruction can't use all the usual ARM condition
5447 // codes: it only has two bits to select the condition code, so it's
5448 // constrained to use only GE, GT, VS and EQ.
5449 //
5450 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5451 // swap the operands of the previous compare instruction (effectively
5452 // inverting the compare condition, swapping 'less' and 'greater') and
5453 // sometimes need to swap the operands to the VSEL (which inverts the
5454 // condition in the sense of firing whenever the previous condition didn't)
5455 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5456 TrueVal.getValueType() == MVT::f32 ||
5457 TrueVal.getValueType() == MVT::f64)) {
5458 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5459 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5460 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5461 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5462 std::swap(TrueVal, FalseVal);
5463 }
5464 }
5465
5466 SDValue ARMcc;
5467 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5468 // Choose GE over PL, which vsel does not support
5469 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5470 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5471 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5472 }
5473
5474 ARMCC::CondCodes CondCode, CondCode2;
5475 FPCCToARMCC(CC, CondCode, CondCode2);
5476
5477 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5478 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5479 // must use VSEL (limited condition codes), due to not having conditional f16
5480 // moves.
5481 if (Subtarget->hasFPARMv8Base() &&
5482 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5483 (TrueVal.getValueType() == MVT::f16 ||
5484 TrueVal.getValueType() == MVT::f32 ||
5485 TrueVal.getValueType() == MVT::f64)) {
5486 bool swpCmpOps = false;
5487 bool swpVselOps = false;
5488 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5489
5490 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5491 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5492 if (swpCmpOps)
5493 std::swap(LHS, RHS);
5494 if (swpVselOps)
5495 std::swap(TrueVal, FalseVal);
5496 }
5497 }
5498
5499 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5500 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5501 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5502 if (CondCode2 != ARMCC::AL) {
5503 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5504 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5505 }
5506 return Result;
5507}
5508
5509/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5510/// to morph to an integer compare sequence.
5511static bool canChangeToInt(SDValue Op, bool &SeenZero,
5512 const ARMSubtarget *Subtarget) {
5513 SDNode *N = Op.getNode();
5514 if (!N->hasOneUse())
5515 // Otherwise it requires moving the value from fp to integer registers.
5516 return false;
5517 if (!N->getNumValues())
5518 return false;
5519 EVT VT = Op.getValueType();
5520 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5521 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5522 // vmrs are very slow, e.g. cortex-a8.
5523 return false;
5524
5525 if (isFloatingPointZero(Op)) {
5526 SeenZero = true;
5527 return true;
5528 }
5529 return ISD::isNormalLoad(N);
5530}
5531
5532 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5533 if (isFloatingPointZero(Op))
5534 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5535
5536 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5537 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5538 Ld->getPointerInfo(), Ld->getAlign(),
5539 Ld->getMemOperand()->getFlags());
5540
5541 llvm_unreachable("Unknown VFP cmp argument!");
5542}
5543
5544 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5545 SDValue &RetVal1, SDValue &RetVal2) {
5546 SDLoc dl(Op);
5547
5548 if (isFloatingPointZero(Op)) {
5549 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5550 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5551 return;
5552 }
5553
5554 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5555 SDValue Ptr = Ld->getBasePtr();
5556 RetVal1 =
5557 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5558 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5559
5560 EVT PtrType = Ptr.getValueType();
5561 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5562 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5563 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5564 Ld->getPointerInfo().getWithOffset(4),
5565 commonAlignment(Ld->getAlign(), 4),
5566 Ld->getMemOperand()->getFlags());
5567 return;
5568 }
5569
5570 llvm_unreachable("Unknown VFP cmp argument!");
5571}
5572
5573/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5574/// f32 and even f64 comparisons to integer ones.
5575SDValue
5576ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5577 SDValue Chain = Op.getOperand(0);
5578 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5579 SDValue LHS = Op.getOperand(2);
5580 SDValue RHS = Op.getOperand(3);
5581 SDValue Dest = Op.getOperand(4);
5582 SDLoc dl(Op);
5583
5584 bool LHSSeenZero = false;
5585 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5586 bool RHSSeenZero = false;
5587 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5588 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5589 // If unsafe fp math optimization is enabled and there are no other uses of
5590 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5591 // to an integer comparison.
5592 if (CC == ISD::SETOEQ)
5593 CC = ISD::SETEQ;
5594 else if (CC == ISD::SETUNE)
5595 CC = ISD::SETNE;
5596
5597 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5598 SDValue ARMcc;
5599 if (LHS.getValueType() == MVT::f32) {
5600 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5601 bitcastf32Toi32(LHS, DAG), Mask);
5602 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5603 bitcastf32Toi32(RHS, DAG), Mask);
5604 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5605 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5606 Cmp);
5607 }
5608
5609 SDValue LHS1, LHS2;
5610 SDValue RHS1, RHS2;
5611 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5612 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5613 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5614 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5615 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5616 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5617 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5618 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5619 }
5620
5621 return SDValue();
5622}
5623
5624SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5625 SDValue Chain = Op.getOperand(0);
5626 SDValue Cond = Op.getOperand(1);
5627 SDValue Dest = Op.getOperand(2);
5628 SDLoc dl(Op);
5629
5630 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5631 // instruction.
5632 unsigned Opc = Cond.getOpcode();
5633 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5634 !Subtarget->isThumb1Only();
5635 if (Cond.getResNo() == 1 &&
5636 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5637 Opc == ISD::USUBO || OptimizeMul)) {
5638 // Only lower legal XALUO ops.
5639 if (!isTypeLegal(Cond->getValueType(0)))
5640 return SDValue();
5641
5642 // The actual operation with overflow check.
5643 SDValue Value, OverflowCmp;
5644 SDValue ARMcc;
5645 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5646
5647 // Reverse the condition code.
5648 ARMCC::CondCodes CondCode =
5649 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5650 CondCode = ARMCC::getOppositeCondition(CondCode);
5651 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5652
5653 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5654 OverflowCmp);
5655 }
5656
5657 return SDValue();
5658}
5659
5660SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5661 SDValue Chain = Op.getOperand(0);
5662 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5663 SDValue LHS = Op.getOperand(2);
5664 SDValue RHS = Op.getOperand(3);
5665 SDValue Dest = Op.getOperand(4);
5666 SDLoc dl(Op);
5667
5668 if (isUnsupportedFloatingType(LHS.getValueType())) {
5669 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5670
5671 // If softenSetCCOperands only returned one value, we should compare it to
5672 // zero.
5673 if (!RHS.getNode()) {
5674 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5675 CC = ISD::SETNE;
5676 }
5677 }
5678
5679 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5680 // instruction.
5681 unsigned Opc = LHS.getOpcode();
5682 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5683 !Subtarget->isThumb1Only();
5684 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5685 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5686 Opc == ISD::USUBO || OptimizeMul) &&
5687 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5688 // Only lower legal XALUO ops.
5689 if (!isTypeLegal(LHS->getValueType(0)))
5690 return SDValue();
5691
5692 // The actual operation with overflow check.
5693 SDValue Value, OverflowCmp;
5694 SDValue ARMcc;
5695 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5696
5697 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5698 // Reverse the condition code.
5699 ARMCC::CondCodes CondCode =
5700 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5701 CondCode = ARMCC::getOppositeCondition(CondCode);
5702 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5703 }
5704
5705 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5706 OverflowCmp);
5707 }
5708
5709 if (LHS.getValueType() == MVT::i32) {
5710 SDValue ARMcc;
5711 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5712 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5713 }
5714
5715 if (getTargetMachine().Options.UnsafeFPMath &&
5716 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5717 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5718 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5719 return Result;
5720 }
5721
5722 ARMCC::CondCodes CondCode, CondCode2;
5723 FPCCToARMCC(CC, CondCode, CondCode2);
5724
5725 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5726 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5727 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5728 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5729 if (CondCode2 != ARMCC::AL) {
5730 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5731 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5732 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5733 }
5734 return Res;
5735}
5736
5737SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5738 SDValue Chain = Op.getOperand(0);
5739 SDValue Table = Op.getOperand(1);
5740 SDValue Index = Op.getOperand(2);
5741 SDLoc dl(Op);
5742
5743 EVT PTy = getPointerTy(DAG.getDataLayout());
5744 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5745 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5746 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5747 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5748 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5749 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5750 // Thumb2 and ARMv8-M use a two-level jump. That is, the branch jumps into
5751 // the jump table, which then does another jump to the destination. This also
5752 // makes it easier to translate it to TBB / TBH later (Thumb2 only).
5753 // FIXME: This might not work if the function is extremely large.
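 // For example, on Thumb-2 a dense jump table like this is typically emitted
 // later as a TBB/TBH instruction indexing a compact table of byte/halfword
 // offsets.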
5754 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5755 Addr, Op.getOperand(2), JTI);
5756 }
5757 if (isPositionIndependent() || Subtarget->isROPI()) {
5758 Addr =
5759 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5760 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5761 Chain = Addr.getValue(1);
5762 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5763 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5764 } else {
5765 Addr =
5766 DAG.getLoad(PTy, dl, Chain, Addr,
5767 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5768 Chain = Addr.getValue(1);
5769 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5770 }
5771}
5772
5773 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5774 EVT VT = Op.getValueType();
5775 SDLoc dl(Op);
5776
5777 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5778 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5779 return Op;
5780 return DAG.UnrollVectorOp(Op.getNode());
5781 }
5782
5783 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5784
5785 EVT NewTy;
5786 const EVT OpTy = Op.getOperand(0).getValueType();
5787 if (OpTy == MVT::v4f32)
5788 NewTy = MVT::v4i32;
5789 else if (OpTy == MVT::v4f16 && HasFullFP16)
5790 NewTy = MVT::v4i16;
5791 else if (OpTy == MVT::v8f16 && HasFullFP16)
5792 NewTy = MVT::v8i16;
5793 else
5794 llvm_unreachable("Invalid type for custom lowering!");
5795
5796 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5797 return DAG.UnrollVectorOp(Op.getNode());
5798
5799 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5800 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5801}
5802
5803SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5804 EVT VT = Op.getValueType();
5805 if (VT.isVector())
5806 return LowerVectorFP_TO_INT(Op, DAG);
5807
5808 bool IsStrict = Op->isStrictFPOpcode();
5809 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5810
5811 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5812 RTLIB::Libcall LC;
5813 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5814 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5815 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5816 Op.getValueType());
5817 else
5818 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5819 Op.getValueType());
5820 SDLoc Loc(Op);
5821 MakeLibCallOptions CallOptions;
5822 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5823 SDValue Result;
5824 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5825 CallOptions, Loc, Chain);
5826 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5827 }
5828
5829 // FIXME: Remove this when we have strict fp instruction selection patterns
5830 if (IsStrict) {
5831 SDLoc Loc(Op);
5832 SDValue Result =
5833 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5834 : ISD::FP_TO_UINT,
5835 Loc, Op.getValueType(), SrcVal);
5836 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5837 }
5838
5839 return Op;
5840}
5841
5842 static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5843 const ARMSubtarget *Subtarget) {
5844 EVT VT = Op.getValueType();
5845 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5846 EVT FromVT = Op.getOperand(0).getValueType();
5847
5848 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5849 return Op;
5850 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5851 Subtarget->hasFP64())
5852 return Op;
5853 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5854 Subtarget->hasFullFP16())
5855 return Op;
5856 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5857 Subtarget->hasMVEFloatOps())
5858 return Op;
5859 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5860 Subtarget->hasMVEFloatOps())
5861 return Op;
5862
5863 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5864 return SDValue();
5865
5866 SDLoc DL(Op);
5867 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5868 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5869 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5870 DAG.getValueType(VT.getScalarType()));
5871 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5872 DAG.getConstant((1 << BW) - 1, DL, VT));
5873 if (IsSigned)
5874 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5875 DAG.getSignedConstant(-(1 << BW), DL, VT));
5876 return Max;
5877}
5878
5879 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5880 EVT VT = Op.getValueType();
5881 SDLoc dl(Op);
5882
5883 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5884 if (VT.getVectorElementType() == MVT::f32)
5885 return Op;
5886 return DAG.UnrollVectorOp(Op.getNode());
5887 }
5888
5889 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5890 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5891 "Invalid type for custom lowering!");
5892
5893 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5894
5895 EVT DestVecType;
5896 if (VT == MVT::v4f32)
5897 DestVecType = MVT::v4i32;
5898 else if (VT == MVT::v4f16 && HasFullFP16)
5899 DestVecType = MVT::v4i16;
5900 else if (VT == MVT::v8f16 && HasFullFP16)
5901 DestVecType = MVT::v8i16;
5902 else
5903 return DAG.UnrollVectorOp(Op.getNode());
5904
5905 unsigned CastOpc;
5906 unsigned Opc;
5907 switch (Op.getOpcode()) {
5908 default: llvm_unreachable("Invalid opcode!");
5909 case ISD::SINT_TO_FP:
5910 CastOpc = ISD::SIGN_EXTEND;
5911 Opc = ISD::SINT_TO_FP;
5912 break;
5913 case ISD::UINT_TO_FP:
5914 CastOpc = ISD::ZERO_EXTEND;
5915 Opc = ISD::UINT_TO_FP;
5916 break;
5917 }
5918
5919 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5920 return DAG.getNode(Opc, dl, VT, Op);
5921}
5922
5923SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5924 EVT VT = Op.getValueType();
5925 if (VT.isVector())
5926 return LowerVectorINT_TO_FP(Op, DAG);
5927 if (isUnsupportedFloatingType(VT)) {
5928 RTLIB::Libcall LC;
5929 if (Op.getOpcode() == ISD::SINT_TO_FP)
5930 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5931 Op.getValueType());
5932 else
5933 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5934 Op.getValueType());
5935 MakeLibCallOptions CallOptions;
5936 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5937 CallOptions, SDLoc(Op)).first;
5938 }
5939
5940 return Op;
5941}
5942
5943SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5944 // Implement fcopysign with a fabs and a conditional fneg.
5945 SDValue Tmp0 = Op.getOperand(0);
5946 SDValue Tmp1 = Op.getOperand(1);
5947 SDLoc dl(Op);
5948 EVT VT = Op.getValueType();
5949 EVT SrcVT = Tmp1.getValueType();
5950 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5951 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5952 bool UseNEON = !InGPR && Subtarget->hasNEON();
5953
5954 if (UseNEON) {
5955 // Use VBSL to copy the sign bit.
5956 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5957 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5958 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5959 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5960 if (VT == MVT::f64)
5961 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5962 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5963 DAG.getConstant(32, dl, MVT::i32));
5964 else /*if (VT == MVT::f32)*/
5965 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5966 if (SrcVT == MVT::f32) {
5967 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5968 if (VT == MVT::f64)
5969 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5970 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5971 DAG.getConstant(32, dl, MVT::i32));
5972 } else if (VT == MVT::f32)
5973 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5974 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5975 DAG.getConstant(32, dl, MVT::i32));
5976 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5977 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5978
5979 SDValue AllOnes = DAG.getConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5980 dl, MVT::i32);
5981 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5982 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5983 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5984
5985 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5986 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5987 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5988 if (VT == MVT::f32) {
5989 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5990 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5991 DAG.getConstant(0, dl, MVT::i32));
5992 } else {
5993 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5994 }
5995
5996 return Res;
5997 }
5998
5999 // Bitcast operand 1 to i32.
6000 if (SrcVT == MVT::f64)
6001 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6002 Tmp1).getValue(1);
6003 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6004
6005 // Or in the signbit with integer operations.
6006 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6007 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6008 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6009 if (VT == MVT::f32) {
6010 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6011 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6012 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6013 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6014 }
6015
6016 // f64: Or the high part with signbit and then combine two parts.
6017 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6018 Tmp0);
6019 SDValue Lo = Tmp0.getValue(0);
6020 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6021 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6022 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6023}
6024
6025SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6026 MachineFunction &MF = DAG.getMachineFunction();
6027 MachineFrameInfo &MFI = MF.getFrameInfo();
6028 MFI.setReturnAddressIsTaken(true);
6029
6030 EVT VT = Op.getValueType();
6031 SDLoc dl(Op);
6032 unsigned Depth = Op.getConstantOperandVal(0);
6033 if (Depth) {
6034 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6035 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6036 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6037 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6038 MachinePointerInfo());
6039 }
6040
6041 // Return LR, which contains the return address. Mark it an implicit live-in.
6042 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6043 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6044}
6045
6046SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6047 const ARMBaseRegisterInfo &ARI =
6048 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6049 MachineFunction &MF = DAG.getMachineFunction();
6050 MachineFrameInfo &MFI = MF.getFrameInfo();
6051 MFI.setFrameAddressIsTaken(true);
6052
6053 EVT VT = Op.getValueType();
6054 SDLoc dl(Op); // FIXME probably not meaningful
6055 unsigned Depth = Op.getConstantOperandVal(0);
6056 Register FrameReg = ARI.getFrameRegister(MF);
6057 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6058 while (Depth--)
6059 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6060 MachinePointerInfo());
6061 return FrameAddr;
6062}
6063
6064// FIXME? Maybe this could be a TableGen attribute on some registers and
6065// this table could be generated automatically from RegInfo.
6066Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6067 const MachineFunction &MF) const {
6068 return StringSwitch<Register>(RegName)
6069 .Case("sp", ARM::SP)
6070 .Default(Register());
6071}
6072
6073// Result is 64 bit value so split into two 32 bit values and return as a
6074// pair of values.
6075 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6076 SelectionDAG &DAG) {
6077 SDLoc DL(N);
6078
6079 // This function is only supposed to be called for i64 type destination.
6080 assert(N->getValueType(0) == MVT::i64
6081 && "ExpandREAD_REGISTER called for non-i64 type result.");
6082
6083 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6084 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6085 N->getOperand(0),
6086 N->getOperand(1));
6087
6088 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6089 Read.getValue(1)));
6090 Results.push_back(Read.getValue(2)); // Chain
6091}
6092
6093/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6094/// When \p DstVT, the destination type of \p BC, is on the vector
6095/// register bank and the source of bitcast, \p Op, operates on the same bank,
6096/// it might be possible to combine them, such that everything stays on the
6097/// vector register bank.
6098 /// \return The node that would replace \p BC, if the combine
6099 /// is possible.
6100 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6101 SelectionDAG &DAG) {
6102 SDValue Op = BC->getOperand(0);
6103 EVT DstVT = BC->getValueType(0);
6104
6105 // The only vector instruction that can produce a scalar (remember,
6106 // since the bitcast was about to be turned into VMOVDRR, the source
6107 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6108 // Moreover, we can do this combine only if there is one use.
6109 // Finally, if the destination type is not a vector, there is not
6110 // much point in forcing everything onto the vector bank.
6111 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6112 !Op.hasOneUse())
6113 return SDValue();
6114
6115 // If the index is not constant, we will introduce an additional
6116 // multiply that will stick.
6117 // Give up in that case.
6118 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6119 if (!Index)
6120 return SDValue();
6121 unsigned DstNumElt = DstVT.getVectorNumElements();
6122
6123 // Compute the new index.
6124 const APInt &APIntIndex = Index->getAPIntValue();
6125 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6126 NewIndex *= APIntIndex;
6127 // Check if the new constant index fits into i32.
6128 if (NewIndex.getBitWidth() > 32)
6129 return SDValue();
6130
6131 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6132 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6133 SDLoc dl(Op);
6134 SDValue ExtractSrc = Op.getOperand(0);
6135 EVT VecVT = EVT::getVectorVT(
6136 *DAG.getContext(), DstVT.getScalarType(),
6137 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6138 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6139 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6140 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6141}
6142
6143/// ExpandBITCAST - If the target supports VFP, this function is called to
6144/// expand a bit convert where either the source or destination type is i64 to
6145/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6146/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6147/// vectors), since the legalizer won't know what to do with that.
6148SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6149 const ARMSubtarget *Subtarget) const {
6150 SDLoc dl(N);
6151 SDValue Op = N->getOperand(0);
6152
6153 // This function is only supposed to be called for i16 and i64 types, either
6154 // as the source or destination of the bit convert.
6155 EVT SrcVT = Op.getValueType();
6156 EVT DstVT = N->getValueType(0);
6157
6158 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6159 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6160 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6161 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6162
6163 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6164 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6165 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6166 Op = DAG.getBitcast(MVT::f16, Op);
6167 return DAG.getNode(
6168 ISD::TRUNCATE, SDLoc(N), DstVT,
6169 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6170 }
6171
6172 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6173 return SDValue();
6174
6175 // Turn i64->f64 into VMOVDRR.
6176 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6177 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6178 // if we can combine the bitcast with its source.
6179 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6180 return Val;
6181 SDValue Lo, Hi;
6182 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6183 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6184 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6185 }
6186
6187 // Turn f64->i64 into VMOVRRD.
6188 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6189 SDValue Cvt;
6190 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6191 SrcVT.getVectorNumElements() > 1)
6192 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6193 DAG.getVTList(MVT::i32, MVT::i32),
6194 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6195 else
6196 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6197 DAG.getVTList(MVT::i32, MVT::i32), Op);
6198 // Merge the pieces into a single i64 value.
6199 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6200 }
6201
6202 return SDValue();
6203}
6204
6205/// getZeroVector - Returns a vector of specified type with all zero elements.
6206/// Zero vectors are used to represent vector negation and in those cases
6207/// will be implemented with the NEON VNEG instruction. However, VNEG does
6208/// not support i64 elements, so sometimes the zero vectors will need to be
6209/// explicitly constructed. Regardless, use a canonical VMOV to create the
6210/// zero vector.
6211static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6212 assert(VT.isVector() && "Expected a vector type");
6213 // The canonical modified immediate encoding of a zero vector is....0!
6214 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6215 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6216 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6217 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6218}
6219
6220 /// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
6221 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
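/// For a shift amount Amt < 32 the low result is
/// (Lo >> Amt) | (Hi << (32 - Amt)) and the high result is Hi shifted by Amt;
/// for Amt >= 32 the low result is Hi shifted by (Amt - 32) and the high
/// result is the sign-fill (SRA) or zero (SRL). Both variants are computed
/// and CMOVs on (Amt - 32) >= 0 select between them.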
6222SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6223 SelectionDAG &DAG) const {
6224 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6225 EVT VT = Op.getValueType();
6226 unsigned VTBits = VT.getSizeInBits();
6227 SDLoc dl(Op);
6228 SDValue ShOpLo = Op.getOperand(0);
6229 SDValue ShOpHi = Op.getOperand(1);
6230 SDValue ShAmt = Op.getOperand(2);
6231 SDValue ARMcc;
6232 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6233
6234 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6235
6236 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6237 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6238 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6239 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6240 DAG.getConstant(VTBits, dl, MVT::i32));
6241 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6242 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6243 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6244 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6245 ISD::SETGE, ARMcc, DAG, dl);
6246 SDValue Lo =
6247 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6248
6249 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6250 SDValue HiBigShift = Opc == ISD::SRA
6251 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6252 DAG.getConstant(VTBits - 1, dl, VT))
6253 : DAG.getConstant(0, dl, VT);
6254 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6255 ISD::SETGE, ARMcc, DAG, dl);
6256 SDValue Hi =
6257 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6258
6259 SDValue Ops[2] = { Lo, Hi };
6260 return DAG.getMergeValues(Ops, dl);
6261}
6262
6263/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6264 /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6265SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6266 SelectionDAG &DAG) const {
6267 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6268 EVT VT = Op.getValueType();
6269 unsigned VTBits = VT.getSizeInBits();
6270 SDLoc dl(Op);
6271 SDValue ShOpLo = Op.getOperand(0);
6272 SDValue ShOpHi = Op.getOperand(1);
6273 SDValue ShAmt = Op.getOperand(2);
6274 SDValue ARMcc;
6275
6276 assert(Op.getOpcode() == ISD::SHL_PARTS);
6277 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6278 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6279 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6280 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6281 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6282
6283 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6284 DAG.getConstant(VTBits, dl, MVT::i32));
6285 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6286 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6287 ISD::SETGE, ARMcc, DAG, dl);
6288 SDValue Hi =
6289 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6290
6291 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6292 ISD::SETGE, ARMcc, DAG, dl);
6293 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6294 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6295 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6296
6297 SDValue Ops[2] = { Lo, Hi };
6298 return DAG.getMergeValues(Ops, dl);
6299}
6300
6301SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6302 SelectionDAG &DAG) const {
6303 // The rounding mode is in bits 23:22 of the FPSCR.
6304 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6305 // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
6306 // so that the shift + and get folded into a bitfield extract.
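 // For example, if FPSCR[23:22] == 0b01 (round towards plus infinity), then
 // ((FPSCR + (1 << 22)) >> 22) & 3 == 2, which is the FLT_ROUNDS encoding of
 // round towards plus infinity.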
6307 SDLoc dl(Op);
6308 SDValue Chain = Op.getOperand(0);
6309 SDValue Ops[] = {Chain,
6310 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6311
6312 SDValue FPSCR =
6313 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6314 Chain = FPSCR.getValue(1);
6315 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6316 DAG.getConstant(1U << 22, dl, MVT::i32));
6317 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6318 DAG.getConstant(22, dl, MVT::i32));
6319 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6320 DAG.getConstant(3, dl, MVT::i32));
6321 return DAG.getMergeValues({And, Chain}, dl);
6322}
6323
6324SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6325 SelectionDAG &DAG) const {
6326 SDLoc DL(Op);
6327 SDValue Chain = Op->getOperand(0);
6328 SDValue RMValue = Op->getOperand(1);
6329
6330 // The rounding mode is in bits 23:22 of the FPSCR.
6331 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6332 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6333 // (((arg - 1) & 3) << 22).
6334 //
6335 // It is expected that the argument of llvm.set.rounding is within the
6336 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6337 // responsibility of the code generated llvm.set.rounding to ensure this
6338 // condition.
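 // For example, llvm.set.rounding(0) (round towards zero) yields
 // ((0 - 1) & 3) << 22 == 3 << 22, i.e. FPSCR rounding mode 0b11 (RZ).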
6339
6340 // Calculate new value of FPSCR[23:22].
6341 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6342 DAG.getConstant(1, DL, MVT::i32));
6343 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6344 DAG.getConstant(0x3, DL, MVT::i32));
6345 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6346 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6347
6348 // Get current value of FPSCR.
6349 SDValue Ops[] = {Chain,
6350 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6351 SDValue FPSCR =
6352 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6353 Chain = FPSCR.getValue(1);
6354 FPSCR = FPSCR.getValue(0);
6355
6356 // Put new rounding mode into FPSCR[23:22].
6357 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6358 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6359 DAG.getConstant(RMMask, DL, MVT::i32));
6360 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6361 SDValue Ops2[] = {
6362 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6363 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6364}
6365
6366SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6367 SelectionDAG &DAG) const {
6368 SDLoc DL(Op);
6369 SDValue Chain = Op->getOperand(0);
6370 SDValue Mode = Op->getOperand(1);
6371
6372 // Generate nodes to build:
6373 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6374 SDValue Ops[] = {Chain,
6375 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6376 SDValue FPSCR =
6377 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6378 Chain = FPSCR.getValue(1);
6379 FPSCR = FPSCR.getValue(0);
6380
6381 SDValue FPSCRMasked =
6382 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6383 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6384 SDValue InputMasked =
6385 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6386 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6387 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6388
6389 SDValue Ops2[] = {
6390 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6391 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6392}
6393
6394SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6395 SelectionDAG &DAG) const {
6396 SDLoc DL(Op);
6397 SDValue Chain = Op->getOperand(0);
6398
6399 // To get the default FP mode all control bits are cleared:
6400 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6401 SDValue Ops[] = {Chain,
6402 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6403 SDValue FPSCR =
6404 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6405 Chain = FPSCR.getValue(1);
6406 FPSCR = FPSCR.getValue(0);
6407
6408 SDValue FPSCRMasked = DAG.getNode(
6409 ISD::AND, DL, MVT::i32, FPSCR,
6410 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6411 SDValue Ops2[] = {Chain,
6412 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6413 FPSCRMasked};
6414 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6415}
6416
6417 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6418 const ARMSubtarget *ST) {
6419 SDLoc dl(N);
6420 EVT VT = N->getValueType(0);
6421 if (VT.isVector() && ST->hasNEON()) {
6422
6423 // Compute the least significant set bit: LSB = X & -X
6424 SDValue X = N->getOperand(0);
6425 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6426 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
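 // For example, X = 0b01101000 gives LSB = 0b00001000, and
 // ctpop(LSB - 1) = ctpop(0b00000111) = 3 = cttz(X).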
6427
6428 EVT ElemTy = VT.getVectorElementType();
6429
6430 if (ElemTy == MVT::i8) {
6431 // Compute with: cttz(x) = ctpop(lsb - 1)
6432 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6433 DAG.getTargetConstant(1, dl, ElemTy));
6434 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6435 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6436 }
6437
6438 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6439 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6440 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6441 unsigned NumBits = ElemTy.getSizeInBits();
6442 SDValue WidthMinus1 =
6443 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6444 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6445 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6446 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6447 }
6448
6449 // Compute with: cttz(x) = ctpop(lsb - 1)
6450
6451 // Compute LSB - 1.
6452 SDValue Bits;
6453 if (ElemTy == MVT::i64) {
6454 // Load constant 0xffff'ffff'ffff'ffff to register.
6455 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6456 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6457 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6458 } else {
6459 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6460 DAG.getTargetConstant(1, dl, ElemTy));
6461 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6462 }
6463 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6464 }
6465
6466 if (!ST->hasV6T2Ops())
6467 return SDValue();
6468
6469 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6470 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6471}
6472
6473 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6474 const ARMSubtarget *ST) {
6475 EVT VT = N->getValueType(0);
6476 SDLoc DL(N);
6477
6478 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6479 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6480 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6481 "Unexpected type for custom ctpop lowering");
6482
6483 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6484 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6485 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6486 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6487
6488 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
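 // For example, for VT == v4i32 the v16i8 ctpop result is pairwise-added to
 // v8i16 and then to v4i32, i.e. two vpaddl.u steps.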
6489 unsigned EltSize = 8;
6490 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6491 while (EltSize != VT.getScalarSizeInBits()) {
6492 SmallVector<SDValue, 8> Ops;
6493 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6494 TLI.getPointerTy(DAG.getDataLayout())));
6495 Ops.push_back(Res);
6496
6497 EltSize *= 2;
6498 NumElts /= 2;
6499 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6500 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6501 }
6502
6503 return Res;
6504}
6505
6506 /// getVShiftImm - Check if this is a valid build_vector for the immediate
6507/// operand of a vector shift operation, where all the elements of the
6508/// build_vector must have the same constant integer value.
6509static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6510 // Ignore bit_converts.
6511 while (Op.getOpcode() == ISD::BITCAST)
6512 Op = Op.getOperand(0);
6513 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6514 APInt SplatBits, SplatUndef;
6515 unsigned SplatBitSize;
6516 bool HasAnyUndefs;
6517 if (!BVN ||
6518 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6519 ElementBits) ||
6520 SplatBitSize > ElementBits)
6521 return false;
6522 Cnt = SplatBits.getSExtValue();
6523 return true;
6524}
6525
6526/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6527/// operand of a vector shift left operation. That value must be in the range:
6528/// 0 <= Value < ElementBits for a left shift; or
6529/// 0 <= Value <= ElementBits for a long left shift.
6530static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6531 assert(VT.isVector() && "vector shift count is not a vector type");
6532 int64_t ElementBits = VT.getScalarSizeInBits();
6533 if (!getVShiftImm(Op, ElementBits, Cnt))
6534 return false;
6535 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6536}
6537
6538/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6539/// operand of a vector shift right operation. For a shift opcode, the value
6540/// is positive, but for an intrinsic the value count must be negative. The
6541/// absolute value must be in the range:
6542/// 1 <= |Value| <= ElementBits for a right shift; or
6543/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6544static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6545 int64_t &Cnt) {
6546 assert(VT.isVector() && "vector shift count is not a vector type");
6547 int64_t ElementBits = VT.getScalarSizeInBits();
6548 if (!getVShiftImm(Op, ElementBits, Cnt))
6549 return false;
6550 if (!isIntrinsic)
6551 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6552 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6553 Cnt = -Cnt;
6554 return true;
6555 }
6556 return false;
6557}
6558
6559 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6560 const ARMSubtarget *ST) {
6561 EVT VT = N->getValueType(0);
6562 SDLoc dl(N);
6563 int64_t Cnt;
6564
6565 if (!VT.isVector())
6566 return SDValue();
6567
6568 // We essentially have two forms here. Shift by an immediate and shift by a
6569 // vector register (there are also shifts by a GPR, but those are just handled
6570 // with a tablegen pattern). We cannot easily match shift by an immediate in
6571 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6572 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6573 // signed or unsigned, and a negative shift indicates a shift right).
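 // For example, a v4i32 shift right by a constant splat of 3 becomes a
 // VSHRsIMM/VSHRuIMM with Cnt = 3, whereas a shift right by a non-constant
 // vector amount is emitted as a VSHLs/VSHLu by the negated amount.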
6574 if (N->getOpcode() == ISD::SHL) {
6575 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6576 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6577 DAG.getConstant(Cnt, dl, MVT::i32));
6578 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6579 N->getOperand(1));
6580 }
6581
6582 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6583 "unexpected vector shift opcode");
6584
6585 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6586 unsigned VShiftOpc =
6587 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6588 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6589 DAG.getConstant(Cnt, dl, MVT::i32));
6590 }
6591
6592 // Other right shifts we don't have operations for (we use a shift left by a
6593 // negative number).
6594 EVT ShiftVT = N->getOperand(1).getValueType();
6595 SDValue NegatedCount = DAG.getNode(
6596 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6597 unsigned VShiftOpc =
6598 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6599 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6600}
6601
6602 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6603 const ARMSubtarget *ST) {
6604 EVT VT = N->getValueType(0);
6605 SDLoc dl(N);
6606
6607 // We can get here for a node like i32 = ISD::SHL i32, i64
6608 if (VT != MVT::i64)
6609 return SDValue();
6610
6611 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6612 N->getOpcode() == ISD::SHL) &&
6613 "Unknown shift to lower!");
6614
6615 unsigned ShOpc = N->getOpcode();
6616 if (ST->hasMVEIntegerOps()) {
6617 SDValue ShAmt = N->getOperand(1);
6618 unsigned ShPartsOpc = ARMISD::LSLL;
6619 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6620
6621 // If the shift amount is zero or at least 32, or a non-constant amount is
6622 // wider than 64 bits, fall back to the default expansion.
6623 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6624 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6625 return SDValue();
6626
6627 // Extract the lower 32 bits of the shift amount if it's not an i32
6628 if (ShAmt->getValueType(0) != MVT::i32)
6629 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6630
6631 if (ShOpc == ISD::SRL) {
6632 if (!Con)
6633 // There is no t2LSRLr instruction, so if the shift amount is in a register,
6634 // negate it and perform an LSLL instead, emulating a right shift.
6635 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6636 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6637 else
6638 // Else generate an lsrl on the immediate shift amount
6639 ShPartsOpc = ARMISD::LSRL;
6640 } else if (ShOpc == ISD::SRA)
6641 ShPartsOpc = ARMISD::ASRL;
6642
6643 // Split Lower/Upper 32 bits of the destination/source
6644 SDValue Lo, Hi;
6645 std::tie(Lo, Hi) =
6646 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6647 // Generate the shift operation as computed above
6648 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6649 ShAmt);
6650 // The upper 32 bits come from the second return value of lsll
6651 Hi = SDValue(Lo.getNode(), 1);
6652 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6653 }
6654
6655 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6656 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6657 return SDValue();
6658
6659 // If we are in thumb mode, we don't have RRX.
6660 if (ST->isThumb1Only())
6661 return SDValue();
6662
6663 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
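 // For example, an i64 logical shift right by one becomes, roughly:
 // lsrs r1, r1, #1 ; rrx r0, r0 -- the bit shifted out of the high word is
 // rotated into the top of the low word via the carry flag.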
6664 SDValue Lo, Hi;
6665 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6666
6667 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6668 // captures the shifted out bit into a carry flag.
6669 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6670 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6671
6672 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6673 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6674
6675 // Merge the pieces into a single i64 value.
6676 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6677}
6678
6679 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6680 const ARMSubtarget *ST) {
6681 bool Invert = false;
6682 bool Swap = false;
6683 unsigned Opc = ARMCC::AL;
6684
6685 SDValue Op0 = Op.getOperand(0);
6686 SDValue Op1 = Op.getOperand(1);
6687 SDValue CC = Op.getOperand(2);
6688 EVT VT = Op.getValueType();
6689 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6690 SDLoc dl(Op);
6691
6692 EVT CmpVT;
6693 if (ST->hasNEON())
6694 CmpVT = VT.changeVectorElementTypeToInteger();
6695 else {
6696 assert(ST->hasMVEIntegerOps() &&
6697 "No hardware support for integer vector comparison!");
6698
6699 if (Op.getValueType().getVectorElementType() != MVT::i1)
6700 return SDValue();
6701
6702 // Make sure we expand floating point setcc to scalar if we do not have
6703 // mve.fp, so that we can handle them from there.
6704 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6705 return SDValue();
6706
6707 CmpVT = VT;
6708 }
6709
6710 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6711 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6712 // Special-case integer 64-bit equality comparisons. They aren't legal,
6713 // but they can be lowered with a few vector instructions.
6714 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6715 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6716 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6717 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6718 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6719 DAG.getCondCode(ISD::SETEQ));
6720 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6721 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6722 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6723 if (SetCCOpcode == ISD::SETNE)
6724 Merged = DAG.getNOT(dl, Merged, CmpVT);
6725 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6726 return Merged;
6727 }
6728
6729 if (CmpVT.getVectorElementType() == MVT::i64)
6730 // 64-bit comparisons are not legal in general.
6731 return SDValue();
6732
6733 if (Op1.getValueType().isFloatingPoint()) {
6734 switch (SetCCOpcode) {
6735 default: llvm_unreachable("Illegal FP comparison");
6736 case ISD::SETUNE:
6737 case ISD::SETNE:
6738 if (ST->hasMVEFloatOps()) {
6739 Opc = ARMCC::NE; break;
6740 } else {
6741 Invert = true; [[fallthrough]];
6742 }
6743 case ISD::SETOEQ:
6744 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6745 case ISD::SETOLT:
6746 case ISD::SETLT: Swap = true; [[fallthrough]];
6747 case ISD::SETOGT:
6748 case ISD::SETGT: Opc = ARMCC::GT; break;
6749 case ISD::SETOLE:
6750 case ISD::SETLE: Swap = true; [[fallthrough]];
6751 case ISD::SETOGE:
6752 case ISD::SETGE: Opc = ARMCC::GE; break;
6753 case ISD::SETUGE: Swap = true; [[fallthrough]];
6754 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6755 case ISD::SETUGT: Swap = true; [[fallthrough]];
6756 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6757 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6758 case ISD::SETONE: {
6759 // Expand this to (OLT | OGT).
6760 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6761 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6762 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6763 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6764 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6765 if (Invert)
6766 Result = DAG.getNOT(dl, Result, VT);
6767 return Result;
6768 }
6769 case ISD::SETUO: Invert = true; [[fallthrough]];
6770 case ISD::SETO: {
6771 // Expand this to (OLT | OGE).
6772 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6773 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6774 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6775 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6776 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6777 if (Invert)
6778 Result = DAG.getNOT(dl, Result, VT);
6779 return Result;
6780 }
6781 }
6782 } else {
6783 // Integer comparisons.
6784 switch (SetCCOpcode) {
6785 default: llvm_unreachable("Illegal integer comparison");
6786 case ISD::SETNE:
6787 if (ST->hasMVEIntegerOps()) {
6788 Opc = ARMCC::NE; break;
6789 } else {
6790 Invert = true; [[fallthrough]];
6791 }
6792 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6793 case ISD::SETLT: Swap = true; [[fallthrough]];
6794 case ISD::SETGT: Opc = ARMCC::GT; break;
6795 case ISD::SETLE: Swap = true; [[fallthrough]];
6796 case ISD::SETGE: Opc = ARMCC::GE; break;
6797 case ISD::SETULT: Swap = true; [[fallthrough]];
6798 case ISD::SETUGT: Opc = ARMCC::HI; break;
6799 case ISD::SETULE: Swap = true; [[fallthrough]];
6800 case ISD::SETUGE: Opc = ARMCC::HS; break;
6801 }
6802
6803 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6804 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6805 SDValue AndOp;
6806 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6807 AndOp = Op0;
6808 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6809 AndOp = Op1;
6810
6811 // Ignore bitconvert.
6812 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6813 AndOp = AndOp.getOperand(0);
6814
6815 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6816 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6817 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6818 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6819 if (!Invert)
6820 Result = DAG.getNOT(dl, Result, VT);
6821 return Result;
6822 }
6823 }
6824 }
6825
6826 if (Swap)
6827 std::swap(Op0, Op1);
6828
6829 // If one of the operands is a constant vector zero, attempt to fold the
6830 // comparison to a specialized compare-against-zero form.
6831 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6832 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6833 Opc == ARMCC::NE)) {
6834 if (Opc == ARMCC::GE)
6835 Opc = ARMCC::LE;
6836 else if (Opc == ARMCC::GT)
6837 Opc = ARMCC::LT;
6838 std::swap(Op0, Op1);
6839 }
6840
6841 SDValue Result;
6842 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6843 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6844 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6845 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6846 DAG.getConstant(Opc, dl, MVT::i32));
6847 else
6848 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6849 DAG.getConstant(Opc, dl, MVT::i32));
6850
6851 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6852
6853 if (Invert)
6854 Result = DAG.getNOT(dl, Result, VT);
6855
6856 return Result;
6857}
6858
6859 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6860 SDValue LHS = Op.getOperand(0);
6861 SDValue RHS = Op.getOperand(1);
6862 SDValue Carry = Op.getOperand(2);
6863 SDValue Cond = Op.getOperand(3);
6864 SDLoc DL(Op);
6865
6866 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6867
6868 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6869 // have to invert the carry first.
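 // For example, a borrow-in of 1 becomes a carry-in of 0, which is what the
 // SBC-style subtraction performed by ARMISD::SUBE expects.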
6870 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6871 DAG.getConstant(1, DL, MVT::i32), Carry);
6872 // This converts the boolean value carry into the carry flag.
6873 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6874
6875 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6876 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6877
6878 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6879 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6880 SDValue ARMcc = DAG.getConstant(
6881 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6882 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6883 Cmp.getValue(1));
6884}
6885
6886/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6887/// valid vector constant for a NEON or MVE instruction with a "modified
6888/// immediate" operand (e.g., VMOV). If so, return the encoded value.
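/// For example, a v4i32 splat of 0x00ff0000 is encodable: it matches the
/// "Value = 0x00nn0000: Op=x, Cmode=010x" case below with Imm = 0xff.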
6889static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6890 unsigned SplatBitSize, SelectionDAG &DAG,
6891 const SDLoc &dl, EVT &VT, EVT VectorVT,
6892 VMOVModImmType type) {
6893 unsigned OpCmode, Imm;
6894 bool is128Bits = VectorVT.is128BitVector();
6895
6896 // SplatBitSize is set to the smallest size that splats the vector, so a
6897 // zero vector will always have SplatBitSize == 8. However, NEON modified
6898 // immediate instructions other than VMOV do not support the 8-bit encoding
6899 // of a zero vector, and the default encoding of zero is supposed to be the
6900 // 32-bit version.
6901 if (SplatBits == 0)
6902 SplatBitSize = 32;
6903
6904 switch (SplatBitSize) {
6905 case 8:
6906 if (type != VMOVModImm)
6907 return SDValue();
6908 // Any 1-byte value is OK. Op=0, Cmode=1110.
6909 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6910 OpCmode = 0xe;
6911 Imm = SplatBits;
6912 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6913 break;
6914
6915 case 16:
6916 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6917 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6918 if ((SplatBits & ~0xff) == 0) {
6919 // Value = 0x00nn: Op=x, Cmode=100x.
6920 OpCmode = 0x8;
6921 Imm = SplatBits;
6922 break;
6923 }
6924 if ((SplatBits & ~0xff00) == 0) {
6925 // Value = 0xnn00: Op=x, Cmode=101x.
6926 OpCmode = 0xa;
6927 Imm = SplatBits >> 8;
6928 break;
6929 }
6930 return SDValue();
6931
6932 case 32:
6933 // NEON's 32-bit VMOV supports splat values where:
6934 // * only one byte is nonzero, or
6935 // * the least significant byte is 0xff and the second byte is nonzero, or
6936 // * the least significant 2 bytes are 0xff and the third is nonzero.
6937 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6938 if ((SplatBits & ~0xff) == 0) {
6939 // Value = 0x000000nn: Op=x, Cmode=000x.
6940 OpCmode = 0;
6941 Imm = SplatBits;
6942 break;
6943 }
6944 if ((SplatBits & ~0xff00) == 0) {
6945 // Value = 0x0000nn00: Op=x, Cmode=001x.
6946 OpCmode = 0x2;
6947 Imm = SplatBits >> 8;
6948 break;
6949 }
6950 if ((SplatBits & ~0xff0000) == 0) {
6951 // Value = 0x00nn0000: Op=x, Cmode=010x.
6952 OpCmode = 0x4;
6953 Imm = SplatBits >> 16;
6954 break;
6955 }
6956 if ((SplatBits & ~0xff000000) == 0) {
6957 // Value = 0xnn000000: Op=x, Cmode=011x.
6958 OpCmode = 0x6;
6959 Imm = SplatBits >> 24;
6960 break;
6961 }
6962
6963 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6964 if (type == OtherModImm) return SDValue();
6965
6966 if ((SplatBits & ~0xffff) == 0 &&
6967 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6968 // Value = 0x0000nnff: Op=x, Cmode=1100.
6969 OpCmode = 0xc;
6970 Imm = SplatBits >> 8;
6971 break;
6972 }
6973
6974 // cmode == 0b1101 is not supported for MVE VMVN
6975 if (type == MVEVMVNModImm)
6976 return SDValue();
6977
6978 if ((SplatBits & ~0xffffff) == 0 &&
6979 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6980 // Value = 0x00nnffff: Op=x, Cmode=1101.
6981 OpCmode = 0xd;
6982 Imm = SplatBits >> 16;
6983 break;
6984 }
6985
6986 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6987 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6988 // VMOV.I32. A (very) minor optimization would be to replicate the value
6989 // and fall through here to test for a valid 64-bit splat. But, then the
6990 // caller would also need to check and handle the change in size.
6991 return SDValue();
6992
6993 case 64: {
6994 if (type != VMOVModImm)
6995 return SDValue();
6996 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6997 uint64_t BitMask = 0xff;
6998 unsigned ImmMask = 1;
6999 Imm = 0;
7000 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7001 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7002 Imm |= ImmMask;
7003 } else if ((SplatBits & BitMask) != 0) {
7004 return SDValue();
7005 }
7006 BitMask <<= 8;
7007 ImmMask <<= 1;
7008 }
7009
7010 // Op=1, Cmode=1110.
7011 OpCmode = 0x1e;
7012 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7013 break;
7014 }
7015
7016 default:
7017 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7018 }
7019
7020 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7021 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7022}
7023
7024SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7025 const ARMSubtarget *ST) const {
7026 EVT VT = Op.getValueType();
7027 bool IsDouble = (VT == MVT::f64);
7028 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7029 const APFloat &FPVal = CFP->getValueAPF();
7030
7031 // Prevent floating-point constants from using literal loads
7032 // when execute-only is enabled.
7033 if (ST->genExecuteOnly()) {
7034 // We shouldn't trigger this for v6m execute-only
7035 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7036 "Unexpected architecture");
7037
7038 // If we can represent the constant as an immediate, don't lower it
7039 if (isFPImmLegal(FPVal, VT))
7040 return Op;
7041 // Otherwise, construct as integer, and move to float register
7042 APInt INTVal = FPVal.bitcastToAPInt();
7043 SDLoc DL(CFP);
7044 switch (VT.getSimpleVT().SimpleTy) {
7045 default:
7046 llvm_unreachable("Unknown floating point type!");
7047 break;
7048 case MVT::f64: {
7049 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7050 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7051 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7052 }
7053 case MVT::f32:
7054 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7055 DAG.getConstant(INTVal, DL, MVT::i32));
7056 }
7057 }
7058
7059 if (!ST->hasVFP3Base())
7060 return SDValue();
7061
7062 // Use the default (constant pool) lowering for double constants when we have
7063 // an SP-only FPU
7064 if (IsDouble && !Subtarget->hasFP64())
7065 return SDValue();
7066
7067 // Try splatting with a VMOV.f32...
7068 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7069
7070 if (ImmVal != -1) {
7071 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7072 // We have code in place to select a valid ConstantFP already, no need to
7073 // do any mangling.
7074 return Op;
7075 }
7076
7077 // It's a float and we are trying to use NEON operations where
7078 // possible. Lower it to a splat followed by an extract.
7079 SDLoc DL(Op);
7080 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7081 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7082 NewVal);
7083 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7084 DAG.getConstant(0, DL, MVT::i32));
7085 }
7086
7087 // The rest of our options are NEON-only, so make sure that's allowed before
7088 // proceeding.
7089 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7090 return SDValue();
7091
7092 EVT VMovVT;
7093 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7094
7095 // It wouldn't really be worth bothering for doubles except for one very
7096 // important value, which does happen to match: 0.0. So make sure we don't do
7097 // anything stupid.
7098 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7099 return SDValue();
7100
7101 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7102 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7103 VMovVT, VT, VMOVModImm);
7104 if (NewVal != SDValue()) {
7105 SDLoc DL(Op);
7106 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7107 NewVal);
7108 if (IsDouble)
7109 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7110
7111 // It's a float: cast and extract a vector element.
7112 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7113 VecConstant);
7114 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7115 DAG.getConstant(0, DL, MVT::i32));
7116 }
7117
7118 // Finally, try a VMVN.i32
7119 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7120 VT, VMVNModImm);
7121 if (NewVal != SDValue()) {
7122 SDLoc DL(Op);
7123 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7124
7125 if (IsDouble)
7126 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7127
7128 // It's a float: cast and extract a vector element.
7129 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7130 VecConstant);
7131 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7132 DAG.getConstant(0, DL, MVT::i32));
7133 }
7134
7135 return SDValue();
7136}
7137
7138 // Check whether a VEXT instruction can handle the shuffle mask when the
7139 // vector sources of the shuffle are the same.
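// For example, with a single v4i32 source the mask <2, 3, 0, 1> is accepted
// (wrapping around the end of the vector) with Imm = 2.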
7140static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7141 unsigned NumElts = VT.getVectorNumElements();
7142
7143 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7144 if (M[0] < 0)
7145 return false;
7146
7147 Imm = M[0];
7148
7149 // If this is a VEXT shuffle, the immediate value is the index of the first
7150 // element. The other shuffle indices must be the successive elements after
7151 // the first one.
7152 unsigned ExpectedElt = Imm;
7153 for (unsigned i = 1; i < NumElts; ++i) {
7154 // Increment the expected index. If it wraps around, just follow it
7155 // back to index zero and keep going.
7156 ++ExpectedElt;
7157 if (ExpectedElt == NumElts)
7158 ExpectedElt = 0;
7159
7160 if (M[i] < 0) continue; // ignore UNDEF indices
7161 if (ExpectedElt != static_cast<unsigned>(M[i]))
7162 return false;
7163 }
7164
7165 return true;
7166}
7167
7168static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7169 bool &ReverseVEXT, unsigned &Imm) {
7170 unsigned NumElts = VT.getVectorNumElements();
7171 ReverseVEXT = false;
7172
7173 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7174 if (M[0] < 0)
7175 return false;
7176
7177 Imm = M[0];
7178
7179 // If this is a VEXT shuffle, the immediate value is the index of the first
7180 // element. The other shuffle indices must be the successive elements after
7181 // the first one.
7182 unsigned ExpectedElt = Imm;
7183 for (unsigned i = 1; i < NumElts; ++i) {
7184 // Increment the expected index. If it wraps around, it may still be
7185 // a VEXT but the source vectors must be swapped.
7186 ExpectedElt += 1;
7187 if (ExpectedElt == NumElts * 2) {
7188 ExpectedElt = 0;
7189 ReverseVEXT = true;
7190 }
7191
7192 if (M[i] < 0) continue; // ignore UNDEF indices
7193 if (ExpectedElt != static_cast<unsigned>(M[i]))
7194 return false;
7195 }
7196
7197 // Adjust the index value if the source operands will be swapped.
7198 if (ReverseVEXT)
7199 Imm -= NumElts;
7200
7201 return true;
7202}
7203
7204static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7205 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7206 // range, then 0 is placed into the resulting vector. So pretty much any mask
7207 // of 8 elements can work here.
7208 return VT == MVT::v8i8 && M.size() == 8;
7209}
7210
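// Helper for the two-result shuffle mask checks below: returns which of the
// two results (0 or 1) the block of the mask starting at Index corresponds to.
// For a double-length mask this is simply the half that Index falls in;
// otherwise it is inferred from the mask element at Index.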
7211static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7212 unsigned Index) {
7213 if (Mask.size() == Elements * 2)
7214 return Index / Elements;
7215 return Mask[Index] == 0 ? 0 : 1;
7216}
7217
7218// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7219// checking that pairs of elements in the shuffle mask represent the same index
7220// in each vector, incrementing the expected index by 2 at each step.
7221// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7222// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7223// v2={e,f,g,h}
7224// WhichResult gives the offset for each element in the mask based on which
7225// of the two results it belongs to.
7226//
7227// The transpose can be represented either as:
7228// result1 = shufflevector v1, v2, result1_shuffle_mask
7229// result2 = shufflevector v1, v2, result2_shuffle_mask
7230// where v1/v2 and the shuffle masks have the same number of elements
7231// (here WhichResult (see below) indicates which result is being checked)
7232//
7233// or as:
7234// results = shufflevector v1, v2, shuffle_mask
7235// where both results are returned in one vector and the shuffle mask has twice
7236 // as many elements as v1/v2 (here WhichResult will always be 0 if true); here
7237 // we want to check the low half and high half of the shuffle mask as if it
7238 // were the other case.
7239static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7240 unsigned EltSz = VT.getScalarSizeInBits();
7241 if (EltSz == 64)
7242 return false;
7243
7244 unsigned NumElts = VT.getVectorNumElements();
7245 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7246 return false;
7247
7248 // If the mask is twice as long as the input vector then we need to check the
7249 // upper and lower parts of the mask with a matching value for WhichResult
7250 // FIXME: A mask with only even values will be rejected in case the first
7251 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7252 // M[0] is used to determine WhichResult
7253 for (unsigned i = 0; i < M.size(); i += NumElts) {
7254 WhichResult = SelectPairHalf(NumElts, M, i);
7255 for (unsigned j = 0; j < NumElts; j += 2) {
7256 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7257 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7258 return false;
7259 }
7260 }
7261
7262 if (M.size() == NumElts*2)
7263 WhichResult = 0;
7264
7265 return true;
7266}
7267
7268/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7269/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7270/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7271static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7272 unsigned EltSz = VT.getScalarSizeInBits();
7273 if (EltSz == 64)
7274 return false;
7275
7276 unsigned NumElts = VT.getVectorNumElements();
7277 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7278 return false;
7279
7280 for (unsigned i = 0; i < M.size(); i += NumElts) {
7281 WhichResult = SelectPairHalf(NumElts, M, i);
7282 for (unsigned j = 0; j < NumElts; j += 2) {
7283 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7284 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7285 return false;
7286 }
7287 }
7288
7289 if (M.size() == NumElts*2)
7290 WhichResult = 0;
7291
7292 return true;
7293}
7294
7295// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7296// that the mask elements are either all even and in steps of size 2 or all odd
7297// and in steps of size 2.
7298// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7299// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7300// v2={e,f,g,h}
7301 // Requires similar checks to those of isVTRNMask with
7302 // respect to how the results are returned.
7303static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7304 unsigned EltSz = VT.getScalarSizeInBits();
7305 if (EltSz == 64)
7306 return false;
7307
7308 unsigned NumElts = VT.getVectorNumElements();
7309 if (M.size() != NumElts && M.size() != NumElts*2)
7310 return false;
7311
7312 for (unsigned i = 0; i < M.size(); i += NumElts) {
7313 WhichResult = SelectPairHalf(NumElts, M, i);
7314 for (unsigned j = 0; j < NumElts; ++j) {
7315 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7316 return false;
7317 }
7318 }
7319
7320 if (M.size() == NumElts*2)
7321 WhichResult = 0;
7322
7323 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7324 if (VT.is64BitVector() && EltSz == 32)
7325 return false;
7326
7327 return true;
7328}
7329
7330/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7331/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7332 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7333static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7334 unsigned EltSz = VT.getScalarSizeInBits();
7335 if (EltSz == 64)
7336 return false;
7337
7338 unsigned NumElts = VT.getVectorNumElements();
7339 if (M.size() != NumElts && M.size() != NumElts*2)
7340 return false;
7341
7342 unsigned Half = NumElts / 2;
7343 for (unsigned i = 0; i < M.size(); i += NumElts) {
7344 WhichResult = SelectPairHalf(NumElts, M, i);
7345 for (unsigned j = 0; j < NumElts; j += Half) {
7346 unsigned Idx = WhichResult;
7347 for (unsigned k = 0; k < Half; ++k) {
7348 int MIdx = M[i + j + k];
7349 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7350 return false;
7351 Idx += 2;
7352 }
7353 }
7354 }
7355
7356 if (M.size() == NumElts*2)
7357 WhichResult = 0;
7358
7359 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7360 if (VT.is64BitVector() && EltSz == 32)
7361 return false;
7362
7363 return true;
7364}
7365
7366// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7367// that pairs of elements of the shufflemask represent the same index in each
7368// vector incrementing sequentially through the vectors.
7369// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7370// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7371// v2={e,f,g,h}
7372 // Requires similar checks to those of isVTRNMask with respect to how the
7373 // results are returned.
7374static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7375 unsigned EltSz = VT.getScalarSizeInBits();
7376 if (EltSz == 64)
7377 return false;
7378
7379 unsigned NumElts = VT.getVectorNumElements();
7380 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7381 return false;
7382
7383 for (unsigned i = 0; i < M.size(); i += NumElts) {
7384 WhichResult = SelectPairHalf(NumElts, M, i);
7385 unsigned Idx = WhichResult * NumElts / 2;
7386 for (unsigned j = 0; j < NumElts; j += 2) {
7387 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7388 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7389 return false;
7390 Idx += 1;
7391 }
7392 }
7393
7394 if (M.size() == NumElts*2)
7395 WhichResult = 0;
7396
7397 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7398 if (VT.is64BitVector() && EltSz == 32)
7399 return false;
7400
7401 return true;
7402}
7403
7404/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7405/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7406/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7407static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7408 unsigned EltSz = VT.getScalarSizeInBits();
7409 if (EltSz == 64)
7410 return false;
7411
7412 unsigned NumElts = VT.getVectorNumElements();
7413 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7414 return false;
7415
7416 for (unsigned i = 0; i < M.size(); i += NumElts) {
7417 WhichResult = SelectPairHalf(NumElts, M, i);
7418 unsigned Idx = WhichResult * NumElts / 2;
7419 for (unsigned j = 0; j < NumElts; j += 2) {
7420 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7421 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7422 return false;
7423 Idx += 1;
7424 }
7425 }
7426
7427 if (M.size() == NumElts*2)
7428 WhichResult = 0;
7429
7430 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7431 if (VT.is64BitVector() && EltSz == 32)
7432 return false;
7433
7434 return true;
7435}
7436
7437/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7438/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7439static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7440 unsigned &WhichResult,
7441 bool &isV_UNDEF) {
7442 isV_UNDEF = false;
7443 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7444 return ARMISD::VTRN;
7445 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7446 return ARMISD::VUZP;
7447 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7448 return ARMISD::VZIP;
7449
7450 isV_UNDEF = true;
7451 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7452 return ARMISD::VTRN;
7453 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7454 return ARMISD::VUZP;
7455 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7456 return ARMISD::VZIP;
7457
7458 return 0;
7459}
7460
7461 /// \return true if this is a reverse operation on a vector.
7462static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7463 unsigned NumElts = VT.getVectorNumElements();
7464 // Make sure the mask has the right size.
7465 if (NumElts != M.size())
7466 return false;
7467
7468 // Look for <15, ..., 3, -1, 1, 0>.
7469 for (unsigned i = 0; i != NumElts; ++i)
7470 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7471 return false;
7472
7473 return true;
7474}
7475
7476static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7477 unsigned NumElts = VT.getVectorNumElements();
7478 // Make sure the mask has the right size.
7479 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7480 return false;
7481
7482 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7483 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7484 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7485 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7486 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7487 int Ofs = Top ? 1 : 0;
7488 int Upper = SingleSource ? 0 : NumElts;
7489 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7490 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7491 return false;
7492 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7493 return false;
7494 }
7495 return true;
7496}
7497
7498static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7499 unsigned NumElts = VT.getVectorNumElements();
7500 // Make sure the mask has the right size.
7501 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7502 return false;
7503
7504 // If Top
7505 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7506 // This inserts Input2 into Input1
7507 // else if not Top
7508 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7509 // This inserts Input1 into Input2
7510 unsigned Offset = Top ? 0 : 1;
7511 unsigned N = SingleSource ? 0 : NumElts;
7512 for (unsigned i = 0; i < NumElts; i += 2) {
7513 if (M[i] >= 0 && M[i] != (int)i)
7514 return false;
7515 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7516 return false;
7517 }
7518
7519 return true;
7520}
7521
7522static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7523 unsigned NumElts = ToVT.getVectorNumElements();
7524 if (NumElts != M.size())
7525 return false;
7526
7527 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7528 // looking for patterns of:
7529 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7530 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7531
7532 unsigned Off0 = rev ? NumElts / 2 : 0;
7533 unsigned Off1 = rev ? 0 : NumElts / 2;
7534 for (unsigned i = 0; i < NumElts; i += 2) {
7535 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7536 return false;
7537 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7538 return false;
7539 }
7540
7541 return true;
7542}
7543
7544// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7545// from a pair of inputs. For example:
7546// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7547// FP_ROUND(EXTRACT_ELT(Y, 0),
7548// FP_ROUND(EXTRACT_ELT(X, 1),
7549// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7550 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7551 const ARMSubtarget *ST) {
7552 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7553 if (!ST->hasMVEFloatOps())
7554 return SDValue();
7555
7556 SDLoc dl(BV);
7557 EVT VT = BV.getValueType();
7558 if (VT != MVT::v8f16)
7559 return SDValue();
7560
7561 // We are looking for a buildvector of fptrunc elements, where all the
7562 // elements are interleavingly extracted from two sources. Check the first two
7563 // items are valid enough and extract some info from them (they are checked
7564 // properly in the loop below).
7565 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7566 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7567 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7568 return SDValue();
7569 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7570 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7571 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7572 return SDValue();
7573 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7574 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7575 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7576 return SDValue();
7577
7578 // Check all the values in the BuildVector line up with our expectations.
7579 for (unsigned i = 1; i < 4; i++) {
7580 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7581 return Trunc.getOpcode() == ISD::FP_ROUND &&
7582 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7583 Trunc.getOperand(0).getOperand(0) == Op &&
7584 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7585 };
7586 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7587 return SDValue();
7588 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7589 return SDValue();
7590 }
7591
7592 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7593 DAG.getConstant(0, dl, MVT::i32));
7594 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7595 DAG.getConstant(1, dl, MVT::i32));
7596}
7597
7598// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7599// from a single input on alternating lanes. For example:
7600// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7601// FP_ROUND(EXTRACT_ELT(X, 2),
7602// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7603 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7604 const ARMSubtarget *ST) {
7605 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7606 if (!ST->hasMVEFloatOps())
7607 return SDValue();
7608
7609 SDLoc dl(BV);
7610 EVT VT = BV.getValueType();
7611 if (VT != MVT::v4f32)
7612 return SDValue();
7613
7614 // We are looking for a buildvector of fpext elements, where all the
7615 // elements are alternating lanes from a single source. For example <0,2,4,6>
7616 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7617 // info from them (they are checked properly in the loop below).
7618 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7619 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7620 return SDValue();
7621 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7622 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7623 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7624 return SDValue();
7625
7626 // Check all the values in the BuildVector line up with our expectations.
7627 for (unsigned i = 1; i < 4; i++) {
7628 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7629 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7630 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7631 Trunc.getOperand(0).getOperand(0) == Op &&
7632 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7633 };
7634 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7635 return SDValue();
7636 }
7637
7638 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7639 DAG.getConstant(Offset, dl, MVT::i32));
7640}
7641
7642// If N is an integer constant that can be moved into a register in one
7643// instruction, return an SDValue of such a constant (will become a MOV
7644// instruction). Otherwise return null.
7645 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7646 const ARMSubtarget *ST, const SDLoc &dl) {
7647 uint64_t Val;
7648 if (!isa<ConstantSDNode>(N))
7649 return SDValue();
7650 Val = N->getAsZExtVal();
7651
7652 if (ST->isThumb1Only()) {
7653 if (Val <= 255 || ~Val <= 255)
7654 return DAG.getConstant(Val, dl, MVT::i32);
7655 } else {
7656 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7657 return DAG.getConstant(Val, dl, MVT::i32);
7658 }
7659 return SDValue();
7660}
7661
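// Lower a BUILD_VECTOR of i1 elements (an MVE predicate) by packing the
// constant lanes into a 32-bit mask, moving that into a predicate register
// with a PREDICATE_CAST, and then inserting any remaining non-constant lanes
// individually. A non-constant splat is instead sign-extended and cast
// directly.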
7662 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7663 const ARMSubtarget *ST) {
7664 SDLoc dl(Op);
7665 EVT VT = Op.getValueType();
7666
7667 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7668
7669 unsigned NumElts = VT.getVectorNumElements();
7670 unsigned BoolMask;
7671 unsigned BitsPerBool;
7672 if (NumElts == 2) {
7673 BitsPerBool = 8;
7674 BoolMask = 0xff;
7675 } else if (NumElts == 4) {
7676 BitsPerBool = 4;
7677 BoolMask = 0xf;
7678 } else if (NumElts == 8) {
7679 BitsPerBool = 2;
7680 BoolMask = 0x3;
7681 } else if (NumElts == 16) {
7682 BitsPerBool = 1;
7683 BoolMask = 0x1;
7684 } else
7685 return SDValue();
7686
7687 // If this is a single value copied into all lanes (a splat), we can just sign
7688 // extend that single value
7689 SDValue FirstOp = Op.getOperand(0);
7690 if (!isa<ConstantSDNode>(FirstOp) &&
7691 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7692 return U.get().isUndef() || U.get() == FirstOp;
7693 })) {
7694 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7695 DAG.getValueType(MVT::i1));
7696 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7697 }
7698
7699 // First create base with bits set where known
7700 unsigned Bits32 = 0;
7701 for (unsigned i = 0; i < NumElts; ++i) {
7702 SDValue V = Op.getOperand(i);
7703 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7704 continue;
7705 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7706 if (BitSet)
7707 Bits32 |= BoolMask << (i * BitsPerBool);
7708 }
7709
7710 // Add in unknown nodes
7711 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7712 DAG.getConstant(Bits32, dl, MVT::i32));
7713 for (unsigned i = 0; i < NumElts; ++i) {
7714 SDValue V = Op.getOperand(i);
7715 if (isa<ConstantSDNode>(V) || V.isUndef())
7716 continue;
7717 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7718 DAG.getConstant(i, dl, MVT::i32));
7719 }
7720
7721 return Base;
7722}
7723
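// Try to lower a BUILD_VECTOR of the form <X, X+N, X+2N, ...>, where N is 1,
// 2, 4 or 8, to an MVE VIDUP that generates the whole incrementing sequence
// from the scalar base X and the immediate step N.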
7724 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7725 const ARMSubtarget *ST) {
7726 if (!ST->hasMVEIntegerOps())
7727 return SDValue();
7728
7729 // We are looking for a buildvector where each element is Op[0] + i*N
7730 EVT VT = Op.getValueType();
7731 SDValue Op0 = Op.getOperand(0);
7732 unsigned NumElts = VT.getVectorNumElements();
7733
7734 // Get the increment value from operand 1
7735 SDValue Op1 = Op.getOperand(1);
7736 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7737 !isa<ConstantSDNode>(Op1.getOperand(1)))
7738 return SDValue();
7739 unsigned N = Op1.getConstantOperandVal(1);
7740 if (N != 1 && N != 2 && N != 4 && N != 8)
7741 return SDValue();
7742
7743 // Check that each other operand matches
7744 for (unsigned I = 2; I < NumElts; I++) {
7745 SDValue OpI = Op.getOperand(I);
7746 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7747 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7748 OpI.getConstantOperandVal(1) != I * N)
7749 return SDValue();
7750 }
7751
7752 SDLoc DL(Op);
7753 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7754 DAG.getConstant(N, DL, MVT::i32));
7755}
7756
7757// Returns true if the operation N can be treated as qr instruction variant at
7758// operand Op.
7759static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7760 switch (N->getOpcode()) {
7761 case ISD::ADD:
7762 case ISD::MUL:
7763 case ISD::SADDSAT:
7764 case ISD::UADDSAT:
7765 case ISD::AVGFLOORS:
7766 case ISD::AVGFLOORU:
7767 return true;
7768 case ISD::SUB:
7769 case ISD::SSUBSAT:
7770 case ISD::USUBSAT:
7771 return N->getOperand(1).getNode() == Op;
7772 case ISD::INTRINSIC_WO_CHAIN:
7773 switch (N->getConstantOperandVal(0)) {
7774 case Intrinsic::arm_mve_add_predicated:
7775 case Intrinsic::arm_mve_mul_predicated:
7776 case Intrinsic::arm_mve_qadd_predicated:
7777 case Intrinsic::arm_mve_vhadd:
7778 case Intrinsic::arm_mve_hadd_predicated:
7779 case Intrinsic::arm_mve_vqdmulh:
7780 case Intrinsic::arm_mve_qdmulh_predicated:
7781 case Intrinsic::arm_mve_vqrdmulh:
7782 case Intrinsic::arm_mve_qrdmulh_predicated:
7783 case Intrinsic::arm_mve_vqdmull:
7784 case Intrinsic::arm_mve_vqdmull_predicated:
7785 return true;
7786 case Intrinsic::arm_mve_sub_predicated:
7787 case Intrinsic::arm_mve_qsub_predicated:
7788 case Intrinsic::arm_mve_vhsub:
7789 case Intrinsic::arm_mve_hsub_predicated:
7790 return N->getOperand(2).getNode() == Op;
7791 default:
7792 return false;
7793 }
7794 default:
7795 return false;
7796 }
7797}
7798
7799// If this is a case we can't handle, return null and let the default
7800// expansion code take care of it.
7801SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7802 const ARMSubtarget *ST) const {
7803 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7804 SDLoc dl(Op);
7805 EVT VT = Op.getValueType();
7806
7807 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7808 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7809
7810 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7811 return R;
7812
7813 APInt SplatBits, SplatUndef;
7814 unsigned SplatBitSize;
7815 bool HasAnyUndefs;
7816 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7817 if (SplatUndef.isAllOnes())
7818 return DAG.getUNDEF(VT);
7819
7820 // If all the users of this constant splat are qr instruction variants,
7821 // generate a vdup of the constant.
7822 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7823 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7824 all_of(BVN->users(),
7825 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7826 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7827 : SplatBitSize == 16 ? MVT::v8i16
7828 : MVT::v16i8;
7829 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7830 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7831 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7832 }
7833
7834 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7835 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7836 // Check if an immediate VMOV works.
7837 EVT VmovVT;
7838 SDValue Val =
7839 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7840 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7841
7842 if (Val.getNode()) {
7843 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7844 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7845 }
7846
7847 // Try an immediate VMVN.
7848 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7849 Val = isVMOVModifiedImm(
7850 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7851 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7852 if (Val.getNode()) {
7853 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7854 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7855 }
7856
7857 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7858 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7859 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7860 if (ImmVal != -1) {
7861 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7862 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7863 }
7864 }
7865
7866 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7867 // type.
7868 if (ST->hasMVEIntegerOps() &&
7869 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7870 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7871 : SplatBitSize == 16 ? MVT::v8i16
7872 : MVT::v16i8;
7873 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7874 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7875 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7876 }
7877 }
7878 }
7879
7880 // Scan through the operands to see if only one value is used.
7881 //
7882 // As an optimisation, even if more than one value is used it may be more
7883 // profitable to splat with one value then change some lanes.
7884 //
7885 // Heuristically we decide to do this if the vector has a "dominant" value,
7886 // defined as splatted to more than half of the lanes.
7887 unsigned NumElts = VT.getVectorNumElements();
7888 bool isOnlyLowElement = true;
7889 bool usesOnlyOneValue = true;
7890 bool hasDominantValue = false;
7891 bool isConstant = true;
7892
7893 // Map of the number of times a particular SDValue appears in the
7894 // element list.
7895 DenseMap<SDValue, unsigned> ValueCounts;
7896 SDValue Value;
7897 for (unsigned i = 0; i < NumElts; ++i) {
7898 SDValue V = Op.getOperand(i);
7899 if (V.isUndef())
7900 continue;
7901 if (i > 0)
7902 isOnlyLowElement = false;
7903 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7904 isConstant = false;
7905
7906 unsigned &Count = ValueCounts[V];
7907
7908 // Is this value dominant? (takes up more than half of the lanes)
7909 if (++Count > (NumElts / 2)) {
7910 hasDominantValue = true;
7911 Value = V;
7912 }
7913 }
7914 if (ValueCounts.size() != 1)
7915 usesOnlyOneValue = false;
7916 if (!Value.getNode() && !ValueCounts.empty())
7917 Value = ValueCounts.begin()->first;
7918
7919 if (ValueCounts.empty())
7920 return DAG.getUNDEF(VT);
7921
7922 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7923 // Keep going if we are hitting this case.
7924 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7925 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7926
7927 unsigned EltSize = VT.getScalarSizeInBits();
7928
7929 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7930 // i32 and try again.
7931 if (hasDominantValue && EltSize <= 32) {
7932 if (!isConstant) {
7933 SDValue N;
7934
7935 // If we are VDUPing a value that comes directly from a vector, that will
7936 // cause an unnecessary move to and from a GPR, where instead we could
7937 // just use VDUPLANE. We can only do this if the lane being extracted
7938 // is at a constant index, as the VDUP from lane instructions only have
7939 // constant-index forms.
7940 ConstantSDNode *constIndex;
7941 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7942 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7943 // We need to create a new undef vector to use for the VDUPLANE if the
7944 // size of the vector from which we get the value is different than the
7945 // size of the vector that we need to create. We will insert the element
7946 // such that the register coalescer will remove unnecessary copies.
7947 if (VT != Value->getOperand(0).getValueType()) {
7948 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7949 VT.getVectorNumElements();
7950 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7951 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7952 Value, DAG.getConstant(index, dl, MVT::i32)),
7953 DAG.getConstant(index, dl, MVT::i32));
7954 } else
7955 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7956 Value->getOperand(0), Value->getOperand(1));
7957 } else
7958 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7959
7960 if (!usesOnlyOneValue) {
7961 // The dominant value was splatted as 'N', but we now have to insert
7962 // all differing elements.
7963 for (unsigned I = 0; I < NumElts; ++I) {
7964 if (Op.getOperand(I) == Value)
7965 continue;
7966 SmallVector<SDValue, 3> Ops;
7967 Ops.push_back(N);
7968 Ops.push_back(Op.getOperand(I));
7969 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7970 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7971 }
7972 }
7973 return N;
7974 }
7975 if (VT.getVectorElementType().isFloatingPoint()) {
7976 SmallVector<SDValue, 8> Ops;
7977 MVT FVT = VT.getVectorElementType().getSimpleVT();
7978 assert(FVT == MVT::f32 || FVT == MVT::f16);
7979 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7980 for (unsigned i = 0; i < NumElts; ++i)
7981 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7982 Op.getOperand(i)));
7983 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7984 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7985 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7986 if (Val.getNode())
7987 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7988 }
7989 if (usesOnlyOneValue) {
7990 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7991 if (isConstant && Val.getNode())
7992 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7993 }
7994 }
7995
7996 // If all elements are constants and the case above didn't get hit, fall back
7997 // to the default expansion, which will generate a load from the constant
7998 // pool.
7999 if (isConstant)
8000 return SDValue();
8001
8002 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8003 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8004 // length <= 2.
8005 if (NumElts >= 4)
8006 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8007 return shuffle;
8008
8009 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8010 // VCVT's
8011 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8012 return VCVT;
8013 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8014 return VCVT;
8015
8016 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8017 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8018 // into two 64-bit vectors; we might discover a better way to lower it.
8019 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8020 EVT ExtVT = VT.getVectorElementType();
8021 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8022 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8023 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8024 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8025 SDValue Upper =
8026 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8027 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8028 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8029 if (Lower && Upper)
8030 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8031 }
8032
8033 // Vectors with 32- or 64-bit elements can be built by directly assigning
8034 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8035 // will be legalized.
8036 if (EltSize >= 32) {
8037 // Do the expansion with floating-point types, since that is what the VFP
8038 // registers are defined to use, and since i64 is not legal.
8039 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8040 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8041 SmallVector<SDValue, 8> Ops;
8042 for (unsigned i = 0; i < NumElts; ++i)
8043 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8044 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8045 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8046 }
8047
8048 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8049 // know the default expansion would otherwise fall back on something even
8050 // worse. For a vector with one or two non-undef values, that's
8051 // scalar_to_vector for the elements followed by a shuffle (provided the
8052 // shuffle is valid for the target) and materialization element by element
8053 // on the stack followed by a load for everything else.
8054 if (!isConstant && !usesOnlyOneValue) {
8055 SDValue Vec = DAG.getUNDEF(VT);
8056 for (unsigned i = 0 ; i < NumElts; ++i) {
8057 SDValue V = Op.getOperand(i);
8058 if (V.isUndef())
8059 continue;
8060 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8061 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8062 }
8063 return Vec;
8064 }
8065
8066 return SDValue();
8067}
8068
8069// Gather data to see if the operation can be modelled as a
8070// shuffle in combination with VEXTs.
8071SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8072 SelectionDAG &DAG) const {
8073 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8074 SDLoc dl(Op);
8075 EVT VT = Op.getValueType();
8076 unsigned NumElts = VT.getVectorNumElements();
8077
8078 struct ShuffleSourceInfo {
8079 SDValue Vec;
8080 unsigned MinElt = std::numeric_limits<unsigned>::max();
8081 unsigned MaxElt = 0;
8082
8083 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8084 // be compatible with the shuffle we intend to construct. As a result
8085 // ShuffleVec will be some sliding window into the original Vec.
8086 SDValue ShuffleVec;
8087
8088 // Code should guarantee that element i in Vec starts at element "WindowBase
8089 // + i * WindowScale in ShuffleVec".
8090 int WindowBase = 0;
8091 int WindowScale = 1;
8092
8093 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8094
8095 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8096 };
8097
8098 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8099 // node.
8100 SmallVector<ShuffleSourceInfo, 2> Sources;
8101 for (unsigned i = 0; i < NumElts; ++i) {
8102 SDValue V = Op.getOperand(i);
8103 if (V.isUndef())
8104 continue;
8105 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8106 // A shuffle can only come from building a vector from various
8107 // elements of other vectors.
8108 return SDValue();
8109 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8110 // Furthermore, shuffles require a constant mask, whereas extractelts
8111 // accept variable indices.
8112 return SDValue();
8113 }
8114
8115 // Add this element source to the list if it's not already there.
8116 SDValue SourceVec = V.getOperand(0);
8117 auto Source = llvm::find(Sources, SourceVec);
8118 if (Source == Sources.end())
8119 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8120
8121 // Update the minimum and maximum lane number seen.
8122 unsigned EltNo = V.getConstantOperandVal(1);
8123 Source->MinElt = std::min(Source->MinElt, EltNo);
8124 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8125 }
8126
8127 // Currently only do something sane when at most two source vectors
8128 // are involved.
8129 if (Sources.size() > 2)
8130 return SDValue();
8131
8132 // Find out the smallest element size among result and two sources, and use
8133 // it as element size to build the shuffle_vector.
8134 EVT SmallestEltTy = VT.getVectorElementType();
8135 for (auto &Source : Sources) {
8136 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8137 if (SrcEltTy.bitsLT(SmallestEltTy))
8138 SmallestEltTy = SrcEltTy;
8139 }
8140 unsigned ResMultiplier =
8141 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8142 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8143 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8144
8145 // If the source vector is too wide or too narrow, we may nevertheless be able
8146 // to construct a compatible shuffle either by concatenating it with UNDEF or
8147 // extracting a suitable range of elements.
8148 for (auto &Src : Sources) {
8149 EVT SrcVT = Src.ShuffleVec.getValueType();
8150
8151 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8152 uint64_t VTSize = VT.getFixedSizeInBits();
8153 if (SrcVTSize == VTSize)
8154 continue;
8155
8156 // This stage of the search produces a source with the same element type as
8157 // the original, but with a total width matching the BUILD_VECTOR output.
8158 EVT EltVT = SrcVT.getVectorElementType();
8159 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8160 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8161
8162 if (SrcVTSize < VTSize) {
8163 if (2 * SrcVTSize != VTSize)
8164 return SDValue();
8165 // We can pad out the smaller vector for free, so if it's part of a
8166 // shuffle...
8167 Src.ShuffleVec =
8168 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8169 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8170 continue;
8171 }
8172
8173 if (SrcVTSize != 2 * VTSize)
8174 return SDValue();
8175
8176 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8177 // Span too large for a VEXT to cope
8178 return SDValue();
8179 }
8180
8181 if (Src.MinElt >= NumSrcElts) {
8182 // The extraction can just take the second half
8183 Src.ShuffleVec =
8184 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8185 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8186 Src.WindowBase = -NumSrcElts;
8187 } else if (Src.MaxElt < NumSrcElts) {
8188 // The extraction can just take the first half
8189 Src.ShuffleVec =
8190 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8191 DAG.getConstant(0, dl, MVT::i32));
8192 } else {
8193 // An actual VEXT is needed
8194 SDValue VEXTSrc1 =
8195 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8196 DAG.getConstant(0, dl, MVT::i32));
8197 SDValue VEXTSrc2 =
8198 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8199 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8200
8201 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8202 VEXTSrc2,
8203 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8204 Src.WindowBase = -Src.MinElt;
8205 }
8206 }
8207
8208 // Another possible incompatibility occurs from the vector element types. We
8209 // can fix this by bitcasting the source vectors to the same type we intend
8210 // for the shuffle.
8211 for (auto &Src : Sources) {
8212 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8213 if (SrcEltTy == SmallestEltTy)
8214 continue;
8215 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8216 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8217 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8218 Src.WindowBase *= Src.WindowScale;
8219 }
8220
8221 // Final check before we try to actually produce a shuffle.
8222 LLVM_DEBUG({
8223 for (auto Src : Sources)
8224 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8225 });
8226
8227 // The stars all align, our next step is to produce the mask for the shuffle.
8228 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8229 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8230 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8231 SDValue Entry = Op.getOperand(i);
8232 if (Entry.isUndef())
8233 continue;
8234
8235 auto Src = llvm::find(Sources, Entry.getOperand(0));
8236 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8237
8238 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8239 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8240 // segment.
8241 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8242 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8243 VT.getScalarSizeInBits());
8244 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8245
8246 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8247 // starting at the appropriate offset.
8248 int *LaneMask = &Mask[i * ResMultiplier];
8249
8250 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8251 ExtractBase += NumElts * (Src - Sources.begin());
8252 for (int j = 0; j < LanesDefined; ++j)
8253 LaneMask[j] = ExtractBase + j;
8254 }
8255
8256
8257 // We can't handle more than two sources. This should have already
8258 // been checked before this point.
8259 assert(Sources.size() <= 2 && "Too many sources!");
8260
8261 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8262 for (unsigned i = 0; i < Sources.size(); ++i)
8263 ShuffleOps[i] = Sources[i].ShuffleVec;
8264
8265 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8266 ShuffleOps[1], Mask, DAG);
8267 if (!Shuffle)
8268 return SDValue();
8269 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8270}
8271
8272 enum ShuffleOpCodes {
8273 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8274 OP_VREV,
8275 OP_VDUP0,
8276 OP_VDUP1,
8277 OP_VDUP2,
8278 OP_VDUP3,
8279 OP_VEXT1,
8280 OP_VEXT2,
8281 OP_VEXT3,
8282 OP_VUZPL, // VUZP, left result
8283 OP_VUZPR, // VUZP, right result
8284 OP_VZIPL, // VZIP, left result
8285 OP_VZIPR, // VZIP, right result
8286 OP_VTRNL, // VTRN, left result
8287 OP_VTRNR // VTRN, right result
8288 };
8289
8290static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8291 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8292 switch (OpNum) {
8293 case OP_COPY:
8294 case OP_VREV:
8295 case OP_VDUP0:
8296 case OP_VDUP1:
8297 case OP_VDUP2:
8298 case OP_VDUP3:
8299 return true;
8300 }
8301 return false;
8302}
8303
8304/// isShuffleMaskLegal - Targets can use this to indicate that they only
8305/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8306/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8307/// are assumed to be legal.
8308 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8309 if (VT.getVectorNumElements() == 4 &&
8310 (VT.is128BitVector() || VT.is64BitVector())) {
8311 unsigned PFIndexes[4];
8312 for (unsigned i = 0; i != 4; ++i) {
8313 if (M[i] < 0)
8314 PFIndexes[i] = 8;
8315 else
8316 PFIndexes[i] = M[i];
8317 }
8318
8319 // Compute the index in the perfect shuffle table.
8320 unsigned PFTableIndex =
8321 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8322 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8323 unsigned Cost = (PFEntry >> 30);
8324
8325 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8326 return true;
8327 }
8328
8329 bool ReverseVEXT, isV_UNDEF;
8330 unsigned Imm, WhichResult;
8331
8332 unsigned EltSize = VT.getScalarSizeInBits();
8333 if (EltSize >= 32 ||
8335 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8336 isVREVMask(M, VT, 64) ||
8337 isVREVMask(M, VT, 32) ||
8338 isVREVMask(M, VT, 16))
8339 return true;
8340 else if (Subtarget->hasNEON() &&
8341 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8342 isVTBLMask(M, VT) ||
8343 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8344 return true;
8345 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8346 isReverseMask(M, VT))
8347 return true;
8348 else if (Subtarget->hasMVEIntegerOps() &&
8349 (isVMOVNMask(M, VT, true, false) ||
8350 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8351 return true;
8352 else if (Subtarget->hasMVEIntegerOps() &&
8353 (isTruncMask(M, VT, false, false) ||
8354 isTruncMask(M, VT, false, true) ||
8355 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8356 return true;
8357 else
8358 return false;
8359}
8360
8361/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8362/// the specified operations to build the shuffle.
8363static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8364 SDValue RHS, SelectionDAG &DAG,
8365 const SDLoc &dl) {
8366 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8367 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8368 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8369
8370 if (OpNum == OP_COPY) {
8371 if (LHSID == (1*9+2)*9+3) return LHS;
8372 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8373 return RHS;
8374 }
8375
8376 SDValue OpLHS, OpRHS;
8377 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8378 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8379 EVT VT = OpLHS.getValueType();
8380
8381 switch (OpNum) {
8382 default: llvm_unreachable("Unknown shuffle opcode!");
8383 case OP_VREV:
8384 // VREV divides the vector in half and swaps within the half.
8385 if (VT.getScalarSizeInBits() == 32)
8386 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8387 // vrev <4 x i16> -> VREV32
8388 if (VT.getScalarSizeInBits() == 16)
8389 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8390 // vrev <4 x i8> -> VREV16
8391 assert(VT.getScalarSizeInBits() == 8);
8392 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8393 case OP_VDUP0:
8394 case OP_VDUP1:
8395 case OP_VDUP2:
8396 case OP_VDUP3:
8397 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8398 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8399 case OP_VEXT1:
8400 case OP_VEXT2:
8401 case OP_VEXT3:
8402 return DAG.getNode(ARMISD::VEXT, dl, VT,
8403 OpLHS, OpRHS,
8404 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8405 case OP_VUZPL:
8406 case OP_VUZPR:
8407 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8408 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8409 case OP_VZIPL:
8410 case OP_VZIPR:
8411 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8412 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8413 case OP_VTRNL:
8414 case OP_VTRNR:
8415 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8416 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8417 }
8418}
8419
8420 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8421 ArrayRef<int> ShuffleMask,
8422 SelectionDAG &DAG) {
8423 // Check to see if we can use the VTBL instruction.
8424 SDValue V1 = Op.getOperand(0);
8425 SDValue V2 = Op.getOperand(1);
8426 SDLoc DL(Op);
8427
8428 SmallVector<SDValue, 8> VTBLMask;
8429 for (int I : ShuffleMask)
8430 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8431
8432 if (V2.getNode()->isUndef())
8433 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8434 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8435
8436 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8437 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8438}
8439
8441 SDLoc DL(Op);
8442 EVT VT = Op.getValueType();
8443
8444 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8445 "Expect an v8i16/v16i8 type");
8446 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8447 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8448 // extract the first 8 bytes into the top double word and the last 8 bytes
8449 // into the bottom double word, through a new vector shuffle that will be
8450 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8451 std::vector<int> NewMask;
8452 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8453 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8454 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8455 NewMask.push_back(i);
8456 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8457}
8458
8459 static EVT getVectorTyFromPredicateVector(EVT VT) {
8460 switch (VT.getSimpleVT().SimpleTy) {
8461 case MVT::v2i1:
8462 return MVT::v2f64;
8463 case MVT::v4i1:
8464 return MVT::v4i32;
8465 case MVT::v8i1:
8466 return MVT::v8i16;
8467 case MVT::v16i1:
8468 return MVT::v16i8;
8469 default:
8470 llvm_unreachable("Unexpected vector predicate type");
8471 }
8472}
8473
8474 static SDValue PromoteMVEPredVector(const SDLoc &dl, SDValue Pred, EVT VT,
8475 SelectionDAG &DAG) {
8476 // Converting from boolean predicates to integers involves creating a vector
8477 // of all ones or all zeroes and selecting the lanes based upon the real
8478 // predicate.
8479 SDValue AllOnes =
8480 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8481 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8482
8483 SDValue AllZeroes =
8484 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8485 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8486
8487 // Get full vector type from predicate type
8488 EVT NewVT = getVectorTyFromPredicateVector(VT);
8489
8490 SDValue RecastV1;
8491 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8492 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8493 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8494 // since we know in hardware the sizes are really the same.
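// (For reference: every MVE predicate type is backed by the same 16-bit
// predicate register, so a v4i1 and a v16i1 occupy identical storage and the
// PREDICATE_CAST only changes how those 16 bits are interpreted.)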
8495 if (VT != MVT::v16i1)
8496 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8497 else
8498 RecastV1 = Pred;
8499
8500 // Select either all ones or zeroes depending upon the real predicate bits.
8501 SDValue PredAsVector =
8502 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8503
8504 // Recast our new predicate-as-integer v16i8 vector into something
8505 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8506 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8507}
8508
8509static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8510 const ARMSubtarget *ST) {
8511 EVT VT = Op.getValueType();
8512 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8513 ArrayRef<int> ShuffleMask = SVN->getMask();
8514
8515 assert(ST->hasMVEIntegerOps() &&
8516 "No support for vector shuffle of boolean predicates");
8517
8518 SDValue V1 = Op.getOperand(0);
8519 SDValue V2 = Op.getOperand(1);
8520 SDLoc dl(Op);
8521 if (isReverseMask(ShuffleMask, VT)) {
8522 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8523 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8524 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8525 DAG.getConstant(16, dl, MVT::i32));
8526 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8527 }
8528
8529 // Until we can come up with optimised cases for every single vector
8530 // shuffle in existence we have chosen the least painful strategy. This is
8532 // to essentially promote the boolean predicate to an 8-bit integer, where
8532 // each predicate represents a byte. Then we fall back on a normal integer
8533 // vector shuffle and convert the result back into a predicate vector. In
8534 // many cases the generated code might be even better than scalar code
8535 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8536 // fields in a register into 8 other arbitrary 2-bit fields!
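// As a rough illustration: a v4i1 shuffle with mask <3,2,1,0> becomes a
// v4i32 shuffle whose lanes are all-ones or all-zeroes, and the VCMPZ
// against zero below converts the shuffled integer vector back to a v4i1.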
8537 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8538 EVT NewVT = PredAsVector1.getValueType();
8539 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8540 : PromoteMVEPredVector(dl, V2, VT, DAG);
8541 assert(PredAsVector2.getValueType() == NewVT &&
8542 "Expected identical vector type in expanded i1 shuffle!");
8543
8544 // Do the shuffle!
8545 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8546 PredAsVector2, ShuffleMask);
8547
8548 // Now return the result of comparing the shuffled vector with zero,
8549 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8550 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8551 if (VT == MVT::v2i1) {
8552 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8553 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8554 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8555 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8556 }
8557 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8558 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8559}
8560
8561static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8562 ArrayRef<int> ShuffleMask,
8563 SelectionDAG &DAG) {
8564 // Attempt to lower the vector shuffle using as many whole register movs as
8565 // possible. This is useful for types smaller than 32bits, which would
8566 // often otherwise become a series of GPR movs.
8567 SDLoc dl(Op);
8568 EVT VT = Op.getValueType();
8569 if (VT.getScalarSizeInBits() >= 32)
8570 return SDValue();
8571
8572 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8573 "Unexpected vector type");
8574 int NumElts = VT.getVectorNumElements();
8575 int QuarterSize = NumElts / 4;
8576 // The four final parts of the vector, as i32's
8577 SDValue Parts[4];
8578
8579 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8580 // <u,u,u,u>), returning the vmov lane index
8581 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8582 // Detect which mov lane this would be from the first non-undef element.
8583 int MovIdx = -1;
8584 for (int i = 0; i < Length; i++) {
8585 if (ShuffleMask[Start + i] >= 0) {
8586 if (ShuffleMask[Start + i] % Length != i)
8587 return -1;
8588 MovIdx = ShuffleMask[Start + i] / Length;
8589 break;
8590 }
8591 }
8592 // If all items are undef, leave this for other combines
8593 if (MovIdx == -1)
8594 return -1;
8595 // Check the remaining values are the correct part of the same mov
8596 for (int i = 1; i < Length; i++) {
8597 if (ShuffleMask[Start + i] >= 0 &&
8598 (ShuffleMask[Start + i] / Length != MovIdx ||
8599 ShuffleMask[Start + i] % Length != i))
8600 return -1;
8601 }
8602 return MovIdx;
8603 };
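// As a small worked example for a v8i16 shuffle: Part 0 covers mask elements
// [0,1], so a part starting <2,3,...> yields MovIdx 1, i.e. the second i32
// lane of the bitcast input, which can be copied with one whole-register mov.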
8604
8605 for (int Part = 0; Part < 4; ++Part) {
8606 // Does this part look like a mov
8607 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8608 if (Elt != -1) {
8609 SDValue Input = Op->getOperand(0);
8610 if (Elt >= 4) {
8611 Input = Op->getOperand(1);
8612 Elt -= 4;
8613 }
8614 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8615 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8616 DAG.getConstant(Elt, dl, MVT::i32));
8617 }
8618 }
8619
8620 // Nothing interesting found, just return
8621 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8622 return SDValue();
8623
8624 // The other parts need to be built with the old shuffle vector, cast to a
8625 // v4i32 and extract_vector_elts
8626 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8627 SmallVector<int, 16> NewShuffleMask;
8628 for (int Part = 0; Part < 4; ++Part)
8629 for (int i = 0; i < QuarterSize; i++)
8630 NewShuffleMask.push_back(
8631 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8632 SDValue NewShuffle = DAG.getVectorShuffle(
8633 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8634 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8635
8636 for (int Part = 0; Part < 4; ++Part)
8637 if (!Parts[Part])
8638 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8639 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8640 }
8641 // Build a vector out of the various parts and bitcast it back to the original
8642 // type.
8643 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8644 return DAG.getBitcast(VT, NewVec);
8645}
8646
8647static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8648 ArrayRef<int> ShuffleMask,
8649 SelectionDAG &DAG) {
8650 SDValue V1 = Op.getOperand(0);
8651 SDValue V2 = Op.getOperand(1);
8652 EVT VT = Op.getValueType();
8653 unsigned NumElts = VT.getVectorNumElements();
8654
8655 // A One-Off Identity mask is one that is mostly an identity mask from a
8656 // single source but contains a single element out-of-place, either from a
8657 // different vector or from another position in the same vector. As opposed to
8658 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8659 // pair directly.
8660 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8661 int &OffElement) {
8662 OffElement = -1;
8663 int NonUndef = 0;
8664 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8665 if (Mask[i] == -1)
8666 continue;
8667 NonUndef++;
8668 if (Mask[i] != i + BaseOffset) {
8669 if (OffElement == -1)
8670 OffElement = i;
8671 else
8672 return false;
8673 }
8674 }
8675 return NonUndef > 2 && OffElement != -1;
8676 };
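// For instance, the v8i16 mask <0,1,2,11,4,5,6,7> is an identity from V1
// except element 3, which comes from V2 lane 3; it lowers to a single
// extract from V2 followed by a single insert into V1.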
8677 int OffElement;
8678 SDValue VInput;
8679 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8680 VInput = V1;
8681 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8682 VInput = V2;
8683 else
8684 return SDValue();
8685
8686 SDLoc dl(Op);
8687 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8688 ? MVT::i32
8689 : VT.getScalarType();
8690 SDValue Elt = DAG.getNode(
8691 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8692 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8693 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8694 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8695 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8696}
8697
8698static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8699 const ARMSubtarget *ST) {
8700 SDValue V1 = Op.getOperand(0);
8701 SDValue V2 = Op.getOperand(1);
8702 SDLoc dl(Op);
8703 EVT VT = Op.getValueType();
8704 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8705 unsigned EltSize = VT.getScalarSizeInBits();
8706
8707 if (ST->hasMVEIntegerOps() && EltSize == 1)
8708 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8709
8710 // Convert shuffles that are directly supported on NEON to target-specific
8711 // DAG nodes, instead of keeping them as shuffles and matching them again
8712 // during code selection. This is more efficient and avoids the possibility
8713 // of inconsistencies between legalization and selection.
8714 // FIXME: floating-point vectors should be canonicalized to integer vectors
8715 // of the same type so that they get CSEd properly.
8716 ArrayRef<int> ShuffleMask = SVN->getMask();
8717
8718 if (EltSize <= 32) {
8719 if (SVN->isSplat()) {
8720 int Lane = SVN->getSplatIndex();
8721 // If this is undef splat, generate it via "just" vdup, if possible.
8722 if (Lane == -1) Lane = 0;
8723
8724 // Test if V1 is a SCALAR_TO_VECTOR.
8725 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8726 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8727 }
8728 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8729 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8730 // reaches it).
8731 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8732 !isa<ConstantSDNode>(V1.getOperand(0))) {
8733 bool IsScalarToVector = true;
8734 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8735 if (!V1.getOperand(i).isUndef()) {
8736 IsScalarToVector = false;
8737 break;
8738 }
8739 if (IsScalarToVector)
8740 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8741 }
8742 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8743 DAG.getConstant(Lane, dl, MVT::i32));
8744 }
8745
8746 bool ReverseVEXT = false;
8747 unsigned Imm = 0;
8748 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8749 if (ReverseVEXT)
8750 std::swap(V1, V2);
8751 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8752 DAG.getConstant(Imm, dl, MVT::i32));
8753 }
8754
8755 if (isVREVMask(ShuffleMask, VT, 64))
8756 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8757 if (isVREVMask(ShuffleMask, VT, 32))
8758 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8759 if (isVREVMask(ShuffleMask, VT, 16))
8760 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8761
8762 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8763 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8764 DAG.getConstant(Imm, dl, MVT::i32));
8765 }
8766
8767 // Check for Neon shuffles that modify both input vectors in place.
8768 // If both results are used, i.e., if there are two shuffles with the same
8769 // source operands and with masks corresponding to both results of one of
8770 // these operations, DAG memoization will ensure that a single node is
8771 // used for both shuffles.
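// For example, on v4i32 the mask <0,4,1,5> matches the first result of a
// VZIP of V1 and V2 and <2,6,3,7> matches the second, so two such shuffles
// of the same operands end up sharing one VZIP node.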
8772 unsigned WhichResult = 0;
8773 bool isV_UNDEF = false;
8774 if (ST->hasNEON()) {
8775 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8776 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8777 if (isV_UNDEF)
8778 V2 = V1;
8779 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8780 .getValue(WhichResult);
8781 }
8782 }
8783 if (ST->hasMVEIntegerOps()) {
8784 if (isVMOVNMask(ShuffleMask, VT, false, false))
8785 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8786 DAG.getConstant(0, dl, MVT::i32));
8787 if (isVMOVNMask(ShuffleMask, VT, true, false))
8788 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8789 DAG.getConstant(1, dl, MVT::i32));
8790 if (isVMOVNMask(ShuffleMask, VT, true, true))
8791 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8792 DAG.getConstant(1, dl, MVT::i32));
8793 }
8794
8795 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8796 // shuffles that produce a result larger than their operands with:
8797 // shuffle(concat(v1, undef), concat(v2, undef))
8798 // ->
8799 // shuffle(concat(v1, v2), undef)
8800 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8801 //
8802 // This is useful in the general case, but there are special cases where
8803 // native shuffles produce larger results: the two-result ops.
8804 //
8805 // Look through the concat when lowering them:
8806 // shuffle(concat(v1, v2), undef)
8807 // ->
8808 // concat(VZIP(v1, v2):0, :1)
8809 //
8810 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8811 SDValue SubV1 = V1->getOperand(0);
8812 SDValue SubV2 = V1->getOperand(1);
8813 EVT SubVT = SubV1.getValueType();
8814
8815 // We expect these to have been canonicalized to -1.
8816 assert(llvm::all_of(ShuffleMask, [&](int i) {
8817 return i < (int)VT.getVectorNumElements();
8818 }) && "Unexpected shuffle index into UNDEF operand!");
8819
8820 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8821 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8822 if (isV_UNDEF)
8823 SubV2 = SubV1;
8824 assert((WhichResult == 0) &&
8825 "In-place shuffle of concat can only have one result!");
8826 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8827 SubV1, SubV2);
8828 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8829 Res.getValue(1));
8830 }
8831 }
8832 }
8833
8834 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8835 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8836 return V;
8837
8838 for (bool Top : {false, true}) {
8839 for (bool SingleSource : {false, true}) {
8840 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8841 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8842 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8843 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8844 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8845 SingleSource ? V1 : V2);
8846 if (Top) {
8847 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8848 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8849 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8850 }
8851 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8852 }
8853 }
8854 }
8855 }
8856
8857 // If the shuffle is not directly supported and it has 4 elements, use
8858 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8859 unsigned NumElts = VT.getVectorNumElements();
8860 if (NumElts == 4) {
8861 unsigned PFIndexes[4];
8862 for (unsigned i = 0; i != 4; ++i) {
8863 if (ShuffleMask[i] < 0)
8864 PFIndexes[i] = 8;
8865 else
8866 PFIndexes[i] = ShuffleMask[i];
8867 }
8868
8869 // Compute the index in the perfect shuffle table.
8870 unsigned PFTableIndex =
8871 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8872 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8873 unsigned Cost = (PFEntry >> 30);
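// Each PerfectShuffleTable entry packs the cost into its top two bits
// alongside the operation and two 13-bit operand IDs consumed by
// GeneratePerfectShuffle; e.g. the mask <1,1,3,3> maps to table index
// 1*9*9*9 + 1*9*9 + 3*9 + 3 = 840.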
8874
8875 if (Cost <= 4) {
8876 if (ST->hasNEON())
8877 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8878 else if (isLegalMVEShuffleOp(PFEntry)) {
8879 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8880 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8881 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8882 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8883 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8884 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8885 }
8886 }
8887 }
8888
8889 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8890 if (EltSize >= 32) {
8891 // Do the expansion with floating-point types, since that is what the VFP
8892 // registers are defined to use, and since i64 is not legal.
8893 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8894 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8895 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8896 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8897 SmallVector<SDValue, 8> Ops;
8898 for (unsigned i = 0; i < NumElts; ++i) {
8899 if (ShuffleMask[i] < 0)
8900 Ops.push_back(DAG.getUNDEF(EltVT));
8901 else
8902 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8903 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8904 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8905 dl, MVT::i32)));
8906 }
8907 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8908 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8909 }
8910
8911 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8912 isReverseMask(ShuffleMask, VT))
8913 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8914
8915 if (ST->hasNEON() && VT == MVT::v8i8)
8916 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8917 return NewOp;
8918
8919 if (ST->hasMVEIntegerOps())
8920 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8921 return NewOp;
8922
8923 return SDValue();
8924}
8925
8926static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8927 const ARMSubtarget *ST) {
8928 EVT VecVT = Op.getOperand(0).getValueType();
8929 SDLoc dl(Op);
8930
8931 assert(ST->hasMVEIntegerOps() &&
8932 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8933
8934 SDValue Conv =
8935 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8936 unsigned Lane = Op.getConstantOperandVal(2);
8937 unsigned LaneWidth =
8938 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8939 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8940 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8941 Op.getOperand(1), DAG.getValueType(MVT::i1));
8942 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8943 DAG.getConstant(~Mask, dl, MVT::i32));
8944 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8945}
8946
8947SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8948 SelectionDAG &DAG) const {
8949 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8950 SDValue Lane = Op.getOperand(2);
8951 if (!isa<ConstantSDNode>(Lane))
8952 return SDValue();
8953
8954 SDValue Elt = Op.getOperand(1);
8955 EVT EltVT = Elt.getValueType();
8956
8957 if (Subtarget->hasMVEIntegerOps() &&
8958 Op.getValueType().getScalarSizeInBits() == 1)
8959 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8960
8961 if (getTypeAction(*DAG.getContext(), EltVT) ==
8962 TargetLowering::TypePromoteFloat) {
8963 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8964 // but the type system will try to do that if we don't intervene.
8965 // Reinterpret any such vector-element insertion as one with the
8966 // corresponding integer types.
8967
8968 SDLoc dl(Op);
8969
8970 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8971 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8972 TargetLowering::TypePromoteFloat);
8973
8974 SDValue VecIn = Op.getOperand(0);
8975 EVT VecVT = VecIn.getValueType();
8976 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8977 VecVT.getVectorNumElements());
8978
8979 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8980 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8981 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8982 IVecIn, IElt, Lane);
8983 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8984 }
8985
8986 return Op;
8987}
8988
8989static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8990 const ARMSubtarget *ST) {
8991 EVT VecVT = Op.getOperand(0).getValueType();
8992 SDLoc dl(Op);
8993
8994 assert(ST->hasMVEIntegerOps() &&
8995 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8996
8997 SDValue Conv =
8998 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8999 unsigned Lane = Op.getConstantOperandVal(1);
9000 unsigned LaneWidth =
9001 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9002 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9003 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9004 return Shift;
9005}
9006
9007static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9008 const ARMSubtarget *ST) {
9009 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9010 SDValue Lane = Op.getOperand(1);
9011 if (!isa<ConstantSDNode>(Lane))
9012 return SDValue();
9013
9014 SDValue Vec = Op.getOperand(0);
9015 EVT VT = Vec.getValueType();
9016
9017 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9018 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9019
9020 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9021 SDLoc dl(Op);
9022 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9023 }
9024
9025 return Op;
9026}
9027
9028static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9029 const ARMSubtarget *ST) {
9030 SDLoc dl(Op);
9031 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9032 "Unexpected custom CONCAT_VECTORS lowering");
9033 assert(isPowerOf2_32(Op.getNumOperands()) &&
9034 "Unexpected custom CONCAT_VECTORS lowering");
9035 assert(ST->hasMVEIntegerOps() &&
9036 "CONCAT_VECTORS lowering only supported for MVE");
9037
9038 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9039 EVT Op1VT = V1.getValueType();
9040 EVT Op2VT = V2.getValueType();
9041 assert(Op1VT == Op2VT && "Operand types don't match!");
9042 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9043 "Unexpected i1 concat operations!");
9044 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9045
9046 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9047 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9048
9049 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9050 // promoted to v8i16, etc.
9051 MVT ElType =
9052 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9053 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9054
9055 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9056 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9057 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9058 // ConcatVT.
9059 SDValue ConVec =
9060 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9061 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9062 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9063 }
9064
9065 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9066 // to be the right size for the destination. For example, if Op1 is v4i1
9067 // then the promoted vector is v4i32. The result of concatenation gives a
9068 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9069 // needs truncating to i16 and inserting in the result.
9070 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9071 EVT NewVT = NewV.getValueType();
9072 EVT ConcatVT = ConVec.getValueType();
9073 unsigned ExtScale = 1;
9074 if (NewVT == MVT::v2f64) {
9075 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9076 ExtScale = 2;
9077 }
9078 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9079 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9080 DAG.getIntPtrConstant(i * ExtScale, dl));
9081 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9082 DAG.getConstant(j, dl, MVT::i32));
9083 }
9084 return ConVec;
9085 };
9086 unsigned j = 0;
9087 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9088 ConVec = ExtractInto(NewV1, ConVec, j);
9089 ConVec = ExtractInto(NewV2, ConVec, j);
9090
9091 // Now return the result of comparing the subvector with zero, which will
9092 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9093 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9094 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9095 };
9096
9097 // Concat each pair of subvectors and pack into the lower half of the array.
9098 SmallVector<SDValue> ConcatOps(Op->ops());
9099 while (ConcatOps.size() > 1) {
9100 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9101 SDValue V1 = ConcatOps[I];
9102 SDValue V2 = ConcatOps[I + 1];
9103 ConcatOps[I / 2] = ConcatPair(V1, V2);
9104 }
9105 ConcatOps.resize(ConcatOps.size() / 2);
9106 }
9107 return ConcatOps[0];
9108}
9109
9110static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9111 const ARMSubtarget *ST) {
9112 EVT VT = Op->getValueType(0);
9113 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9114 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9115
9116 // The only time a CONCAT_VECTORS operation can have legal types is when
9117 // two 64-bit vectors are concatenated to a 128-bit vector.
9118 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9119 "unexpected CONCAT_VECTORS");
9120 SDLoc dl(Op);
9121 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9122 SDValue Op0 = Op.getOperand(0);
9123 SDValue Op1 = Op.getOperand(1);
9124 if (!Op0.isUndef())
9125 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9126 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9127 DAG.getIntPtrConstant(0, dl));
9128 if (!Op1.isUndef())
9129 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9130 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9131 DAG.getIntPtrConstant(1, dl));
9132 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9133}
9134
9135static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9136 const ARMSubtarget *ST) {
9137 SDValue V1 = Op.getOperand(0);
9138 SDValue V2 = Op.getOperand(1);
9139 SDLoc dl(Op);
9140 EVT VT = Op.getValueType();
9141 EVT Op1VT = V1.getValueType();
9142 unsigned NumElts = VT.getVectorNumElements();
9143 unsigned Index = V2->getAsZExtVal();
9144
9145 assert(VT.getScalarSizeInBits() == 1 &&
9146 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9147 assert(ST->hasMVEIntegerOps() &&
9148 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9149
9150 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9151
9152 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9153 // promoted to v8i16, etc.
9154
9155 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9156
9157 if (NumElts == 2) {
9158 EVT SubVT = MVT::v4i32;
9159 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9160 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9161 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9162 DAG.getIntPtrConstant(i, dl));
9163 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9164 DAG.getConstant(j, dl, MVT::i32));
9165 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9166 DAG.getConstant(j + 1, dl, MVT::i32));
9167 }
9168 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9169 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9170 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9171 }
9172
9173 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9174 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9175 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9176 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9177 DAG.getIntPtrConstant(i, dl));
9178 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9179 DAG.getConstant(j, dl, MVT::i32));
9180 }
9181
9182 // Now return the result of comparing the subvector with zero,
9183 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9184 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9185 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9186}
9187
9188// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9189static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9190 const ARMSubtarget *ST) {
9191 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9192 EVT VT = N->getValueType(0);
9193 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9194 "Expected a vector i1 type!");
9195 SDValue Op = N->getOperand(0);
9196 EVT FromVT = Op.getValueType();
9197 SDLoc DL(N);
9198
9199 SDValue And =
9200 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9201 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9202 DAG.getCondCode(ISD::SETNE));
9203}
9204
9205static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9206 const ARMSubtarget *Subtarget) {
9207 if (!Subtarget->hasMVEIntegerOps())
9208 return SDValue();
9209
9210 EVT ToVT = N->getValueType(0);
9211 if (ToVT.getScalarType() == MVT::i1)
9212 return LowerTruncatei1(N, DAG, Subtarget);
9213
9214 // MVE does not have a single instruction to perform the truncation of a v4i32
9215 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9216 // Most of the instructions in MVE follow the 'Beats' system, where moving
9217 // values from different lanes is usually something that the instructions
9218 // avoid.
9219 //
9220 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9221 // which take the top/bottom half of a larger lane and extend it (or do the
9222 // opposite, truncating into the top/bottom lane from a larger lane). Note
9223 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9224 // bottom 16bits from each vector lane. This works really well with T/B
9225 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9226 // to be reordered.
9227 //
9228 // But truncates and sext/zext are always going to be fairly common from llvm.
9229 // We have several options for how to deal with them:
9230 // - Wherever possible combine them into an instruction that makes them
9231 // "free". This includes loads/stores, which can perform the trunc as part
9232 // of the memory operation. Or certain shuffles that can be turned into
9233 // VMOVN/VMOVL.
9234 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9235 // trunc(mul(sext(a), sext(b))) may become
9236 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9237 // this case can use VMULL). This is performed in the
9238 // MVELaneInterleavingPass.
9239 // - Otherwise we have an option. By default we would expand the
9240 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9241 // registers. One for each vector lane in the vector. This can obviously be
9242 // very expensive.
9243 // - The other option is to use the fact that loads/store can extend/truncate
9244 // to turn a trunc into two truncating stack stores and a stack reload. This
9245 // becomes 3 back-to-back memory operations, but at least that is less than
9246 // all the insert/extracts.
9247 //
9248 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9249 // are either optimized where they can be, or eventually lowered into stack
9250 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9251 // too early, where other instructions would be better, and stops us from
9252 // having to reconstruct multiple buildvector shuffles into loads/stores.
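// For example, a v8i32 -> v8i16 truncate below is split into its two v4i32
// halves and wrapped in a single ARMISD::MVETRUNC(Lo, Hi), which later
// combines either fold away or lower to the stack store/reload sequence.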
9253 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9254 return SDValue();
9255 EVT FromVT = N->getOperand(0).getValueType();
9256 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9257 return SDValue();
9258
9259 SDValue Lo, Hi;
9260 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9261 SDLoc DL(N);
9262 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9263}
9264
9265static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9266 const ARMSubtarget *Subtarget) {
9267 if (!Subtarget->hasMVEIntegerOps())
9268 return SDValue();
9269
9270 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9271
9272 EVT ToVT = N->getValueType(0);
9273 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9274 return SDValue();
9275 SDValue Op = N->getOperand(0);
9276 EVT FromVT = Op.getValueType();
9277 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9278 return SDValue();
9279
9280 SDLoc DL(N);
9281 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9282 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9283 ExtVT = MVT::v8i16;
9284
9285 unsigned Opcode =
9286 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9287 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9288 SDValue Ext1 = Ext.getValue(1);
9289
9290 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9291 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9292 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9293 }
9294
9295 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9296}
9297
9298/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9299/// element has been zero/sign-extended, depending on the isSigned parameter,
9300/// from an integer type half its size.
9301static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9302 bool isSigned) {
9303 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9304 EVT VT = N->getValueType(0);
9305 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9306 SDNode *BVN = N->getOperand(0).getNode();
9307 if (BVN->getValueType(0) != MVT::v4i32 ||
9308 BVN->getOpcode() != ISD::BUILD_VECTOR)
9309 return false;
9310 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9311 unsigned HiElt = 1 - LoElt;
9312 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9313 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9314 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9315 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9316 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9317 return false;
9318 if (isSigned) {
9319 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9320 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9321 return true;
9322 } else {
9323 if (Hi0->isZero() && Hi1->isZero())
9324 return true;
9325 }
9326 return false;
9327 }
9328
9329 if (N->getOpcode() != ISD::BUILD_VECTOR)
9330 return false;
9331
9332 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9333 SDNode *Elt = N->getOperand(i).getNode();
9334 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9335 unsigned EltSize = VT.getScalarSizeInBits();
9336 unsigned HalfSize = EltSize / 2;
9337 if (isSigned) {
9338 if (!isIntN(HalfSize, C->getSExtValue()))
9339 return false;
9340 } else {
9341 if (!isUIntN(HalfSize, C->getZExtValue()))
9342 return false;
9343 }
9344 continue;
9345 }
9346 return false;
9347 }
9348
9349 return true;
9350}
9351
9352/// isSignExtended - Check if a node is a vector value that is sign-extended
9353/// or a constant BUILD_VECTOR with sign-extended elements.
9354static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9355 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9356 return true;
9357 if (isExtendedBUILD_VECTOR(N, DAG, true))
9358 return true;
9359 return false;
9360}
9361
9362/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9363/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9364static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9365 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9366 ISD::isZEXTLoad(N))
9367 return true;
9368 if (isExtendedBUILD_VECTOR(N, DAG, false))
9369 return true;
9370 return false;
9371}
9372
9373static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9374 if (OrigVT.getSizeInBits() >= 64)
9375 return OrigVT;
9376
9377 assert(OrigVT.isSimple() && "Expecting a simple value type");
9378
9379 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9380 switch (OrigSimpleTy) {
9381 default: llvm_unreachable("Unexpected Vector Type");
9382 case MVT::v2i8:
9383 case MVT::v2i16:
9384 return MVT::v2i32;
9385 case MVT::v4i8:
9386 return MVT::v4i16;
9387 }
9388}
9389
9390/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9391/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9392/// We insert the required extension here to get the vector to fill a D register.
9393static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9394 const EVT &OrigTy,
9395 const EVT &ExtTy,
9396 unsigned ExtOpcode) {
9397 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9398 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9399 // 64-bits we need to insert a new extension so that it will be 64-bits.
9400 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9401 if (OrigTy.getSizeInBits() >= 64)
9402 return N;
9403
9404 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9405 EVT NewVT = getExtensionTo64Bits(OrigTy);
9406
9407 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9408}
9409
9410/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9411/// does not do any sign/zero extension. If the original vector is less
9412/// than 64 bits, an appropriate extension will be added after the load to
9413/// reach a total size of 64 bits. We have to add the extension separately
9414/// because ARM does not have a sign/zero extending load for vectors.
9415static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9416 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9417
9418 // The load already has the right type.
9419 if (ExtendedTy == LD->getMemoryVT())
9420 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9421 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9422 LD->getMemOperand()->getFlags());
9423
9424 // We need to create a zextload/sextload. We cannot just create a load
9425 // followed by a zext/zext node because LowerMUL is also run during normal
9426 // operation legalization where we can't create illegal types.
9427 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9428 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9429 LD->getMemoryVT(), LD->getAlign(),
9430 LD->getMemOperand()->getFlags());
9431}
9432
9433/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9434/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9435/// the unextended value. The unextended vector should be 64 bits so that it can
9436/// be used as an operand to a VMULL instruction. If the original vector size
9437/// before extension is less than 64 bits we add an extension to resize
9438/// the vector to 64 bits.
9439static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9440 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9441 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9442 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9443 N->getOperand(0)->getValueType(0),
9444 N->getValueType(0),
9445 N->getOpcode());
9446
9447 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9448 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9449 "Expected extending load");
9450
9451 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9452 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9453 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9454 SDValue extLoad =
9455 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9456 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9457
9458 return newLoad;
9459 }
9460
9461 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9462 // have been legalized as a BITCAST from v4i32.
9463 if (N->getOpcode() == ISD::BITCAST) {
9464 SDNode *BVN = N->getOperand(0).getNode();
9465 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9466 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9467 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9468 return DAG.getBuildVector(
9469 MVT::v2i32, SDLoc(N),
9470 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9471 }
9472 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9473 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9474 EVT VT = N->getValueType(0);
9475 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9476 unsigned NumElts = VT.getVectorNumElements();
9477 MVT TruncVT = MVT::getIntegerVT(EltSize);
9478 SmallVector<SDValue, 8> Ops;
9479 SDLoc dl(N);
9480 for (unsigned i = 0; i != NumElts; ++i) {
9481 const APInt &CInt = N->getConstantOperandAPInt(i);
9482 // Element types smaller than 32 bits are not legal, so use i32 elements.
9483 // The values are implicitly truncated so sext vs. zext doesn't matter.
9484 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9485 }
9486 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9487}
9488
9489static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9490 unsigned Opcode = N->getOpcode();
9491 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9492 SDNode *N0 = N->getOperand(0).getNode();
9493 SDNode *N1 = N->getOperand(1).getNode();
9494 return N0->hasOneUse() && N1->hasOneUse() &&
9495 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9496 }
9497 return false;
9498}
9499
9500static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9501 unsigned Opcode = N->getOpcode();
9502 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9503 SDNode *N0 = N->getOperand(0).getNode();
9504 SDNode *N1 = N->getOperand(1).getNode();
9505 return N0->hasOneUse() && N1->hasOneUse() &&
9506 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9507 }
9508 return false;
9509}
9510
9511static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9512 // Multiplications are only custom-lowered for 128-bit vectors so that
9513 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9514 EVT VT = Op.getValueType();
9515 assert(VT.is128BitVector() && VT.isInteger() &&
9516 "unexpected type for custom-lowering ISD::MUL");
9517 SDNode *N0 = Op.getOperand(0).getNode();
9518 SDNode *N1 = Op.getOperand(1).getNode();
9519 unsigned NewOpc = 0;
9520 bool isMLA = false;
9521 bool isN0SExt = isSignExtended(N0, DAG);
9522 bool isN1SExt = isSignExtended(N1, DAG);
9523 if (isN0SExt && isN1SExt)
9524 NewOpc = ARMISD::VMULLs;
9525 else {
9526 bool isN0ZExt = isZeroExtended(N0, DAG);
9527 bool isN1ZExt = isZeroExtended(N1, DAG);
9528 if (isN0ZExt && isN1ZExt)
9529 NewOpc = ARMISD::VMULLu;
9530 else if (isN1SExt || isN1ZExt) {
9531 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9532 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9533 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9534 NewOpc = ARMISD::VMULLs;
9535 isMLA = true;
9536 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9537 NewOpc = ARMISD::VMULLu;
9538 isMLA = true;
9539 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9540 std::swap(N0, N1);
9541 NewOpc = ARMISD::VMULLu;
9542 isMLA = true;
9543 }
9544 }
9545
9546 if (!NewOpc) {
9547 if (VT == MVT::v2i64)
9548 // Fall through to expand this. It is not legal.
9549 return SDValue();
9550 else
9551 // Other vector multiplications are legal.
9552 return Op;
9553 }
9554 }
9555
9556 // Legalize to a VMULL instruction.
9557 SDLoc DL(Op);
9558 SDValue Op0;
9559 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9560 if (!isMLA) {
9561 Op0 = SkipExtensionForVMULL(N0, DAG);
9562 assert(Op0.getValueType().is64BitVector() &&
9563 Op1.getValueType().is64BitVector() &&
9564 "unexpected types for extended operands to VMULL");
9565 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9566 }
9567
9568 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9569 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9570 // vmull q0, d4, d6
9571 // vmlal q0, d5, d6
9572 // is faster than
9573 // vaddl q0, d4, d5
9574 // vmovl q1, d6
9575 // vmul q0, q0, q1
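// Roughly, for (sext(a) + sext(b)) * sext(c): N00 and N01 below are the
// unextended a and b, Op1 is the unextended c, and the result is
// add(VMULLs(a, c), VMULLs(b, c)) (or the matching sub/VMULLu form).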
9576 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9577 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9578 EVT Op1VT = Op1.getValueType();
9579 return DAG.getNode(N0->getOpcode(), DL, VT,
9580 DAG.getNode(NewOpc, DL, VT,
9581 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9582 DAG.getNode(NewOpc, DL, VT,
9583 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9584}
9585
9586static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9587 SelectionDAG &DAG) {
9588 // TODO: Should this propagate fast-math-flags?
9589
9590 // Convert to float
9591 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9592 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9593 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9594 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9595 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9596 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9597 // Get reciprocal estimate.
9598 // float4 recip = vrecpeq_f32(yf);
9599 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9600 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9601 Y);
9602 // Because char has a smaller range than uchar, we can actually get away
9603 // without any newton steps. This requires that we use a weird bias
9604 // of 0xb000, however (again, this has been exhaustively tested).
9605 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9606 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9607 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9608 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9609 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9610 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9611 // Convert back to short.
9612 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9613 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9614 return X;
9615}
9616
9617static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9618 SelectionDAG &DAG) {
9619 // TODO: Should this propagate fast-math-flags?
9620
9621 SDValue N2;
9622 // Convert to float.
9623 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9624 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9625 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9626 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9627 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9628 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9629
9630 // Use reciprocal estimate and one refinement step.
9631 // float4 recip = vrecpeq_f32(yf);
9632 // recip *= vrecpsq_f32(yf, recip);
9633 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9634 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9635 N1);
9636 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9637 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9638 N1, N2);
9639 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9640 // Because short has a smaller range than ushort, we can actually get away
9641 // with only a single newton step. This requires that we use a weird bias
9642 // of 89, however (again, this has been exhaustively tested).
9643 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9644 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9645 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9646 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9647 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9648 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9649 // Convert back to integer and return.
9650 // return vmovn_s32(vcvt_s32_f32(result));
9651 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9652 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9653 return N0;
9654}
9655
9656static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9657 const ARMSubtarget *ST) {
9658 EVT VT = Op.getValueType();
9659 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9660 "unexpected type for custom-lowering ISD::SDIV");
9661
9662 SDLoc dl(Op);
9663 SDValue N0 = Op.getOperand(0);
9664 SDValue N1 = Op.getOperand(1);
9665 SDValue N2, N3;
9666
9667 if (VT == MVT::v8i8) {
9668 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9669 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9670
9671 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9672 DAG.getIntPtrConstant(4, dl));
9673 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9674 DAG.getIntPtrConstant(4, dl));
9675 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9676 DAG.getIntPtrConstant(0, dl));
9677 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9678 DAG.getIntPtrConstant(0, dl));
9679
9680 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9681 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9682
9683 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9684 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9685
9686 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9687 return N0;
9688 }
9689 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9690}
9691
9692static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9693 const ARMSubtarget *ST) {
9694 // TODO: Should this propagate fast-math-flags?
9695 EVT VT = Op.getValueType();
9696 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9697 "unexpected type for custom-lowering ISD::UDIV");
9698
9699 SDLoc dl(Op);
9700 SDValue N0 = Op.getOperand(0);
9701 SDValue N1 = Op.getOperand(1);
9702 SDValue N2, N3;
9703
9704 if (VT == MVT::v8i8) {
9705 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9706 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9707
9708 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9709 DAG.getIntPtrConstant(4, dl));
9710 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9711 DAG.getIntPtrConstant(4, dl));
9712 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9713 DAG.getIntPtrConstant(0, dl));
9714 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9715 DAG.getIntPtrConstant(0, dl));
9716
9717 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9718 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9719
9720 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9721 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9722
9723 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9724 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9725 MVT::i32),
9726 N0);
9727 return N0;
9728 }
9729
9730 // v4i16 sdiv ... Convert to float.
9731 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9732 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9733 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9734 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9735 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9736 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9737
9738 // Use reciprocal estimate and two refinement steps.
9739 // float4 recip = vrecpeq_f32(yf);
9740 // recip *= vrecpsq_f32(yf, recip);
9741 // recip *= vrecpsq_f32(yf, recip);
9742 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9743 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9744 BN1);
9745 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9746 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9747 BN1, N2);
9748 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9749 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9750 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9751 BN1, N2);
9752 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9753 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9754 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9755 // and that it will never cause us to return an answer too large).
9756 // float4 result = as_float4(as_int4(xf*recip) + 2);
9757 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9758 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9759 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9760 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9761 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9762 // Convert back to integer and return.
9763 // return vmovn_u32(vcvt_s32_f32(result));
9764 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9765 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9766 return N0;
9767}
9768
9769static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9770 SDNode *N = Op.getNode();
9771 EVT VT = N->getValueType(0);
9772 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9773
9774 SDValue Carry = Op.getOperand(2);
9775
9776 SDLoc DL(Op);
9777
9778 SDValue Result;
9779 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9780 // This converts the boolean value carry into the carry flag.
9781 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9782
9783 // Do the addition proper using the carry flag we wanted.
9784 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9785 Op.getOperand(1), Carry);
9786
9787 // Now convert the carry flag into a boolean value.
9788 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9789 } else {
9790 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9791 // have to invert the carry first.
9792 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9793 DAG.getConstant(1, DL, MVT::i32), Carry);
9794 // This converts the boolean value carry into the carry flag.
9795 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9796
9797 // Do the subtraction proper using the carry flag we wanted.
9798 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9799 Op.getOperand(1), Carry);
9800
9801 // Now convert the carry flag into a boolean value.
9802 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9803 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9804 // by ISD::USUBO_CARRY, so compute 1 - C.
9805 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9806 DAG.getConstant(1, DL, MVT::i32), Carry);
9807 }
9808
9809 // Return both values.
9810 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9811}
9812
9813SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9814 assert(Subtarget->isTargetDarwin());
9815
9816 // For iOS, we want to call an alternative entry point: __sincos_stret,
9817 // return values are passed via sret.
9818 SDLoc dl(Op);
9819 SDValue Arg = Op.getOperand(0);
9820 EVT ArgVT = Arg.getValueType();
9821 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9822 auto PtrVT = getPointerTy(DAG.getDataLayout());
9823
9824 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9825
9826 // Pair of floats / doubles used to pass the result.
9827 Type *RetTy = StructType::get(ArgTy, ArgTy);
9828 auto &DL = DAG.getDataLayout();
9829
9830 ArgListTy Args;
9831 bool ShouldUseSRet = getTM().isAPCS_ABI();
9832 SDValue SRet;
9833 if (ShouldUseSRet) {
9834 // Create stack object for sret.
9835 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9836 const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9837 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9838 SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
9839
9840 ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
9841 Entry.IsSExt = false;
9842 Entry.IsZExt = false;
9843 Entry.IsSRet = true;
9844 Args.push_back(Entry);
9845 RetTy = Type::getVoidTy(*DAG.getContext());
9846 }
9847
9848 Args.emplace_back(Arg, ArgTy);
9849
9850 RTLIB::Libcall LC =
9851 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9852 const char *LibcallName = getLibcallName(LC);
9853 CallingConv::ID CC = getLibcallCallingConv(LC);
9854 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9855
9856 TargetLowering::CallLoweringInfo CLI(DAG);
9857 CLI.setDebugLoc(dl)
9858 .setChain(DAG.getEntryNode())
9859 .setCallee(CC, RetTy, Callee, std::move(Args))
9860 .setDiscardResult(ShouldUseSRet);
9861 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9862
9863 if (!ShouldUseSRet)
9864 return CallResult.first;
9865
9866 SDValue LoadSin =
9867 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9868
9869 // Address of cos field.
9870 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9871 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9872 SDValue LoadCos =
9873 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9874
9875 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9876 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9877 LoadSin.getValue(0), LoadCos.getValue(0));
9878}
9879
9880SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9881 bool Signed,
9882 SDValue &Chain) const {
9883 EVT VT = Op.getValueType();
9884 assert((VT == MVT::i32 || VT == MVT::i64) &&
9885 "unexpected type for custom lowering DIV");
9886 SDLoc dl(Op);
9887
9888 const auto &DL = DAG.getDataLayout();
9889 RTLIB::Libcall LC;
9890 if (Signed)
9891 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9892 else
9893 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9894
9895 const char *Name = getLibcallName(LC);
9896 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9897
9898 ARMTargetLowering::ArgListTy Args;
9899
9900 for (auto AI : {1, 0}) {
9901 SDValue Operand = Op.getOperand(AI);
9902 Args.emplace_back(Operand,
9903 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9904 }
9905
9906 CallLoweringInfo CLI(DAG);
9907 CLI.setDebugLoc(dl)
9908 .setChain(Chain)
9909 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
9910 ES, std::move(Args));
9911
9912 return LowerCallTo(CLI).first;
9913}
9914
9915// This is a code size optimisation: return the original SDIV node to
9916// DAGCombiner when we don't want to expand SDIV into a sequence of
9917// instructions, and an empty node otherwise which will cause the
9918// SDIV to be expanded in DAGCombine.
9919SDValue
9920ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9921 SelectionDAG &DAG,
9922 SmallVectorImpl<SDNode *> &Created) const {
9923 // TODO: Support SREM
9924 if (N->getOpcode() != ISD::SDIV)
9925 return SDValue();
9926
9927 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9928 const bool MinSize = ST.hasMinSize();
9929 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9930 : ST.hasDivideInARMMode();
9931
9932 // Don't touch vector types; rewriting this may lead to scalarizing
9933 // the int divs.
9934 if (N->getOperand(0).getValueType().isVector())
9935 return SDValue();
9936
9937 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9938 // hwdiv support for this to be really profitable.
9939 if (!(MinSize && HasDivide))
9940 return SDValue();
9941
9942 // ARM mode is a bit simpler than Thumb: we can handle large power
9943 // of 2 immediates with 1 mov instruction; no further checks required,
9944 // just return the sdiv node.
9945 if (!ST.isThumb())
9946 return SDValue(N, 0);
9947
9948 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9949 // and thus lose the code size benefits of a MOVS that requires only 2.
9950 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9951 // but as it's doing exactly this, it's not worth the trouble to get TTI.
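 // For example, a divide by 8 keeps the SDIV node here (a 2-byte MOVS #8 plus
 // SDIV), while a divide by 1024 returns an empty SDValue so DAGCombine expands
 // it into the generic shift-based sequence instead.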
9952 if (Divisor.sgt(128))
9953 return SDValue();
9954
9955 return SDValue(N, 0);
9956}
9957
9958SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9959 bool Signed) const {
9960 assert(Op.getValueType() == MVT::i32 &&
9961 "unexpected type for custom lowering DIV");
9962 SDLoc dl(Op);
9963
9964 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9965 DAG.getEntryNode(), Op.getOperand(1));
9966
9967 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9968}
9969
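// Chain a WIN__DBZCHK divide-by-zero check of N's divisor operand ahead of the
// Windows divide libcall; an i64 divisor is split and its halves OR'd together
// so the check operates on a single i32 value.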
9970static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9971 SDLoc DL(N);
9972 SDValue Op = N->getOperand(1);
9973 if (N->getValueType(0) == MVT::i32)
9974 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9975 SDValue Lo, Hi;
9976 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9977 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9978 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9979}
9980
9981void ARMTargetLowering::ExpandDIV_Windows(
9982 SDValue Op, SelectionDAG &DAG, bool Signed,
9983 SmallVectorImpl<SDValue> &Results) const {
9984 const auto &DL = DAG.getDataLayout();
9985
9986 assert(Op.getValueType() == MVT::i64 &&
9987 "unexpected type for custom lowering DIV");
9988 SDLoc dl(Op);
9989
9990 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9991
9992 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9993
9994 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9995 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9996 DAG.getConstant(32, dl, getPointerTy(DL)));
9997 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9998
9999 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10000}
10001
10002static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
10003 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
10004 EVT MemVT = LD->getMemoryVT();
10005 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10006 MemVT == MVT::v16i1) &&
10007 "Expected a predicate type!");
10008 assert(MemVT == Op.getValueType());
10009 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10010 "Expected a non-extending load");
10011 assert(LD->isUnindexed() && "Expected a unindexed load");
10012
10013 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
10014 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10015 // need to make sure that 8/4/2 bits are actually loaded into the correct
10016 // place, which means loading the value and then shuffling the values into
10017 // the bottom bits of the predicate.
10018 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
10019 // for BE).
10020 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
10021 // a natural VMSR(load), so needs to be reversed.
10022
10023 SDLoc dl(Op);
10024 SDValue Load = DAG.getExtLoad(
10025 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10026 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10027 LD->getMemOperand());
10028 SDValue Val = Load;
10029 if (DAG.getDataLayout().isBigEndian())
10030 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10031 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10032 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10033 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10034 if (MemVT != MVT::v16i1)
10035 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10036 DAG.getConstant(0, dl, MVT::i32));
10037 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10038}
10039
10040void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10041 SelectionDAG &DAG) const {
10042 LoadSDNode *LD = cast<LoadSDNode>(N);
10043 EVT MemVT = LD->getMemoryVT();
10044 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10045
10046 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10047 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10048 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10049 SDLoc dl(N);
10050 SDValue Result = DAG.getMemIntrinsicNode(
10051 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10052 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10053 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10054 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10055 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10056 Results.append({Pair, Result.getValue(2)});
10057 }
10058}
10059
10060static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10061 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10062 EVT MemVT = ST->getMemoryVT();
10063 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10064 MemVT == MVT::v16i1) &&
10065 "Expected a predicate type!");
10066 assert(MemVT == ST->getValue().getValueType());
10067 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10068 assert(ST->isUnindexed() && "Expected a unindexed store");
10069
10070 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
10071 // top bits unset and a scalar store.
10072 SDLoc dl(Op);
10073 SDValue Build = ST->getValue();
10074 if (MemVT != MVT::v16i1) {
10075 SmallVector<SDValue, 16> Ops;
10076 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10077 unsigned Elt = DAG.getDataLayout().isBigEndian()
10078 ? MemVT.getVectorNumElements() - I - 1
10079 : I;
10080 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10081 DAG.getConstant(Elt, dl, MVT::i32)));
10082 }
10083 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10084 Ops.push_back(DAG.getUNDEF(MVT::i32));
10085 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10086 }
10087 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10088 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10089 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10090 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10091 DAG.getConstant(16, dl, MVT::i32));
10092 return DAG.getTruncStore(
10093 ST->getChain(), dl, GRP, ST->getBasePtr(),
10094 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10095 ST->getMemOperand());
10096}
10097
10098static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10099 const ARMSubtarget *Subtarget) {
10100 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10101 EVT MemVT = ST->getMemoryVT();
10102 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10103
10104 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10105 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10106 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10107 SDNode *N = Op.getNode();
10108 SDLoc dl(N);
10109
10110 SDValue Lo = DAG.getNode(
10111 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10112 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10113 MVT::i32));
10114 SDValue Hi = DAG.getNode(
10115 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10116 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10117 MVT::i32));
10118
10119 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10120 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10121 MemVT, ST->getMemOperand());
10122 } else if (Subtarget->hasMVEIntegerOps() &&
10123 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10124 MemVT == MVT::v16i1))) {
10125 return LowerPredicateStore(Op, DAG);
10126 }
10127
10128 return SDValue();
10129}
10130
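// Return true if N is a build_vector of all zeros or an ARMISD::VMOVIMM of
// immediate 0.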
10131static bool isZeroVector(SDValue N) {
10132 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10133 (N->getOpcode() == ARMISD::VMOVIMM &&
10134 isNullConstant(N->getOperand(0))));
10135}
10136
10137static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10138 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10139 MVT VT = Op.getSimpleValueType();
10140 SDValue Mask = N->getMask();
10141 SDValue PassThru = N->getPassThru();
10142 SDLoc dl(Op);
10143
10144 if (isZeroVector(PassThru))
10145 return Op;
10146
10147 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10148 // zero too, and other values are lowered to a select.
10149 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10150 DAG.getTargetConstant(0, dl, MVT::i32));
10151 SDValue NewLoad = DAG.getMaskedLoad(
10152 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10153 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10154 N->getExtensionType(), N->isExpandingLoad());
10155 SDValue Combo = NewLoad;
10156 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10157 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10158 isZeroVector(PassThru->getOperand(0));
10159 if (!PassThru.isUndef() && !PassThruIsCastZero)
10160 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10161 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10162}
10163
10164static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10165 const ARMSubtarget *ST) {
10166 if (!ST->hasMVEIntegerOps())
10167 return SDValue();
10168
10169 SDLoc dl(Op);
10170 unsigned BaseOpcode = 0;
10171 switch (Op->getOpcode()) {
10172 default: llvm_unreachable("Expected VECREDUCE opcode");
10173 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10174 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10175 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10176 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10177 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10178 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10179 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10180 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10181 }
10182
10183 SDValue Op0 = Op->getOperand(0);
10184 EVT VT = Op0.getValueType();
10185 EVT EltVT = VT.getVectorElementType();
10186 unsigned NumElts = VT.getVectorNumElements();
10187 unsigned NumActiveLanes = NumElts;
10188
10189 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10190 NumActiveLanes == 2) &&
10191 "Only expected a power 2 vector size");
10192
10193 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10194 // allows us to easily extract vector elements from the lanes.
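 // For example, a v8i16 reduction performs one VREV32 + op step (8 -> 4 active
 // lanes) and then extracts lanes 0, 2, 4 and 6 below to finish scalar-wise.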
10195 while (NumActiveLanes > 4) {
10196 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10197 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10198 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10199 NumActiveLanes /= 2;
10200 }
10201
10202 SDValue Res;
10203 if (NumActiveLanes == 4) {
10204 // The remaining 4 elements are summed sequentially
10205 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10206 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10207 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10208 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10209 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10210 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10211 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10212 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10213 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10214 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10215 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10216 } else {
10217 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10218 DAG.getConstant(0, dl, MVT::i32));
10219 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10220 DAG.getConstant(1, dl, MVT::i32));
10221 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10222 }
10223
10224 // Result type may be wider than element type.
10225 if (EltVT != Op->getValueType(0))
10226 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10227 return Res;
10228}
10229
10230static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10231 const ARMSubtarget *ST) {
10232 if (!ST->hasMVEFloatOps())
10233 return SDValue();
10234 return LowerVecReduce(Op, DAG, ST);
10235}
10236
10237static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10238 const ARMSubtarget *ST) {
10239 if (!ST->hasNEON())
10240 return SDValue();
10241
10242 SDLoc dl(Op);
10243 SDValue Op0 = Op->getOperand(0);
10244 EVT VT = Op0.getValueType();
10245 EVT EltVT = VT.getVectorElementType();
10246
10247 unsigned PairwiseIntrinsic = 0;
10248 switch (Op->getOpcode()) {
10249 default:
10250 llvm_unreachable("Expected VECREDUCE opcode");
10251 case ISD::VECREDUCE_UMIN:
10252 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10253 break;
10254 case ISD::VECREDUCE_UMAX:
10255 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10256 break;
10257 case ISD::VECREDUCE_SMIN:
10258 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10259 break;
10260 case ISD::VECREDUCE_SMAX:
10261 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10262 break;
10263 }
10264 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10265
10266 unsigned NumElts = VT.getVectorNumElements();
10267 unsigned NumActiveLanes = NumElts;
10268
10269 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10270 NumActiveLanes == 2) &&
10271 "Only expected a power 2 vector size");
10272
10273 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
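 // For example, a v4i32 VECREDUCE_UMIN becomes one VPMIN.U32 of the two halves
 // followed by a second VPMIN on the 64-bit result, then an extract of lane 0.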
10274 if (VT.is128BitVector()) {
10275 SDValue Lo, Hi;
10276 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10277 VT = Lo.getValueType();
10278 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10279 NumActiveLanes /= 2;
10280 }
10281
10282 // Use pairwise reductions until one lane remains
10283 while (NumActiveLanes > 1) {
10284 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10285 NumActiveLanes /= 2;
10286 }
10287
10288 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10289 DAG.getConstant(0, dl, MVT::i32));
10290
10291 // Result type may be wider than element type.
10292 if (EltVT != Op.getValueType()) {
10293 unsigned Extend = 0;
10294 switch (Op->getOpcode()) {
10295 default:
10296 llvm_unreachable("Expected VECREDUCE opcode");
10297 case ISD::VECREDUCE_UMIN:
10298 case ISD::VECREDUCE_UMAX:
10299 Extend = ISD::ZERO_EXTEND;
10300 break;
10301 case ISD::VECREDUCE_SMIN:
10302 case ISD::VECREDUCE_SMAX:
10303 Extend = ISD::SIGN_EXTEND;
10304 break;
10305 }
10306 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10307 }
10308 return Res;
10309}
10310
10311static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10312 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10313 // Acquire/Release load/store is not legal for targets without a dmb or
10314 // equivalent available.
10315 return SDValue();
10316
10317 // Monotonic load/store is legal for all targets.
10318 return Op;
10319}
10320
10321static void ReplaceREADCYCLECOUNTER(SDNode *N,
10322 SmallVectorImpl<SDValue> &Results,
10323 SelectionDAG &DAG,
10324 const ARMSubtarget *Subtarget) {
10325 SDLoc DL(N);
10326 // Under Power Management extensions, the cycle-count is:
10327 // mrc p15, #0, <Rt>, c9, c13, #0
10328 SDValue Ops[] = { N->getOperand(0), // Chain
10329 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10330 DAG.getTargetConstant(15, DL, MVT::i32),
10331 DAG.getTargetConstant(0, DL, MVT::i32),
10332 DAG.getTargetConstant(9, DL, MVT::i32),
10333 DAG.getTargetConstant(13, DL, MVT::i32),
10334 DAG.getTargetConstant(0, DL, MVT::i32)
10335 };
10336
10337 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10338 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10339 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10340 DAG.getConstant(0, DL, MVT::i32)));
10341 Results.push_back(Cycles32.getValue(1));
10342}
10343
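// Build an untyped GPRPair value from two i32 operands with a REG_SEQUENCE, as
// needed to feed the CMP_SWAP_64 pseudo below.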
10344static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10345 SDValue V1) {
10346 SDLoc dl(V0.getNode());
10347 SDValue RegClass =
10348 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10349 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10350 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10351 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10352 return SDValue(
10353 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10354}
10355
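// As above, but first split an i64 into its two i32 halves (swapped on
// big-endian targets).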
10356static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10357 SDLoc dl(V.getNode());
10358 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10359 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10360 if (isBigEndian)
10361 std::swap(VLo, VHi);
10362 return createGPRPairNode2xi32(DAG, VLo, VHi);
10363}
10364
10365static void ReplaceCMP_SWAP_64Results(SDNode *N,
10366 SmallVectorImpl<SDValue> &Results,
10367 SelectionDAG &DAG) {
10368 assert(N->getValueType(0) == MVT::i64 &&
10369 "AtomicCmpSwap on types less than 64 should be legal");
10370 SDValue Ops[] = {
10371 createGPRPairNode2xi32(DAG, N->getOperand(1),
10372 DAG.getUNDEF(MVT::i32)), // pointer, temp
10373 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10374 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10375 N->getOperand(0), // chain in
10376 };
10377 SDNode *CmpSwap = DAG.getMachineNode(
10378 ARM::CMP_SWAP_64, SDLoc(N),
10379 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10380
10381 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10382 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10383
10384 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10385
10386 SDValue Lo =
10387 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10388 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10389 SDValue Hi =
10390 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10391 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10392 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10393 Results.push_back(SDValue(CmpSwap, 2));
10394}
10395
10396SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10397 SDLoc dl(Op);
10398 EVT VT = Op.getValueType();
10399 SDValue Chain = Op.getOperand(0);
10400 SDValue LHS = Op.getOperand(1);
10401 SDValue RHS = Op.getOperand(2);
10402 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10403 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10404
10405 // If we don't have instructions of this float type then soften to a libcall
10406 // and use SETCC instead.
10407 if (isUnsupportedFloatingType(LHS.getValueType())) {
10408 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10409 Chain, IsSignaling);
10410 if (!RHS.getNode()) {
10411 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10412 CC = ISD::SETNE;
10413 }
10414 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10415 DAG.getCondCode(CC));
10416 return DAG.getMergeValues({Result, Chain}, dl);
10417 }
10418
10419 ARMCC::CondCodes CondCode, CondCode2;
10420 FPCCToARMCC(CC, CondCode, CondCode2);
10421
10422 SDValue True = DAG.getConstant(1, dl, VT);
10423 SDValue False = DAG.getConstant(0, dl, VT);
10424 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10425 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10426 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10427 if (CondCode2 != ARMCC::AL) {
10428 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10429 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10430 }
10431 return DAG.getMergeValues({Result, Chain}, dl);
10432}
10433
10434SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10435 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10436
10437 EVT VT = getPointerTy(DAG.getDataLayout());
10438 int FI = MFI.CreateFixedObject(4, 0, false);
10439 return DAG.getFrameIndex(FI, VT);
10440}
10441
10442SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10443 SelectionDAG &DAG) const {
10444 SDLoc DL(Op);
10445 MakeLibCallOptions CallOptions;
10446 MVT SVT = Op.getOperand(0).getSimpleValueType();
10447 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10448 SDValue Res =
10449 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10450 return DAG.getBitcast(MVT::i32, Res);
10451}
10452
10453SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10454 SDLoc dl(Op);
10455 SDValue LHS = Op.getOperand(0);
10456 SDValue RHS = Op.getOperand(1);
10457
10458 // Determine if this is signed or unsigned comparison
10459 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10460
10461 // Special case for Thumb1 UCMP only
10462 if (!IsSigned && Subtarget->isThumb1Only()) {
10463 // For Thumb unsigned comparison, use this sequence:
10464 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10465 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10466 // cmp r1, r0 ; compare RHS with LHS
10467 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10468 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
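 // For example, ucmp(1, 2): the subs gives 0xFFFFFFFF with the carry clear, so
 // the first sbc yields -1; cmp 2, 1 sets the carry, so the second sbc yields
 // 0; the final subs computes -1 - 0 = -1, the expected "less than" result.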
10469
10470 // First subtraction: LHS - RHS
10471 SDValue Sub1WithFlags = DAG.getNode(
10472 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10473 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10474 SDValue Flags1 = Sub1WithFlags.getValue(1);
10475
10476 // SUBE: Sub1Result - Sub1Result - !carry
10477 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10478 SDValue Sbc1 =
10479 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10480 Sub1Result, Sub1Result, Flags1);
10481 SDValue Sbc1Result = Sbc1.getValue(0);
10482
10483 // Second comparison: RHS vs LHS (reverse comparison)
10484 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10485
10486 // SUBE: RHS - RHS - !carry
10487 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10488 SDValue Sbc2 = DAG.getNode(
10489 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10490 SDValue Sbc2Result = Sbc2.getValue(0);
10491
10492 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10493 SDValue Result =
10494 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10495 if (Op.getValueType() != MVT::i32)
10496 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10497
10498 return Result;
10499 }
10500
10501 // For the ARM assembly pattern:
10502 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10503 // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
10504 // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10505 // ; if LHS == RHS, result remains 0 from the subs
10506
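 // For example, scmp(3, 5): the subs produces -2 with N set and V clear, so GT
 // is false and LT is true; the movgt is skipped and the mvnlt writes -1.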
10507
10508 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
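 // For example, scmp(a, sub nsw (0, b)) becomes ADDS a, b followed by the same
 // conditional moves, letting the separate negation fold away.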
10509 unsigned Opcode = ARMISD::SUBC;
10510
10511 // Check if RHS is a subtraction against 0: (0 - X)
10512 if (RHS.getOpcode() == ISD::SUB) {
10513 SDValue SubLHS = RHS.getOperand(0);
10514 SDValue SubRHS = RHS.getOperand(1);
10515
10516 // Check if it's 0 - X
10517 if (isNullConstant(SubLHS)) {
10518 bool CanUseAdd = false;
10519 if (IsSigned) {
10520 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10521 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10522 .getSignedMinValue()
10523 .isMinSignedValue()) {
10524 CanUseAdd = true;
10525 }
10526 } else {
10527 // For UCMP: only if X is known to never be zero
10528 if (DAG.isKnownNeverZero(SubRHS)) {
10529 CanUseAdd = true;
10530 }
10531 }
10532
10533 if (CanUseAdd) {
10534 Opcode = ARMISD::ADDC;
10535 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10536 // LHS - (0 - X)
10537 }
10538 }
10539 }
10540
10541 // Generate the operation with flags
10542 SDValue OpWithFlags;
10543 if (Opcode == ARMISD::ADDC) {
10544 // Use ADDC: LHS + RHS (where RHS was 0 - X, now X)
10545 OpWithFlags = DAG.getNode(ARMISD::ADDC, dl,
10546 DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10547 } else {
10548 // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
10549 OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
10550 DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10551 }
10552
10553 SDValue OpResult = OpWithFlags.getValue(0); // The operation result
10554 SDValue Flags = OpWithFlags.getValue(1); // The flags
10555
10556 // Constants for conditional moves
10557 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10558 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10559
10560 // Select condition codes based on signed vs unsigned
10561 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10562 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10563
10564 // First conditional move: if greater than, set to 1
10565 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10566 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10567 GTCondValue, Flags);
10568
10569 // Second conditional move: if less than, set to -1
10570 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10571 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10572 LTCondValue, Flags);
10573
10574 if (Op.getValueType() != MVT::i32)
10575 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10576
10577 return Result2;
10578}
10579
10581 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10582 switch (Op.getOpcode()) {
10583 default: llvm_unreachable("Don't know how to custom lower this!");
10584 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10585 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10586 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10587 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10588 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10589 case ISD::SELECT: return LowerSELECT(Op, DAG);
10590 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10591 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10592 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10593 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10594 case ISD::VASTART: return LowerVASTART(Op, DAG);
10595 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10596 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10597 case ISD::SINT_TO_FP:
10598 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10601 case ISD::FP_TO_SINT:
10602 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10603 case ISD::FP_TO_SINT_SAT:
10604 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10605 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10606 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10607 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10608 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10609 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10610 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10611 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10612 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10613 Subtarget);
10614 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10615 case ISD::SHL:
10616 case ISD::SRL:
10617 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10618 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10619 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10620 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10621 case ISD::SRL_PARTS:
10622 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10623 case ISD::CTTZ:
10624 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10625 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10626 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10627 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10628 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10629 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10630 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10631 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10632 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10633 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10634 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10635 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10636 case ISD::SIGN_EXTEND:
10637 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10638 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10639 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10640 case ISD::SET_FPMODE:
10641 return LowerSET_FPMODE(Op, DAG);
10642 case ISD::RESET_FPMODE:
10643 return LowerRESET_FPMODE(Op, DAG);
10644 case ISD::MUL: return LowerMUL(Op, DAG);
10645 case ISD::SDIV:
10646 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10647 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10648 return LowerSDIV(Op, DAG, Subtarget);
10649 case ISD::UDIV:
10650 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10651 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10652 return LowerUDIV(Op, DAG, Subtarget);
10653 case ISD::UADDO_CARRY:
10654 case ISD::USUBO_CARRY:
10655 return LowerUADDSUBO_CARRY(Op, DAG);
10656 case ISD::SADDO:
10657 case ISD::SSUBO:
10658 return LowerSignedALUO(Op, DAG);
10659 case ISD::UADDO:
10660 case ISD::USUBO:
10661 return LowerUnsignedALUO(Op, DAG);
10662 case ISD::SADDSAT:
10663 case ISD::SSUBSAT:
10664 case ISD::UADDSAT:
10665 case ISD::USUBSAT:
10666 return LowerADDSUBSAT(Op, DAG, Subtarget);
10667 case ISD::LOAD:
10668 return LowerPredicateLoad(Op, DAG);
10669 case ISD::STORE:
10670 return LowerSTORE(Op, DAG, Subtarget);
10671 case ISD::MLOAD:
10672 return LowerMLOAD(Op, DAG);
10673 case ISD::VECREDUCE_MUL:
10674 case ISD::VECREDUCE_AND:
10675 case ISD::VECREDUCE_OR:
10676 case ISD::VECREDUCE_XOR:
10677 return LowerVecReduce(Op, DAG, Subtarget);
10678 case ISD::VECREDUCE_FADD:
10679 case ISD::VECREDUCE_FMUL:
10680 case ISD::VECREDUCE_FMIN:
10681 case ISD::VECREDUCE_FMAX:
10682 return LowerVecReduceF(Op, DAG, Subtarget);
10683 case ISD::VECREDUCE_UMIN:
10684 case ISD::VECREDUCE_UMAX:
10685 case ISD::VECREDUCE_SMIN:
10686 case ISD::VECREDUCE_SMAX:
10687 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10688 case ISD::ATOMIC_LOAD:
10689 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10690 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10691 case ISD::SDIVREM:
10692 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10693 case ISD::DYNAMIC_STACKALLOC:
10694 if (Subtarget->isTargetWindows())
10695 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10696 llvm_unreachable("Don't know how to custom lower this!");
10697 case ISD::STRICT_FP_ROUND:
10698 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10699 case ISD::STRICT_FP_EXTEND:
10700 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10701 case ISD::STRICT_FSETCC:
10702 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10703 case ISD::SPONENTRY:
10704 return LowerSPONENTRY(Op, DAG);
10705 case ISD::FP_TO_BF16:
10706 return LowerFP_TO_BF16(Op, DAG);
10707 case ARMISD::WIN__DBZCHK: return SDValue();
10708 case ISD::UCMP:
10709 case ISD::SCMP:
10710 return LowerCMP(Op, DAG);
10711 }
10712}
10713
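// Expand a 64-bit arm.smlald/smlaldx/smlsld/smlsldx intrinsic into the
// corresponding ARMISD long multiply-accumulate node and repack its two i32
// results into an i64.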
10714static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10715 SelectionDAG &DAG) {
10716 unsigned IntNo = N->getConstantOperandVal(0);
10717 unsigned Opc = 0;
10718 if (IntNo == Intrinsic::arm_smlald)
10719 Opc = ARMISD::SMLALD;
10720 else if (IntNo == Intrinsic::arm_smlaldx)
10721 Opc = ARMISD::SMLALDX;
10722 else if (IntNo == Intrinsic::arm_smlsld)
10723 Opc = ARMISD::SMLSLD;
10724 else if (IntNo == Intrinsic::arm_smlsldx)
10725 Opc = ARMISD::SMLSLDX;
10726 else
10727 return;
10728
10729 SDLoc dl(N);
10730 SDValue Lo, Hi;
10731 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10732
10733 SDValue LongMul = DAG.getNode(Opc, dl,
10734 DAG.getVTList(MVT::i32, MVT::i32),
10735 N->getOperand(1), N->getOperand(2),
10736 Lo, Hi);
10737 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10738 LongMul.getValue(0), LongMul.getValue(1)));
10739}
10740
10741/// ReplaceNodeResults - Replace the results of node with an illegal result
10742/// type with new values built out of custom code.
10743void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10744 SmallVectorImpl<SDValue> &Results,
10745 SelectionDAG &DAG) const {
10746 SDValue Res;
10747 switch (N->getOpcode()) {
10748 default:
10749 llvm_unreachable("Don't know how to custom expand this!");
10750 case ISD::READ_REGISTER:
10751 ExpandREAD_REGISTER(N, Results, DAG);
10752 break;
10753 case ISD::BITCAST:
10754 Res = ExpandBITCAST(N, DAG, Subtarget);
10755 break;
10756 case ISD::SRL:
10757 case ISD::SRA:
10758 case ISD::SHL:
10759 Res = Expand64BitShift(N, DAG, Subtarget);
10760 break;
10761 case ISD::SREM:
10762 case ISD::UREM:
10763 Res = LowerREM(N, DAG);
10764 break;
10765 case ISD::SDIVREM:
10766 case ISD::UDIVREM:
10767 Res = LowerDivRem(SDValue(N, 0), DAG);
10768 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10769 Results.push_back(Res.getValue(0));
10770 Results.push_back(Res.getValue(1));
10771 return;
10772 case ISD::SADDSAT:
10773 case ISD::SSUBSAT:
10774 case ISD::UADDSAT:
10775 case ISD::USUBSAT:
10776 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10777 break;
10778 case ISD::READCYCLECOUNTER:
10779 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10780 return;
10781 case ISD::UDIV:
10782 case ISD::SDIV:
10783 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10784 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10785 Results);
10786 case ISD::ATOMIC_CMP_SWAP:
10787 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10788 return;
10789 case ISD::INTRINSIC_WO_CHAIN:
10790 return ReplaceLongIntrinsic(N, Results, DAG);
10791 case ISD::LOAD:
10792 LowerLOAD(N, Results, DAG);
10793 break;
10794 case ISD::TRUNCATE:
10795 Res = LowerTruncate(N, DAG, Subtarget);
10796 break;
10797 case ISD::SIGN_EXTEND:
10798 case ISD::ZERO_EXTEND:
10799 Res = LowerVectorExtend(N, DAG, Subtarget);
10800 break;
10801 case ISD::FP_TO_SINT_SAT:
10802 case ISD::FP_TO_UINT_SAT:
10803 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10804 break;
10805 }
10806 if (Res.getNode())
10807 Results.push_back(Res);
10808}
10809
10810//===----------------------------------------------------------------------===//
10811// ARM Scheduler Hooks
10812//===----------------------------------------------------------------------===//
10813
10814/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10815/// registers the function context.
10816void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10817 MachineBasicBlock *MBB,
10818 MachineBasicBlock *DispatchBB,
10819 int FI) const {
10820 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10821 "ROPI/RWPI not currently supported with SjLj");
10822 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10823 DebugLoc dl = MI.getDebugLoc();
10824 MachineFunction *MF = MBB->getParent();
10825 MachineRegisterInfo *MRI = &MF->getRegInfo();
10826 MachineConstantPool *MCP = MF->getConstantPool();
10827 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10828 const Function &F = MF->getFunction();
10829
10830 bool isThumb = Subtarget->isThumb();
10831 bool isThumb2 = Subtarget->isThumb2();
10832
10833 unsigned PCLabelId = AFI->createPICLabelUId();
10834 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10835 ARMConstantPoolValue *CPV =
10836 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10837 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10838
10839 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10840 : &ARM::GPRRegClass;
10841
10842 // Grab constant pool and fixed stack memory operands.
10843 MachineMemOperand *CPMMO =
10844 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10845 MachineMemOperand::MOLoad, 4, Align(4));
10846
10847 MachineMemOperand *FIMMOSt =
10848 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10849 MachineMemOperand::MOStore, 4, Align(4));
10850
10851 // Load the address of the dispatch MBB into the jump buffer.
10852 if (isThumb2) {
10853 // Incoming value: jbuf
10854 // ldr.n r5, LCPI1_1
10855 // orr r5, r5, #1
10856 // add r5, pc
10857 // str r5, [$jbuf, #+4] ; &jbuf[1]
10858 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10859 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10861 .addMemOperand(CPMMO)
10863 // Set the low bit because of thumb mode.
10864 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10865 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10866 .addReg(NewVReg1, RegState::Kill)
10867 .addImm(0x01)
10869 .add(condCodeOp());
10870 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10871 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10872 .addReg(NewVReg2, RegState::Kill)
10873 .addImm(PCLabelId);
10874 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10875 .addReg(NewVReg3, RegState::Kill)
10876 .addFrameIndex(FI)
10877 .addImm(36) // &jbuf[1] :: pc
10878 .addMemOperand(FIMMOSt)
10880 } else if (isThumb) {
10881 // Incoming value: jbuf
10882 // ldr.n r1, LCPI1_4
10883 // add r1, pc
10884 // mov r2, #1
10885 // orrs r1, r2
10886 // add r2, $jbuf, #+4 ; &jbuf[1]
10887 // str r1, [r2]
10888 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10889 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10891 .addMemOperand(CPMMO)
10893 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10894 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10895 .addReg(NewVReg1, RegState::Kill)
10896 .addImm(PCLabelId);
10897 // Set the low bit because of thumb mode.
10898 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10899 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10900 .addReg(ARM::CPSR, RegState::Define)
10901 .addImm(1)
10903 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10904 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10905 .addReg(ARM::CPSR, RegState::Define)
10906 .addReg(NewVReg2, RegState::Kill)
10907 .addReg(NewVReg3, RegState::Kill)
10909 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10910 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10911 .addFrameIndex(FI)
10912 .addImm(36); // &jbuf[1] :: pc
10913 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10914 .addReg(NewVReg4, RegState::Kill)
10915 .addReg(NewVReg5, RegState::Kill)
10916 .addImm(0)
10917 .addMemOperand(FIMMOSt)
10919 } else {
10920 // Incoming value: jbuf
10921 // ldr r1, LCPI1_1
10922 // add r1, pc, r1
10923 // str r1, [$jbuf, #+4] ; &jbuf[1]
10924 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10925 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10927 .addImm(0)
10928 .addMemOperand(CPMMO)
10930 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10931 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10932 .addReg(NewVReg1, RegState::Kill)
10933 .addImm(PCLabelId)
10935 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10936 .addReg(NewVReg2, RegState::Kill)
10937 .addFrameIndex(FI)
10938 .addImm(36) // &jbuf[1] :: pc
10939 .addMemOperand(FIMMOSt)
10941 }
10942}
10943
10944void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10945 MachineBasicBlock *MBB) const {
10946 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10947 DebugLoc dl = MI.getDebugLoc();
10948 MachineFunction *MF = MBB->getParent();
10950 MachineFrameInfo &MFI = MF->getFrameInfo();
10951 int FI = MFI.getFunctionContextIndex();
10952
10953 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10954 : &ARM::GPRnopcRegClass;
10955
10956 // Get a mapping of the call site numbers to all of the landing pads they're
10957 // associated with.
10958 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
10959 unsigned MaxCSNum = 0;
10960 for (MachineBasicBlock &BB : *MF) {
10961 if (!BB.isEHPad())
10962 continue;
10963
10964 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10965 // pad.
10966 for (MachineInstr &II : BB) {
10967 if (!II.isEHLabel())
10968 continue;
10969
10970 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10971 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10972
10973 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10974 for (unsigned Idx : CallSiteIdxs) {
10975 CallSiteNumToLPad[Idx].push_back(&BB);
10976 MaxCSNum = std::max(MaxCSNum, Idx);
10977 }
10978 break;
10979 }
10980 }
10981
10982 // Get an ordered list of the machine basic blocks for the jump table.
10983 std::vector<MachineBasicBlock*> LPadList;
10984 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
10985 LPadList.reserve(CallSiteNumToLPad.size());
10986 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10987 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10988 for (MachineBasicBlock *MBB : MBBList) {
10989 LPadList.push_back(MBB);
10990 InvokeBBs.insert_range(MBB->predecessors());
10991 }
10992 }
10993
10994 assert(!LPadList.empty() &&
10995 "No landing pad destinations for the dispatch jump table!");
10996
10997 // Create the jump table and associated information.
10998 MachineJumpTableInfo *JTI =
10999 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
11000 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
11001
11002 // Create the MBBs for the dispatch code.
11003
11004 // Shove the dispatch's address into the return slot in the function context.
11005 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
11006 DispatchBB->setIsEHPad();
11007
11008 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11009
11010 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
11011 DispatchBB->addSuccessor(TrapBB);
11012
11013 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
11014 DispatchBB->addSuccessor(DispContBB);
11015
11016 // Insert the MBBs.
11017 MF->insert(MF->end(), DispatchBB);
11018 MF->insert(MF->end(), DispContBB);
11019 MF->insert(MF->end(), TrapBB);
11020
11021 // Insert code into the entry block that creates and registers the function
11022 // context.
11023 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11024
11025 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11028
11030 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11031
11032 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11033 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11034
11035 // Add a register mask with no preserved registers. This results in all
11036 // registers being marked as clobbered. This can't work if the dispatch block
11037 // is in a Thumb1 function and is linked with ARM code which uses the FP
11038 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11040
11041 bool IsPositionIndependent = isPositionIndependent();
11042 unsigned NumLPads = LPadList.size();
11043 if (Subtarget->isThumb2()) {
11044 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11045 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11046 .addFrameIndex(FI)
11047 .addImm(4)
11048 .addMemOperand(FIMMOLd)
11050
11051 if (NumLPads < 256) {
11052 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11053 .addReg(NewVReg1)
11054 .addImm(LPadList.size())
11056 } else {
11057 Register VReg1 = MRI->createVirtualRegister(TRC);
11058 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11059 .addImm(NumLPads & 0xFFFF)
11061
11062 unsigned VReg2 = VReg1;
11063 if ((NumLPads & 0xFFFF0000) != 0) {
11064 VReg2 = MRI->createVirtualRegister(TRC);
11065 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11066 .addReg(VReg1)
11067 .addImm(NumLPads >> 16)
11069 }
11070
11071 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11072 .addReg(NewVReg1)
11073 .addReg(VReg2)
11075 }
11076
11077 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11078 .addMBB(TrapBB)
11080 .addReg(ARM::CPSR);
11081
11082 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11083 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11084 .addJumpTableIndex(MJTI)
11086
11087 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11088 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11089 .addReg(NewVReg3, RegState::Kill)
11090 .addReg(NewVReg1)
11093 .add(condCodeOp());
11094
11095 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11096 .addReg(NewVReg4, RegState::Kill)
11097 .addReg(NewVReg1)
11098 .addJumpTableIndex(MJTI);
11099 } else if (Subtarget->isThumb()) {
11100 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11101 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11102 .addFrameIndex(FI)
11103 .addImm(1)
11104 .addMemOperand(FIMMOLd)
11106
11107 if (NumLPads < 256) {
11108 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11109 .addReg(NewVReg1)
11110 .addImm(NumLPads)
11112 } else {
11113 MachineConstantPool *ConstantPool = MF->getConstantPool();
11114 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11115 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11116
11117 // MachineConstantPool wants an explicit alignment.
11118 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11119 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11120
11121 Register VReg1 = MRI->createVirtualRegister(TRC);
11122 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11123 .addReg(VReg1, RegState::Define)
11126 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11127 .addReg(NewVReg1)
11128 .addReg(VReg1)
11130 }
11131
11132 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11133 .addMBB(TrapBB)
11135 .addReg(ARM::CPSR);
11136
11137 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11138 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11139 .addReg(ARM::CPSR, RegState::Define)
11140 .addReg(NewVReg1)
11141 .addImm(2)
11143
11144 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11145 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11146 .addJumpTableIndex(MJTI)
11148
11149 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11150 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11151 .addReg(ARM::CPSR, RegState::Define)
11152 .addReg(NewVReg2, RegState::Kill)
11153 .addReg(NewVReg3)
11155
11156 MachineMemOperand *JTMMOLd =
11157 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11159
11160 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11161 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11162 .addReg(NewVReg4, RegState::Kill)
11163 .addImm(0)
11164 .addMemOperand(JTMMOLd)
11166
11167 unsigned NewVReg6 = NewVReg5;
11168 if (IsPositionIndependent) {
11169 NewVReg6 = MRI->createVirtualRegister(TRC);
11170 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11171 .addReg(ARM::CPSR, RegState::Define)
11172 .addReg(NewVReg5, RegState::Kill)
11173 .addReg(NewVReg3)
11175 }
11176
11177 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11178 .addReg(NewVReg6, RegState::Kill)
11179 .addJumpTableIndex(MJTI);
11180 } else {
11181 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11182 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11183 .addFrameIndex(FI)
11184 .addImm(4)
11185 .addMemOperand(FIMMOLd)
11187
11188 if (NumLPads < 256) {
11189 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11190 .addReg(NewVReg1)
11191 .addImm(NumLPads)
11193 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11194 Register VReg1 = MRI->createVirtualRegister(TRC);
11195 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11196 .addImm(NumLPads & 0xFFFF)
11198
11199 unsigned VReg2 = VReg1;
11200 if ((NumLPads & 0xFFFF0000) != 0) {
11201 VReg2 = MRI->createVirtualRegister(TRC);
11202 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11203 .addReg(VReg1)
11204 .addImm(NumLPads >> 16)
11206 }
11207
11208 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11209 .addReg(NewVReg1)
11210 .addReg(VReg2)
11212 } else {
11213 MachineConstantPool *ConstantPool = MF->getConstantPool();
11214 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11215 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11216
11217 // MachineConstantPool wants an explicit alignment.
11218 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11219 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11220
11221 Register VReg1 = MRI->createVirtualRegister(TRC);
11222 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11223 .addReg(VReg1, RegState::Define)
11225 .addImm(0)
11227 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11228 .addReg(NewVReg1)
11229 .addReg(VReg1, RegState::Kill)
11231 }
11232
11233 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11234 .addMBB(TrapBB)
11236 .addReg(ARM::CPSR);
11237
11238 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11239 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11240 .addReg(NewVReg1)
11243 .add(condCodeOp());
11244 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11245 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11246 .addJumpTableIndex(MJTI)
11248
11249 MachineMemOperand *JTMMOLd =
11250 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11252 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11253 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11254 .addReg(NewVReg3, RegState::Kill)
11255 .addReg(NewVReg4)
11256 .addImm(0)
11257 .addMemOperand(JTMMOLd)
11259
11260 if (IsPositionIndependent) {
11261 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11262 .addReg(NewVReg5, RegState::Kill)
11263 .addReg(NewVReg4)
11264 .addJumpTableIndex(MJTI);
11265 } else {
11266 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11267 .addReg(NewVReg5, RegState::Kill)
11268 .addJumpTableIndex(MJTI);
11269 }
11270 }
11271
11272 // Add the jump table entries as successors to the MBB.
11274 for (MachineBasicBlock *CurMBB : LPadList) {
11275 if (SeenMBBs.insert(CurMBB).second)
11276 DispContBB->addSuccessor(CurMBB);
11277 }
11278
11279 // N.B. the order the invoke BBs are processed in doesn't matter here.
11280 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11282 for (MachineBasicBlock *BB : InvokeBBs) {
11283
11284 // Remove the landing pad successor from the invoke block and replace it
11285 // with the new dispatch block.
11286 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11287 while (!Successors.empty()) {
11288 MachineBasicBlock *SMBB = Successors.pop_back_val();
11289 if (SMBB->isEHPad()) {
11290 BB->removeSuccessor(SMBB);
11291 MBBLPads.push_back(SMBB);
11292 }
11293 }
11294
11295 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11296 BB->normalizeSuccProbs();
11297
11298 // Find the invoke call and mark all of the callee-saved registers as
11299 // 'implicit defined' so that they're spilled. This prevents code from
11300 // moving instructions to before the EH block, where they will never be
11301 // executed.
11303 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11304 if (!II->isCall()) continue;
11305
11306 DenseSet<unsigned> DefRegs;
11308 OI = II->operands_begin(), OE = II->operands_end();
11309 OI != OE; ++OI) {
11310 if (!OI->isReg()) continue;
11311 DefRegs.insert(OI->getReg());
11312 }
11313
11314 MachineInstrBuilder MIB(*MF, &*II);
11315
11316 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11317 unsigned Reg = SavedRegs[i];
11318 if (Subtarget->isThumb2() &&
11319 !ARM::tGPRRegClass.contains(Reg) &&
11320 !ARM::hGPRRegClass.contains(Reg))
11321 continue;
11322 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11323 continue;
11324 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11325 continue;
11326 if (!DefRegs.contains(Reg))
11328 }
11329
11330 break;
11331 }
11332 }
11333
11334 // Mark all former landing pads as non-landing pads. The dispatch is the only
11335 // landing pad now.
11336 for (MachineBasicBlock *MBBLPad : MBBLPads)
11337 MBBLPad->setIsEHPad(false);
11338
11339 // The instruction is gone now.
11340 MI.eraseFromParent();
11341}
11342
11343static
11344MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11345 for (MachineBasicBlock *S : MBB->successors())
11346 if (S != Succ)
11347 return S;
11348 llvm_unreachable("Expecting a BB with two successors!");
11349}
11350
11351/// Return the load opcode for a given load size. If load size >= 8,
11352/// neon opcode will be returned.
11353static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11354 if (LdSize >= 8)
11355 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11356 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11357 if (IsThumb1)
11358 return LdSize == 4 ? ARM::tLDRi
11359 : LdSize == 2 ? ARM::tLDRHi
11360 : LdSize == 1 ? ARM::tLDRBi : 0;
11361 if (IsThumb2)
11362 return LdSize == 4 ? ARM::t2LDR_POST
11363 : LdSize == 2 ? ARM::t2LDRH_POST
11364 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11365 return LdSize == 4 ? ARM::LDR_POST_IMM
11366 : LdSize == 2 ? ARM::LDRH_POST
11367 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11368}
11369
11370/// Return the store opcode for a given store size. If store size >= 8,
11371/// neon opcode will be returned.
11372static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11373 if (StSize >= 8)
11374 return StSize == 16 ? ARM::VST1q32wb_fixed
11375 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11376 if (IsThumb1)
11377 return StSize == 4 ? ARM::tSTRi
11378 : StSize == 2 ? ARM::tSTRHi
11379 : StSize == 1 ? ARM::tSTRBi : 0;
11380 if (IsThumb2)
11381 return StSize == 4 ? ARM::t2STR_POST
11382 : StSize == 2 ? ARM::t2STRH_POST
11383 : StSize == 1 ? ARM::t2STRB_POST : 0;
11384 return StSize == 4 ? ARM::STR_POST_IMM
11385 : StSize == 2 ? ARM::STRH_POST
11386 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11387}
11388
11389/// Emit a post-increment load operation with given size. The instructions
11390/// will be added to BB at Pos.
11392 const TargetInstrInfo *TII, const DebugLoc &dl,
11393 unsigned LdSize, unsigned Data, unsigned AddrIn,
11394 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11395 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11396 assert(LdOpc != 0 && "Should have a load opcode");
11397 if (LdSize >= 8) {
11398 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11399 .addReg(AddrOut, RegState::Define)
11400 .addReg(AddrIn)
11401 .addImm(0)
11403 } else if (IsThumb1) {
11404 // load + update AddrIn
11405 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11406 .addReg(AddrIn)
11407 .addImm(0)
11409 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11410 .add(t1CondCodeOp())
11411 .addReg(AddrIn)
11412 .addImm(LdSize)
11414 } else if (IsThumb2) {
11415 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11416 .addReg(AddrOut, RegState::Define)
11417 .addReg(AddrIn)
11418 .addImm(LdSize)
11420 } else { // arm
11421 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11422 .addReg(AddrOut, RegState::Define)
11423 .addReg(AddrIn)
11424 .addReg(0)
11425 .addImm(LdSize)
11427 }
11428}
11429
11430/// Emit a post-increment store operation with given size. The instructions
11431/// will be added to BB at Pos.
11433 const TargetInstrInfo *TII, const DebugLoc &dl,
11434 unsigned StSize, unsigned Data, unsigned AddrIn,
11435 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11436 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11437 assert(StOpc != 0 && "Should have a store opcode");
11438 if (StSize >= 8) {
11439 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11440 .addReg(AddrIn)
11441 .addImm(0)
11442 .addReg(Data)
11444 } else if (IsThumb1) {
11445 // store + update AddrIn
11446 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11447 .addReg(Data)
11448 .addReg(AddrIn)
11449 .addImm(0)
11451 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11452 .add(t1CondCodeOp())
11453 .addReg(AddrIn)
11454 .addImm(StSize)
11456 } else if (IsThumb2) {
11457 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11458 .addReg(Data)
11459 .addReg(AddrIn)
11460 .addImm(StSize)
11462 } else { // arm
11463 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11464 .addReg(Data)
11465 .addReg(AddrIn)
11466 .addReg(0)
11467 .addImm(StSize)
11469 }
11470}
11471
11473ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11474 MachineBasicBlock *BB) const {
11475 // This pseudo instruction has 3 operands: dst, src, size
11476 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11477 // Otherwise, we will generate unrolled scalar copies.
11478 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11479 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11480 MachineFunction::iterator It = ++BB->getIterator();
11481
11482 Register dest = MI.getOperand(0).getReg();
11483 Register src = MI.getOperand(1).getReg();
11484 unsigned SizeVal = MI.getOperand(2).getImm();
11485 unsigned Alignment = MI.getOperand(3).getImm();
11486 DebugLoc dl = MI.getDebugLoc();
11487
11488 MachineFunction *MF = BB->getParent();
11489 MachineRegisterInfo &MRI = MF->getRegInfo();
11490 unsigned UnitSize = 0;
11491 const TargetRegisterClass *TRC = nullptr;
11492 const TargetRegisterClass *VecTRC = nullptr;
11493
11494 bool IsThumb1 = Subtarget->isThumb1Only();
11495 bool IsThumb2 = Subtarget->isThumb2();
11496 bool IsThumb = Subtarget->isThumb();
11497
11498 if (Alignment & 1) {
11499 UnitSize = 1;
11500 } else if (Alignment & 2) {
11501 UnitSize = 2;
11502 } else {
11503 // Check whether we can use NEON instructions.
11504 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11505 Subtarget->hasNEON()) {
11506 if ((Alignment % 16 == 0) && SizeVal >= 16)
11507 UnitSize = 16;
11508 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11509 UnitSize = 8;
11510 }
11511 // Can't use NEON instructions.
11512 if (UnitSize == 0)
11513 UnitSize = 4;
11514 }
11515
11516 // Select the correct opcode and register class for unit size load/store
11517 bool IsNeon = UnitSize >= 8;
11518 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11519 if (IsNeon)
11520 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11521 : UnitSize == 8 ? &ARM::DPRRegClass
11522 : nullptr;
11523
11524 unsigned BytesLeft = SizeVal % UnitSize;
11525 unsigned LoopSize = SizeVal - BytesLeft;
11526
11527 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11528 // Use LDR and STR to copy.
11529 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11530 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11531 unsigned srcIn = src;
11532 unsigned destIn = dest;
11533 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11534 Register srcOut = MRI.createVirtualRegister(TRC);
11535 Register destOut = MRI.createVirtualRegister(TRC);
11536 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11537 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11538 IsThumb1, IsThumb2);
11539 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11540 IsThumb1, IsThumb2);
11541 srcIn = srcOut;
11542 destIn = destOut;
11543 }
11544
11545 // Handle the leftover bytes with LDRB and STRB.
11546 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11547 // [destOut] = STRB_POST(scratch, destIn, 1)
11548 for (unsigned i = 0; i < BytesLeft; i++) {
11549 Register srcOut = MRI.createVirtualRegister(TRC);
11550 Register destOut = MRI.createVirtualRegister(TRC);
11551 Register scratch = MRI.createVirtualRegister(TRC);
11552 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11553 IsThumb1, IsThumb2);
11554 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11555 IsThumb1, IsThumb2);
11556 srcIn = srcOut;
11557 destIn = destOut;
11558 }
11559 MI.eraseFromParent(); // The instruction is gone now.
11560 return BB;
11561 }
11562
11563 // Expand the pseudo op to a loop.
11564 // thisMBB:
11565 // ...
11566 // movw varEnd, # --> with thumb2
11567 // movt varEnd, #
11568 // ldrcp varEnd, idx --> without thumb2
11569 // fallthrough --> loopMBB
11570 // loopMBB:
11571 // PHI varPhi, varEnd, varLoop
11572 // PHI srcPhi, src, srcLoop
11573 // PHI destPhi, dst, destLoop
11574 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11575 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11576 // subs varLoop, varPhi, #UnitSize
11577 // bne loopMBB
11578 // fallthrough --> exitMBB
11579 // exitMBB:
11580 // epilogue to handle left-over bytes
11581 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11582 // [destOut] = STRB_POST(scratch, destLoop, 1)
11583 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11584 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11585 MF->insert(It, loopMBB);
11586 MF->insert(It, exitMBB);
11587
11588 // Set the call frame size on entry to the new basic blocks.
11589 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11590 loopMBB->setCallFrameSize(CallFrameSize);
11591 exitMBB->setCallFrameSize(CallFrameSize);
11592
11593 // Transfer the remainder of BB and its successor edges to exitMBB.
11594 exitMBB->splice(exitMBB->begin(), BB,
11595 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11597
11598 // Load an immediate to varEnd.
11599 Register varEnd = MRI.createVirtualRegister(TRC);
11600 if (Subtarget->useMovt()) {
11601 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11602 varEnd)
11603 .addImm(LoopSize);
11604 } else if (Subtarget->genExecuteOnly()) {
11605 assert(IsThumb && "Non-thumb expected to have used movt");
11606 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11607 } else {
11608 MachineConstantPool *ConstantPool = MF->getConstantPool();
11609 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11610 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11611
11612 // MachineConstantPool wants an explicit alignment.
11613 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11614 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11615 MachineMemOperand *CPMMO =
11618
11619 if (IsThumb)
11620 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11621 .addReg(varEnd, RegState::Define)
11624 .addMemOperand(CPMMO);
11625 else
11626 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11627 .addReg(varEnd, RegState::Define)
11629 .addImm(0)
11631 .addMemOperand(CPMMO);
11632 }
11633 BB->addSuccessor(loopMBB);
11634
11635 // Generate the loop body:
11636 // varPhi = PHI(varLoop, varEnd)
11637 // srcPhi = PHI(srcLoop, src)
11638 // destPhi = PHI(destLoop, dst)
11639 MachineBasicBlock *entryBB = BB;
11640 BB = loopMBB;
11641 Register varLoop = MRI.createVirtualRegister(TRC);
11642 Register varPhi = MRI.createVirtualRegister(TRC);
11643 Register srcLoop = MRI.createVirtualRegister(TRC);
11644 Register srcPhi = MRI.createVirtualRegister(TRC);
11645 Register destLoop = MRI.createVirtualRegister(TRC);
11646 Register destPhi = MRI.createVirtualRegister(TRC);
11647
11648 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11649 .addReg(varLoop).addMBB(loopMBB)
11650 .addReg(varEnd).addMBB(entryBB);
11651 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11652 .addReg(srcLoop).addMBB(loopMBB)
11653 .addReg(src).addMBB(entryBB);
11654 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11655 .addReg(destLoop).addMBB(loopMBB)
11656 .addReg(dest).addMBB(entryBB);
11657
11658 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11659 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11660 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11661 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11662 IsThumb1, IsThumb2);
11663 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11664 IsThumb1, IsThumb2);
11665
11666 // Decrement loop variable by UnitSize.
11667 if (IsThumb1) {
11668 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11669 .add(t1CondCodeOp())
11670 .addReg(varPhi)
11671 .addImm(UnitSize)
11672 .add(predOps(ARMCC::AL));
11673 } else {
11674 MachineInstrBuilder MIB =
11675 BuildMI(*BB, BB->end(), dl,
11676 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11677 MIB.addReg(varPhi)
11678 .addImm(UnitSize)
11679 .add(predOps(ARMCC::AL))
11680 .add(condCodeOp());
11681 MIB->getOperand(5).setReg(ARM::CPSR);
11682 MIB->getOperand(5).setIsDef(true);
11683 }
11684 BuildMI(*BB, BB->end(), dl,
11685 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11686 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11687
11688 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11689 BB->addSuccessor(loopMBB);
11690 BB->addSuccessor(exitMBB);
11691
11692 // Add epilogue to handle BytesLeft.
11693 BB = exitMBB;
11694 auto StartOfExit = exitMBB->begin();
11695
11696 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11697 // [destOut] = STRB_POST(scratch, destLoop, 1)
11698 unsigned srcIn = srcLoop;
11699 unsigned destIn = destLoop;
11700 for (unsigned i = 0; i < BytesLeft; i++) {
11701 Register srcOut = MRI.createVirtualRegister(TRC);
11702 Register destOut = MRI.createVirtualRegister(TRC);
11703 Register scratch = MRI.createVirtualRegister(TRC);
11704 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11705 IsThumb1, IsThumb2);
11706 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11707 IsThumb1, IsThumb2);
11708 srcIn = srcOut;
11709 destIn = destOut;
11710 }
11711
11712 MI.eraseFromParent(); // The instruction is gone now.
11713 return BB;
11714}
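// Worked example (sketch under assumed operands; register names are
// hypothetical): SizeVal = 10 with Alignment = 4 gives UnitSize = 4,
// LoopSize = 8 and BytesLeft = 2, so the unrolled path above emits roughly
//   ldr  r3, [r1], #4     str  r3, [r0], #4
//   ldr  r3, [r1], #4     str  r3, [r0], #4
//   ldrb r3, [r1], #1     strb r3, [r0], #1
//   ldrb r3, [r1], #1     strb r3, [r0], #1
// while sizes above getMaxInlineSizeThreshold() take the loop expansion
// sketched in the comment block above instead.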
11715
11716MachineBasicBlock *
11717ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11718 MachineBasicBlock *MBB) const {
11720 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11721 DebugLoc DL = MI.getDebugLoc();
11722
11723 assert(Subtarget->isTargetWindows() &&
11724 "__chkstk is only supported on Windows");
11725 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11726
11727 // __chkstk takes the number of words to allocate on the stack in R4, and
11728 // returns the stack adjustment in number of bytes in R4. This will not
11729 // clobber any other registers (other than the obvious lr).
11730 //
11731 // Although, technically, IP should be considered a register which may be
11732 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11733 // thumb-2 environment, so there is no interworking required. As a result, we
11734 // do not expect a veneer to be emitted by the linker, clobbering IP.
11735 //
11736 // Each module receives its own copy of __chkstk, so no import thunk is
11737 // required, again, ensuring that IP is not clobbered.
11738 //
11739 // Finally, although some linkers may theoretically provide a trampoline for
11740 // out of range calls (which is quite common due to a 32M range limitation of
11741 // branches for Thumb), we can generate the long-call version via
11742 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11743 // IP.
11744
11745 switch (TM.getCodeModel()) {
11746 case CodeModel::Tiny:
11747 llvm_unreachable("Tiny code model not available on ARM.");
11748 case CodeModel::Small:
11749 case CodeModel::Medium:
11750 case CodeModel::Kernel:
11751 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11753 .addExternalSymbol("__chkstk")
11756 .addReg(ARM::R12,
11758 .addReg(ARM::CPSR,
11760 break;
11761 case CodeModel::Large: {
11762 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11763 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11764
11765 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11766 .addExternalSymbol("__chkstk");
11769 .addReg(Reg, RegState::Kill)
11772 .addReg(ARM::R12,
11774 .addReg(ARM::CPSR,
11776 break;
11777 }
11778 }
11779
11780 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11781 .addReg(ARM::SP, RegState::Kill)
11782 .addReg(ARM::R4, RegState::Kill)
11785 .add(condCodeOp());
11786
11787 MI.eraseFromParent();
11788 return MBB;
11789}
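// Roughly, for the small/medium/kernel code models the expansion above is
// (sketch; R4 already holds the number of words to allocate):
//   bl    __chkstk        ; returns the byte adjustment in r4
//   sub.w sp, sp, r4
// The large code model only differs in materializing the address of __chkstk
// into a register first and calling through it.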
11790
11791MachineBasicBlock *
11792ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11793 MachineBasicBlock *MBB) const {
11794 DebugLoc DL = MI.getDebugLoc();
11795 MachineFunction *MF = MBB->getParent();
11796 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11797
11798 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11799 MF->insert(++MBB->getIterator(), ContBB);
11800 ContBB->splice(ContBB->begin(), MBB,
11801 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11802 ContBB->transferSuccessorsAndUpdatePHIs(MBB);
11803 MBB->addSuccessor(ContBB);
11804
11805 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11806 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11807 MF->push_back(TrapBB);
11808 MBB->addSuccessor(TrapBB);
11809
11810 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11811 .addReg(MI.getOperand(0).getReg())
11812 .addImm(0)
11813 .add(predOps(ARMCC::AL));
11814 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11815 .addMBB(TrapBB)
11816 .addImm(ARMCC::EQ)
11817 .addReg(ARM::CPSR);
11818
11819 MI.eraseFromParent();
11820 return ContBB;
11821}
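// Sketch of the emitted control flow (label names are hypothetical); the
// pseudo's operand is the value that must not be zero:
//   cmp r0, #0
//   beq .Ltrap            ; .Ltrap: __brkdiv0, the Windows division trap
// with execution otherwise falling through into the continuation block.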
11822
11823// The CPSR operand of SelectItr might be missing a kill marker
11824// because there were multiple uses of CPSR, and ISel didn't know
11825// which to mark. Figure out whether SelectItr should have had a
11826// kill marker, and set it if it should. Returns the correct kill
11827// marker value.
11828static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11829 MachineBasicBlock* BB,
11830 const TargetRegisterInfo* TRI) {
11831 // Scan forward through BB for a use/def of CPSR.
11832 MachineBasicBlock::iterator miI(std::next(SelectItr));
11833 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11834 const MachineInstr& mi = *miI;
11835 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11836 return false;
11837 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11838 break; // Should have kill-flag - update below.
11839 }
11840
11841 // If we hit the end of the block, check whether CPSR is live into a
11842 // successor.
11843 if (miI == BB->end()) {
11844 for (MachineBasicBlock *Succ : BB->successors())
11845 if (Succ->isLiveIn(ARM::CPSR))
11846 return false;
11847 }
11848
11849 // We found a def, or hit the end of the basic block and CPSR wasn't live
11850 // out. SelectMI should have a kill flag on CPSR.
11851 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11852 return true;
11853}
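// Example of the scan above (MIR-flavoured sketch; names are hypothetical):
//   %r = tMOVCCr_pseudo %a, %b, ..., $cpsr    <- SelectItr
//   %x = tLDRi %p, ...                        ; neither reads nor defines CPSR
//   tCMPi8 %y, 0, ...                         ; defines CPSR, scan stops here
// No read of CPSR was seen and CPSR is not live into a successor, so the
// select's CPSR operand gets a kill flag and the function returns true.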
11854
11855/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11856/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
11857static Register genTPEntry(MachineBasicBlock *TpEntry,
11858 MachineBasicBlock *TpLoopBody,
11859 MachineBasicBlock *TpExit, Register OpSizeReg,
11860 const TargetInstrInfo *TII, DebugLoc Dl,
11861 MachineRegisterInfo &MRI) {
11862 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11863 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11864 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11865 .addUse(OpSizeReg)
11866 .addImm(15)
11868 .addReg(0);
11869
11870 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11871 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11872 .addUse(AddDestReg, RegState::Kill)
11873 .addImm(4)
11875 .addReg(0);
11876
11877 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11878 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11879 .addUse(LsrDestReg, RegState::Kill);
11880
11881 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11882 .addUse(TotalIterationsReg)
11883 .addMBB(TpExit);
11884
11885 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11886 .addMBB(TpLoopBody)
11888
11889 return TotalIterationsReg;
11890}
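// The entry block built above looks roughly like this before the
// low-overhead-loop passes run (sketch; virtual register names are
// hypothetical):
//   %tmp  = t2ADDri %n, 15
//   %iter = t2LSRri %tmp, 4        ; ceil(%n / 16)
//   %lr   = t2WhileLoopSetup %iter
//   t2WhileLoopStart %lr, %exit    ; branch to %exit when the count is zero
//   t2B %body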
11891
11892/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11893/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11894/// loops.
11895static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11896 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11897 const TargetInstrInfo *TII, DebugLoc Dl,
11898 MachineRegisterInfo &MRI, Register OpSrcReg,
11899 Register OpDestReg, Register ElementCountReg,
11900 Register TotalIterationsReg, bool IsMemcpy) {
11901 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11902 // array, loop iteration counter, predication counter.
11903
11904 Register SrcPhiReg, CurrSrcReg;
11905 if (IsMemcpy) {
11906 // Current position in the src array
11907 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11908 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11909 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11910 .addUse(OpSrcReg)
11911 .addMBB(TpEntry)
11912 .addUse(CurrSrcReg)
11913 .addMBB(TpLoopBody);
11914 }
11915
11916 // Current position in the dest array
11917 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11918 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11919 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11920 .addUse(OpDestReg)
11921 .addMBB(TpEntry)
11922 .addUse(CurrDestReg)
11923 .addMBB(TpLoopBody);
11924
11925 // Current loop counter
11926 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11927 Register RemainingLoopIterationsReg =
11928 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11929 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11930 .addUse(TotalIterationsReg)
11931 .addMBB(TpEntry)
11932 .addUse(RemainingLoopIterationsReg)
11933 .addMBB(TpLoopBody);
11934
11935 // Predication counter
11936 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11937 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11938 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11939 .addUse(ElementCountReg)
11940 .addMBB(TpEntry)
11941 .addUse(RemainingElementsReg)
11942 .addMBB(TpLoopBody);
11943
11944 // Pass predication counter to VCTP
11945 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11946 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11947 .addUse(PredCounterPhiReg)
11949 .addReg(0)
11950 .addReg(0);
11951
11952 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11953 .addUse(PredCounterPhiReg)
11954 .addImm(16)
11956 .addReg(0);
11957
11958 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11959 Register SrcValueReg;
11960 if (IsMemcpy) {
11961 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11962 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11963 .addDef(CurrSrcReg)
11964 .addDef(SrcValueReg)
11965 .addReg(SrcPhiReg)
11966 .addImm(16)
11968 .addUse(VccrReg)
11969 .addReg(0);
11970 } else
11971 SrcValueReg = OpSrcReg;
11972
11973 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11974 .addDef(CurrDestReg)
11975 .addUse(SrcValueReg)
11976 .addReg(DestPhiReg)
11977 .addImm(16)
11979 .addUse(VccrReg)
11980 .addReg(0);
11981
11982 // Add the pseudoInstrs for decrementing the loop counter and marking the
11983 // end:t2DoLoopDec and t2DoLoopEnd
11984 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11985 .addUse(LoopCounterPhiReg)
11986 .addImm(1);
11987
11988 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11989 .addUse(RemainingLoopIterationsReg)
11990 .addMBB(TpLoopBody);
11991
11992 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11993 .addMBB(TpExit)
11995}
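// Sketch of the loop body assembled above for the memcpy case (virtual
// register names are hypothetical; the PHIs are omitted):
//   %vpr          = MVE_VCTP8 %elemsPhi               ; predicate for final lanes
//   %elemsNext    = t2SUBri %elemsPhi, 16
//   %srcNext, %q  = MVE_VLDRBU8_post %srcPhi, 16      ; predicated on %vpr
//   %dstNext      = MVE_VSTRBU8_post %q, %destPhi, 16 ; predicated on %vpr
//   %cntNext      = t2LoopDec %cntPhi, 1
//   t2LoopEnd %cntNext, %body
//   t2B %exit
// Later passes convert the WhileLoopStart/LoopDec/LoopEnd pseudos into a
// WLSTP/LETP tail-predicated hardware loop.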
11996
11997MachineBasicBlock *
11998ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
11999 MachineBasicBlock *BB) const {
12000 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
12001 DebugLoc dl = MI.getDebugLoc();
12002 bool isThumb2 = Subtarget->isThumb2();
12003 switch (MI.getOpcode()) {
12004 default: {
12005 MI.print(errs());
12006 llvm_unreachable("Unexpected instr type to insert");
12007 }
12008
12009 // Thumb1 post-indexed loads are really just single-register LDMs.
12010 case ARM::tLDR_postidx: {
12011 MachineOperand Def(MI.getOperand(1));
12012 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12013 .add(Def) // Rn_wb
12014 .add(MI.getOperand(2)) // Rn
12015 .add(MI.getOperand(3)) // PredImm
12016 .add(MI.getOperand(4)) // PredReg
12017 .add(MI.getOperand(0)) // Rt
12018 .cloneMemRefs(MI);
12019 MI.eraseFromParent();
12020 return BB;
12021 }
12022
12023 case ARM::MVE_MEMCPYLOOPINST:
12024 case ARM::MVE_MEMSETLOOPINST: {
12025
12026 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12027 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12028 // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
12029 // adds the relevant instructions in the TP loop Body for generation of a
12030 // WLSTP loop.
12031
12032 // Below is relevant portion of the CFG after the transformation.
12033 // The Machine Basic Blocks are shown along with branch conditions (in
12034 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12035 // portion of the CFG and may not necessarily be the entry/exit of the
12036 // function.
12037
12038 // (Relevant) CFG after transformation:
12039 // TP entry MBB
12040 // |
12041 // |-----------------|
12042 // (n <= 0) (n > 0)
12043 // | |
12044 // | TP loop Body MBB<--|
12045 // | | |
12046 // \ |___________|
12047 // \ /
12048 // TP exit MBB
12049
12050 MachineFunction *MF = BB->getParent();
12051 MachineFunctionProperties &Properties = MF->getProperties();
12052 MachineRegisterInfo &MRI = MF->getRegInfo();
12053
12054 Register OpDestReg = MI.getOperand(0).getReg();
12055 Register OpSrcReg = MI.getOperand(1).getReg();
12056 Register OpSizeReg = MI.getOperand(2).getReg();
12057
12058 // Allocate the required MBBs and add to parent function.
12059 MachineBasicBlock *TpEntry = BB;
12060 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12061 MachineBasicBlock *TpExit;
12062
12063 MF->push_back(TpLoopBody);
12064
12065 // If any instructions are present in the current block after
12066 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12067 // move the instructions into the newly created exit block. If there are no
12068 // instructions add an explicit branch to the FallThrough block and then
12069 // split.
12070 //
12071 // The split is required for two reasons:
12072 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12073 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12074 // need to be updated. splitAt() already handles this.
12075 TpExit = BB->splitAt(MI, false);
12076 if (TpExit == BB) {
12077 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12078 "block containing memcpy/memset Pseudo");
12079 TpExit = BB->getFallThrough();
12080 BuildMI(BB, dl, TII->get(ARM::t2B))
12081 .addMBB(TpExit)
12083 TpExit = BB->splitAt(MI, false);
12084 }
12085
12086 // Add logic for iteration count
12087 Register TotalIterationsReg =
12088 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12089
12090 // Add the vectorized (and predicated) loads/store instructions
12091 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12092 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12093 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12094
12095 // Required to avoid conflict with the MachineVerifier during testing.
12096 Properties.resetNoPHIs();
12097
12098 // Connect the blocks
12099 TpEntry->addSuccessor(TpLoopBody);
12100 TpLoopBody->addSuccessor(TpLoopBody);
12101 TpLoopBody->addSuccessor(TpExit);
12102
12103 // Reorder for a more natural layout
12104 TpLoopBody->moveAfter(TpEntry);
12105 TpExit->moveAfter(TpLoopBody);
12106
12107 // Finally, remove the memcpy Pseudo Instruction
12108 MI.eraseFromParent();
12109
12110 // Return the exit block as it may contain other instructions requiring a
12111 // custom inserter
12112 return TpExit;
12113 }
12114
12115 // The Thumb2 pre-indexed stores have the same MI operands, they just
12116 // define them differently in the .td files from the isel patterns, so
12117 // they need pseudos.
12118 case ARM::t2STR_preidx:
12119 MI.setDesc(TII->get(ARM::t2STR_PRE));
12120 return BB;
12121 case ARM::t2STRB_preidx:
12122 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12123 return BB;
12124 case ARM::t2STRH_preidx:
12125 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12126 return BB;
12127
12128 case ARM::STRi_preidx:
12129 case ARM::STRBi_preidx: {
12130 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12131 : ARM::STRB_PRE_IMM;
12132 // Decode the offset.
12133 unsigned Offset = MI.getOperand(4).getImm();
12134 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12135 Offset = ARM_AM::getAM2Offset(Offset);
12136 if (isSub)
12137 Offset = -Offset;
12138
12139 MachineMemOperand *MMO = *MI.memoperands_begin();
12140 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12141 .add(MI.getOperand(0)) // Rn_wb
12142 .add(MI.getOperand(1)) // Rt
12143 .add(MI.getOperand(2)) // Rn
12144 .addImm(Offset) // offset (skip GPR==zero_reg)
12145 .add(MI.getOperand(5)) // pred
12146 .add(MI.getOperand(6))
12147 .addMemOperand(MMO);
12148 MI.eraseFromParent();
12149 return BB;
12150 }
12151 case ARM::STRr_preidx:
12152 case ARM::STRBr_preidx:
12153 case ARM::STRH_preidx: {
12154 unsigned NewOpc;
12155 switch (MI.getOpcode()) {
12156 default: llvm_unreachable("unexpected opcode!");
12157 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12158 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12159 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12160 }
12161 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12162 for (const MachineOperand &MO : MI.operands())
12163 MIB.add(MO);
12164 MI.eraseFromParent();
12165 return BB;
12166 }
12167
12168 case ARM::tMOVCCr_pseudo: {
12169 // To "insert" a SELECT_CC instruction, we actually have to insert the
12170 // diamond control-flow pattern. The incoming instruction knows the
12171 // destination vreg to set, the condition code register to branch on, the
12172 // true/false values to select between, and a branch opcode to use.
12173 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12174 MachineFunction::iterator It = ++BB->getIterator();
12175
12176 // thisMBB:
12177 // ...
12178 // TrueVal = ...
12179 // cmpTY ccX, r1, r2
12180 // bCC copy1MBB
12181 // fallthrough --> copy0MBB
12182 MachineBasicBlock *thisMBB = BB;
12183 MachineFunction *F = BB->getParent();
12184 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12185 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12186 F->insert(It, copy0MBB);
12187 F->insert(It, sinkMBB);
12188
12189 // Set the call frame size on entry to the new basic blocks.
12190 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12191 copy0MBB->setCallFrameSize(CallFrameSize);
12192 sinkMBB->setCallFrameSize(CallFrameSize);
12193
12194 // Check whether CPSR is live past the tMOVCCr_pseudo.
12195 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12196 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12197 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12198 copy0MBB->addLiveIn(ARM::CPSR);
12199 sinkMBB->addLiveIn(ARM::CPSR);
12200 }
12201
12202 // Transfer the remainder of BB and its successor edges to sinkMBB.
12203 sinkMBB->splice(sinkMBB->begin(), BB,
12204 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12206
12207 BB->addSuccessor(copy0MBB);
12208 BB->addSuccessor(sinkMBB);
12209
12210 BuildMI(BB, dl, TII->get(ARM::tBcc))
12211 .addMBB(sinkMBB)
12212 .addImm(MI.getOperand(3).getImm())
12213 .addReg(MI.getOperand(4).getReg());
12214
12215 // copy0MBB:
12216 // %FalseValue = ...
12217 // # fallthrough to sinkMBB
12218 BB = copy0MBB;
12219
12220 // Update machine-CFG edges
12221 BB->addSuccessor(sinkMBB);
12222
12223 // sinkMBB:
12224 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12225 // ...
12226 BB = sinkMBB;
12227 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12228 .addReg(MI.getOperand(1).getReg())
12229 .addMBB(copy0MBB)
12230 .addReg(MI.getOperand(2).getReg())
12231 .addMBB(thisMBB);
12232
12233 MI.eraseFromParent(); // The pseudo instruction is gone now.
12234 return BB;
12235 }
12236
12237 case ARM::BCCi64:
12238 case ARM::BCCZi64: {
12239 // If there is an unconditional branch to the other successor, remove it.
12240 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12241
12242 // Compare both parts that make up the double comparison separately for
12243 // equality.
12244 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12245
12246 Register LHS1 = MI.getOperand(1).getReg();
12247 Register LHS2 = MI.getOperand(2).getReg();
12248 if (RHSisZero) {
12249 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12250 .addReg(LHS1)
12251 .addImm(0)
12253 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12254 .addReg(LHS2).addImm(0)
12255 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12256 } else {
12257 Register RHS1 = MI.getOperand(3).getReg();
12258 Register RHS2 = MI.getOperand(4).getReg();
12259 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12260 .addReg(LHS1)
12261 .addReg(RHS1)
12263 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12264 .addReg(LHS2).addReg(RHS2)
12265 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12266 }
12267
12268 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12269 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12270 if (MI.getOperand(0).getImm() == ARMCC::NE)
12271 std::swap(destMBB, exitMBB);
12272
12273 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12274 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12275 if (isThumb2)
12276 BuildMI(BB, dl, TII->get(ARM::t2B))
12277 .addMBB(exitMBB)
12279 else
12280 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12281
12282 MI.eraseFromParent(); // The pseudo instruction is gone now.
12283 return BB;
12284 }
12285
12286 case ARM::Int_eh_sjlj_setjmp:
12287 case ARM::Int_eh_sjlj_setjmp_nofp:
12288 case ARM::tInt_eh_sjlj_setjmp:
12289 case ARM::t2Int_eh_sjlj_setjmp:
12290 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12291 return BB;
12292
12293 case ARM::Int_eh_sjlj_setup_dispatch:
12294 EmitSjLjDispatchBlock(MI, BB);
12295 return BB;
12296
12297 case ARM::ABS:
12298 case ARM::t2ABS: {
12299 // To insert an ABS instruction, we have to insert the
12300 // diamond control-flow pattern. The incoming instruction knows the
12301 // source vreg to test against 0, the destination vreg to set,
12302 // the condition code register to branch on, the
12303 // true/false values to select between, and a branch opcode to use.
12304 // It transforms
12305 // V1 = ABS V0
12306 // into
12307 // V2 = MOVS V0
12308 // BCC (branch to SinkBB if V0 >= 0)
12309 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
12310 // SinkBB: V1 = PHI(V2, V3)
12311 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12312 MachineFunction::iterator BBI = ++BB->getIterator();
12313 MachineFunction *Fn = BB->getParent();
12314 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12315 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
12316 Fn->insert(BBI, RSBBB);
12317 Fn->insert(BBI, SinkBB);
12318
12319 // Set the call frame size on entry to the new basic blocks.
12320 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12321 RSBBB->setCallFrameSize(CallFrameSize);
12322 SinkBB->setCallFrameSize(CallFrameSize);
12323
12324 Register ABSSrcReg = MI.getOperand(1).getReg();
12325 Register ABSDstReg = MI.getOperand(0).getReg();
12326 bool ABSSrcKIll = MI.getOperand(1).isKill();
12327 bool isThumb2 = Subtarget->isThumb2();
12328 MachineRegisterInfo &MRI = Fn->getRegInfo();
12329 // In Thumb mode S must not be specified if source register is the SP or
12330 // PC and if destination register is the SP, so restrict register class
12331 Register NewRsbDstReg = MRI.createVirtualRegister(
12332 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
12333
12334 // Transfer the remainder of BB and its successor edges to sinkMBB.
12335 SinkBB->splice(SinkBB->begin(), BB,
12336 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12338
12339 BB->addSuccessor(RSBBB);
12340 BB->addSuccessor(SinkBB);
12341
12342 // fall through to SinkMBB
12343 RSBBB->addSuccessor(SinkBB);
12344
12345 // insert a cmp at the end of BB
12346 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12347 .addReg(ABSSrcReg)
12348 .addImm(0)
12350
12351 // insert a bcc with opposite CC to ARMCC::MI at the end of BB
12352 BuildMI(BB, dl,
12353 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
12355
12356 // insert rsbri in RSBBB
12357 // Note: BCC and rsbri will be converted into predicated rsbmi
12358 // by if-conversion pass
12359 BuildMI(*RSBBB, RSBBB->begin(), dl,
12360 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
12361 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
12362 .addImm(0)
12364 .add(condCodeOp());
12365
12366 // insert PHI in SinkBB,
12367 // reuse ABSDstReg to not change uses of ABS instruction
12368 BuildMI(*SinkBB, SinkBB->begin(), dl,
12369 TII->get(ARM::PHI), ABSDstReg)
12370 .addReg(NewRsbDstReg).addMBB(RSBBB)
12371 .addReg(ABSSrcReg).addMBB(BB);
12372
12373 // remove ABS instruction
12374 MI.eraseFromParent();
12375
12376 // return last added BB
12377 return SinkBB;
12378 }
12379 case ARM::COPY_STRUCT_BYVAL_I32:
12380 ++NumLoopByVals;
12381 return EmitStructByval(MI, BB);
12382 case ARM::WIN__CHKSTK:
12383 return EmitLowered__chkstk(MI, BB);
12384 case ARM::WIN__DBZCHK:
12385 return EmitLowered__dbzchk(MI, BB);
12386 }
12387}
12388
12389/// Attaches vregs to MEMCPY that it will use as scratch registers
12390/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12391/// instead of as a custom inserter because we need the use list from the SDNode.
12392static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12393 MachineInstr &MI, const SDNode *Node) {
12394 bool isThumb1 = Subtarget->isThumb1Only();
12395
12396 MachineFunction *MF = MI.getParent()->getParent();
12397 MachineRegisterInfo &MRI = MF->getRegInfo();
12398 MachineInstrBuilder MIB(*MF, MI);
12399
12400 // If the new dst/src is unused mark it as dead.
12401 if (!Node->hasAnyUseOfValue(0)) {
12402 MI.getOperand(0).setIsDead(true);
12403 }
12404 if (!Node->hasAnyUseOfValue(1)) {
12405 MI.getOperand(1).setIsDead(true);
12406 }
12407
12408 // The MEMCPY both defines and kills the scratch registers.
12409 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12410 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12411 : &ARM::GPRRegClass);
12412 MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
12413 }
12414}
12415
12416void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12417 SDNode *Node) const {
12418 if (MI.getOpcode() == ARM::MEMCPY) {
12419 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12420 return;
12421 }
12422
12423 const MCInstrDesc *MCID = &MI.getDesc();
12424 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12425 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12426 // operand is still set to noreg. If needed, set the optional operand's
12427 // register to CPSR, and remove the redundant implicit def.
12428 //
12429 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12430
12431 // Rename pseudo opcodes.
12432 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12433 unsigned ccOutIdx;
12434 if (NewOpc) {
12435 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12436 MCID = &TII->get(NewOpc);
12437
12438 assert(MCID->getNumOperands() ==
12439 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12440 && "converted opcode should be the same except for cc_out"
12441 " (and, on Thumb1, pred)");
12442
12443 MI.setDesc(*MCID);
12444
12445 // Add the optional cc_out operand
12446 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12447
12448 // On Thumb1, move all input operands to the end, then add the predicate
12449 if (Subtarget->isThumb1Only()) {
12450 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12451 MI.addOperand(MI.getOperand(1));
12452 MI.removeOperand(1);
12453 }
12454
12455 // Restore the ties
12456 for (unsigned i = MI.getNumOperands(); i--;) {
12457 const MachineOperand& op = MI.getOperand(i);
12458 if (op.isReg() && op.isUse()) {
12459 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12460 if (DefIdx != -1)
12461 MI.tieOperands(DefIdx, i);
12462 }
12463 }
12464
12465 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
12466 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12467 ccOutIdx = 1;
12468 } else
12469 ccOutIdx = MCID->getNumOperands() - 1;
12470 } else
12471 ccOutIdx = MCID->getNumOperands() - 1;
12472
12473 // Any ARM instruction that sets the 's' bit should specify an optional
12474 // "cc_out" operand in the last operand position.
12475 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12476 assert(!NewOpc && "Optional cc_out operand required");
12477 return;
12478 }
12479 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12480 // since we already have an optional CPSR def.
12481 bool definesCPSR = false;
12482 bool deadCPSR = false;
12483 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12484 ++i) {
12485 const MachineOperand &MO = MI.getOperand(i);
12486 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12487 definesCPSR = true;
12488 if (MO.isDead())
12489 deadCPSR = true;
12490 MI.removeOperand(i);
12491 break;
12492 }
12493 }
12494 if (!definesCPSR) {
12495 assert(!NewOpc && "Optional cc_out operand required");
12496 return;
12497 }
12498 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12499 if (deadCPSR) {
12500 assert(!MI.getOperand(ccOutIdx).getReg() &&
12501 "expect uninitialized optional cc_out operand");
12502 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12503 if (!Subtarget->isThumb1Only())
12504 return;
12505 }
12506
12507 // If this instruction was defined with an optional CPSR def and its dag node
12508 // had a live implicit CPSR def, then activate the optional CPSR def.
12509 MachineOperand &MO = MI.getOperand(ccOutIdx);
12510 MO.setReg(ARM::CPSR);
12511 MO.setIsDef(true);
12512}
12513
12514//===----------------------------------------------------------------------===//
12515// ARM Optimization Hooks
12516//===----------------------------------------------------------------------===//
12517
12518// Helper function that checks if N is a null or all ones constant.
12519static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12520 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
12521}
12522
12523// Return true if N is conditionally 0 or all ones.
12524// Detects these expressions where cc is an i1 value:
12525//
12526// (select cc 0, y) [AllOnes=0]
12527// (select cc y, 0) [AllOnes=0]
12528// (zext cc) [AllOnes=0]
12529// (sext cc) [AllOnes=0/1]
12530// (select cc -1, y) [AllOnes=1]
12531// (select cc y, -1) [AllOnes=1]
12532//
12533// Invert is set when N is the null/all ones constant when CC is false.
12534// OtherOp is set to the alternative value of N.
12535static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12536 SDValue &CC, bool &Invert,
12537 SDValue &OtherOp,
12538 SelectionDAG &DAG) {
12539 switch (N->getOpcode()) {
12540 default: return false;
12541 case ISD::SELECT: {
12542 CC = N->getOperand(0);
12543 SDValue N1 = N->getOperand(1);
12544 SDValue N2 = N->getOperand(2);
12545 if (isZeroOrAllOnes(N1, AllOnes)) {
12546 Invert = false;
12547 OtherOp = N2;
12548 return true;
12549 }
12550 if (isZeroOrAllOnes(N2, AllOnes)) {
12551 Invert = true;
12552 OtherOp = N1;
12553 return true;
12554 }
12555 return false;
12556 }
12557 case ISD::ZERO_EXTEND:
12558 // (zext cc) can never be the all ones value.
12559 if (AllOnes)
12560 return false;
12561 [[fallthrough]];
12562 case ISD::SIGN_EXTEND: {
12563 SDLoc dl(N);
12564 EVT VT = N->getValueType(0);
12565 CC = N->getOperand(0);
12566 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12567 return false;
12568 Invert = !AllOnes;
12569 if (AllOnes)
12570 // When looking for an AllOnes constant, N is an sext, and the 'other'
12571 // value is 0.
12572 OtherOp = DAG.getConstant(0, dl, VT);
12573 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12574 // When looking for a 0 constant, N can be zext or sext.
12575 OtherOp = DAG.getConstant(1, dl, VT);
12576 else
12577 OtherOp = DAG.getAllOnesConstant(dl, VT);
12578 return true;
12579 }
12580 }
12581}
12582
12583// Combine a constant select operand into its use:
12584//
12585// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12586// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12587// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12588// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12589// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12590//
12591// The transform is rejected if the select doesn't have a constant operand that
12592// is null, or all ones when AllOnes is set.
12593//
12594// Also recognize sext/zext from i1:
12595//
12596// (add (zext cc), x) -> (select cc (add x, 1), x)
12597// (add (sext cc), x) -> (select cc (add x, -1), x)
12598//
12599// These transformations eventually create predicated instructions.
12600//
12601// @param N The node to transform.
12602// @param Slct The N operand that is a select.
12603// @param OtherOp The other N operand (x above).
12604// @param DCI Context.
12605// @param AllOnes Require the select constant to be all ones instead of null.
12606// @returns The new node, or SDValue() on failure.
12607static
12608SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12609 TargetLowering::DAGCombinerInfo &DCI,
12610 bool AllOnes = false) {
12611 SelectionDAG &DAG = DCI.DAG;
12612 EVT VT = N->getValueType(0);
12613 SDValue NonConstantVal;
12614 SDValue CCOp;
12615 bool SwapSelectOps;
12616 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12617 NonConstantVal, DAG))
12618 return SDValue();
12619
12620 // Slct is now known to be the desired identity constant when CC is true.
12621 SDValue TrueVal = OtherOp;
12622 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12623 OtherOp, NonConstantVal);
12624 // Unless SwapSelectOps says CC should be false.
12625 if (SwapSelectOps)
12626 std::swap(TrueVal, FalseVal);
12627
12628 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12629 CCOp, TrueVal, FalseVal);
12630}
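// Worked example (sketch): N = (add x, (select cc, 0, 7)) with AllOnes=false.
// The select's true operand is the identity value 0, so the combine returns
//   (select cc, x, (add x, 7))
// which instruction selection can later turn into a predicated add rather
// than a select.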
12631
12632// Attempt combineSelectAndUse on each operand of a commutative operator N.
12633static
12634SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12635 TargetLowering::DAGCombinerInfo &DCI) {
12636 SDValue N0 = N->getOperand(0);
12637 SDValue N1 = N->getOperand(1);
12638 if (N0.getNode()->hasOneUse())
12639 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12640 return Result;
12641 if (N1.getNode()->hasOneUse())
12642 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12643 return Result;
12644 return SDValue();
12645}
12646
12647static bool IsVUZPShuffleNode(SDNode *N) {
12648 // VUZP shuffle node.
12649 if (N->getOpcode() == ARMISD::VUZP)
12650 return true;
12651
12652 // "VUZP" on i32 is an alias for VTRN.
12653 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12654 return true;
12655
12656 return false;
12657}
12658
12659static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
12660 TargetLowering::DAGCombinerInfo &DCI,
12661 const ARMSubtarget *Subtarget) {
12662 // Look for ADD(VUZP.0, VUZP.1).
12663 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12664 N0 == N1)
12665 return SDValue();
12666
12667 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12668 if (!N->getValueType(0).is64BitVector())
12669 return SDValue();
12670
12671 // Generate vpadd.
12672 SelectionDAG &DAG = DCI.DAG;
12673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12674 SDLoc dl(N);
12675 SDNode *Unzip = N0.getNode();
12676 EVT VT = N->getValueType(0);
12677
12679 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12680 TLI.getPointerTy(DAG.getDataLayout())));
12681 Ops.push_back(Unzip->getOperand(0));
12682 Ops.push_back(Unzip->getOperand(1));
12683
12684 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12685}
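// Example (sketch): with 64-bit vector operands a and b,
//   %u0, %u1 = VUZP a, b      ; even lanes / odd lanes
//   add %u0, %u1
// adds each pair of neighbouring elements of a and of b, which is exactly a
// pairwise add, so it is rewritten to the arm.neon.vpadd intrinsic
// ("vpadd.iN d, a, b").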
12686
12687static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12688 TargetLowering::DAGCombinerInfo &DCI,
12689 const ARMSubtarget *Subtarget) {
12690 // Check for two extended operands.
12691 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12692 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12693 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12694 N1.getOpcode() == ISD::ZERO_EXTEND))
12695 return SDValue();
12696
12697 SDValue N00 = N0.getOperand(0);
12698 SDValue N10 = N1.getOperand(0);
12699
12700 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12701 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12702 N00 == N10)
12703 return SDValue();
12704
12705 // We only recognize Q register paddl here; this can't be reached until
12706 // after type legalization.
12707 if (!N00.getValueType().is64BitVector() ||
12709 return SDValue();
12710
12711 // Generate vpaddl.
12712 SelectionDAG &DAG = DCI.DAG;
12713 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12714 SDLoc dl(N);
12715 EVT VT = N->getValueType(0);
12716
12718 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12719 unsigned Opcode;
12720 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12721 Opcode = Intrinsic::arm_neon_vpaddls;
12722 else
12723 Opcode = Intrinsic::arm_neon_vpaddlu;
12724 Ops.push_back(DAG.getConstant(Opcode, dl,
12725 TLI.getPointerTy(DAG.getDataLayout())));
12726 EVT ElemTy = N00.getValueType().getVectorElementType();
12727 unsigned NumElts = VT.getVectorNumElements();
12728 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12729 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12730 N00.getOperand(0), N00.getOperand(1));
12731 Ops.push_back(Concat);
12732
12733 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12734}
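// Example (sketch): for a v4i32 result with v4i16 inputs a and b,
//   add (sext (VUZP.0 a, b)), (sext (VUZP.1 a, b))
// sums each pair of neighbouring i16 elements of the concatenation of a and
// b, so it becomes arm.neon.vpaddls on concat(a, b) ("vpaddl.s16 q, q").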
12735
12736// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12737// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12738// much easier to match.
12739static SDValue
12740AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12741 TargetLowering::DAGCombinerInfo &DCI,
12742 const ARMSubtarget *Subtarget) {
12743 // Only perform this optimization after legalization and if NEON is
12744 // available. We also expect both operands to be BUILD_VECTORs.
12745 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12746 || N0.getOpcode() != ISD::BUILD_VECTOR
12747 || N1.getOpcode() != ISD::BUILD_VECTOR)
12748 return SDValue();
12749
12750 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12751 EVT VT = N->getValueType(0);
12752 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12753 return SDValue();
12754
12755 // Check that the vector operands are of the right form.
12756 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12757 // operands, where N is the size of the formed vector.
12758 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12759 // index such that we have a pairwise add pattern.
12760
12761 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12763 return SDValue();
12764 SDValue Vec = N0->getOperand(0)->getOperand(0);
12765 SDNode *V = Vec.getNode();
12766 unsigned nextIndex = 0;
12767
12768 // For each operands to the ADD which are BUILD_VECTORs,
12769 // check to see if each of their operands are an EXTRACT_VECTOR with
12770 // the same vector and appropriate index.
12771 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12774
12775 SDValue ExtVec0 = N0->getOperand(i);
12776 SDValue ExtVec1 = N1->getOperand(i);
12777
12778 // First operand is the vector; verify it's the same.
12779 if (V != ExtVec0->getOperand(0).getNode() ||
12780 V != ExtVec1->getOperand(0).getNode())
12781 return SDValue();
12782
12783 // Second operand is the constant; verify it's correct.
12784 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12785 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12786
12787 // For the constant, we want to see all the even or all the odd.
12788 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12789 || C1->getZExtValue() != nextIndex+1)
12790 return SDValue();
12791
12792 // Increment index.
12793 nextIndex+=2;
12794 } else
12795 return SDValue();
12796 }
12797
12798 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12799 // we're using the entire input vector, otherwise there's a size/legality
12800 // mismatch somewhere.
12801 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12803 return SDValue();
12804
12805 // Create VPADDL node.
12806 SelectionDAG &DAG = DCI.DAG;
12807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12808
12809 SDLoc dl(N);
12810
12811 // Build operand list.
12813 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12814 TLI.getPointerTy(DAG.getDataLayout())));
12815
12816 // Input is the vector.
12817 Ops.push_back(Vec);
12818
12819 // Get widened type and narrowed type.
12820 MVT widenType;
12821 unsigned numElem = VT.getVectorNumElements();
12822
12823 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12824 switch (inputLaneType.getSimpleVT().SimpleTy) {
12825 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12826 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12827 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12828 default:
12829 llvm_unreachable("Invalid vector element type for padd optimization.");
12830 }
12831
12832 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12833 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12834 return DAG.getNode(ExtOp, dl, VT, tmp);
12835}
12836
12837static SDValue findMUL_LOHI(SDValue V) {
12838 if (V->getOpcode() == ISD::UMUL_LOHI ||
12839 V->getOpcode() == ISD::SMUL_LOHI)
12840 return V;
12841 return SDValue();
12842}
12843
12844static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12845 TargetLowering::DAGCombinerInfo &DCI,
12846 const ARMSubtarget *Subtarget) {
12847 if (!Subtarget->hasBaseDSP())
12848 return SDValue();
12849
12850 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12851 // accumulates the product into a 64-bit value. The 16-bit values will
12852 // be sign extended somehow or SRA'd into 32-bit values
12853 // (addc (adde (mul 16bit, 16bit), lo), hi)
12854 SDValue Mul = AddcNode->getOperand(0);
12855 SDValue Lo = AddcNode->getOperand(1);
12856 if (Mul.getOpcode() != ISD::MUL) {
12857 Lo = AddcNode->getOperand(0);
12858 Mul = AddcNode->getOperand(1);
12859 if (Mul.getOpcode() != ISD::MUL)
12860 return SDValue();
12861 }
12862
12863 SDValue SRA = AddeNode->getOperand(0);
12864 SDValue Hi = AddeNode->getOperand(1);
12865 if (SRA.getOpcode() != ISD::SRA) {
12866 SRA = AddeNode->getOperand(1);
12867 Hi = AddeNode->getOperand(0);
12868 if (SRA.getOpcode() != ISD::SRA)
12869 return SDValue();
12870 }
12871 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12872 if (Const->getZExtValue() != 31)
12873 return SDValue();
12874 } else
12875 return SDValue();
12876
12877 if (SRA.getOperand(0) != Mul)
12878 return SDValue();
12879
12880 SelectionDAG &DAG = DCI.DAG;
12881 SDLoc dl(AddcNode);
12882 unsigned Opcode = 0;
12883 SDValue Op0;
12884 SDValue Op1;
12885
12886 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12887 Opcode = ARMISD::SMLALBB;
12888 Op0 = Mul.getOperand(0);
12889 Op1 = Mul.getOperand(1);
12890 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12891 Opcode = ARMISD::SMLALBT;
12892 Op0 = Mul.getOperand(0);
12893 Op1 = Mul.getOperand(1).getOperand(0);
12894 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12895 Opcode = ARMISD::SMLALTB;
12896 Op0 = Mul.getOperand(0).getOperand(0);
12897 Op1 = Mul.getOperand(1);
12898 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12899 Opcode = ARMISD::SMLALTT;
12900 Op0 = Mul->getOperand(0).getOperand(0);
12901 Op1 = Mul->getOperand(1).getOperand(0);
12902 }
12903
12904 if (!Op0 || !Op1)
12905 return SDValue();
12906
12907 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12908 Op0, Op1, Lo, Hi);
12909 // Replace the ADDs' nodes uses by the MLA node's values.
12910 SDValue HiMLALResult(SMLAL.getNode(), 1);
12911 SDValue LoMLALResult(SMLAL.getNode(), 0);
12912
12913 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12914 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12915
12916 // Return original node to notify the driver to stop replacing.
12917 SDValue resNode(AddcNode, 0);
12918 return resNode;
12919}
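// Example (sketch): for i32 values a and b that are really sign-extended
// 16-bit quantities,
//   lo' = ADDC (mul a, b), lo
//   hi' = ADDE (sra (mul a, b), 31), hi
// accumulates the 16x16 product into a 64-bit value, so the pair is replaced
// by a single SMLALBB node ("smlalbb lo, hi, a, b"); the BT/TB/TT variants
// cover operands taken from the top half via an arithmetic shift right by 16.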
12920
12921static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12922 TargetLowering::DAGCombinerInfo &DCI,
12923 const ARMSubtarget *Subtarget) {
12924 // Look for multiply add opportunities.
12925 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12926 // each add node consumes a value from ISD::UMUL_LOHI and there is
12927 // a glue link from the first add to the second add.
12928 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12929 // a S/UMLAL instruction.
12930 // UMUL_LOHI
12931 // / :lo \ :hi
12932 // V \ [no multiline comment]
12933 // loAdd -> ADDC |
12934 // \ :carry /
12935 // V V
12936 // ADDE <- hiAdd
12937 //
12938 // In the special case where only the higher part of a signed result is used
12939 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12940 // a constant with the exact value of 0x80000000, we recognize we are dealing
12941 // with a "rounded multiply and add" (or subtract) and transform it into
12942 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12943
12944 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12945 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12946 "Expect an ADDE or SUBE");
12947
12948 assert(AddeSubeNode->getNumOperands() == 3 &&
12949 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12950 "ADDE node has the wrong inputs");
12951
12952 // Check that we are chained to the right ADDC or SUBC node.
12953 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12954 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12955 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12956 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12957 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12958 return SDValue();
12959
12960 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12961 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12962
12963 // Check if the two operands are from the same mul_lohi node.
12964 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12965 return SDValue();
12966
12967 assert(AddcSubcNode->getNumValues() == 2 &&
12968 AddcSubcNode->getValueType(0) == MVT::i32 &&
12969 "Expect ADDC with two result values. First: i32");
12970
12971 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12972 // may be an SMLAL which multiplies two 16-bit values.
12973 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12974 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12975 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12976 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12977 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12978 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12979
12980 // Check for the triangle shape.
12981 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12982 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12983
12984 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12985 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12986 return SDValue();
12987
12988 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12989 bool IsLeftOperandMUL = false;
12990 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12991 if (MULOp == SDValue())
12992 MULOp = findMUL_LOHI(AddeSubeOp1);
12993 else
12994 IsLeftOperandMUL = true;
12995 if (MULOp == SDValue())
12996 return SDValue();
12997
12998 // Figure out the right opcode.
12999 unsigned Opc = MULOp->getOpcode();
13000 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
13001
13002 // Figure out the high and low input values to the MLAL node.
13003 SDValue *HiAddSub = nullptr;
13004 SDValue *LoMul = nullptr;
13005 SDValue *LowAddSub = nullptr;
13006
13007 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
13008 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
13009 return SDValue();
13010
13011 if (IsLeftOperandMUL)
13012 HiAddSub = &AddeSubeOp1;
13013 else
13014 HiAddSub = &AddeSubeOp0;
13015
13016 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
13017 // whose low result is fed to the ADDC/SUBC we are checking.
13018
13019 if (AddcSubcOp0 == MULOp.getValue(0)) {
13020 LoMul = &AddcSubcOp0;
13021 LowAddSub = &AddcSubcOp1;
13022 }
13023 if (AddcSubcOp1 == MULOp.getValue(0)) {
13024 LoMul = &AddcSubcOp1;
13025 LowAddSub = &AddcSubcOp0;
13026 }
13027
13028 if (!LoMul)
13029 return SDValue();
13030
13031 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
13032 // the replacement below will create a cycle.
13033 if (AddcSubcNode == HiAddSub->getNode() ||
13034 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
13035 return SDValue();
13036
13037 // Create the merged node.
13038 SelectionDAG &DAG = DCI.DAG;
13039
13040 // Start building operand list.
13042 Ops.push_back(LoMul->getOperand(0));
13043 Ops.push_back(LoMul->getOperand(1));
13044
13045 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
13046 // the case, we must be doing signed multiplication and only use the higher
13047 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
13048 // addition or subtraction with the value of 0x80000000.
13049 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
13050 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
13051 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
13052 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
13053 0x80000000) {
13054 Ops.push_back(*HiAddSub);
13055 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
13056 FinalOpc = ARMISD::SMMLSR;
13057 } else {
13058 FinalOpc = ARMISD::SMMLAR;
13059 }
13060 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
13061 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
13062
13063 return SDValue(AddeSubeNode, 0);
13064 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
13065 // SMMLS is generated during instruction selection and the rest of this
13066 // function can not handle the case where AddcSubcNode is a SUBC.
13067 return SDValue();
13068
13069 // Finish building the operand list for {U/S}MLAL
13070 Ops.push_back(*LowAddSub);
13071 Ops.push_back(*HiAddSub);
13072
13073 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13074 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13075
13076 // Replace the ADDs' nodes uses by the MLA node's values.
13077 SDValue HiMLALResult(MLALNode.getNode(), 1);
13078 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13079
13080 SDValue LoMLALResult(MLALNode.getNode(), 0);
13081 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13082
13083 // Return original node to notify the driver to stop replacing.
13084 return SDValue(AddeSubeNode, 0);
13085}
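// Example (sketch): the canonical 64-bit multiply-accumulate
//   {hi, lo} = UMUL_LOHI a, b
//   lo'      = ADDC lo, accLo
//   hi'      = ADDE hi, accHi, carry
// collapses into one UMLAL node (SMLAL for SMUL_LOHI), i.e.
//   umlal accLo, accHi, a, b
// and the SMMLAR/SMMLSR special case fires when only the high half is used
// and the low-part addend is the rounding constant 0x80000000.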
13086
13087static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13088 TargetLowering::DAGCombinerInfo &DCI,
13089 const ARMSubtarget *Subtarget) {
13090 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13091 // While trying to combine for the other MLAL nodes, first search for the
13092 // chance to use UMAAL. Check if Addc uses a node which has already
13093 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13094 // as the addend, and it's handled in PerformUMLALCombine.
13095
13096 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13097 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13098
13099 // Check that we have a glued ADDC node.
13100 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13101 if (AddcNode->getOpcode() != ARMISD::ADDC)
13102 return SDValue();
13103
13104 // Find the converted UMAAL or quit if it doesn't exist.
13105 SDNode *UmlalNode = nullptr;
13106 SDValue AddHi;
13107 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13108 UmlalNode = AddcNode->getOperand(0).getNode();
13109 AddHi = AddcNode->getOperand(1);
13110 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13111 UmlalNode = AddcNode->getOperand(1).getNode();
13112 AddHi = AddcNode->getOperand(0);
13113 } else {
13114 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13115 }
13116
13117 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13118 // the ADDC as well as Zero.
13119 if (!isNullConstant(UmlalNode->getOperand(3)))
13120 return SDValue();
13121
13122 if ((isNullConstant(AddeNode->getOperand(0)) &&
13123 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13124 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13125 isNullConstant(AddeNode->getOperand(1)))) {
13126 SelectionDAG &DAG = DCI.DAG;
13127 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13128 UmlalNode->getOperand(2), AddHi };
13129 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13130 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13131
13132 // Replace the ADDs' nodes uses by the UMAAL node's values.
13133 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13134 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13135
13136 // Return original node to notify the driver to stop replacing.
13137 return SDValue(AddeNode, 0);
13138 }
13139 return SDValue();
13140}
13141
13142static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13143 const ARMSubtarget *Subtarget) {
13144 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13145 return SDValue();
13146
13147 // Check that we have a pair of ADDC and ADDE as operands.
13148 // Both addends of the ADDE must be zero.
13149 SDNode* AddcNode = N->getOperand(2).getNode();
13150 SDNode* AddeNode = N->getOperand(3).getNode();
13151 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13152 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13153 isNullConstant(AddeNode->getOperand(0)) &&
13154 isNullConstant(AddeNode->getOperand(1)) &&
13155 (AddeNode->getOperand(2).getNode() == AddcNode))
13156 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13157 DAG.getVTList(MVT::i32, MVT::i32),
13158 {N->getOperand(0), N->getOperand(1),
13159 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13160 else
13161 return SDValue();
13162}
13163
13164static SDValue PerformAddcSubcCombine(SDNode *N,
13165 TargetLowering::DAGCombinerInfo &DCI,
13166 const ARMSubtarget *Subtarget) {
13167 SelectionDAG &DAG(DCI.DAG);
13168
13169 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13170 // (SUBC (ADDE 0, 0, C), 1) -> C
13171 SDValue LHS = N->getOperand(0);
13172 SDValue RHS = N->getOperand(1);
13173 if (LHS->getOpcode() == ARMISD::ADDE &&
13174 isNullConstant(LHS->getOperand(0)) &&
13175 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13176 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13177 }
13178 }
13179
13180 if (Subtarget->isThumb1Only()) {
13181 SDValue RHS = N->getOperand(1);
13182 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13183 int32_t imm = C->getSExtValue();
13184 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13185 SDLoc DL(N);
13186 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13187 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13188 : ARMISD::ADDC;
13189 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13190 }
13191 }
13192 }
13193
13194 return SDValue();
13195}
13196
13197static SDValue PerformAddeSubeCombine(SDNode *N,
13198 TargetLowering::DAGCombinerInfo &DCI,
13199 const ARMSubtarget *Subtarget) {
13200 if (Subtarget->isThumb1Only()) {
13201 SelectionDAG &DAG = DCI.DAG;
13202 SDValue RHS = N->getOperand(1);
13203 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13204 int64_t imm = C->getSExtValue();
13205 if (imm < 0) {
13206 SDLoc DL(N);
13207
13208 // The with-carry-in form matches bitwise not instead of the negation.
13209 // Effectively, the inverse interpretation of the carry flag already
13210 // accounts for part of the negation.
13211 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13212
13213 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13214 : ARMISD::ADDE;
13215 return DAG.getNode(Opcode, DL, N->getVTList(),
13216 N->getOperand(0), RHS, N->getOperand(2));
13217 }
13218 }
13219 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13220 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13221 }
13222 return SDValue();
13223}
13224
13225static SDValue PerformSELECTCombine(SDNode *N,
13226 TargetLowering::DAGCombinerInfo &DCI,
13227 const ARMSubtarget *Subtarget) {
13228 if (!Subtarget->hasMVEIntegerOps())
13229 return SDValue();
13230
13231 SDLoc dl(N);
13232 SDValue SetCC;
13233 SDValue LHS;
13234 SDValue RHS;
13235 ISD::CondCode CC;
13236 SDValue TrueVal;
13237 SDValue FalseVal;
13238
13239 if (N->getOpcode() == ISD::SELECT &&
13240 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13241 SetCC = N->getOperand(0);
13242 LHS = SetCC->getOperand(0);
13243 RHS = SetCC->getOperand(1);
13244 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13245 TrueVal = N->getOperand(1);
13246 FalseVal = N->getOperand(2);
13247 } else if (N->getOpcode() == ISD::SELECT_CC) {
13248 LHS = N->getOperand(0);
13249 RHS = N->getOperand(1);
13250 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13251 TrueVal = N->getOperand(2);
13252 FalseVal = N->getOperand(3);
13253 } else {
13254 return SDValue();
13255 }
13256
13257 unsigned int Opcode = 0;
13258 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13259 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13260 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13261 Opcode = ARMISD::VMINVu;
13262 if (CC == ISD::SETUGT)
13263 std::swap(TrueVal, FalseVal);
13264 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13265 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13266 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13267 Opcode = ARMISD::VMINVs;
13268 if (CC == ISD::SETGT)
13269 std::swap(TrueVal, FalseVal);
13270 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13271 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13272 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13273 Opcode = ARMISD::VMAXVu;
13274 if (CC == ISD::SETULT)
13275 std::swap(TrueVal, FalseVal);
13276 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13277 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13278 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13279 Opcode = ARMISD::VMAXVs;
13280 if (CC == ISD::SETLT)
13281 std::swap(TrueVal, FalseVal);
13282 } else
13283 return SDValue();
13284
13285 // Normalise to the right hand side being the vector reduction
13286 switch (TrueVal->getOpcode()) {
13287 case ISD::VECREDUCE_UMIN:
13288 case ISD::VECREDUCE_SMIN:
13289 case ISD::VECREDUCE_UMAX:
13290 case ISD::VECREDUCE_SMAX:
13291 std::swap(LHS, RHS);
13292 std::swap(TrueVal, FalseVal);
13293 break;
13294 }
13295
13296 EVT VectorType = FalseVal->getOperand(0).getValueType();
13297
13298 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13299 VectorType != MVT::v4i32)
13300 return SDValue();
13301
13302 EVT VectorScalarType = VectorType.getVectorElementType();
13303
13304 // The values being selected must also be the ones being compared
13305 if (TrueVal != LHS || FalseVal != RHS)
13306 return SDValue();
13307
13308 EVT LeftType = LHS->getValueType(0);
13309 EVT RightType = RHS->getValueType(0);
13310
13311 // The types must match the reduced type too
13312 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13313 return SDValue();
13314
13315 // Legalise the scalar to an i32
13316 if (VectorScalarType != MVT::i32)
13317 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13318
13319 // Generate the reduction as an i32 for legalisation purposes
13320 auto Reduction =
13321 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13322
13323 // The result isn't actually an i32 so truncate it back to its original type
13324 if (VectorScalarType != MVT::i32)
13325 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13326
13327 return Reduction;
13328}
13329
13330// A special combine for the vqdmulh family of instructions. This is one of the
13331// potential set of patterns that could match this instruction. The base pattern
13332// you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13333// This matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13334// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13335// the max is unnecessary.
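// Illustrative IR-level example (assumed input shape, not from the original
// source): for <8 x i16> inputs x and y,
//   %a = sext <8 x i16> %x to <8 x i32>
//   %b = sext <8 x i16> %y to <8 x i32>
//   %m = mul <8 x i32> %a, %b
//   %s = ashr <8 x i32> %m, splat (i32 15)
//   %r = smin <8 x i32> %s, splat (i32 32767)
// is the kind of pattern the code below turns into an ARMISD::VQDMULH of the
// two i16 vectors, sign-extended back to the wide type.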
13336static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13337 EVT VT = N->getValueType(0);
13338 SDValue Shft;
13339 ConstantSDNode *Clamp;
13340
13341 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13342 return SDValue();
13343
13344 if (N->getOpcode() == ISD::SMIN) {
13345 Shft = N->getOperand(0);
13346 Clamp = isConstOrConstSplat(N->getOperand(1));
13347 } else if (N->getOpcode() == ISD::VSELECT) {
13348 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13349 SDValue Cmp = N->getOperand(0);
13350 if (Cmp.getOpcode() != ISD::SETCC ||
13351 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13352 Cmp.getOperand(0) != N->getOperand(1) ||
13353 Cmp.getOperand(1) != N->getOperand(2))
13354 return SDValue();
13355 Shft = N->getOperand(1);
13356 Clamp = isConstOrConstSplat(N->getOperand(2));
13357 } else
13358 return SDValue();
13359
13360 if (!Clamp)
13361 return SDValue();
13362
13363 MVT ScalarType;
13364 int ShftAmt = 0;
13365 switch (Clamp->getSExtValue()) {
13366 case (1 << 7) - 1:
13367 ScalarType = MVT::i8;
13368 ShftAmt = 7;
13369 break;
13370 case (1 << 15) - 1:
13371 ScalarType = MVT::i16;
13372 ShftAmt = 15;
13373 break;
13374 case (1ULL << 31) - 1:
13375 ScalarType = MVT::i32;
13376 ShftAmt = 31;
13377 break;
13378 default:
13379 return SDValue();
13380 }
13381
13382 if (Shft.getOpcode() != ISD::SRA)
13383 return SDValue();
13384 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13385 if (!N1 || N1->getSExtValue() != ShftAmt)
13386 return SDValue();
13387
13388 SDValue Mul = Shft.getOperand(0);
13389 if (Mul.getOpcode() != ISD::MUL)
13390 return SDValue();
13391
13392 SDValue Ext0 = Mul.getOperand(0);
13393 SDValue Ext1 = Mul.getOperand(1);
13394 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13395 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13396 return SDValue();
13397 EVT VecVT = Ext0.getOperand(0).getValueType();
13398 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13399 return SDValue();
13400 if (Ext1.getOperand(0).getValueType() != VecVT ||
13401 VecVT.getScalarType() != ScalarType ||
13402 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13403 return SDValue();
13404
13405 SDLoc DL(Mul);
13406 unsigned LegalLanes = 128 / (ShftAmt + 1);
13407 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13408 // For types smaller than legal vectors, extend to a legal type and only use
13409 // the needed lanes.
13410 if (VecVT.getSizeInBits() < 128) {
13411 EVT ExtVecVT =
13412 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13413 VecVT.getVectorNumElements());
13414 SDValue Inp0 =
13415 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13416 SDValue Inp1 =
13417 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13418 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13419 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13420 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13421 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13422 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13423 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13424 }
13425
13426 // For larger types, split into legal sized chunks.
13427 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13428 unsigned NumParts = VecVT.getSizeInBits() / 128;
13429 SmallVector<SDValue> Parts;
13430 for (unsigned I = 0; I < NumParts; ++I) {
13431 SDValue Inp0 =
13432 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13433 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13434 SDValue Inp1 =
13435 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13436 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13437 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13438 Parts.push_back(VQDMULH);
13439 }
13440 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13441 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13442}
13443
13444static SDValue PerformVSELECTCombine(SDNode *N,
13445 TargetLowering::DAGCombinerInfo &DCI,
13446 const ARMSubtarget *Subtarget) {
13447 if (!Subtarget->hasMVEIntegerOps())
13448 return SDValue();
13449
13450 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13451 return V;
13452
13453 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13454 //
13455 // We need to re-implement this optimization here as the implementation in the
13456 // Target-Independent DAGCombiner does not handle the kind of constant we make
13457 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13458 // good reason, allowing truncation there would break other targets).
13459 //
13460 // Currently, this is only done for MVE, as it's the only target that benefits
13461 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
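// Illustrative only: vselect(xor(c, splat(1)), a, b) becomes vselect(c, b, a)
// below, so a VPNOT feeding a VPSEL folds away into a single VPSEL.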
13462 if (N->getOperand(0).getOpcode() != ISD::XOR)
13463 return SDValue();
13464 SDValue XOR = N->getOperand(0);
13465
13466 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13467 // It is important to check with truncation allowed as the BUILD_VECTORs we
13468 // generate in those situations will truncate their operands.
13469 ConstantSDNode *Const =
13470 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13471 /*AllowTruncation*/ true);
13472 if (!Const || !Const->isOne())
13473 return SDValue();
13474
13475 // Rewrite into vselect(cond, rhs, lhs).
13476 SDValue Cond = XOR->getOperand(0);
13477 SDValue LHS = N->getOperand(1);
13478 SDValue RHS = N->getOperand(2);
13479 EVT Type = N->getValueType(0);
13480 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13481}
13482
13483// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
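// Illustrative only: for a v8i1 result, setcc(<0,1,...,7>, splat(n), ult)
// yields a predicate whose first n lanes (up to 8) are true, which is what
// the llvm.arm.mve.vctp16 intrinsic produces for n.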
13484static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13485 TargetLowering::DAGCombinerInfo &DCI,
13486 const ARMSubtarget *Subtarget) {
13487 SDValue Op0 = N->getOperand(0);
13488 SDValue Op1 = N->getOperand(1);
13489 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13490 EVT VT = N->getValueType(0);
13491
13492 if (!Subtarget->hasMVEIntegerOps() ||
13493 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13494 return SDValue();
13495
13496 if (CC == ISD::SETUGE) {
13497 std::swap(Op0, Op1);
13498 CC = ISD::SETULT;
13499 }
13500
13501 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13502 Op0.getOpcode() != ISD::BUILD_VECTOR)
13503 return SDValue();
13504
13505 // Check first operand is BuildVector of 0,1,2,...
13506 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13507 if (!Op0.getOperand(I).isUndef() &&
13508 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13509 Op0.getConstantOperandVal(I) == I))
13510 return SDValue();
13511 }
13512
13513 // The second is a Splat of Op1S
13514 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13515 if (!Op1S)
13516 return SDValue();
13517
13518 unsigned Opc;
13519 switch (VT.getVectorNumElements()) {
13520 case 2:
13521 Opc = Intrinsic::arm_mve_vctp64;
13522 break;
13523 case 4:
13524 Opc = Intrinsic::arm_mve_vctp32;
13525 break;
13526 case 8:
13527 Opc = Intrinsic::arm_mve_vctp16;
13528 break;
13529 case 16:
13530 Opc = Intrinsic::arm_mve_vctp8;
13531 break;
13532 default:
13533 return SDValue();
13534 }
13535
13536 SDLoc DL(N);
13537 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13538 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13539 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13540}
13541
13542/// PerformADDECombine - Target-specific dag combine transform from
13543/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13544/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13545static SDValue PerformADDECombine(SDNode *N,
13546 TargetLowering::DAGCombinerInfo &DCI,
13547 const ARMSubtarget *Subtarget) {
13548 // Only ARM and Thumb2 support UMLAL/SMLAL.
13549 if (Subtarget->isThumb1Only())
13550 return PerformAddeSubeCombine(N, DCI, Subtarget);
13551
13552 // Only perform the checks after legalize when the pattern is available.
13553 if (DCI.isBeforeLegalize()) return SDValue();
13554
13555 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13556}
13557
13558/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13559/// operands N0 and N1. This is a helper for PerformADDCombine that is
13560/// called with the default operands, and if that fails, with commuted
13561/// operands.
13562static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13563 TargetLowering::DAGCombinerInfo &DCI,
13564 const ARMSubtarget *Subtarget){
13565 // Attempt to create vpadd for this add.
13566 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13567 return Result;
13568
13569 // Attempt to create vpaddl for this add.
13570 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13571 return Result;
13572 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13573 Subtarget))
13574 return Result;
13575
13576 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13577 if (N0.getNode()->hasOneUse())
13578 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13579 return Result;
13580 return SDValue();
13581}
13582
13583static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13584 EVT VT = N->getValueType(0);
13585 SDValue N0 = N->getOperand(0);
13586 SDValue N1 = N->getOperand(1);
13587 SDLoc dl(N);
13588
13589 auto IsVecReduce = [](SDValue Op) {
13590 switch (Op.getOpcode()) {
13591 case ISD::VECREDUCE_ADD:
13592 case ARMISD::VADDVs:
13593 case ARMISD::VADDVu:
13594 case ARMISD::VMLAVs:
13595 case ARMISD::VMLAVu:
13596 return true;
13597 }
13598 return false;
13599 };
13600
13601 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13602 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13603 // add(add(X, vecreduce(Y)), vecreduce(Z))
13604 // to make better use of vaddva style instructions.
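// Illustrative only: with scalar X and vectors Y and Z,
//   X + (vecreduce.add(Y) + vecreduce.add(Z))
// becomes (X + vecreduce.add(Y)) + vecreduce.add(Z), so both reductions can
// later be folded into accumulating VADDVA forms.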
13605 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13606 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13607 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13608 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13609 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13610 }
13611 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13612 // add(add(add(A, C), reduce(B)), reduce(D))
13613 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13614 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13615 unsigned N0RedOp = 0;
13616 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13617 N0RedOp = 1;
13618 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13619 return SDValue();
13620 }
13621
13622 unsigned N1RedOp = 0;
13623 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13624 N1RedOp = 1;
13625 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13626 return SDValue();
13627
13628 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13629 N1.getOperand(1 - N1RedOp));
13630 SDValue Add1 =
13631 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13632 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13633 }
13634 return SDValue();
13635 };
13636 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13637 return R;
13638 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13639 return R;
13640
13641 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13642 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13643 // by ascending load offsets. This can help cores prefetch if the order of
13644 // loads is more predictable.
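// Illustrative only: if one reduction reads from [p] and the other from
// [p+16], the adds are reassociated so the final sequence reduces the load at
// the lower address first and accumulates the higher-addressed one on top,
// keeping the load offsets in ascending order.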
13645 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13646 // Check if two reductions are known to load data where one is before/after
13647 // another. Return negative if N0 loads data before N1, positive if N1 is
13648 // before N0, and 0 if nothing is known.
13649 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13650 // Look through to the first operand of a MUL, for the VMLA case.
13651 // Currently only looks at the first operand, in the hope they are equal.
13652 if (N0.getOpcode() == ISD::MUL)
13653 N0 = N0.getOperand(0);
13654 if (N1.getOpcode() == ISD::MUL)
13655 N1 = N1.getOperand(0);
13656
13657 // Return true if the two operands are loads to the same object and the
13658 // offset of the first is known to be less than the offset of the second.
13659 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13660 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13661 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13662 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13663 Load1->isIndexed())
13664 return 0;
13665
13666 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13667 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13668
13669 if (!BaseLocDecomp0.getBase() ||
13670 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13671 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13672 return 0;
13673 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13674 return -1;
13675 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13676 return 1;
13677 return 0;
13678 };
13679
13680 SDValue X;
13681 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13682 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13683 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13684 N0.getOperand(1).getOperand(0));
13685 if (IsBefore < 0) {
13686 X = N0.getOperand(0);
13687 N0 = N0.getOperand(1);
13688 } else if (IsBefore > 0) {
13689 X = N0.getOperand(1);
13690 N0 = N0.getOperand(0);
13691 } else
13692 return SDValue();
13693 } else if (IsVecReduce(N0.getOperand(0))) {
13694 X = N0.getOperand(1);
13695 N0 = N0.getOperand(0);
13696 } else if (IsVecReduce(N0.getOperand(1))) {
13697 X = N0.getOperand(0);
13698 N0 = N0.getOperand(1);
13699 } else
13700 return SDValue();
13701 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13702 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13703 // Note this is backward to how you would expect. We create
13704 // add(reduce(load + 16), reduce(load + 0)) so that the
13705 // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
13706 // the X as VADDV(load + 0)
13707 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13708 } else
13709 return SDValue();
13710
13711 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13712 return SDValue();
13713
13714 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13715 return SDValue();
13716
13717 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13718 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13719 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13720 };
13721 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13722 return R;
13723 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13724 return R;
13725 return SDValue();
13726}
13727
13728static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13729 const ARMSubtarget *Subtarget) {
13730 if (!Subtarget->hasMVEIntegerOps())
13731 return SDValue();
13732
13733 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13734 return R;
13735
13736 EVT VT = N->getValueType(0);
13737 SDValue N0 = N->getOperand(0);
13738 SDValue N1 = N->getOperand(1);
13739 SDLoc dl(N);
13740
13741 if (VT != MVT::i64)
13742 return SDValue();
13743
13744 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13745 // will look like:
13746 // t1: i32,i32 = ARMISD::VADDLVs x
13747 // t2: i64 = build_pair t1, t1:1
13748 // t3: i64 = add t2, y
13749 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13750 // the add to be simplified separately.
13751 // We also need to check for sext / zext and commutative adds.
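// Illustrative sketch (not from the original source) of the rewrite done by
// MakeVecReduce below:
//   t1: i32,i32 = ARMISD::VADDLVu x
//   t2: i64     = build_pair t1, t1:1
//   t3: i64     = add t2, y
// becomes an ARMISD::VADDLVAu whose scalar accumulator operands are the two
// halves of y, folding the i64 add into the accumulating reduction.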
13752 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13753 SDValue NB) {
13754 if (NB->getOpcode() != ISD::BUILD_PAIR)
13755 return SDValue();
13756 SDValue VecRed = NB->getOperand(0);
13757 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13758 VecRed.getResNo() != 0 ||
13759 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13760 return SDValue();
13761
13762 if (VecRed->getOpcode() == OpcodeA) {
13763 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13764 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13765 VecRed.getOperand(0), VecRed.getOperand(1));
13766 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13767 }
13768
13769 SmallVector<SDValue, 4> Ops(2);
13770 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13771
13772 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13773 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13774 Ops.push_back(VecRed->getOperand(I));
13775 SDValue Red =
13776 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13777 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13778 SDValue(Red.getNode(), 1));
13779 };
13780
13781 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13782 return M;
13783 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13784 return M;
13785 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13786 return M;
13787 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13788 return M;
13789 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13790 return M;
13791 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13792 return M;
13793 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13794 return M;
13795 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13796 return M;
13797 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13798 return M;
13799 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13800 return M;
13801 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13802 return M;
13803 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13804 return M;
13805 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13806 return M;
13807 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13808 return M;
13809 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13810 return M;
13811 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13812 return M;
13813 return SDValue();
13814}
13815
13816bool
13817ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13818 CombineLevel Level) const {
13819 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13820 N->getOpcode() == ISD::SRL) &&
13821 "Expected shift op");
13822
13823 SDValue ShiftLHS = N->getOperand(0);
13824 if (!ShiftLHS->hasOneUse())
13825 return false;
13826
13827 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13828 !ShiftLHS.getOperand(0)->hasOneUse())
13829 return false;
13830
13831 if (Level == BeforeLegalizeTypes)
13832 return true;
13833
13834 if (N->getOpcode() != ISD::SHL)
13835 return true;
13836
13837 if (Subtarget->isThumb1Only()) {
13838 // Avoid making expensive immediates by commuting shifts. (This logic
13839 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13840 // for free.)
13841 if (N->getOpcode() != ISD::SHL)
13842 return true;
13843 SDValue N1 = N->getOperand(0);
13844 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13845 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13846 return true;
13847 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13848 if (Const->getAPIntValue().ult(256))
13849 return false;
13850 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13851 Const->getAPIntValue().sgt(-256))
13852 return false;
13853 }
13854 return true;
13855 }
13856
13857 // Turn off commute-with-shift transform after legalization, so it doesn't
13858 // conflict with PerformSHLSimplify. (We could try to detect when
13859 // PerformSHLSimplify would trigger more precisely, but it isn't
13860 // really necessary.)
13861 return false;
13862}
13863
13864bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13865 const SDNode *N) const {
13866 assert(N->getOpcode() == ISD::XOR &&
13867 (N->getOperand(0).getOpcode() == ISD::SHL ||
13868 N->getOperand(0).getOpcode() == ISD::SRL) &&
13869 "Expected XOR(SHIFT) pattern");
13870
13871 // Only commute if the entire NOT mask is a hidden shifted mask.
13872 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13873 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13874 if (XorC && ShiftC) {
13875 unsigned MaskIdx, MaskLen;
13876 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13877 unsigned ShiftAmt = ShiftC->getZExtValue();
13878 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13879 if (N->getOperand(0).getOpcode() == ISD::SHL)
13880 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13881 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13882 }
13883 }
13884
13885 return false;
13886}
13887
13888bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13889 const SDNode *N, CombineLevel Level) const {
13890 assert(((N->getOpcode() == ISD::SHL &&
13891 N->getOperand(0).getOpcode() == ISD::SRL) ||
13892 (N->getOpcode() == ISD::SRL &&
13893 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13894 "Expected shift-shift mask");
13895
13896 if (!Subtarget->isThumb1Only())
13897 return true;
13898
13899 if (Level == BeforeLegalizeTypes)
13900 return true;
13901
13902 return false;
13903}
13904
13905bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13906 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13907 SDValue Y) const {
13908 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13909 SelectOpcode == ISD::VSELECT;
13910}
13911
13913 if (!Subtarget->hasNEON()) {
13914 if (Subtarget->isThumb1Only())
13915 return VT.getScalarSizeInBits() <= 32;
13916 return true;
13917 }
13918 return VT.isScalarInteger();
13919}
13920
13921bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13922 EVT VT) const {
13923 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13924 return false;
13925
13926 switch (FPVT.getSimpleVT().SimpleTy) {
13927 case MVT::f16:
13928 return Subtarget->hasVFP2Base();
13929 case MVT::f32:
13930 return Subtarget->hasVFP2Base();
13931 case MVT::f64:
13932 return Subtarget->hasFP64();
13933 case MVT::v4f32:
13934 case MVT::v8f16:
13935 return Subtarget->hasMVEFloatOps();
13936 default:
13937 return false;
13938 }
13939}
13940
13941static SDValue PerformSHLSimplify(SDNode *N,
13942 TargetLowering::DAGCombinerInfo &DCI,
13943 const ARMSubtarget *ST) {
13944 // Allow the generic combiner to identify potential bswaps.
13945 if (DCI.isBeforeLegalize())
13946 return SDValue();
13947
13948 // DAG combiner will fold:
13949 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13950 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13951 // Other code patterns that can also be modified have the following form:
13952 // b + ((a << 1) | 510)
13953 // b + ((a << 1) & 510)
13954 // b + ((a << 1) ^ 510)
13955 // b + ((a << 1) + 510)
13956
13957 // Many instructions can perform the shift for free, but that requires both
13958 // operands to be registers. If c1 << c2 is too large, a mov immediate
13959 // instruction will be needed. So, unfold back to the original pattern if:
13960 // - c1 and c2 are small enough that they don't require mov imms.
13961 // - the user(s) of the node can perform a shl.
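// Worked example (illustrative, not from the original source): assuming its
// users allow it, (add (shl x, 1), 672) is unfolded below to
// (shl (add x, 336), 1); both 336 and the shift amount are cheap immediates,
// and the final shl can later be folded into a user's shifted-register operand.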
13962
13963 // No shifted operands for 16-bit instructions.
13964 if (ST->isThumb() && ST->isThumb1Only())
13965 return SDValue();
13966
13967 // Check that all the users could perform the shl themselves.
13968 for (auto *U : N->users()) {
13969 switch(U->getOpcode()) {
13970 default:
13971 return SDValue();
13972 case ISD::SUB:
13973 case ISD::ADD:
13974 case ISD::AND:
13975 case ISD::OR:
13976 case ISD::XOR:
13977 case ISD::SETCC:
13978 case ARMISD::CMP:
13979 // Check that the user isn't already using a constant because there
13980 // aren't any instructions that support an immediate operand and a
13981 // shifted operand.
13982 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13983 isa<ConstantSDNode>(U->getOperand(1)))
13984 return SDValue();
13985
13986 // Check that it's not already using a shift.
13987 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13988 U->getOperand(1).getOpcode() == ISD::SHL)
13989 return SDValue();
13990 break;
13991 }
13992 }
13993
13994 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13995 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13996 return SDValue();
13997
13998 if (N->getOperand(0).getOpcode() != ISD::SHL)
13999 return SDValue();
14000
14001 SDValue SHL = N->getOperand(0);
14002
14003 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
14004 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
14005 if (!C1ShlC2 || !C2)
14006 return SDValue();
14007
14008 APInt C2Int = C2->getAPIntValue();
14009 APInt C1Int = C1ShlC2->getAPIntValue();
14010 unsigned C2Width = C2Int.getBitWidth();
14011 if (C2Int.uge(C2Width))
14012 return SDValue();
14013 uint64_t C2Value = C2Int.getZExtValue();
14014
14015 // Check that performing a lshr will not lose any information.
14016 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
14017 if ((C1Int & Mask) != C1Int)
14018 return SDValue();
14019
14020 // Shift the first constant.
14021 C1Int.lshrInPlace(C2Int);
14022
14023 // The immediates are encoded as an 8-bit value that can be rotated.
14024 auto LargeImm = [](const APInt &Imm) {
14025 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
14026 return Imm.getBitWidth() - Zeros > 8;
14027 };
14028
14029 if (LargeImm(C1Int) || LargeImm(C2Int))
14030 return SDValue();
14031
14032 SelectionDAG &DAG = DCI.DAG;
14033 SDLoc dl(N);
14034 SDValue X = SHL.getOperand(0);
14035 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
14036 DAG.getConstant(C1Int, dl, MVT::i32));
14037 // Shift left to compensate for the lshr of C1Int.
14038 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
14039
14040 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
14041 SHL.dump(); N->dump());
14042 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
14043 return Res;
14044}
14045
14046
14047/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
14048///
14049static SDValue PerformADDCombine(SDNode *N,
14050 TargetLowering::DAGCombinerInfo &DCI,
14051 const ARMSubtarget *Subtarget) {
14052 SDValue N0 = N->getOperand(0);
14053 SDValue N1 = N->getOperand(1);
14054
14055 // Only works one way, because it needs an immediate operand.
14056 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14057 return Result;
14058
14059 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
14060 return Result;
14061
14062 // First try with the default operand order.
14063 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
14064 return Result;
14065
14066 // If that didn't work, try again with the operands commuted.
14067 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
14068}
14069
14070// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14071// providing -X is as cheap as X (currently, just a constant).
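// Illustrative only: CSINC selects X or Y+1 and CSINV selects X or ~Y, so
//   (sub 0, (csinc 1, Y, CC))  ==>  (csinv -1, Y, CC)
// because -(Y + 1) == ~Y, and -X is as cheap as X when X is a constant.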
14072static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14073 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14074 return SDValue();
14075 SDValue CSINC = N->getOperand(1);
14076 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14077 return SDValue();
14078
14079 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14080 if (!X)
14081 return SDValue();
14082
14083 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14084 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14085 CSINC.getOperand(0)),
14086 CSINC.getOperand(1), CSINC.getOperand(2),
14087 CSINC.getOperand(3));
14088}
14089
14090/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14091///
14092static SDValue PerformSUBCombine(SDNode *N,
14093 TargetLowering::DAGCombinerInfo &DCI,
14094 const ARMSubtarget *Subtarget) {
14095 SDValue N0 = N->getOperand(0);
14096 SDValue N1 = N->getOperand(1);
14097
14098 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14099 if (N1.getNode()->hasOneUse())
14100 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14101 return Result;
14102
14103 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14104 return R;
14105
14106 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14107 return SDValue();
14108
14109 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14110 // so that we can readily pattern match more mve instructions which can use
14111 // a scalar operand.
14112 SDValue VDup = N->getOperand(1);
14113 if (VDup->getOpcode() != ARMISD::VDUP)
14114 return SDValue();
14115
14116 SDValue VMov = N->getOperand(0);
14117 if (VMov->getOpcode() == ISD::BITCAST)
14118 VMov = VMov->getOperand(0);
14119
14120 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14121 return SDValue();
14122
14123 SDLoc dl(N);
14124 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14125 DCI.DAG.getConstant(0, dl, MVT::i32),
14126 VDup->getOperand(0));
14127 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14128}
14129
14130/// PerformVMULCombine
14131/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14132/// special multiplier accumulator forwarding.
14133/// vmul d3, d0, d2
14134/// vmla d3, d1, d2
14135/// is faster than
14136/// vadd d3, d0, d1
14137/// vmul d3, d3, d2
14138// However, for (A + B) * (A + B),
14139// vadd d2, d0, d1
14140// vmul d3, d0, d2
14141// vmla d3, d1, d2
14142// is slower than
14143// vadd d2, d0, d1
14144// vmul d3, d2, d2
14145static SDValue PerformVMULCombine(SDNode *N,
14146 TargetLowering::DAGCombinerInfo &DCI,
14147 const ARMSubtarget *Subtarget) {
14148 if (!Subtarget->hasVMLxForwarding())
14149 return SDValue();
14150
14151 SelectionDAG &DAG = DCI.DAG;
14152 SDValue N0 = N->getOperand(0);
14153 SDValue N1 = N->getOperand(1);
14154 unsigned Opcode = N0.getOpcode();
14155 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14156 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14157 Opcode = N1.getOpcode();
14158 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14159 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14160 return SDValue();
14161 std::swap(N0, N1);
14162 }
14163
14164 if (N0 == N1)
14165 return SDValue();
14166
14167 EVT VT = N->getValueType(0);
14168 SDLoc DL(N);
14169 SDValue N00 = N0->getOperand(0);
14170 SDValue N01 = N0->getOperand(1);
14171 return DAG.getNode(Opcode, DL, VT,
14172 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14173 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14174}
14175
14176static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14177 const ARMSubtarget *Subtarget) {
14178 EVT VT = N->getValueType(0);
14179 if (VT != MVT::v2i64)
14180 return SDValue();
14181
14182 SDValue N0 = N->getOperand(0);
14183 SDValue N1 = N->getOperand(1);
14184
14185 auto IsSignExt = [&](SDValue Op) {
14186 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14187 return SDValue();
14188 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14189 if (VT.getScalarSizeInBits() == 32)
14190 return Op->getOperand(0);
14191 return SDValue();
14192 };
14193 auto IsZeroExt = [&](SDValue Op) {
14194 // Zero extends are a little more awkward. At the point we are matching
14195 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14196 // That might be before or after a bitcast depending on how the and is
14197 // placed. Because this has to look through bitcasts, it is currently only
14198 // supported on LE.
14199 if (!Subtarget->isLittle())
14200 return SDValue();
14201
14202 SDValue And = Op;
14203 if (And->getOpcode() == ISD::BITCAST)
14204 And = And->getOperand(0);
14205 if (And->getOpcode() != ISD::AND)
14206 return SDValue();
14207 SDValue Mask = And->getOperand(1);
14208 if (Mask->getOpcode() == ISD::BITCAST)
14209 Mask = Mask->getOperand(0);
14210
14211 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14212 Mask.getValueType() != MVT::v4i32)
14213 return SDValue();
14214 if (isAllOnesConstant(Mask->getOperand(0)) &&
14215 isNullConstant(Mask->getOperand(1)) &&
14216 isAllOnesConstant(Mask->getOperand(2)) &&
14217 isNullConstant(Mask->getOperand(3)))
14218 return And->getOperand(0);
14219 return SDValue();
14220 };
14221
14222 SDLoc dl(N);
14223 if (SDValue Op0 = IsSignExt(N0)) {
14224 if (SDValue Op1 = IsSignExt(N1)) {
14225 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14226 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14227 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14228 }
14229 }
14230 if (SDValue Op0 = IsZeroExt(N0)) {
14231 if (SDValue Op1 = IsZeroExt(N1)) {
14232 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14233 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14234 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14235 }
14236 }
14237
14238 return SDValue();
14239}
14240
14241static SDValue PerformMULCombine(SDNode *N,
14242 TargetLowering::DAGCombinerInfo &DCI,
14243 const ARMSubtarget *Subtarget) {
14244 SelectionDAG &DAG = DCI.DAG;
14245
14246 EVT VT = N->getValueType(0);
14247 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14248 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14249
14250 if (Subtarget->isThumb1Only())
14251 return SDValue();
14252
14253 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14254 return SDValue();
14255
14256 if (VT.is64BitVector() || VT.is128BitVector())
14257 return PerformVMULCombine(N, DCI, Subtarget);
14258 if (VT != MVT::i32)
14259 return SDValue();
14260
14261 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14262 if (!C)
14263 return SDValue();
14264
14265 int64_t MulAmt = C->getSExtValue();
14266 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14267
14268 ShiftAmt = ShiftAmt & (32 - 1);
14269 SDValue V = N->getOperand(0);
14270 SDLoc DL(N);
14271
14272 SDValue Res;
14273 MulAmt >>= ShiftAmt;
14274
14275 if (MulAmt >= 0) {
14276 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14277 // (mul x, 2^N + 1) => (add (shl x, N), x)
14278 Res = DAG.getNode(ISD::ADD, DL, VT,
14279 V,
14280 DAG.getNode(ISD::SHL, DL, VT,
14281 V,
14282 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14283 MVT::i32)));
14284 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14285 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14286 Res = DAG.getNode(ISD::SUB, DL, VT,
14287 DAG.getNode(ISD::SHL, DL, VT,
14288 V,
14289 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14290 MVT::i32)),
14291 V);
14292 } else
14293 return SDValue();
14294 } else {
14295 uint64_t MulAmtAbs = -MulAmt;
14296 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14297 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14298 Res = DAG.getNode(ISD::SUB, DL, VT,
14299 V,
14300 DAG.getNode(ISD::SHL, DL, VT,
14301 V,
14302 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14303 MVT::i32)));
14304 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14305 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14306 Res = DAG.getNode(ISD::ADD, DL, VT,
14307 V,
14308 DAG.getNode(ISD::SHL, DL, VT,
14309 V,
14310 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14311 MVT::i32)));
14312 Res = DAG.getNode(ISD::SUB, DL, VT,
14313 DAG.getConstant(0, DL, MVT::i32), Res);
14314 } else
14315 return SDValue();
14316 }
14317
14318 if (ShiftAmt != 0)
14319 Res = DAG.getNode(ISD::SHL, DL, VT,
14320 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14321
14322 // Do not add new nodes to DAG combiner worklist.
14323 DCI.CombineTo(N, Res, false);
14324 return SDValue();
14325}
14326
14327static SDValue CombineANDShift(SDNode *N,
14328 TargetLowering::DAGCombinerInfo &DCI,
14329 const ARMSubtarget *Subtarget) {
14330 // Allow DAGCombine to pattern-match before we touch the canonical form.
14331 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14332 return SDValue();
14333
14334 if (N->getValueType(0) != MVT::i32)
14335 return SDValue();
14336
14337 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14338 if (!N1C)
14339 return SDValue();
14340
14341 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14342 // Don't transform uxtb/uxth.
14343 if (C1 == 255 || C1 == 65535)
14344 return SDValue();
14345
14346 SDNode *N0 = N->getOperand(0).getNode();
14347 if (!N0->hasOneUse())
14348 return SDValue();
14349
14350 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14351 return SDValue();
14352
14353 bool LeftShift = N0->getOpcode() == ISD::SHL;
14354
14355 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14356 if (!N01C)
14357 return SDValue();
14358
14359 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14360 if (!C2 || C2 >= 32)
14361 return SDValue();
14362
14363 // Clear irrelevant bits in the mask.
14364 if (LeftShift)
14365 C1 &= (-1U << C2);
14366 else
14367 C1 &= (-1U >> C2);
14368
14369 SelectionDAG &DAG = DCI.DAG;
14370 SDLoc DL(N);
14371
14372 // We have a pattern of the form "(and (shl x, c2) c1)" or
14373 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14374 // transform to a pair of shifts, to save materializing c1.
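// Worked example (illustrative): (and (srl x, 1), 0x00ffffff) has c2 = 1 and
// a mask with 8 leading zeros, so it becomes (srl (shl x, 7), 8), saving the
// materialization of the mask constant.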
14375
14376 // First pattern: right shift, then mask off leading bits.
14377 // FIXME: Use demanded bits?
14378 if (!LeftShift && isMask_32(C1)) {
14379 uint32_t C3 = llvm::countl_zero(C1);
14380 if (C2 < C3) {
14381 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14382 DAG.getConstant(C3 - C2, DL, MVT::i32));
14383 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14384 DAG.getConstant(C3, DL, MVT::i32));
14385 }
14386 }
14387
14388 // First pattern, reversed: left shift, then mask off trailing bits.
14389 if (LeftShift && isMask_32(~C1)) {
14390 uint32_t C3 = llvm::countr_zero(C1);
14391 if (C2 < C3) {
14392 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14393 DAG.getConstant(C3 - C2, DL, MVT::i32));
14394 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14395 DAG.getConstant(C3, DL, MVT::i32));
14396 }
14397 }
14398
14399 // Second pattern: left shift, then mask off leading bits.
14400 // FIXME: Use demanded bits?
14401 if (LeftShift && isShiftedMask_32(C1)) {
14402 uint32_t Trailing = llvm::countr_zero(C1);
14403 uint32_t C3 = llvm::countl_zero(C1);
14404 if (Trailing == C2 && C2 + C3 < 32) {
14405 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14406 DAG.getConstant(C2 + C3, DL, MVT::i32));
14407 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14408 DAG.getConstant(C3, DL, MVT::i32));
14409 }
14410 }
14411
14412 // Second pattern, reversed: right shift, then mask off trailing bits.
14413 // FIXME: Handle other patterns of known/demanded bits.
14414 if (!LeftShift && isShiftedMask_32(C1)) {
14415 uint32_t Leading = llvm::countl_zero(C1);
14416 uint32_t C3 = llvm::countr_zero(C1);
14417 if (Leading == C2 && C2 + C3 < 32) {
14418 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14419 DAG.getConstant(C2 + C3, DL, MVT::i32));
14420 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14421 DAG.getConstant(C3, DL, MVT::i32));
14422 }
14423 }
14424
14425 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14426 // if "c1 >> c2" is a cheaper immediate than "c1"
14427 if (LeftShift &&
14428 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14429
14430 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14431 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14432 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14433 DAG.getConstant(C2, DL, MVT::i32));
14434 }
14435
14436 return SDValue();
14437}
14438
14439static SDValue PerformANDCombine(SDNode *N,
14440 TargetLowering::DAGCombinerInfo &DCI,
14441 const ARMSubtarget *Subtarget) {
14442 // Attempt to use immediate-form VBIC
14443 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14444 SDLoc dl(N);
14445 EVT VT = N->getValueType(0);
14446 SelectionDAG &DAG = DCI.DAG;
14447
14448 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14449 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14450 return SDValue();
14451
14452 APInt SplatBits, SplatUndef;
14453 unsigned SplatBitSize;
14454 bool HasAnyUndefs;
14455 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14456 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14457 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14458 SplatBitSize == 64) {
14459 EVT VbicVT;
14460 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14461 SplatUndef.getZExtValue(), SplatBitSize,
14462 DAG, dl, VbicVT, VT, OtherModImm);
14463 if (Val.getNode()) {
14464 SDValue Input =
14465 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14466 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14467 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14468 }
14469 }
14470 }
14471
14472 if (!Subtarget->isThumb1Only()) {
14473 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14474 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14475 return Result;
14476
14477 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14478 return Result;
14479 }
14480
14481 if (Subtarget->isThumb1Only())
14482 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14483 return Result;
14484
14485 return SDValue();
14486}
14487
14488// Try combining OR nodes to SMULWB, SMULWT.
14489static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14490 TargetLowering::DAGCombinerInfo &DCI,
14491 const ARMSubtarget *Subtarget) {
14492 if (!Subtarget->hasV6Ops() ||
14493 (Subtarget->isThumb() &&
14494 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14495 return SDValue();
14496
14497 SDValue SRL = OR->getOperand(0);
14498 SDValue SHL = OR->getOperand(1);
14499
14500 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14501 SRL = OR->getOperand(1);
14502 SHL = OR->getOperand(0);
14503 }
14504 if (!isSRL16(SRL) || !isSHL16(SHL))
14505 return SDValue();
14506
14507 // The first operands to the shifts need to be the two results from the
14508 // same smul_lohi node.
14509 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14510 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14511 return SDValue();
14512
14513 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14514 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14515 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14516 return SDValue();
14517
14518 // Now we have:
14519 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
14520 // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14521 // For SMULWB the 16-bit value will be sign extended somehow.
14522 // For SMULWT only the SRA is required.
14523 // Check both sides of SMUL_LOHI
14524 SDValue OpS16 = SMULLOHI->getOperand(0);
14525 SDValue OpS32 = SMULLOHI->getOperand(1);
14526
14527 SelectionDAG &DAG = DCI.DAG;
14528 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14529 OpS16 = OpS32;
14530 OpS32 = SMULLOHI->getOperand(0);
14531 }
14532
14533 SDLoc dl(OR);
14534 unsigned Opcode = 0;
14535 if (isS16(OpS16, DAG))
14536 Opcode = ARMISD::SMULWB;
14537 else if (isSRA16(OpS16)) {
14538 Opcode = ARMISD::SMULWT;
14539 OpS16 = OpS16->getOperand(0);
14540 }
14541 else
14542 return SDValue();
14543
14544 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14545 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14546 return SDValue(OR, 0);
14547}
14548
14549static SDValue PerformORCombineToBFI(SDNode *N,
14550 TargetLowering::DAGCombinerInfo &DCI,
14551 const ARMSubtarget *Subtarget) {
14552 // BFI is only available on V6T2+
14553 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14554 return SDValue();
14555
14556 EVT VT = N->getValueType(0);
14557 SDValue N0 = N->getOperand(0);
14558 SDValue N1 = N->getOperand(1);
14559 SelectionDAG &DAG = DCI.DAG;
14560 SDLoc DL(N);
14561 // 1) or (and A, mask), val => ARMbfi A, val, mask
14562 // iff (val & mask) == val
14563 //
14564 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14565 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14566 // && mask == ~mask2
14567 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14568 // && ~mask == mask2
14569 // (i.e., copy a bitfield value into another bitfield of the same width)
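// Illustrative instance of pattern (1), not from the original source: with
//   mask = 0xffff00ff and val = 0x00002a00
// the code below produces ARMbfi A, 0x2a, 0xffff00ff, i.e. it inserts the
// 8-bit value 0x2a into bits 8..15 of A.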
14570
14571 if (VT != MVT::i32)
14572 return SDValue();
14573
14574 SDValue N00 = N0.getOperand(0);
14575
14576 // The value and the mask need to be constants so we can verify this is
14577 // actually a bitfield set. If the mask is 0xffff, we can do better
14578 // via a movt instruction, so don't use BFI in that case.
14579 SDValue MaskOp = N0.getOperand(1);
14580 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14581 if (!MaskC)
14582 return SDValue();
14583 unsigned Mask = MaskC->getZExtValue();
14584 if (Mask == 0xffff)
14585 return SDValue();
14586 SDValue Res;
14587 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14588 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14589 if (N1C) {
14590 unsigned Val = N1C->getZExtValue();
14591 if ((Val & ~Mask) != Val)
14592 return SDValue();
14593
14594 if (ARM::isBitFieldInvertedMask(Mask)) {
14595 Val >>= llvm::countr_zero(~Mask);
14596
14597 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14598 DAG.getConstant(Val, DL, MVT::i32),
14599 DAG.getConstant(Mask, DL, MVT::i32));
14600
14601 DCI.CombineTo(N, Res, false);
14602 // Return value from the original node to inform the combiner that N is
14603 // now dead.
14604 return SDValue(N, 0);
14605 }
14606 } else if (N1.getOpcode() == ISD::AND) {
14607 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14608 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14609 if (!N11C)
14610 return SDValue();
14611 unsigned Mask2 = N11C->getZExtValue();
14612
14613 // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI
14614 // pattern to match as-is.
14615 if (ARM::isBitFieldInvertedMask(Mask) &&
14616 (Mask == ~Mask2)) {
14617 // The pack halfword instruction works better for masks that fit it,
14618 // so use that when it's available.
14619 if (Subtarget->hasDSP() &&
14620 (Mask == 0xffff || Mask == 0xffff0000))
14621 return SDValue();
14622 // 2a
14623 unsigned amt = llvm::countr_zero(Mask2);
14624 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14625 DAG.getConstant(amt, DL, MVT::i32));
14626 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14627 DAG.getConstant(Mask, DL, MVT::i32));
14628 DCI.CombineTo(N, Res, false);
14629 // Return value from the original node to inform the combiner that N is
14630 // now dead.
14631 return SDValue(N, 0);
14632 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14633 (~Mask == Mask2)) {
14634 // The pack halfword instruction works better for masks that fit it,
14635 // so use that when it's available.
14636 if (Subtarget->hasDSP() &&
14637 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14638 return SDValue();
14639 // 2b
14640 unsigned lsb = llvm::countr_zero(Mask);
14641 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14642 DAG.getConstant(lsb, DL, MVT::i32));
14643 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14644 DAG.getConstant(Mask2, DL, MVT::i32));
14645 DCI.CombineTo(N, Res, false);
14646 // Return value from the original node to inform the combiner that N is
14647 // now dead.
14648 return SDValue(N, 0);
14649 }
14650 }
14651
14652 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14653 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14655 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14656 // where lsb(mask) == #shamt and masked bits of B are known zero.
14657 SDValue ShAmt = N00.getOperand(1);
14658 unsigned ShAmtC = ShAmt->getAsZExtVal();
14659 unsigned LSB = llvm::countr_zero(Mask);
14660 if (ShAmtC != LSB)
14661 return SDValue();
14662
14663 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14664 DAG.getConstant(~Mask, DL, MVT::i32));
14665
14666 DCI.CombineTo(N, Res, false);
14667 // Return value from the original node to inform the combiner that N is
14668 // now dead.
14669 return SDValue(N, 0);
14670 }
14671
14672 return SDValue();
14673}
14674
14675static bool isValidMVECond(unsigned CC, bool IsFloat) {
14676 switch (CC) {
14677 case ARMCC::EQ:
14678 case ARMCC::NE:
14679 case ARMCC::LE:
14680 case ARMCC::GT:
14681 case ARMCC::GE:
14682 case ARMCC::LT:
14683 return true;
14684 case ARMCC::HS:
14685 case ARMCC::HI:
14686 return !IsFloat;
14687 default:
14688 return false;
14689 };
14690}
14691
14692static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14693 if (N->getOpcode() == ARMISD::VCMP)
14694 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14695 else if (N->getOpcode() == ARMISD::VCMPZ)
14696 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14697 else
14698 llvm_unreachable("Not a VCMP/VCMPZ!");
14699}
14700
14701static bool CanInvertMVEVCMP(SDValue N) {
14702 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14703 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14704}
14705
14706static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14707 const ARMSubtarget *Subtarget) {
14708 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14709 // together with predicates
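// Illustrative only: or(vcmp(a, b, eq), vcmp(c, d, eq)) is rewritten below to
// not(and(not(vcmp eq), not(vcmp eq))); the NOTs around the compares later
// fold into VCMPs with the opposite condition, which is why at least one
// operand must be a freely invertible VCMP/VCMPZ.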
14710 EVT VT = N->getValueType(0);
14711 SDLoc DL(N);
14712 SDValue N0 = N->getOperand(0);
14713 SDValue N1 = N->getOperand(1);
14714
14715 auto IsFreelyInvertable = [&](SDValue V) {
14716 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14717 return CanInvertMVEVCMP(V);
14718 return false;
14719 };
14720
14721 // At least one operand must be freely invertible.
14722 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14723 return SDValue();
14724
14725 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14726 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14727 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14728 return DAG.getLogicalNOT(DL, And, VT);
14729}
14730
14731/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14732static SDValue PerformORCombine(SDNode *N,
14733 TargetLowering::DAGCombinerInfo &DCI,
14734 const ARMSubtarget *Subtarget) {
14735 // Attempt to use immediate-form VORR
14736 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14737 SDLoc dl(N);
14738 EVT VT = N->getValueType(0);
14739 SelectionDAG &DAG = DCI.DAG;
14740
14741 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14742 return SDValue();
14743
14744 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14745 VT == MVT::v8i1 || VT == MVT::v16i1))
14746 return PerformORCombine_i1(N, DAG, Subtarget);
14747
14748 APInt SplatBits, SplatUndef;
14749 unsigned SplatBitSize;
14750 bool HasAnyUndefs;
14751 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14752 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14753 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14754 SplatBitSize == 64) {
14755 EVT VorrVT;
14756 SDValue Val =
14757 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14758 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14759 if (Val.getNode()) {
14760 SDValue Input =
14761 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14762 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14763 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14764 }
14765 }
14766 }
14767
14768 if (!Subtarget->isThumb1Only()) {
14769 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14770 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14771 return Result;
14772 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14773 return Result;
14774 }
14775
14776 SDValue N0 = N->getOperand(0);
14777 SDValue N1 = N->getOperand(1);
14778
14779 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14780 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14782
14783 // The code below optimizes (or (and X, Y), Z).
14784 // The AND operand needs to have a single user to make these optimizations
14785 // profitable.
14786 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14787 return SDValue();
14788
14789 APInt SplatUndef;
14790 unsigned SplatBitSize;
14791 bool HasAnyUndefs;
14792
14793 APInt SplatBits0, SplatBits1;
14794 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14795 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14796 // Ensure that the second operands of both ANDs are constants
14797 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14798 HasAnyUndefs) && !HasAnyUndefs) {
14799 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14800 HasAnyUndefs) && !HasAnyUndefs) {
14801 // Ensure that the bit widths of the constants are the same and that
14802 // the splat arguments are logical inverses as per the pattern we
14803 // are trying to simplify.
14804 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14805 SplatBits0 == ~SplatBits1) {
14806 // Canonicalize the vector type to make instruction selection
14807 // simpler.
14808 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14809 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14810 N0->getOperand(1),
14811 N0->getOperand(0),
14812 N1->getOperand(0));
14813 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14814 }
14815 }
14816 }
14817 }
14818
14819 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14820 // reasonable.
14821 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14822 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14823 return Res;
14824 }
14825
14826 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14827 return Result;
14828
14829 return SDValue();
14830}
14831
14834 const ARMSubtarget *Subtarget) {
14835 EVT VT = N->getValueType(0);
14836 SelectionDAG &DAG = DCI.DAG;
14837
14838 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14839 return SDValue();
14840
14841 if (!Subtarget->isThumb1Only()) {
14842 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14843 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14844 return Result;
14845
14846 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14847 return Result;
14848 }
14849
14850 if (Subtarget->hasMVEIntegerOps()) {
14851 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
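// For example (illustrative): XOR of "VCMP x, y, EQ" with the all-true
// predicate is rewritten as "VCMP x, y, NE"; the same applies to VCMPZ,
// and to any condition whose opposite is still a valid MVE condition.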
14852 SDValue N0 = N->getOperand(0);
14853 SDValue N1 = N->getOperand(1);
14854 const TargetLowering *TLI = Subtarget->getTargetLowering();
14855 if (TLI->isConstTrueVal(N1) &&
14856 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14857 if (CanInvertMVEVCMP(N0)) {
14858 SDLoc DL(N0);
14860
14862 Ops.push_back(N0->getOperand(0));
14863 if (N0->getOpcode() == ARMISD::VCMP)
14864 Ops.push_back(N0->getOperand(1));
14865 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14866 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14867 }
14868 }
14869 }
14870
14871 return SDValue();
14872}
14873
14874// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14875// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14876// their position in "to" (Rd).
14877static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14878 assert(N->getOpcode() == ARMISD::BFI);
14879
14880 SDValue From = N->getOperand(1);
14881 ToMask = ~N->getConstantOperandAPInt(2);
14882 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14883
14884 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14885 // #C in the base of the SHR.
14886 if (From->getOpcode() == ISD::SRL &&
14887 isa<ConstantSDNode>(From->getOperand(1))) {
14888 APInt Shift = From->getConstantOperandAPInt(1);
14889 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14890 FromMask <<= Shift.getLimitedValue(31);
14891 From = From->getOperand(0);
14892 }
14893
14894 return From;
14895}
14896
14897 // If A and B each contain one contiguous set of bits, does A | B == A . B?
14898//
14899 // Neither A nor B may be zero.
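// For example (illustrative values): with A = 0b111000 and B = 0b000111,
// A.countr_zero() == 3 and the highest set bit of B is at index 2, so the
// two runs touch and A | B == 0b111111 is the concatenation "A . B". With
// A = 0b110000 the runs leave bit 3 clear and this returns false.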
14900static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14901 unsigned LastActiveBitInA = A.countr_zero();
14902 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14903 return LastActiveBitInA - 1 == FirstActiveBitInB;
14904}
14905
14907 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14908 APInt ToMask, FromMask;
14909 SDValue From = ParseBFI(N, ToMask, FromMask);
14910 SDValue To = N->getOperand(0);
14911
14912 SDValue V = To;
14913 if (V.getOpcode() != ARMISD::BFI)
14914 return SDValue();
14915
14916 APInt NewToMask, NewFromMask;
14917 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14918 if (NewFrom != From)
14919 return SDValue();
14920
14921 // Do the written bits conflict with any we've seen so far?
14922 if ((NewToMask & ToMask).getBoolValue())
14923 // Conflicting bits.
14924 return SDValue();
14925
14926 // Are the new bits contiguous when combined with the old bits?
14927 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14928 BitsProperlyConcatenate(FromMask, NewFromMask))
14929 return V;
14930 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14931 BitsProperlyConcatenate(NewFromMask, FromMask))
14932 return V;
14933
14934 return SDValue();
14935}
14936
14938 SDValue N0 = N->getOperand(0);
14939 SDValue N1 = N->getOperand(1);
14940
14941 if (N1.getOpcode() == ISD::AND) {
14942 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14943 // the bits being cleared by the AND are not demanded by the BFI.
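// For example (illustrative masks): InvMask == 0xffff00ff means the BFI
// inserts into bits [8,15] of its first operand, so only the low 8 bits
// of the inserted value are demanded (Mask == 0xff). An AND mask such as
// 0x1ff clears none of those bits and can be dropped.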
14944 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14945 if (!N11C)
14946 return SDValue();
14947 unsigned InvMask = N->getConstantOperandVal(2);
14948 unsigned LSB = llvm::countr_zero(~InvMask);
14949 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14950 assert(Width <
14951 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14952 "undefined behavior");
14953 unsigned Mask = (1u << Width) - 1;
14954 unsigned Mask2 = N11C->getZExtValue();
14955 if ((Mask & (~Mask2)) == 0)
14956 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14957 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14958 return SDValue();
14959 }
14960
14961 // Look for another BFI to combine with.
14962 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14963 // We've found a BFI.
14964 APInt ToMask1, FromMask1;
14965 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14966
14967 APInt ToMask2, FromMask2;
14968 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14969 assert(From1 == From2);
14970 (void)From2;
14971
14972 // Create a new BFI, combining the two together.
14973 APInt NewFromMask = FromMask1 | FromMask2;
14974 APInt NewToMask = ToMask1 | ToMask2;
14975
14976 EVT VT = N->getValueType(0);
14977 SDLoc dl(N);
14978
14979 if (NewFromMask[0] == 0)
14980 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14981 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14982 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14983 DAG.getConstant(~NewToMask, dl, VT));
14984 }
14985
14986 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14987 // that lower bit insertions are performed first, provided that M1 and M2
14988 // do not overlap. This can allow multiple BFI instructions to be combined
14989 // together by the other folds above.
14990 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14991 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14992 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14993
14994 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14995 ToMask1.countl_zero() < ToMask2.countl_zero())
14996 return SDValue();
14997
14998 EVT VT = N->getValueType(0);
14999 SDLoc dl(N);
15000 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
15001 N->getOperand(1), N->getOperand(2));
15002 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
15003 N0.getOperand(2));
15004 }
15005
15006 return SDValue();
15007}
15008
15009// Check that N is CMPZ(CSINC(0, 0, CC, X)),
15010// or CMPZ(CMOV(1, 0, CC, X))
15011// return X if valid.
15013 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15014 return SDValue();
15015 SDValue CSInc = Cmp->getOperand(0);
15016
15017 // Ignore any `And 1` nodes that may not yet have been removed. We are
15018 // looking for a value that produces 1/0, so these have no effect on the
15019 // code.
15020 while (CSInc.getOpcode() == ISD::AND &&
15021 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15022 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15023 CSInc = CSInc.getOperand(0);
15024
15025 if (CSInc.getOpcode() == ARMISD::CSINC &&
15026 isNullConstant(CSInc.getOperand(0)) &&
15027 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15029 return CSInc.getOperand(3);
15030 }
15031 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15032 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15034 return CSInc.getOperand(3);
15035 }
15036 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15037 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15040 return CSInc.getOperand(3);
15041 }
15042 return SDValue();
15043}
15044
15046 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15047 // t92: flags = ARMISD::CMPZ t74, 0
15048 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15049 // t96: flags = ARMISD::CMPZ t93, 0
15050 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15052 if (SDValue C = IsCMPZCSINC(N, Cond))
15053 if (Cond == ARMCC::EQ)
15054 return C;
15055 return SDValue();
15056}
15057
15059 // Fold away an unnecessary CMPZ/CSINC
15060 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15061 // if C1==EQ -> CSXYZ A, B, C2, D
15062 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15064 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15065 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15066 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15067 N->getOperand(1),
15068 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15069 if (N->getConstantOperandVal(2) == ARMCC::NE)
15070 return DAG.getNode(
15071 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15072 N->getOperand(1),
15074 }
15075 return SDValue();
15076}
15077
15078/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15079/// ARMISD::VMOVRRD.
15082 const ARMSubtarget *Subtarget) {
15083 // vmovrrd(vmovdrr x, y) -> x,y
15084 SDValue InDouble = N->getOperand(0);
15085 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15086 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15087
15088 // vmovrrd(load f64) -> (load i32), (load i32)
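// For example (illustrative): an f64 frame-index load feeding VMOVRRD is
// replaced by two i32 loads at offsets 0 and +4 from the same base; on
// big-endian targets the two loaded values are swapped so each half ends
// up in the expected result of the VMOVRRD.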
15089 SDNode *InNode = InDouble.getNode();
15090 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15091 InNode->getValueType(0) == MVT::f64 &&
15092 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15093 !cast<LoadSDNode>(InNode)->isVolatile()) {
15094 // TODO: Should this be done for non-FrameIndex operands?
15095 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15096
15097 SelectionDAG &DAG = DCI.DAG;
15098 SDLoc DL(LD);
15099 SDValue BasePtr = LD->getBasePtr();
15100 SDValue NewLD1 =
15101 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15102 LD->getAlign(), LD->getMemOperand()->getFlags());
15103
15104 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15105 DAG.getConstant(4, DL, MVT::i32));
15106
15107 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15108 LD->getPointerInfo().getWithOffset(4),
15109 commonAlignment(LD->getAlign(), 4),
15110 LD->getMemOperand()->getFlags());
15111
15112 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15113 if (DCI.DAG.getDataLayout().isBigEndian())
15114 std::swap (NewLD1, NewLD2);
15115 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15116 return Result;
15117 }
15118
15119 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15120 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15121 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15122 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15123 SDValue BV = InDouble.getOperand(0);
15124 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15125 // change lane order under big endian.
15126 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15127 while (
15128 (BV.getOpcode() == ISD::BITCAST ||
15130 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15131 BVSwap = BV.getOpcode() == ISD::BITCAST;
15132 BV = BV.getOperand(0);
15133 }
15134 if (BV.getValueType() != MVT::v4i32)
15135 return SDValue();
15136
15137 // Handle buildvectors, pulling out the correct lane depending on
15138 // endianness.
15139 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15140 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15141 SDValue Op0 = BV.getOperand(Offset);
15142 SDValue Op1 = BV.getOperand(Offset + 1);
15143 if (!Subtarget->isLittle() && BVSwap)
15144 std::swap(Op0, Op1);
15145
15146 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15147 }
15148
15149 // A chain of insert_vectors, grabbing the correct value of the chain of
15150 // inserts.
15151 SDValue Op0, Op1;
15152 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15153 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15154 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15155 Op0 = BV.getOperand(1);
15156 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15157 Op1 = BV.getOperand(1);
15158 }
15159 BV = BV.getOperand(0);
15160 }
15161 if (!Subtarget->isLittle() && BVSwap)
15162 std::swap(Op0, Op1);
15163 if (Op0 && Op1)
15164 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15165 }
15166
15167 return SDValue();
15168}
15169
15170/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15171/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15173 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15174 SDValue Op0 = N->getOperand(0);
15175 SDValue Op1 = N->getOperand(1);
15176 if (Op0.getOpcode() == ISD::BITCAST)
15177 Op0 = Op0.getOperand(0);
15178 if (Op1.getOpcode() == ISD::BITCAST)
15179 Op1 = Op1.getOperand(0);
15180 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15181 Op0.getNode() == Op1.getNode() &&
15182 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15183 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15184 N->getValueType(0), Op0.getOperand(0));
15185 return SDValue();
15186}
15187
15190 SDValue Op0 = N->getOperand(0);
15191
15192 // VMOVhr (VMOVrh (X)) -> X
15193 if (Op0->getOpcode() == ARMISD::VMOVrh)
15194 return Op0->getOperand(0);
15195
15196 // FullFP16: half values are passed in S-registers, and we don't
15197 // need any of the bitcasts and moves:
15198 //
15199 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15200 // t5: i32 = bitcast t2
15201 // t18: f16 = ARMISD::VMOVhr t5
15202 // =>
15203 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15204 if (Op0->getOpcode() == ISD::BITCAST) {
15205 SDValue Copy = Op0->getOperand(0);
15206 if (Copy.getValueType() == MVT::f32 &&
15207 Copy->getOpcode() == ISD::CopyFromReg) {
15208 bool HasGlue = Copy->getNumOperands() == 3;
15209 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15210 HasGlue ? Copy->getOperand(2) : SDValue()};
15211 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15212 SDValue NewCopy =
15214 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15215 ArrayRef(Ops, HasGlue ? 3 : 2));
15216
15217 // Update Users, Chains, and Potential Glue.
15218 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15219 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15220 if (HasGlue)
15221 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15222 NewCopy.getValue(2));
15223
15224 return NewCopy;
15225 }
15226 }
15227
15228 // fold (VMOVhr (load x)) -> (load (f16*)x)
15229 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15230 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15231 LN0->getMemoryVT() == MVT::i16) {
15232 SDValue Load =
15233 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15234 LN0->getBasePtr(), LN0->getMemOperand());
15235 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15236 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15237 return Load;
15238 }
15239 }
15240
15241 // Only the bottom 16 bits of the source register are used.
15242 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15243 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15244 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15245 return SDValue(N, 0);
15246
15247 return SDValue();
15248}
15249
15251 SDValue N0 = N->getOperand(0);
15252 EVT VT = N->getValueType(0);
15253
15254 // fold (VMOVrh (fpconst x)) -> const x
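// For example (illustrative): VMOVrh of the f16 constant 1.0 becomes the
// i32 constant 0x3c00, the IEEE half-precision bit pattern of 1.0.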
15255 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15256 APFloat V = C->getValueAPF();
15257 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15258 }
15259
15260 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15261 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15262 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15263
15264 SDValue Load =
15265 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15266 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15267 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15268 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15269 return Load;
15270 }
15271
15272 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15273 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15274 isa<ConstantSDNode>(N0->getOperand(1)))
15275 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15276 N0->getOperand(1));
15277
15278 return SDValue();
15279}
15280
15281/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15282/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15283/// i64 vector to have f64 elements, since the value can then be loaded
15284/// directly into a VFP register.
15286 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15287 for (unsigned i = 0; i < NumElts; ++i) {
15288 SDNode *Elt = N->getOperand(i).getNode();
15289 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15290 return true;
15291 }
15292 return false;
15293}
15294
15295/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15296/// ISD::BUILD_VECTOR.
15299 const ARMSubtarget *Subtarget) {
15300 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15301 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15302 // into a pair of GPRs, which is fine when the value is used as a scalar,
15303 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15304 SelectionDAG &DAG = DCI.DAG;
15305 if (N->getNumOperands() == 2)
15306 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15307 return RV;
15308
15309 // Load i64 elements as f64 values so that type legalization does not split
15310 // them up into i32 values.
15311 EVT VT = N->getValueType(0);
15312 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15313 return SDValue();
15314 SDLoc dl(N);
15316 unsigned NumElts = VT.getVectorNumElements();
15317 for (unsigned i = 0; i < NumElts; ++i) {
15318 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15319 Ops.push_back(V);
15320 // Make the DAGCombiner fold the bitcast.
15321 DCI.AddToWorklist(V.getNode());
15322 }
15323 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15324 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15325 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15326}
15327
15328/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15329static SDValue
15331 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15332 // At that time, we may have inserted bitcasts from integer to float.
15333 // If these bitcasts have survived DAGCombine, change the lowering of this
15334 // BUILD_VECTOR into something more vector friendly, i.e., something that
15335 // does not force the use of floating point types.
15336
15337 // Make sure we can change the type of the vector.
15338 // This is possible iff:
15339 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15340 // 1.1. Vector is used only once.
15341 // 1.2. Use is a bit convert to an integer type.
15342 // 2. The size of its operands is 32 bits (64 bits are not legal).
15343 EVT VT = N->getValueType(0);
15344 EVT EltVT = VT.getVectorElementType();
15345
15346 // Check 1.1. and 2.
15347 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15348 return SDValue();
15349
15350 // By construction, the input type must be float.
15351 assert(EltVT == MVT::f32 && "Unexpected type!");
15352
15353 // Check 1.2.
15354 SDNode *Use = *N->user_begin();
15355 if (Use->getOpcode() != ISD::BITCAST ||
15356 Use->getValueType(0).isFloatingPoint())
15357 return SDValue();
15358
15359 // Check profitability.
15360 // The model is: if more than half of the relevant operands are bitcast from
15361 // i32, turn the build_vector into a sequence of insert_vector_elt.
15362 // Relevant operands are everything that is not statically
15363 // (i.e., at compile time) bitcasted.
15364 unsigned NumOfBitCastedElts = 0;
15365 unsigned NumElts = VT.getVectorNumElements();
15366 unsigned NumOfRelevantElts = NumElts;
15367 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15368 SDValue Elt = N->getOperand(Idx);
15369 if (Elt->getOpcode() == ISD::BITCAST) {
15370 // Assume only bit cast to i32 will go away.
15371 if (Elt->getOperand(0).getValueType() == MVT::i32)
15372 ++NumOfBitCastedElts;
15373 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15374 // Constants are statically casted, thus do not count them as
15375 // relevant operands.
15376 --NumOfRelevantElts;
15377 }
15378
15379 // Check if more than half of the elements require a non-free bitcast.
15380 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15381 return SDValue();
15382
15383 SelectionDAG &DAG = DCI.DAG;
15384 // Create the new vector type.
15385 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15386 // Check if the type is legal.
15387 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15388 if (!TLI.isTypeLegal(VecVT))
15389 return SDValue();
15390
15391 // Combine:
15392 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15393 // => BITCAST INSERT_VECTOR_ELT
15394 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15395 // (BITCAST EN), N.
15396 SDValue Vec = DAG.getUNDEF(VecVT);
15397 SDLoc dl(N);
15398 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15399 SDValue V = N->getOperand(Idx);
15400 if (V.isUndef())
15401 continue;
15402 if (V.getOpcode() == ISD::BITCAST &&
15403 V->getOperand(0).getValueType() == MVT::i32)
15404 // Fold obvious case.
15405 V = V.getOperand(0);
15406 else {
15407 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15408 // Make the DAGCombiner fold the bitcasts.
15409 DCI.AddToWorklist(V.getNode());
15410 }
15411 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15412 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15413 }
15414 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15415 // Make the DAGCombiner fold the bitcasts.
15416 DCI.AddToWorklist(Vec.getNode());
15417 return Vec;
15418}
15419
15420static SDValue
15422 EVT VT = N->getValueType(0);
15423 SDValue Op = N->getOperand(0);
15424 SDLoc dl(N);
15425
15426 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15427 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15428 // If the valuetypes are the same, we can remove the cast entirely.
15429 if (Op->getOperand(0).getValueType() == VT)
15430 return Op->getOperand(0);
15431 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15432 }
15433
15434 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15435 // more VPNOT which might get folded as else predicates.
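// For example (illustrative): "PREDICATE_CAST (xor x, -1)" becomes
// "xor (PREDICATE_CAST x), (PREDICATE_CAST 0xffff)"; only the low 16
// predicate bits are live, and the new XOR can later be selected as a
// VPNOT.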
15436 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15437 SDValue X =
15438 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15440 DCI.DAG.getConstant(65535, dl, MVT::i32));
15441 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15442 }
15443
15444 // Only the bottom 16 bits of the source register are used.
15445 if (Op.getValueType() == MVT::i32) {
15446 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15447 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15448 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15449 return SDValue(N, 0);
15450 }
15451 return SDValue();
15452}
15453
15455 const ARMSubtarget *ST) {
15456 EVT VT = N->getValueType(0);
15457 SDValue Op = N->getOperand(0);
15458 SDLoc dl(N);
15459
15460 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15461 if (ST->isLittle())
15462 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15463
15464 // VT VECTOR_REG_CAST (VT Op) -> Op
15465 if (Op.getValueType() == VT)
15466 return Op;
15467 // VECTOR_REG_CAST undef -> undef
15468 if (Op.isUndef())
15469 return DAG.getUNDEF(VT);
15470
15471 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15472 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15473 // If the valuetypes are the same, we can remove the cast entirely.
15474 if (Op->getOperand(0).getValueType() == VT)
15475 return Op->getOperand(0);
15476 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15477 }
15478
15479 return SDValue();
15480}
15481
15483 const ARMSubtarget *Subtarget) {
15484 if (!Subtarget->hasMVEIntegerOps())
15485 return SDValue();
15486
15487 EVT VT = N->getValueType(0);
15488 SDValue Op0 = N->getOperand(0);
15489 SDValue Op1 = N->getOperand(1);
15490 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15491 SDLoc dl(N);
15492
15493 // vcmp X, 0, cc -> vcmpz X, cc
15494 if (isZeroVector(Op1))
15495 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15496
15497 unsigned SwappedCond = getSwappedCondition(Cond);
15498 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15499 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15500 if (isZeroVector(Op0))
15501 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15502 DAG.getConstant(SwappedCond, dl, MVT::i32));
15503 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
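// For example (illustrative): "vcmp (vdup y), x, GT" becomes
// "vcmp x, (vdup y), LT", keeping the splat on the right-hand side where
// it can later be matched as a vector-against-scalar compare.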
15504 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15505 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15506 DAG.getConstant(SwappedCond, dl, MVT::i32));
15507 }
15508
15509 return SDValue();
15510}
15511
15512/// PerformInsertEltCombine - Target-specific dag combine xforms for
15513/// ISD::INSERT_VECTOR_ELT.
15516 // Bitcast an i64 load inserted into a vector to f64.
15517 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15518 EVT VT = N->getValueType(0);
15519 SDNode *Elt = N->getOperand(1).getNode();
15520 if (VT.getVectorElementType() != MVT::i64 ||
15521 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15522 return SDValue();
15523
15524 SelectionDAG &DAG = DCI.DAG;
15525 SDLoc dl(N);
15526 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15528 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15529 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15530 // Make the DAGCombiner fold the bitcasts.
15531 DCI.AddToWorklist(Vec.getNode());
15532 DCI.AddToWorklist(V.getNode());
15533 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15534 Vec, V, N->getOperand(2));
15535 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15536}
15537
15538// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15539// directly or bitcast to an integer if the original is a float vector.
15540// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15541// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15542static SDValue
15544 EVT VT = N->getValueType(0);
15545 SDLoc dl(N);
15546
15547 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15548 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15549 return SDValue();
15550
15551 SDValue Ext = SDValue(N, 0);
15552 if (Ext.getOpcode() == ISD::BITCAST &&
15553 Ext.getOperand(0).getValueType() == MVT::f32)
15554 Ext = Ext.getOperand(0);
15555 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15556 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15557 Ext.getConstantOperandVal(1) % 2 != 0)
15558 return SDValue();
15559 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15560 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15561 return SDValue();
15562
15563 SDValue Op0 = Ext.getOperand(0);
15564 EVT VecVT = Op0.getValueType();
15565 unsigned ResNo = Op0.getResNo();
15566 unsigned Lane = Ext.getConstantOperandVal(1);
15567 if (VecVT.getVectorNumElements() != 4)
15568 return SDValue();
15569
15570 // Find another extract, of Lane + 1
15571 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15572 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15573 isa<ConstantSDNode>(V->getOperand(1)) &&
15574 V->getConstantOperandVal(1) == Lane + 1 &&
15575 V->getOperand(0).getResNo() == ResNo;
15576 });
15577 if (OtherIt == Op0->users().end())
15578 return SDValue();
15579
15580 // For float extracts, we need to be converting to an i32 for both vector
15581 // lanes.
15582 SDValue OtherExt(*OtherIt, 0);
15583 if (OtherExt.getValueType() != MVT::i32) {
15584 if (!OtherExt->hasOneUse() ||
15585 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15586 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15587 return SDValue();
15588 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15589 }
15590
15591 // Convert the type to a f64 and extract with a VMOVRRD.
15592 SDValue F64 = DCI.DAG.getNode(
15593 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15594 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15595 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15596 SDValue VMOVRRD =
15597 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15598
15599 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15600 return VMOVRRD;
15601}
15602
15605 const ARMSubtarget *ST) {
15606 SDValue Op0 = N->getOperand(0);
15607 EVT VT = N->getValueType(0);
15608 SDLoc dl(N);
15609
15610 // extract (vdup x) -> x
15611 if (Op0->getOpcode() == ARMISD::VDUP) {
15612 SDValue X = Op0->getOperand(0);
15613 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15614 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15615 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15616 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15617 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15618 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15619
15620 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15621 X = X->getOperand(0);
15622 if (X.getValueType() == VT)
15623 return X;
15624 }
15625
15626 // extract ARM_BUILD_VECTOR -> x
15627 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15628 isa<ConstantSDNode>(N->getOperand(1)) &&
15629 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15630 return Op0.getOperand(N->getConstantOperandVal(1));
15631 }
15632
15633 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15634 if (Op0.getValueType() == MVT::v4i32 &&
15635 isa<ConstantSDNode>(N->getOperand(1)) &&
15636 Op0.getOpcode() == ISD::BITCAST &&
15638 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15639 SDValue BV = Op0.getOperand(0);
15640 unsigned Offset = N->getConstantOperandVal(1);
15641 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15642 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15643 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15644 }
15645
15646 // extract x, n; extract x, n+1 -> VMOVRRD x
15647 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15648 return R;
15649
15650 // extract (MVETrunc(x)) -> extract x
15651 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15652 unsigned Idx = N->getConstantOperandVal(1);
15653 unsigned Vec =
15655 unsigned SubIdx =
15657 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15658 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15659 }
15660
15661 return SDValue();
15662}
15663
15665 SDValue Op = N->getOperand(0);
15666 EVT VT = N->getValueType(0);
15667
15668 // sext_inreg(VGETLANEu) -> VGETLANEs
15669 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15670 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15671 Op.getOperand(0).getValueType().getScalarType())
15672 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15673 Op.getOperand(1));
15674
15675 return SDValue();
15676}
15677
15678static SDValue
15680 SDValue Vec = N->getOperand(0);
15681 SDValue SubVec = N->getOperand(1);
15682 uint64_t IdxVal = N->getConstantOperandVal(2);
15683 EVT VecVT = Vec.getValueType();
15684 EVT SubVT = SubVec.getValueType();
15685
15686 // Only do this for legal fixed vector types.
15687 if (!VecVT.isFixedLengthVector() ||
15688 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15690 return SDValue();
15691
15692 // Ignore widening patterns.
15693 if (IdxVal == 0 && Vec.isUndef())
15694 return SDValue();
15695
15696 // Subvector must be half the width and an "aligned" insertion.
15697 unsigned NumSubElts = SubVT.getVectorNumElements();
15698 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15699 (IdxVal != 0 && IdxVal != NumSubElts))
15700 return SDValue();
15701
15702 // Fold insert_subvector -> concat_vectors
15703 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15704 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15705 SDLoc DL(N);
15706 SDValue Lo, Hi;
15707 if (IdxVal == 0) {
15708 Lo = SubVec;
15709 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15710 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15711 } else {
15712 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15713 DCI.DAG.getVectorIdxConstant(0, DL));
15714 Hi = SubVec;
15715 }
15716 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15717}
15718
15719// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15721 SelectionDAG &DAG) {
15722 SDValue Trunc = N->getOperand(0);
15723 EVT VT = Trunc.getValueType();
15724 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15725 return SDValue();
15726
15727 SDLoc DL(Trunc);
15728 if (isVMOVNTruncMask(N->getMask(), VT, false))
15729 return DAG.getNode(
15730 ARMISD::VMOVN, DL, VT,
15731 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15732 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15733 DAG.getConstant(1, DL, MVT::i32));
15734 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15735 return DAG.getNode(
15736 ARMISD::VMOVN, DL, VT,
15737 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15738 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15739 DAG.getConstant(1, DL, MVT::i32));
15740 return SDValue();
15741}
15742
15743/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15744/// ISD::VECTOR_SHUFFLE.
15746 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15747 return R;
15748
15749 // The LLVM shufflevector instruction does not require the shuffle mask
15750 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15751 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15752 // operands do not match the mask length, they are extended by concatenating
15753 // them with undef vectors. That is probably the right thing for other
15754 // targets, but for NEON it is better to concatenate two double-register
15755 // size vector operands into a single quad-register size vector. Do that
15756 // transformation here:
15757 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15758 // shuffle(concat(v1, v2), undef)
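// For example (illustrative v4i16 inputs): in the new v8i16 shuffle, a
// mask element of 8 (the first lane of the second concat) is remapped to
// 4, the first lane of v2 inside the merged concat, and any lane that
// pointed into an undef half becomes -1.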
15759 SDValue Op0 = N->getOperand(0);
15760 SDValue Op1 = N->getOperand(1);
15761 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15762 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15763 Op0.getNumOperands() != 2 ||
15764 Op1.getNumOperands() != 2)
15765 return SDValue();
15766 SDValue Concat0Op1 = Op0.getOperand(1);
15767 SDValue Concat1Op1 = Op1.getOperand(1);
15768 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15769 return SDValue();
15770 // Skip the transformation if any of the types are illegal.
15771 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15772 EVT VT = N->getValueType(0);
15773 if (!TLI.isTypeLegal(VT) ||
15774 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15775 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15776 return SDValue();
15777
15778 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15779 Op0.getOperand(0), Op1.getOperand(0));
15780 // Translate the shuffle mask.
15781 SmallVector<int, 16> NewMask;
15782 unsigned NumElts = VT.getVectorNumElements();
15783 unsigned HalfElts = NumElts/2;
15784 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15785 for (unsigned n = 0; n < NumElts; ++n) {
15786 int MaskElt = SVN->getMaskElt(n);
15787 int NewElt = -1;
15788 if (MaskElt < (int)HalfElts)
15789 NewElt = MaskElt;
15790 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15791 NewElt = HalfElts + MaskElt - NumElts;
15792 NewMask.push_back(NewElt);
15793 }
15794 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15795 DAG.getUNDEF(VT), NewMask);
15796}
15797
15798/// Load/store instruction that can be merged with a base address
15799/// update
15804 unsigned AddrOpIdx;
15805};
15806
15808 /// Instruction that updates a pointer
15810 /// Pointer increment operand
15812 /// Pointer increment value if it is a constant, or 0 otherwise
15813 unsigned ConstInc;
15814};
15815
15817 // Check that the add is independent of the load/store.
15818 // Otherwise, folding it would create a cycle. Search through Addr
15819 // as well, since the User may not be a direct user of Addr and
15820 // only share a base pointer.
15823 Worklist.push_back(N);
15824 Worklist.push_back(User);
15825 const unsigned MaxSteps = 1024;
15826 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15827 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15828 return false;
15829 return true;
15830}
15831
15833 struct BaseUpdateUser &User,
15834 bool SimpleConstIncOnly,
15836 SelectionDAG &DAG = DCI.DAG;
15837 SDNode *N = Target.N;
15838 MemSDNode *MemN = cast<MemSDNode>(N);
15839 SDLoc dl(N);
15840
15841 // Find the new opcode for the updating load/store.
15842 bool isLoadOp = true;
15843 bool isLaneOp = false;
15844 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15845 // as an operand.
15846 bool hasAlignment = true;
15847 unsigned NewOpc = 0;
15848 unsigned NumVecs = 0;
15849 if (Target.isIntrinsic) {
15850 unsigned IntNo = N->getConstantOperandVal(1);
15851 switch (IntNo) {
15852 default:
15853 llvm_unreachable("unexpected intrinsic for Neon base update");
15854 case Intrinsic::arm_neon_vld1:
15855 NewOpc = ARMISD::VLD1_UPD;
15856 NumVecs = 1;
15857 break;
15858 case Intrinsic::arm_neon_vld2:
15859 NewOpc = ARMISD::VLD2_UPD;
15860 NumVecs = 2;
15861 break;
15862 case Intrinsic::arm_neon_vld3:
15863 NewOpc = ARMISD::VLD3_UPD;
15864 NumVecs = 3;
15865 break;
15866 case Intrinsic::arm_neon_vld4:
15867 NewOpc = ARMISD::VLD4_UPD;
15868 NumVecs = 4;
15869 break;
15870 case Intrinsic::arm_neon_vld1x2:
15871 NewOpc = ARMISD::VLD1x2_UPD;
15872 NumVecs = 2;
15873 hasAlignment = false;
15874 break;
15875 case Intrinsic::arm_neon_vld1x3:
15876 NewOpc = ARMISD::VLD1x3_UPD;
15877 NumVecs = 3;
15878 hasAlignment = false;
15879 break;
15880 case Intrinsic::arm_neon_vld1x4:
15881 NewOpc = ARMISD::VLD1x4_UPD;
15882 NumVecs = 4;
15883 hasAlignment = false;
15884 break;
15885 case Intrinsic::arm_neon_vld2dup:
15886 NewOpc = ARMISD::VLD2DUP_UPD;
15887 NumVecs = 2;
15888 break;
15889 case Intrinsic::arm_neon_vld3dup:
15890 NewOpc = ARMISD::VLD3DUP_UPD;
15891 NumVecs = 3;
15892 break;
15893 case Intrinsic::arm_neon_vld4dup:
15894 NewOpc = ARMISD::VLD4DUP_UPD;
15895 NumVecs = 4;
15896 break;
15897 case Intrinsic::arm_neon_vld2lane:
15898 NewOpc = ARMISD::VLD2LN_UPD;
15899 NumVecs = 2;
15900 isLaneOp = true;
15901 break;
15902 case Intrinsic::arm_neon_vld3lane:
15903 NewOpc = ARMISD::VLD3LN_UPD;
15904 NumVecs = 3;
15905 isLaneOp = true;
15906 break;
15907 case Intrinsic::arm_neon_vld4lane:
15908 NewOpc = ARMISD::VLD4LN_UPD;
15909 NumVecs = 4;
15910 isLaneOp = true;
15911 break;
15912 case Intrinsic::arm_neon_vst1:
15913 NewOpc = ARMISD::VST1_UPD;
15914 NumVecs = 1;
15915 isLoadOp = false;
15916 break;
15917 case Intrinsic::arm_neon_vst2:
15918 NewOpc = ARMISD::VST2_UPD;
15919 NumVecs = 2;
15920 isLoadOp = false;
15921 break;
15922 case Intrinsic::arm_neon_vst3:
15923 NewOpc = ARMISD::VST3_UPD;
15924 NumVecs = 3;
15925 isLoadOp = false;
15926 break;
15927 case Intrinsic::arm_neon_vst4:
15928 NewOpc = ARMISD::VST4_UPD;
15929 NumVecs = 4;
15930 isLoadOp = false;
15931 break;
15932 case Intrinsic::arm_neon_vst2lane:
15933 NewOpc = ARMISD::VST2LN_UPD;
15934 NumVecs = 2;
15935 isLoadOp = false;
15936 isLaneOp = true;
15937 break;
15938 case Intrinsic::arm_neon_vst3lane:
15939 NewOpc = ARMISD::VST3LN_UPD;
15940 NumVecs = 3;
15941 isLoadOp = false;
15942 isLaneOp = true;
15943 break;
15944 case Intrinsic::arm_neon_vst4lane:
15945 NewOpc = ARMISD::VST4LN_UPD;
15946 NumVecs = 4;
15947 isLoadOp = false;
15948 isLaneOp = true;
15949 break;
15950 case Intrinsic::arm_neon_vst1x2:
15951 NewOpc = ARMISD::VST1x2_UPD;
15952 NumVecs = 2;
15953 isLoadOp = false;
15954 hasAlignment = false;
15955 break;
15956 case Intrinsic::arm_neon_vst1x3:
15957 NewOpc = ARMISD::VST1x3_UPD;
15958 NumVecs = 3;
15959 isLoadOp = false;
15960 hasAlignment = false;
15961 break;
15962 case Intrinsic::arm_neon_vst1x4:
15963 NewOpc = ARMISD::VST1x4_UPD;
15964 NumVecs = 4;
15965 isLoadOp = false;
15966 hasAlignment = false;
15967 break;
15968 }
15969 } else {
15970 isLaneOp = true;
15971 switch (N->getOpcode()) {
15972 default:
15973 llvm_unreachable("unexpected opcode for Neon base update");
15974 case ARMISD::VLD1DUP:
15975 NewOpc = ARMISD::VLD1DUP_UPD;
15976 NumVecs = 1;
15977 break;
15978 case ARMISD::VLD2DUP:
15979 NewOpc = ARMISD::VLD2DUP_UPD;
15980 NumVecs = 2;
15981 break;
15982 case ARMISD::VLD3DUP:
15983 NewOpc = ARMISD::VLD3DUP_UPD;
15984 NumVecs = 3;
15985 break;
15986 case ARMISD::VLD4DUP:
15987 NewOpc = ARMISD::VLD4DUP_UPD;
15988 NumVecs = 4;
15989 break;
15990 case ISD::LOAD:
15991 NewOpc = ARMISD::VLD1_UPD;
15992 NumVecs = 1;
15993 isLaneOp = false;
15994 break;
15995 case ISD::STORE:
15996 NewOpc = ARMISD::VST1_UPD;
15997 NumVecs = 1;
15998 isLaneOp = false;
15999 isLoadOp = false;
16000 break;
16001 }
16002 }
16003
16004 // Find the size of memory referenced by the load/store.
16005 EVT VecTy;
16006 if (isLoadOp) {
16007 VecTy = N->getValueType(0);
16008 } else if (Target.isIntrinsic) {
16009 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
16010 } else {
16011 assert(Target.isStore &&
16012 "Node has to be a load, a store, or an intrinsic!");
16013 VecTy = N->getOperand(1).getValueType();
16014 }
16015
16016 bool isVLDDUPOp =
16017 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
16018 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
16019
16020 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16021 if (isLaneOp || isVLDDUPOp)
16022 NumBytes /= VecTy.getVectorNumElements();
16023
16024 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
16025 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
16026 // separate instructions that make it harder to use a non-constant update.
16027 return false;
16028 }
16029
16030 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16031 return false;
16032
16033 if (!isValidBaseUpdate(N, User.N))
16034 return false;
16035
16036 // OK, we found an ADD we can fold into the base update.
16037 // Now, create a _UPD node, taking care of not breaking alignment.
16038
16039 EVT AlignedVecTy = VecTy;
16040 Align Alignment = MemN->getAlign();
16041
16042 // If this is a less-than-standard-aligned load/store, change the type to
16043 // match the standard alignment.
16044 // The alignment is overlooked when selecting _UPD variants; and it's
16045 // easier to introduce bitcasts here than fix that.
16046 // There are 3 ways to get to this base-update combine:
16047 // - intrinsics: they are assumed to be properly aligned (to the standard
16048 // alignment of the memory type), so we don't need to do anything.
16049 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16050 // intrinsics, so, likewise, there's nothing to do.
16051 // - generic load/store instructions: the alignment is specified as an
16052 // explicit operand, rather than implicitly as the standard alignment
16053 // of the memory type (like the intrinsics). We need to change the
16054 // memory type to match the explicit alignment. That way, we don't
16055 // generate non-standard-aligned ARMISD::VLDx nodes.
16056 if (isa<LSBaseSDNode>(N)) {
16057 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16058 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16059 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16060 assert(!isLaneOp && "Unexpected generic load/store lane.");
16061 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16062 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16063 }
16064 // Don't set an explicit alignment on regular load/stores that we want
16065 // to transform to VLD/VST 1_UPD nodes.
16066 // This matches the behavior of regular load/stores, which only get an
16067 // explicit alignment if the MMO alignment is larger than the standard
16068 // alignment of the memory type.
16069 // Intrinsics, however, always get an explicit alignment, set to the
16070 // alignment of the MMO.
16071 Alignment = Align(1);
16072 }
16073
16074 // Create the new updating load/store node.
16075 // First, create an SDVTList for the new updating node's results.
16076 EVT Tys[6];
16077 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16078 unsigned n;
16079 for (n = 0; n < NumResultVecs; ++n)
16080 Tys[n] = AlignedVecTy;
16081 Tys[n++] = MVT::i32;
16082 Tys[n] = MVT::Other;
16083 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16084
16085 // Then, gather the new node's operands.
16087 Ops.push_back(N->getOperand(0)); // incoming chain
16088 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16089 Ops.push_back(User.Inc);
16090
16091 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16092 // Try to match the intrinsic's signature
16093 Ops.push_back(StN->getValue());
16094 } else {
16095 // Loads (and of course intrinsics) match the intrinsics' signature,
16096 // so just add all but the alignment operand.
16097 unsigned LastOperand =
16098 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16099 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16100 Ops.push_back(N->getOperand(i));
16101 }
16102
16103 // For all node types, the alignment operand is always the last one.
16104 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16105
16106 // If this is a non-standard-aligned STORE, the penultimate operand is the
16107 // stored value. Bitcast it to the aligned type.
16108 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16109 SDValue &StVal = Ops[Ops.size() - 2];
16110 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16111 }
16112
16113 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16114 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16115 MemN->getMemOperand());
16116
16117 // Update the uses.
16118 SmallVector<SDValue, 5> NewResults;
16119 for (unsigned i = 0; i < NumResultVecs; ++i)
16120 NewResults.push_back(SDValue(UpdN.getNode(), i));
16121
16122 // If this is a non-standard-aligned LOAD, the first result is the loaded
16123 // value. Bitcast it to the expected result type.
16124 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16125 SDValue &LdVal = NewResults[0];
16126 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16127 }
16128
16129 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16130 DCI.CombineTo(N, NewResults);
16131 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16132
16133 return true;
16134}
16135
16136 // If (opcode ptr inc) is an ADD-like instruction, return the
16137// increment value. Otherwise return 0.
16138static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16139 SDValue Inc, const SelectionDAG &DAG) {
16140 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16141 if (!CInc)
16142 return 0;
16143
16144 switch (Opcode) {
16145 case ARMISD::VLD1_UPD:
16146 case ISD::ADD:
16147 return CInc->getZExtValue();
16148 case ISD::OR: {
16149 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16150 // (OR ptr inc) is the same as (ADD ptr inc)
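// For example (illustrative): if Ptr is known to be 16-byte aligned and
// Inc is 8, the two values have no set bits in common, so "or Ptr, 8"
// computes the same address as "add Ptr, 8".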
16151 return CInc->getZExtValue();
16152 }
16153 return 0;
16154 }
16155 default:
16156 return 0;
16157 }
16158}
16159
16161 switch (N->getOpcode()) {
16162 case ISD::ADD:
16163 case ISD::OR: {
16164 if (isa<ConstantSDNode>(N->getOperand(1))) {
16165 *Ptr = N->getOperand(0);
16166 *CInc = N->getOperand(1);
16167 return true;
16168 }
16169 return false;
16170 }
16171 case ARMISD::VLD1_UPD: {
16172 if (isa<ConstantSDNode>(N->getOperand(2))) {
16173 *Ptr = N->getOperand(1);
16174 *CInc = N->getOperand(2);
16175 return true;
16176 }
16177 return false;
16178 }
16179 default:
16180 return false;
16181 }
16182}
16183
16184/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16185/// NEON load/store intrinsics, and generic vector load/stores, to merge
16186/// base address updates.
16187/// For generic load/stores, the memory type is assumed to be a vector.
16188/// The caller is assumed to have checked legality.
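/// For example (illustrative): "vld1.32 {d0}, [r0]" followed by
/// "add r1, r0, #8" can be merged into the post-indexed form
/// "vld1.32 {d0}, [r0]!", with users of the add rewired to the updated
/// address result of the new VLD1_UPD node.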
16191 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16192 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16193 const bool isStore = N->getOpcode() == ISD::STORE;
16194 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16195 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16196
16197 // Limit the number of possible base-updates we look at to prevent degenerate
16198 // cases.
16199 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16200
16201 SDValue Addr = N->getOperand(AddrOpIdx);
16202
16204
16205 // Search for a use of the address operand that is an increment.
16206 for (SDUse &Use : Addr->uses()) {
16207 SDNode *User = Use.getUser();
16208 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16209 continue;
16210
16211 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16212 unsigned ConstInc =
16213 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16214
16215 if (ConstInc || User->getOpcode() == ISD::ADD) {
16216 BaseUpdates.push_back({User, Inc, ConstInc});
16217 if (BaseUpdates.size() >= MaxBaseUpdates)
16218 break;
16219 }
16220 }
16221
16222 // If the address is a constant pointer increment itself, find
16223 // another constant increment that has the same base operand
16224 SDValue Base;
16225 SDValue CInc;
16226 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16227 unsigned Offset =
16228 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16229 for (SDUse &Use : Base->uses()) {
16230
16231 SDNode *User = Use.getUser();
16232 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16233 User->getNumOperands() != 2)
16234 continue;
16235
16236 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16237 unsigned UserOffset =
16238 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16239
16240 if (!UserOffset || UserOffset <= Offset)
16241 continue;
16242
16243 unsigned NewConstInc = UserOffset - Offset;
16244 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16245 BaseUpdates.push_back({User, NewInc, NewConstInc});
16246 if (BaseUpdates.size() >= MaxBaseUpdates)
16247 break;
16248 }
16249 }
16250
16251 // Try to fold the load/store with an update that matches memory
16252 // access size. This should work well for sequential loads.
16253 unsigned NumValidUpd = BaseUpdates.size();
16254 for (unsigned I = 0; I < NumValidUpd; I++) {
16255 BaseUpdateUser &User = BaseUpdates[I];
16256 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16257 return SDValue();
16258 }
16259
16260 // Try to fold with other users. Non-constant updates are considered
16261 // first, and constant updates are sorted to not break a sequence of
16262 // strided accesses (if there is any).
16263 llvm::stable_sort(BaseUpdates,
16264 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16265 return LHS.ConstInc < RHS.ConstInc;
16266 });
16267 for (BaseUpdateUser &User : BaseUpdates) {
16268 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16269 return SDValue();
16270 }
16271 return SDValue();
16272}
16273
16276 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16277 return SDValue();
16278
16279 return CombineBaseUpdate(N, DCI);
16280}
16281
16284 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16285 return SDValue();
16286
16287 SelectionDAG &DAG = DCI.DAG;
16288 SDValue Addr = N->getOperand(2);
16289 MemSDNode *MemN = cast<MemSDNode>(N);
16290 SDLoc dl(N);
16291
16292 // For the stores, where there are multiple intrinsics, we only actually want
16293 // to post-inc the last of them.
16294 unsigned IntNo = N->getConstantOperandVal(1);
16295 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16296 return SDValue();
16297 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16298 return SDValue();
16299
16300 // Search for a use of the address operand that is an increment.
16301 for (SDUse &Use : Addr->uses()) {
16302 SDNode *User = Use.getUser();
16303 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16304 continue;
16305
16306 // Check that the add is independent of the load/store. Otherwise, folding
16307 // it would create a cycle. We can avoid searching through Addr as it's a
16308 // predecessor to both.
16311 Visited.insert(Addr.getNode());
16312 Worklist.push_back(N);
16313 Worklist.push_back(User);
16314 const unsigned MaxSteps = 1024;
16315 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16316 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16317 continue;
16318
16319 // Find the new opcode for the updating load/store.
16320 bool isLoadOp = true;
16321 unsigned NewOpc = 0;
16322 unsigned NumVecs = 0;
16323 switch (IntNo) {
16324 default:
16325 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16326 case Intrinsic::arm_mve_vld2q:
16327 NewOpc = ARMISD::VLD2_UPD;
16328 NumVecs = 2;
16329 break;
16330 case Intrinsic::arm_mve_vld4q:
16331 NewOpc = ARMISD::VLD4_UPD;
16332 NumVecs = 4;
16333 break;
16334 case Intrinsic::arm_mve_vst2q:
16335 NewOpc = ARMISD::VST2_UPD;
16336 NumVecs = 2;
16337 isLoadOp = false;
16338 break;
16339 case Intrinsic::arm_mve_vst4q:
16340 NewOpc = ARMISD::VST4_UPD;
16341 NumVecs = 4;
16342 isLoadOp = false;
16343 break;
16344 }
16345
16346 // Find the size of memory referenced by the load/store.
16347 EVT VecTy;
16348 if (isLoadOp) {
16349 VecTy = N->getValueType(0);
16350 } else {
16351 VecTy = N->getOperand(3).getValueType();
16352 }
16353
16354 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16355
16356 // If the increment is a constant, it must match the memory ref size.
16357 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16358 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16359 if (!CInc || CInc->getZExtValue() != NumBytes)
16360 continue;
16361
16362 // Create the new updating load/store node.
16363 // First, create an SDVTList for the new updating node's results.
16364 EVT Tys[6];
16365 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16366 unsigned n;
16367 for (n = 0; n < NumResultVecs; ++n)
16368 Tys[n] = VecTy;
16369 Tys[n++] = MVT::i32;
16370 Tys[n] = MVT::Other;
16371 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16372
16373 // Then, gather the new node's operands.
16374 SmallVector<SDValue, 8> Ops;
16375 Ops.push_back(N->getOperand(0)); // incoming chain
16376 Ops.push_back(N->getOperand(2)); // ptr
16377 Ops.push_back(Inc);
16378
16379 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16380 Ops.push_back(N->getOperand(i));
16381
16382 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16383 MemN->getMemOperand());
16384
16385 // Update the uses.
16386 SmallVector<SDValue, 5> NewResults;
16387 for (unsigned i = 0; i < NumResultVecs; ++i)
16388 NewResults.push_back(SDValue(UpdN.getNode(), i));
16389
16390 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16391 DCI.CombineTo(N, NewResults);
16392 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16393
16394 break;
16395 }
16396
16397 return SDValue();
16398}
16399
16400/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16401/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16402/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16403/// return true.
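// For example (illustrative): if both vector results of a vld2-lane intrinsic
// are only used by VDUPLANEs of that same lane, the whole group can be emitted
// as an all-lanes load such as "vld2.16 {d16[], d17[]}, [r0]".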
16404static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16405 SelectionDAG &DAG = DCI.DAG;
16406 EVT VT = N->getValueType(0);
16407 // vldN-dup instructions only support 64-bit vectors for N > 1.
16408 if (!VT.is64BitVector())
16409 return false;
16410
16411 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16412 SDNode *VLD = N->getOperand(0).getNode();
16413 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16414 return false;
16415 unsigned NumVecs = 0;
16416 unsigned NewOpc = 0;
16417 unsigned IntNo = VLD->getConstantOperandVal(1);
16418 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16419 NumVecs = 2;
16420 NewOpc = ARMISD::VLD2DUP;
16421 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16422 NumVecs = 3;
16423 NewOpc = ARMISD::VLD3DUP;
16424 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16425 NumVecs = 4;
16426 NewOpc = ARMISD::VLD4DUP;
16427 } else {
16428 return false;
16429 }
16430
16431 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16432 // numbers match the load.
16433 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16434 for (SDUse &Use : VLD->uses()) {
16435 // Ignore uses of the chain result.
16436 if (Use.getResNo() == NumVecs)
16437 continue;
16438 SDNode *User = Use.getUser();
16439 if (User->getOpcode() != ARMISD::VDUPLANE ||
16440 VLDLaneNo != User->getConstantOperandVal(1))
16441 return false;
16442 }
16443
16444 // Create the vldN-dup node.
16445 EVT Tys[5];
16446 unsigned n;
16447 for (n = 0; n < NumVecs; ++n)
16448 Tys[n] = VT;
16449 Tys[n] = MVT::Other;
16450 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16451 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16452 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16453 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16454 Ops, VLDMemInt->getMemoryVT(),
16455 VLDMemInt->getMemOperand());
16456
16457 // Update the uses.
16458 for (SDUse &Use : VLD->uses()) {
16459 unsigned ResNo = Use.getResNo();
16460 // Ignore uses of the chain result.
16461 if (ResNo == NumVecs)
16462 continue;
16463 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16464 }
16465
16466 // Now the vldN-lane intrinsic is dead except for its chain result.
16467 // Update uses of the chain.
16468 std::vector<SDValue> VLDDupResults;
16469 for (unsigned n = 0; n < NumVecs; ++n)
16470 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16471 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16472 DCI.CombineTo(VLD, VLDDupResults);
16473
16474 return true;
16475}
16476
16477/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16478/// ARMISD::VDUPLANE.
16479static SDValue PerformVDUPLANECombine(SDNode *N,
16480 TargetLowering::DAGCombinerInfo &DCI,
16481 const ARMSubtarget *Subtarget) {
16482 SDValue Op = N->getOperand(0);
16483 EVT VT = N->getValueType(0);
16484
16485 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16486 if (Subtarget->hasMVEIntegerOps()) {
16487 EVT ExtractVT = VT.getVectorElementType();
16488 // We need to ensure we are creating a legal type.
16489 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16490 ExtractVT = MVT::i32;
16491 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16492 N->getOperand(0), N->getOperand(1));
16493 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16494 }
16495
16496 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16497 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16498 if (CombineVLDDUP(N, DCI))
16499 return SDValue(N, 0);
16500
16501 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16502 // redundant. Ignore bit_converts for now; element sizes are checked below.
16503 while (Op.getOpcode() == ISD::BITCAST)
16504 Op = Op.getOperand(0);
16505 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16506 return SDValue();
16507
16508 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16509 unsigned EltSize = Op.getScalarValueSizeInBits();
16510 // The canonical VMOV for a zero vector uses a 32-bit element size.
16511 unsigned Imm = Op.getConstantOperandVal(0);
16512 unsigned EltBits;
16513 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16514 EltSize = 8;
16515 if (EltSize > VT.getScalarSizeInBits())
16516 return SDValue();
16517
16518 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16519}
16520
16521/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16522static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16523 const ARMSubtarget *Subtarget) {
16524 SDValue Op = N->getOperand(0);
16525 SDLoc dl(N);
16526
16527 if (Subtarget->hasMVEIntegerOps()) {
16528 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16529 // need to come from a GPR.
16530 if (Op.getValueType() == MVT::f32)
16531 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16532 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16533 else if (Op.getValueType() == MVT::f16)
16534 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16535 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16536 }
16537
16538 if (!Subtarget->hasNEON())
16539 return SDValue();
16540
16541 // Match VDUP(LOAD) -> VLD1DUP.
16542 // We match this pattern here rather than waiting for isel because the
16543 // transform is only legal for unindexed loads.
16544 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16545 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16546 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16547 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16548 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16549 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16550 SDValue VLDDup =
16551 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16552 LD->getMemoryVT(), LD->getMemOperand());
16553 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16554 return VLDDup;
16555 }
16556
16557 return SDValue();
16558}
16559
16560static SDValue PerformLOADCombine(SDNode *N,
16561 TargetLowering::DAGCombinerInfo &DCI,
16562 const ARMSubtarget *Subtarget) {
16563 EVT VT = N->getValueType(0);
16564
16565 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16566 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16567 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16568 return CombineBaseUpdate(N, DCI);
16569
16570 return SDValue();
16571}
16572
16573// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16574// pack all of the elements in one place. Next, store to memory in fewer
16575// chunks.
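// For example (illustrative): a truncating store of <4 x i32> to <4 x i8>
// becomes a shuffle that packs the four bytes into the low lanes of a v16i8
// register, followed by a single i32 store instead of four byte stores.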
16576static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16577 SelectionDAG &DAG) {
16578 SDValue StVal = St->getValue();
16579 EVT VT = StVal.getValueType();
16580 if (!St->isTruncatingStore() || !VT.isVector())
16581 return SDValue();
16582 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16583 EVT StVT = St->getMemoryVT();
16584 unsigned NumElems = VT.getVectorNumElements();
16585 assert(StVT != VT && "Cannot truncate to the same type");
16586 unsigned FromEltSz = VT.getScalarSizeInBits();
16587 unsigned ToEltSz = StVT.getScalarSizeInBits();
16588
16589 // From, To sizes and ElemCount must be pow of two
16590 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16591 return SDValue();
16592
16593 // We are going to use the original vector elt for storing.
16594 // Accumulated smaller vector elements must be a multiple of the store size.
16595 if (0 != (NumElems * FromEltSz) % ToEltSz)
16596 return SDValue();
16597
16598 unsigned SizeRatio = FromEltSz / ToEltSz;
16599 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16600
16601 // Create a type on which we perform the shuffle.
16602 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16603 NumElems * SizeRatio);
16604 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16605
16606 SDLoc DL(St);
16607 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16608 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16609 for (unsigned i = 0; i < NumElems; ++i)
16610 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16611 : i * SizeRatio;
16612
16613 // Can't shuffle using an illegal type.
16614 if (!TLI.isTypeLegal(WideVecVT))
16615 return SDValue();
16616
16617 SDValue Shuff = DAG.getVectorShuffle(
16618 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16619 // At this point all of the data is stored at the bottom of the
16620 // register. We now need to save it to mem.
16621
16622 // Find the largest store unit
16623 MVT StoreType = MVT::i8;
16624 for (MVT Tp : MVT::integer_valuetypes()) {
16625 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16626 StoreType = Tp;
16627 }
16628 // Didn't find a legal store type.
16629 if (!TLI.isTypeLegal(StoreType))
16630 return SDValue();
16631
16632 // Bitcast the original vector into a vector of store-size units
16633 EVT StoreVecVT =
16634 EVT::getVectorVT(*DAG.getContext(), StoreType,
16635 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16636 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16637 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16638 SmallVector<SDValue, 8> Chains;
16639 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16640 TLI.getPointerTy(DAG.getDataLayout()));
16641 SDValue BasePtr = St->getBasePtr();
16642
16643 // Perform one or more big stores into memory.
16644 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16645 for (unsigned I = 0; I < E; I++) {
16646 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16647 ShuffWide, DAG.getIntPtrConstant(I, DL));
16648 SDValue Ch =
16649 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16650 St->getAlign(), St->getMemOperand()->getFlags());
16651 BasePtr =
16652 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16653 Chains.push_back(Ch);
16654 }
16655 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16656}
16657
16658// Try taking a single vector store from an fpround (which would otherwise turn
16659// into an expensive buildvector) and splitting it into a series of narrowing
16660// stores.
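// For example (illustrative): a store of (fpround <8 x float> to <8 x half>)
// becomes two VCVTN conversions of <4 x float> halves, each written with a
// narrowing <4 x i32> -> <4 x i16> truncating store at offsets 0 and 8.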
16661static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16662 SelectionDAG &DAG) {
16663 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16664 return SDValue();
16665 SDValue Trunc = St->getValue();
16666 if (Trunc->getOpcode() != ISD::FP_ROUND)
16667 return SDValue();
16668 EVT FromVT = Trunc->getOperand(0).getValueType();
16669 EVT ToVT = Trunc.getValueType();
16670 if (!ToVT.isVector())
16671 return SDValue();
16673 EVT ToEltVT = ToVT.getVectorElementType();
16674 EVT FromEltVT = FromVT.getVectorElementType();
16675
16676 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16677 return SDValue();
16678
16679 unsigned NumElements = 4;
16680 if (FromVT.getVectorNumElements() % NumElements != 0)
16681 return SDValue();
16682
16683 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16684 // use the VMOVN over splitting the store. We are looking for patterns of:
16685 // !rev: 0 N 1 N+1 2 N+2 ...
16686 // rev: N 0 N+1 1 N+2 2 ...
16687 // The shuffle may either be a single source (in which case N = NumElts/2) or
16688 // two inputs extended with concat to the same size (in which case N =
16689 // NumElts).
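 // For example (illustrative), for a single-source v8f16 shuffle (N = 4):
 //   !rev: <0, 4, 1, 5, 2, 6, 3, 7>
 //   rev:  <4, 0, 5, 1, 6, 2, 7, 3>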
16690 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16691 ArrayRef<int> M = SVN->getMask();
16692 unsigned NumElts = ToVT.getVectorNumElements();
16693 if (SVN->getOperand(1).isUndef())
16694 NumElts /= 2;
16695
16696 unsigned Off0 = Rev ? NumElts : 0;
16697 unsigned Off1 = Rev ? 0 : NumElts;
16698
16699 for (unsigned I = 0; I < NumElts; I += 2) {
16700 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16701 return false;
16702 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16703 return false;
16704 }
16705
16706 return true;
16707 };
16708
16709 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16710 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16711 return SDValue();
16712
16713 LLVMContext &C = *DAG.getContext();
16714 SDLoc DL(St);
16715 // Details about the old store
16716 SDValue Ch = St->getChain();
16717 SDValue BasePtr = St->getBasePtr();
16718 Align Alignment = St->getBaseAlign();
16719 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16720 AAMDNodes AAInfo = St->getAAInfo();
16721
16722 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16723 // and then stored as truncating integer stores.
16724 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16725 EVT NewToVT = EVT::getVectorVT(
16726 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16727
16728 SmallVector<SDValue, 4> Stores;
16729 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16730 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16731 SDValue NewPtr =
16732 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16733
16734 SDValue Extract =
16735 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16736 DAG.getConstant(i * NumElements, DL, MVT::i32));
16737
16738 SDValue FPTrunc =
16739 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16740 Extract, DAG.getConstant(0, DL, MVT::i32));
16741 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16742
16743 SDValue Store = DAG.getTruncStore(
16744 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16745 NewToVT, Alignment, MMOFlags, AAInfo);
16746 Stores.push_back(Store);
16747 }
16748 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16749}
16750
16751// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16752// into an expensive buildvector) and splitting it into a series of narrowing
16753// stores.
16754static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16755 SelectionDAG &DAG) {
16756 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16757 return SDValue();
16758 SDValue Trunc = St->getValue();
16759 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16760 return SDValue();
16761 EVT FromVT = Trunc->getOperand(0).getValueType();
16762 EVT ToVT = Trunc.getValueType();
16763
16764 LLVMContext &C = *DAG.getContext();
16765 SDLoc DL(St);
16766 // Details about the old store
16767 SDValue Ch = St->getChain();
16768 SDValue BasePtr = St->getBasePtr();
16769 Align Alignment = St->getBaseAlign();
16770 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16771 AAMDNodes AAInfo = St->getAAInfo();
16772
16773 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16774 FromVT.getVectorNumElements());
16775
16776 SmallVector<SDValue, 4> Stores;
16777 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16778 unsigned NewOffset =
16779 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16780 SDValue NewPtr =
16781 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16782
16783 SDValue Extract = Trunc.getOperand(i);
16784 SDValue Store = DAG.getTruncStore(
16785 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16786 NewToVT, Alignment, MMOFlags, AAInfo);
16787 Stores.push_back(Store);
16788 }
16789 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16790}
16791
16792// Given a floating point store from an extracted vector, with an integer
16793// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16794// help reduce fp register pressure, avoids the fp extract and allows the use
16795// of more integer post-inc stores, which are not available with vstr.
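// For example (illustrative): if t = VGETLANEu(q0, lane) already exists, then
// "store f16 (extractelement q0, lane)" can instead be emitted as a truncating
// i16 store of t, keeping the value in the integer pipeline.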
16796static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16797 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16798 return SDValue();
16799 SDValue Extract = St->getValue();
16800 EVT VT = Extract.getValueType();
16801 // For now this only handles f16. This may be useful for f32 too, but that will
16802 // be bitcast(extract), not the VGETLANEu we currently check here.
16803 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16804 return SDValue();
16805
16806 SDNode *GetLane =
16807 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16808 {Extract.getOperand(0), Extract.getOperand(1)});
16809 if (!GetLane)
16810 return SDValue();
16811
16812 LLVMContext &C = *DAG.getContext();
16813 SDLoc DL(St);
16814 // Create a new integer store to replace the existing floating point version.
16815 SDValue Ch = St->getChain();
16816 SDValue BasePtr = St->getBasePtr();
16817 Align Alignment = St->getBaseAlign();
16818 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16819 AAMDNodes AAInfo = St->getAAInfo();
16820 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16821 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16822 St->getPointerInfo(), NewToVT, Alignment,
16823 MMOFlags, AAInfo);
16824
16825 return Store;
16826}
16827
16828/// PerformSTORECombine - Target-specific dag combine xforms for
16829/// ISD::STORE.
16830static SDValue PerformSTORECombine(SDNode *N,
16831 TargetLowering::DAGCombinerInfo &DCI,
16832 const ARMSubtarget *Subtarget) {
16833 StoreSDNode *St = cast<StoreSDNode>(N);
16834 if (St->isVolatile())
16835 return SDValue();
16836 SDValue StVal = St->getValue();
16837 EVT VT = StVal.getValueType();
16838
16839 if (Subtarget->hasNEON())
16840 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16841 return Store;
16842
16843 if (Subtarget->hasMVEFloatOps())
16844 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16845 return NewToken;
16846
16847 if (Subtarget->hasMVEIntegerOps()) {
16848 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16849 return NewChain;
16850 if (SDValue NewToken =
16851 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16852 return NewToken;
16853 }
16854
16855 if (!ISD::isNormalStore(St))
16856 return SDValue();
16857
16858 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16859 // ARM stores of arguments in the same cache line.
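 // For example (illustrative, little-endian): "store (VMOVDRR r0, r1)" becomes
 // an i32 store of r0 at the original address and an i32 store of r1 at
 // offset 4.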
16860 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16861 StVal.getNode()->hasOneUse()) {
16862 SelectionDAG &DAG = DCI.DAG;
16863 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16864 SDLoc DL(St);
16865 SDValue BasePtr = St->getBasePtr();
16866 SDValue NewST1 = DAG.getStore(
16867 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16868 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16869 St->getMemOperand()->getFlags());
16870
16871 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16872 DAG.getConstant(4, DL, MVT::i32));
16873 return DAG.getStore(NewST1.getValue(0), DL,
16874 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16875 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16876 St->getBaseAlign(), St->getMemOperand()->getFlags());
16877 }
16878
16879 if (StVal.getValueType() == MVT::i64 &&
16880 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16881
16882 // Bitcast an i64 store extracted from a vector to f64.
16883 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16884 SelectionDAG &DAG = DCI.DAG;
16885 SDLoc dl(StVal);
16886 SDValue IntVec = StVal.getOperand(0);
16887 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16888 IntVec.getValueType().getVectorNumElements());
16889 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16890 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16891 Vec, StVal.getOperand(1));
16892 dl = SDLoc(N);
16893 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16894 // Make the DAGCombiner fold the bitcasts.
16895 DCI.AddToWorklist(Vec.getNode());
16896 DCI.AddToWorklist(ExtElt.getNode());
16897 DCI.AddToWorklist(V.getNode());
16898 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16899 St->getPointerInfo(), St->getAlign(),
16900 St->getMemOperand()->getFlags(), St->getAAInfo());
16901 }
16902
16903 // If this is a legal vector store, try to combine it into a VST1_UPD.
16904 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16905 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16906 return CombineBaseUpdate(N, DCI);
16907
16908 return SDValue();
16909}
16910
16911/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16912/// can replace combinations of VMUL and VCVT (floating-point to integer)
16913/// when the VMUL has a constant operand that is a power of 2.
16914///
16915/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16916/// vmul.f32 d16, d17, d16
16917/// vcvt.s32.f32 d16, d16
16918/// becomes:
16919/// vcvt.s32.f32 d16, d16, #3
16920static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16921 const ARMSubtarget *Subtarget) {
16922 if (!Subtarget->hasNEON())
16923 return SDValue();
16924
16925 SDValue Op = N->getOperand(0);
16926 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16927 Op.getOpcode() != ISD::FMUL)
16928 return SDValue();
16929
16930 SDValue ConstVec = Op->getOperand(1);
16931 if (!isa<BuildVectorSDNode>(ConstVec))
16932 return SDValue();
16933
16934 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16935 uint32_t FloatBits = FloatTy.getSizeInBits();
16936 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16937 uint32_t IntBits = IntTy.getSizeInBits();
16938 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16939 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16940 // These instructions only exist converting from f32 to i32. We can handle
16941 // smaller integers by generating an extra truncate, but larger ones would
16942 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16943 // these instructions only support v2i32/v4i32 types.
16944 return SDValue();
16945 }
16946
16947 BitVector UndefElements;
16948 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16949 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16950 if (C == -1 || C == 0 || C > 32)
16951 return SDValue();
16952
16953 SDLoc dl(N);
16954 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16955 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16956 Intrinsic::arm_neon_vcvtfp2fxu;
16957 SDValue FixConv = DAG.getNode(
16958 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16959 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16960 DAG.getConstant(C, dl, MVT::i32));
16961
16962 if (IntBits < FloatBits)
16963 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16964
16965 return FixConv;
16966}
16967
16968static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16969 const ARMSubtarget *Subtarget) {
16970 if (!Subtarget->hasMVEFloatOps())
16971 return SDValue();
16972
16973 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16974 // The second form can be more easily turned into a predicated vadd, and
16975 // possibly combined into a fma to become a predicated vfma.
16976 SDValue Op0 = N->getOperand(0);
16977 SDValue Op1 = N->getOperand(1);
16978 EVT VT = N->getValueType(0);
16979 SDLoc DL(N);
16980
16981 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16982 // which these VMOV's represent.
16983 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16984 if (Op.getOpcode() != ISD::BITCAST ||
16985 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16986 return false;
16987 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16988 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16989 return true;
16990 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16991 return true;
16992 return false;
16993 };
16994
16995 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16996 std::swap(Op0, Op1);
16997
16998 if (Op1.getOpcode() != ISD::VSELECT)
16999 return SDValue();
17000
17001 SDNodeFlags FaddFlags = N->getFlags();
17002 bool NSZ = FaddFlags.hasNoSignedZeros();
17003 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
17004 return SDValue();
17005
17006 SDValue FAdd =
17007 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17008 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17009}
17010
17011static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
17012 SDValue LHS = N->getOperand(0);
17013 SDValue RHS = N->getOperand(1);
17014 EVT VT = N->getValueType(0);
17015 SDLoc DL(N);
17016
17017 if (!N->getFlags().hasAllowReassociation())
17018 return SDValue();
17019
17020 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
17021 auto ReassocComplex = [&](SDValue A, SDValue B) {
17022 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17023 return SDValue();
17024 unsigned Opc = A.getConstantOperandVal(0);
17025 if (Opc != Intrinsic::arm_mve_vcmlaq)
17026 return SDValue();
17027 SDValue VCMLA = DAG.getNode(
17028 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17029 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17030 A.getOperand(3), A.getOperand(4));
17031 VCMLA->setFlags(A->getFlags());
17032 return VCMLA;
17033 };
17034 if (SDValue R = ReassocComplex(LHS, RHS))
17035 return R;
17036 if (SDValue R = ReassocComplex(RHS, LHS))
17037 return R;
17038
17039 return SDValue();
17040}
17041
17042static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17043 const ARMSubtarget *Subtarget) {
17044 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17045 return S;
17046 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17047 return S;
17048 return SDValue();
17049}
17050
17051/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17052/// can replace combinations of VCVT (integer to floating-point) and VMUL
17053/// when the VMUL has a constant operand that is a power of 2.
17054///
17055/// Example (assume d17 = <float 0.125, float 0.125>):
17056/// vcvt.f32.s32 d16, d16
17057/// vmul.f32 d16, d16, d17
17058/// becomes:
17059/// vcvt.f32.s32 d16, d16, #3
17061 const ARMSubtarget *Subtarget) {
17062 if (!Subtarget->hasNEON())
17063 return SDValue();
17064
17065 SDValue Op = N->getOperand(0);
17066 unsigned OpOpcode = Op.getNode()->getOpcode();
17067 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17068 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17069 return SDValue();
17070
17071 SDValue ConstVec = N->getOperand(1);
17072 if (!isa<BuildVectorSDNode>(ConstVec))
17073 return SDValue();
17074
17075 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17076 uint32_t FloatBits = FloatTy.getSizeInBits();
17077 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17078 uint32_t IntBits = IntTy.getSizeInBits();
17079 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17080 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17081 // These instructions only exist converting from i32 to f32. We can handle
17082 // smaller integers by generating an extra extend, but larger ones would
17083 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17084 // these instructions only support v2i32/v4i32 types.
17085 return SDValue();
17086 }
17087
17088 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17089 APFloat Recip(0.0f);
17090 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17091 return SDValue();
17092
17093 bool IsExact;
17094 APSInt IntVal(33);
17095 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17096 APFloat::opOK ||
17097 !IsExact)
17098 return SDValue();
17099
17100 int32_t C = IntVal.exactLogBase2();
17101 if (C == -1 || C == 0 || C > 32)
17102 return SDValue();
17103
17104 SDLoc DL(N);
17105 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17106 SDValue ConvInput = Op.getOperand(0);
17107 if (IntBits < FloatBits)
17108 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17109 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17110
17111 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17112 : Intrinsic::arm_neon_vcvtfxu2fp;
17113 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17114 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17115 DAG.getConstant(C, DL, MVT::i32));
17116}
17117
17118static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17119 const ARMSubtarget *ST) {
17120 if (!ST->hasMVEIntegerOps())
17121 return SDValue();
17122
17123 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17124 EVT ResVT = N->getValueType(0);
17125 SDValue N0 = N->getOperand(0);
17126 SDLoc dl(N);
17127
17128 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17129 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17130 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17131 N0.getValueType() == MVT::v16i8)) {
17132 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17133 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17134 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17135 }
17136
17137 // We are looking for something that will have illegal types if left alone,
17138 // but that we can convert to a single instruction under MVE. For example
17139 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17140 // or
17141 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17142
17143 // The legal cases are:
17144 // VADDV u/s 8/16/32
17145 // VMLAV u/s 8/16/32
17146 // VADDLV u/s 32
17147 // VMLALV u/s 16/32
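 // For example (illustrative):
 //   vecreduce.add(zext <16 x i8> %a to <16 x i32>)              -> VADDV.u8 %a
 //   i64 vecreduce.add(mul(sext <8 x i16> %a, sext <8 x i16> %b)) -> VMLALV.s16 %a, %b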
17148
17149 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17150 // extend it and use v4i32 instead.
17151 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17152 EVT AVT = A.getValueType();
17153 return any_of(ExtTypes, [&](MVT Ty) {
17154 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17155 AVT.bitsLE(Ty);
17156 });
17157 };
17158 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17159 EVT AVT = A.getValueType();
17160 if (!AVT.is128BitVector())
17161 A = DAG.getNode(ExtendCode, dl,
17162 AVT.changeVectorElementType(MVT::getIntegerVT(
17163 128 / AVT.getVectorMinNumElements())),
17164 A);
17165 return A;
17166 };
17167 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17168 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17169 return SDValue();
17170 SDValue A = N0->getOperand(0);
17171 if (ExtTypeMatches(A, ExtTypes))
17172 return ExtendIfNeeded(A, ExtendCode);
17173 return SDValue();
17174 };
17175 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17176 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17177 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17178 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17179 return SDValue();
17180 Mask = N0->getOperand(0);
17181 SDValue Ext = N0->getOperand(1);
17182 if (Ext->getOpcode() != ExtendCode)
17183 return SDValue();
17184 SDValue A = Ext->getOperand(0);
17185 if (ExtTypeMatches(A, ExtTypes))
17186 return ExtendIfNeeded(A, ExtendCode);
17187 return SDValue();
17188 };
17189 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17190 SDValue &A, SDValue &B) {
17191 // For a vmla we are trying to match a larger pattern:
17192 // ExtA = sext/zext A
17193 // ExtB = sext/zext B
17194 // Mul = mul ExtA, ExtB
17195 // vecreduce.add Mul
17196 // There might also be an extra extend between the mul and the addreduce,
17197 // so long as the bitwidth is high enough to make them equivalent (for
17198 // example an original v8i16 mul might happen at v8i32 and the reduce at v8i64).
17199 if (ResVT != RetTy)
17200 return false;
17201 SDValue Mul = N0;
17202 if (Mul->getOpcode() == ExtendCode &&
17203 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17204 ResVT.getScalarSizeInBits())
17205 Mul = Mul->getOperand(0);
17206 if (Mul->getOpcode() != ISD::MUL)
17207 return false;
17208 SDValue ExtA = Mul->getOperand(0);
17209 SDValue ExtB = Mul->getOperand(1);
17210 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17211 return false;
17212 A = ExtA->getOperand(0);
17213 B = ExtB->getOperand(0);
17214 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17215 A = ExtendIfNeeded(A, ExtendCode);
17216 B = ExtendIfNeeded(B, ExtendCode);
17217 return true;
17218 }
17219 return false;
17220 };
17221 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17222 SDValue &A, SDValue &B, SDValue &Mask) {
17223 // Same as the pattern above with a select for the zero predicated lanes
17224 // ExtA = sext/zext A
17225 // ExtB = sext/zext B
17226 // Mul = mul ExtA, ExtB
17227 // N0 = select Mask, Mul, 0
17228 // vecreduce.add N0
17229 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17230 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17231 return false;
17232 Mask = N0->getOperand(0);
17233 SDValue Mul = N0->getOperand(1);
17234 if (Mul->getOpcode() == ExtendCode &&
17235 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17236 ResVT.getScalarSizeInBits())
17237 Mul = Mul->getOperand(0);
17238 if (Mul->getOpcode() != ISD::MUL)
17239 return false;
17240 SDValue ExtA = Mul->getOperand(0);
17241 SDValue ExtB = Mul->getOperand(1);
17242 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17243 return false;
17244 A = ExtA->getOperand(0);
17245 B = ExtB->getOperand(0);
17246 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17247 A = ExtendIfNeeded(A, ExtendCode);
17248 B = ExtendIfNeeded(B, ExtendCode);
17249 return true;
17250 }
17251 return false;
17252 };
17253 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17254 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17255 // reductions. The operands are extended with MVEEXT, but as they are
17256 // reductions the lane orders do not matter. MVEEXT may be combined with
17257 // loads to produce two extending loads, or else they will be expanded to
17258 // VREV/VMOVL.
17259 EVT VT = Ops[0].getValueType();
17260 if (VT == MVT::v16i8) {
17261 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17262 "Unexpected illegal long reduction opcode");
17263 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17264
17265 SDValue Ext0 =
17266 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17267 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17268 SDValue Ext1 =
17269 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17270 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17271
17272 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17273 Ext0, Ext1);
17274 SDValue MLA1 =
17275 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17276 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17277 Ext0.getValue(1), Ext1.getValue(1));
17278 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17279 }
17280 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17281 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17282 SDValue(Node.getNode(), 1));
17283 };
17284
17285 SDValue A, B;
17286 SDValue Mask;
17287 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17288 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17289 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17290 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17291 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17292 A, B))
17293 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17294 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17295 A, B))
17296 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17297 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17298 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17299 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17300 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17301 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17302 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17303
17304 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17305 Mask))
17306 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17307 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17308 Mask))
17309 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17310 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17311 Mask))
17312 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17313 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17314 Mask))
17315 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17316 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17317 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17318 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17319 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17320 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17321 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17322
17323 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17324 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17325 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17326 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17327 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17328 return Create64bitNode(ARMISD::VADDLVs, {A});
17329 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17330 return Create64bitNode(ARMISD::VADDLVu, {A});
17331 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17332 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17333 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17334 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17335 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17336 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17337
17338 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17339 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17340 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17341 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17342 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17343 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17344 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17345 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17346 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17347 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17348 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17349 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17350 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17351 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17352
17353 // One complication: when the two inputs of the mul are the same, the outer
17354 // sext will have been helpfully converted to a zext. Turn it back into a
17355 // sext so the patterns above can match.
17356 SDValue Op = N0;
17357 if (Op->getOpcode() == ISD::VSELECT)
17358 Op = Op->getOperand(1);
17359 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17360 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17361 SDValue Mul = Op->getOperand(0);
17362 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17363 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17364 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17365 if (Op != N0)
17366 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17367 N0->getOperand(0), Ext, N0->getOperand(2));
17368 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17369 }
17370 }
17371
17372 return SDValue();
17373}
17374
17375// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle that uses
17376// all of the lanes exactly once. Because the reduction is commutative, the
17377// shuffle can be removed.
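// For example (illustrative): vaddv(shuffle <3, 2, 1, 0> %a) adds up exactly
// the same four lanes as vaddv(%a), so the shuffle is dropped.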
17378static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17379 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17380 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17381 if (!Shuf || !Shuf->getOperand(1).isUndef())
17382 return SDValue();
17383
17384 // Check all elements are used once in the mask.
17385 ArrayRef<int> Mask = Shuf->getMask();
17386 APInt SetElts(Mask.size(), 0);
17387 for (int E : Mask) {
17388 if (E < 0 || E >= (int)Mask.size())
17389 return SDValue();
17390 SetElts.setBit(E);
17391 }
17392 if (!SetElts.isAllOnes())
17393 return SDValue();
17394
17395 if (N->getNumOperands() != VecOp + 1) {
17396 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17397 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17398 return SDValue();
17399 }
17400
17401 SmallVector<SDValue> Ops;
17402 for (SDValue Op : N->ops()) {
17403 if (Op.getValueType().isVector())
17404 Ops.push_back(Op.getOperand(0));
17405 else
17406 Ops.push_back(Op);
17407 }
17408 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17409}
17410
17411static SDValue PerformVMOVNCombine(SDNode *N,
17412 TargetLowering::DAGCombinerInfo &DCI) {
17413 SDValue Op0 = N->getOperand(0);
17414 SDValue Op1 = N->getOperand(1);
17415 unsigned IsTop = N->getConstantOperandVal(2);
17416
17417 // VMOVNT a undef -> a
17418 // VMOVNB a undef -> a
17419 // VMOVNB undef a -> a
17420 if (Op1->isUndef())
17421 return Op0;
17422 if (Op0->isUndef() && !IsTop)
17423 return Op1;
17424
17425 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17426 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17427 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17428 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17429 Op1->getConstantOperandVal(2) == 0)
17430 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17431 Op0, Op1->getOperand(1), N->getOperand(2));
17432
17433 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17434 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17435 // into the top or bottom lanes.
17436 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17437 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17438 APInt Op0DemandedElts =
17439 IsTop ? Op1DemandedElts
17440 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17441
17442 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17443 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17444 return SDValue(N, 0);
17445 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17446 return SDValue(N, 0);
17447
17448 return SDValue();
17449}
17450
17451static SDValue PerformVQMOVNCombine(SDNode *N,
17452 TargetLowering::DAGCombinerInfo &DCI) {
17453 SDValue Op0 = N->getOperand(0);
17454 unsigned IsTop = N->getConstantOperandVal(2);
17455
17456 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17457 APInt Op0DemandedElts =
17458 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17459 : APInt::getHighBitsSet(2, 1));
17460
17461 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17462 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17463 return SDValue(N, 0);
17464 return SDValue();
17465}
17466
17467static SDValue PerformVQDMULHCombine(SDNode *N,
17468 TargetLowering::DAGCombinerInfo &DCI) {
17469 EVT VT = N->getValueType(0);
17470 SDValue LHS = N->getOperand(0);
17471 SDValue RHS = N->getOperand(1);
17472
17473 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17474 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17475 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17476 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17477 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17478 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17479 SDLoc DL(N);
17480 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17481 LHS.getOperand(0), RHS.getOperand(0));
17482 SDValue UndefV = LHS.getOperand(1);
17483 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17484 }
17485 return SDValue();
17486}
17487
17488static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17489 SDLoc DL(N);
17490 SDValue Op0 = N->getOperand(0);
17491 SDValue Op1 = N->getOperand(1);
17492
17493 // Turn X << -C into X >> C and vice versa. The negative shifts can come up
17494 // from uses of the intrinsics.
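 // For example (illustrative): LSLL(lo, hi, -4) is rewritten as LSRL(lo, hi, 4),
 // and a shift amount of 0 simply forwards the two inputs unchanged.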
17495 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17496 int ShiftAmt = C->getSExtValue();
17497 if (ShiftAmt == 0) {
17498 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17499 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17500 return SDValue();
17501 }
17502
17503 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17504 unsigned NewOpcode =
17505 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17506 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17507 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17508 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17509 return NewShift;
17510 }
17511 }
17512
17513 return SDValue();
17514}
17515
17516/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17517SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17518 DAGCombinerInfo &DCI) const {
17519 SelectionDAG &DAG = DCI.DAG;
17520 unsigned IntNo = N->getConstantOperandVal(0);
17521 switch (IntNo) {
17522 default:
17523 // Don't do anything for most intrinsics.
17524 break;
17525
17526 // Vector shifts: check for immediate versions and lower them.
17527 // Note: This is done during DAG combining instead of DAG legalizing because
17528 // the build_vectors for 64-bit vector element shift counts are generally
17529 // not legal, and it is hard to see their values after they get legalized to
17530 // loads from a constant pool.
17531 case Intrinsic::arm_neon_vshifts:
17532 case Intrinsic::arm_neon_vshiftu:
17533 case Intrinsic::arm_neon_vrshifts:
17534 case Intrinsic::arm_neon_vrshiftu:
17535 case Intrinsic::arm_neon_vrshiftn:
17536 case Intrinsic::arm_neon_vqshifts:
17537 case Intrinsic::arm_neon_vqshiftu:
17538 case Intrinsic::arm_neon_vqshiftsu:
17539 case Intrinsic::arm_neon_vqshiftns:
17540 case Intrinsic::arm_neon_vqshiftnu:
17541 case Intrinsic::arm_neon_vqshiftnsu:
17542 case Intrinsic::arm_neon_vqrshiftns:
17543 case Intrinsic::arm_neon_vqrshiftnu:
17544 case Intrinsic::arm_neon_vqrshiftnsu: {
17545 EVT VT = N->getOperand(1).getValueType();
17546 int64_t Cnt;
17547 unsigned VShiftOpc = 0;
17548
17549 switch (IntNo) {
17550 case Intrinsic::arm_neon_vshifts:
17551 case Intrinsic::arm_neon_vshiftu:
17552 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17553 VShiftOpc = ARMISD::VSHLIMM;
17554 break;
17555 }
17556 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17557 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17558 : ARMISD::VSHRuIMM);
17559 break;
17560 }
17561 return SDValue();
17562
17563 case Intrinsic::arm_neon_vrshifts:
17564 case Intrinsic::arm_neon_vrshiftu:
17565 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17566 break;
17567 return SDValue();
17568
17569 case Intrinsic::arm_neon_vqshifts:
17570 case Intrinsic::arm_neon_vqshiftu:
17571 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17572 break;
17573 return SDValue();
17574
17575 case Intrinsic::arm_neon_vqshiftsu:
17576 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17577 break;
17578 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17579
17580 case Intrinsic::arm_neon_vrshiftn:
17581 case Intrinsic::arm_neon_vqshiftns:
17582 case Intrinsic::arm_neon_vqshiftnu:
17583 case Intrinsic::arm_neon_vqshiftnsu:
17584 case Intrinsic::arm_neon_vqrshiftns:
17585 case Intrinsic::arm_neon_vqrshiftnu:
17586 case Intrinsic::arm_neon_vqrshiftnsu:
17587 // Narrowing shifts require an immediate right shift.
17588 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17589 break;
17590 llvm_unreachable("invalid shift count for narrowing vector shift "
17591 "intrinsic");
17592
17593 default:
17594 llvm_unreachable("unhandled vector shift");
17595 }
17596
17597 switch (IntNo) {
17598 case Intrinsic::arm_neon_vshifts:
17599 case Intrinsic::arm_neon_vshiftu:
17600 // Opcode already set above.
17601 break;
17602 case Intrinsic::arm_neon_vrshifts:
17603 VShiftOpc = ARMISD::VRSHRsIMM;
17604 break;
17605 case Intrinsic::arm_neon_vrshiftu:
17606 VShiftOpc = ARMISD::VRSHRuIMM;
17607 break;
17608 case Intrinsic::arm_neon_vrshiftn:
17609 VShiftOpc = ARMISD::VRSHRNIMM;
17610 break;
17611 case Intrinsic::arm_neon_vqshifts:
17612 VShiftOpc = ARMISD::VQSHLsIMM;
17613 break;
17614 case Intrinsic::arm_neon_vqshiftu:
17615 VShiftOpc = ARMISD::VQSHLuIMM;
17616 break;
17617 case Intrinsic::arm_neon_vqshiftsu:
17618 VShiftOpc = ARMISD::VQSHLsuIMM;
17619 break;
17620 case Intrinsic::arm_neon_vqshiftns:
17621 VShiftOpc = ARMISD::VQSHRNsIMM;
17622 break;
17623 case Intrinsic::arm_neon_vqshiftnu:
17624 VShiftOpc = ARMISD::VQSHRNuIMM;
17625 break;
17626 case Intrinsic::arm_neon_vqshiftnsu:
17627 VShiftOpc = ARMISD::VQSHRNsuIMM;
17628 break;
17629 case Intrinsic::arm_neon_vqrshiftns:
17630 VShiftOpc = ARMISD::VQRSHRNsIMM;
17631 break;
17632 case Intrinsic::arm_neon_vqrshiftnu:
17633 VShiftOpc = ARMISD::VQRSHRNuIMM;
17634 break;
17635 case Intrinsic::arm_neon_vqrshiftnsu:
17636 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17637 break;
17638 }
17639
17640 SDLoc dl(N);
17641 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17642 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17643 }
17644
17645 case Intrinsic::arm_neon_vshiftins: {
17646 EVT VT = N->getOperand(1).getValueType();
17647 int64_t Cnt;
17648 unsigned VShiftOpc = 0;
17649
17650 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17651 VShiftOpc = ARMISD::VSLIIMM;
17652 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17653 VShiftOpc = ARMISD::VSRIIMM;
17654 else {
17655 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17656 }
17657
17658 SDLoc dl(N);
17659 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17660 N->getOperand(1), N->getOperand(2),
17661 DAG.getConstant(Cnt, dl, MVT::i32));
17662 }
17663
17664 case Intrinsic::arm_neon_vqrshifts:
17665 case Intrinsic::arm_neon_vqrshiftu:
17666 // No immediate versions of these to check for.
17667 break;
17668
17669 case Intrinsic::arm_neon_vbsl: {
17670 SDLoc dl(N);
17671 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17672 N->getOperand(2), N->getOperand(3));
17673 }
17674 case Intrinsic::arm_mve_vqdmlah:
17675 case Intrinsic::arm_mve_vqdmlash:
17676 case Intrinsic::arm_mve_vqrdmlah:
17677 case Intrinsic::arm_mve_vqrdmlash:
17678 case Intrinsic::arm_mve_vmla_n_predicated:
17679 case Intrinsic::arm_mve_vmlas_n_predicated:
17680 case Intrinsic::arm_mve_vqdmlah_predicated:
17681 case Intrinsic::arm_mve_vqdmlash_predicated:
17682 case Intrinsic::arm_mve_vqrdmlah_predicated:
17683 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17684 // These intrinsics all take an i32 scalar operand which is narrowed to the
17685 // size of a single lane of the vector type they return. So we don't need
17686 // any bits of that operand above that point, which allows us to eliminate
17687 // uxth/sxth.
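 // For example (illustrative): for a v8i16 vqdmlah only the low 16 bits of
 // the i32 scalar operand matter, so a sext_inreg/sxth feeding it can be
 // removed.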
17688 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17689 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17690 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17691 return SDValue();
17692 break;
17693 }
17694
17695 case Intrinsic::arm_mve_minv:
17696 case Intrinsic::arm_mve_maxv:
17697 case Intrinsic::arm_mve_minav:
17698 case Intrinsic::arm_mve_maxav:
17699 case Intrinsic::arm_mve_minv_predicated:
17700 case Intrinsic::arm_mve_maxv_predicated:
17701 case Intrinsic::arm_mve_minav_predicated:
17702 case Intrinsic::arm_mve_maxav_predicated: {
17703 // These intrinsics all take an i32 scalar operand which is narrowed to the
17704 // size of a single lane of the vector type they take as the other input.
17705 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17706 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17707 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17708 return SDValue();
17709 break;
17710 }
17711
17712 case Intrinsic::arm_mve_addv: {
17713 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17714 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17715 bool Unsigned = N->getConstantOperandVal(2);
17716 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17717 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17718 }
17719
17720 case Intrinsic::arm_mve_addlv:
17721 case Intrinsic::arm_mve_addlv_predicated: {
17722 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17723 // which recombines the two outputs into an i64
17724 bool Unsigned = N->getConstantOperandVal(2);
17725 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17726 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17727 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17728
17729 SmallVector<SDValue, 4> Ops;
17730 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17731 if (i != 2) // skip the unsigned flag
17732 Ops.push_back(N->getOperand(i));
17733
17734 SDLoc dl(N);
17735 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17736 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17737 val.getValue(1));
17738 }
17739 }
17740
17741 return SDValue();
17742}
17743
17744/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17745/// lowers them. As with the vector shift intrinsics, this is done during DAG
17746/// combining instead of DAG legalizing because the build_vectors for 64-bit
17747/// vector element shift counts are generally not legal, and it is hard to see
17748/// their values after they get legalized to loads from a constant pool.
17749static SDValue PerformShiftCombine(SDNode *N,
17750 TargetLowering::DAGCombinerInfo &DCI,
17751 const ARMSubtarget *ST) {
17752 SelectionDAG &DAG = DCI.DAG;
17753 EVT VT = N->getValueType(0);
17754
17755 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17756 N->getOperand(0)->getOpcode() == ISD::AND &&
17757 N->getOperand(0)->hasOneUse()) {
17758 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17759 return SDValue();
17760 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17761 // usually show up because instcombine prefers to canonicalize it to
17762 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17763 // out of GEP lowering in some cases.
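 // For example (illustrative): with AndMask = 0x3ffff and ShiftAmt = 2,
 // (shl (and x, 0x3ffff), 2) becomes (srl (shl x, 14), 12), avoiding
 // materializing the mask constant in a register.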
17764 SDValue N0 = N->getOperand(0);
17765 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17766 if (!ShiftAmtNode)
17767 return SDValue();
17768 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17769 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17770 if (!AndMaskNode)
17771 return SDValue();
17772 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17773 // Don't transform uxtb/uxth.
17774 if (AndMask == 255 || AndMask == 65535)
17775 return SDValue();
17776 if (isMask_32(AndMask)) {
17777 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17778 if (MaskedBits > ShiftAmt) {
17779 SDLoc DL(N);
17780 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17781 DAG.getConstant(MaskedBits, DL, MVT::i32));
17782 return DAG.getNode(
17783 ISD::SRL, DL, MVT::i32, SHL,
17784 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17785 }
17786 }
17787 }
17788
17789 // Nothing to be done for scalar shifts.
17790 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17791 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17792 return SDValue();
17793 if (ST->hasMVEIntegerOps())
17794 return SDValue();
17795
17796 int64_t Cnt;
17797
17798 switch (N->getOpcode()) {
17799 default: llvm_unreachable("unexpected shift opcode");
17800
17801 case ISD::SHL:
17802 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17803 SDLoc dl(N);
17804 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17805 DAG.getConstant(Cnt, dl, MVT::i32));
17806 }
17807 break;
17808
17809 case ISD::SRA:
17810 case ISD::SRL:
17811 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17812 unsigned VShiftOpc =
17813 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17814 SDLoc dl(N);
17815 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17816 DAG.getConstant(Cnt, dl, MVT::i32));
17817 }
17818 }
17819 return SDValue();
17820}
17821
17822// Look for a sign, zero or fp extend of a larger than legal load. This can be
17823// split into multiple extending loads, which are simpler to deal with than an
17824// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17825// to convert the type to an f32.
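// For example (illustrative): (zext (load <8 x i8>) to <8 x i32>) is split into
// two v4i8 -> v4i32 zero-extending loads at offsets 0 and 4, and
// (fpext (load <8 x half>) to <8 x float>) loads each half as v4i16 and widens
// it with a VCVTL.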
17826static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17827 SDValue N0 = N->getOperand(0);
17828 if (N0.getOpcode() != ISD::LOAD)
17829 return SDValue();
17830 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17831 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17832 LD->getExtensionType() != ISD::NON_EXTLOAD)
17833 return SDValue();
17834 EVT FromVT = LD->getValueType(0);
17835 EVT ToVT = N->getValueType(0);
17836 if (!ToVT.isVector())
17837 return SDValue();
17839 EVT ToEltVT = ToVT.getVectorElementType();
17840 EVT FromEltVT = FromVT.getVectorElementType();
17841
17842 unsigned NumElements = 0;
17843 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17844 NumElements = 4;
17845 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17846 NumElements = 4;
17847 if (NumElements == 0 ||
17848 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17849 FromVT.getVectorNumElements() % NumElements != 0 ||
17850 !isPowerOf2_32(NumElements))
17851 return SDValue();
17852
17853 LLVMContext &C = *DAG.getContext();
17854 SDLoc DL(LD);
17855 // Details about the old load
17856 SDValue Ch = LD->getChain();
17857 SDValue BasePtr = LD->getBasePtr();
17858 Align Alignment = LD->getBaseAlign();
17859 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17860 AAMDNodes AAInfo = LD->getAAInfo();
17861
17862 ISD::LoadExtType NewExtType =
17863 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17864 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17865 EVT NewFromVT = EVT::getVectorVT(
17866 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17867 EVT NewToVT = EVT::getVectorVT(
17868 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17869
17870 SmallVector<SDValue, 4> Loads;
17871 SmallVector<SDValue, 4> Chains;
17872 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17873 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17874 SDValue NewPtr =
17875 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17876
17877 SDValue NewLoad =
17878 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17879 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17880 Alignment, MMOFlags, AAInfo);
17881 Loads.push_back(NewLoad);
17882 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17883 }
17884
17885 // Float truncs need to be extended with VCVTB's into their floating point types.
17886 if (FromEltVT == MVT::f16) {
17887 SmallVector<SDValue, 4> Extends;
17888
17889 for (unsigned i = 0; i < Loads.size(); i++) {
17890 SDValue LoadBC =
17891 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17892 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17893 DAG.getConstant(0, DL, MVT::i32));
17894 Extends.push_back(FPExt);
17895 }
17896
17897 Loads = Extends;
17898 }
17899
17900 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17901 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17902 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17903}
17904
17905/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17906/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
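/// (Illustrative note, not from the original source: a pattern such as
/// (sext (extract_vector_elt (v8i16 q0), n)) can be selected as a single
/// lane-extracting move instead of an extract followed by a separate extend.)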
17907static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17908 const ARMSubtarget *ST) {
17909 SDValue N0 = N->getOperand(0);
17910
17911 // Check for sign- and zero-extensions of vector extract operations of 8- and
17912 // 16-bit vector elements. NEON and MVE support these directly. They are
17913 // handled during DAG combining because type legalization will promote them
17914 // to 32-bit types and it is messy to recognize the operations after that.
17915 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17916 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17917 SDValue Vec = N0.getOperand(0);
17918 SDValue Lane = N0.getOperand(1);
17919 EVT VT = N->getValueType(0);
17920 EVT EltVT = N0.getValueType();
17921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17922
17923 if (VT == MVT::i32 &&
17924 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17925 TLI.isTypeLegal(Vec.getValueType()) &&
17926 isa<ConstantSDNode>(Lane)) {
17927
17928 unsigned Opc = 0;
17929 switch (N->getOpcode()) {
17930 default: llvm_unreachable("unexpected opcode");
17931 case ISD::SIGN_EXTEND:
17932 Opc = ARMISD::VGETLANEs;
17933 break;
17934 case ISD::ZERO_EXTEND:
17935 case ISD::ANY_EXTEND:
17936 Opc = ARMISD::VGETLANEu;
17937 break;
17938 }
17939 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17940 }
17941 }
17942
17943 if (ST->hasMVEIntegerOps())
17944 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17945 return NewLoad;
17946
17947 return SDValue();
17948}
17949
17950static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17951 const ARMSubtarget *ST) {
17952 if (ST->hasMVEFloatOps())
17953 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17954 return NewLoad;
17955
17956 return SDValue();
17957}
17958
17959// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17960// constant bounds.
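// Illustrative example (not from the original source): smin(smax(x, -128), 127)
// clamps x to a signed 8-bit range and becomes ARMISD::SSAT, while
// smin(smax(x, 0), 255) clamps to an unsigned 8-bit range and becomes
// ARMISD::USAT.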
17961static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17962 const ARMSubtarget *Subtarget) {
17963 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17964 !Subtarget->isThumb2())
17965 return SDValue();
17966
17967 EVT VT = Op.getValueType();
17968 SDValue Op0 = Op.getOperand(0);
17969
17970 if (VT != MVT::i32 ||
17971 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17972 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17973 !isa<ConstantSDNode>(Op0.getOperand(1)))
17974 return SDValue();
17975
17976 SDValue Min = Op;
17977 SDValue Max = Op0;
17978 SDValue Input = Op0.getOperand(0);
17979 if (Min.getOpcode() == ISD::SMAX)
17980 std::swap(Min, Max);
17981
17982 APInt MinC = Min.getConstantOperandAPInt(1);
17983 APInt MaxC = Max.getConstantOperandAPInt(1);
17984
17985 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17986 !(MinC + 1).isPowerOf2())
17987 return SDValue();
17988
17989 SDLoc DL(Op);
17990 if (MinC == ~MaxC)
17991 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17992 DAG.getConstant(MinC.countr_one(), DL, VT));
17993 if (MaxC == 0)
17994 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17995 DAG.getConstant(MinC.countr_one(), DL, VT));
17996
17997 return SDValue();
17998}
17999
18000/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
18001/// saturates.
18002static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
18003 const ARMSubtarget *ST) {
18004 EVT VT = N->getValueType(0);
18005 SDValue N0 = N->getOperand(0);
18006
18007 if (VT == MVT::i32)
18008 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
18009
18010 if (!ST->hasMVEIntegerOps())
18011 return SDValue();
18012
18013 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18014 return V;
18015
18016 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18017 return SDValue();
18018
18019 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18020 // Check one is a smin and the other is a smax
18021 if (Min->getOpcode() != ISD::SMIN)
18022 std::swap(Min, Max);
18023 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18024 return false;
18025
18026 APInt SaturateC;
18027 if (VT == MVT::v4i32)
18028 SaturateC = APInt(32, (1 << 15) - 1, true);
18029 else //if (VT == MVT::v8i16)
18030 SaturateC = APInt(16, (1 << 7) - 1, true);
18031
18032 APInt MinC, MaxC;
18033 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18034 MinC != SaturateC)
18035 return false;
18036 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18037 MaxC != ~SaturateC)
18038 return false;
18039 return true;
18040 };
18041
18042 if (IsSignedSaturate(N, N0.getNode())) {
18043 SDLoc DL(N);
18044 MVT ExtVT, HalfVT;
18045 if (VT == MVT::v4i32) {
18046 HalfVT = MVT::v8i16;
18047 ExtVT = MVT::v4i16;
18048 } else { // if (VT == MVT::v8i16)
18049 HalfVT = MVT::v16i8;
18050 ExtVT = MVT::v8i8;
18051 }
18052
18053 // Create a VQMOVNB with undef top lanes, then sign extended into the top
18054 // half. That extend will hopefully be removed if only the bottom bits are
18055 // demanded (through a truncating store, for example).
18056 SDValue VQMOVN =
18057 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18058 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18059 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18060 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18061 DAG.getValueType(ExtVT));
18062 }
18063
18064 auto IsUnsignedSaturate = [&](SDNode *Min) {
18065 // For unsigned, we just need to check for <= 0xffff
18066 if (Min->getOpcode() != ISD::UMIN)
18067 return false;
18068
18069 APInt SaturateC;
18070 if (VT == MVT::v4i32)
18071 SaturateC = APInt(32, (1 << 16) - 1, true);
18072 else //if (VT == MVT::v8i16)
18073 SaturateC = APInt(16, (1 << 8) - 1, true);
18074
18075 APInt MinC;
18076 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18077 MinC != SaturateC)
18078 return false;
18079 return true;
18080 };
18081
18082 if (IsUnsignedSaturate(N)) {
18083 SDLoc DL(N);
18084 MVT HalfVT;
18085 unsigned ExtConst;
18086 if (VT == MVT::v4i32) {
18087 HalfVT = MVT::v8i16;
18088 ExtConst = 0x0000FFFF;
18089 } else { //if (VT == MVT::v8i16)
18090 HalfVT = MVT::v16i8;
18091 ExtConst = 0x00FF;
18092 }
18093
18094 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18095 // an AND. That extend will hopefully be removed if only the bottom bits are
18096 // demanded (through a truncating store, for example).
18097 SDValue VQMOVN =
18098 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18099 DAG.getConstant(0, DL, MVT::i32));
18100 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18101 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18102 DAG.getConstant(ExtConst, DL, VT));
18103 }
18104
18105 return SDValue();
18106}
18107
18108static const APInt *isPowerOf2Constant(SDValue V) {
18109 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
18110 if (!C)
18111 return nullptr;
18112 const APInt *CV = &C->getAPIntValue();
18113 return CV->isPowerOf2() ? CV : nullptr;
18114}
18115
18116SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
18117 // If we have a CMOV, OR and AND combination such as:
18118 // if (x & CN)
18119 // y |= CM;
18120 //
18121 // And:
18122 // * CN is a single bit;
18123 // * All bits covered by CM are known zero in y
18124 //
18125 // Then we can convert this into a sequence of BFI instructions. This will
18126 // always be a win if CM is a single bit, will always be no worse than the
18127 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18128 // three bits (due to the extra IT instruction).
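 // Illustrative example (not from the original source): for
 //   if (x & 0x4) y |= 0x3;
 // with bits 0-1 of y known zero, bit 2 of x is shifted down to bit 0 and one
 // BFI is emitted per set bit of CM to copy it into bits 0 and 1 of y.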
18129
18130 SDValue Op0 = CMOV->getOperand(0);
18131 SDValue Op1 = CMOV->getOperand(1);
18132 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18133 SDValue CmpZ = CMOV->getOperand(3);
18134
18135 // The compare must be against zero.
18136 if (!isNullConstant(CmpZ->getOperand(1)))
18137 return SDValue();
18138
18139 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18140 SDValue And = CmpZ->getOperand(0);
18141 if (And->getOpcode() != ISD::AND)
18142 return SDValue();
18143 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18144 if (!AndC)
18145 return SDValue();
18146 SDValue X = And->getOperand(0);
18147
18148 if (CC == ARMCC::EQ) {
18149 // We're performing an "equal to zero" compare. Swap the operands so we
18150 // canonicalize on a "not equal to zero" compare.
18151 std::swap(Op0, Op1);
18152 } else {
18153 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18154 }
18155
18156 if (Op1->getOpcode() != ISD::OR)
18157 return SDValue();
18158
18159 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
18160 if (!OrC)
18161 return SDValue();
18162 SDValue Y = Op1->getOperand(0);
18163
18164 if (Op0 != Y)
18165 return SDValue();
18166
18167 // Now, is it profitable to continue?
18168 APInt OrCI = OrC->getAPIntValue();
18169 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18170 if (OrCI.popcount() > Heuristic)
18171 return SDValue();
18172
18173 // Lastly, can we determine that the bits defined by OrCI
18174 // are zero in Y?
18175 KnownBits Known = DAG.computeKnownBits(Y);
18176 if ((OrCI & Known.Zero) != OrCI)
18177 return SDValue();
18178
18179 // OK, we can do the combine.
18180 SDValue V = Y;
18181 SDLoc dl(X);
18182 EVT VT = X.getValueType();
18183 unsigned BitInX = AndC->logBase2();
18184
18185 if (BitInX != 0) {
18186 // We must shift X first.
18187 X = DAG.getNode(ISD::SRL, dl, VT, X,
18188 DAG.getConstant(BitInX, dl, VT));
18189 }
18190
18191 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18192 BitInY < NumActiveBits; ++BitInY) {
18193 if (OrCI[BitInY] == 0)
18194 continue;
18195 APInt Mask(VT.getSizeInBits(), 0);
18196 Mask.setBit(BitInY);
18197 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18198 // Confusingly, the operand is an *inverted* mask.
18199 DAG.getConstant(~Mask, dl, VT));
18200 }
18201
18202 return V;
18203}
18204
18205// Given N, the value controlling the conditional branch, search for the loop
18206// intrinsic, returning it, along with how the value is used. We need to handle
18207// patterns such as the following:
18208// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18209// (brcond (setcc (loop.decrement), 0, eq), exit)
18210// (brcond (setcc (loop.decrement), 0, ne), header)
18211static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
18212 bool &Negate) {
18213 switch (N->getOpcode()) {
18214 default:
18215 break;
18216 case ISD::XOR: {
18217 if (!isa<ConstantSDNode>(N.getOperand(1)))
18218 return SDValue();
18219 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18220 return SDValue();
18221 Negate = !Negate;
18222 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18223 }
18224 case ISD::SETCC: {
18225 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18226 if (!Const)
18227 return SDValue();
18228 if (Const->isZero())
18229 Imm = 0;
18230 else if (Const->isOne())
18231 Imm = 1;
18232 else
18233 return SDValue();
18234 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18235 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18236 }
18237 case ISD::INTRINSIC_W_CHAIN: {
18238 unsigned IntOp = N.getConstantOperandVal(1);
18239 if (IntOp != Intrinsic::test_start_loop_iterations &&
18240 IntOp != Intrinsic::loop_decrement_reg)
18241 return SDValue();
18242 return N;
18243 }
18244 }
18245 return SDValue();
18246}
18247
18248static SDValue PerformHWLoopCombine(SDNode *N,
18249 TargetLowering::DAGCombinerInfo &DCI,
18250 const ARMSubtarget *ST) {
18251
18252 // The hwloop intrinsics that we're interested in are used for control-flow,
18253 // either for entering or exiting the loop:
18254 // - test.start.loop.iterations will test whether its operand is zero. If it
18255 // is zero, the proceeding branch should not enter the loop.
18256 // - loop.decrement.reg also tests whether its operand is zero. If it is
18257 // zero, the proceeding branch should not branch back to the beginning of
18258 // the loop.
18259 // So here, we need to check how the brcond is using the result of each
18260 // of the intrinsics to ensure that we're branching to the right place at the
18261 // right time.
18262
18263 ISD::CondCode CC;
18264 SDValue Cond;
18265 int Imm = 1;
18266 bool Negate = false;
18267 SDValue Chain = N->getOperand(0);
18268 SDValue Dest;
18269
18270 if (N->getOpcode() == ISD::BRCOND) {
18271 CC = ISD::SETEQ;
18272 Cond = N->getOperand(1);
18273 Dest = N->getOperand(2);
18274 } else {
18275 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18276 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18277 Cond = N->getOperand(2);
18278 Dest = N->getOperand(4);
18279 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18280 if (!Const->isOne() && !Const->isZero())
18281 return SDValue();
18282 Imm = Const->getZExtValue();
18283 } else
18284 return SDValue();
18285 }
18286
18287 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18288 if (!Int)
18289 return SDValue();
18290
18291 if (Negate)
18292 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18293
18294 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18295 return (CC == ISD::SETEQ && Imm == 0) ||
18296 (CC == ISD::SETNE && Imm == 1) ||
18297 (CC == ISD::SETLT && Imm == 1) ||
18298 (CC == ISD::SETULT && Imm == 1);
18299 };
18300
18301 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18302 return (CC == ISD::SETEQ && Imm == 1) ||
18303 (CC == ISD::SETNE && Imm == 0) ||
18304 (CC == ISD::SETGT && Imm == 0) ||
18305 (CC == ISD::SETUGT && Imm == 0) ||
18306 (CC == ISD::SETGE && Imm == 1) ||
18307 (CC == ISD::SETUGE && Imm == 1);
18308 };
18309
18310 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18311 "unsupported condition");
18312
18313 SDLoc dl(Int);
18314 SelectionDAG &DAG = DCI.DAG;
18315 SDValue Elements = Int.getOperand(2);
18316 unsigned IntOp = Int->getConstantOperandVal(1);
18317 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18318 "expected single br user");
18319 SDNode *Br = *N->user_begin();
18320 SDValue OtherTarget = Br->getOperand(1);
18321
18322 // Update the unconditional branch to branch to the given Dest.
18323 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18324 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18325 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18326 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18327 };
18328
18329 if (IntOp == Intrinsic::test_start_loop_iterations) {
18330 SDValue Res;
18331 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18332 // We expect this 'instruction' to branch when the counter is zero.
18333 if (IsTrueIfZero(CC, Imm)) {
18334 SDValue Ops[] = {Chain, Setup, Dest};
18335 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18336 } else {
18337 // The logic is the reverse of what we need for WLS, so find the other
18338 // basic block target: the target of the proceeding br.
18339 UpdateUncondBr(Br, Dest, DAG);
18340
18341 SDValue Ops[] = {Chain, Setup, OtherTarget};
18342 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18343 }
18344 // Update LR count to the new value
18345 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18346 // Update chain
18347 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18348 return Res;
18349 } else {
18350 SDValue Size =
18351 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18352 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18353 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18354 DAG.getVTList(MVT::i32, MVT::Other), Args);
18355 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18356
18357 // We expect this instruction to branch when the count is not zero.
18358 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18359
18360 // Update the unconditional branch to target the loop preheader if we've
18361 // found the condition has been reversed.
18362 if (Target == OtherTarget)
18363 UpdateUncondBr(Br, Dest, DAG);
18364
18365 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18366 SDValue(LoopDec.getNode(), 1), Chain);
18367
18368 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18369 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18370 }
18371 return SDValue();
18372}
18373
18374/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18375SDValue
18376ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18377 SDValue Cmp = N->getOperand(3);
18378 if (Cmp.getOpcode() != ARMISD::CMPZ)
18379 // Only looking at NE cases.
18380 return SDValue();
18381
18382 SDLoc dl(N);
18383 SDValue LHS = Cmp.getOperand(0);
18384 SDValue RHS = Cmp.getOperand(1);
18385 SDValue Chain = N->getOperand(0);
18386 SDValue BB = N->getOperand(1);
18387 SDValue ARMcc = N->getOperand(2);
18388 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18389
18390 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18391 // -> (brcond Chain BB CC Flags)
18392 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18393 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18394 LHS->getOperand(0)->hasOneUse() &&
18395 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18396 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18397 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18398 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18399 LHS->getOperand(0)->getOperand(2),
18400 LHS->getOperand(0)->getOperand(3));
18401 }
18402
18403 return SDValue();
18404}
18405
18406/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18407SDValue
18408ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18409 SDValue Cmp = N->getOperand(3);
18410 if (Cmp.getOpcode() != ARMISD::CMPZ)
18411 // Only looking at EQ and NE cases.
18412 return SDValue();
18413
18414 EVT VT = N->getValueType(0);
18415 SDLoc dl(N);
18416 SDValue LHS = Cmp.getOperand(0);
18417 SDValue RHS = Cmp.getOperand(1);
18418 SDValue FalseVal = N->getOperand(0);
18419 SDValue TrueVal = N->getOperand(1);
18420 SDValue ARMcc = N->getOperand(2);
18421 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18422
18423 // BFI is only available on V6T2+.
18424 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18425 SDValue R = PerformCMOVToBFICombine(N, DAG);
18426 if (R)
18427 return R;
18428 }
18429
18430 // Simplify
18431 // mov r1, r0
18432 // cmp r1, x
18433 // mov r0, y
18434 // moveq r0, x
18435 // to
18436 // cmp r0, x
18437 // movne r0, y
18438 //
18439 // mov r1, r0
18440 // cmp r1, x
18441 // mov r0, x
18442 // movne r0, y
18443 // to
18444 // cmp r0, x
18445 // movne r0, y
18446 /// FIXME: Turn this into a target neutral optimization?
18447 SDValue Res;
18448 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18449 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18450 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18451 SDValue ARMcc;
18452 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18453 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18454 }
18455
18456 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18457 // -> (cmov F T CC Flags)
18458 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18459 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18460 isNullConstant(RHS)) {
18461 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18462 LHS->getOperand(2), LHS->getOperand(3));
18463 }
18464
18465 if (!VT.isInteger())
18466 return SDValue();
18467
18468 // Fold away an unnecessary CMPZ/CMOV
18469 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18470 // if C1==EQ -> CMOV A, B, C2, D
18471 // if C1==NE -> CMOV A, B, NOT(C2), D
18472 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18473 N->getConstantOperandVal(2) == ARMCC::NE) {
18474 ARMCC::CondCodes Cond;
18475 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18476 if (N->getConstantOperandVal(2) == ARMCC::NE)
18477 Cond = ARMCC::getOppositeCondition(Cond);
18478 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18479 N->getOperand(1),
18480 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18481 }
18482 }
18483
18484 // Materialize a boolean comparison for integers so we can avoid branching.
18485 if (isNullConstant(FalseVal)) {
18486 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18487 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18488 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18489 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18490 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18491 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18492 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18493 DAG.getConstant(5, dl, MVT::i32));
18494 } else {
18495 // CMOV 0, 1, ==, (CMPZ x, y) ->
18496 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18497 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18498 //
18499 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18500 // x != y. In other words, a carry C == 1 when x == y, C == 0
18501 // otherwise.
18502 // The final UADDO_CARRY computes
18503 // x - y + (0 - (x - y)) + C == C
18504 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18505 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18506 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18507 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18508 // actually.
18509 SDValue Carry =
18510 DAG.getNode(ISD::SUB, dl, MVT::i32,
18511 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18512 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18513 }
18514 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18515 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18516 // This seems pointless but will allow us to combine it further below.
18517 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18518 SDValue Sub =
18519 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18520 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18521 Sub.getValue(1));
18522 FalseVal = Sub;
18523 }
18524 } else if (isNullConstant(TrueVal)) {
18525 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18526 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18527 // This seems pointless but will allow us to combine it further below
18528 // Note that we change == for != as this is the dual for the case above.
18529 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18530 SDValue Sub =
18531 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18532 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18533 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18534 Sub.getValue(1));
18535 FalseVal = Sub;
18536 }
18537 }
18538
18539 // On Thumb1, the DAG above may be further combined if z is a power of 2
18540 // (z == 2 ^ K).
18541 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18542 // t1 = (USUBO (SUB x, y), 1)
18543 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18544 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18545 //
18546 // This also handles the special case of comparing against zero; it's
18547 // essentially, the same pattern, except there's no SUBC:
18548 // CMOV x, z, !=, (CMPZ x, 0) ->
18549 // t1 = (USUBO x, 1)
18550 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18551 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18552 const APInt *TrueConst;
18553 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18554 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18555 FalseVal.getOperand(1) == RHS) ||
18556 (FalseVal == LHS && isNullConstant(RHS))) &&
18557 (TrueConst = isPowerOf2Constant(TrueVal))) {
18558 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18559 unsigned ShiftAmount = TrueConst->logBase2();
18560 if (ShiftAmount)
18561 TrueVal = DAG.getConstant(1, dl, VT);
18562 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18563 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18564 Subc.getValue(1));
18565
18566 if (ShiftAmount)
18567 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18568 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18569 }
18570
18571 if (Res.getNode()) {
18572 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18573 // Capture demanded bits information that would be otherwise lost.
18574 if (Known.Zero == 0xfffffffe)
18575 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18576 DAG.getValueType(MVT::i1));
18577 else if (Known.Zero == 0xffffff00)
18578 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18579 DAG.getValueType(MVT::i8));
18580 else if (Known.Zero == 0xffff0000)
18581 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18582 DAG.getValueType(MVT::i16));
18583 }
18584
18585 return Res;
18586}
18587
18588static SDValue PerformBITCASTCombine(SDNode *N,
18589 TargetLowering::DAGCombinerInfo &DCI,
18590 const ARMSubtarget *ST) {
18591 SelectionDAG &DAG = DCI.DAG;
18592 SDValue Src = N->getOperand(0);
18593 EVT DstVT = N->getValueType(0);
18594
18595 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18596 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18597 EVT SrcVT = Src.getValueType();
18598 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18599 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18600 }
18601
18602 // We may have a bitcast of something that has already had this bitcast
18603 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18604 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18605 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18606 Src.getValueType().getScalarSizeInBits())
18607 Src = Src.getOperand(0);
18608
18609 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18610 // would be generated is at least the width of the element type.
18611 EVT SrcVT = Src.getValueType();
18612 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18613 Src.getOpcode() == ARMISD::VMVNIMM ||
18614 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18615 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18616 DAG.getDataLayout().isBigEndian())
18617 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18618
18619 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18620 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18621 return R;
18622
18623 return SDValue();
18624}
18625
18626// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18627// node into stack operations after legalizeOps.
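// (Illustrative note, not from the original source: an MVETRUNC of two v4i32
// operands yields a single v8i16 whose lanes are the truncated lanes of the
// first operand followed by those of the second.)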
18628SDValue ARMTargetLowering::PerformMVETruncCombine(
18629 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
18630 SelectionDAG &DAG = DCI.DAG;
18631 EVT VT = N->getValueType(0);
18632 SDLoc DL(N);
18633
18634 // MVETrunc(Undef, Undef) -> Undef
18635 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18636 return DAG.getUNDEF(VT);
18637
18638 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18639 if (N->getNumOperands() == 2 &&
18640 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18641 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18642 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18643 N->getOperand(0).getOperand(1),
18644 N->getOperand(1).getOperand(0),
18645 N->getOperand(1).getOperand(1));
18646
18647 // MVETrunc(shuffle, shuffle) -> VMOVN
18648 if (N->getNumOperands() == 2 &&
18649 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18650 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18651 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18652 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18653
18654 if (S0->getOperand(0) == S1->getOperand(0) &&
18655 S0->getOperand(1) == S1->getOperand(1)) {
18656 // Construct complete shuffle mask
18657 SmallVector<int, 8> Mask(S0->getMask());
18658 Mask.append(S1->getMask().begin(), S1->getMask().end());
18659
18660 if (isVMOVNTruncMask(Mask, VT, false))
18661 return DAG.getNode(
18662 ARMISD::VMOVN, DL, VT,
18663 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18664 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18665 DAG.getConstant(1, DL, MVT::i32));
18666 if (isVMOVNTruncMask(Mask, VT, true))
18667 return DAG.getNode(
18668 ARMISD::VMOVN, DL, VT,
18669 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18670 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18671 DAG.getConstant(1, DL, MVT::i32));
18672 }
18673 }
18674
18675 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18676 // truncate to a buildvector to allow the generic optimisations to kick in.
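 // (Illustrative note: each source lane is extracted to an i32 below and a
 // BUILD_VECTOR of the narrower result type is formed from those extracts.)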
18677 if (all_of(N->ops(), [](SDValue Op) {
18678 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18679 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18680 (Op.getOpcode() == ISD::BITCAST &&
18681 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18682 })) {
18683 SmallVector<SDValue, 8> Extracts;
18684 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18685 SDValue O = N->getOperand(Op);
18686 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18687 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18688 DAG.getConstant(i, DL, MVT::i32));
18689 Extracts.push_back(Ext);
18690 }
18691 }
18692 return DAG.getBuildVector(VT, DL, Extracts);
18693 }
18694
18695 // If we are late in the legalization process and nothing has optimised
18696 // the trunc to anything better, lower it to a stack store and reload,
18697 // performing the truncation whilst keeping the lanes in the correct order:
18698 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18699 if (!DCI.isAfterLegalizeDAG())
18700 return SDValue();
18701
18702 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18703 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18704 int NumIns = N->getNumOperands();
18705 assert((NumIns == 2 || NumIns == 4) &&
18706 "Expected 2 or 4 inputs to an MVETrunc");
18707 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18708 if (N->getNumOperands() == 4)
18709 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18710
18711 SmallVector<SDValue> Chains;
18712 for (int I = 0; I < NumIns; I++) {
18713 SDValue Ptr = DAG.getNode(
18714 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18715 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18716 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18717 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18718 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18719 Ptr, MPI, StoreVT, Align(4));
18720 Chains.push_back(Ch);
18721 }
18722
18723 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18724 MachinePointerInfo MPI =
18725 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18726 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18727}
18728
18729// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
18730static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18731 SelectionDAG &DAG) {
18732 SDValue N0 = N->getOperand(0);
18733 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18734 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18735 return SDValue();
18736
18737 EVT FromVT = LD->getMemoryVT();
18738 EVT ToVT = N->getValueType(0);
18739 if (!ToVT.isVector())
18740 return SDValue();
18741 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18742 EVT ToEltVT = ToVT.getVectorElementType();
18743 EVT FromEltVT = FromVT.getVectorElementType();
18744
18745 unsigned NumElements = 0;
18746 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18747 NumElements = 4;
18748 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18749 NumElements = 8;
18750 assert(NumElements != 0);
18751
18752 ISD::LoadExtType NewExtType =
18753 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18754 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18755 LD->getExtensionType() != ISD::EXTLOAD &&
18756 LD->getExtensionType() != NewExtType)
18757 return SDValue();
18758
18759 LLVMContext &C = *DAG.getContext();
18760 SDLoc DL(LD);
18761 // Details about the old load
18762 SDValue Ch = LD->getChain();
18763 SDValue BasePtr = LD->getBasePtr();
18764 Align Alignment = LD->getBaseAlign();
18765 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18766 AAMDNodes AAInfo = LD->getAAInfo();
18767
18768 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18769 EVT NewFromVT = EVT::getVectorVT(
18770 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18771 EVT NewToVT = EVT::getVectorVT(
18772 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18773
18774 SmallVector<SDValue, 4> Loads;
18775 SmallVector<SDValue, 4> Chains;
18776 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18777 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18778 SDValue NewPtr =
18779 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18780
18781 SDValue NewLoad =
18782 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18783 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18784 Alignment, MMOFlags, AAInfo);
18785 Loads.push_back(NewLoad);
18786 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18787 }
18788
18789 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18790 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18791 return DAG.getMergeValues(Loads, DL);
18792}
18793
18794 // Perform combines for MVEEXT. If it has not been optimized to anything
18795 // better before lowering, it gets converted to a stack store and extloads
18796 // performing the extend whilst still keeping the same lane ordering.
18797SDValue ARMTargetLowering::PerformMVEExtCombine(
18798 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
18799 SelectionDAG &DAG = DCI.DAG;
18800 EVT VT = N->getValueType(0);
18801 SDLoc DL(N);
18802 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18803 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18804
18805 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18806 *DAG.getContext());
18807 auto Extend = [&](SDValue V) {
18808 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18809 return N->getOpcode() == ARMISD::MVESEXT
18810 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18811 DAG.getValueType(ExtVT))
18812 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18813 };
18814
18815 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18816 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18817 SDValue Ext = Extend(N->getOperand(0));
18818 return DAG.getMergeValues({Ext, Ext}, DL);
18819 }
18820
18821 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18822 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18823 ArrayRef<int> Mask = SVN->getMask();
18824 assert(Mask.size() == 2 * VT.getVectorNumElements());
18825 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18826 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18827 SDValue Op0 = SVN->getOperand(0);
18828 SDValue Op1 = SVN->getOperand(1);
18829
18830 auto CheckInregMask = [&](int Start, int Offset) {
18831 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18832 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18833 return false;
18834 return true;
18835 };
18836 SDValue V0 = SDValue(N, 0);
18837 SDValue V1 = SDValue(N, 1);
18838 if (CheckInregMask(0, 0))
18839 V0 = Extend(Op0);
18840 else if (CheckInregMask(0, 1))
18841 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18842 else if (CheckInregMask(0, Mask.size()))
18843 V0 = Extend(Op1);
18844 else if (CheckInregMask(0, Mask.size() + 1))
18845 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18846
18847 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18848 V1 = Extend(Op1);
18849 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18850 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18851 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18852 V1 = Extend(Op0);
18853 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18854 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18855
18856 if (V0.getNode() != N || V1.getNode() != N)
18857 return DAG.getMergeValues({V0, V1}, DL);
18858 }
18859
18860 // MVEEXT(load) -> extload, extload
18861 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18862 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18863 return L;
18864
18865 if (!DCI.isAfterLegalizeDAG())
18866 return SDValue();
18867
18868 // Lower to a stack store and reload:
18869 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18870 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18871 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18872 int NumOuts = N->getNumValues();
18873 assert((NumOuts == 2 || NumOuts == 4) &&
18874 "Expected 2 or 4 outputs to an MVEEXT");
18875 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18876 *DAG.getContext());
18877 if (N->getNumOperands() == 4)
18878 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18879
18880 MachinePointerInfo MPI =
18881 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18882 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18883 StackPtr, MPI, Align(4));
18884
18885 SmallVector<SDValue> Loads;
18886 for (int I = 0; I < NumOuts; I++) {
18887 SDValue Ptr = DAG.getNode(
18888 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18889 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18890 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18891 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18892 SDValue Load = DAG.getExtLoad(
18893 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18894 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18895 Loads.push_back(Load);
18896 }
18897
18898 return DAG.getMergeValues(Loads, DL);
18899}
18900
18901SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18902 DAGCombinerInfo &DCI) const {
18903 switch (N->getOpcode()) {
18904 default: break;
18905 case ISD::SELECT_CC:
18906 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18907 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18908 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18909 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18910 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18911 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18912 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18913 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18914 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18915 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18916 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18917 case ISD::BRCOND:
18918 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18919 case ARMISD::ADDC:
18920 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18921 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18922 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18923 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18924 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18925 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18926 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18927 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18928 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18930 case ISD::EXTRACT_VECTOR_ELT:
18931 return PerformExtractEltCombine(N, DCI, Subtarget);
18935 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18936 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18937 case ISD::FP_TO_SINT:
18938 case ISD::FP_TO_UINT:
18939 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18940 case ISD::FADD:
18941 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18942 case ISD::FMUL:
18943 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18944 case ISD::INTRINSIC_WO_CHAIN:
18945 return PerformIntrinsicCombine(N, DCI);
18946 case ISD::SHL:
18947 case ISD::SRA:
18948 case ISD::SRL:
18949 return PerformShiftCombine(N, DCI, Subtarget);
18950 case ISD::SIGN_EXTEND:
18951 case ISD::ZERO_EXTEND:
18952 case ISD::ANY_EXTEND:
18953 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18954 case ISD::FP_EXTEND:
18955 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18956 case ISD::SMIN:
18957 case ISD::UMIN:
18958 case ISD::SMAX:
18959 case ISD::UMAX:
18960 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18961 case ARMISD::CMOV:
18962 return PerformCMOVCombine(N, DCI.DAG);
18963 case ARMISD::BRCOND:
18964 return PerformBRCONDCombine(N, DCI.DAG);
18965 case ARMISD::CMPZ:
18966 return PerformCMPZCombine(N, DCI.DAG);
18967 case ARMISD::CSINC:
18968 case ARMISD::CSINV:
18969 case ARMISD::CSNEG:
18970 return PerformCSETCombine(N, DCI.DAG);
18971 case ISD::LOAD:
18972 return PerformLOADCombine(N, DCI, Subtarget);
18973 case ARMISD::VLD1DUP:
18974 case ARMISD::VLD2DUP:
18975 case ARMISD::VLD3DUP:
18976 case ARMISD::VLD4DUP:
18977 return PerformVLDCombine(N, DCI);
18978 case ARMISD::BUILD_VECTOR:
18979 return PerformARMBUILD_VECTORCombine(N, DCI);
18980 case ISD::BITCAST:
18981 return PerformBITCASTCombine(N, DCI, Subtarget);
18982 case ARMISD::PREDICATE_CAST:
18983 return PerformPREDICATE_CASTCombine(N, DCI);
18984 case ARMISD::VECTOR_REG_CAST:
18985 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18986 case ARMISD::MVETRUNC:
18987 return PerformMVETruncCombine(N, DCI);
18988 case ARMISD::MVESEXT:
18989 case ARMISD::MVEZEXT:
18990 return PerformMVEExtCombine(N, DCI);
18991 case ARMISD::VCMP:
18992 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18993 case ISD::VECREDUCE_ADD:
18994 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18995 case ARMISD::VADDVs:
18996 case ARMISD::VADDVu:
18997 case ARMISD::VADDLVs:
18998 case ARMISD::VADDLVu:
18999 case ARMISD::VADDLVAs:
19000 case ARMISD::VADDLVAu:
19001 case ARMISD::VMLAVs:
19002 case ARMISD::VMLAVu:
19003 case ARMISD::VMLALVs:
19004 case ARMISD::VMLALVu:
19005 case ARMISD::VMLALVAs:
19006 case ARMISD::VMLALVAu:
19007 return PerformReduceShuffleCombine(N, DCI.DAG);
19008 case ARMISD::VMOVN:
19009 return PerformVMOVNCombine(N, DCI);
19010 case ARMISD::VQMOVNs:
19011 case ARMISD::VQMOVNu:
19012 return PerformVQMOVNCombine(N, DCI);
19013 case ARMISD::VQDMULH:
19014 return PerformVQDMULHCombine(N, DCI);
19015 case ARMISD::ASRL:
19016 case ARMISD::LSRL:
19017 case ARMISD::LSLL:
19018 return PerformLongShiftCombine(N, DCI.DAG);
19019 case ARMISD::SMULWB: {
19020 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19021 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19022 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19023 return SDValue();
19024 break;
19025 }
19026 case ARMISD::SMULWT: {
19027 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19028 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19029 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19030 return SDValue();
19031 break;
19032 }
19033 case ARMISD::SMLALBB:
19034 case ARMISD::QADD16b:
19035 case ARMISD::QSUB16b:
19036 case ARMISD::UQADD16b:
19037 case ARMISD::UQSUB16b: {
19038 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19039 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19040 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19041 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19042 return SDValue();
19043 break;
19044 }
19045 case ARMISD::SMLALBT: {
19046 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19047 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19048 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19049 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19050 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19051 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19052 return SDValue();
19053 break;
19054 }
19055 case ARMISD::SMLALTB: {
19056 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19057 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19058 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19059 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19060 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19061 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19062 return SDValue();
19063 break;
19064 }
19065 case ARMISD::SMLALTT: {
19066 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19067 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19068 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19069 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19070 return SDValue();
19071 break;
19072 }
19073 case ARMISD::QADD8b:
19074 case ARMISD::QSUB8b:
19075 case ARMISD::UQADD8b:
19076 case ARMISD::UQSUB8b: {
19077 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19078 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19079 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19080 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19081 return SDValue();
19082 break;
19083 }
19084 case ARMISD::VBSP:
19085 if (N->getOperand(1) == N->getOperand(2))
19086 return N->getOperand(1);
19087 return SDValue();
19088 case ISD::INTRINSIC_VOID:
19089 case ISD::INTRINSIC_W_CHAIN:
19090 switch (N->getConstantOperandVal(1)) {
19091 case Intrinsic::arm_neon_vld1:
19092 case Intrinsic::arm_neon_vld1x2:
19093 case Intrinsic::arm_neon_vld1x3:
19094 case Intrinsic::arm_neon_vld1x4:
19095 case Intrinsic::arm_neon_vld2:
19096 case Intrinsic::arm_neon_vld3:
19097 case Intrinsic::arm_neon_vld4:
19098 case Intrinsic::arm_neon_vld2lane:
19099 case Intrinsic::arm_neon_vld3lane:
19100 case Intrinsic::arm_neon_vld4lane:
19101 case Intrinsic::arm_neon_vld2dup:
19102 case Intrinsic::arm_neon_vld3dup:
19103 case Intrinsic::arm_neon_vld4dup:
19104 case Intrinsic::arm_neon_vst1:
19105 case Intrinsic::arm_neon_vst1x2:
19106 case Intrinsic::arm_neon_vst1x3:
19107 case Intrinsic::arm_neon_vst1x4:
19108 case Intrinsic::arm_neon_vst2:
19109 case Intrinsic::arm_neon_vst3:
19110 case Intrinsic::arm_neon_vst4:
19111 case Intrinsic::arm_neon_vst2lane:
19112 case Intrinsic::arm_neon_vst3lane:
19113 case Intrinsic::arm_neon_vst4lane:
19114 return PerformVLDCombine(N, DCI);
19115 case Intrinsic::arm_mve_vld2q:
19116 case Intrinsic::arm_mve_vld4q:
19117 case Intrinsic::arm_mve_vst2q:
19118 case Intrinsic::arm_mve_vst4q:
19119 return PerformMVEVLDCombine(N, DCI);
19120 default: break;
19121 }
19122 break;
19123 }
19124 return SDValue();
19125}
19126
19127bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19128 EVT VT) const {
19129 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19130}
19131
19132bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19133 Align Alignment,
19134 MachineMemOperand::Flags,
19135 unsigned *Fast) const {
19136 // Depends what it gets converted into if the type is weird.
19137 if (!VT.isSimple())
19138 return false;
19139
19140 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19141 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19142 auto Ty = VT.getSimpleVT().SimpleTy;
19143
19144 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19145 // Unaligned access can use (for example) LDRB, LDRH, LDR
19146 if (AllowsUnaligned) {
19147 if (Fast)
19148 *Fast = Subtarget->hasV7Ops();
19149 return true;
19150 }
19151 }
19152
19153 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19154 // For any little-endian targets with neon, we can support unaligned ld/st
19155 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19156 // A big-endian target may also explicitly support unaligned accesses
19157 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19158 if (Fast)
19159 *Fast = 1;
19160 return true;
19161 }
19162 }
19163
19164 if (!Subtarget->hasMVEIntegerOps())
19165 return false;
19166
19167 // These are for predicates
19168 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19169 Ty == MVT::v2i1)) {
19170 if (Fast)
19171 *Fast = 1;
19172 return true;
19173 }
19174
19175 // These are for truncated stores/narrowing loads. They are fine so long as
19176 // the alignment is at least the size of the item being loaded
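 // (For example, a v4i16 narrowing access only needs 2-byte alignment, since
 // each element accessed is 2 bytes wide.)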
19177 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19178 Alignment >= VT.getScalarSizeInBits() / 8) {
19179 if (Fast)
19180 *Fast = true;
19181 return true;
19182 }
19183
19184 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19185 // VSTRW.U32 all store the vector register in exactly the same format, and
19186 // differ only in the range of their immediate offset field and the required
19187 // alignment. So there is always a store that can be used, regardless of
19188 // actual type.
19189 //
19190 // For big endian, that is not the case, but we can still emit a (VSTRB.U8;
19191 // VREV64.8) pair and get the same effect. This will likely be better than
19192 // aligning the vector through the stack.
19193 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19194 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19195 Ty == MVT::v2f64) {
19196 if (Fast)
19197 *Fast = 1;
19198 return true;
19199 }
19200
19201 return false;
19202}
19203
19204EVT ARMTargetLowering::getOptimalMemOpType(
19205 LLVMContext &Context, const MemOp &Op,
19206 const AttributeList &FuncAttributes) const {
19207 // See if we can use NEON instructions for this...
19208 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19209 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19210 unsigned Fast;
19211 if (Op.size() >= 16 &&
19212 (Op.isAligned(Align(16)) ||
19213 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19215 Fast))) {
19216 return MVT::v2f64;
19217 } else if (Op.size() >= 8 &&
19218 (Op.isAligned(Align(8)) ||
19220 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19221 Fast))) {
19222 return MVT::f64;
19223 }
19224 }
19225
19226 // Let the target-independent logic figure it out.
19227 return MVT::Other;
19228}
19229
19230// 64-bit integers are split into their high and low parts and held in two
19231// different registers, so the trunc is free since the low register can just
19232// be used.
19233bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19234 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19235 return false;
19236 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19237 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19238 return (SrcBits == 64 && DestBits == 32);
19239}
19240
19241bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19242 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19243 !DstVT.isInteger())
19244 return false;
19245 unsigned SrcBits = SrcVT.getSizeInBits();
19246 unsigned DestBits = DstVT.getSizeInBits();
19247 return (SrcBits == 64 && DestBits == 32);
19248}
19249
19250bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19251 if (Val.getOpcode() != ISD::LOAD)
19252 return false;
19253
19254 EVT VT1 = Val.getValueType();
19255 if (!VT1.isSimple() || !VT1.isInteger() ||
19256 !VT2.isSimple() || !VT2.isInteger())
19257 return false;
19258
19259 switch (VT1.getSimpleVT().SimpleTy) {
19260 default: break;
19261 case MVT::i1:
19262 case MVT::i8:
19263 case MVT::i16:
19264 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19265 return true;
19266 }
19267
19268 return false;
19269}
19270
19271bool ARMTargetLowering::isFNegFree(EVT VT) const {
19272 if (!VT.isSimple())
19273 return false;
19274
19275 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19276 // negate values directly (fneg is free). So, we don't want to let the DAG
19277 // combiner rewrite fneg into xors and some other instructions. For f16 and
19278 // FullFP16 argument passing, some bitcast nodes may be introduced,
19279 // triggering this DAG combine rewrite, so we are avoiding that with this.
19280 switch (VT.getSimpleVT().SimpleTy) {
19281 default: break;
19282 case MVT::f16:
19283 return Subtarget->hasFullFP16();
19284 }
19285
19286 return false;
19287}
19288
19289Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19290 if (!Subtarget->hasMVEIntegerOps())
19291 return nullptr;
19292 Type *SVIType = SVI->getType();
19293 Type *ScalarType = SVIType->getScalarType();
19294
19295 if (ScalarType->isFloatTy())
19296 return Type::getInt32Ty(SVIType->getContext());
19297 if (ScalarType->isHalfTy())
19298 return Type::getInt16Ty(SVIType->getContext());
19299 return nullptr;
19300}
19301
19302bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19303 EVT VT = ExtVal.getValueType();
19304
19305 if (!isTypeLegal(VT))
19306 return false;
19307
19308 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19309 if (Ld->isExpandingLoad())
19310 return false;
19311 }
19312
19313 if (Subtarget->hasMVEIntegerOps())
19314 return true;
19315
19316 // Don't create a loadext if we can fold the extension into a wide/long
19317 // instruction.
19318 // If there's more than one user instruction, the loadext is desirable no
19319 // matter what. There can be two uses by the same instruction.
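 // (Illustrative note: when the only user is e.g. an add or a shift, the
 // extend can usually be folded into a widening/long form of that operation,
 // so keeping a separate extending load would not help.)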
19320 if (ExtVal->use_empty() ||
19321 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19322 return true;
19323
19324 SDNode *U = *ExtVal->user_begin();
19325 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19326 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19327 return false;
19328
19329 return true;
19330}
19331
19332bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19333 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19334 return false;
19335
19336 if (!isTypeLegal(EVT::getEVT(Ty1)))
19337 return false;
19338
19339 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19340
19341 // Assuming the caller doesn't have a zeroext or signext return parameter,
19342 // truncation all the way down to i1 is valid.
19343 return true;
19344}
19345
19346/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19347/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19348/// expanded to FMAs when this method returns true, otherwise fmuladd is
19349/// expanded to fmul + fadd.
19350///
19351/// ARM supports both fused and unfused multiply-add operations; we already
19352/// lower a pair of fmul and fadd to the latter so it's not clear that there
19353/// would be a gain or that the gain would be worthwhile enough to risk
19354/// correctness bugs.
19355///
19356/// For MVE, we set this to true as it helps simplify the need for some
19357/// patterns (and we don't have the non-fused floating point instruction).
19358bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19359 EVT VT) const {
19360 if (Subtarget->useSoftFloat())
19361 return false;
19362
19363 if (!VT.isSimple())
19364 return false;
19365
19366 switch (VT.getSimpleVT().SimpleTy) {
19367 case MVT::v4f32:
19368 case MVT::v8f16:
19369 return Subtarget->hasMVEFloatOps();
19370 case MVT::f16:
19371 return Subtarget->useFPVFMx16();
19372 case MVT::f32:
19373 return Subtarget->useFPVFMx();
19374 case MVT::f64:
19375 return Subtarget->useFPVFMx64();
19376 default:
19377 break;
19378 }
19379
19380 return false;
19381}
19382
19383static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19384 if (V < 0)
19385 return false;
19386
19387 unsigned Scale = 1;
19388 switch (VT.getSimpleVT().SimpleTy) {
19389 case MVT::i1:
19390 case MVT::i8:
19391 // Scale == 1;
19392 break;
19393 case MVT::i16:
19394 // Scale == 2;
19395 Scale = 2;
19396 break;
19397 default:
19398 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19399 // Scale == 4;
19400 Scale = 4;
19401 break;
19402 }
19403
19404 if ((V & (Scale - 1)) != 0)
19405 return false;
19406 return isUInt<5>(V / Scale);
19407}
19408
19409static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19410 const ARMSubtarget *Subtarget) {
19411 if (!VT.isInteger() && !VT.isFloatingPoint())
19412 return false;
19413 if (VT.isVector() && Subtarget->hasNEON())
19414 return false;
19415 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19416 !Subtarget->hasMVEFloatOps())
19417 return false;
19418
19419 bool IsNeg = false;
19420 if (V < 0) {
19421 IsNeg = true;
19422 V = -V;
19423 }
19424
19425 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19426
19427 // MVE: size * imm7
19428 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19429 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19430 case MVT::i32:
19431 case MVT::f32:
19432 return isShiftedUInt<7,2>(V);
19433 case MVT::i16:
19434 case MVT::f16:
19435 return isShiftedUInt<7,1>(V);
19436 case MVT::i8:
19437 return isUInt<7>(V);
19438 default:
19439 return false;
19440 }
19441 }
19442
19443 // half VLDR: 2 * imm8
19444 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19445 return isShiftedUInt<8, 1>(V);
19446 // VLDR and LDRD: 4 * imm8
19447 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19448 return isShiftedUInt<8, 2>(V);
19449
19450 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19451 // + imm12 or - imm8
19452 if (IsNeg)
19453 return isUInt<8>(V);
19454 return isUInt<12>(V);
19455 }
19456
19457 return false;
19458}
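// A standalone sketch of the Thumb-2 scalar ranges checked above: byte,
// halfword and word accesses take +imm12 or -imm8, while VLDR and LDRD take an
// 8-bit immediate scaled by 4. Hypothetical helper, for illustration only.
#include <cassert>
#include <cstdint>

static bool t2ScalarOffsetIsLegal(int64_t Offset, unsigned AccessBytes,
                                  bool IsFPOrDoubleWord) {
  int64_t Mag = Offset < 0 ? -Offset : Offset;
  if (IsFPOrDoubleWord) // VLDR / LDRD: 4 * imm8, either sign
    return Mag % 4 == 0 && Mag / 4 < 256;
  if (AccessBytes > 4)
    return false;
  return Offset >= 0 ? Offset < 4096 : Mag < 256; // +imm12 or -imm8
}

static void t2OffsetExamples() {
  assert(t2ScalarOffsetIsLegal(4095, 4, false));   // ldr r0, [r1, #4095]
  assert(!t2ScalarOffsetIsLegal(-4095, 4, false)); // negative side is imm8 only
  assert(t2ScalarOffsetIsLegal(-255, 4, false));
  assert(t2ScalarOffsetIsLegal(1020, 8, true));    // vldr/ldrd: 255 * 4
  assert(!t2ScalarOffsetIsLegal(1022, 8, true));   // must be a multiple of 4
}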
19459
19460/// isLegalAddressImmediate - Return true if the integer value can be used
19461/// as the offset of the target addressing mode for load / store of the
19462/// given type.
19463static bool isLegalAddressImmediate(int64_t V, EVT VT,
19464 const ARMSubtarget *Subtarget) {
19465 if (V == 0)
19466 return true;
19467
19468 if (!VT.isSimple())
19469 return false;
19470
19471 if (Subtarget->isThumb1Only())
19472 return isLegalT1AddressImmediate(V, VT);
19473 else if (Subtarget->isThumb2())
19474 return isLegalT2AddressImmediate(V, VT, Subtarget);
19475
19476 // ARM mode.
19477 if (V < 0)
19478 V = - V;
19479 switch (VT.getSimpleVT().SimpleTy) {
19480 default: return false;
19481 case MVT::i1:
19482 case MVT::i8:
19483 case MVT::i32:
19484 // +- imm12
19485 return isUInt<12>(V);
19486 case MVT::i16:
19487 // +- imm8
19488 return isUInt<8>(V);
19489 case MVT::f32:
19490 case MVT::f64:
19491 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19492 return false;
19493 return isShiftedUInt<8, 2>(V);
19494 }
19495}
19496
19497 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19498 EVT VT) const {
19499 int Scale = AM.Scale;
19500 if (Scale < 0)
19501 return false;
19502
19503 switch (VT.getSimpleVT().SimpleTy) {
19504 default: return false;
19505 case MVT::i1:
19506 case MVT::i8:
19507 case MVT::i16:
19508 case MVT::i32:
19509 if (Scale == 1)
19510 return true;
19511 // r + r << imm
19512 Scale = Scale & ~1;
19513 return Scale == 2 || Scale == 4 || Scale == 8;
19514 case MVT::i64:
19515 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19516 // version in Thumb mode.
19517 // r + r
19518 if (Scale == 1)
19519 return true;
19520 // r * 2 (this can be lowered to r + r).
19521 if (!AM.HasBaseReg && Scale == 2)
19522 return true;
19523 return false;
19524 case MVT::isVoid:
19525 // Note, we allow "void" uses (basically, uses that aren't loads or
19526 // stores), because arm allows folding a scale into many arithmetic
19527 // operations. This should be made more precise and revisited later.
19528
19529 // Allow r << imm, but the imm has to be a multiple of two.
19530 if (Scale & 1) return false;
19531 return isPowerOf2_32(Scale);
19532 }
19533}
19534
19535 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19536 EVT VT) const {
19537 const int Scale = AM.Scale;
19538
19539 // Negative scales are not supported in Thumb1.
19540 if (Scale < 0)
19541 return false;
19542
19543 // Thumb1 addressing modes do not support register scaling excepting the
19544 // following cases:
19545 // 1. Scale == 1 means no scaling.
19546 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19547 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19548}
19549
19550/// isLegalAddressingMode - Return true if the addressing mode represented
19551/// by AM is legal for this target, for a load/store of the specified type.
19552 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19553 const AddrMode &AM, Type *Ty,
19554 unsigned AS, Instruction *I) const {
19555 EVT VT = getValueType(DL, Ty, true);
19556 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19557 return false;
19558
19559 // Can never fold addr of global into load/store.
19560 if (AM.BaseGV)
19561 return false;
19562
19563 switch (AM.Scale) {
19564 case 0: // no scale reg, must be "r+i" or "r", or "i".
19565 break;
19566 default:
19567 // ARM doesn't support any R+R*scale+imm addr modes.
19568 if (AM.BaseOffs)
19569 return false;
19570
19571 if (!VT.isSimple())
19572 return false;
19573
19574 if (Subtarget->isThumb1Only())
19575 return isLegalT1ScaledAddressingMode(AM, VT);
19576
19577 if (Subtarget->isThumb2())
19578 return isLegalT2ScaledAddressingMode(AM, VT);
19579
19580 int Scale = AM.Scale;
19581 switch (VT.getSimpleVT().SimpleTy) {
19582 default: return false;
19583 case MVT::i1:
19584 case MVT::i8:
19585 case MVT::i32:
19586 if (Scale < 0) Scale = -Scale;
19587 if (Scale == 1)
19588 return true;
19589 // r + r << imm
19590 return isPowerOf2_32(Scale & ~1);
19591 case MVT::i16:
19592 case MVT::i64:
19593 // r +/- r
19594 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19595 return true;
19596 // r * 2 (this can be lowered to r + r).
19597 if (!AM.HasBaseReg && Scale == 2)
19598 return true;
19599 return false;
19600
19601 case MVT::isVoid:
19602 // Note, we allow "void" uses (basically, uses that aren't loads or
19603 // stores), because arm allows folding a scale into many arithmetic
19604 // operations. This should be made more precise and revisited later.
19605
19606 // Allow r << imm, but the imm has to be a multiple of two.
19607 if (Scale & 1) return false;
19608 return isPowerOf2_32(Scale);
19609 }
19610 }
19611 return true;
19612}
19613
19614 /// isLegalICmpImmediate - Return true if the specified immediate is a legal
19615 /// icmp immediate, that is, the target has icmp instructions which can compare
19616 /// a register against the immediate without having to materialize the
19617 /// immediate into a register.
19618 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19619 // Thumb2 and ARM modes can use cmn for negative immediates.
19620 if (!Subtarget->isThumb())
19621 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19622 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19623 if (Subtarget->isThumb2())
19624 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19625 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19626 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19627 return Imm >= 0 && Imm <= 255;
19628}
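// A standalone sketch of the classic ARM-mode "modified immediate" rule that
// the check above relies on: an 8-bit value rotated right by an even amount.
// The helper approximates ARM_AM::getSOImmVal for illustration only and
// ignores the richer Thumb-2 encodings.
#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned N) {
  N &= 31;
  return N == 0 ? V : (V << N) | (V >> (32 - N));
}

static bool isARMModifiedImm(uint32_t V) {
  // Encodable if rotating left by some even amount (undoing the encoder's
  // rotate-right) leaves a value that fits in 8 bits.
  for (unsigned Rot = 0; Rot < 32; Rot += 2)
    if (rotl32(V, Rot) <= 0xFF)
      return true;
  return false;
}

static void icmpImmediateExamples() {
  assert(isARMModifiedImm(0xFF000000u)); // 0xFF rotated right by 8
  assert(!isARMModifiedImm(0x101));      // needs a 9-bit window
  // cmp r0, #-2 is not directly encodable, but 2 is, so the backend can use
  // cmn r0, #2 instead -- exactly what the hook above models.
  assert(!isARMModifiedImm((uint32_t)-2) && isARMModifiedImm(2));
}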
19629
19630 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
19631 /// *or sub* immediate, that is, the target has add or sub instructions which
19632 /// can add the immediate to a register without having to materialize the
19633 /// immediate into a register.
19634 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19635 // Same encoding for add/sub, just flip the sign.
19636 uint64_t AbsImm = AbsoluteValue(Imm);
19637 if (!Subtarget->isThumb())
19638 return ARM_AM::getSOImmVal(AbsImm) != -1;
19639 if (Subtarget->isThumb2())
19640 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19641 // Thumb1 only has 8-bit unsigned immediate.
19642 return AbsImm <= 255;
19643}
19644
19645// Return false to prevent folding
19646// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19647// if the folding leads to worse code.
19648 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19649 SDValue ConstNode) const {
19650 // Let the DAGCombiner decide for vector types and large types.
19651 const EVT VT = AddNode.getValueType();
19652 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19653 return true;
19654
19655 // It is worse if c0 is legal add immediate, while c1*c0 is not
19656 // and has to be composed by at least two instructions.
19657 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19658 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19659 const int64_t C0 = C0Node->getSExtValue();
19660 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19661 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19662 return true;
19663 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19664 return false;
19665
19666 // Default to true and let the DAGCombiner decide.
19667 return true;
19668}
19669
19670 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19671 bool isSEXTLoad, SDValue &Base,
19672 SDValue &Offset, bool &isInc,
19673 SelectionDAG &DAG) {
19674 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19675 return false;
19676
19677 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19678 // AddressingMode 3
19679 Base = Ptr->getOperand(0);
19680 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19681 int RHSC = (int)RHS->getZExtValue();
19682 if (RHSC < 0 && RHSC > -256) {
19683 assert(Ptr->getOpcode() == ISD::ADD);
19684 isInc = false;
19685 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19686 return true;
19687 }
19688 }
19689 isInc = (Ptr->getOpcode() == ISD::ADD);
19690 Offset = Ptr->getOperand(1);
19691 return true;
19692 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19693 // AddressingMode 2
19694 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19695 int RHSC = (int)RHS->getZExtValue();
19696 if (RHSC < 0 && RHSC > -0x1000) {
19697 assert(Ptr->getOpcode() == ISD::ADD);
19698 isInc = false;
19699 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19700 Base = Ptr->getOperand(0);
19701 return true;
19702 }
19703 }
19704
19705 if (Ptr->getOpcode() == ISD::ADD) {
19706 isInc = true;
19707 ARM_AM::ShiftOpc ShOpcVal=
19708 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19709 if (ShOpcVal != ARM_AM::no_shift) {
19710 Base = Ptr->getOperand(1);
19711 Offset = Ptr->getOperand(0);
19712 } else {
19713 Base = Ptr->getOperand(0);
19714 Offset = Ptr->getOperand(1);
19715 }
19716 return true;
19717 }
19718
19719 isInc = (Ptr->getOpcode() == ISD::ADD);
19720 Base = Ptr->getOperand(0);
19721 Offset = Ptr->getOperand(1);
19722 return true;
19723 }
19724
19725 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19726 return false;
19727}
19728
19729 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19730 bool isSEXTLoad, SDValue &Base,
19731 SDValue &Offset, bool &isInc,
19732 SelectionDAG &DAG) {
19733 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19734 return false;
19735
19736 Base = Ptr->getOperand(0);
19737 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19738 int RHSC = (int)RHS->getZExtValue();
19739 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19740 assert(Ptr->getOpcode() == ISD::ADD);
19741 isInc = false;
19742 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19743 return true;
19744 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19745 isInc = Ptr->getOpcode() == ISD::ADD;
19746 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19747 return true;
19748 }
19749 }
19750
19751 return false;
19752}
19753
19754static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19755 bool isSEXTLoad, bool IsMasked, bool isLE,
19756 SDValue &Base, SDValue &Offset,
19757 bool &isInc, SelectionDAG &DAG) {
19758 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19759 return false;
19760 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19761 return false;
19762
19763 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19764 // as opposed to a vldrw.32). This can allow extra addressing modes or
19765 // alignments for what is otherwise an equivalent instruction.
19766 bool CanChangeType = isLE && !IsMasked;
19767
19768 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19769 int RHSC = (int)RHS->getZExtValue();
19770
19771 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19772 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19773 assert(Ptr->getOpcode() == ISD::ADD);
19774 isInc = false;
19775 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19776 return true;
19777 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19778 isInc = Ptr->getOpcode() == ISD::ADD;
19779 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19780 return true;
19781 }
19782 return false;
19783 };
19784
19785 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19786 // (in BE/masked) type.
19787 Base = Ptr->getOperand(0);
19788 if (VT == MVT::v4i16) {
19789 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19790 return true;
19791 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19792 if (IsInRange(RHSC, 0x80, 1))
19793 return true;
19794 } else if (Alignment >= 4 &&
19795 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19796 IsInRange(RHSC, 0x80, 4))
19797 return true;
19798 else if (Alignment >= 2 &&
19799 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19800 IsInRange(RHSC, 0x80, 2))
19801 return true;
19802 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19803 return true;
19804 return false;
19805}
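// A standalone sketch of the MVE offset rule used by IsInRange above: the
// immediate must be a multiple of the element size and, once scaled, fit in a
// 7-bit field, i.e. up to +/-127 elements. Hypothetical helper for
// illustration only.
#include <cassert>
#include <cstdint>

static bool mveImm7OffsetIsLegal(int64_t Offset, unsigned ElementBytes) {
  if (Offset % ElementBytes != 0)
    return false;
  int64_t Scaled = Offset / ElementBytes;
  return Scaled > -128 && Scaled < 128;
}

static void mveOffsetExamples() {
  assert(mveImm7OffsetIsLegal(508, 4));  // vldrw.u32: 127 * 4
  assert(!mveImm7OffsetIsLegal(512, 4)); // 128 elements does not fit in imm7
  assert(mveImm7OffsetIsLegal(-254, 2)); // vldrh.u16: -127 * 2
  assert(!mveImm7OffsetIsLegal(255, 2)); // not a multiple of the element size
}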
19806
19807/// getPreIndexedAddressParts - returns true by value, base pointer and
19808/// offset pointer and addressing mode by reference if the node's address
19809/// can be legally represented as pre-indexed load / store address.
19810bool
19811 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19812 SDValue &Offset,
19813 ISD::MemIndexedMode &AM,
19814 SelectionDAG &DAG) const {
19815 if (Subtarget->isThumb1Only())
19816 return false;
19817
19818 EVT VT;
19819 SDValue Ptr;
19820 Align Alignment;
19821 bool isSEXTLoad = false;
19822 bool IsMasked = false;
19823 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19824 Ptr = LD->getBasePtr();
19825 VT = LD->getMemoryVT();
19826 Alignment = LD->getAlign();
19827 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19828 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19829 Ptr = ST->getBasePtr();
19830 VT = ST->getMemoryVT();
19831 Alignment = ST->getAlign();
19832 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19833 Ptr = LD->getBasePtr();
19834 VT = LD->getMemoryVT();
19835 Alignment = LD->getAlign();
19836 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19837 IsMasked = true;
19838 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19839 Ptr = ST->getBasePtr();
19840 VT = ST->getMemoryVT();
19841 Alignment = ST->getAlign();
19842 IsMasked = true;
19843 } else
19844 return false;
19845
19846 bool isInc;
19847 bool isLegal = false;
19848 if (VT.isVector())
19849 isLegal = Subtarget->hasMVEIntegerOps() &&
19850 getMVEIndexedAddressParts(
19851 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19852 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19853 else {
19854 if (Subtarget->isThumb2())
19855 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19856 Offset, isInc, DAG);
19857 else
19858 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19859 Offset, isInc, DAG);
19860 }
19861 if (!isLegal)
19862 return false;
19863
19864 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19865 return true;
19866}
19867
19868/// getPostIndexedAddressParts - returns true by value, base pointer and
19869/// offset pointer and addressing mode by reference if this node can be
19870/// combined with a load / store to form a post-indexed load / store.
19871 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19872 SDValue &Base,
19873 SDValue &Offset,
19874 ISD::MemIndexedMode &AM,
19875 SelectionDAG &DAG) const {
19876 EVT VT;
19877 SDValue Ptr;
19878 Align Alignment;
19879 bool isSEXTLoad = false, isNonExt;
19880 bool IsMasked = false;
19881 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19882 VT = LD->getMemoryVT();
19883 Ptr = LD->getBasePtr();
19884 Alignment = LD->getAlign();
19885 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19886 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19887 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19888 VT = ST->getMemoryVT();
19889 Ptr = ST->getBasePtr();
19890 Alignment = ST->getAlign();
19891 isNonExt = !ST->isTruncatingStore();
19892 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19893 VT = LD->getMemoryVT();
19894 Ptr = LD->getBasePtr();
19895 Alignment = LD->getAlign();
19896 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19897 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19898 IsMasked = true;
19899 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19900 VT = ST->getMemoryVT();
19901 Ptr = ST->getBasePtr();
19902 Alignment = ST->getAlign();
19903 isNonExt = !ST->isTruncatingStore();
19904 IsMasked = true;
19905 } else
19906 return false;
19907
19908 if (Subtarget->isThumb1Only()) {
19909 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19910 // must be non-extending/truncating, i32, with an offset of 4.
19911 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19912 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19913 return false;
19914 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19915 if (!RHS || RHS->getZExtValue() != 4)
19916 return false;
19917 if (Alignment < Align(4))
19918 return false;
19919
19920 Offset = Op->getOperand(1);
19921 Base = Op->getOperand(0);
19922 AM = ISD::POST_INC;
19923 return true;
19924 }
19925
19926 bool isInc;
19927 bool isLegal = false;
19928 if (VT.isVector())
19929 isLegal = Subtarget->hasMVEIntegerOps() &&
19930 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19931 Subtarget->isLittle(), Base, Offset,
19932 isInc, DAG);
19933 else {
19934 if (Subtarget->isThumb2())
19935 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19936 isInc, DAG);
19937 else
19938 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19939 isInc, DAG);
19940 }
19941 if (!isLegal)
19942 return false;
19943
19944 if (Ptr != Base) {
19945 // Swap base ptr and offset to catch more post-index load / store when
19946 // it's legal. In Thumb2 mode, offset must be an immediate.
19947 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19948 !Subtarget->isThumb2())
19949 std::swap(Base, Offset);
19950
19951 // Post-indexed load / store update the base pointer.
19952 if (Ptr != Base)
19953 return false;
19954 }
19955
19956 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19957 return true;
19958}
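// Illustration only: the kind of source pattern the pre/post-indexed hooks
// above try to catch. Each iteration loads through the pointer and then
// advances it by the access size, which a post-indexed load such as
// "ldr r3, [r0], #4" performs in a single instruction.
#include <cstdint>

static int64_t sumWords(const int32_t *P, unsigned N) {
  int64_t Sum = 0;
  for (unsigned I = 0; I < N; ++I)
    Sum += *P++; // load, then base update: a post-increment candidate
  return Sum;
}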
19959
19960 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19961 KnownBits &Known,
19962 const APInt &DemandedElts,
19963 const SelectionDAG &DAG,
19964 unsigned Depth) const {
19965 unsigned BitWidth = Known.getBitWidth();
19966 Known.resetAll();
19967 switch (Op.getOpcode()) {
19968 default: break;
19969 case ARMISD::ADDC:
19970 case ARMISD::ADDE:
19971 case ARMISD::SUBC:
19972 case ARMISD::SUBE:
19973 // Special cases when we convert a carry to a boolean.
19974 if (Op.getResNo() == 0) {
19975 SDValue LHS = Op.getOperand(0);
19976 SDValue RHS = Op.getOperand(1);
19977 // (ADDE 0, 0, C) will give us a single bit.
19978 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19979 isNullConstant(RHS)) {
19980 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19981 return;
19982 }
19983 }
19984 break;
19985 case ARMISD::CMOV: {
19986 // Bits are known zero/one if known on the LHS and RHS.
19987 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19988 if (Known.isUnknown())
19989 return;
19990
19991 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19992 Known = Known.intersectWith(KnownRHS);
19993 return;
19994 }
19995 case ISD::INTRINSIC_W_CHAIN: {
19996 Intrinsic::ID IntID =
19997 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19998 switch (IntID) {
19999 default: return;
20000 case Intrinsic::arm_ldaex:
20001 case Intrinsic::arm_ldrex: {
20002 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20003 unsigned MemBits = VT.getScalarSizeInBits();
20004 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20005 return;
20006 }
20007 }
20008 }
20009 case ARMISD::BFI: {
20010 // Conservatively, we can recurse down the first operand
20011 // and just mask out all affected bits.
20012 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20013
20014 // The operand to BFI is already a mask suitable for removing the bits it
20015 // sets.
20016 const APInt &Mask = Op.getConstantOperandAPInt(2);
20017 Known.Zero &= Mask;
20018 Known.One &= Mask;
20019 return;
20020 }
20021 case ARMISD::VGETLANEs:
20022 case ARMISD::VGETLANEu: {
20023 const SDValue &SrcSV = Op.getOperand(0);
20024 EVT VecVT = SrcSV.getValueType();
20025 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20026 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20027 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20028 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20029 "VGETLANE index out of bounds");
20030 unsigned Idx = Pos->getZExtValue();
20031 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20032 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20033
20034 EVT VT = Op.getValueType();
20035 const unsigned DstSz = VT.getScalarSizeInBits();
20036 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20037 (void)SrcSz;
20038 assert(SrcSz == Known.getBitWidth());
20039 assert(DstSz > SrcSz);
20040 if (Op.getOpcode() == ARMISD::VGETLANEs)
20041 Known = Known.sext(DstSz);
20042 else {
20043 Known = Known.zext(DstSz);
20044 }
20045 assert(DstSz == Known.getBitWidth());
20046 break;
20047 }
20048 case ARMISD::VMOVrh: {
20049 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20050 assert(KnownOp.getBitWidth() == 16);
20051 Known = KnownOp.zext(32);
20052 break;
20053 }
20054 case ARMISD::CSINC:
20055 case ARMISD::CSINV:
20056 case ARMISD::CSNEG: {
20057 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20058 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20059
20060 // The result is either:
20061 // CSINC: KnownOp0 or KnownOp1 + 1
20062 // CSINV: KnownOp0 or ~KnownOp1
20063 // CSNEG: KnownOp0 or KnownOp1 * -1
20064 if (Op.getOpcode() == ARMISD::CSINC)
20065 KnownOp1 =
20066 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20067 else if (Op.getOpcode() == ARMISD::CSINV)
20068 std::swap(KnownOp1.Zero, KnownOp1.One);
20069 else if (Op.getOpcode() == ARMISD::CSNEG)
20070 KnownOp1 = KnownBits::mul(KnownOp1,
20071 KnownBits::makeConstant(APInt(32, -1)));
20072
20073 Known = KnownOp0.intersectWith(KnownOp1);
20074 break;
20075 }
20076 }
20077}
20078
20079 bool ARMTargetLowering::targetShrinkDemandedConstant(
20080 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20081 TargetLoweringOpt &TLO) const {
20082 // Delay optimization, so we don't have to deal with illegal types, or block
20083 // optimizations.
20084 if (!TLO.LegalOps)
20085 return false;
20086
20087 // Only optimize AND for now.
20088 if (Op.getOpcode() != ISD::AND)
20089 return false;
20090
20091 EVT VT = Op.getValueType();
20092
20093 // Ignore vectors.
20094 if (VT.isVector())
20095 return false;
20096
20097 assert(VT == MVT::i32 && "Unexpected integer type");
20098
20099 // Make sure the RHS really is a constant.
20100 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20101 if (!C)
20102 return false;
20103
20104 unsigned Mask = C->getZExtValue();
20105
20106 unsigned Demanded = DemandedBits.getZExtValue();
20107 unsigned ShrunkMask = Mask & Demanded;
20108 unsigned ExpandedMask = Mask | ~Demanded;
20109
20110 // If the mask is all zeros, let the target-independent code replace the
20111 // result with zero.
20112 if (ShrunkMask == 0)
20113 return false;
20114
20115 // If the mask is all ones, erase the AND. (Currently, the target-independent
20116 // code won't do this, so we have to do it explicitly to avoid an infinite
20117 // loop in obscure cases.)
20118 if (ExpandedMask == ~0U)
20119 return TLO.CombineTo(Op, Op.getOperand(0));
20120
20121 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20122 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20123 };
20124 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20125 if (NewMask == Mask)
20126 return true;
20127 SDLoc DL(Op);
20128 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20129 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20130 return TLO.CombineTo(Op, NewOp);
20131 };
20132
20133 // Prefer uxtb mask.
20134 if (IsLegalMask(0xFF))
20135 return UseMask(0xFF);
20136
20137 // Prefer uxth mask.
20138 if (IsLegalMask(0xFFFF))
20139 return UseMask(0xFFFF);
20140
20141 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20142 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20143 if (ShrunkMask < 256)
20144 return UseMask(ShrunkMask);
20145
20146 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20147 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20148 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20149 return UseMask(ExpandedMask);
20150
20151 // Potential improvements:
20152 //
20153 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20154 // We could try to prefer Thumb1 immediates which can be lowered to a
20155 // two-instruction sequence.
20156 // We could try to recognize more legal ARM/Thumb2 immediates here.
20157
20158 return false;
20159}
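// A standalone sketch of the mask-narrowing rule above: a replacement AND mask
// is equivalent if it keeps every demanded bit that the original mask keeps
// (ShrunkMask) and clears every demanded bit that the original mask clears
// (i.e. stays within ExpandedMask). The hook prefers 0xFF and 0xFFFF because
// they map to uxtb / uxth. Hypothetical helper for illustration only.
#include <cassert>
#include <cstdint>

static bool maskIsEquivalent(uint32_t Mask, uint32_t Demanded,
                             uint32_t NewMask) {
  uint32_t Shrunk = Mask & Demanded;
  uint32_t Expanded = Mask | ~Demanded;
  return (Shrunk & NewMask) == Shrunk && (~Expanded & NewMask) == 0;
}

static void shrinkDemandedExamples() {
  // With only the low byte demanded, "and r0, r0, #0x1FF" can become uxtb.
  assert(maskIsEquivalent(0x1FF, 0xFF, 0xFF));
  // ...but not if bit 8 of the result is demanded as well.
  assert(!maskIsEquivalent(0x1FF, 0x1FF, 0xFF));
}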
20160
20161 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20162 SDValue Op, const APInt &OriginalDemandedBits,
20163 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20164 unsigned Depth) const {
20165 unsigned Opc = Op.getOpcode();
20166
20167 switch (Opc) {
20168 case ARMISD::ASRL:
20169 case ARMISD::LSRL: {
20170 // If this is result 0 and the other result is unused, see if the demand
20171 // bits allow us to shrink this long shift into a standard small shift in
20172 // the opposite direction.
20173 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20174 isa<ConstantSDNode>(Op->getOperand(2))) {
20175 unsigned ShAmt = Op->getConstantOperandVal(2);
20176 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20177 << (32 - ShAmt)))
20178 return TLO.CombineTo(
20179 Op, TLO.DAG.getNode(
20180 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20181 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20182 }
20183 break;
20184 }
20185 case ARMISD::VBICIMM: {
20186 SDValue Op0 = Op.getOperand(0);
20187 unsigned ModImm = Op.getConstantOperandVal(1);
20188 unsigned EltBits = 0;
20189 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20190 if ((OriginalDemandedBits & Mask) == 0)
20191 return TLO.CombineTo(Op, Op0);
20192 }
20193 }
20194
20195 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20196 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20197}
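// A standalone check of the demanded-bits rewrite above for the long shifts:
// the low half of a 64-bit right shift is (Lo >> Amt) | (Hi << (32 - Amt)), so
// when only its top Amt bits are demanded they can be produced by a plain
// 32-bit left shift of the high word (operand 1). Illustration only.
#include <cassert>
#include <cstdint>

static void longShiftShrinkExample() {
  uint32_t Lo = 0x89ABCDEF, Hi = 0x01234567;
  unsigned Amt = 8;
  uint64_t Wide = ((uint64_t)Hi << 32) | Lo;
  uint32_t LowResult = (uint32_t)(Wide >> Amt); // low half of LSRL
  uint32_t TopMask = ~0u << (32 - Amt);         // only these bits demanded
  assert((LowResult & TopMask) == ((Hi << (32 - Amt)) & TopMask));
}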
20198
20199//===----------------------------------------------------------------------===//
20200// ARM Inline Assembly Support
20201//===----------------------------------------------------------------------===//
20202
20203 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
20204 // Looking for "rev" which is V6+.
20205 if (!Subtarget->hasV6Ops())
20206 return false;
20207
20208 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
20209 StringRef AsmStr = IA->getAsmString();
20210 SmallVector<StringRef, 4> AsmPieces;
20211 SplitString(AsmStr, AsmPieces, ";\n");
20212
20213 switch (AsmPieces.size()) {
20214 default: return false;
20215 case 1:
20216 AsmStr = AsmPieces[0];
20217 AsmPieces.clear();
20218 SplitString(AsmStr, AsmPieces, " \t,");
20219
20220 // rev $0, $1
20221 if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" &&
20222 AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
20223 IA->getConstraintString().starts_with("=l,l")) {
20224 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
20225 if (Ty && Ty->getBitWidth() == 32)
20226 return IntrinsicLowering::LowerToByteSwap(CI);
20227 }
20228 break;
20229 }
20230
20231 return false;
20232}
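// Illustration only (assuming compilation for an ARMv6+ target): the inline
// asm pattern the hook above recognises. A lone "rev" with low-register
// operands and a 32-bit integer result is rewritten to the generic byte-swap
// intrinsic, which later optimisations understand.
#include <cstdint>

static uint32_t byteSwapViaInlineAsm(uint32_t X) {
  uint32_t R;
  asm("rev %0, %1" : "=l"(R) : "l"(X)); // constraint string "=l,l"
  return R;
}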
20233
20234const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20235 // At this point, we have to lower this constraint to something else, so we
20236 // lower it to an "r" or "w". However, by doing this we will force the result
20237 // to be in register, while the X constraint is much more permissive.
20238 //
20239 // Although we are correct (we are free to emit anything, without
20240 // constraints), we might break use cases that would expect us to be more
20241 // efficient and emit something else.
20242 if (!Subtarget->hasVFP2Base())
20243 return "r";
20244 if (ConstraintVT.isFloatingPoint())
20245 return "w";
20246 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20247 (ConstraintVT.getSizeInBits() == 64 ||
20248 ConstraintVT.getSizeInBits() == 128))
20249 return "w";
20250
20251 return "r";
20252}
20253
20254/// getConstraintType - Given a constraint letter, return the type of
20255/// constraint it is for this target.
20256 ARMTargetLowering::ConstraintType
20257 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20258 unsigned S = Constraint.size();
20259 if (S == 1) {
20260 switch (Constraint[0]) {
20261 default: break;
20262 case 'l': return C_RegisterClass;
20263 case 'w': return C_RegisterClass;
20264 case 'h': return C_RegisterClass;
20265 case 'x': return C_RegisterClass;
20266 case 't': return C_RegisterClass;
20267 case 'j': return C_Immediate; // Constant for movw.
20268 // An address with a single base register. Due to the way we
20269 // currently handle addresses it is the same as an 'r' memory constraint.
20270 case 'Q': return C_Memory;
20271 }
20272 } else if (S == 2) {
20273 switch (Constraint[0]) {
20274 default: break;
20275 case 'T': return C_RegisterClass;
20276 // All 'U+' constraints are addresses.
20277 case 'U': return C_Memory;
20278 }
20279 }
20280 return TargetLowering::getConstraintType(Constraint);
20281}
20282
20283/// Examine constraint type and operand type and determine a weight value.
20284/// This object must already have been set up with the operand type
20285/// and the current alternative constraint selected.
20286 TargetLowering::ConstraintWeight
20287 ARMTargetLowering::getSingleConstraintMatchWeight(
20288 AsmOperandInfo &info, const char *constraint) const {
20289 ConstraintWeight weight = CW_Invalid;
20290 Value *CallOperandVal = info.CallOperandVal;
20291 // If we don't have a value, we can't do a match,
20292 // but allow it at the lowest weight.
20293 if (!CallOperandVal)
20294 return CW_Default;
20295 Type *type = CallOperandVal->getType();
20296 // Look at the constraint type.
20297 switch (*constraint) {
20298 default:
20299 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20300 break;
20301 case 'l':
20302 if (type->isIntegerTy()) {
20303 if (Subtarget->isThumb())
20304 weight = CW_SpecificReg;
20305 else
20306 weight = CW_Register;
20307 }
20308 break;
20309 case 'w':
20310 if (type->isFloatingPointTy())
20311 weight = CW_Register;
20312 break;
20313 }
20314 return weight;
20315}
20316
20317static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20318 if (PR == 0 || VT == MVT::Other)
20319 return false;
20320 return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
20321 (ARM::DPRRegClass.contains(PR) && VT != MVT::f64 &&
20322 !VT.is64BitVector());
20323}
20324
20325using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20326
20327 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20328 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20329 switch (Constraint.size()) {
20330 case 1:
20331 // GCC ARM Constraint Letters
20332 switch (Constraint[0]) {
20333 case 'l': // Low regs or general regs.
20334 if (Subtarget->isThumb())
20335 return RCPair(0U, &ARM::tGPRRegClass);
20336 return RCPair(0U, &ARM::GPRRegClass);
20337 case 'h': // High regs or no regs.
20338 if (Subtarget->isThumb())
20339 return RCPair(0U, &ARM::hGPRRegClass);
20340 break;
20341 case 'r':
20342 if (Subtarget->isThumb1Only())
20343 return RCPair(0U, &ARM::tGPRRegClass);
20344 return RCPair(0U, &ARM::GPRRegClass);
20345 case 'w':
20346 if (VT == MVT::Other)
20347 break;
20348 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20349 return RCPair(0U, &ARM::SPRRegClass);
20350 if (VT.getSizeInBits() == 64)
20351 return RCPair(0U, &ARM::DPRRegClass);
20352 if (VT.getSizeInBits() == 128)
20353 return RCPair(0U, &ARM::QPRRegClass);
20354 break;
20355 case 'x':
20356 if (VT == MVT::Other)
20357 break;
20358 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20359 return RCPair(0U, &ARM::SPR_8RegClass);
20360 if (VT.getSizeInBits() == 64)
20361 return RCPair(0U, &ARM::DPR_8RegClass);
20362 if (VT.getSizeInBits() == 128)
20363 return RCPair(0U, &ARM::QPR_8RegClass);
20364 break;
20365 case 't':
20366 if (VT == MVT::Other)
20367 break;
20368 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20369 return RCPair(0U, &ARM::SPRRegClass);
20370 if (VT.getSizeInBits() == 64)
20371 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20372 if (VT.getSizeInBits() == 128)
20373 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20374 break;
20375 }
20376 break;
20377
20378 case 2:
20379 if (Constraint[0] == 'T') {
20380 switch (Constraint[1]) {
20381 default:
20382 break;
20383 case 'e':
20384 return RCPair(0U, &ARM::tGPREvenRegClass);
20385 case 'o':
20386 return RCPair(0U, &ARM::tGPROddRegClass);
20387 }
20388 }
20389 break;
20390
20391 default:
20392 break;
20393 }
20394
20395 if (StringRef("{cc}").equals_insensitive(Constraint))
20396 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20397
20398 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20399 if (isIncompatibleReg(RCP.first, VT))
20400 return {0, nullptr};
20401 return RCP;
20402}
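// Illustration only (assuming an ARMv7 target with VFP): how the GCC-style
// register-class constraints resolved above appear in user code. 'l' requests
// a low GPR (r0-r7), 'w' a floating-point/vector register, and 't' an
// S register usable by VFP single-precision instructions.
static float addSingles(float A, float B) {
  float R;
  asm("vadd.f32 %0, %1, %2" : "=t"(R) : "t"(A), "t"(B));
  return R;
}

static unsigned countLeadingZeros(unsigned X) {
  unsigned R;
  asm("clz %0, %1" : "=l"(R) : "l"(X)); // low registers, as 'l' guarantees
  return R;
}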
20403
20404/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20405/// vector. If it is invalid, don't add anything to Ops.
20406 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20407 StringRef Constraint,
20408 std::vector<SDValue> &Ops,
20409 SelectionDAG &DAG) const {
20410 SDValue Result;
20411
20412 // Currently only support length 1 constraints.
20413 if (Constraint.size() != 1)
20414 return;
20415
20416 char ConstraintLetter = Constraint[0];
20417 switch (ConstraintLetter) {
20418 default: break;
20419 case 'j':
20420 case 'I': case 'J': case 'K': case 'L':
20421 case 'M': case 'N': case 'O':
20422 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20423 if (!C)
20424 return;
20425
20426 int64_t CVal64 = C->getSExtValue();
20427 int CVal = (int) CVal64;
20428 // None of these constraints allow values larger than 32 bits. Check
20429 // that the value fits in an int.
20430 if (CVal != CVal64)
20431 return;
20432
20433 switch (ConstraintLetter) {
20434 case 'j':
20435 // Constant suitable for movw, must be between 0 and
20436 // 65535.
20437 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20438 if (CVal >= 0 && CVal <= 65535)
20439 break;
20440 return;
20441 case 'I':
20442 if (Subtarget->isThumb1Only()) {
20443 // This must be a constant between 0 and 255, for ADD
20444 // immediates.
20445 if (CVal >= 0 && CVal <= 255)
20446 break;
20447 } else if (Subtarget->isThumb2()) {
20448 // A constant that can be used as an immediate value in a
20449 // data-processing instruction.
20450 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20451 break;
20452 } else {
20453 // A constant that can be used as an immediate value in a
20454 // data-processing instruction.
20455 if (ARM_AM::getSOImmVal(CVal) != -1)
20456 break;
20457 }
20458 return;
20459
20460 case 'J':
20461 if (Subtarget->isThumb1Only()) {
20462 // This must be a constant between -255 and -1, for negated ADD
20463 // immediates. This can be used in GCC with an "n" modifier that
20464 // prints the negated value, for use with SUB instructions. It is
20465 // not useful otherwise but is implemented for compatibility.
20466 if (CVal >= -255 && CVal <= -1)
20467 break;
20468 } else {
20469 // This must be a constant between -4095 and 4095. It is not clear
20470 // what this constraint is intended for. Implemented for
20471 // compatibility with GCC.
20472 if (CVal >= -4095 && CVal <= 4095)
20473 break;
20474 }
20475 return;
20476
20477 case 'K':
20478 if (Subtarget->isThumb1Only()) {
20479 // A 32-bit value where only one byte has a nonzero value. Exclude
20480 // zero to match GCC. This constraint is used by GCC internally for
20481 // constants that can be loaded with a move/shift combination.
20482 // It is not useful otherwise but is implemented for compatibility.
20483 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20484 break;
20485 } else if (Subtarget->isThumb2()) {
20486 // A constant whose bitwise inverse can be used as an immediate
20487 // value in a data-processing instruction. This can be used in GCC
20488 // with a "B" modifier that prints the inverted value, for use with
20489 // BIC and MVN instructions. It is not useful otherwise but is
20490 // implemented for compatibility.
20491 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20492 break;
20493 } else {
20494 // A constant whose bitwise inverse can be used as an immediate
20495 // value in a data-processing instruction. This can be used in GCC
20496 // with a "B" modifier that prints the inverted value, for use with
20497 // BIC and MVN instructions. It is not useful otherwise but is
20498 // implemented for compatibility.
20499 if (ARM_AM::getSOImmVal(~CVal) != -1)
20500 break;
20501 }
20502 return;
20503
20504 case 'L':
20505 if (Subtarget->isThumb1Only()) {
20506 // This must be a constant between -7 and 7,
20507 // for 3-operand ADD/SUB immediate instructions.
20508 if (CVal >= -7 && CVal < 7)
20509 break;
20510 } else if (Subtarget->isThumb2()) {
20511 // A constant whose negation can be used as an immediate value in a
20512 // data-processing instruction. This can be used in GCC with an "n"
20513 // modifier that prints the negated value, for use with SUB
20514 // instructions. It is not useful otherwise but is implemented for
20515 // compatibility.
20516 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20517 break;
20518 } else {
20519 // A constant whose negation can be used as an immediate value in a
20520 // data-processing instruction. This can be used in GCC with an "n"
20521 // modifier that prints the negated value, for use with SUB
20522 // instructions. It is not useful otherwise but is implemented for
20523 // compatibility.
20524 if (ARM_AM::getSOImmVal(-CVal) != -1)
20525 break;
20526 }
20527 return;
20528
20529 case 'M':
20530 if (Subtarget->isThumb1Only()) {
20531 // This must be a multiple of 4 between 0 and 1020, for
20532 // ADD sp + immediate.
20533 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20534 break;
20535 } else {
20536 // A power of two or a constant between 0 and 32. This is used in
20537 // GCC for the shift amount on shifted register operands, but it is
20538 // useful in general for any shift amounts.
20539 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20540 break;
20541 }
20542 return;
20543
20544 case 'N':
20545 if (Subtarget->isThumb1Only()) {
20546 // This must be a constant between 0 and 31, for shift amounts.
20547 if (CVal >= 0 && CVal <= 31)
20548 break;
20549 }
20550 return;
20551
20552 case 'O':
20553 if (Subtarget->isThumb1Only()) {
20554 // This must be a multiple of 4 between -508 and 508, for
20555 // ADD/SUB sp = sp + immediate.
20556 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20557 break;
20558 }
20559 return;
20560 }
20561 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20562 break;
20563 }
20564
20565 if (Result.getNode()) {
20566 Ops.push_back(Result);
20567 return;
20568 }
20569 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20570}
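// Illustration only (assuming an ARM or Thumb-2 target): the immediate
// constraint letters validated above. 'I' requires a legal data-processing
// immediate, so 240 (0xF0) is accepted, whereas a value such as 257 would be
// rejected because it cannot be encoded as a modified immediate.
static int addModifiedImm(int A) {
  int R;
  asm("add %0, %1, %2" : "=r"(R) : "r"(A), "I"(240));
  return R;
}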
20571
20572static RTLIB::Libcall getDivRemLibcall(
20573 const SDNode *N, MVT::SimpleValueType SVT) {
20574 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20575 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20576 "Unhandled Opcode in getDivRemLibcall");
20577 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20578 N->getOpcode() == ISD::SREM;
20579 RTLIB::Libcall LC;
20580 switch (SVT) {
20581 default: llvm_unreachable("Unexpected request for libcall!");
20582 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20583 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20584 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20585 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20586 }
20587 return LC;
20588}
20589
20591 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20592 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20593 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20594 "Unhandled Opcode in getDivRemArgList");
20595 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20596 N->getOpcode() == ISD::SREM;
20597 TargetLowering::ArgListTy Args;
20598 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20599 EVT ArgVT = N->getOperand(i).getValueType();
20600 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20601 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20602 Entry.IsSExt = isSigned;
20603 Entry.IsZExt = !isSigned;
20604 Args.push_back(Entry);
20605 }
20606 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20607 std::swap(Args[0], Args[1]);
20608 return Args;
20609}
20610
20611SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20612 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20613 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20614 Subtarget->isTargetWindows()) &&
20615 "Register-based DivRem lowering only");
20616 unsigned Opcode = Op->getOpcode();
20617 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20618 "Invalid opcode for Div/Rem lowering");
20619 bool isSigned = (Opcode == ISD::SDIVREM);
20620 EVT VT = Op->getValueType(0);
20621 SDLoc dl(Op);
20622
20623 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20624 SmallVector<SDValue> Result;
20625 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20626 SDValue Res0 =
20627 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20628 SDValue Res1 =
20629 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20630 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20631 {Res0, Res1});
20632 }
20633 }
20634
20635 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20636
20637 // If the target has hardware divide, use divide + multiply + subtract:
20638 // div = a / b
20639 // rem = a - b * div
20640 // return {div, rem}
20641 // This should be lowered into UDIV/SDIV + MLS later on.
20642 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20643 : Subtarget->hasDivideInARMMode();
20644 if (hasDivide && Op->getValueType(0).isSimple() &&
20645 Op->getSimpleValueType(0) == MVT::i32) {
20646 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20647 const SDValue Dividend = Op->getOperand(0);
20648 const SDValue Divisor = Op->getOperand(1);
20649 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20650 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20651 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20652
20653 SDValue Values[2] = {Div, Rem};
20654 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20655 }
20656
20657 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20658 VT.getSimpleVT().SimpleTy);
20659 SDValue InChain = DAG.getEntryNode();
20660
20661 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20662 DAG.getContext(),
20663 Subtarget);
20664
20667
20668 Type *RetTy = StructType::get(Ty, Ty);
20669
20670 if (Subtarget->isTargetWindows())
20671 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20672
20673 TargetLowering::CallLoweringInfo CLI(DAG);
20674 CLI.setDebugLoc(dl).setChain(InChain)
20675 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20676 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20677
20678 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20679 return CallInfo.first;
20680}
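// A standalone sketch of the hardware-divide expansion chosen above: when
// SDIV/UDIV is available the remainder is recovered with a multiply-subtract,
// so no divmod libcall (e.g. __aeabi_idivmod on AEABI targets) is needed.
// Hypothetical helper for illustration only.
#include <cassert>

static void divRemExpansion(int A, int B, int &Div, int &Rem) {
  Div = A / B;       // sdiv
  Rem = A - B * Div; // mls
}

static void divRemExample() {
  int D, R;
  divRemExpansion(23, 5, D, R);
  assert(D == 4 && R == 3);
}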
20681
20682// Lowers REM using divmod helpers
20683// see RTABI section 4.2/4.3
20684SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20685 EVT VT = N->getValueType(0);
20686
20687 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20688 SmallVector<SDValue> Result;
20689 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20690 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20691 Result[0], Result[1]);
20692 }
20693
20694 // Build return types (div and rem)
20695 std::vector<Type*> RetTyParams;
20696 Type *RetTyElement;
20697
20698 switch (VT.getSimpleVT().SimpleTy) {
20699 default: llvm_unreachable("Unexpected request for libcall!");
20700 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20701 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20702 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20703 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20704 }
20705
20706 RetTyParams.push_back(RetTyElement);
20707 RetTyParams.push_back(RetTyElement);
20708 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20709 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20710
20711 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20712 SimpleTy);
20713 SDValue InChain = DAG.getEntryNode();
20714 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20715 Subtarget);
20716 bool isSigned = N->getOpcode() == ISD::SREM;
20719
20720 if (Subtarget->isTargetWindows())
20721 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20722
20723 // Lower call
20724 CallLoweringInfo CLI(DAG);
20725 CLI.setChain(InChain)
20726 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20728 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20729
20730 // Return second (rem) result operand (first contains div)
20731 SDNode *ResNode = CallResult.first.getNode();
20732 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20733 return ResNode->getOperand(1);
20734}
20735
20736SDValue
20737ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20738 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20739 SDLoc DL(Op);
20740
20741 // Get the inputs.
20742 SDValue Chain = Op.getOperand(0);
20743 SDValue Size = Op.getOperand(1);
20744
20746 "no-stack-arg-probe")) {
20747 MaybeAlign Align =
20748 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20749 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20750 Chain = SP.getValue(1);
20751 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20752 if (Align)
20753 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20754 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20755 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20756 SDValue Ops[2] = { SP, Chain };
20757 return DAG.getMergeValues(Ops, DL);
20758 }
20759
20760 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20761 DAG.getConstant(2, DL, MVT::i32));
20762
20763 SDValue Glue;
20764 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20765 Glue = Chain.getValue(1);
20766
20767 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20768 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20769
20770 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20771 Chain = NewSP.getValue(1);
20772
20773 SDValue Ops[2] = { NewSP, Chain };
20774 return DAG.getMergeValues(Ops, DL);
20775}
20776
20777SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20778 bool IsStrict = Op->isStrictFPOpcode();
20779 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20780 const unsigned DstSz = Op.getValueType().getSizeInBits();
20781 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20782 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20783 "Unexpected type for custom-lowering FP_EXTEND");
20784
20785 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20786 "With both FP DP and 16, any FP conversion is legal!");
20787
20788 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20789 "With FP16, 16 to 32 conversion is legal!");
20790
20791 // Converting from 32 -> 64 is valid if we have FP64.
20792 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20793 // FIXME: Remove this when we have strict fp instruction selection patterns
20794 if (IsStrict) {
20795 SDLoc Loc(Op);
20796 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20797 Loc, Op.getValueType(), SrcVal);
20798 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20799 }
20800 return Op;
20801 }
20802
20803 // Either we are converting from 16 -> 64 without FP16 and/or without
20804 // double-precision FP (or without Armv8 FP), so we must do it in two
20805 // steps.
20806 // Or we are converting from 32 -> 64 without double-precision FP, or from
20807 // 16 -> 32 without FP16, so we must make a libcall.
20808 SDLoc Loc(Op);
20809 RTLIB::Libcall LC;
20810 MakeLibCallOptions CallOptions;
20811 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20812 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20813 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20814 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20815 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20816 if (Supported) {
20817 if (IsStrict) {
20818 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20819 {DstVT, MVT::Other}, {Chain, SrcVal});
20820 Chain = SrcVal.getValue(1);
20821 } else {
20822 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20823 }
20824 } else {
20825 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20826 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20827 "Unexpected type for custom-lowering FP_EXTEND");
20828 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20829 Loc, Chain);
20830 }
20831 }
20832
20833 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20834}
20835
20836SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20837 bool IsStrict = Op->isStrictFPOpcode();
20838
20839 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20840 EVT SrcVT = SrcVal.getValueType();
20841 EVT DstVT = Op.getValueType();
20842 const unsigned DstSz = Op.getValueType().getSizeInBits();
20843 const unsigned SrcSz = SrcVT.getSizeInBits();
20844 (void)DstSz;
20845 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20846 "Unexpected type for custom-lowering FP_ROUND");
20847
20848 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20849 "With both FP DP and 16, any FP conversion is legal!");
20850
20851 SDLoc Loc(Op);
20852
20853 // Instruction from 32 -> 16 if hasFP16 is valid
20854 if (SrcSz == 32 && Subtarget->hasFP16())
20855 return Op;
20856
20857 // Lib call from 32 -> 16 / 64 -> [32, 16]
20858 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20859 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20860 "Unexpected type for custom-lowering FP_ROUND");
20861 MakeLibCallOptions CallOptions;
20862 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20863 SDValue Result;
20864 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20865 Loc, Chain);
20866 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20867}
20868
20869bool
20870 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20871 // The ARM target isn't yet aware of offsets.
20872 return false;
20873}
20874
20875 bool ARM::isBitFieldInvertedMask(unsigned v) {
20876 if (v == 0xffffffff)
20877 return false;
20878
20879 // there can be 1's on either or both "outsides", all the "inside"
20880 // bits must be 0's
20881 return isShiftedMask_32(~v);
20882}
20883
20884/// isFPImmLegal - Returns true if the target can instruction select the
20885/// specified FP immediate natively. If false, the legalizer will
20886/// materialize the FP immediate as a load from a constant pool.
20887 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20888 bool ForCodeSize) const {
20889 if (!Subtarget->hasVFP3Base())
20890 return false;
20891 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20892 return ARM_AM::getFP16Imm(Imm) != -1;
20893 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20894 ARM_AM::getFP32FP16Imm(Imm) != -1)
20895 return true;
20896 if (VT == MVT::f32)
20897 return ARM_AM::getFP32Imm(Imm) != -1;
20898 if (VT == MVT::f64 && Subtarget->hasFP64())
20899 return ARM_AM::getFP64Imm(Imm) != -1;
20900 return false;
20901}
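// A standalone sketch of the VMOV floating-point immediate range used above:
// the encodable constants are +/- n * 2^-r with n in [16, 31] and r in [0, 7]
// (magnitudes from 0.125 to 31.0). The helper mirrors the rule rather than the
// exact encoder and is shown for illustration only.
#include <cassert>
#include <cmath>

static bool isVMOVFPImm(double V) {
  if (V == 0.0 || std::isnan(V) || std::isinf(V))
    return false;
  double M = std::fabs(V);
  for (int R = 0; R <= 7; ++R)
    for (int N = 16; N <= 31; ++N)
      if (M == std::ldexp((double)N, -R))
        return true;
  return false;
}

static void fpImmExamples() {
  assert(isVMOVFPImm(1.0));   // 16 * 2^-4
  assert(isVMOVFPImm(0.5));   // 16 * 2^-5
  assert(isVMOVFPImm(31.0));  // 31 * 2^0
  assert(!isVMOVFPImm(32.0)); // exponent out of range
  assert(!isVMOVFPImm(0.1));  // not exactly representable in this form
}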
20902
20903/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20904/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20905/// specified in the intrinsic calls.
20906 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20907 const CallInst &I,
20908 MachineFunction &MF,
20909 unsigned Intrinsic) const {
20910 switch (Intrinsic) {
20911 case Intrinsic::arm_neon_vld1:
20912 case Intrinsic::arm_neon_vld2:
20913 case Intrinsic::arm_neon_vld3:
20914 case Intrinsic::arm_neon_vld4:
20915 case Intrinsic::arm_neon_vld2lane:
20916 case Intrinsic::arm_neon_vld3lane:
20917 case Intrinsic::arm_neon_vld4lane:
20918 case Intrinsic::arm_neon_vld2dup:
20919 case Intrinsic::arm_neon_vld3dup:
20920 case Intrinsic::arm_neon_vld4dup: {
20922 // Conservatively set memVT to the entire set of vectors loaded.
20923 auto &DL = I.getDataLayout();
20924 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20925 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20926 Info.ptrVal = I.getArgOperand(0);
20927 Info.offset = 0;
20928 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20929 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20930 // volatile loads with NEON intrinsics not supported
20932 return true;
20933 }
20934 case Intrinsic::arm_neon_vld1x2:
20935 case Intrinsic::arm_neon_vld1x3:
20936 case Intrinsic::arm_neon_vld1x4: {
20938 // Conservatively set memVT to the entire set of vectors loaded.
20939 auto &DL = I.getDataLayout();
20940 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20941 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20942 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20943 Info.offset = 0;
20944 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20945 // volatile loads with NEON intrinsics not supported
20947 return true;
20948 }
20949 case Intrinsic::arm_neon_vst1:
20950 case Intrinsic::arm_neon_vst2:
20951 case Intrinsic::arm_neon_vst3:
20952 case Intrinsic::arm_neon_vst4:
20953 case Intrinsic::arm_neon_vst2lane:
20954 case Intrinsic::arm_neon_vst3lane:
20955 case Intrinsic::arm_neon_vst4lane: {
20957 // Conservatively set memVT to the entire set of vectors stored.
20958 auto &DL = I.getDataLayout();
20959 unsigned NumElts = 0;
20960 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20961 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20962 if (!ArgTy->isVectorTy())
20963 break;
20964 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20965 }
20966 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20967 Info.ptrVal = I.getArgOperand(0);
20968 Info.offset = 0;
20969 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20970 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20971 // volatile stores with NEON intrinsics not supported
20973 return true;
20974 }
20975 case Intrinsic::arm_neon_vst1x2:
20976 case Intrinsic::arm_neon_vst1x3:
20977 case Intrinsic::arm_neon_vst1x4: {
20979 // Conservatively set memVT to the entire set of vectors stored.
20980 auto &DL = I.getDataLayout();
20981 unsigned NumElts = 0;
20982 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20983 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20984 if (!ArgTy->isVectorTy())
20985 break;
20986 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20987 }
20988 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20989 Info.ptrVal = I.getArgOperand(0);
20990 Info.offset = 0;
20991 Info.align = I.getParamAlign(0).valueOrOne();
20992 // volatile stores with NEON intrinsics not supported
20994 return true;
20995 }
20996 case Intrinsic::arm_mve_vld2q:
20997 case Intrinsic::arm_mve_vld4q: {
20999 // Conservatively set memVT to the entire set of vectors loaded.
21000 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21001 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21002 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21003 Info.ptrVal = I.getArgOperand(0);
21004 Info.offset = 0;
21005 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21006 // volatile loads with MVE intrinsics not supported
21008 return true;
21009 }
21010 case Intrinsic::arm_mve_vst2q:
21011 case Intrinsic::arm_mve_vst4q: {
21013 // Conservatively set memVT to the entire set of vectors stored.
21014 Type *VecTy = I.getArgOperand(1)->getType();
21015 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21016 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21017 Info.ptrVal = I.getArgOperand(0);
21018 Info.offset = 0;
21019 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21020 // volatile stores with MVE intrinsics not supported
21022 return true;
21023 }
21024 case Intrinsic::arm_mve_vldr_gather_base:
21025 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21027 Info.ptrVal = nullptr;
21028 Info.memVT = MVT::getVT(I.getType());
21029 Info.align = Align(1);
21031 return true;
21032 }
21033 case Intrinsic::arm_mve_vldr_gather_base_wb:
21034 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21036 Info.ptrVal = nullptr;
21037 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21038 Info.align = Align(1);
21040 return true;
21041 }
21042 case Intrinsic::arm_mve_vldr_gather_offset:
21043 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21045 Info.ptrVal = nullptr;
21046 MVT DataVT = MVT::getVT(I.getType());
21047 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21048 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21049 DataVT.getVectorNumElements());
21050 Info.align = Align(1);
21052 return true;
21053 }
21054 case Intrinsic::arm_mve_vstr_scatter_base:
21055 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21057 Info.ptrVal = nullptr;
21058 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21059 Info.align = Align(1);
21061 return true;
21062 }
21063 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21064 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21066 Info.ptrVal = nullptr;
21067 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21068 Info.align = Align(1);
21070 return true;
21071 }
21072 case Intrinsic::arm_mve_vstr_scatter_offset:
21073 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21075 Info.ptrVal = nullptr;
21076 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21077 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21078 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21079 DataVT.getVectorNumElements());
21080 Info.align = Align(1);
21082 return true;
21083 }
21084 case Intrinsic::arm_ldaex:
21085 case Intrinsic::arm_ldrex: {
21086 auto &DL = I.getDataLayout();
21087 Type *ValTy = I.getParamElementType(0);
21089 Info.memVT = MVT::getVT(ValTy);
21090 Info.ptrVal = I.getArgOperand(0);
21091 Info.offset = 0;
21092 Info.align = DL.getABITypeAlign(ValTy);
21094 return true;
21095 }
21096 case Intrinsic::arm_stlex:
21097 case Intrinsic::arm_strex: {
21098 auto &DL = I.getDataLayout();
21099 Type *ValTy = I.getParamElementType(1);
21101 Info.memVT = MVT::getVT(ValTy);
21102 Info.ptrVal = I.getArgOperand(1);
21103 Info.offset = 0;
21104 Info.align = DL.getABITypeAlign(ValTy);
21106 return true;
21107 }
21108 case Intrinsic::arm_stlexd:
21109 case Intrinsic::arm_strexd:
21111 Info.memVT = MVT::i64;
21112 Info.ptrVal = I.getArgOperand(2);
21113 Info.offset = 0;
21114 Info.align = Align(8);
21116 return true;
21117
21118 case Intrinsic::arm_ldaexd:
21119 case Intrinsic::arm_ldrexd:
21121 Info.memVT = MVT::i64;
21122 Info.ptrVal = I.getArgOperand(0);
21123 Info.offset = 0;
21124 Info.align = Align(8);
21126 return true;
21127
21128 default:
21129 break;
21130 }
21131
21132 return false;
21133}
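// For example: for a call to llvm.arm.neon.vst1x2 storing two <4 x i32>
// vectors, the loop above sums 256 bits of vector operands, so NumElts
// becomes 256 / 64 = 4 and the reported memVT is v4i64, i.e. the entire
// 32 bytes written by the intrinsic.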
21134
21135/// Returns true if it is beneficial to convert a load of a constant
21136/// to just the constant itself.
21137 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
21138 Type *Ty) const {
21139 assert(Ty->isIntegerTy());
21140
21141 unsigned Bits = Ty->getPrimitiveSizeInBits();
21142 if (Bits == 0 || Bits > 32)
21143 return false;
21144 return true;
21145}
21146
21147 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
21148 unsigned Index) const {
21149 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
21150 return false;
21151
21152 return (Index == 0 || Index == ResVT.getVectorNumElements());
21153}
21154
21156 ARM_MB::MemBOpt Domain) const {
21157 // First, if the target has no DMB, see what fallback we can use.
21158 if (!Subtarget->hasDataBarrier()) {
21159 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21160 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21161 // here.
21162 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21163 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21164 Builder.getInt32(0), Builder.getInt32(7),
21165 Builder.getInt32(10), Builder.getInt32(5)};
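// These operands encode "mcr p15, 0, <Rt>, c7, c10, 5" (coproc 15, opc1 0,
// a zero source value, CRn c7, CRm c10, opc2 5), the CP15 write that acts
// as a data memory barrier on ARMv6.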
21166 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21167 } else {
21168 // Instead of using barriers, atomic accesses on these subtargets use
21169 // libcalls.
21170 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21171 }
21172 } else {
21173 // Only a full system barrier exists in the M-class architectures.
21174 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21175 Constant *CDomain = Builder.getInt32(Domain);
21176 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21177 }
21178}
21179
21180// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21182 Instruction *Inst,
21183 AtomicOrdering Ord) const {
21184 switch (Ord) {
21187 llvm_unreachable("Invalid fence: unordered/non-atomic");
21190 return nullptr; // Nothing to do
21192 if (!Inst->hasAtomicStore())
21193 return nullptr; // Nothing to do
21194 [[fallthrough]];
21197 if (Subtarget->preferISHSTBarriers())
21198 return makeDMB(Builder, ARM_MB::ISHST);
21199 // FIXME: add a comment with a link to documentation justifying this.
21200 else
21201 return makeDMB(Builder, ARM_MB::ISH);
21202 }
21203 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21204}
21205
21207 Instruction *Inst,
21208 AtomicOrdering Ord) const {
21209 switch (Ord) {
21212 llvm_unreachable("Invalid fence: unordered/not-atomic");
21215 return nullptr; // Nothing to do
21219 return makeDMB(Builder, ARM_MB::ISH);
21220 }
21221 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21222}
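// Taken together, and assuming the usual barrier-based mapping that the link
// above describes, these two hooks give e.g. "ldr; dmb ish" for an acquire
// load (no leading fence) and "dmb ish; str; dmb ish" for a seq_cst store,
// with the leading barrier weakened to "dmb ishst" on subtargets that prefer
// ISHST store barriers.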
21223
21224 // Loads and stores smaller than 64 bits are already atomic; larger ones
21225 // are doomed anyway, so defer to the default libcall and blame the OS when
21226 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21227 // anything for those.
21230 bool has64BitAtomicStore;
21231 if (Subtarget->isMClass())
21232 has64BitAtomicStore = false;
21233 else if (Subtarget->isThumb())
21234 has64BitAtomicStore = Subtarget->hasV7Ops();
21235 else
21236 has64BitAtomicStore = Subtarget->hasV6Ops();
21237
21238 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21239 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21240 : AtomicExpansionKind::None;
21241}
21242
21243 // Loads and stores smaller than 64 bits are already atomic; larger ones
21244 // are doomed anyway, so defer to the default libcall and blame the OS when
21245 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21246 // anything for those.
21247 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. the A15 has that
21248 // guarantee; see DDI0406C, ARM Architecture Reference Manual,
21249 // sections A8.8.72-74 LDRD).
21252 bool has64BitAtomicLoad;
21253 if (Subtarget->isMClass())
21254 has64BitAtomicLoad = false;
21255 else if (Subtarget->isThumb())
21256 has64BitAtomicLoad = Subtarget->hasV7Ops();
21257 else
21258 has64BitAtomicLoad = Subtarget->hasV6Ops();
21259
21260 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21261 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21262 : AtomicExpansionKind::None;
21263}
21264
21265// For the real atomic operations, we have ldrex/strex up to 32 bits,
21266// and up to 64 bits on the non-M profiles
21269 if (AI->isFloatingPointOperation())
21270 return AtomicExpansionKind::CmpXChg;
21271
21272 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21273 bool hasAtomicRMW;
21274 if (Subtarget->isMClass())
21275 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21276 else if (Subtarget->isThumb())
21277 hasAtomicRMW = Subtarget->hasV7Ops();
21278 else
21279 hasAtomicRMW = Subtarget->hasV6Ops();
21280 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21281 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21282 // implement atomicrmw without spilling. If the target address is also on
21283 // the stack and close enough to the spill slot, this can lead to a
21284 // situation where the monitor always gets cleared and the atomic operation
21285 // can never succeed. So at -O0 lower this operation to a CAS loop.
21286 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21287 return AtomicExpansionKind::CmpXChg;
21288 return AtomicExpansionKind::LLSC;
21289 }
21290 return AtomicExpansionKind::None;
21291}
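// A sketch of the kind of loop the LLSC expansion produces for, e.g.,
// "atomicrmw add ptr %addr, i32 %val seq_cst" (register names illustrative):
//   loop:
//     %old = call i32 @llvm.arm.ldrex.p0(ptr %addr)
//     %new = add i32 %old, %val
//     %fail = call i32 @llvm.arm.strex.p0(i32 %new, ptr %addr)
//     %retry = icmp ne i32 %fail, 0
//     br i1 %retry, label %loop, label %done
// AtomicExpandPass builds this using the emitLoadLinked /
// emitStoreConditional hooks defined later in this file.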
21292
21293// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21294// bits, and up to 64 bits on the non-M profiles.
21297 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21298 // implement cmpxchg without spilling. If the address being exchanged is also
21299 // on the stack and close enough to the spill slot, this can lead to a
21300 // situation where the monitor always gets cleared and the atomic operation
21301 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21302 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21303 bool HasAtomicCmpXchg;
21304 if (Subtarget->isMClass())
21305 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21306 else if (Subtarget->isThumb())
21307 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21308 else
21309 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21310 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21311 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21312 return AtomicExpansionKind::LLSC;
21313 return AtomicExpansionKind::None;
21314}
21315
21317 const Instruction *I) const {
21318 return InsertFencesForAtomic;
21319}
21320
21322 // ROPI/RWPI are not supported currently.
21323 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21324}
21325
21327 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21328 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21329 if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
21331
21332 // MSVC CRT has a global variable holding security cookie.
21333 M.getOrInsertGlobal("__security_cookie",
21334 PointerType::getUnqual(M.getContext()));
21335
21336 // MSVC CRT has a function to validate security cookie.
21337 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21338 getLibcallImplName(SecurityCheckCookieLibcall),
21339 Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
21340 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21341 F->addParamAttr(0, Attribute::AttrKind::InReg);
21342}
21343
21345 // MSVC CRT has a function to validate security cookie.
21346 RTLIB::LibcallImpl SecurityCheckCookie =
21347 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21348 if (SecurityCheckCookie != RTLIB::Unsupported)
21349 return M.getFunction(getLibcallImplName(SecurityCheckCookie));
21351}
21352
21354 unsigned &Cost) const {
21355 // If we do not have NEON, vector types are not natively supported.
21356 if (!Subtarget->hasNEON())
21357 return false;
21358
21359 // Floating point values and vector values map to the same register file.
21360 // Therefore, although we could do a store + extract on a vector type, it is
21361 // better to leave it as a float, since we have more freedom in the addressing
21362 // modes for floating-point values.
21363 if (VectorTy->isFPOrFPVectorTy())
21364 return false;
21365
21366 // If the index is unknown at compile time, this is very expensive to lower
21367 // and it is not possible to combine the store with the extract.
21368 if (!isa<ConstantInt>(Idx))
21369 return false;
21370
21371 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21372 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21373 // We can do a store + vector extract on any vector that fits perfectly in a D
21374 // or Q register.
21375 if (BitWidth == 64 || BitWidth == 128) {
21376 Cost = 0;
21377 return true;
21378 }
21379 return false;
21380}
21381
21383 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21384}
21385
21387 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21388}
21389
21391 const Instruction &AndI) const {
21392 if (!Subtarget->hasV7Ops())
21393 return false;
21394
21395 // Sink the `and` instruction only if the mask would fit into a modified
21396 // immediate operand.
21397 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21398 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21399 return false;
21400 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21401 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21402 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21403}
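// For example, a mask such as 0x00ff0000 is an 8-bit value rotated by an even
// amount, so it is a valid ARM/Thumb-2 modified immediate and the 'and' is
// sunk next to its user, whereas a mask like 0x12345678 has no such encoding
// and the hook returns false.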
21404
21407 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21408 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21411 ExpansionFactor);
21412}
21413
21415 Value *Addr,
21416 AtomicOrdering Ord) const {
21417 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21418 bool IsAcquire = isAcquireOrStronger(Ord);
21419
21420 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21421 // intrinsic must return {i32, i32} and we have to recombine them into a
21422 // single i64 here.
21423 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21425 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21426
21427 Value *LoHi =
21428 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21429
21430 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21431 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21432 if (!Subtarget->isLittle())
21433 std::swap (Lo, Hi);
21434 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21435 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21436 return Builder.CreateOr(
21437 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21438 }
21439
21440 Type *Tys[] = { Addr->getType() };
21441 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21442 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21443
21444 CI->addParamAttr(
21445 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21446 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21447}
21448
21450 IRBuilderBase &Builder) const {
21451 if (!Subtarget->hasV7Ops())
21452 return;
21453 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21454}
21455
21457 Value *Val, Value *Addr,
21458 AtomicOrdering Ord) const {
21459 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21460 bool IsRelease = isReleaseOrStronger(Ord);
21461
21462 // Since the intrinsics must have legal type, the i64 intrinsics take two
21463 // parameters: "i32, i32". We must marshal Val into the appropriate form
21464 // before the call.
21465 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21467 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21468 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21469
21470 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21471 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21472 if (!Subtarget->isLittle())
21473 std::swap(Lo, Hi);
21474 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21475 }
21476
21477 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21478 Type *Tys[] = { Addr->getType() };
21480
21481 CallInst *CI = Builder.CreateCall(
21482 Strex, {Builder.CreateZExtOrBitCast(
21483 Val, Strex->getFunctionType()->getParamType(0)),
21484 Addr});
21485 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21486 Val->getType()));
21487 return CI;
21488}
21489
21490
21492 return Subtarget->isMClass();
21493}
21494
21495/// A helper function for determining the number of interleaved accesses we
21496/// will generate when lowering accesses of the given type.
21497unsigned
21499 const DataLayout &DL) const {
21500 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21501}
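// For example, a <16 x i32> vector occupies 512 bits, giving
// (512 + 127) / 128 = 4 interleaved accesses, while a 64-bit <8 x i8>
// vector needs only one.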
21502
21504 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21505 const DataLayout &DL) const {
21506
21507 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21508 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21509
21510 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21511 return false;
21512
21513 // Ensure the vector doesn't have f16 elements. Even though we could do an
21514 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21515 // f32.
21516 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21517 return false;
21518 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21519 return false;
21520
21521 // Ensure the number of vector elements is greater than 1.
21522 if (VecTy->getNumElements() < 2)
21523 return false;
21524
21525 // Ensure the element type is legal.
21526 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21527 return false;
21528 // And check that the alignment is high enough under MVE.
21529 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21530 return false;
21531
21532 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21533 // 128 will be split into multiple interleaved accesses.
21534 if (Subtarget->hasNEON() && VecSize == 64)
21535 return true;
21536 return VecSize % 128 == 0;
21537}
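// For instance, with NEON a factor-2 interleave of <8 x i16> sub-vectors
// (128 bits each) is legal, whereas <4 x half> sub-vectors are rejected by
// the f16 check above, and under MVE a factor of 3 is always rejected since
// there is no VLD3/VST3 equivalent.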
21538
21540 if (Subtarget->hasNEON())
21541 return 4;
21542 if (Subtarget->hasMVEIntegerOps())
21545}
21546
21547/// Lower an interleaved load into a vldN intrinsic.
21548///
21549/// E.g. Lower an interleaved load (Factor = 2):
21550/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21551/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21552/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21553///
21554/// Into:
21555/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21556/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21557/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21559 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21560 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21561 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21562 "Invalid interleave factor");
21563 assert(!Shuffles.empty() && "Empty shufflevector input");
21564 assert(Shuffles.size() == Indices.size() &&
21565 "Unmatched number of shufflevectors and indices");
21566
21567 auto *LI = dyn_cast<LoadInst>(Load);
21568 if (!LI)
21569 return false;
21570 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21571
21572 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21573 Type *EltTy = VecTy->getElementType();
21574
21575 const DataLayout &DL = LI->getDataLayout();
21576 Align Alignment = LI->getAlign();
21577
21578 // Skip if we do not have NEON, and skip illegal vector types. We can
21579 // "legalize" wide vector types into multiple interleaved accesses as long as
21580 // the vector size in bits is a multiple of 128.
21581 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21582 return false;
21583
21584 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21585
21586 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21587 // load integer vectors first and then convert to pointer vectors.
21588 if (EltTy->isPointerTy())
21589 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21590
21591 IRBuilder<> Builder(LI);
21592
21593 // The base address of the load.
21594 Value *BaseAddr = LI->getPointerOperand();
21595
21596 if (NumLoads > 1) {
21597 // If we're going to generate more than one load, reset the sub-vector type
21598 // to something legal.
21599 VecTy = FixedVectorType::get(VecTy->getElementType(),
21600 VecTy->getNumElements() / NumLoads);
21601 }
21602
21603 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21604
21605 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21606 if (Subtarget->hasNEON()) {
21607 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21608 Type *Tys[] = {VecTy, PtrTy};
21609 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21610 Intrinsic::arm_neon_vld3,
21611 Intrinsic::arm_neon_vld4};
21612
21614 Ops.push_back(BaseAddr);
21615 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21616
21617 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21618 /*FMFSource=*/nullptr, "vldN");
21619 } else {
21620 assert((Factor == 2 || Factor == 4) &&
21621 "expected interleave factor of 2 or 4 for MVE");
21622 Intrinsic::ID LoadInts =
21623 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21624 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21625 Type *Tys[] = {VecTy, PtrTy};
21626
21628 Ops.push_back(BaseAddr);
21629 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21630 "vldN");
21631 }
21632 };
21633
21634 // Holds sub-vectors extracted from the load intrinsic return values. The
21635 // sub-vectors are associated with the shufflevector instructions they will
21636 // replace.
21638
21639 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21640 // If we're generating more than one load, compute the base address of
21641 // subsequent loads as an offset from the previous.
21642 if (LoadCount > 0)
21643 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21644 VecTy->getNumElements() * Factor);
21645
21646 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21647
21648 // Replace uses of each shufflevector with the corresponding vector loaded
21649 // by ldN.
21650 for (unsigned i = 0; i < Shuffles.size(); i++) {
21651 ShuffleVectorInst *SV = Shuffles[i];
21652 unsigned Index = Indices[i];
21653
21654 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21655
21656 // Convert the integer vector to pointer vector if the element is pointer.
21657 if (EltTy->isPointerTy())
21658 SubVec = Builder.CreateIntToPtr(
21659 SubVec,
21661
21662 SubVecs[SV].push_back(SubVec);
21663 }
21664 }
21665
21666 // Replace uses of the shufflevector instructions with the sub-vectors
21667 // returned by the load intrinsic. If a shufflevector instruction is
21668 // associated with more than one sub-vector, those sub-vectors will be
21669 // concatenated into a single wide vector.
21670 for (ShuffleVectorInst *SVI : Shuffles) {
21671 auto &SubVec = SubVecs[SVI];
21672 auto *WideVec =
21673 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21674 SVI->replaceAllUsesWith(WideVec);
21675 }
21676
21677 return true;
21678}
21679
21680/// Lower an interleaved store into a vstN intrinsic.
21681///
21682/// E.g. Lower an interleaved store (Factor = 3):
21683/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21684/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21685/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21686///
21687/// Into:
21688/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21689/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21690/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21691/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21692///
21693/// Note that the new shufflevectors will be removed and we'll only generate one
21694/// vst3 instruction in CodeGen.
21695///
21696/// Example for a more general valid mask (Factor 3). Lower:
21697/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21698/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21699/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21700///
21701/// Into:
21702/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21703/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21704/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21705/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21707 Value *LaneMask,
21708 ShuffleVectorInst *SVI,
21709 unsigned Factor,
21710 const APInt &GapMask) const {
21711 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21712 "Invalid interleave factor");
21713 auto *SI = dyn_cast<StoreInst>(Store);
21714 if (!SI)
21715 return false;
21716 assert(!LaneMask && GapMask.popcount() == Factor &&
21717 "Unexpected mask on store");
21718
21719 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21720 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21721
21722 unsigned LaneLen = VecTy->getNumElements() / Factor;
21723 Type *EltTy = VecTy->getElementType();
21724 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21725
21726 const DataLayout &DL = SI->getDataLayout();
21727 Align Alignment = SI->getAlign();
21728
21729 // Skip if we do not have NEON, and skip illegal vector types. We can
21730 // "legalize" wide vector types into multiple interleaved accesses as long as
21731 // the vector size in bits is a multiple of 128.
21732 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21733 return false;
21734
21735 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21736
21737 Value *Op0 = SVI->getOperand(0);
21738 Value *Op1 = SVI->getOperand(1);
21739 IRBuilder<> Builder(SI);
21740
21741 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21742 // vectors to integer vectors.
21743 if (EltTy->isPointerTy()) {
21744 Type *IntTy = DL.getIntPtrType(EltTy);
21745
21746 // Convert to the corresponding integer vector.
21747 auto *IntVecTy =
21748 FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
21749 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21750 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21751
21752 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21753 }
21754
21755 // The base address of the store.
21756 Value *BaseAddr = SI->getPointerOperand();
21757
21758 if (NumStores > 1) {
21759 // If we're going to generate more than one store, reset the lane length
21760 // and sub-vector type to something legal.
21761 LaneLen /= NumStores;
21762 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21763 }
21764
21765 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21766
21767 auto Mask = SVI->getShuffleMask();
21768
21769 auto createStoreIntrinsic = [&](Value *BaseAddr,
21770 SmallVectorImpl<Value *> &Shuffles) {
21771 if (Subtarget->hasNEON()) {
21772 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21773 Intrinsic::arm_neon_vst3,
21774 Intrinsic::arm_neon_vst4};
21775 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21776 Type *Tys[] = {PtrTy, SubVecTy};
21777
21779 Ops.push_back(BaseAddr);
21780 append_range(Ops, Shuffles);
21781 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21782 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21783 } else {
21784 assert((Factor == 2 || Factor == 4) &&
21785 "expected interleave factor of 2 or 4 for MVE");
21786 Intrinsic::ID StoreInts =
21787 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21788 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21789 Type *Tys[] = {PtrTy, SubVecTy};
21790
21792 Ops.push_back(BaseAddr);
21793 append_range(Ops, Shuffles);
21794 for (unsigned F = 0; F < Factor; F++) {
21795 Ops.push_back(Builder.getInt32(F));
21796 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21797 Ops.pop_back();
21798 }
21799 }
21800 };
21801
21802 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21803 // If we're generating more than one store, compute the base address of
21804 // subsequent stores as an offset from the previous one.
21805 if (StoreCount > 0)
21806 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21807 BaseAddr, LaneLen * Factor);
21808
21809 SmallVector<Value *, 4> Shuffles;
21810
21811 // Split the shufflevector operands into sub vectors for the new vstN call.
21812 for (unsigned i = 0; i < Factor; i++) {
21813 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21814 if (Mask[IdxI] >= 0) {
21815 Shuffles.push_back(Builder.CreateShuffleVector(
21816 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21817 } else {
21818 unsigned StartMask = 0;
21819 for (unsigned j = 1; j < LaneLen; j++) {
21820 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21821 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21822 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21823 break;
21824 }
21825 }
21826 // Note: If all elements in a chunk are undefs, StartMask=0!
21827 // Note: Filling undef gaps with arbitrary elements is OK, since
21828 // those elements were being written anyway (with undefs).
21829 // In the all-undef case we default to using elements from index 0.
21830 // Note: StartMask cannot be negative; that is checked in
21831 // isReInterleaveMask.
21832 Shuffles.push_back(Builder.CreateShuffleVector(
21833 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21834 }
21835 }
21836
21837 createStoreIntrinsic(BaseAddr, Shuffles);
21838 }
21839 return true;
21840}
21841
21842 enum HABaseType {
21843 HA_UNKNOWN = 0,
21844 HA_FLOAT,
21845 HA_DOUBLE,
21846 HA_VECT64,
21847 HA_VECT128
21848 };
21849
21850 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21851 uint64_t &Members) {
21852 if (auto *ST = dyn_cast<StructType>(Ty)) {
21853 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21854 uint64_t SubMembers = 0;
21855 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21856 return false;
21857 Members += SubMembers;
21858 }
21859 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21860 uint64_t SubMembers = 0;
21861 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21862 return false;
21863 Members += SubMembers * AT->getNumElements();
21864 } else if (Ty->isFloatTy()) {
21865 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21866 return false;
21867 Members = 1;
21868 Base = HA_FLOAT;
21869 } else if (Ty->isDoubleTy()) {
21870 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21871 return false;
21872 Members = 1;
21873 Base = HA_DOUBLE;
21874 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21875 Members = 1;
21876 switch (Base) {
21877 case HA_FLOAT:
21878 case HA_DOUBLE:
21879 return false;
21880 case HA_VECT64:
21881 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21882 case HA_VECT128:
21883 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21884 case HA_UNKNOWN:
21885 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21886 case 64:
21887 Base = HA_VECT64;
21888 return true;
21889 case 128:
21890 Base = HA_VECT128;
21891 return true;
21892 default:
21893 return false;
21894 }
21895 }
21896 }
21897
21898 return (Members > 0 && Members <= 4);
21899}
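// For example, "struct { float x, y, z; }" is a homogeneous aggregate with
// Base == HA_FLOAT and Members == 3; a struct mixing float and double members
// fails because the base types differ, and any aggregate with more than four
// members is rejected by the final check.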
21900
21901/// Return the correct alignment for the current calling convention.
21903 Type *ArgTy, const DataLayout &DL) const {
21904 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21905 if (!ArgTy->isVectorTy())
21906 return ABITypeAlign;
21907
21908 // Avoid over-aligning vector parameters. It would require realigning the
21909 // stack and waste space for no real benefit.
21910 MaybeAlign StackAlign = DL.getStackAlignment();
21911 assert(StackAlign && "data layout string is missing stack alignment");
21912 return std::min(ABITypeAlign, *StackAlign);
21913}
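// For example, assuming the usual AAPCS data layout with an 8-byte stack
// alignment, a <4 x i32> argument has a 16-byte ABI type alignment but is
// clamped to 8 bytes here, so passing it never forces stack realignment.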
21914
21915/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21916/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21917/// passing according to AAPCS rules.
21919 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21920 const DataLayout &DL) const {
21921 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21923 return false;
21924
21926 uint64_t Members = 0;
21927 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21928 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21929
21930 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21931 return IsHA || IsIntArray;
21932}
21933
21935 const Constant *PersonalityFn) const {
21936 // Platforms which do not use SjLj EH may return values in these registers
21937 // via the personality function.
21939 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21940}
21941
21943 const Constant *PersonalityFn) const {
21944 // Platforms which do not use SjLj EH may return values in these registers
21945 // via the personality function.
21947 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21948}
21949
21950void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21951 // Update IsSplitCSR in ARMFunctionInfo.
21952 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21953 AFI->setIsSplitCSR(true);
21954}
21955
21956void ARMTargetLowering::insertCopiesSplitCSR(
21957 MachineBasicBlock *Entry,
21958 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21959 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21960 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21961 if (!IStart)
21962 return;
21963
21964 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21965 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21966 MachineBasicBlock::iterator MBBI = Entry->begin();
21967 for (const MCPhysReg *I = IStart; *I; ++I) {
21968 const TargetRegisterClass *RC = nullptr;
21969 if (ARM::GPRRegClass.contains(*I))
21970 RC = &ARM::GPRRegClass;
21971 else if (ARM::DPRRegClass.contains(*I))
21972 RC = &ARM::DPRRegClass;
21973 else
21974 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21975
21976 Register NewVR = MRI->createVirtualRegister(RC);
21977 // Create copy from CSR to a virtual register.
21978 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21979 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21980 // nounwind. If we want to generalize this later, we may need to emit
21981 // CFI pseudo-instructions.
21982 assert(Entry->getParent()->getFunction().hasFnAttribute(
21983 Attribute::NoUnwind) &&
21984 "Function should be nounwind in insertCopiesSplitCSR!");
21985 Entry->addLiveIn(*I);
21986 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21987 .addReg(*I);
21988
21989 // Insert the copy-back instructions right before the terminator.
21990 for (auto *Exit : Exits)
21991 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21992 TII->get(TargetOpcode::COPY), *I)
21993 .addReg(NewVR);
21994 }
21995}
21996
22000}
22001
22003 return Subtarget->hasMVEIntegerOps();
22004}
22005
22008 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22009 if (!VTy)
22010 return false;
22011
22012 auto *ScalarTy = VTy->getScalarType();
22013 unsigned NumElements = VTy->getNumElements();
22014
22015 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22016 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22017 return false;
22018
22019 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22020 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22021 return Subtarget->hasMVEFloatOps();
22022
22024 return false;
22025
22026 return Subtarget->hasMVEIntegerOps() &&
22027 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22028 ScalarTy->isIntegerTy(32));
22029}
22030
22033 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22034 Value *Accumulator) const {
22035
22036 FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
22037
22038 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22039
22040 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22041
22042 if (TyWidth > 128) {
22043 int Stride = Ty->getNumElements() / 2;
22044 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22045 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22046 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22047 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22048
22049 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22050 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22051 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22052 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22053 Value *LowerSplitAcc = nullptr;
22054 Value *UpperSplitAcc = nullptr;
22055
22056 if (Accumulator) {
22057 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22058 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22059 }
22060
22061 auto *LowerSplitInt = createComplexDeinterleavingIR(
22062 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22063 auto *UpperSplitInt = createComplexDeinterleavingIR(
22064 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22065
22066 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22067 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22068 }
22069
22070 auto *IntTy = Type::getInt32Ty(B.getContext());
22071
22072 ConstantInt *ConstRotation = nullptr;
22073 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22074 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22075
22076 if (Accumulator)
22077 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22078 {ConstRotation, Accumulator, InputB, InputA});
22079 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22080 {ConstRotation, InputB, InputA});
22081 }
22082
22083 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22084 // 1 means the value is not halved.
22085 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22086
22088 ConstRotation = ConstantInt::get(IntTy, 0);
22090 ConstRotation = ConstantInt::get(IntTy, 1);
22091
22092 if (!ConstRotation)
22093 return nullptr; // Invalid rotation for arm_mve_vcaddq
22094
22095 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22096 {ConstHalving, ConstRotation, InputA, InputB});
22097 }
22098
22099 return nullptr;
22100}
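// For example, a 128-bit <8 x half> partial complex multiply maps directly
// onto arm_mve_vcmulq / arm_mve_vcmlaq with the rotation passed as an
// immediate, while a 256-bit input is split into halves above, lowered
// recursively, and re-interleaved with a final shuffle.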
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
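The reassociation mentioned in the PerformVMULCombine entry above can be sketched with generic ISD nodes. This is an illustrative fragment only, not code from ARMISelLowering.cpp; the helper name distributeMulOverAdd is invented, and the real combine additionally checks legality, types, and use counts.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Illustrative only: rewrite (A + B) * C as (A * C) + (B * C) so that each
// multiply can later fold into a multiply-accumulate instruction.
static SDValue distributeMulOverAdd(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                    SDValue A, SDValue B, SDValue C) {
  SDValue AC = DAG.getNode(ISD::MUL, DL, VT, A, C);
  SDValue BC = DAG.getNode(ISD::MUL, DL, VT, B, C);
  return DAG.getNode(ISD::ADD, DL, VT, AC, BC);
}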
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, Value *Offset, dxil::ResourceTypeInfo &RTI)
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
loop Loop Strength Reduction
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register const TargetRegisterInfo * TRI
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition: APFloat.cpp:5999
APInt bitcastToAPInt() const
Definition: APFloat.h:1353
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition: APFloat.h:1332
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
void setBit(unsigned BitPosition)
Set to 1 the bit whose position is given by "BitPosition".
Definition: APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1639
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:651
unsigned logBase2() const
Definition: APInt.h:1761
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it; otherwise return the limit value.
Definition: APInt.h:475
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
An arbitrary precision integer that knows its signedness.
Definition: APSInt.h:24
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setPromotedConstpoolIncrease(int Sz)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
void setVarArgsFrameIndex(int Index)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
bool isTargetMachO() const
Definition: ARMSubtarget.h:346
bool useMovt() const
bool isTargetAEABI() const
Definition: ARMSubtarget.h:348
bool hasARMOps() const
Definition: ARMSubtarget.h:298
bool supportsTailCall() const
Definition: ARMSubtarget.h:399
const Triple & getTargetTriple() const
Definition: ARMSubtarget.h:330
bool hasVFP4Base() const
Definition: ARMSubtarget.h:306
const ARMBaseInstrInfo * getInstrInfo() const override
Definition: ARMSubtarget.h:235
bool isThumb1Only() const
Definition: ARMSubtarget.h:375
bool useFPVFMx() const
Definition: ARMSubtarget.h:315
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:307
bool isThumb2() const
Definition: ARMSubtarget.h:376
bool isTargetWindows() const
Definition: ARMSubtarget.h:342
bool isGVIndirectSymbol(const GlobalValue *GV) const
True if the GV will be accessed via an indirect symbol.
bool hasBaseDSP() const
Definition: ARMSubtarget.h:320
const ARMTargetLowering * getTargetLowering() const override
Definition: ARMSubtarget.h:239
bool isTargetDarwin() const
Definition: ARMSubtarget.h:335
const ARMBaseRegisterInfo * getRegisterInfo() const override
Definition: ARMSubtarget.h:247
bool hasVFP2Base() const
Definition: ARMSubtarget.h:304
bool isTargetAndroid() const
Definition: ARMSubtarget.h:365
bool isROPI() const
bool isTargetCOFF() const
Definition: ARMSubtarget.h:344
bool isTargetGNUAEABI() const
Definition: ARMSubtarget.h:350
bool hasVFP3Base() const
Definition: ARMSubtarget.h:305
bool useFPVFMx64() const
Definition: ARMSubtarget.h:319
unsigned getPreferBranchLogAlignment() const
Definition: ARMSubtarget.h:486
bool hasMinSize() const
Definition: ARMSubtarget.h:374
bool useNEONForSinglePrecisionFP() const
Definition: ARMSubtarget.h:300
bool hasAnyDataBarrier() const
Definition: ARMSubtarget.h:309
bool isRWPI() const
bool isLittle() const
Definition: ARMSubtarget.h:407
bool allowsUnalignedMem() const
Definition: ARMSubtarget.h:401
bool isTargetMuslAEABI() const
Definition: ARMSubtarget.h:352
bool useFPVFMx16() const
Definition: ARMSubtarget.h:318
bool isMClass() const
Definition: ARMSubtarget.h:377
bool useMulOps() const
Definition: ARMSubtarget.h:313
bool isTargetELF() const
Definition: ARMSubtarget.h:345
Align getDualLoadStoreAlignment() const
Definition: ARMSubtarget.h:443
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy, Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:506
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
bool isFloatingPointOperation() const
Definition: Instructions.h:898
LLVM_ABI bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
The address of a basic block.
Definition: Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
void rewindByValRegsInfo()
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getCalledOperand() const
Definition: InstrTypes.h:1340
AttributeList getAttributes() const
Return the attributes for this call.
Definition: InstrTypes.h:1424
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:277
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:198
bool isBigEndian() const
Definition: DataLayout.h:199
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition: DataLayout.h:228
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
Definition: DataLayout.cpp:987
StringRef getPrivateGlobalPrefix() const
Definition: DataLayout.h:286
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
A debug info location.
Definition: DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
unsigned size() const
Definition: DenseMap.h:120
bool empty() const
Definition: DenseMap.h:119
iterator begin()
Definition: DenseMap.h:78
iterator end()
Definition: DenseMap.h:87
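A minimal sketch of the DenseMap accessors listed above; the key/value types and the lookupOrZero helper are chosen only for illustration.

#include "llvm/ADT/DenseMap.h"

using namespace llvm;

// find() returns end() when the key is absent, so a lookup-with-default
// looks like this:
static int lookupOrZero(const DenseMap<int, int> &Map, int Key) {
  auto It = Map.find(Key);
  return It == Map.end() ? 0 : It->second;
}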
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
Diagnostic information for unsupported feature in backend.
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
arg_iterator arg_begin()
Definition: Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition: Function.h:687
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition: Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:727
const GlobalValue * getGlobal() const
bool isDSOLocal() const
Definition: GlobalValue.h:307
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:531
bool hasDLLImportStorageClass() const
Definition: GlobalValue.h:280
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
Definition: GlobalValue.h:638
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2214
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1936
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2618
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2194
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
Value * CreateTruncOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2230
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
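To show how a few of the IRBuilderBase helpers listed above compose, here is an illustrative fragment; the packHalves helper and its operands are invented for the example and do not come from ARMISelLowering.cpp.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Pack two i16 halves into an i32 as (Hi << 16) | Lo.
static Value *packHalves(IRBuilderBase &B, Value *Lo16, Value *Hi16) {
  Value *Lo = B.CreateZExt(Lo16, B.getInt32Ty(), "lo");
  Value *Hi = B.CreateZExt(Hi16, B.getInt32Ty(), "hi");
  Value *HiShifted = B.CreateShl(Hi, B.getInt32(16), "hi.shl");
  return B.CreateOr(HiShifted, Lo, "packed");
}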
std::optional< unsigned > getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const
Return the cycle for the given class and operand.
bool isEmpty() const
Returns true if there are no itineraries.
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:180
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:199
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:603
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:238
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:240
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:249
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
Definition: MCInstrDesc.h:220
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:42
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:247
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
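A short sketch of creating frame objects with the MachineFrameInfo calls listed above; the sizes, alignment, and offsets are placeholders chosen for illustration.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

static void reserveFrameObjects(MachineFrameInfo &MFI) {
  // An 8-byte, 8-aligned object owned by the target; not a spill slot.
  int ScratchFI = MFI.CreateStackObject(/*Size=*/8, Align(8),
                                        /*isSpillSlot=*/false);
  // A fixed 4-byte object at offset 0 from the incoming stack pointer,
  // e.g. describing a stack-passed argument.
  int ArgFI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/0,
                                    /*IsImmutable=*/true);
  (void)ScratchFI;
  (void)ArgFI;
}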
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
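The add* helpers above are normally used through the BuildMI chaining pattern. The following is a hypothetical sketch: the opcode, registers, and the emitRegImmOp helper are placeholders, not code from this file.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

static void emitRegImmOp(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator InsertPt,
                         const DebugLoc &DL, const TargetInstrInfo *TII,
                         unsigned Opcode, Register Dst, Register Src) {
  // Dst = <Opcode> Src, #0 -- each chained call appends one operand in order.
  BuildMI(MBB, InsertPt, DL, TII->get(Opcode), Dst)
      .addReg(Src)
      .addImm(0);
}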
Representation of each machine instruction.
Definition: MachineInstr.h:72
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition: Pass.cpp:140
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:758
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:813
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:719
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:499
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:777
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
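A minimal sketch (hypothetical helper, not part of this file) of the getConstant/getNode pattern that the lowering code builds on; here it sign-extends the low 16 bits of an i32 value:

static SDValue signExtendLo16(SDValue X, const SDLoc &DL, SelectionDAG &DAG) {
  EVT VT = MVT::i32;
  SDValue Sixteen = DAG.getConstant(16, DL, VT);
  // (X << 16) >> 16 with an arithmetic right shift reproduces the sign bit.
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, Sixteen);
  return DAG.getNode(ISD::SRA, DL, VT, Shl, Sixteen);
}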
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void reserve(size_type NewNumEntries)
Definition: SmallPtrSet.h:117
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
bool empty() const
Definition: SmallSet.h:169
bool erase(const T &V)
Definition: SmallSet.h:198
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
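A minimal sketch (hypothetical, using plain int pointers) of the small-container idiom these classes support, e.g. a worklist paired with a visited set:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"

static unsigned countUnique(llvm::ArrayRef<int *> Ptrs) {
  llvm::SmallPtrSet<int *, 16> Visited; // no heap allocation for <= 16 entries
  llvm::SmallVector<int *, 8> Worklist; // inline storage for 8 elements
  for (int *P : Ptrs)
    if (Visited.insert(P).second)       // .second is false for duplicates
      Worklist.push_back(P);
  return Worklist.size();
}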
An instruction for storing to memory.
Definition: Instructions.h:296
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
const unsigned char * bytes_end() const
Definition: StringRef.h:135
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
const unsigned char * bytes_begin() const
Definition: StringRef.h:132
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
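A minimal sketch (hypothetical constraint letters and return codes) of the StringSwitch pattern, in the spirit of classifying inline-asm constraint strings:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static unsigned classifyConstraint(llvm::StringRef C) {
  return llvm::StringSwitch<unsigned>(C)
      .Case("r", 0)   // general-purpose register
      .Case("w", 1)   // floating-point/SIMD register
      .Case("I", 2)   // immediate operand
      .Default(~0u);  // anything else: unknown
}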
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
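A minimal sketch (a hypothetical TargetLowering subclass, not the real ARMTargetLowering constructor; the chosen register class, actions and alignments are illustrative only) showing how these configuration hooks are typically combined:

namespace {
class ExampleTargetLowering : public TargetLowering {
public:
  ExampleTargetLowering(const TargetMachine &TM, const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);      // i32 lives in core regs
    setOperationAction(ISD::SDIV, MVT::i32, Expand);    // assume no HW divider
    setOperationAction(ISD::BR_JT, MVT::Other, Custom); // custom jump tables
    setSchedulingPreference(Sched::Hybrid);
    setMinFunctionAlignment(Align(4));
    // Must run once, after every register class has been registered:
    computeRegisterProperties(TRI);
  }
};
} // end anonymous namespace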
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition: Triple.h:434
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:264
LLVM_ABI void dump() const
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
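A minimal sketch (hypothetical predicate) combining the Type queries above, in the spirit of the IR-type checks done before lowering decisions:

#include "llvm/IR/Type.h"

// Accept half/float/double and integers of at most 32 bits, looking through
// vector types via getScalarType().
static bool isSimpleScalarElt(llvm::Type *Ty) {
  llvm::Type *Scalar = Ty->getScalarType();
  if (Scalar->isHalfTy() || Scalar->isFloatTy() || Scalar->isDoubleTy())
    return true;
  return Scalar->isIntegerTy() && Scalar->getScalarSizeInBits() <= 32;
}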
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:35
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
Type * getElementType() const
Definition: DerivedTypes.h:463
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition: ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Section Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
Definition: ARMBaseInfo.h:242
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
Definition: ARMBaseInfo.h:288
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
Definition: ARMBaseInfo.h:270
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
Definition: ARMBaseInfo.h:275
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
Definition: ARMBaseInfo.h:266
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: ARMBaseInfo.h:263
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
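A minimal sketch (hypothetical helper) of how the encoding queries above are used: a 32-bit constant can be materialized by a single ARM-mode MOV or MVN when it, or its bitwise complement, fits the rotated 8-bit shifter_operand encoding.

static bool fitsSingleMovOrMvn(uint32_t Imm) {
  return ARM_AM::getSOImmVal(Imm) != -1 ||  // MOVi with an so_imm encoding
         ARM_AM::getSOImmVal(~Imm) != -1;   // MVNi with the complement encoded
}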
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
Definition: CallingConv.h:107
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
Definition: CallingConv.h:111
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
Definition: CallingConv.h:114
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:256
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:504
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1108
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1401
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1491
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1379
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1381
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1382
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1112
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1131
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:167
@ GlobalAddress
Definition: ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1476
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1343
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1135
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1490
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:505
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1377
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1378
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1568
@ GlobalTLSAddress
Definition: ISDOpcodes.h:89
@ FrameIndex
Definition: ISDOpcodes.h:90
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:957
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:706
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:656
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1157
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1473
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ WRITE_REGISTER
Definition: ISDOpcodes.h:135
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1331
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1477
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:809
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1380
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:228
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1492
@ RegisterMask
Definition: ISDOpcodes.h:85
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1485
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1126
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1103
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1375
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1383
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ ConstantPool
Definition: ISDOpcodes.h:92
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1493
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1373
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition: ISDOpcodes.h:726
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:701
@ VECREDUCE_FMUL
Definition: ISDOpcodes.h:1474
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:122
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:434
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1180
@ BlockAddress
Definition: ISDOpcodes.h:94
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ AssertZext
Definition: ISDOpcodes.h:63
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:713
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1718
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1634
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1685
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1665
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1636
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
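A minimal sketch (hypothetical DAG-combine guard) using the node predicates above: only treat Op as a foldable load if it is a plain, non-extending, unindexed, non-volatile load with a single user.

static bool isFoldableLoad(SDValue Op) {
  return ISD::isNormalLoad(Op.getNode()) && Op.hasOneUse() &&
         !cast<LoadSDNode>(Op)->isVolatile();
}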
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:55
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
@ Length
Definition: DWP.cpp:477
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
@ Read
Definition: CodeGenData.h:108
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:260
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:264
ExceptionHandling
Definition: CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:252
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition: STLExtras.h:1563
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
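predOps (together with condCodeOp further down) supplies the trailing predicate and condition-code operands that most predicable ARM instructions expect. A sketch of the common BuildMI pattern; MBB, InsertPt, DL, TII, DestReg and SrcReg are placeholders from surrounding lowering code:
// Build an always-executed register move: the trailing operands are the
// ARMCC::AL predicate plus its (absent) predicate register from predOps(),
// and an empty condition-code result from condCodeOp().
BuildMI(MBB, InsertPt, DL, TII->get(ARM::MOVr), DestReg)
    .addReg(SrcReg)
    .add(predOps(ARMCC::AL))
    .add(condCodeOp());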
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition: MathExtras.h:276
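Both predicates are constexpr, so the distinction between a mask anchored at bit 0 and a shifted mask is easy to show at compile time:
#include "llvm/Support/MathExtras.h"

static_assert(llvm::isMask_32(0x000000FFu));        // 0b1111'1111, starts at bit 0
static_assert(!llvm::isMask_32(0x000000F0u));       // ones do not start at bit 0
static_assert(llvm::isShiftedMask_32(0x000000F0u)); // contiguous run of ones, shifted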
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:157
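For example, counting trailing bits of small constants:
#include "llvm/ADT/bit.h"

int TrailingZeros = llvm::countr_zero(0x8u); // 0b1000 -> 3 trailing zero bits
int TrailingOnes  = llvm::countr_one(0x7u);  // 0b0111 -> 3 trailing one bits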
bool isReleaseOrStronger(AtomicOrdering AO)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:203
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition: SPIRVUtils.cpp:976
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
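A common pairing is to test for a power of two and then recover the shift amount; a minimal sketch (Stride is a placeholder value):
uint32_t Stride = 64;
if (llvm::isPowerOf2_32(Stride)) {
  unsigned ShiftAmt = llvm::Log2_32(Stride); // 6, since 64 == 1u << 6
  (void)ShiftAmt;
}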
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition: SmallVector.h:1300
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition: DAGCombine.h:15
@ BeforeLegalizeTypes
Definition: DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
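A hedged sketch of how these two ARM helpers are typically combined when choosing between an immediate and its bitwise complement (Imm and Subtarget are placeholders from surrounding code; this illustrates the pattern, not the exact logic in this file):
// Cost of materializing Imm directly on this subtarget.
unsigned DirectCost = ConstantMaterializationCost(Imm, Subtarget);
// Prefer materializing ~Imm (then inverting with MVN) if that is cheaper.
bool PreferInverted =
    HasLowerConstantMaterializationCost(~Imm, Imm, Subtarget);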
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
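For instance, padding a byte count up to an 8-byte boundary:
#include "llvm/Support/Alignment.h"

uint64_t StoreBytes = 10;
uint64_t Padded = llvm::alignTo(StoreBytes, llvm::Align(8)); // rounds up to 16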
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition: MathExtras.h:597
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:257
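Both checks are constexpr; typical immediate-range tests look like:
static_assert(llvm::isUIntN(8, 255));   // fits in an unsigned 8-bit field
static_assert(!llvm::isUIntN(8, 256));
static_assert(llvm::isIntN(12, -2048)); // fits in a signed 12-bit immediate
static_assert(!llvm::isIntN(12, 2048));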
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
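For example, deriving the alignment known for an offset from an aligned base:
// Alignment known for a pointer P + 12 when P itself is 16-byte aligned.
llvm::Align BaseAlign(16);
llvm::Align OffsetAlign = llvm::commonAlignment(BaseAlign, 12); // Align(4)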
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
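A hedged example of the mask shape this recognizes: for a v8i8 shuffle, reversing the bytes within each 32-bit block corresponds to VREV32.8:
// Mask that reverses the bytes within each 32-bit block of a v8i8 vector;
// isVREVMask(M, MVT::v8i8, 32) should recognize it as a VREV32.8 pattern.
int M[] = {3, 2, 1, 0, 7, 6, 5, 4};
bool IsRev32 = isVREVMask(M, MVT::v8i8, 32);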
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
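For example, a mask with four sequential lanes starting at 2 followed by two undef lanes:
// Produces {2, 3, 4, 5, -1, -1}; -1 marks an undef lane.
llvm::SmallVector<int, 16> Mask = llvm::createSequentialMask(2, 4, 2);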
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:760
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:308
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:299
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:303
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
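A short illustration of the EVT queries above (Ctx stands for an available LLVMContext):
// Build a 128-bit vector type and query it.
EVT VecVT = EVT::getVectorVT(Ctx, MVT::i32, 4); // v4i32
bool Is128 = VecVT.is128BitVector();            // true
EVT EltVT = VecVT.getVectorElementType();       // i32
uint64_t EltBits = VecVT.getScalarSizeInBits(); // 32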
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:294
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:304
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:340
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:803
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:128
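A minimal sketch of propagating KnownBits through an add and a zero extension, using only the operations listed above:
// Both operands are fully known constants, so the sum is fully known too.
KnownBits A = KnownBits::makeConstant(APInt(8, 0x0F));
KnownBits B = KnownBits::makeConstant(APInt(8, 0x01));
KnownBits Sum = KnownBits::add(A, B); // all 8 bits known: 0x10
KnownBits Wide = Sum.zext(16);        // the high 8 bits become known zero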
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)